In [None]:
'''
语言不是死板的数据组合，还包含了许多隐藏的东西。
文本分析和NLP的目标是：用算法实现人机交互，不再需要程序设计语言来命令计算机执行指令！
1.用标记解析的方法，预处理数据
2.提取文本数据的词干
3.用词形还原的方法，还原文本的基本形式
4.用分块的方法划分文本
5.创建词袋模型
6.创建文本分类器
7.识别性别
8.语句情感分析
9.用主题建模识别文本模式
'''

In [15]:
#_*_ Tokenization: splitting a text into a set of meaningful pieces _*_#
from nltk.tokenize import WordPunctTokenizer, sent_tokenize, word_tokenize

# Sample text; the two adjacent literals are concatenated into one string.
text = ("Are you curious about tokenization? Let's see how it works! We need to analyze "
        "a couple of sentences with punctuations to see it in action.")

# Sentence tokenizer: returns one list entry per sentence.
sent_tokenize_list = sent_tokenize(text)
print('\nSentence tokenizer: ',sent_tokenize_list)

# Word tokenizer: returns a flat list of word / punctuation tokens.
print('\nWord tokenizer:', word_tokenize(text), sep='\n')

# WordPunct tokenizer: punctuation is emitted as separate tokens, so a
# contraction such as "Let's" comes back as 'Let', "'", 's'.
word_punct_tokenizer = WordPunctTokenizer()
print('\nPunkt Word tokenizer :', word_punct_tokenizer.tokenize(text), sep='\n')


Sentence tokenizer:  ['Are you curious about tokenization?', "Let's see how it works!", 'We need to analyze a couple of sentences with punctuations to see it in action.']

Word tokenizer:
['Are', 'you', 'curious', 'about', 'tokenization', '?', 'Let', "'s", 'see', 'how', 'it', 'works', '!', 'We', 'need', 'to', 'analyze', 'a', 'couple', 'of', 'sentences', 'with', 'punctuations', 'to', 'see', 'it', 'in', 'action', '.']

Punkt Word tokenizer :
['Are', 'you', 'curious', 'about', 'tokenization', '?', 'Let', "'", 's', 'see', 'how', 'it', 'works', '!', 'We', 'need', 'to', 'analyze', 'a', 'couple', 'of', 'sentences', 'with', 'punctuations', 'to', 'see', 'it', 'in', 'action', '.']


In [17]:
#_*_ Stemming: reduce each word to a root form, which need not be a real word _*_#
# Snowball is the usual choice; Porter is the most lenient, Lancaster the strictest.
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

# Sample vocabulary (also read by the lemmatization cell that follows).
words = [
    'table', 'probably', 'wolves', 'playing', 'is', 'dog', 'the', 'beaches',
    'grounded', 'dreamt', 'envision',
]

# Column headers, in the same order as the stemmer instances below.
stemmers = ['PORTER', 'LANCASTER', 'SNOWBALL']
stemmer_porter = PorterStemmer()
stemmer_lancaster = LancasterStemmer()
stemmer_snowball = SnowballStemmer('english')

# One right-aligned, 16-character column for the word plus one per stemmer.
formatted_row = '{:>16}' * (len(stemmers) + 1)
print('\n',formatted_row.format('WORD', *stemmers),'\n')

for word in words:
    stemmed_words = [
        stemmer.stem(word)
        for stemmer in (stemmer_porter, stemmer_lancaster, stemmer_snowball)
    ]
    print(formatted_row.format(word, *stemmed_words))


             WORD          PORTER       LANCASTER        SNOWBALL 

           table            tabl            tabl            tabl
        probably         probabl            prob         probabl
          wolves            wolv            wolv            wolv
         playing            play            play            play
              is              is              is              is
             dog             dog             dog             dog
             the             the             the             the
         beaches           beach           beach           beach
        grounded          ground          ground          ground
          dreamt          dreamt          dreamt          dreamt
        envision           envis           envid           envis


In [23]:
#_*_ Lemmatization: map each word back to a real base word via lexical and grammatical analysis _*_#
from nltk.stem import WordNetLemmatizer

# Compare the same WordNet lemmatizer run with a noun tag and with a verb tag.
lemmatizers = ['NOUN LEMMATIZER', 'VERB LEMMATIZER']
lemmatizer_wordnet = WordNetLemmatizer()

# One right-aligned, 24-character column for the word plus one per POS tag.
formatted_row = '{:>24}' * (len(lemmatizers) + 1)
print('\n',formatted_row.format('WORD', *lemmatizers),'\n')

# `words` is the sample vocabulary defined in the stemming cell.
for word in words:
    lemmatized_words = [
        lemmatizer_wordnet.lemmatize(word, pos=pos_tag) for pos_tag in ('n', 'v')
    ]
    print(formatted_row.format(word, *lemmatized_words))


                     WORD         NOUN LEMMATIZER         VERB LEMMATIZER 

                   table                   table                   table
                probably                probably                probably
                  wolves                    wolf                  wolves
                 playing                 playing                    play
                      is                      is                      be
                     dog                     dog                     dog
                     the                     the                     the
                 beaches                   beach                   beach
                grounded                grounded                  ground
                  dreamt                  dreamt                   dream
                envision                envision                envision


In [22]:
#_*_ 分块法分割文本，每个文本块不一定有实际意义 _*_#
import numpy as np
from nltk.corpus import brown

def splitter(data, num_words):
    """Split ``data`` into space-joined chunks of ``num_words`` tokens each.

    Args:
        data: Text to split; tokens are separated on single spaces.
        num_words: Number of tokens collected before a chunk is emitted.

    Returns:
        A list of strings. Every full chunk holds exactly ``num_words``
        tokens; whatever is left over is always appended as a final chunk,
        so the last entry is ``''`` when the token count is an exact
        multiple of ``num_words``.
    """
    tokens = data.split(' ')
    chunks = []
    pending = []

    for token in tokens:
        pending.append(token)
        # Flush as soon as the buffer reaches the requested size.
        if len(pending) == num_words:
            chunks.append(' '.join(pending))
            pending = []

    # The remainder is emitted unconditionally, even when it is empty.
    chunks.append(' '.join(pending))

    return chunks

# Chunk size, in words, used for this demonstration.
num_words = 1700

# Join the first 10000 Brown-corpus words into a single space-separated string.
data = ' '.join(brown.words()[:10000])

# Split the text into chunks and report how many were produced.
text_chunks = splitter(data, num_words)
print('Number of text chunks = ', len(text_chunks))

Number of text chunks =  6


In [27]:
#_*_ Bag-of-words model: turn text into per-document word counts (a word histogram per document) _*_#
num_words = 2000
text_chunks = splitter(data, num_words)

# One record per text chunk, keyed by its position in `text_chunks`.
chunks = [
    {'index': index, 'text': chunk_text}
    for index, chunk_text in enumerate(text_chunks)
]

# Build the document-term matrix: how often each word occurs in each chunk.
# Only words found in at least 5 documents and at most 95% of documents are kept.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=5, max_df=.95)
doc_term_matrix = vectorizer.fit_transform([chunk['text'] for chunk in chunks])

# Extract the vocabulary from the vectorizer.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
vocab = np.array(feature_names, dtype=str)
print('\nVocabulary:',vocab)

print('\nDocument term matrix:')
# NOTE(review): splitter() appends a trailing empty chunk when the word count
# is an exact multiple of num_words, which appears to be the case here
# (10000 words / 2000), so only the first five chunks carry text - confirm
# before changing num_words or the column list.
chunk_names = ['Chunk-0', 'Chunk-1', 'Chunk-2', 'Chunk-3', 'Chunk-4']
formatted_row = '{:>12}' * (len(chunk_names) + 1)
print('\n',formatted_row.format('Word', *chunk_names),'\n')

for word, item in zip(vocab, doc_term_matrix.T):
    # `item` is a sparse 1 x n_documents row. Its .data attribute holds only
    # the non-zero entries, so use the dense counts instead: that keeps every
    # value under the right chunk column even when a count is zero.
    counts = item.toarray().ravel()[:len(chunk_names)]
    output = [str(count) for count in counts]
    print(formatted_row.format(word,*output))

  (1, 0)	2
  (2, 0)	1
  (0, 0)	1
  (1, 1)	3
  (2, 1)	2
  (0, 1)	1
  (1, 2)	2
  (2, 2)	2
  (0, 2)	1
  (1, 3)	1
  (2, 3)	1
  (0, 3)	1
  (1, 4)	3
  (2, 4)	1
  (0, 4)	3

Vocabulary: ['about' 'after' 'against' 'aid' 'all' 'also' 'an' 'and' 'are' 'as' 'at'
 'be' 'been' 'before' 'but' 'by' 'committee' 'congress' 'did' 'each'
 'education' 'first' 'for' 'from' 'general' 'had' 'has' 'have' 'he'
 'health' 'his' 'house' 'in' 'increase' 'is' 'it' 'last' 'made' 'make'
 'may' 'more' 'no' 'not' 'of' 'on' 'one' 'only' 'or' 'other' 'out' 'over'
 'pay' 'program' 'proposed' 'said' 'similar' 'state' 'such' 'take' 'than'
 'that' 'the' 'them' 'there' 'they' 'this' 'time' 'to' 'two' 'under' 'up'
 'was' 'were' 'what' 'which' 'who' 'will' 'with' 'would' 'year' 'years']

Document term matrix:

         Word     Chunk-0     Chunk-1     Chunk-2     Chunk-3     Chunk-4 

       about           1           1           1           1           3
       after           2           3           2           1           3


In [31]:
#_*_ Text classification with tf-idf (term frequency - inverse document frequency) _*_#
from sklearn.datasets import fetch_20newsgroups

# Maps 20-newsgroups category names to the label printed for a prediction.
# NOTE(review): 'Cryptgraphy' looks like a typo for 'Cryptography'; it is a
# runtime label, so it is left unchanged here.
category_map = {
    'misc.forsale':'Sales', 'rec.motorcycles':'Motorcycles', 'rec.sport.baseball':'Baseball',
    'sci.crypt':'Cryptgraphy', 'sci.space':'Space'
}

# Training split restricted to the categories above. The categories are passed
# as a plain list rather than a dict_keys view.
# NOTE(review): the recorded URLError shows this call tried to reach the
# network; presumably the corpus is downloaded when no local copy exists.
training_data = fetch_20newsgroups(subset='train', categories=list(category_map),
                                   shuffle=True, random_state=7)

# Feature extraction: raw term counts per training document.
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X_train_termcounts = vectorizer.fit_transform(training_data.data)  # was misspelled `traning_data`
print('\nDimensions of training data:',X_train_termcounts.shape)

# Training
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer

# Sentences to classify once the model is trained.
input_data = [
    "The curveballs of right handed pitchers tend to curve to the left",
    "Caesar cipher is an ancient form of encryption",
    "This two-wheeler is really good on slippery roads"
]

# Re-weight the raw counts with tf-idf, then fit a multinomial naive Bayes
# classifier on the weighted features and the integer class labels.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_termcounts)  # was `fit_transformtransform`
classifier = MultinomialNB().fit(X_train_tfidf, training_data.target)  # was `.garget`

# The inputs go through the already-fitted vectorizer and transformer
# (transform only, no refit) so they share the training feature space.
X_input_termcounts = vectorizer.transform(input_data)
X_input_tfidf = tfidf_transformer.transform(X_input_termcounts)

# Predict, then map class index -> newsgroup name -> display label.
predicted_categories = classifier.predict(X_input_tfidf)
for sentence, category in zip(input_data, predicted_categories):
    print('\nInput:',sentence, '\nPredicted category:',category_map[training_data.target_names[category]])


URLError: <urlopen error [WinError 10060] 由于连接方在一段时间后没有正确答复或连接的主机没有反应，连接尝试失败。>