# NLP 入门

使用的库

- `NTKL` 库
- `jieba` 结巴分词
- `scikit-learn` 库下，`sklearn.feature_extraction.text` 模块

主要内容

- 分词

- tf, idf, tf-idf

参考资料

- NLP 基础：分词，停词，n元语法, 博客园, [website](https://www.cnblogs.com/veager/articles/16288751.html)

- 文本特征提取, 博客园, [website](https://www.cnblogs.com/veager/articles/16285476.html)

In [167]:
import numpy as np

## 1. 分词

### 1.1 `NLTK` 库

In [168]:
import nltk

In [169]:
# 分词所必要的数据
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [170]:
from nltk.tokenize import word_tokenize, wordpunct_tokenize

s = 'Good muffins cost $3.88\nin New York. Please buy me ... two of them.\n\nThanks.'

tokens_1 = word_tokenize(s)    # tokens
tokens_2 = wordpunct_tokenize(s)
print(tokens_1)
print(tokens_2)

['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', '...', 'two', 'of', 'them', '.', 'Thanks', '.']
['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', '...', 'two', 'of', 'them', '.', 'Thanks', '.']


### 1.2 `jieba` 分词

In [171]:
import jieba

doc = "我来到北京清华大学"
seg_list = jieba.cut(doc, cut_all=True)
print("Full Mode: " + "/ ".join(seg_list))  # 全模式

seg_list = jieba.cut(doc, cut_all=False)
print("Default Mode: " + "/ ".join(seg_list))  # 精确模式

Full Mode: 我/ 来到/ 北京/ 清华/ 清华大学/ 华大/ 大学
Default Mode: 我/ 来到/ 北京/ 清华大学


In [172]:
jieba.del_word("创新办")
jieba.del_word("云计算")
doc= "李小福是创新办主任也是云计算方面的专家"
seg_list = jieba.cut(doc)
print( "/ ".join(seg_list))

jieba.add_word("创新办")
jieba.add_word("云计算")
seg_list = jieba.cut(doc)
print("/ ".join(seg_list))

李小福/ 是/ 创新/ 办/ 主任/ 也/ 是/ 云/ 计算/ 方面/ 的/ 专家
李小福/ 是/ 创新办/ 主任/ 也/ 是/ 云计算/ 方面/ 的/ 专家


## 2. 停词

### 2.1 `NLTK` 库

In [173]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [174]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
print(stop_words)
print(len(stop_words))

{'further', 'won', 'only', "should've", 'we', "it's", 'while', 'under', 'whom', 'other', 'ours', 'she', "don't", 'in', 'so', "needn't", 'any', 'there', "hasn't", 'my', 'during', 'because', 'has', 'o', 'do', "wouldn't", 'had', 'few', 'its', 'such', 'mightn', 'at', 'out', 'should', 'than', 'not', 'here', 'on', 'isn', 'just', 'was', 'then', 'for', 'you', 'until', 'no', 'herself', 'or', 'once', 'needn', 'themselves', 'but', "you've", 'him', 'having', 'where', 'aren', 'own', "doesn't", 'over', "couldn't", 'when', 'same', 'll', 'with', 'after', 'your', 'm', "shan't", 'ain', 'itself', 'between', 'this', 'been', 'from', 'they', 'to', 'and', 'am', "shouldn't", 'some', 'me', 'as', 'hasn', 'our', 'if', 'of', 'were', 'doing', 't', 'mustn', 'above', 'below', 'both', 'weren', 'wouldn', 'he', 'by', 'ma', 'don', "haven't", 'ourselves', "isn't", "mightn't", 'd', 'yourself', 'her', "won't", 'are', 's', "that'll", 'against', 'an', 'wasn', 'who', 'off', "weren't", 'being', 'hers', 'have', 'those', 'up', '

#### 过滤停词

In [175]:
print(tokens_1)

filtered_tokens = [w for w in tokens_1 if not w in stopwords.words()]
print(filtered_tokens)

['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', '...', 'two', 'of', 'them', '.', 'Thanks', '.']
['Good', 'muffins', 'cost', '$', '3.88', 'New', 'York', '.', 'Please', 'buy', '...', 'two', '.', 'Thanks', '.']


### 2.2 `scikit-learn` 库

In [176]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words='english')
corpus = ['This is the first document']
stop_words = vectorizer.fit(corpus).get_stop_words()

print(stop_words)
print(len(stop_words))

frozenset({'we', 'other', 'under', 'ours', 'within', 'latterly', 'my', 'hundred', 'everyone', 'here', 'on', 'ten', 'then', 'everywhere', 'or', 'seemed', 'anywhere', 'still', 'own', 'over', 'upon', 'sometimes', 'after', 'itself', 'onto', 'twelve', 'eight', 'bill', 'less', 'me', 'fifty', 'else', 'due', 'cannot', 'seeming', 'became', 'elsewhere', 'inc', 'find', 'beside', 'whence', 'are', 'cry', 'back', 'part', 'wherever', 'up', 'those', 'what', 'how', 'since', 'whither', 'before', 'mill', 'can', 'becoming', 'might', 'put', 'whereby', 'hereby', 'etc', 'each', 'formerly', 'these', 'much', 'least', 'through', 'go', 'cant', 'the', 'see', 'although', 'only', 'she', 'in', 'several', 'sixty', 'move', 'during', 'do', 'few', 'had', 'such', 'empty', 'at', 'interest', 'bottom', 'than', 'not', 'was', 'many', 'another', 'for', 'behind', 'whose', 'de', 'becomes', 'but', 'him', 'six', 'where', 'nowhere', 'five', 'somewhere', 'serious', 'with', 'every', 'show', 'beforehand', 'between', 'together', 'from'

## 3. n-gram

In [177]:
from nltk.util import ngrams, trigrams, bigrams

tokens = "Insurgents killed in ongoing fighting".split()

trigrams_1 = ngrams(tokens, n=3, pad_left=True, left_pad_symbol='</s>')
trigrams_2 = trigrams(tokens, pad_left=True, left_pad_symbol='</s>')

print(list(tokens))
print(list(trigrams_1))     # 得到的 trigrams_1 和 trigrams_2 相同
print(list(trigrams_2))

['Insurgents', 'killed', 'in', 'ongoing', 'fighting']
[('</s>', '</s>', 'Insurgents'), ('</s>', 'Insurgents', 'killed'), ('Insurgents', 'killed', 'in'), ('killed', 'in', 'ongoing'), ('in', 'ongoing', 'fighting')]
[('</s>', '</s>', 'Insurgents'), ('</s>', 'Insurgents', 'killed'), ('Insurgents', 'killed', 'in'), ('killed', 'in', 'ongoing'), ('in', 'ongoing', 'fighting')]


In [178]:
s = 'Good muffins cost $3.88\nin New York. Please buy me ... two of them.\n\nThanks.'
trigrams_1 = ngrams(s, n=3, pad_left=True)
trigrams_2 = trigrams(s, pad_left=True)

print(list(trigrams_1))
print(list(trigrams_2))

[(None, None, 'G'), (None, 'G', 'o'), ('G', 'o', 'o'), ('o', 'o', 'd'), ('o', 'd', ' '), ('d', ' ', 'm'), (' ', 'm', 'u'), ('m', 'u', 'f'), ('u', 'f', 'f'), ('f', 'f', 'i'), ('f', 'i', 'n'), ('i', 'n', 's'), ('n', 's', ' '), ('s', ' ', 'c'), (' ', 'c', 'o'), ('c', 'o', 's'), ('o', 's', 't'), ('s', 't', ' '), ('t', ' ', '$'), (' ', '$', '3'), ('$', '3', '.'), ('3', '.', '8'), ('.', '8', '8'), ('8', '8', '\n'), ('8', '\n', 'i'), ('\n', 'i', 'n'), ('i', 'n', ' '), ('n', ' ', 'N'), (' ', 'N', 'e'), ('N', 'e', 'w'), ('e', 'w', ' '), ('w', ' ', 'Y'), (' ', 'Y', 'o'), ('Y', 'o', 'r'), ('o', 'r', 'k'), ('r', 'k', '.'), ('k', '.', ' '), ('.', ' ', 'P'), (' ', 'P', 'l'), ('P', 'l', 'e'), ('l', 'e', 'a'), ('e', 'a', 's'), ('a', 's', 'e'), ('s', 'e', ' '), ('e', ' ', 'b'), (' ', 'b', 'u'), ('b', 'u', 'y'), ('u', 'y', ' '), ('y', ' ', 'm'), (' ', 'm', 'e'), ('m', 'e', ' '), ('e', ' ', '.'), (' ', '.', '.'), ('.', '.', '.'), ('.', '.', ' '), ('.', ' ', 't'), (' ', 't', 'w'), ('t', 'w', 'o'), ('w', '

跳元语法

In [179]:
from nltk.util import skipgrams

tokens = "Insurgents killed in ongoing fighting".split()
skipgrams1 = skipgrams(tokens, 3, 2)

print(tokens)
print(list(skipgrams1))

['Insurgents', 'killed', 'in', 'ongoing', 'fighting']
[('Insurgents', 'killed', 'in'), ('Insurgents', 'killed', 'ongoing'), ('Insurgents', 'killed', 'fighting'), ('Insurgents', 'in', 'ongoing'), ('Insurgents', 'in', 'fighting'), ('Insurgents', 'ongoing', 'fighting'), ('killed', 'in', 'ongoing'), ('killed', 'in', 'fighting'), ('killed', 'ongoing', 'fighting'), ('in', 'ongoing', 'fighting')]


## 4. 词频与逆向文件频率 tf-idf

### 4.1 `scikit-learn` 库

#### 4.1.1 统计词频 tf

`CountVectorizer` 类

In [180]:
# 预料，4 个文档
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

In [181]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer='word')   # stop_words='english' 设置 stop words
word_cnt = vectorizer.fit_transform(corpus)     # 统计 词频，返回 sparse matrix 类型
word_cnt = word_cnt.toarray()

# 输出
print(vectorizer.get_feature_names_out())
print(word_cnt)

['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [182]:
# 字符统计
vectorizer1 = CountVectorizer(analyzer='char')
word_cnt1 = vectorizer.fit_transform(corpus)

# 输出
print(vectorizer.get_feature_names_out())
print(word_cnt1.toarray())

['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [183]:
# n 元模型
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))
word_cnt2 = vectorizer2.fit_transform(corpus)

# 输出
print(vectorizer2.get_feature_names_out())
print(word_cnt2.toarray())

['and this' 'document is' 'first document' 'is the' 'is this'
 'second document' 'the first' 'the second' 'the third' 'third one'
 'this document' 'this is' 'this the']
[[0 0 1 1 0 0 1 0 0 0 0 1 0]
 [0 1 0 1 0 1 0 1 0 0 1 0 0]
 [1 0 0 1 0 0 0 0 1 1 0 1 0]
 [0 0 1 0 1 0 1 0 0 0 0 0 1]]


利用 `TfidfTransformer` 类，设置 use_idf=False，统计归一化的词频 tf

In [184]:
# 只统计归一化的 tf, 设置 use_idf=False
tf_norm = TfidfTransformer(use_idf=False, smooth_idf=True).fit_transform(word_cnt)
print(tf_norm.toarray())

print(word_cnt / np.linalg.norm(word_cnt, ord=2, axis=1, keepdims=True)) 

[[0.         0.4472136  0.4472136  0.4472136  0.         0.
  0.4472136  0.         0.4472136 ]
 [0.         0.70710678 0.         0.35355339 0.         0.35355339
  0.35355339 0.         0.35355339]
 [0.40824829 0.         0.         0.40824829 0.40824829 0.
  0.40824829 0.40824829 0.40824829]
 [0.         0.4472136  0.4472136  0.4472136  0.         0.
  0.4472136  0.         0.4472136 ]]
[[0.         0.4472136  0.4472136  0.4472136  0.         0.
  0.4472136  0.         0.4472136 ]
 [0.         0.70710678 0.         0.35355339 0.         0.35355339
  0.35355339 0.         0.35355339]
 [0.40824829 0.         0.         0.40824829 0.40824829 0.
  0.40824829 0.40824829 0.40824829]
 [0.         0.4472136  0.4472136  0.4472136  0.         0.
  0.4472136  0.         0.4472136 ]]


#### 4.1.2 统计 idf

计算 `tf-idf` 的对象 `TfidfTransformer`，经过拟合后，可以获得 `.idf_` 属性

In [185]:
from sklearn.feature_extraction.text import TfidfTransformer

word_cnt = [[3, 0, 1], [2, 0, 0], [3, 0, 0], [4, 0, 0], [3, 2, 0], [3, 0, 2]]
# 6 个文档，每个文档有 3 个词

transformer = TfidfTransformer(smooth_idf=True)
transformer.fit_transform(word_cnt)

print(transformer.idf_)
print(transformer.n_features_in_)

[1.         2.25276297 1.84729786]
3


#### 4.1.3 统计 tf-idf

In [186]:
from sklearn.feature_extraction.text import TfidfTransformer

word_cnt = [[3, 0, 1], [2, 0, 0], [3, 0, 0], [4, 0, 0], [3, 2, 0], [3, 0, 2]]
# 6 个文档，每个文档有 3 个词

transformer = TfidfTransformer(smooth_idf=True)
# 参数: norm='l2', 
#       use_idf=True, 设所有 idf=1, 即计算归一化的词频
#       smooth_idf=True, 是否对 idf 的计算公式平滑

# 输入为 统计的词频
tfidf = transformer.fit_transform(word_cnt)  # 返回稀疏矩阵
print(tfidf.toarray())

[[0.85151335 0.         0.52433293]
 [1.         0.         0.        ]
 [1.         0.         0.        ]
 [1.         0.         0.        ]
 [0.55422893 0.83236428 0.        ]
 [0.63035731 0.         0.77630514]]


### 4.2 `NLTK` 库

In [187]:
from nltk.text import TextCollection
from nltk.tokenize import word_tokenize

corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]


texts = TextCollection(corpus)

for d, doc in enumerate(corpus):
    # 分词
    tokens = word_tokenize(doc)
    # 过滤停词
    tokens = [w for w in tokens if w not in ['.', '?']]

    print('document {0}'.format(d+1))
    # for each word
    for w, word in enumerate(tokens):
        tf = texts.tf(word, doc)
        idf = texts.idf(word)
        tf_idf = texts.tf_idf(word, doc)

        print('{0:10}'.format(word), end=' ')
        print('{0:<8.4f}'.format(tf), end=' ')
        print('{0:<8.4f}'.format(idf), end=' ')
        print('{0:<8.4f}'.format(tf_idf), end=' ')
        print()


from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

vectorizer = CountVectorizer(analyzer='word')   # stop_words='english' 设置 stop words
word_cnt = vectorizer.fit_transform(corpus)     # 统计 词频

transformer = TfidfTransformer(smooth_idf=True)
tf_idf = transformer.fit_transform(word_cnt)

print(vectorizer.get_feature_names_out())
print(np.round(tf_idf.toarray(), 4))    # tf-idf
print(np.round(transformer.idf_, 4))    # idf

document 1
This       0.0370   0.6931   0.0257   
is         0.0741   0.0000   0.0000   
the        0.0370   0.0000   0.0000   
first      0.0370   0.6931   0.0257   
document   0.0370   0.2877   0.0107   
document 2
This       0.0270   0.6931   0.0187   
document   0.0541   0.2877   0.0156   
is         0.0541   0.0000   0.0000   
the        0.0270   0.0000   0.0000   
second     0.0270   1.3863   0.0375   
document   0.0541   0.2877   0.0156   
document 3
And        0.0385   1.3863   0.0533   
this       0.0385   0.6931   0.0267   
is         0.0769   0.0000   0.0000   
the        0.0385   0.0000   0.0000   
third      0.0385   1.3863   0.0533   
one        0.0385   1.3863   0.0533   
document 4
Is         0.0370   1.3863   0.0513   
this       0.0370   0.6931   0.0257   
the        0.0370   0.0000   0.0000   
first      0.0370   0.6931   0.0257   
document   0.0370   0.2877   0.0107   
['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
[[0.     0.4698 0.5803 0.3841 