# 词袋法(Bag-of-Words)

In [1]:
# Two toy sentences reused by the later cells of this notebook.
sent1 = 'The cat is walking in the bedroom.'
sent2 = 'A dog was running across the kitchen.'

from sklearn.feature_extraction.text import CountVectorizer

count_vec = CountVectorizer()

sentences = [sent1, sent2]

# Learn the vocabulary and build the count matrix in one pass:
# one row per sentence, one column per vocabulary term.
print(count_vec.fit_transform(sentences).toarray())

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back for older installations.
if hasattr(count_vec, 'get_feature_names_out'):
    feature_names = count_vec.get_feature_names_out()
else:
    feature_names = count_vec.get_feature_names()

# The new API returns a NumPy array; convert to plain str so the printed
# result is still a simple list of strings.
print([str(name) for name in feature_names])

[[0 1 1 0 1 1 0 0 2 1 0]
 [1 0 0 1 0 0 1 1 1 0 1]]
['across', 'bedroom', 'cat', 'dog', 'in', 'is', 'kitchen', 'running', 'the', 'walking', 'was']


# NLTK

In [2]:
import nltk

# Fetch the 'punkt' tokenizer data through the NLTK Downloader.
# NOTE(review): newer NLTK releases appear to require the 'punkt_tab'
# resource for word_tokenize instead — confirm against the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\q7356\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Tokenize the first sentence; punctuation comes out as its own token.
tokens_1 = nltk.tokenize.word_tokenize(sent1)
print(tokens_1)

['The', 'cat', 'is', 'walking', 'in', 'the', 'bedroom', '.']


In [4]:
# Tokenize the second sentence the same way as the first.
tokens_2 = nltk.tokenize.word_tokenize(sent2)
print(tokens_2)

['A', 'dog', 'was', 'running', 'across', 'the', 'kitchen', '.']


In [5]:
# Vocabulary of sentence 1: de-duplicate the tokens, then sort them.
# Sorting is case-sensitive, so 'The' and 'the' stay distinct entries.
vocab_1 = list(set(tokens_1))
vocab_1.sort()
print(vocab_1)

['.', 'The', 'bedroom', 'cat', 'in', 'is', 'the', 'walking']


In [6]:
# Vocabulary of sentence 2: de-duplicate the tokens, then sort them.
vocab_2 = list(set(tokens_2))
vocab_2.sort()
print(vocab_2)

['.', 'A', 'across', 'dog', 'kitchen', 'running', 'the', 'was']


In [7]:
# Create a Porter stemmer, used to reduce each token to its word stem.
stemmer = nltk.stem.PorterStemmer()

# Stem the tokens of the first sentence.
stem_1 = list(map(stemmer.stem, tokens_1))
print(stem_1)

# Stem the tokens of the second sentence.
stem_2 = list(map(stemmer.stem, tokens_2))
print(stem_2)

['the', 'cat', 'is', 'walk', 'in', 'the', 'bedroom', '.']
['A', 'dog', 'wa', 'run', 'across', 'the', 'kitchen', '.']


In [8]:
# Fetch the averaged-perceptron tagger data through the NLTK Downloader.
# NOTE(review): newer NLTK releases appear to look for
# 'averaged_perceptron_tagger_eng' instead — confirm against the installed version.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\q7356\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [9]:
# Part-of-speech tagging: pair every token with its tag (e.g. DT, NN, VBZ).
pos_tag_1 = nltk.pos_tag(tokens_1)
print(pos_tag_1)

# Same tagging for the second sentence.
pos_tag_2 = nltk.pos_tag(tokens_2)
print(pos_tag_2)

[('The', 'DT'), ('cat', 'NN'), ('is', 'VBZ'), ('walking', 'VBG'), ('in', 'IN'), ('the', 'DT'), ('bedroom', 'NN'), ('.', '.')]
[('A', 'DT'), ('dog', 'NN'), ('was', 'VBD'), ('running', 'VBG'), ('across', 'IN'), ('the', 'DT'), ('kitchen', 'NN'), ('.', '.')]
