In [13]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

### Bag of Wordsによる文書のベクトル化

In [3]:
#The Zen of Pythonを抜粋して試す
words = ["Beautiful is better than ugly.",
         "Explicit is better than implicit.",
         "Simple is better than complex."]

In [6]:
vect = CountVectorizer().fit(words)
vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [21]:
print(vect.get_feature_names())

['beautiful', 'better', 'complex', 'explicit', 'implicit', 'is', 'simple', 'than', 'ugly']


In [11]:
#transformするとスパースマトリクスが返る
bag_of_words = vect.transform(words)
bag_of_words

<3x9 sparse matrix of type '<class 'numpy.int64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [23]:
pd.DataFrame(bag_of_words.toarray(), columns=vect.get_feature_names())

Unnamed: 0,beautiful,better,complex,explicit,implicit,is,simple,than,ugly
0,1,1,0,0,0,1,0,1,1
1,0,1,0,1,1,1,0,1,0
2,0,1,1,0,0,1,1,1,0


### 情報量のないトークンを削除する

In [25]:
# 2つ以上の文書に登場しているトークンだけを用いる
vect = CountVectorizer(min_df=2).fit(words)
bag_of_words = vect.transform(words)
pd.DataFrame(bag_of_words.toarray(), columns=vect.get_feature_names())

Unnamed: 0,better,is,than
0,1,1,1
1,1,1,1
2,1,1,1


### ストップワード
* 頻出の役に立たない単語を捨てる

In [32]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

print(len(ENGLISH_STOP_WORDS))
print(list(ENGLISH_STOP_WORDS)[:10])

318
['please', 'nor', 'could', 'however', 'mill', 'never', 'whether', 'done', 'he', 'latterly']


In [33]:
vect = CountVectorizer(stop_words="english").fit(words)
bag_of_words = vect.transform(words)
pd.DataFrame(bag_of_words.toarray(), columns=vect.get_feature_names())

Unnamed: 0,beautiful,better,complex,explicit,implicit,simple,ugly
0,1,1,0,0,0,0,1
1,0,1,0,1,1,0,0
2,0,1,1,0,0,1,0
