In [1]:
# Bag-of-Words
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?'
]

vectorizer = CountVectorizer()
# Bag-of-Wordsを実行し変換後の行列を取得
X = vectorizer.fit_transform(corpus)
# 戻り値はscipy.sparseの疎行列なので
# これをNumPy配列に変換して出力
X.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]], dtype=int64)

In [2]:
# 単語にマッピングされたインデックスを出力
vectorizer.vocabulary_

{'this': 8,
 'is': 3,
 'the': 6,
 'first': 2,
 'document': 1,
 'second': 5,
 'and': 0,
 'third': 7,
 'one': 4}

In [3]:
# N-grams
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?'
]

vectorizer = CountVectorizer(
    analyzer='word',    # 単語単位のN-gramsを指定
    ngram_range=(2, 2)) # 2-gramsにする
# 変換後の行列を取得
X = vectorizer.fit_transform(corpus)
# 戻り値はscipy.sparseの疎行列なので
# これをNumPy配列に変換して出力
X.toarray()

array([[0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0],
       [0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0],
       [1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0],
       [0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1]], dtype=int64)

In [4]:
# 2単語のつながりにマッピングされたインデックスを出力
vectorizer.vocabulary_

{'this is': 11,
 'is the': 3,
 'the first': 6,
 'first document': 2,
 'this document': 10,
 'document is': 1,
 'the second': 7,
 'second document': 5,
 'and this': 0,
 'the third': 8,
 'third one': 9,
 'is this': 4,
 'this the': 12}

In [5]:
# TF-IDF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?'
]

vectorizer = CountVectorizer()
transformer = TfidfTransformer()
# 変換後の行列を取得
tf = vectorizer.fit_transform(corpus)
tfidf = transformer.fit_transform(tf)
# 戻り値はscipy.sparseの疎行列なので
# これをNumPy配列に変換して出力
tfidf.toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

In [6]:
# Word2Vec
from gensim.models import word2vec
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?'
]
# 文(センテンス)ごとにリストにする
sentence = [d.split() for d in corpus]
# トレーニング
model = word2vec.Word2Vec(
    sentence,
    size=10,     # 単語ベクトルの次元数
    min_count=1, # n回未満登場する単語を破棄
    window=2     # 学習に使う前後の単語数
    )

In [7]:
# 'This'をベクトルに変換
model.wv['This']

array([ 0.03591875, -0.01909131,  0.00052243, -0.02178675,  0.02444721,
        0.01506153,  0.02261374,  0.02219924,  0.01048681, -0.03532327],
      dtype=float32)

In [8]:
# 'is'をベクトルに変換
model.wv['is']

array([ 0.00682613, -0.01287915, -0.03755166, -0.02067324,  0.04232879,
        0.04784242, -0.00212903,  0.01688278, -0.01779048,  0.02457212],
      dtype=float32)

In [9]:
# 'document'に近い単語を抽出
model.wv.most_similar('document')

[('This', 0.4736090302467346),
 ('And', 0.3979097008705139),
 ('Is', 0.3419610261917114),
 ('first', 0.2937809228897095),
 ('is', 0.23243652284145355),
 ('third', 0.22129905223846436),
 ('one.', 0.16492722928524017),
 ('second', 0.13379749655723572),
 ('document?', 0.03554658219218254),
 ('this', -0.022788207978010178)]