# ライブラリのインストール

In [9]:
!pip install janome

Collecting janome
[?25l  Downloading https://files.pythonhosted.org/packages/a8/63/98858cbead27df7536c7e300c169da0999e9704d02220dc6700b804eeff0/Janome-0.4.1-py2.py3-none-any.whl (19.7MB)
[K     |████████████████████████████████| 19.7MB 2.7MB/s 
[?25hInstalling collected packages: janome
Successfully installed janome-0.4.1


# サンプルデータ

In [10]:
# Toy corpus: five short Japanese sentences that are already segmented into
# words, with a single space between tokens.
documents = [
    "私 は 野球 が 好き",
    "私 は サッカー が 好き サッカー 選手 は かっこいい サッカー 選手 が 好き",
    "僕 は テニス が 得意",
    "僕 は 車 が 好き",
    "私 は 野球 が 苦手",
]

# sklearnのTF-IDFの例

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Raw term counts per document. The custom token_pattern accepts tokens of a
# single word character (e.g. the particles 'は' and 'が'); scikit-learn's
# default pattern only matches tokens of two or more word characters.
cv = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
tf = cv.fit_transform(documents)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases so the
# cell runs on both.
if hasattr(cv, 'get_feature_names_out'):
    features = list(cv.get_feature_names_out())
else:
    features = cv.get_feature_names()

# TfidfTransformer accepts the sparse count matrix directly, so there is no
# need to densify it first. Defaults are kept, so rows come out L2-normalised
# with smoothed IDF.
ttf = TfidfTransformer()
tfidf = ttf.fit_transform(tf)

# One row per document, one column per vocabulary term.
df = pd.DataFrame(tfidf.toarray(), columns=features)
df

Unnamed: 0,かっこいい,が,は,サッカー,テニス,僕,好き,得意,私,苦手,車,選手,野球
0,0.0,0.336767,0.336767,0.0,0.0,0.0,0.473314,0.0,0.473314,0.0,0.0,0.0,0.570196
1,0.235317,0.224259,0.224259,0.70595,0.0,0.0,0.315189,0.0,0.157594,0.0,0.0,0.470633,0.0
2,0.0,0.270418,0.270418,0.0,0.567502,0.457857,0.0,0.567502,0.0,0.0,0.0,0.0,0.0
3,0.0,0.298192,0.298192,0.0,0.0,0.504883,0.419099,0.0,0.0,0.0,0.62579,0.0,0.0
4,0.0,0.298192,0.298192,0.0,0.0,0.0,0.0,0.0,0.419099,0.62579,0.0,0.0,0.504883


# gensimのTF-IDFの例

In [12]:
from gensim.utils import simple_preprocess as sp
from gensim import corpora
from gensim import models
import numpy as np

# Tokenise every document once and reuse the result for both the dictionary
# and the bag-of-words corpus. min_len=1 keeps single-character tokens.
tokenized = [sp(line, min_len=1) for line in documents]
dct = corpora.Dictionary(tokenized)
corpus = [dct.doc2bow(tokens) for tokens in tokenized]
tfidf = models.TfidfModel(corpus)

# Flatten the model output into (document id, word, weight) records.
# Building the frame from a list of tuples keeps ID as an integer and the
# weight as a float; going through np.array() on mixed-type rows would turn
# every value into a string.
records = [
    (doc_id, dct[token_id], np.around(weight, decimals=6))
    for doc_id, doc in enumerate(tfidf[corpus])
    for token_id, weight in doc
]
df = pd.DataFrame(records, columns=['ID', '単語', 'TF-IDF'])

# Reshape to a document x word table. Each (ID, word) pair occurs at most
# once in the records, so a plain pivot is enough and no aggregation function
# is needed. Words absent from a document get weight 0.
xdf = (
    df.pivot(index='ID', columns='単語', values='TF-IDF')
      .fillna(0)
      .reset_index()
)
xdf

単語,ID,かっこいい,サッカー,テニス,僕,好き,得意,私,苦手,車,選手,野球
0,0,0.0,0.0,0.0,0.0,0.437792,0.0,0.437792,0.0,0.0,0.0,0.785287
1,1,0.262579,0.787738,0.0,0.0,0.166682,0.0,0.083341,0.0,0.0,0.525159,0.0
2,2,0.0,0.0,0.655949,0.373447,0.0,0.655949,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.476949,0.265896,0.0,0.0,0.0,0.837747,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.265896,0.837747,0.0,0.0,0.476949


# pandasでTF-IDFをスクラッチ実装

In [14]:
import pandas as pd
import numpy as np
import re
from janome.tokenizer import Tokenizer
tnz = Tokenizer()

# Re-tokenise the corpus with janome. The whitespace between the
# pre-segmented words is removed first so that janome performs the
# segmentation itself; each token is recorded with its surface form and its
# part-of-speech string.
records = [
    (doc_id, token.surface, token.part_of_speech)
    for doc_id, line in enumerate(documents)
    for token in tnz.tokenize(re.sub(r'\s', '', line))
]
# Building the frame from tuples keeps ID as an integer (np.array() on
# mixed-type rows would turn it into a string).
df = pd.DataFrame(records, columns=['ID', '単語', '品詞'])

# A term is identified by its surface form together with its part of speech.
term_keys = ['単語', '品詞']

# TF: number of occurrences of each term in each document.
xdf = df.groupby(['ID'] + term_keys).size().reset_index(name='TF')
# DF: number of distinct documents in which each term occurs.
xdf2 = df.groupby(term_keys)['ID'].nunique().reset_index(name='DF')
# Attach DF to every (document, term) row.
xdf = pd.merge(xdf, xdf2, how='inner', on=term_keys)

# Number of documents that contain at least one token.
n_docs = xdf['ID'].nunique()

# IDF formula to use:
#   'sklearn' : ln((N + 1) / (DF + 1)) + 1
#   'gensim'  : log2(N / DF)
# NOTE(review): the table saved with this cell has zeros for 'が' and 'は' and
# matches the gensim table, so it appears to have been produced with the
# 'gensim' formula; with 'sklearn' the result should instead match the
# scikit-learn table. Confirm and re-run.
IDF_SCHEME = 'sklearn'
if IDF_SCHEME == 'sklearn':
    xdf['IDF'] = np.log((n_docs + 1) / (xdf['DF'] + 1)) + 1
elif IDF_SCHEME == 'gensim':
    xdf['IDF'] = np.log2(n_docs / xdf['DF'])
else:
    raise ValueError("IDF_SCHEME must be 'sklearn' or 'gensim': %r" % (IDF_SCHEME,))

# TF-IDF
xdf['TF-IDF'] = xdf['TF'] * xdf['IDF']

# Squared TF-IDF, used for the per-document L2 norm below.
xdf['TF-IDF^2'] = xdf['TF-IDF'] ** 2

# TF-IDF divided by the L2 norm of its document's TF-IDF values.
xdf['TF-IDF_l2'] = xdf['TF-IDF'] / np.sqrt(
    xdf.groupby('ID')['TF-IDF^2'].transform('sum')
)

# Document x word table. 'sum' is a real reduction: it returns the single
# value when a (document, word) pair is unique and still works if the same
# surface form appears with more than one part of speech in a document (the
# weights are then added into one column). Missing pairs become 0.
xxdf = (
    pd.crosstab(xdf['ID'], xdf['単語'], values=xdf['TF-IDF_l2'], aggfunc='sum')
      .fillna(0)
      .reset_index()
)
xxdf

単語,ID,かっこいい,が,は,サッカー,テニス,僕,好き,得意,私,苦手,車,選手,野球
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.437792,0.0,0.437792,0.0,0.0,0.0,0.785287
1,1,0.262579,0.0,0.0,0.787738,0.0,0.0,0.166682,0.0,0.083341,0.0,0.0,0.525159,0.0
2,2,0.0,0.0,0.0,0.0,0.655949,0.373447,0.0,0.655949,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.476949,0.265896,0.0,0.0,0.0,0.837747,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.265896,0.837747,0.0,0.0,0.476949
