In [1]:
import MeCab
import mojimoji


class MecabTokenizer:
    """Tokenize Japanese text with MeCab and keep normalized content-word lemmas.

    Only nouns, verbs, adjectives, interjections, auxiliary verbs and adverbs
    are considered. Dependent (非自立) nouns and verbs, every auxiliary verb
    except 'ない', and the lemma 'ある' are dropped. Each kept lemma is passed
    through ``mojimoji.han_to_zen`` before being returned.
    """

    # Parts of speech (first feature column) whose tokens may be kept.
    _TARGET_POS = frozenset(["名詞", "動詞", "形容詞", "感動詞", "助動詞", "副詞"])
    # Column of the comma-separated feature string that holds the lemma.
    _LEMMA_INDEX = 6

    def __init__(self):
        self.tagger = MeCab.Tagger("-u dict/custom.dic")
        # Parse an empty string once so that node.surface can be read later
        # (workaround for what appears to be a bug in the Python 3 binding).
        self.tagger.parse('')

    def tokenize(self, texts):
        """Tokenize every text in ``texts``.

        Args:
            texts: iterable of strings.

        Returns:
            A list with one token list per input text, in input order.
        """
        return [self.tokenize_single_text(text) for text in texts]

    def tokenize_single_text(self, text):
        """Tokenize one text.

        Args:
            text: the string handed to ``tagger.parseToNode``.

        Returns:
            List of kept lemmas (or surfaces when no lemma is available),
            each converted with ``mojimoji.han_to_zen``.
        """
        word_list = []
        node = self.tagger.parseToNode(text)
        while node:
            word = self._select_word(node)
            if word is not None:
                word_list.append(mojimoji.han_to_zen(word))
            node = node.next
        return word_list

    def _select_word(self, node):
        """Return the word to keep for ``node``, or None if the node is filtered out."""
        features = node.feature.split(',')
        pos = features[0]
        if pos not in self._TARGET_POS:
            return None

        # Feature strings are not guaranteed to carry every column (e.g. entries
        # coming from a user dictionary), so treat a missing column like '*'
        # instead of raising IndexError.
        sub_pos = features[1] if len(features) > 1 else '*'
        lemma = features[self._LEMMA_INDEX] if len(features) > self._LEMMA_INDEX else '*'

        if pos in ('名詞', '動詞') and sub_pos == '非自立':
            return None
        if pos == '助動詞' and lemma != 'ない':
            return None
        if lemma == 'ある':
            return None

        # '*' means no lemma is available; fall back to the surface form.
        return node.surface if lemma == '*' else lemma

In [2]:
import pandas as pd

# Load the corpus file into a DataFrame; later cells read its 'question' column.
data = pd.read_table('/Users/shwld/Downloads/wiki-corpus/jawiki.txt')

In [3]:
import os

try:
    # Recent scikit-learn releases no longer ship sklearn.externals.joblib;
    # prefer the standalone package and fall back for old environments.
    import joblib
except ImportError:
    from sklearn.externals import joblib

print(data['question'].shape)

# Rows per chunk file. Must match the chunk size used when the files are loaded back.
CHUNK_SIZE = 100000
CHUNK_PATH_TEMPLATE = '/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_{start}-{end}'

# The tokenizer does not depend on the chunk, so build it once instead of per iteration.
tokenizer = MecabTokenizer()
total_rows = data['question'].shape[0]

for i in range(0, total_rows, CHUNK_SIZE):
    j = i + CHUNK_SIZE
    filename = CHUNK_PATH_TEMPLATE.format(start=i, end=j)
    if os.path.exists(filename):
        # Chunk already written by an earlier (possibly interrupted) run.
        # Skipping it makes the cell resumable without a hand-edited start offset.
        continue
    print(filename)
    # Release the previous chunk before tokenizing the next one so two chunks
    # are never held in memory at the same time.
    tokenized_sentences = None
    tokenized_sentences = tokenizer.tokenize(data['question'][i:j])
    joblib.dump(tokenized_sentences, filename)

(6796444,)
/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_5300000-5400000
/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_5400000-5500000
/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_5500000-5600000
/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_5600000-5700000
/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_5700000-5800000
/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_5800000-5900000
/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_5900000-6000000
/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_6000000-6100000
/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_6100000-6200000
/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_6200000-6300000
/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_6300000-6400000
/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_6400000-6500000
/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_6500000-6600000
/

In [6]:
import os

try:
    # Recent scikit-learn releases no longer ship sklearn.externals.joblib;
    # prefer the standalone package and fall back for old environments.
    import joblib
except ImportError:
    from sklearn.externals import joblib

from gensim import corpora, models, similarities

# Rows per chunk file; must match the size used when the chunks were written.
CHUNK_SIZE = 100000
total_rows = data['question'].shape[0]

tokenized_sentences = []
missing_chunks = []
for i in range(0, total_rows, CHUNK_SIZE):
    j = i + CHUNK_SIZE
    filepath = '/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_{start}-{end}'.format(start=i, end=j)
    print(filepath)
    if os.path.exists(filepath):
        # extend() appends in place; `list + list` would copy the whole
        # accumulated list again for every chunk.
        tokenized_sentences.extend(joblib.load(filepath))
    else:
        missing_chunks.append(filepath)

# A missing chunk shifts every later document id relative to the rows of `data`.
# The recorded run loaded 6696444 sentences from 6796444 rows, i.e. one whole
# chunk was absent without any notice, so report it explicitly.
if missing_chunks:
    print('WARNING: {} chunk file(s) missing, corpus is incomplete:'.format(len(missing_chunks)))
    for missing_path in missing_chunks:
        print('  ' + missing_path)

len(tokenized_sentences)

/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_0-100000
/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_100000-200000
/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_200000-300000
/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_300000-400000
/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_400000-500000
/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_500000-600000
/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_600000-700000
/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_700000-800000
/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_800000-900000
/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_900000-1000000
/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_1000000-1100000
/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_1100000-1200000
/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list_1200000-1300000
/Users/shwld/Downloads/wiki-corpus/t

6696444

In [None]:
# joblib.dump(tokenized_sentences, '/Users/shwld/Downloads/wiki-corpus/tokenized_sentences_list')

In [None]:
from gensim import models
from gensim import corpora, models, similarities

In [None]:
# tokenized_sentences = [
#     ['アンパサンド', '（，', '＆）', '意味', 'する', '記号', 'ラテン語', '合', '字', 'Ｔｒｅｂｕｃｈｅｔ', 'ＭＳ', 'フォント', '表示', 'する', 'れる', '”', 'ｅｔ', '”', '合', '字', '容易', 'わかる', 'ａｍｐｅｒｓａ', '”', 'ａｎｄ', 'ｐｅｒ', 'ｓｅ', 'ａｎｄ', '”、', '意味', '”', 'ａｎｄ', '［', 'ｔｈｅ', 'ｓｙｍｂｏｌ', 'ｗｈｉｃｈ', '］', 'ｂｙ', 'ｉｔｓｅｌｆ', '［', 'ｉｓ', '］', 'ａｎｄ', '”'],
#     ['使用', '１', '世紀', '遡る', 'できる', '（', '１', '）、', '５', '世紀', '中葉', '（', '２', '，', '３', '）', '現代', '（', '４', '−', '６', '）', '至る', '変遷', 'わかる'],
#     ['Ｚ', '続く', 'ラテン', '文字', 'アルファベット', '２７', '字', '目', 'する', 'れる', '時期'],
#     ['アンパサンド', '役割', '果たす', '文字', 'ｅｔ', '呼ぶ', 'れる', '数字', '７', '似る', '記号', '（，', 'Ｕ', '＋', '２０４', 'Ａ', '）。', '記号', '現在', 'ゲール', '文字', '使う', 'れる'],
#     ['記号', '名', 'アンパサンド', 'ラテン語', 'まじる', '英語', '「＆', 'それ', '自身', '”', 'ａｎｄ', '”', '表す', '（＆', 'ｐｅｒ', 'ｓｅ', 'ａｎｄ', '）', 'くずれる', '形', '英語', '言語', '名称', '多様'],
#     ['日常', '的', '手書き', '場合', '欧米', 'アンパサンド', '縦', '線', '引く', '単純', '化', 'する', 'れる', '使う', 'れる'],
#     ['同様', 'ｔ', '「＋（', 'プラス', '輪', '重ねる', '無声', '歯茎', '側面', '摩擦音', '示す', '発音', '記号', '使う', 'れる'],
#     ['プログラミング', '言語', 'Ｃ', '多数', '言語', 'ＡＮＤ', '演算', '子', '用いる', 'られる', 'Ｃ', '例'],
#     ['ＰＨＰ', '変数', '宣言', '記号', '（＄）', '直前', '記述', 'する', '参照', '渡し', '行う', 'できる'],
#     ['ＢＡＳＩＣ', '系列', '言語', '文字', '列', '連結', '演算', '子', '使用', 'する', 'れる', 'ｃｏｄｉｃｅ', '＿', '４', 'ｃｏｄｉｃｅ', '＿', '５', '返す', '主', 'マイクロソフト', '系', '整数', '十', '六', '進', '表記', 'ｃｏｄｉｃｅ', '＿', '６', '用いる', 'ｃｏｄｉｃｅ', '＿', '７', '十', '進', '１５', '表現', 'する']
# ]

In [None]:
# Build the token <-> integer id mapping from the tokenized sentences and persist it.
gensim_dictionary = corpora.Dictionary(documents=tokenized_sentences)
gensim_dictionary.save('/Users/shwld/Downloads/wiki-corpus/gensim.dict')

In [None]:
# Convert every tokenized sentence into its bag-of-words (token id, count) form.
id_corpus = [
    gensim_dictionary.doc2bow(tokens)
    for tokens in tokenized_sentences
]
# Persist the converted corpus in Matrix Market format.
corpora.MmCorpus.serialize('/Users/shwld/Downloads/wiki-corpus/gensim.mm', id_corpus)

In [None]:
# Fit the TF-IDF model on the bag-of-words corpus and save it.
tfidf = models.TfidfModel(corpus=id_corpus)
tfidf.save('/Users/shwld/Downloads/wiki-corpus/gensim.tfidf')

# Re-weight the corpus with the fitted model.
tfidf_corpus = tfidf[id_corpus]

In [None]:
# Fit a 10-topic LSI model on the TF-IDF-weighted corpus and save it.
lsi = models.LsiModel(
    corpus=tfidf_corpus,
    id2word=gensim_dictionary,
    num_topics=10,
)
lsi.save('/Users/shwld/Downloads/wiki-corpus/gensim.lsi')

# Project the TF-IDF corpus into the LSI topic space.
lsi_corpus = lsi[tfidf_corpus]

In [None]:
# Sanity check: size of the LSI-transformed corpus.
len(lsi_corpus)

In [None]:
# Example query, given as a list of already-tokenized terms.
queries = ['マイクロソフト', '演算']
# Convert the terms to a bag-of-words vector with the dictionary built from the corpus.
query_vector = gensim_dictionary.doc2bow(queries)

In [None]:
# The LSI model was fitted on TF-IDF-weighted vectors, so the raw bag-of-words
# query must pass through the same TF-IDF step before being projected.
# Otherwise the query and the indexed documents live in differently scaled spaces.
vec_lsi = lsi[tfidf[query_vector]]
vec_lsi

In [None]:
# num_features is the dimensionality of the indexed vectors (the number of LSI
# topics), not the number of documents in the corpus.
lsi_index = similarities.SparseMatrixSimilarity(lsi_corpus, num_features=lsi.num_topics)
# Save under its own file name: writing to 'gensim.lsi' would overwrite the
# LSI model that was saved to that path earlier.
lsi_index.save('/Users/shwld/Downloads/wiki-corpus/gensim.index')

In [None]:
# Score every indexed document against the query, then rank (doc_id, score)
# pairs by descending similarity.
sims = lsi_index[vec_lsi]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
# Show only the best matches; echoing the full ranking would dump one tuple
# per corpus document into the notebook output.
sims[:10]

参照元
- http://kensuke-mi.hatenablog.com/entry/20131021/1382384297
- gensimでcosine_similarity
  - http://blog.yuku-t.com/entry/20110623/1308810518