# Doc2Vecの初期動作確認

こちらを参考にしました。


- [Doc2Vec Tutorial on the Lee Dataset](https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb)

仮決めのパラメータで、質問文に対する類似度が返ってくるところまで確認できました。

## (1) テストデータ／環境準備

In [1]:
'''
    Prepare the test environment: make the `learning` directory importable
    and run the rest of the notebook from it.
'''
import os
import sys

# Two levels above the notebook's working directory
# (expected to be donusagi-bot/learning).
learning_dir = os.path.abspath("../../")

# Register the directory on the import path only once, so re-running the
# cell does not add duplicates.
if learning_dir not in sys.path:
    sys.path.append(learning_dir)

# Relative paths used by later cells are resolved from this directory.
os.chdir(learning_dir)

## (2) Doc2Vecの動作確認

### (2-1) コーパス生成

既存実装どおり、Nlangクラスを使い、コーパス（単語が半角スペースで区切られた文字列）を生成します。

In [2]:
import numpy as np

from learning.core.learn.learning_parameter import LearningParameter
from learning.core.datasource import Datasource

_bot_id = 9  # bot_id 9 is Septeni
# Settings handed to LearningParameter below. The meaning of each key is
# defined by that class, which is not visible in this notebook.
attr = {
    'include_failed_data': False,
    'include_tag_vector': False,
    'classify_threshold': 0.5,
    'algorithm': LearningParameter.ALGORITHM_LOGISTIC_REGRESSION,
    'params_for_algorithm': {'C': 140},
    'excluded_labels_for_fitting': None
}

# Wrap the raw settings in the project's parameter object.
learning_parameter = LearningParameter(attr)

In [3]:
# Read the training messages for the selected bot from the CSV datasource.
_datasource = Datasource(type='csv')
learning_training_messages = _datasource.learning_training_messages(_bot_id)
# Keep the 'question' and 'answer_id' columns as NumPy arrays.
# Both come from the same result, so they are presumably aligned row by row.
questions = np.array(learning_training_messages['question'])
answer_ids = np.array(learning_training_messages['answer_id'])

2017/05/15 PM 04:19:57 ['./fixtures/learning_training_messages/benefitone.csv', './fixtures/learning_training_messages/ptna.csv', './fixtures/learning_training_messages/septeni.csv', './fixtures/learning_training_messages/toyotsu_human.csv']
2017/05/15 PM 04:19:57 ['./fixtures/question_answers/toyotsu_human.csv']


In [4]:
from learning.core.nlang import Nlang

# NOTE(review): `questions` is already a NumPy array, so this creates a copy.
_sentences = np.array(questions)
# Split every question with Nlang; the result is expected to hold one string
# per question with its words separated by half-width spaces.
_separated_sentences = Nlang.batch_split(_sentences)

### (2-2) コーパスにタグ付け

gensim の `models.doc2vec` の仕様に従い、各文書を `TaggedDocument`（単語リストとタグの組）に変換します。

In [5]:
from gensim import models
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

In [6]:
def doc_to_sentence(sentences, name):
    """Build a ``TaggedDocument`` from one space-separated string.

    Args:
        sentences: A single string whose words are separated by one
            half-width space.
        name: Value used as the document's only tag.

    Returns:
        A ``TaggedDocument`` whose ``words`` is the list produced by
        splitting ``sentences`` on ``' '`` and whose ``tags`` is ``[name]``.
    """
    tokens = sentences.split(' ')
    tag_list = [name]
    return TaggedDocument(words=tokens, tags=tag_list)

def corpus_to_sentences(separated_sentences, answer_ids):
    """Lazily yield one tagged document per (sentence, answer id) pair.

    Args:
        separated_sentences: Iterable of strings, each holding the words of
            one document separated by half-width spaces.
        answer_ids: Iterable of tags, paired positionally with
            ``separated_sentences``.

    Yields:
        The result of ``doc_to_sentence(sentence, answer_id)`` for each pair.
        Pairing stops at the end of the shorter input.
    """
    # The positional index was never used, so plain zip() is sufficient.
    for doc, name in zip(separated_sentences, answer_ids):
        yield doc_to_sentence(doc, name)

### (2-3) 学習処理／モデルのシリアライズ

In [7]:
# Pair each word-split question with its answer id as a tagged document.
# This is a generator, so nothing is produced until it is iterated.
sentences = corpus_to_sentences(_separated_sentences, answer_ids)

In [8]:
# Materialize the generator: it can be consumed only once, but the documents
# are passed to both build_vocab() and train().
sentence_list = list(sentences)

In [9]:
# Create an untrained model with provisional hyperparameters.
# NOTE(review): `size` and `iter` are the keyword names of the gensim release
# used here; later releases call them `vector_size` and `epochs` - confirm
# against the installed version before re-running.
model = Doc2Vec(size=500, min_count=1, iter=200)

In [10]:
# Build the vocabulary from the tagged documents before training.
model.build_vocab(sentence_list)

In [11]:
# Train on the same documents; the number recorded as this cell's output is
# the value returned by train().
# NOTE(review): newer gensim releases require `total_examples` and `epochs`
# to be passed explicitly to train() - confirm against the installed version.
model.train(sentence_list)

13681544

In [12]:
# Relative path, resolved from the current working directory.
model_path = 'prototype/better_algorithm/doc2vec.model'

# Serialize the trained model to disk.
model.save(model_path)

### (2-4) 予測処理

In [13]:
# Reload the model from disk so prediction runs against the serialized copy.
loaded_model = models.Doc2Vec.load(model_path)
# Bare expression: the notebook echoes the object's repr as the cell output.
loaded_model

<gensim.models.doc2vec.Doc2Vec at 0x10c3fdba8>

In [14]:
# Number of entries in the model's docvecs; used as `topn` in the prediction
# cells so that most_similar() is asked for the full ranking.
n_tops = len(loaded_model.docvecs)

In [15]:
'''
    Build the corpora that are fed to prediction
'''
# Each test query is a list of already-split words.
test_corpus1 = ['情報','システム','アドレス']
test_corpus2 = ['マウス','破損']

In [16]:
'''
    Information-system address (correct answer = 7040)
'''
# Infer a vector for the query words using the loaded model.
inferred_vector1 = loaded_model.infer_vector(test_corpus1)
# Rank the stored document vectors by similarity to the inferred vector.
# Bare expression: the notebook shows the (tag, similarity) list as output.
loaded_model.docvecs.most_similar([inferred_vector1], topn=n_tops)

[(7040, 0.59112149477005),
 (7065, 0.5866346955299377),
 (4588, 0.5161018371582031),
 (4567, 0.5007367134094238),
 (4478, 0.4670105576515198),
 (4511, 0.4551074504852295),
 (4555, 0.450093537569046),
 (4494, 0.4477253258228302),
 (4473, 0.4360285997390747),
 (4596, 0.42791473865509033),
 (4450, 0.4272078275680542),
 (4458, 0.4252215325832367),
 (7084, 0.41536208987236023),
 (4444, 0.4025437831878662),
 (4597, 0.401836633682251),
 (4603, 0.3993605375289917),
 (4434, 0.39340153336524963),
 (4522, 0.39227989315986633),
 (7042, 0.3907272219657898),
 (4528, 0.38936978578567505),
 (4613, 0.3891337513923645),
 (7037, 0.38615551590919495),
 (7068, 0.38488033413887024),
 (4591, 0.3814826011657715),
 (4501, 0.38123849034309387),
 (4539, 0.3800704777240753),
 (4566, 0.3742132782936096),
 (4594, 0.3739563226699829),
 (4472, 0.3694353997707367),
 (4565, 0.3614947199821472),
 (4420, 0.34803181886672974),
 (4538, 0.3437162935733795),
 (4454, 0.3435979187488556),
 (4608, 0.3423529863357544),
 (7064, 0

In [17]:
'''
    Broken mouse (correct answer = 4458)
'''
# Infer a vector for the query words using the loaded model.
inferred_vector2 = loaded_model.infer_vector(test_corpus2)
# Rank the stored document vectors by similarity to the inferred vector.
# Bare expression: the notebook shows the (tag, similarity) list as output.
loaded_model.docvecs.most_similar([inferred_vector2], topn=n_tops)

[(4458, 0.7136568427085876),
 (4530, 0.6769453287124634),
 (4459, 0.6686059236526489),
 (7068, 0.49634963274002075),
 (4598, 0.487174391746521),
 (7065, 0.48263728618621826),
 (4525, 0.48219504952430725),
 (4526, 0.4750104546546936),
 (4581, 0.4738618731498718),
 (4494, 0.47166192531585693),
 (4566, 0.468356192111969),
 (4472, 0.4671318233013153),
 (4520, 0.46611732244491577),
 (7042, 0.465604305267334),
 (4534, 0.4645758867263794),
 (7037, 0.46388915181159973),
 (4587, 0.4633786082267761),
 (4615, 0.46283069252967834),
 (4608, 0.45121896266937256),
 (4498, 0.44793862104415894),
 (7044, 0.4473431706428528),
 (4536, 0.44593361020088196),
 (4542, 0.44139474630355835),
 (4533, 0.4345661997795105),
 (4588, 0.43250733613967896),
 (4535, 0.4302879273891449),
 (4507, 0.42976224422454834),
 (4573, 0.42930853366851807),
 (4478, 0.4287220239639282),
 (4434, 0.42773357033729553),
 (4532, 0.4242286682128906),
 (4601, 0.4239875078201294),
 (4602, 0.4228079915046692),
 (4549, 0.419736385345459),
 (4