# 単一featureの質問から、不適切な回答が選択されてしまう件の調査

質問文から抽出されるfeatureが１件しかない場合、該当featureが含まれる回答が、かなり高いprobabilityで選択されます。

この場合、（当初関係性があると考えていた）回答候補の数と、probabilityの高さとの間には、関係性がないことが判明しています。

回答候補が複数ある場合でも、サンプル間でTF-IDFのばらつきが生じると、１件のクラスに回答が誘導されてしまう動きとなるようです。

（probabilityが、回答候補のクラスにバラけず、１件のクラスへ偏りが生じてしまいます）

## (1) 調査用の環境準備

In [2]:
'''
    テスト環境を準備するためのモジュールを使用します。
'''
import sys
import os
# Resolve the directory two levels above the current working directory and
# switch into it, so that relative paths used later resolve from there.
learning_dir = os.path.abspath("../../") #<--- donusagi-bot/learning
os.chdir(learning_dir)

# Add the directory to the module search path once (guarded so that re-running
# the cell does not append duplicates) — presumably needed for the
# `learning.*` imports in the following cells.
if learning_dir not in sys.path:
    sys.path.append(learning_dir)

# Bare expression: evaluates to the resolved path (displayed as the cell result).
learning_dir

'/Users/makmorit/GitHub/donusagi-bot/learning'

In [3]:
import numpy as np
from learning.core.learn.bot import Bot
from learning.core.learn.learning_parameter import LearningParameter



In [4]:
_bot_id = 9  # bot_id 9 is Septeni
# Settings handed to LearningParameter: logistic regression with C=140 and a
# classification threshold of 0.5; failed data and tag vectors are not included.
attr = {
    'include_failed_data': False,
    'include_tag_vector': False,
    'classify_threshold': 0.5,
    'algorithm': LearningParameter.ALGORITHM_LOGISTIC_REGRESSION,
    'params_for_algorithm': {'C': 140},
    'excluded_labels_for_fitting': None
}

learning_parameter = LearningParameter(attr)

In [5]:
from learning.core.datasource import Datasource

# Load the training messages for the bot from the CSV-backed datasource.
_datasource = Datasource(type='csv')
learning_training_messages = _datasource.learning_training_messages(_bot_id)
# Split into parallel arrays: question text and the answer ID it is labelled with.
questions = np.array(learning_training_messages['question'])
answer_ids = np.array(learning_training_messages['answer_id'])

2017/05/12 PM 05:54:14 ['./fixtures/learning_training_messages/benefitone.csv', './fixtures/learning_training_messages/ptna.csv', './fixtures/learning_training_messages/septeni.csv', './fixtures/learning_training_messages/toyotsu_human.csv']
2017/05/12 PM 05:54:14 ['./fixtures/question_answers/toyotsu_human.csv']


In [6]:
from learning.core.predict.reply import Reply
# Number of blank samples appended to the training set below.
COUNT_OF_APPEND_BLANK = 3

# Force-append blank training samples so that empty text is associated with label 0
questions = np.append(questions, [''] * COUNT_OF_APPEND_BLANK)
answer_ids = np.append(answer_ids, [Reply.CLASSIFY_FAILED_ANSWER_ID] * COUNT_OF_APPEND_BLANK)

In [7]:
from learning.core.nlang import Nlang

# Split every question with Nlang.batch_split; the result is what the
# vectorizers below are fitted on.
_sentences = np.array(questions)
_separated_sentences = Nlang.batch_split(_sentences)

## (2) featureのカウント値とTF-IDF値を取得

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# token_pattern is set so that single-character tokens are not excluded
idf_vectorizer = TfidfVectorizer(use_idf=True, token_pattern=u'(?u)\\b\\w+\\b')
# Fit on the separated sentences; _idf_X is the samples x features TF-IDF matrix.
_idf_X = idf_vectorizer.fit_transform(_separated_sentences)

print("TfidfVectorizer: samples=%d, features=%d" % _idf_X.shape)

TfidfVectorizer: samples=16291, features=1154


In [10]:
# token_pattern is set so that single-character tokens are not excluded
cnt_vectorizer = CountVectorizer(token_pattern=u'(?u)\\b\\w+\\b')
# Fit on the same separated sentences; _count_X holds raw term counts per sample.
_count_X = cnt_vectorizer.fit_transform(_separated_sentences)

print("CountVectorizer: samples=%d, features=%d" % _count_X.shape)

CountVectorizer: samples=16291, features=1154


### 「やめる」の出現件数は全サンプルで６件のみ

In [58]:
count_all = np.sum(_count_X, axis=0)
np.array(count_all)[0][124]

6

### 「やめる」の回答候補クラス数は２件

In [52]:
def research_prefferd_answer_id(count_vector, word_index, labels=None):
    '''
        Return the distinct answer IDs of the samples that contain a given word.

        count_vector: sparse samples x features count matrix (it must support
                      ``.T``, row indexing and ``.toarray()``).
        word_index:   feature (column) index of the word to look up.
        labels:       answer IDs aligned with the rows of count_vector.
                      When omitted, the notebook-level ``answer_ids`` array
                      is used, which keeps existing two-argument calls working.

        Returns the unique answer IDs in ascending order (np.unique).
    '''
    if labels is None:
        # Backward-compatible default: fall back to the notebook global.
        labels = answer_ids
    labels = np.asarray(labels)

    # Row `word_index` of the transposed matrix holds the per-sample counts
    # of the word; toarray()[0] flattens that single row to 1-D.
    occurrences = count_vector.T[word_index].toarray()[0]

    return np.unique(labels[occurrences != 0])

In [53]:
word_index = 124
research_prefferd_answer_id(_count_X, word_index)

array([4572, 4579])

### 「やめる」のクラスにおける出現頻度を調査

「やめる」は、4579 では出現回数が最も少ない単語（2回）のひとつであり、4572 では出現回数が２番目に少ないグループ（4回）に含まれています

In [17]:
def get_item_from_vocabulary(vocabulary, index):
    '''
        Reverse lookup in a word -> index mapping.

        Returns the first word (in the mapping's iteration order) whose value
        equals index, or None when no word is mapped to that index.
    '''
    matching_words = (
        word for word, position in vocabulary.items() if position == index
    )
    return next(matching_words, None)

def print_word_count(vectorizer, count_vector, answer_ids, target_answer_id, n_top=20):
    '''
        List the least frequent words among the samples of one answer class.

        Despite the name, nothing is printed; the list is returned.

        vectorizer:       fitted vectorizer exposing ``vocabulary_``
                          (word -> feature index).
        count_vector:     sparse samples x features count matrix.
        answer_ids:       answer IDs aligned with the rows of count_vector.
        target_answer_id: answer class whose samples are summed up.
        n_top:            maximum number of entries to return (default 20).

        Returns a list of (feature_index, word, count) tuples for every word
        with a positive count, sorted by count ascending. Words with the same
        count keep ascending feature-index order.
    '''
    # Invert word -> index once instead of scanning the whole vocabulary
    # for every feature.
    index_to_word = {index: word for word, index in vectorizer.vocabulary_.items()}

    # Rows (samples) labelled with the requested answer class.
    rows = np.flatnonzero(np.asarray(answer_ids) == target_answer_id)

    # Column-wise total per feature; ravel() flattens the 1 x n_features
    # result that sparse matrices return.
    totals = np.asarray(count_vector[rows].sum(axis=0)).ravel()

    word_count_info = [
        (index, index_to_word.get(index), count)
        for index, count in enumerate(totals)
        if count > 0
    ]

    # sorted() is stable, so ties stay in feature-index order.
    return sorted(word_count_info, key=lambda item: item[2])[:n_top]

In [18]:
'''
    単語の出現数を、少ないもの順に20件リスト
'''
print_word_count(cnt_vectorizer, _count_X, answer_ids, 4579)

[(83, 'どうにか', 2),
 (96, 'なんとか', 2),
 (124, 'やめる', 2),
 (127, 'やる', 2),
 (153, 'イヤ', 2),
 (281, 'ドライブ', 2),
 (480, '停止', 2),
 (504, '共同', 2),
 (590, '困難', 2),
 (612, '変更', 2),
 (682, '手段', 2),
 (815, '状態', 2),
 (872, '策', 2),
 (929, '解決', 2),
 (931, '解除', 2),
 (276, 'ドキュメント', 4),
 (634, '対処', 4),
 (74, 'できる', 6),
 (200, 'グーグル', 6),
 (785, '法', 6)]

In [19]:
print_word_count(cnt_vectorizer, _count_X, answer_ids, 4572)

[(15, 'いる', 2),
 (94, 'なる', 2),
 (272, 'データ', 2),
 (274, 'データファイル', 2),
 (480, '停止', 2),
 (825, '画面', 2),
 (832, '発生', 2),
 (859, '移す', 2),
 (863, '移行', 2),
 (901, '良い', 2),
 (5, 'いい', 4),
 (74, 'できる', 4),
 (108, 'ほしい', 4),
 (124, 'やめる', 4),
 (170, 'エラー', 4),
 (416, '中', 4),
 (538, '動かす', 4),
 (722, '教える', 4),
 (771, '欲しい', 4),
 (791, '消す', 4)]

### 各クラスのTF-IDF値を確認

「やめる」に関しては、4579 の TF-IDF は 0.766、対する 4572 の TF-IDF は 0.647〜0.712 と、いずれもかなり高い値でした。

また、クラス間で TF-IDF 値に差ができています。

In [29]:
def research_tfidf_and_count(idf_vector, count_vector, answer_ids, word_index):
    '''
        Collect the TF-IDF value and raw count of one word for every sample
        in which the word occurs.

        idf_vector / count_vector: sparse samples x features matrices holding
        TF-IDF values and term counts; answer_ids is aligned with their rows.

        Returns a list of (answer_id, sample_index, tfidf, count) tuples in
        sample order, restricted to samples whose count is non-zero.
    '''
    # Row `word_index` of each transposed matrix is that word's column,
    # i.e. one value per sample.
    tfidf_column = idf_vector.T[word_index].toarray()[0]
    count_column = count_vector.T[word_index].toarray()[0]

    return [
        (answer_id, row, tfidf_column[row], count_column[row])
        for row, answer_id in enumerate(answer_ids)
        if count_column[row] != 0
    ]

word_index = 124
research_tfidf_and_count(_idf_X, _count_X, answer_ids, word_index)

[(4572, 5435, 0.64743186496629812, 1),
 (4572, 5448, 0.7126081530230749, 1),
 (4579, 5739, 0.76637020521093568, 1),
 (4572, 12356, 0.64743186496629812, 1),
 (4572, 12369, 0.7126081530230749, 1),
 (4579, 12660, 0.76637020521093568, 1)]

### その他のケースで検証

本件「やめる」と同じような単語の出現の仕方をする単語はどうでしょうか？

「どうにか」で見てみます。

In [54]:
word_index = 83
research_prefferd_answer_id(_count_X, word_index)

array([4579])

In [55]:
research_tfidf_and_count(_idf_X, _count_X, answer_ids, word_index)

[(4579, 5737, 0.78221626832693536, 1), (4579, 12658, 0.78221626832693536, 1)]

回答候補が１件しかなく、「やめる」と同様、TF-IDF値がかなり大きくなっています。

それでは、同じように単語出現回数自体が少ない「ドライブ」はどうでしょうか？

In [56]:
word_index = 281
research_prefferd_answer_id(_count_X, word_index)

array([4445, 4450, 4454, 4518, 4579])

In [57]:
research_tfidf_and_count(_idf_X, _count_X, answer_ids, word_index)[0:10] # 多いので最初の10件だけ表示します

[(4450, 34, 0.48400465049577085, 1),
 (4454, 36, 0.49781150257651102, 1),
 (4454, 37, 0.49781150257651102, 1),
 (4518, 165, 0.4454767853538773, 1),
 (4518, 166, 0.53482069438293867, 1),
 (4450, 2137, 0.57549399783831634, 1),
 (4450, 2138, 0.59598931471531102, 1),
 (4450, 2139, 0.47040740573022538, 1),
 (4450, 2140, 0.38389874679922364, 1),
 (4450, 2141, 0.41421172805650591, 1)]

このケースでは、回答候補が５件あるため、TF-IDF値は概して小さくなっています。

また、こちらもクラス間で TF-IDF 値にばらつきができています。

### 各ケースで予測結果を比較

「どうにか」では「やめる」と同様に 4579 が、「ドライブ」では 4518 が、いずれもかなり高率の probability で回答されてしまうことが確認できます。

回答候補の数と、probability 値の高さには、関連性がないようです。

In [33]:
import numpy as np
from learning.core.persistance import Persistance
from learning.core.predict.model_not_exists_error import ModelNotExistsError

In [34]:
def load_model_and_vectorizer(bot_id):
    '''
        Restore the trained model and its vectorizer for the given bot.

        Returns an (estimator, vectorizer) tuple loaded through Persistance.
        Raises ModelNotExistsError when either load fails with IOError.
    '''
    try:
        # The model is loaded first, then the vectorizer.
        return Persistance.load_model(bot_id), Persistance.load_vectorizer(bot_id)
    except IOError:
        raise ModelNotExistsError()

In [35]:
bot_id = 9  # bot_id 9 is Septeni

# Restore the trained model and report the shape of its weight table:
# coef_ rows are counted as answers, coef_ columns as features.
estimator, vectorizer = load_model_and_vectorizer(bot_id)
print("n_answer=%d, n_feature=%d" % (estimator.coef_.shape[0], estimator.coef_.shape[1]))

n_answer=174, n_feature=1154


In [36]:
def research_preferred_answer_ids(estimator, vectorizer, feature_word, n_top=10):
    '''
        Rank the answer classes by the weight the model gives to one feature.

        Looks up the feature index of feature_word in vectorizer.vocabulary_,
        takes the matching column of estimator.coef_ and returns
        (class_index, answer_id, weight) tuples sorted by weight descending.
        Only the first n_top entries are returned (default: top 10).
        A one-line summary of the looked-up feature is printed as well.
    '''
    feature_index = vectorizer.vocabulary_[feature_word]
    n_features = estimator.coef_.shape[1]
    print("research_preferred_answer_ids: feature word=%s (index=%d of %d)" % (
        feature_word, feature_index, n_features))

    # One weight per class for the selected feature.
    feature_weights = estimator.coef_.T[feature_index]
    ranking = [
        (position, estimator.classes_[position], value)
        for position, value in enumerate(feature_weights)
    ]
    # In-place stable sort, highest weight first.
    ranking.sort(key=lambda entry: entry[2], reverse=True)

    return ranking[:n_top]

In [37]:
'''
    回答候補の重み付けを確認
'''
research_preferred_answer_ids(estimator, vectorizer, 'やめる')

research_preferred_answer_ids: feature word=やめる (index=124 of 1154)


[(114, 4579, 10.754334344795975),
 (107, 4572, 2.2669260980155244),
 (34, 4497, -4.9621819786803401e-05),
 (33, 4496, -5.299633357049133e-05),
 (12, 4445, -0.00013470836509254396),
 (45, 4508, -0.00020345183422650895),
 (98, 4563, -0.00020468957614262404),
 (145, 4610, -0.00021637775059685249),
 (42, 4505, -0.00022534947812052333),
 (10, 4441, -0.0002939213319672891)]

In [38]:
research_preferred_answer_ids(estimator, vectorizer, 'どうにか')

research_preferred_answer_ids: feature word=どうにか (index=83 of 1154)


[(114, 4579, 11.003186985356811),
 (34, 4497, -1.8467258571206802e-05),
 (98, 4563, -2.2721703746587414e-05),
 (33, 4496, -3.524457107222128e-05),
 (145, 4610, -5.993493632778569e-05),
 (47, 4510, -6.3056724221403362e-05),
 (51, 4514, -7.7819724515713979e-05),
 (45, 4508, -8.1733302898161534e-05),
 (42, 4505, -8.6107354312569607e-05),
 (44, 4507, -0.00010024372114453094)]

In [39]:
research_preferred_answer_ids(estimator, vectorizer, 'ドライブ')

research_preferred_answer_ids: feature word=ドライブ (index=281 of 1154)


[(55, 4518, 12.650782557710023),
 (14, 4450, 5.3989846212466333),
 (12, 4445, 2.098527675624783),
 (15, 4454, 1.3816368263731191),
 (114, 4579, 0.956221022695393),
 (86, 4551, -0.00026297568542037653),
 (34, 4497, -0.00061228447245395556),
 (33, 4496, -0.00065766111774365307),
 (45, 4508, -0.0021685141666944003),
 (42, 4505, -0.0025815468486155152)]

In [40]:
'''
    予測処理の実行
'''
from learning.core.predict.reply import Reply
from learning.tests import helper

In [41]:
questions = ['やめる'] # 回答候補＝２件
Reply(bot_id, helper.learning_parameter(use_similarity_classification=False)).perform(questions, datasource_type='csv')

2017/05/12 PM 06:07:20 ['./fixtures/learning_training_messages/benefitone.csv', './fixtures/learning_training_messages/ptna.csv', './fixtures/learning_training_messages/septeni.csv', './fixtures/learning_training_messages/toyotsu_human.csv']
2017/05/12 PM 06:07:20 ['./fixtures/question_answers/toyotsu_human.csv']
2017/05/12 PM 06:07:20 TextArray#__init__ start
2017/05/12 PM 06:07:20 Reply#perform text_array.separated_sentences: ['やめる']
2017/05/12 PM 06:07:20 TextArray#to_vec start
2017/05/12 PM 06:07:20 TextArray#to_vec end
2017/05/12 PM 06:07:20 Reply#perform features:   (0, 124)	1.0
2017/05/12 PM 06:07:20 question: やめる
2017/05/12 PM 06:07:20 question_feature_count: 1
2017/05/12 PM 06:07:20 predicted results (order by probability desc)
2017/05/12 PM 06:07:20 {'answer_id': 4579.0, 'probability': 0.95788732602549198}
2017/05/12 PM 06:07:20 {'answer_id': 0.0, 'probability': 0.023378873591347195}
2017/05/12 PM 06:07:20 {'answer_id': 4454.0, 'probability': 0.0038913316449180053}
2017/05/12

<learning.core.predict.reply_result.ReplyResult at 0x10bfb7c50>

In [42]:
questions = ['どうにか'] # 回答候補＝１件
Reply(bot_id, helper.learning_parameter(use_similarity_classification=False)).perform(questions, datasource_type='csv')

2017/05/12 PM 06:07:25 ['./fixtures/learning_training_messages/benefitone.csv', './fixtures/learning_training_messages/ptna.csv', './fixtures/learning_training_messages/septeni.csv', './fixtures/learning_training_messages/toyotsu_human.csv']
2017/05/12 PM 06:07:25 ['./fixtures/question_answers/toyotsu_human.csv']
2017/05/12 PM 06:07:25 TextArray#__init__ start
2017/05/12 PM 06:07:25 Reply#perform text_array.separated_sentences: ['どうにか']
2017/05/12 PM 06:07:25 TextArray#to_vec start
2017/05/12 PM 06:07:25 TextArray#to_vec end
2017/05/12 PM 06:07:25 Reply#perform features:   (0, 83)	1.0
2017/05/12 PM 06:07:25 question: どうにか
2017/05/12 PM 06:07:25 question_feature_count: 1
2017/05/12 PM 06:07:25 predicted results (order by probability desc)
2017/05/12 PM 06:07:25 {'answer_id': 4579.0, 'probability': 0.96028983407835156}
2017/05/12 PM 06:07:25 {'answer_id': 0.0, 'probability': 0.024251529363728909}
2017/05/12 PM 06:07:25 {'answer_id': 4454.0, 'probability': 0.003355696827857062}
2017/05/12

<learning.core.predict.reply_result.ReplyResult at 0x10bc62fd0>

In [43]:
questions = ['ドライブ'] # 回答候補＝５件
Reply(bot_id, helper.learning_parameter(use_similarity_classification=False)).perform(questions, datasource_type='csv')

2017/05/12 PM 06:07:28 ['./fixtures/learning_training_messages/benefitone.csv', './fixtures/learning_training_messages/ptna.csv', './fixtures/learning_training_messages/septeni.csv', './fixtures/learning_training_messages/toyotsu_human.csv']
2017/05/12 PM 06:07:28 ['./fixtures/question_answers/toyotsu_human.csv']
2017/05/12 PM 06:07:28 TextArray#__init__ start
2017/05/12 PM 06:07:28 Reply#perform text_array.separated_sentences: ['ドライブ']
2017/05/12 PM 06:07:28 TextArray#to_vec start
2017/05/12 PM 06:07:28 TextArray#to_vec end
2017/05/12 PM 06:07:28 Reply#perform features:   (0, 281)	1.0
2017/05/12 PM 06:07:28 question: ドライブ
2017/05/12 PM 06:07:28 question_feature_count: 1
2017/05/12 PM 06:07:28 predicted results (order by probability desc)
2017/05/12 PM 06:07:28 {'answer_id': 4518.0, 'probability': 0.9881117304293231}
2017/05/12 PM 06:07:28 {'answer_id': 4454.0, 'probability': 0.0046443897181426405}
2017/05/12 PM 06:07:28 {'answer_id': 0.0, 'probability': 0.0021350857646036535}
2017/05/

<learning.core.predict.reply_result.ReplyResult at 0x10bc5aba8>