# TFIDF
http://www.wildml.com/2016/07/deep-learning-for-chatbots-2-retrieval-based-model-tensorflow

In [1]:
import numpy as np
from scipy import spatial
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Import & Init jieba
import jieba
jieba.set_dictionary('datas/dict/dict.txt.big')
jieba.load_userdict('datas/dict/edu_dict.txt')

# Import pandas
import pandas as pd
from pandas import Series, DataFrame

# Import util
import time
import re

Building prefix dict from /home/sunset/word_contest/datas/dict/dict.txt.big ...
Loading model from cache /tmp/jieba.u849ecfdca27003d306f39ca004b82b5b.cache
Loading model cost 1.176 seconds.
Prefix dict has been built succesfully.


### Load datasets

In [2]:
sample = pd.read_csv('datas/sample_test_data.txt')
x1 = [[s for s in re.sub('[A-Z]:', '\t', _).split('\t') if len(s.strip())] for _ in sample.dialogue.values]
x2 = [[s for s in re.sub('[A-Z]:', '\t', _).split('\t') if len(s.strip())] for _ in sample.options.values]
y = sample.answer.values
assert(np.sum([len(_)!=6 for _ in x2]) == 0)

test_datas = pd.read_csv('datas/AIFirstProblem.txt')
test_x1 = [[s for s in re.sub('[A-Z]:', '\t', _).split('\t') if len(s.strip())] for _ in test_datas.dialogue.values]
test_x2 = [[s for s in re.sub('[A-Z]:', '\t', _).split('\t') if len(s.strip())] for _ in test_datas.options.values]
assert(np.sum([len(_)!=6 for _ in test_x2]) == 0)

def tokenize(s):
    return [w for w in jieba.cut(s) if w.strip() != '']

x1_toks = [[w for s in q for w in tokenize(s)] for q in x1]
x2_toks = [[tokenize(r) for r in rs] for rs in x2]
test_x1_toks = [[w for s in q for w in tokenize(s)] for q in test_x1]
test_x2_toks = [[tokenize(r) for r in rs] for rs in test_x2]

In [3]:
def num_2_str(num):
    s = chr(num%26 + 97)
    num //= 26
    while num:
        s += chr(num%26 + 97)
        num //= 26
    return s

In [4]:
corpus = []
corpus_fname_lst = [
    'datas/training_data/下課花路米.txt',
    'datas/training_data/人生劇展.txt',
    'datas/training_data/公視藝文大道.txt',
    'datas/training_data/成語賽恩思.txt',
    'datas/training_data/我的這一班.txt',
    'datas/training_data/流言追追追.txt',
    'datas/training_data/聽聽看.txt',
    'datas/training_data/誰來晚餐.txt',
]

for fname in corpus_fname_lst:
    with open(fname, 'r') as f:
        corpus.extend([' '.join(line.strip().split('\t')) for line in f])
        
word2id = {}
for cp in corpus:
    for w in cp.split():
        if w not in word2id:
            word2id[w] = len(word2id)

corpus_id = [' '.join([str(word2id[w]) for w in cp.split() if w in word2id]) for cp in corpus]

In [5]:
len(word2id)

184826

In [6]:
tfidf = TfidfVectorizer(max_features=120000)
tfidf.fit(corpus_id)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=120000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [7]:
def predict(tfidf, context, utterances):
    # Convert context and utterances into tfidf vector
    vector_context = tfidf.transform([' '.join([str(word2id[w]) for w in context if w in word2id])])
    vector_doc = tfidf.transform([' '.join([str(word2id[w]) for w in opt if w in word2id]) for opt in utterances])

    # The dot product measures the similarity of the resulting vectors
    result = np.dot(vector_doc, vector_context.T).todense()
    result = np.asarray(result).flatten()

    # Sort by top results and return the indices in descending order
    return np.argsort(result, axis=0)[::-1][0]

In [8]:
my_ans = [predict(tfidf, x1_toks[i], x2_toks[i]) for i in range(len(y))]

In [9]:
np.sum(y == my_ans)

33

In [10]:
my_test_ans = [predict(tfidf, test_x1_toks[i], test_x2_toks[i]) for i in range(len(test_x1_toks))]

In [11]:
# with open('answer/attack-tfidf.txt', 'w') as fo:
#     fo.write('id,ans\n')
#     fo.write('\n'.join(['%d,%s' % (i+1, ans) for i, ans in enumerate(my_test_ans)]))
#     fo.write('\n')

### Cosine distance to predict

In [12]:
def predict_cos(tfidf, context, utterances):
    # Convert context and utterances into tfidf vector
    vector_context = tfidf.transform([' '.join([str(word2id[w]) for w in context if w in word2id])])
    vector_doc = tfidf.transform([' '.join([str(word2id[w]) for w in opt if w in word2id]) for opt in utterances])

    vector_context = vector_context.toarray()[0]
    vector_doc = vector_doc.toarray()
    
    # The dot product measures the similarity of the resulting vectors
    result = [spatial.distance.cosine(vector_context, vec) for vec in vector_doc]

    # Sort by top results and return the indices in descending order
    return np.argmin(result)

In [13]:
my_ans_cos = [predict_cos(tfidf, x1_toks[i], x2_toks[i]) for i in range(len(y))]

  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))


In [16]:
np.sum(y == my_ans_cos)

31