# naive word2vec

In [1]:
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
from scipy import spatial
from scipy import stats

# Import & init jieba (the tokenizer used by every scoring cell below).
import jieba
# Select the main dictionary file. This is done before the user dictionary is
# loaded; the cell output ("Building prefix dict from .../dict.txt.big")
# shows this file being the one the prefix dict is built from.
jieba.set_dictionary('datas/dict/dict.txt.big')
# Load extra entries from a user dictionary file.
# NOTE(review): presumably education-domain terms, judging by the file name — confirm.
jieba.load_userdict('datas/dict/edu_dict.txt')

# Import pandas
import pandas as pd
from pandas import Series, DataFrame

# Import util
import time
import re

Building prefix dict from /Users/sunset/Talk2AI_Contest/datas/dict/dict.txt.big ...
Loading model from cache /var/folders/43/l4vp_w_x4wb11mmy_bb1jrkc0000gn/T/jieba.u857f67a870683287981bc6f5b9493ffc.cache
Loading model cost 1.944 seconds.
Prefix dict has been built succesfully.


### Load datasets

In [14]:
def split_utterances(text):
    """Split a speaker-tagged string into its non-blank utterances.

    Every speaker tag matching ``[A-Z]:`` (for example ``A:``) is replaced by
    a tab and the string is then split on tabs, so tabs already present in
    ``text`` also act as separators. Pieces that are empty or contain only
    whitespace are dropped.

    Args:
        text: One raw dialogue or options string.

    Returns:
        list[str]: The remaining pieces, in their original order.
    """
    return [s for s in re.sub('[A-Z]:', '\t', text).split('\t') if len(s.strip())]


# Sample set: dialogues (x1), candidate options (x2) and the answer column (y).
sample = pd.read_csv('datas/sample_test_data.txt')
x1 = [split_utterances(text) for text in sample.dialogue.values]
x2 = [split_utterances(text) for text in sample.options.values]
y = sample.answer.values
# The scoring functions reshape their results to six columns, so every
# question must split into exactly six options.
assert all(len(options) == 6 for options in x2), 'every sample question needs exactly 6 options'

# Test set: same layout as the sample set, but the answers live in a separate file.
test_datas = pd.read_csv('datas/AIFirstProblem.txt')
test_x1 = [split_utterances(text) for text in test_datas.dialogue.values]
test_x2 = [split_utterances(text) for text in test_datas.options.values]
assert all(len(options) == 6 for options in test_x2), 'every test question needs exactly 6 options'
with open('datas/AIFirst_test_answer.txt', 'r') as f:
    f.readline()  # skip the first line (presumably a header row — confirm)
    # The answer is the last comma-separated field of each line; blank lines
    # (e.g. a trailing newline at the end of the file) are skipped instead of
    # crashing int().
    test_y = np.array([int(line.strip().split(',')[-1]) for line in f if line.strip()])

### word2vec model

In [38]:
# Load the word vectors from the text-format word2vec file.
# Alternative model that was tried (binary format, hence binary=True):
#   KeyedVectors.load_word2vec_format('models/word2vec/vec200_win40_iter15_mincnt1.bin', binary=True)
word_vectors = KeyedVectors.load_word2vec_format(
    'models/word2vec/dual-lstm-14-best.txt',
    binary=False,
)
# Display the vocabulary size as the cell output.
len(word_vectors.vocab)

65865

In [39]:
# Count the tokens of the sample set (dialogues and options) that are missing
# from the word2vec vocabulary. Whitespace-only tokens are ignored.
unk_cnt = 0
for dialogue, options in zip(x1, x2):
    dialogue_tokens = [
        token
        for utterance in dialogue
        for token in jieba.cut(utterance)
        if token.strip() != ''
    ]
    option_tokens = [
        [token for token in jieba.cut(option) if token.strip() != '']
        for option in options
    ]

    unk_cnt += sum(1 for token in dialogue_tokens if token not in word_vectors.vocab)
    unk_cnt += sum(
        1
        for tokens in option_tokens
        for token in tokens
        if token not in word_vectors.vocab
    )
unk_cnt

64

### Naive trial - centroid

In [40]:
def unitvec(vec):
    """Scale ``vec`` to unit Euclidean length.

    A vector whose norm is zero cannot be normalised and is returned
    unchanged (the very same object).
    """
    norm = np.linalg.norm(vec)
    if norm == 0:
        return vec
    return vec / norm

In [41]:
def centroid(sentence):
    """Return the mean embedding of the in-vocabulary words of ``sentence``.

    Args:
        sentence: Iterable of word tokens.

    Returns:
        The element-wise mean of the vectors of all words found in
        ``word_vectors.vocab``, or a zero vector of length
        ``word_vectors.vector_size`` when none of the words is known.
    """
    known_vectors = [
        word_vectors.word_vec(word)
        for word in sentence
        if word in word_vectors.vocab
    ]
    if len(known_vectors) == 0:
        return np.zeros(word_vectors.vector_size)
    return np.mean(known_vectors, axis=0)

In [42]:
def centroid_score(x1, x2):
    """Score every option against its dialogue by centroid cosine similarity.

    Each dialogue in ``x1`` is tokenized with jieba and collapsed into a
    single centroid over all of its utterances; each option in the matching
    entry of ``x2`` gets its own centroid. The score is the dot product of
    the two unit-normalised centroids.

    Args:
        x1: Sequence of dialogues, each a list of utterance strings.
        x2: Sequence of option lists, aligned with ``x1``. The final reshape
            assumes six options per question.

    Returns:
        np.ndarray of shape ``(n_questions, 6)`` holding the scores.
    """
    cos_score = []
    for a, b in zip(x1, x2):
        # Whitespace-only tokens carry no meaning and are dropped.
        a_sentence = [word for s in a for word in jieba.cut(s) if word.strip() != '']
        b_sentences = [[word for word in jieba.cut(s) if word.strip() != ''] for s in b]

        # The dialogue direction is the same for all options: normalise it once.
        a_unit = unitvec(centroid(a_sentence))
        b_centers = [centroid(s) for s in b_sentences]

        cos_score.append([np.dot(a_unit, unitvec(bc)) for bc in b_centers])
    return np.array(cos_score).reshape(-1, 6)

In [43]:
def attack_naive_centroid(x1, x2):
    """Pick, for each question, the index of the option with the highest centroid score.

    Args:
        x1: Dialogues, as accepted by ``centroid_score``.
        x2: Option lists, as accepted by ``centroid_score``.

    Returns:
        1-D array with one 0-based option index per question.
    """
    scores = centroid_score(x1, x2)
    best_option = np.argmax(scores, axis=1)
    return best_option

In [44]:
# Predict with the centroid method on both datasets.
cos_ans = attack_naive_centroid(x1, x2)
test_cos_ans = attack_naive_centroid(test_x1, test_x2)

# Accuracy = fraction of questions whose predicted index equals the answer.
for label, predicted, expected in (
    ('sample centroid', cos_ans, y),
    ('test   centroid', test_cos_ans, test_y),
):
    print('%30s: %.4f' % (label, np.sum(predicted == expected) / len(expected)))

               sample centroid: 0.6600
               test   centroid: 0.6760


In [45]:
def wvdis_score(x1, x2):
    """Compute ``word_vectors.wmdistance`` between each dialogue and its options.

    Each dialogue in ``x1`` is tokenized with jieba into one flat token list
    covering all of its utterances; each option in the matching entry of
    ``x2`` is tokenized on its own. Lower distance means a closer match, so
    callers pick the best option with ``argmin``.

    Args:
        x1: Sequence of dialogues, each a list of utterance strings.
        x2: Sequence of option lists, aligned with ``x1``. The final reshape
            assumes six options per question.

    Returns:
        np.ndarray of shape ``(n_questions, 6)`` holding the distances.
    """
    # NOTE(review): gensim documents that wmdistance returns inf when a
    # document has no in-vocabulary words — confirm for the installed version.
    wvdis = []
    for a, b in zip(x1, x2):
        # Whitespace-only tokens carry no meaning and are dropped.
        a_sentence = [word for s in a for word in jieba.cut(s) if word.strip() != '']
        b_sentences = [[word for word in jieba.cut(s) if word.strip() != ''] for s in b]

        wvdis.append([word_vectors.wmdistance(a_sentence, r) for r in b_sentences])
    return np.array(wvdis).reshape(-1, 6)

In [46]:
# Distance matrices for both datasets (one row per question, one column per option).
wvdis = wvdis_score(x1, x2)
test_wvdis = wvdis_score(test_x1, test_x2)

# The smallest distance marks the predicted option.
wvdis_ans = np.argmin(wvdis, axis=1)
test_wvdis_ans = np.argmin(test_wvdis, axis=1)

# Accuracy = fraction of questions whose predicted index equals the answer.
for label, predicted, expected in (
    ('sample wvm dis', wvdis_ans, y),
    ('test   wvm dis', test_wvdis_ans, test_y),
):
    print('%30s: %.4f' % (label, np.sum(predicted == expected) / len(expected)))

                sample wvm dis: 0.4800
                test   wvm dis: 0.4980


## Write answers for the test data

In [47]:
# Write the centroid predictions as CSV: a header line, then one
# "<1-based question id>,<predicted option index>" line per test question.
with open('answer/exp/exp_14_best_naive.txt', 'w') as f:
    f.write('id,ans\n')
    for question_id, ans in enumerate(test_cos_ans, start=1):
        f.write('%d,%d\n' % (question_id, ans))