# naive word2vec

In [1]:
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
from scipy import spatial
from scipy import stats

# Import & initialise jieba: point it at the big dictionary file, then load
# the additional user dictionary.
# NOTE(review): no visible cell calls jieba after this setup (the later cells
# split text into single characters) — confirm whether it is still needed.
import jieba
jieba.set_dictionary('datas/dict/dict.txt.big')
jieba.load_userdict('datas/dict/edu_dict.txt')

# Import pandas
import pandas as pd
from pandas import Series, DataFrame

# Import util
import time
import re

Building prefix dict from /Users/sunset/Talk2AI_Contest/datas/dict/dict.txt.big ...
Loading model from cache /var/folders/43/l4vp_w_x4wb11mmy_bb1jrkc0000gn/T/jieba.u857f67a870683287981bc6f5b9493ffc.cache
Loading model cost 2.302 seconds.
Prefix dict has been built succesfully.


### Load datasets

In [2]:
def split_utterances(text):
    """Split a speaker-tagged string into its non-blank utterances.

    Every speaker tag matching ``[A-Z]:`` (for example ``A:``) is replaced by
    a tab and the text is then split on tabs, so tabs already present in
    ``text`` act as separators too.  Pieces that are empty or whitespace-only
    are dropped; the remaining pieces are returned unstripped, in order.
    """
    return [s for s in re.sub('[A-Z]:', '\t', text).split('\t') if len(s.strip())]


# Labelled sample set: dialogues (x1), candidate options (x2), answers (y).
sample = pd.read_csv('datas/sample_test_data.txt')
x1 = [split_utterances(dialogue) for dialogue in sample.dialogue.values]
x2 = [split_utterances(options) for options in sample.options.values]
y = sample.answer.values
# Every question must come with exactly six candidate options.
assert all(len(options) == 6 for options in x2), 'expected 6 options per sample question'

# Unlabelled test set, parsed the same way.
test_datas = pd.read_csv('datas/AIFirstProblem.txt')
test_x1 = [split_utterances(dialogue) for dialogue in test_datas.dialogue.values]
test_x2 = [split_utterances(options) for options in test_datas.options.values]
assert all(len(options) == 6 for options in test_x2), 'expected 6 options per test question'

### word2vec model

In [3]:
# Load the pre-trained embeddings stored in the plain-text word2vec format.
word_vectors = KeyedVectors.load_word2vec_format('models/word2vec/language-model-1.txt', binary=False)
# Display the number of entries in the model's vocabulary.
# NOTE(review): `.vocab` is only available on gensim < 4; newer releases expose
# `key_to_index` instead — confirm the gensim version this notebook targets.
len(word_vectors.vocab)

5909

In [12]:
# Count the non-whitespace characters in the sample dialogues and options that
# have no vector in the word2vec model (tokens here are single characters).
# Membership is tested on the KeyedVectors object itself instead of its
# `.vocab` dict: the result is the same on gensim 3.x, and it keeps working on
# gensim 4, where `.vocab` was removed.
unk_cnt = 0
for a, b in zip(x1, x2):
    a_chars = (ch for s in a for ch in s if ch.strip())
    b_chars = (ch for s in b for ch in s if ch.strip())

    unk_cnt += sum(ch not in word_vectors for ch in a_chars)
    unk_cnt += sum(ch not in word_vectors for ch in b_chars)
unk_cnt

1

### Naive trial - centroid

In [13]:
def unitvec(vec):
    """Scale ``vec`` to unit Euclidean length.

    A vector whose norm is exactly zero has no direction to normalise, so it
    is handed back unchanged (the very same object).
    """
    norm = np.linalg.norm(vec)
    if norm == 0:
        return vec
    return vec / norm

In [14]:
def centroid(sentence):
    """Return the mean embedding of the tokens in ``sentence``.

    Tokens that have no vector in ``word_vectors`` are skipped.  When no token
    is known (including an empty ``sentence``) a zero vector of length
    ``word_vectors.vector_size`` is returned.
    """
    # `token in word_vectors` / `word_vectors[token]` are equivalent to the
    # former `.vocab` lookup and `.word_vec()` call on gensim 3.x, and they
    # also work on gensim 4, where `.vocab` was removed.
    known = [word_vectors[word] for word in sentence if word in word_vectors]
    if not known:
        return np.zeros(word_vectors.vector_size)
    return np.mean(known, axis=0)

In [15]:
def attack_naive_centroid(x1, x2):
    """Pick, for every dialogue, the option whose centroid is closest.

    ``x1`` holds one list of utterance strings per dialogue and ``x2`` the
    matching list of candidate option strings.  Each text is broken into its
    non-whitespace characters, which are averaged via ``centroid``.

    Returns a pair of integer arrays with the index of the best option per
    dialogue: first ranked by cosine similarity, then by raw dot product.
    """
    cos_picks, dot_picks = [], []
    for dialogue, options in zip(x1, x2):
        # Character-level tokens; whitespace characters are discarded.
        dialogue_chars = [ch for utterance in dialogue for ch in utterance if ch.strip() != '']
        dialogue_center = centroid(dialogue_chars)
        dialogue_unit = unitvec(dialogue_center)

        option_centers = []
        for option in options:
            option_chars = [ch for ch in option if ch.strip() != '']
            option_centers.append(centroid(option_chars))

        cos_scores = [np.dot(dialogue_unit, unitvec(center)) for center in option_centers]
        cos_picks.append(np.argmax(cos_scores))

        dot_scores = [np.dot(dialogue_center, center) for center in option_centers]
        dot_picks.append(np.argmax(dot_scores))
    return np.array(cos_picks), np.array(dot_picks)

In [16]:
# Score both centroid rankings against the labelled sample answers.
cos_ans, dot_ans = attack_naive_centroid(x1, x2)

for label, answers in (('centroid (cos)', cos_ans), ('centroid (dot)', dot_ans)):
    correct = np.sum(answers == y)
    print('%20s: %3d / %3d' % (label, correct, len(y)))

      centroid (cos):  26 /  50
      centroid (dot):  22 /  50


## Output answers on the test data

In [None]:
# Predict the test set. attack_naive_centroid returns (cosine picks, dot picks);
# only the first array is kept and the second is discarded.
my_test_ans, _ = attack_naive_centroid(test_x1, test_x2)
# NOTE(review): the two lines below look like leftovers from an earlier
# experiment; `test_ans_bag` is not defined in any visible cell.
# my_test_ans = stats.mode(test_ans_bag).mode[0]
# my_test_ans.shape

In [None]:
# with open('answer/attack-naive-word2vec-fine-tune-6-not-yet.txt', 'w') as fo:
#     fo.write('id,ans\n')
#     fo.write('\n'.join(['%d,%s' % (i+1, ans) for i, ans in enumerate(my_test_ans)]))
#     fo.write('\n')