# naive word2vec

In [1]:
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
from scipy import spatial
from scipy import stats

# Import & Init jieba
import jieba
jieba.set_dictionary('datas/dict/dict.txt.big')
jieba.load_userdict('datas/dict/edu_dict.txt')

# Import pandas
import pandas as pd
from pandas import Series, DataFrame

# Import util
import time
import re

Building prefix dict from /Users/sunset/Talk2AI_Contest/datas/dict/dict.txt.big ...
Loading model from cache /var/folders/43/l4vp_w_x4wb11mmy_bb1jrkc0000gn/T/jieba.u857f67a870683287981bc6f5b9493ffc.cache
Loading model cost 1.848 seconds.
Prefix dict has been built succesfully.


### Load datasets

In [2]:
def _split_turns(text):
    """Split one raw dialogue/options string into its non-blank segments.

    Every capital letter followed by ':' (presumably a speaker tag) is turned
    into a tab, the string is cut on tabs, and whitespace-only pieces are dropped.
    """
    pieces = re.sub('[A-Z]:', '\t', text).split('\t')
    return [piece for piece in pieces if piece.strip()]


# Labelled sample set: dialogue segments, candidate options and the answer column.
sample = pd.read_csv('datas/sample_test_data.txt')
x1 = [_split_turns(text) for text in sample.dialogue.values]
x2 = [_split_turns(text) for text in sample.options.values]
y = sample.answer.values
# Downstream scoring reshapes to 6 columns, so every question must have 6 options.
assert all(len(options) == 6 for options in x2)

# Contest set, parsed exactly the same way (no answers are read for it).
test_datas = pd.read_csv('datas/AIFirstProblem.txt')
test_x1 = [_split_turns(text) for text in test_datas.dialogue.values]
test_x2 = [_split_turns(text) for text in test_datas.options.values]
assert all(len(options) == 6 for options in test_x2)

### word2vec model

In [58]:
# Load embeddings stored in word2vec text format. `word_vectors` is a global that
# `centroid` and `wvdis_score` read, so rebinding it changes what they score with.
word_vectors = KeyedVectors.load_word2vec_format('models/word2vec/dual-lstm-12-newest.txt', binary=False)
# Number of entries in the loaded vocabulary.
len(word_vectors.vocab)
# Alternative embeddings stored in word2vec binary format:
# word_vectors = KeyedVectors.load_word2vec_format('models/word2vec/vec200_win40_iter15_mincnt1.bin', binary=True)
# len(word_vectors.vocab)

65865

In [37]:
# Count how many non-blank jieba tokens in the sample set (dialogues and options)
# are missing from the vocabulary of the currently loaded embeddings.
unk_cnt = 0
for dialogue, options in zip(x1, x2):
    tokens = [tok for utt in dialogue for tok in jieba.cut(utt) if tok.strip() != '']
    tokens += [tok for opt in options for tok in jieba.cut(opt) if tok.strip() != '']
    unk_cnt += sum(1 for tok in tokens if tok not in word_vectors.vocab)
unk_cnt

64

In [38]:
def la_score(x1, x2):
    """Lexical-overlap score between each dialogue and each of its candidate options.

    For every (dialogue, options) pair, the dialogue is tokenised with jieba into
    a set of words; each option's score is the number of its tokens (repeats
    included) that appear in that set.

    Whitespace-only tokens are ignored, matching the tokenisation used by the
    other scoring functions in this notebook. Otherwise a space occurring in both
    the dialogue and an option would count as a shared "word".

    Parameters
    ----------
    x1 : list of list of str
        One list of dialogue segments per question.
    x2 : list of list of str
        One list of candidate options per question (6 per question).

    Returns
    -------
    numpy.ndarray
        Array of shape (-1, 6) holding the overlap counts.
    """
    scores = []
    for dialogue, options in zip(x1, x2):
        dialogue_words = {
            tok for utt in dialogue for tok in jieba.cut(utt) if tok.strip() != ''
        }
        # Kept flat (one entry per option) so the final reshape behaves as before.
        scores.extend(
            sum(1 for tok in jieba.cut(opt) if tok.strip() != '' and tok in dialogue_words)
            for opt in options
        )
    return np.array(scores).reshape(-1, 6)

### Naive trial - centroid

In [39]:
def unitvec(vec):
    """Scale `vec` to unit Euclidean length.

    A vector whose norm is exactly zero is returned unchanged (the same object),
    which avoids a division by zero.
    """
    norm = np.linalg.norm(vec)
    if norm == 0:
        return vec
    return vec / norm

In [40]:
def centroid(sentence):
    """Mean embedding of the words of `sentence` found in the global `word_vectors`.

    Words missing from the vocabulary are skipped. If no word is known, a zero
    vector of length `word_vectors.vector_size` is returned instead.
    """
    known = [word_vectors.word_vec(tok) for tok in sentence if tok in word_vectors.vocab]
    if not known:
        return np.zeros(word_vectors.vector_size)
    return np.mean(known, axis=0)

In [41]:
def centroid_score(x1, x2):
    """Score every option against its dialogue using sentence centroids.

    The whole dialogue (all segments concatenated) and each option are tokenised
    with jieba, blank tokens are dropped, and each token list is reduced to its
    centroid embedding.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Cosine similarities and raw dot products, each reshaped to (-1, 6).
    """
    def tokenize(text):
        # Non-blank jieba tokens of one string.
        return [tok for tok in jieba.cut(text) if tok.strip() != '']

    cos_rows, dot_rows = [], []
    for dialogue, options in zip(x1, x2):
        context_vec = centroid([tok for utt in dialogue for tok in tokenize(utt)])
        option_vecs = [centroid(tokenize(opt)) for opt in options]

        context_unit = unitvec(context_vec)
        cos_rows.append([np.dot(context_unit, unitvec(vec)) for vec in option_vecs])
        dot_rows.append([np.dot(context_vec, vec) for vec in option_vecs])

    return np.array(cos_rows).reshape(-1, 6), np.array(dot_rows).reshape(-1, 6)

In [42]:
def attack_naive_centroid(x1, x2):
    """Pick, per question, the option index with the highest centroid score.

    Returns a pair of index arrays: the first chosen by cosine similarity, the
    second by raw dot product.
    """
    cos_matrix, dot_matrix = centroid_score(x1, x2)
    cos_pick = np.argmax(cos_matrix, axis=1)
    dot_pick = np.argmax(dot_matrix, axis=1)
    return cos_pick, dot_pick

In [43]:
# Accuracy of both centroid variants on the labelled sample set.
cos_ans, dot_ans = attack_naive_centroid(x1, x2)

for label, answers in (('centroid (cos)', cos_ans), ('centroid (dot)', dot_ans)):
    correct = np.sum(answers == y)
    print('%20s: %3d / %3d' % (label, correct, len(y)))

      centroid (cos):  33 /  50
      centroid (dot):  30 /  50


In [44]:
def wvdis_score(x1, x2):
    """Distance between each dialogue and each of its options via `wmdistance`.

    The dialogue (all segments concatenated) and every option are tokenised with
    jieba, blank tokens are dropped, and the global `word_vectors.wmdistance` is
    evaluated for each (dialogue, option) pair. Callers choose the answer with
    `argmin`, i.e. the smallest distance wins.

    Parameters
    ----------
    x1 : list of list of str
        One list of dialogue segments per question.
    x2 : list of list of str
        One list of candidate options per question (6 per question).

    Returns
    -------
    numpy.ndarray
        Distances reshaped to (-1, 6).
    """
    distances = []
    for dialogue, options in zip(x1, x2):
        context = [tok for utt in dialogue for tok in jieba.cut(utt) if tok.strip() != '']
        distances.append([
            word_vectors.wmdistance(
                context, [tok for tok in jieba.cut(opt) if tok.strip() != '']
            )
            for opt in options
        ])
    # Convert and reshape once; the original repeated this step needlessly.
    return np.array(distances).reshape(-1, 6)

In [45]:
# Accuracy on the sample set when the option with the smallest distance is chosen.
wvdis = wvdis_score(x1, x2)
wvdis_ans = wvdis.argmin(axis=1)
correct = (wvdis_ans == y).sum()
print('%20s: %3d / %3d' % ('wvm dis', correct, len(y)))

             wvm dis:  24 /  50


In [46]:
def my_method(x1, x2, la_weight=0.2):
    """Blend centroid cosine similarity with the lexical-overlap score.

    Parameters
    ----------
    x1, x2 : list of list of str
        Dialogue segments and candidate options, one inner list per question.
    la_weight : float, optional
        Multiplier applied to `la_score` before it is added to the cosine
        score. Defaults to 0.2, the value previously hard-coded here.

    Returns
    -------
    numpy.ndarray
        Combined score matrix with 6 columns; higher means a better match.
    """
    cos_score, _ = centroid_score(x1, x2)
    return cos_score + la_weight * la_score(x1, x2)


# Accuracy of the blended score on the labelled sample set.
my_score = my_method(x1, x2)
np.sum(np.argmax(my_score, axis=1) == y)

33

### Try bagging

In [None]:
# Per-model prediction vectors collected by the bagging loop: one entry per kept model.
# A later cell rebinds both names to numpy arrays, so re-run this cell before
# re-running the loop (arrays have no `.append`).
sample_ans_bag = []
test_ans_bag = []

In [None]:
# Embedding files in word2vec text format (loaded with binary=False).
m_txt = [
    'models/word2vec/fine-tuned.txt',
    'models/word2vec/fine-tuned-2.txt',
    'models/word2vec/fine-tuned-3.txt',
    'models/word2vec/fine-tuned-5.txt',
    'models/word2vec/vec200_win40_iter15_mincnt1.txt',
    'models/word2vec/word2vec-dual-lstm-8.txt',
]

# Embedding files with a .bin extension, presumably word2vec binary format
# (to be loaded with binary=True); verify against how they were exported.
m_bin = [
    'models/word2vec/vec200_win100_iter15_mincnt5.bin',
    'models/word2vec/vec200_win10_iter15_mincnt5.bin',
    'models/word2vec/vec200_win15_iter15_mincnt5.bin',
    'models/word2vec/vec200_win15_iter15_mincnt7.bin',
    'models/word2vec/vec200_win15_iter50_mincnt5.bin',
    'models/word2vec/vec200_win20_iter15_mincnt5.bin',
    'models/word2vec/vec200_win30_iter15_mincnt5.bin',
    'models/word2vec/vec200_win40_iter15_mincnt5.bin',
    'models/word2vec/vec200_win60_iter15_mincnt5.bin',
    'models/word2vec/vec512_win15_iter15_mincnt5.bin',
    'models/word2vec/vec512_win40_iter15_mincnt5.bin',
]

In [None]:
# Collect one vote vector per embedding model. A model is kept only if its
# cosine-centroid predictions get at least MIN_SAMPLE_CORRECT sample questions right.
#
# Fix: the second pass used to iterate `m_txt` again with binary=True, so the
# binary models in `m_bin` were never evaluated and the text files were parsed
# with the wrong format. Both lists are now walked with their matching format.
MIN_SAMPLE_CORRECT = 30

for model_paths, is_binary in ((m_txt, False), (m_bin, True)):
    for fname in model_paths:
        print(fname)
        # `centroid` reads the global `word_vectors`; rebinding it switches the
        # embeddings used by `attack_naive_centroid` below.
        word_vectors = KeyedVectors.load_word2vec_format(fname, binary=is_binary)
        cos_ans, dot_ans = attack_naive_centroid(x1, x2)
        if np.sum(cos_ans == y) < MIN_SAMPLE_CORRECT:
            continue

        my_test_ans, _ = attack_naive_centroid(test_x1, test_x2)
        sample_ans_bag.append(cos_ans)
        test_ans_bag.append(my_test_ans)

In [None]:
# Stack the collected vote vectors into arrays (one row per kept model).
sample_ans_bag, test_ans_bag = (np.array(bag) for bag in (sample_ans_bag, test_ans_bag))

In [None]:
# Majority vote across models (axis 0) for every sample question.
# `stats.mode(...).mode` has a leading length-1 axis on older SciPy but not on
# releases where `keepdims` defaults to False. There a bare `.mode[0]` would be
# a single scalar and the accuracy count below would be silently wrong.
# `np.atleast_2d(...)[0]` yields the per-question vote vector in both cases.
sample_bag_mode = np.atleast_2d(stats.mode(sample_ans_bag, axis=0).mode)[0]
np.sum(sample_bag_mode == y)

## Output answers on the test data

In [59]:
# Predict the test set with the cosine-centroid method, using whichever
# embeddings are currently bound to the global `word_vectors`.
my_test_ans, _ = attack_naive_centroid(test_x1, test_x2)
# Alternative predictors kept for reference (blended score / bagged majority vote):
# my_test_ans = np.argmax(my_method(test_x1, test_x2), axis=1)
# my_test_ans = stats.mode(test_ans_bag).mode[0]
# my_test_ans.shape

In [60]:
# with open('answer/dual-lstm-newest.txt', 'w') as fo:
#     fo.write('id,ans\n')
#     fo.write('\n'.join(['%d,%s' % (i+1, ans) for i, ans in enumerate(my_test_ans)]))
#     fo.write('\n')

In [61]:
my_test_ans

array([0, 0, 3, 2, 1, 4, 0, 1, 1, 4, 3, 2, 4, 0, 5, 1, 3, 4, 3, 0, 0, 2, 3,
       1, 3, 1, 4, 5, 4, 1, 3, 2, 5, 4, 4, 0, 5, 4, 1, 1, 1, 4, 3, 0, 2, 5,
       4, 2, 0, 3, 3, 3, 5, 0, 1, 1, 5, 0, 5, 4, 0, 2, 4, 0, 5, 4, 0, 2, 0,
       3, 0, 5, 0, 1, 0, 0, 0, 4, 1, 5, 0, 2, 2, 3, 3, 0, 5, 1, 0, 0, 5, 1,
       2, 5, 3, 1, 4, 5, 3, 0, 3, 2, 0, 0, 0, 2, 1, 2, 2, 5, 4, 1, 4, 0, 5,
       3, 1, 0, 3, 4, 2, 2, 3, 2, 0, 4, 3, 5, 3, 3, 1, 1, 2, 2, 3, 0, 0, 1,
       3, 5, 3, 4, 4, 0, 3, 4, 0, 4, 1, 2, 1, 0, 5, 0, 1, 1, 0, 0, 5, 2, 2,
       4, 1, 5, 5, 3, 0, 0, 5, 3, 1, 1, 4, 2, 0, 0, 3, 4, 2, 5, 1, 5, 2, 1,
       2, 4, 3, 3, 0, 4, 3, 2, 1, 5, 1, 5, 3, 4, 2, 2, 3, 4, 4, 0, 5, 1, 3,
       0, 1, 5, 3, 1, 3, 1, 3, 0, 1, 0, 5, 3, 3, 1, 1, 3, 2, 2, 5, 2, 5, 1,
       0, 5, 0, 0, 3, 4, 3, 5, 5, 0, 4, 5, 4, 0, 1, 5, 5, 3, 1, 5, 3, 2, 4,
       3, 4, 5, 2, 3, 1, 4, 1, 0, 1, 1, 4, 3, 3, 4, 4, 1, 0, 2, 1, 0, 1, 2,
       3, 2, 4, 2, 3, 3, 2, 4, 3, 2, 0, 4, 3, 0, 2, 5, 2, 1, 1, 3, 4, 4, 0,
       0, 0,