# naive word2vec

In [1]:
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
from scipy import spatial
from scipy import stats

# Import & Init jieba
# Configure the tokenizer before its first use: point jieba at the dictionary file
# below, then load extra entries from the user dictionary. Both paths are relative
# to the current working directory.
import jieba
jieba.set_dictionary('datas/dict/dict.txt.big')
jieba.load_userdict('datas/dict/edu_dict.txt')

# Import pandas
import pandas as pd
from pandas import Series, DataFrame

# Import util
import time
import re

Building prefix dict from /Users/sunset/Talk2AI_Contest/datas/dict/dict.txt.big ...
Loading model from cache /var/folders/43/l4vp_w_x4wb11mmy_bb1jrkc0000gn/T/jieba.u857f67a870683287981bc6f5b9493ffc.cache
Loading model cost 1.937 seconds.
Prefix dict has been built succesfully.


### Load datasets

In [2]:
def _split_turns(text):
    """Split a raw dialogue/options string into its non-blank utterances.

    Every speaker tag matching ``[A-Z]:`` is replaced by a tab, the text is split
    on tabs, and pieces that are empty or whitespace-only are dropped.

    Args:
        text: one raw cell from the ``dialogue`` or ``options`` column.

    Returns:
        list of str, one entry per utterance, in original order.
    """
    return [s for s in re.sub('[A-Z]:', '\t', text).split('\t') if len(s.strip())]


# Labelled sample set: x1 holds the dialogue utterances, x2 the candidate options,
# y the `answer` column.
sample = pd.read_csv('datas/sample_test_data.txt')
x1 = [_split_turns(dialogue) for dialogue in sample.dialogue.values]
x2 = [_split_turns(options) for options in sample.options.values]
y = sample.answer.values
# The scoring functions reshape to 6 columns, so every question needs 6 options.
assert all(len(options) == 6 for options in x2), 'every sample question must have exactly 6 options'

# Test set: same layout, but the answers live in a separate CSV-like file.
test_datas = pd.read_csv('datas/AIFirstProblem.txt')
test_x1 = [_split_turns(dialogue) for dialogue in test_datas.dialogue.values]
test_x2 = [_split_turns(options) for options in test_datas.options.values]
assert all(len(options) == 6 for options in test_x2), 'every test question must have exactly 6 options'
with open('datas/AIFirst_test_answer.txt', 'r') as f:
    f.readline()  # skip the header line
    # The answer is the last comma-separated field on each remaining line.
    test_y = np.array([int(line.strip().split(',')[-1]) for line in f])

### word2vec model

In [21]:
# Saved embedding checkpoints to evaluate: for every training run there is a
# "best" and a "newest" file, listed run by run.
model_names = [
    'models/word2vec/%s-%s' % (run, checkpoint)
    for run in (
        'dual-lstm-12',
        'dual-lstm-13',
        'dual-lstm-14',
        'dual-lstm-15',
        'dual-lstm-16',
        'dual-lstm-17',
        'dual-lstm-18',
        'dual-lstm-22',
        'dual-lstm-24',
        'smn-1',
    )
    for checkpoint in ('best', 'newest')
]

In [3]:
# Load the embeddings that the helper functions below read through the global
# `word_vectors`; the bare `len(...)` expression displays the vocabulary size.
word_vectors = KeyedVectors.load('models/word2vec/dual-lstm-12-best')
len(word_vectors.vocab)
# Alternative loaders, kept commented out (word2vec text format / binary format):
# word_vectors = KeyedVectors.load_word2vec_format('models/word2vec/dual-lstm-12-best.txt', binary=False)
# len(word_vectors.vocab)
# word_vectors = KeyedVectors.load_word2vec_format('models/word2vec/vec200_win40_iter15_mincnt1.bin', binary=True)
# len(word_vectors.vocab)

65865

In [4]:
# Count the tokens in the sample set (dialogue utterances and candidate options
# alike) that are missing from the embedding vocabulary.
unk_cnt = 0
for context_turns, option_turns in zip(x1, x2):
    tokens = [
        token
        for turn in context_turns + option_turns
        for token in jieba.cut(turn)
        if token.strip() != ''  # drop whitespace-only tokens
    ]
    unk_cnt += sum(1 for token in tokens if token not in word_vectors.vocab)
unk_cnt

64

### Naive trial - centroid

In [5]:
def unitvec(vec):
    """Scale `vec` to unit Euclidean length.

    A vector whose norm is exactly zero is returned unchanged (the same object),
    which avoids a division by zero.
    """
    norm = np.linalg.norm(vec)
    if norm == 0:
        return vec
    return vec / norm

In [6]:
def centroid(sentence):
    """Average the embeddings of the in-vocabulary words of `sentence`.

    Args:
        sentence: iterable of word tokens.

    Returns:
        The element-wise mean of the known words' vectors (looked up in the
        global `word_vectors`), or a zero vector of length
        `word_vectors.vector_size` when no word is in the vocabulary.
    """
    known_words = [word for word in sentence if word in word_vectors.vocab]
    if len(known_words) == 0:
        return np.zeros(word_vectors.vector_size)
    return np.mean([word_vectors.word_vec(word) for word in known_words], axis=0)

In [7]:
def centroid_score(x1, x2):
    """Score every candidate option against its dialogue by centroid cosine.

    Args:
        x1: per question, the list of dialogue utterances (strings).
        x2: per question, the list of candidate option strings.

    Returns:
        Array reshaped to (-1, 6): row i holds the dot product between the
        unit-length centroid of all dialogue tokens of question i and the
        unit-length centroid of each option. A centroid that is the zero vector
        is left as-is by `unitvec`, so its score is 0.
    """
    def _tokens(text):
        # jieba segmentation, minus whitespace-only tokens
        return [token for token in jieba.cut(text) if token.strip() != '']

    rows = []
    for context, options in zip(x1, x2):
        # All utterances of the dialogue are pooled into one bag of tokens.
        context_words = []
        for utterance in context:
            context_words.extend(_tokens(utterance))
        context_dir = unitvec(centroid(context_words))

        option_dirs = [unitvec(centroid(_tokens(option))) for option in options]
        rows.append([np.dot(context_dir, option_dir) for option_dir in option_dirs])
    return np.array(rows).reshape(-1, 6)

In [8]:
def attack_naive_centroid(x1, x2):
    """Pick, for each question, the option index with the highest `centroid_score`."""
    return np.argmax(centroid_score(x1, x2), axis=1)

In [9]:
# Predict with the plain-centroid method on both datasets.
cos_ans = attack_naive_centroid(x1, x2)
test_cos_ans = attack_naive_centroid(test_x1, test_x2)

# Accuracy = number of predictions equal to the labelled answer / number of questions.
sample_hits = (cos_ans == y).sum()
test_hits = (test_cos_ans == test_y).sum()
print('%30s: %.4f' % ('sample centroid', sample_hits / len(y)))
print('%30s: %.4f' % ('test   centroid', test_hits / len(test_y)))

               sample centroid: 0.6600
               test   centroid: 0.6800


### Naive trial - dis centroid

In [10]:
def dis_centroid(ss, beta=0.77):
    """Discounted average of per-sentence centroids, favouring later sentences.

    Each sentence is reduced to the mean of its in-vocabulary word vectors
    (sentences with no known word are skipped). The last remaining sentence
    gets weight 1, the one before it `beta`, then `beta**2`, and so on; the
    weights are normalised to sum to 1.

    Args:
        ss: list of sentences, each of which must be a `list` of word tokens.
        beta: per-sentence discount factor.

    Returns:
        The weighted centroid, or a zero vector of length
        `word_vectors.vector_size` when no sentence has a known word.

    Raises:
        AssertionError: if an element of `ss` is not exactly a `list`, or a
            sentence centroid contains NaN.
    """
    assert all(type(s) == list for s in ss)

    centers = []
    for s in ss:
        known = [word_vectors.word_vec(word) for word in s if word in word_vectors.vocab]
        if known:
            centers.append(np.mean(known, axis=0))
    if not centers:
        return np.zeros(word_vectors.vector_size)

    # Most recent sentence first, so that index i is "i sentences ago".
    centers.reverse()
    for center in centers:
        assert not np.isnan(center).any()

    decay = [beta ** i for i in range(len(centers))]
    total = sum(decay)
    return np.sum([center * (weight / total) for center, weight in zip(centers, decay)], axis=0)

In [11]:
def dis_centroid_score(x1, x2):
    """Score every candidate option against its dialogue using `dis_centroid`.

    Args:
        x1: per question, the list of dialogue utterances (strings).
        x2: per question, the list of candidate option strings.

    Returns:
        Array reshaped to (-1, 6): row i holds the dot product between the
        unit-length discounted centroid of dialogue i (utterances kept as
        separate sentences) and the unit-length centroid of each option
        (treated as a one-sentence dialogue).
    """
    def _segment(text):
        # jieba segmentation, minus whitespace-only tokens
        return [token for token in jieba.cut(text) if token.strip()]

    table = []
    for context, options in zip(x1, x2):
        context_dir = unitvec(dis_centroid([_segment(utterance) for utterance in context]))
        option_dirs = [unitvec(dis_centroid([_segment(option)])) for option in options]
        table.append([np.dot(context_dir, option_dir) for option_dir in option_dirs])
    return np.array(table).reshape(-1, 6)

In [12]:
def attack_naive_dis_centroid(x1, x2):
    """Pick, for each question, the option index with the highest `dis_centroid_score`."""
    return np.argmax(dis_centroid_score(x1, x2), axis=1)

In [13]:
# Predict with the discounted-centroid method on both datasets.
dis_cos_ans = attack_naive_dis_centroid(x1, x2)
test_dis_cos_ans = attack_naive_dis_centroid(test_x1, test_x2)

# Accuracy = number of predictions equal to the labelled answer / number of questions.
# The labels name the method actually evaluated here (they previously read
# "centroid", copied from the plain-centroid cell).
print('%30s: %.4f' % ('sample dis centroid', np.sum(dis_cos_ans == y) / len(y)))
print('%30s: %.4f' % ('test   dis centroid', np.sum(test_dis_cos_ans == test_y) / len(test_y)))

               sample centroid: 0.6000
               test   centroid: 0.6960


### Naive trial - weighted centroid

In [14]:
def w_centroid(ss, beta=0.77):
    """Weighted mean of all known word vectors, discounting older sentences.

    Sentences with no in-vocabulary word are dropped. Every word of the last
    remaining sentence gets weight 1, every word of the one before it `beta`,
    then `beta**2`, and so on. The result is the weight-normalised sum over all
    words, so longer sentences contribute more than in a per-sentence average.

    Args:
        ss: list of sentences, each of which must be a `list` of word tokens.
        beta: per-sentence discount factor.

    Returns:
        The weighted centroid, or a zero vector of length
        `word_vectors.vector_size` when no word is in the vocabulary.

    Raises:
        AssertionError: if an element of `ss` is not exactly a `list`.
    """
    assert all(type(s) == list for s in ss)

    sentences = []
    for s in ss:
        known = [word_vectors.word_vec(word) for word in s if word in word_vectors.vocab]
        if len(known) > 0:
            sentences.append(known)
    if not sentences:
        return np.zeros(word_vectors.vector_size)

    # Most recent sentence first, so that `age` is "how many sentences ago".
    sentences.reverse()
    weights, rows = [], []
    for age, sentence_vecs in enumerate(sentences):
        for vec in sentence_vecs:
            weights.append(beta ** age)
            rows.append(vec)

    w = np.array(weights).reshape(-1, 1)  # column vector, broadcasts over dimensions
    return np.sum(w * np.array(rows), axis=0) / np.sum(w)

In [15]:
def w_centroid_score(x1, x2):
    """Score every candidate option against its dialogue using `w_centroid`.

    Args:
        x1: per question, the list of dialogue utterances (strings).
        x2: per question, the list of candidate option strings.

    Returns:
        Array reshaped to (-1, 6): row i holds the dot product between the
        unit-length weighted centroid of dialogue i (utterances kept as
        separate sentences) and the unit-length centroid of each option
        (treated as a one-sentence dialogue).
    """
    def _segment(text):
        # jieba segmentation, minus whitespace-only tokens
        return [token for token in jieba.cut(text) if token.strip()]

    table = []
    for context, options in zip(x1, x2):
        context_dir = unitvec(w_centroid([_segment(utterance) for utterance in context]))
        option_dirs = [unitvec(w_centroid([_segment(option)])) for option in options]
        table.append([np.dot(context_dir, option_dir) for option_dir in option_dirs])
    return np.array(table).reshape(-1, 6)

In [16]:
def attack_naive_w_centroid(x1, x2):
    """Pick, for each question, the option index with the highest `w_centroid_score`."""
    return np.argmax(w_centroid_score(x1, x2), axis=1)

In [17]:
# Predict with the weighted-centroid method on both datasets.
w_cos_ans = attack_naive_w_centroid(x1, x2)
test_w_cos_ans = attack_naive_w_centroid(test_x1, test_x2)

# Accuracy = number of predictions equal to the labelled answer / number of questions.
# The labels name the method actually evaluated here (they previously read
# "centroid", copied from the plain-centroid cell).
print('%30s: %.4f' % ('sample w centroid', np.sum(w_cos_ans == y) / len(y)))
print('%30s: %.4f' % ('test   w centroid', np.sum(test_w_cos_ans == test_y) / len(test_y)))

               sample centroid: 0.6800
               test   centroid: 0.6920


### Bagging naive

In [18]:
# Majority vote across the three methods: for each question take the most common
# predicted option index.
# The mode is computed along axis 0 (across methods). Depending on the SciPy
# version the result keeps that axis (shape (1, n)) or drops it (shape (n,)).
# `np.ravel` yields one vote per question in both cases, whereas indexing the
# result with `[0]` collapses to a single scalar when the axis is dropped.
mode_ans = np.ravel(stats.mode([cos_ans, dis_cos_ans, w_cos_ans], axis=0).mode)
test_mode_ans = np.ravel(stats.mode([test_cos_ans, test_dis_cos_ans, test_w_cos_ans], axis=0).mode)

print('%30s: %.4f' % ('sample bagging', np.sum(mode_ans == y) / len(y)))
print('%30s: %.4f' % ('test   bagging', np.sum(test_mode_ans == test_y) / len(test_y)))

                sample bagging: 0.6800
                test   bagging: 0.6900


In [19]:
def blend_prob(x1, x2):
    """Average the per-option probabilities of the three centroid scorers.

    Each scorer's (n, 6) cosine scores are turned into a row-wise softmax
    distribution; the three distributions are then averaged with equal weight.

    Args:
        x1: per question, the list of dialogue utterances (strings).
        x2: per question, the list of candidate option strings.

    Returns:
        Array with one row per question holding the blended probability of
        each option.
    """
    def _softmax_rows(scores):
        exp_scores = np.exp(scores)
        return exp_scores / np.sum(exp_scores, axis=1).reshape(-1, 1)

    prob_a = _softmax_rows(centroid_score(x1, x2))
    prob_b = _softmax_rows(dis_centroid_score(x1, x2))
    prob_c = _softmax_rows(w_centroid_score(x1, x2))
    return (prob_a + prob_b + prob_c) / 3

In [20]:
# Predict by taking, per question, the option with the highest blended probability.
blend_prob_ans = blend_prob(x1, x2).argmax(axis=1)
test_blend_prob_ans = blend_prob(test_x1, test_x2).argmax(axis=1)

# Accuracy = number of predictions equal to the labelled answer / number of questions.
sample_hits = (blend_prob_ans == y).sum()
test_hits = (test_blend_prob_ans == test_y).sum()
print('%30s: %.4f' % ('sample blending', sample_hits / len(y)))
print('%30s: %.4f' % ('test   blending', test_hits / len(test_y)))

               sample blending: 0.6800
               test   blending: 0.6900


In [73]:
# Rebind the global `word_vectors` to a different checkpoint. The centroid helpers
# look this global up at call time, so every scoring call after this cell uses the
# newly loaded embeddings.
word_vectors = KeyedVectors.load('models/word2vec/smn-1-newest')

In [74]:
# Predictions of the three centroid methods with the currently loaded embeddings.
sample_answers = [
    attack_naive_centroid(x1, x2),
    attack_naive_dis_centroid(x1, x2),
    attack_naive_w_centroid(x1, x2),
]
test_answers = [
    attack_naive_centroid(test_x1, test_x2),
    attack_naive_dis_centroid(test_x1, test_x2),
    attack_naive_w_centroid(test_x1, test_x2),
]

# Fourth entry: majority vote over the three methods. The mode is taken along
# axis 0; `np.ravel` gives one vote per question whether or not SciPy keeps the
# reduced axis (indexing the result with `[0]` collapses to a scalar when the
# axis is dropped).
sample_answers.append(np.ravel(stats.mode(sample_answers[:3], axis=0).mode))
test_answers.append(np.ravel(stats.mode(test_answers[:3], axis=0).mode))
# Fifth entry: argmax of the blended probabilities.
sample_answers.append(np.argmax(blend_prob(x1, x2), axis=1))
test_answers.append(np.argmax(blend_prob(test_x1, test_x2), axis=1))

# One "sample-accuracy test-accuracy" pair per entry, in the order built above.
' '.join(['%.4f %.4f' % (np.sum(s_ans == y) / len(y), np.sum(t_ans == test_y) / len(test_y))
     for s_ans, t_ans in zip(sample_answers, test_answers)])

'0.6800 0.6840 0.7000 0.6860 0.7000 0.6940 0.7000 0.6900 0.7000 0.6940'

In [76]:
# Collect the predictions of all three centroid methods for every checkpoint.
# Rows are appended model by model, three per model (centroid, dis, weighted).
sample_answers = []
test_answers = []
for mn in model_names:
    # Rebinding the global is what switches the embeddings used by the attackers.
    word_vectors = KeyedVectors.load(mn)
    sample_answers.extend(
        attack(x1, x2)
        for attack in (attack_naive_centroid, attack_naive_dis_centroid, attack_naive_w_centroid)
    )
    test_answers.extend(
        attack(test_x1, test_x2)
        for attack in (attack_naive_centroid, attack_naive_dis_centroid, attack_naive_w_centroid)
    )
    print('done', mn)
# Stack into 2-D arrays: one row per (model, method), one column per question.
sample_answers = np.array(sample_answers)
test_answers = np.array(test_answers)

done models/word2vec/dual-lstm-12-best
done models/word2vec/dual-lstm-12-newest
done models/word2vec/dual-lstm-13-best
done models/word2vec/dual-lstm-13-newest
done models/word2vec/dual-lstm-14-best
done models/word2vec/dual-lstm-14-newest
done models/word2vec/dual-lstm-15-best
done models/word2vec/dual-lstm-15-newest
done models/word2vec/dual-lstm-16-best
done models/word2vec/dual-lstm-16-newest
done models/word2vec/dual-lstm-17-best
done models/word2vec/dual-lstm-17-newest
done models/word2vec/dual-lstm-18-best
done models/word2vec/dual-lstm-18-newest
done models/word2vec/dual-lstm-22-best
done models/word2vec/dual-lstm-22-newest
done models/word2vec/dual-lstm-24-best
done models/word2vec/dual-lstm-24-newest
done models/word2vec/smn-1-best
done models/word2vec/smn-1-newest


In [131]:
# Count the test questions on which two prediction rows disagree: row -12 vs row 0.
# NOTE(review): assuming `test_answers` is still the stacked array built in the
# all-model loop (three rows per model, in `model_names` order), row 0 would be the
# plain-centroid answers of the first model and row -12 those of
# 'dual-lstm-24-best' - confirm, since this cell's execution count is out of order.
np.sum(test_answers[-12] != test_answers[0])

54

In [118]:
# for i, mn in enumerate(model_names):
#     now_ans = stats.mode(test_answers[i*3:i*3+3]).mode[0]
#     with open('answer/exp/naive-' + mn.split('/')[2] + '.txt', 'w') as f:
#         f.write('id,ans\n')
#         f.write('\n'.join(['%d,%d' % (idx+1, a) for idx, a in enumerate(now_ans)]))
#         f.write('\n')

In [77]:
# Majority vote over every (model, method) row: the most common predicted option
# per question. The mode is taken along axis 0; `np.ravel` gives one vote per
# question whether or not SciPy keeps the reduced axis (indexing the result with
# `[0]` collapses to a single scalar when the axis is dropped).
sample_all_mode = np.ravel(stats.mode(sample_answers, axis=0).mode)
test_all_mode = np.ravel(stats.mode(test_answers, axis=0).mode)

In [79]:
# Accuracy of the all-model majority vote: matching predictions / number of questions.
sample_hits = (sample_all_mode == y).sum()
test_hits = (test_all_mode == test_y).sum()
print('%30s: %.4f' % ('sample all vote', sample_hits / len(y)))
print('%30s: %.4f' % ('test   all vote', test_hits / len(test_y)))

               sample all vote: 0.6800
               test   all vote: 0.6980


In [88]:
# Per sample question (summing over axis 0, i.e. over the stacked prediction rows):
# how many rows predicted something other than the labelled answer.
np.sum(sample_answers != y, axis=0)

array([60,  0,  0, 60, 60,  0, 60, 60, 54,  0,  0,  3, 60,  0,  0, 60, 60,
        0,  0,  0,  0,  0,  0,  0, 60, 18,  0,  0,  0, 60,  0,  0,  0,  0,
       60,  0,  0,  0, 16, 60, 60,  0,  0, 10,  0, 56, 16,  0, 19, 60])

In [92]:
# Per test question (summing over axis 0, i.e. over the stacked prediction rows):
# how many rows predicted something other than the labelled answer.
np.sum(test_answers != test_y, axis=0)

array([60,  0,  0, 60, 60,  0, 26,  0, 60,  0,  0, 59,  0,  0, 60,  0, 24,
        0, 60,  2, 60,  0, 60, 60,  0,  0,  0,  0,  0, 17, 60,  0,  0, 18,
        0, 60, 60,  0,  0,  0, 36,  0, 15,  0, 22,  0,  6,  0, 60, 60, 32,
       15,  0, 58,  0,  0,  0, 60,  0,  0, 58,  0,  0,  0,  0,  0, 60, 54,
       60,  0,  0,  0,  0, 60, 30, 60, 60,  0,  0, 60, 42,  0,  0, 60,  0,
        8,  0, 60, 17,  0,  0,  0, 54,  0,  0, 18,  6,  6,  0,  0, 60,  0,
        0,  0, 60,  0,  0,  0,  0, 60,  0,  0,  0,  0,  0,  0, 60,  0, 60,
        0,  0, 14, 54,  0, 60,  0,  0, 18,  0,  0,  0, 60,  0, 58,  0,  0,
        0, 33,  4,  0,  0, 60, 60, 40,  0, 60, 60,  0,  0, 60,  6,  3,  0,
        5,  0,  0, 53,  0,  0,  3,  0, 60,  0,  0, 60, 60,  0, 56, 60,  0,
        0, 60, 60,  0, 60,  0,  0,  0, 50, 60,  0, 60,  2, 60, 48,  0,  6,
        0,  0,  2, 60, 12,  0,  0, 60,  0, 60, 60,  0, 54, 60,  0, 60,  0,
        0, 54,  0,  0,  0,  0,  0,  0,  0, 60,  0, 60,  0,  0,  0, 21, 60,
        0, 25,  0, 34,  0

In [80]:
# Blended option probabilities for every checkpoint.
sample_prob = []
test_prob = []
for mn in model_names:
    # Rebinding the global is what switches the embeddings used by `blend_prob`.
    word_vectors = KeyedVectors.load(mn)
    sample_prob.append(blend_prob(x1, x2))
    test_prob.append(blend_prob(test_x1, test_x2))
    print('done', mn)
# Stack to (model, question, option).
sample_prob = np.array(sample_prob)
test_prob = np.array(test_prob)
# Sum the probabilities over models (axis 0), then pick the best option per question.
sample_all_blend_ans = sample_prob.sum(axis=0).argmax(axis=1)
test_all_blend_ans = test_prob.sum(axis=0).argmax(axis=1)

done models/word2vec/dual-lstm-12-best
done models/word2vec/dual-lstm-12-newest
done models/word2vec/dual-lstm-13-best
done models/word2vec/dual-lstm-13-newest
done models/word2vec/dual-lstm-14-best
done models/word2vec/dual-lstm-14-newest
done models/word2vec/dual-lstm-15-best
done models/word2vec/dual-lstm-15-newest
done models/word2vec/dual-lstm-16-best
done models/word2vec/dual-lstm-16-newest
done models/word2vec/dual-lstm-17-best
done models/word2vec/dual-lstm-17-newest
done models/word2vec/dual-lstm-18-best
done models/word2vec/dual-lstm-18-newest
done models/word2vec/dual-lstm-22-best
done models/word2vec/dual-lstm-22-newest
done models/word2vec/dual-lstm-24-best
done models/word2vec/dual-lstm-24-newest
done models/word2vec/smn-1-best
done models/word2vec/smn-1-newest


In [105]:
# Accuracy of the all-model probability blend: matching predictions / number of
# questions. The labels say "blend" because these are the summed-probability
# answers, not the majority vote (they previously read "all vote").
print('%30s: %.4f' % ('sample all blend', np.sum(sample_all_blend_ans == y) / len(y)))
print('%30s: %.4f' % ('test   all blend', np.sum(test_all_blend_ans == test_y) / len(test_y)))

               sample all vote: 0.6800
               test   all vote: 0.6980


## Output answers on the test data

In [None]:
# with open('answer/exp/exp_14_best_naive.txt', 'w') as f:
#     f.write('id,ans\n')
#     for i, ans in enumerate(test_cos_ans):
#         f.write('%d,%d\n' % (i+1, ans))