# naive word2vec

In [1]:
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
from scipy import spatial

# Import & Init jieba
import jieba
jieba.set_dictionary('datas/dict/dict.txt.big')
jieba.load_userdict('datas/dict/edu_dict.txt')

# Import pandas
import pandas as pd
from pandas import Series, DataFrame

# Import util
import time
import re

Building prefix dict from /Users/sunset/Talk2AI_Contest/datas/dict/dict.txt.big ...
Loading model from cache /var/folders/43/l4vp_w_x4wb11mmy_bb1jrkc0000gn/T/jieba.u857f67a870683287981bc6f5b9493ffc.cache
Loading model cost 1.957 seconds.
Prefix dict has been built succesfully.


### Load datasets

In [2]:
sample = pd.read_csv('datas/sample_test_data.txt')
x1 = [[s for s in re.sub('[A-Z]:', '\t', _).split('\t') if len(s.strip())] for _ in sample.dialogue.values]
x2 = [[s for s in re.sub('[A-Z]:', '\t', _).split('\t') if len(s.strip())] for _ in sample.options.values]
y = sample.answer.values
assert(np.sum([len(_)!=6 for _ in x2]) == 0)

test_datas = pd.read_csv('datas/AIFirstProblem.txt')
test_x1 = [[s for s in re.sub('[A-Z]:', '\t', _).split('\t') if len(s.strip())] for _ in test_datas.dialogue.values]
test_x2 = [[s for s in re.sub('[A-Z]:', '\t', _).split('\t') if len(s.strip())] for _ in test_datas.options.values]
assert(np.sum([len(_)!=6 for _ in test_x2]) == 0)

### word2vec model

In [3]:
# word_vectors = KeyedVectors.load_word2vec_format('models/word2vec/fine-tuned.txt', binary=False)
# len(word_vectors.vocab)
word_vectors = KeyedVectors.load_word2vec_format('models/word2vec/vec200_win40_iter15_mincnt5.bin', binary=True)
len(word_vectors.vocab)

65864

In [21]:
# Count total occurance to estimate word probability
total_word_cnt = np.sum([_.count for _ in word_vectors.vocab.values()])
total_word_cnt

2169132045

In [22]:
unk_cnt = 0
for a, b in zip(x1, x2):
    a_sentence = [word for s in a for word in jieba.cut(s) if word.strip() != '']
    b_sentences = [[word for word in jieba.cut(s) if word.strip() != ''] for s in b]
    
    unk_cnt += len([w for w in a_sentence if w not in word_vectors.vocab])
    unk_cnt += len([w for s in b_sentences for w in s if w not in word_vectors.vocab])
unk_cnt

64

### Naive trial - centroid

In [23]:
def centroid(sentence):
    _ = [word_vectors.word_vec(word) for word in sentence if word in word_vectors.vocab]
    return np.mean(_, axis=0) if len(_) > 0 else np.zeros(word_vectors.vector_size)

In [24]:
def attack_naive_centroid(x1, x2):
    my_cos_ans = []
    my_dot_ans = []
    for a, b in zip(x1, x2):
        a_sentence = [word for s in a for word in jieba.cut(s) if word.strip() != '']
        b_sentences = [[word for word in jieba.cut(s) if word.strip() != ''] for s in b]

        a_center = centroid(a_sentence)
        b_centers = [centroid(s) for s in b_sentences]

        score = [spatial.distance.cosine(a_center, bc) for bc in b_centers]
        my_cos_ans.append(np.argmin(score))
        
        score = [np.dot(a_center, bc) for bc in b_centers]
        my_dot_ans.append(np.argmax(score))
    return np.array(my_cos_ans), np.array(my_dot_ans)

In [25]:
cos_ans, dot_ans = attack_naive_centroid(x1, x2)

correct = np.sum(cos_ans == y)
print('%20s: %3d / %3d' % ('centroid (cos)', correct, len(y)))

correct = np.sum(dot_ans == y)
print('%20s: %3d / %3d' % ('centroid (dot)', correct, len(y)))

      centroid (cos):  30 /  50
      centroid (dot):  27 /  50


  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))


### Naive trial - weighted centroid

In [26]:
def weighted_centroid(sentence, a=1e-4):
    _ = [a / (a + word_vectors.vocab[word].count / total_word_cnt) * word_vectors.word_vec(word)
            for word in sentence if word in word_vectors.vocab]
    return np.mean(_, axis=0) if len(_) > 0 else np.zeros(word_vectors.vector_size)

In [27]:
def attack_naive_weighted_centroid(x1, x2):
    my_cos_ans = []
    my_dot_ans = []
    for a, b in zip(x1, x2):
        a_sentence = [word for s in a for word in jieba.cut(s)]
        b_sentences = [[word for word in jieba.cut(s)] for s in b]

        a_center = weighted_centroid(a_sentence)
        b_centers = [weighted_centroid(s) for s in b_sentences]

        score = [spatial.distance.cosine(a_center, bc) for bc in b_centers]
        my_cos_ans.append(np.argmin(score))
        
        score = [np.dot(a_center, bc) for bc in b_centers]
        my_dot_ans.append(np.argmax(score))
    return np.array(my_cos_ans), np.array(my_dot_ans)

In [28]:
cos_ans, dot_ans = attack_naive_weighted_centroid(x1, x2)

correct = np.sum(cos_ans == y)
print('%20s: %3d / %3d' % ('weighted centroid (cos)', correct, len(y)))

correct = np.sum(dot_ans == y)
print('%20s: %3d / %3d' % ('weighted centroid (dot)', correct, len(y)))

weighted centroid (cos):  30 /  50
weighted centroid (dot):  27 /  50


  dist = 1.0 - np.dot(u, v) / (norm(u) * norm(v))


### Naive trial - cosine similarity between two sets of words

In [29]:
def cos_sim(a_words_set, b_words_set):
    a = [word for word in a_words_set if word in word_vectors.vocab]
    b = [word for word in b_words_set if word in word_vectors.vocab]
    if len(a) == 0 or len(b) == 0:
        return 0
    return word_vectors.n_similarity(a, b)

In [30]:
def attack_naive_cos_sim(x1, x2):
    my_ans = []
    for a, b in zip(x1, x2):
        a_sentence = [word for s in a for word in jieba.cut(s)]
        b_sentences = [[word for word in jieba.cut(s)] for s in b]

        score = [cos_sim(a_sentence, bs) for bs in b_sentences]
        my_ans.append(np.argmax(score))
    return np.array(my_ans)

In [31]:
cos_ans = attack_naive_cos_sim(x1, x2)
correct = np.sum(cos_ans == y)
print('%20s: %3d / %3d' % ('cos similarity', correct, len(y)))

      cos similarity:  31 /  50


## Try bagging

In [32]:
# QwQ

## Output answer on testing datas

In [33]:
my_test_ans = attack_naive_cos_sim(test_x1, test_x2)

In [34]:
with open('answer/attack-naive-word2vec-not-fine-tune.txt', 'w') as fo:
    fo.write('id,ans\n')
    fo.write('\n'.join(['%d,%s' % (i+1, ans) for i, ans in enumerate(my_test_ans)]))
    fo.write('\n')