In [4]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import random
import os, csv, pickle

In [102]:
# Hyper parameters
# Number of dummy context ids; also the length of every sampled context
# distribution vector.
n_context = 50
# Number of (word, context) training rows to sample.
n_data = 50000
# Target number of word pairs to sample from the analogy file.
n_pairs = 10
# Weight of the per-pair noise vector when it is mixed with a word's base
# distribution (the base distribution gets weight 1 - noise_rate).
noise_rate = 0.2
# Analogy question file the questions are sampled from.
filename = '../evaluation/datasets/word_analogy/google_analogy.txt'
# Directory the sampled test question file is written to.
filepath = '../evaluation/datasets/word_analogy/'

In [103]:
def read_word_analogy(filename):
    """Read analogy questions from a text file.

    Every line is lower-cased and split on whitespace.  Lines starting with
    ":" (category headers) and lines without any token (blank or
    whitespace-only) are skipped.

    Args:
        filename: Path of the analogy file to read.

    Returns:
        A list of questions in file order, each a list of lower-cased words.
    """
    questions = []
    with open(filename, "r") as f:
        for line in f.read().splitlines():
            if line.startswith(":"):
                continue
            words = line.lower().split()
            # A blank line would otherwise become an empty question and break
            # callers that index question[0] .. question[3].
            if words:
                questions.append(words)

    return questions

def sample_questions(filename, pair_size):
    """Sample analogy questions built from word pairs that share no word.

    Questions are visited in a uniformly random order.  A question is
    accepted when none of its words occurs in a previously accepted
    question; its two word pairs are then registered with the next free ids.
    Sampling stops as soon as at least ``pair_size`` pairs are registered
    (an accepted question registers two pairs, so the final count can exceed
    ``pair_size`` by one).  Finally, every question of the file whose two
    pairs are both registered is returned.

    Args:
        filename: Path of the analogy file, read with ``read_word_analogy``.
        pair_size: Minimum number of word pairs to collect.

    Returns:
        A tuple ``(output, pairs)``: the selected questions (lists of words,
        in file order) and a dict mapping each pair, as built by
        ``make_pair``, to its integer id.

    Raises:
        ValueError: If the file holds no usable question, or if it cannot
            supply ``pair_size`` pairs without reusing a word.
    """
    # A question needs at least four words to form two pairs.
    questions = [q for q in read_word_analogy(filename) if len(q) >= 4]
    if not questions:
        raise ValueError('no analogy questions found in {!r}'.format(filename))

    # Random question pairs
    # Exclude pairs such as : (w1, w2), (w1, w3)
    pairs = {}
    words = set()

    # Visit every question exactly once in random order.  Re-drawing a
    # question can never succeed (a rejected question stays rejected and an
    # accepted one now clashes with its own words), so this samples from the
    # same distribution as drawing with replacement, but it always terminates.
    order = list(range(len(questions)))
    random.shuffle(order)
    for idx in order:
        question = questions[idx]
        if any(word in words for word in question):
            continue

        pair1 = make_pair(question[0], question[1])
        pair2 = make_pair(question[2], question[3])
        words.update(question[:4])
        pairs[pair1] = len(pairs)
        pairs[pair2] = len(pairs)

        if len(pairs) >= pair_size:
            break
    else:
        raise ValueError(
            'could only collect {} of the {} requested word pairs'.format(
                len(pairs), pair_size))

    # Generate questions
    output = [
        question for question in questions
        if make_pair(question[0], question[1]) in pairs
        and make_pair(question[2], question[3]) in pairs
    ]

    return output, pairs

def make_pair(word1, word2):
    """Return the two words as a tuple with the smaller one first."""
    return (word1, word2) if word1 < word2 else (word2, word1)

def initialize_dict(sample_set):
    """Build the id <-> word lookup tables for all words in the questions.

    Ids are assigned in order of first appearance, so the same input always
    produces the same mapping.

    Args:
        sample_set: Iterable of questions, each an iterable of hashable words.

    Returns:
        A tuple ``(int_to_word, word_to_int)`` of mutually inverse dicts.
    """
    # dict.fromkeys de-duplicates while keeping insertion order.  Enumerating
    # a set instead would hand out ids that change between interpreter runs,
    # because string hashing is randomised per process.
    ordered_words = dict.fromkeys(
        word for question in sample_set for word in question)

    int_to_word = dict(enumerate(ordered_words))
    word_to_int = {word: ii for ii, word in int_to_word.items()}

    return int_to_word, word_to_int

def question_to_int(questions, word_to_int):
    """Replace every word of every question by its id from ``word_to_int``.

    Returns a new list of lists; a word missing from ``word_to_int`` raises
    ``KeyError``.
    """
    encoded = []
    for question in questions:
        encoded.append([word_to_int[word] for word in question])
    return encoded

def cosine(vec1, vec2):
    """Return the cosine similarity of two vectors.

    Args:
        vec1: First vector (numpy array or any array-like).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``vec1 . vec2 / (|vec1| * |vec2|)``, or ``0.0`` when either vector
        has zero norm.
    """
    # Accept plain sequences as well as arrays.
    vec1 = np.asarray(vec1)
    vec2 = np.asarray(vec2)

    denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # A zero vector has no direction; without this guard the division is
    # 0/0, which yields nan and a RuntimeWarning.
    if denom == 0:
        return 0.0

    return vec1.dot(vec2) / denom

def save_pkl(data, filename, local=False):
    """Pickle ``data`` to ``filename``, creating parent directories as needed.

    Args:
        data: Object to pickle.
        filename: Destination path.  A bare file name is written to the
            current working directory.
        local: Unused; kept so existing callers keep working.
    """
    # create path
    parent_dir = os.path.dirname(filename)
    # dirname() is '' for a bare file name and os.makedirs('') raises, so
    # only create a directory when there is one.  exist_ok avoids the race
    # between checking for the directory and creating it.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # save file; the context manager closes the handle even if pickling fails
    with open(filename, 'wb') as output:
        pickle.dump(data, output, pickle.HIGHEST_PROTOCOL)
    
def generate_word_level(questions):
    """Group words that occupy matching positions of an analogy question.

    For each question, word 0 is linked with word 2 and word 1 with word 3;
    linked words end up in the same group, and two existing groups are
    merged when a question links them.

    Returns:
        A tuple ``(same_level_word, word_dis_dict)``: the list of word sets
        and a dict mapping every word to the index of its group in that list.
    """
    groups = []
    group_of = {}
    for question in questions:
        for offset in (0, 1):
            left = question[offset]
            right = question[offset + 2]
            left_known = left in group_of
            right_known = right in group_of

            if left_known and right_known and group_of[left] != group_of[right]:
                # Both words already belong to different groups: fold the
                # right word's group into the left word's group.  The
                # absorbed set keeps its slot in the list, so the indices of
                # all other groups stay valid.
                keep = group_of[left]
                absorbed = groups[group_of[right]]
                groups[keep] = groups[keep] | absorbed
                for member in absorbed:
                    group_of[member] = keep
                continue

            if left_known:
                idx = group_of[left]
                groups[idx].add(right)
            elif right_known:
                idx = group_of[right]
                groups[idx].add(left)
            else:
                # Neither word seen before: open a new group.
                idx = len(groups)
                groups.append({left, right})

            group_of[left] = group_of[right] = idx

    return groups, group_of

def generate_original_distribution(questions, context_size):
    """Draw one random base vector per word group of the questions.

    The groups come from ``generate_word_level``; each gets a vector of
    ``context_size`` values drawn with ``np.random.rand``.

    Returns:
        A tuple ``(original_distribution, word_dis_dict)``: the list of
        vectors and the word -> group-index dict indexing into that list.
    """
    groups, word_dis_dict = generate_word_level(questions)
    original_distribution = [np.random.rand(context_size) for _ in groups]
    return original_distribution, word_dis_dict

def generate_noise_distribution(n_pairs, context_size):
    """Return ``n_pairs`` random vectors of length ``context_size``.

    Each vector is drawn independently with ``np.random.rand``.
    """
    return [np.random.rand(context_size) for _ in range(n_pairs)]

In [104]:
# Initialize
# Sample the analogy questions and the word pairs they are built from, then
# build the word <-> id lookup tables.
questions, pairs = sample_questions(filename, n_pairs)
int_to_word, word_to_int = initialize_dict(questions)
# From here on `questions` holds integer word ids instead of strings.
questions = question_to_int(questions, word_to_int)
n_question = len(questions)

# Parameters derived from the sample
n_word = len(int_to_word)
# Output directory; its name encodes the context count, the number of data
# rows and the number of questions.
output_dict = 'output/{}-context-{}-data-{}-questions/'.format(n_context, n_data, n_question)

# Initialize dummy contexts
# Contexts are the integers 0 .. n_context - 1, so both tables map every
# context to itself.
int_to_cont = {context: context for context in range(n_context)}
cont_to_int = {word: ii for ii, word in int_to_cont.items()}

In [105]:
# Sample context distribution for each word
original_distribution, word_dis_dict = generate_original_distribution(questions, n_context)
# One noise vector per sampled pair.  Pair ids run up to len(pairs) - 1,
# which can be n_pairs when n_pairs is odd (every accepted question
# registers two pairs), so size the list by len(pairs) rather than n_pairs.
noise_distribution = generate_noise_distribution(len(pairs), n_context)
context_distribution = {}

for pair, idx in pairs.items():
    # Both words of a pair share the same noise vector; each one mixes it
    # with the base vector of its own word group.
    for word in pair:
        word_id = word_to_int[word]
        dis = ((1 - noise_rate) * original_distribution[word_dis_dict[word_id]]
               + noise_rate * noise_distribution[idx])
        # Normalise so the vector sums to 1 and can be used as a
        # probability distribution over contexts.
        context_distribution[word_id] = dis / np.sum(dis)

In [106]:
# Test for word analogy property: print the within-pair cosine similarity of
# both pairs of every question, and how far the two values are apart.
for question in questions:
    w1, w2, w3, w4 = question[0], question[1], question[2], question[3]
    pair1_dis = cosine(context_distribution[w1], context_distribution[w2])
    pair2_dis = cosine(context_distribution[w3], context_distribution[w4])
    print('Pair 1 {}-{}: {}, Pair 2 {}-{}: {}, diff: {}'.format(
        int_to_word[w1], int_to_word[w2], pair1_dis,
        int_to_word[w3], int_to_word[w4], pair2_dis,
        abs(pair1_dis - pair2_dis)))

Pair 1 antananarivo-madagascar: 0.8229622451983442, Pair 2 dushanbe-tajikistan: 0.8182251706647685, diff: 0.004737074533575636
Pair 1 dushanbe-tajikistan: 0.8182251706647685, Pair 2 minsk-belarus: 0.8132208179929188, diff: 0.005004352671849732
Pair 1 minsk-belarus: 0.8132208179929188, Pair 2 santiago-chile: 0.8166286521664511, diff: 0.003407834173532298
Pair 1 santiago-chile: 0.8166286521664511, Pair 2 antananarivo-madagascar: 0.8229622451983442, diff: 0.00633359303189307
Pair 1 high-higher: 0.8596074666392032, Pair 2 safe-safer: 0.862514809411728, diff: 0.0029073427725248013
Pair 1 safe-safer: 0.862514809411728, Pair 2 high-higher: 0.8596074666392032, diff: 0.0029073427725248013
Pair 1 cool-coolest: 0.8371847899503484, Pair 2 short-shortest: 0.844993900174008, diff: 0.007809110223659599
Pair 1 short-shortest: 0.844993900174008, Pair 2 cool-coolest: 0.8371847899503484, diff: 0.007809110223659599
Pair 1 decreasing-decreased: 0.7679342785505269, Pair 2 paying-paid: 0.781020824990799, dif

In [107]:
# Random pairs
# Two word pairs taken across questions.  The positions are fixed (first two
# and last two questions), not random draws, so depending on the sample both
# members of a pair may be the same word.
left1, right1 = questions[0][0], questions[1][2]
left2, right2 = questions[-1][1], questions[-2][3]
pair1_dis = cosine(context_distribution[left1], context_distribution[right1])
pair2_dis = cosine(context_distribution[left2], context_distribution[right2])
print('Pair 1 {}-{}: {}, Pair 2 {}-{}: {}, diff: {}'.format(
    int_to_word[left1], int_to_word[right1], pair1_dis,
    int_to_word[left2], int_to_word[right2], pair2_dis,
    abs(pair1_dis - pair2_dis)))

Pair 1 antananarivo-minsk: 0.9878241549835707, Pair 2 paid-paid: 1.0000000000000002, diff: 0.012175845016429543


In [108]:
# Test word analogy score
result = []
for question in questions:
    answer = question[3]

    # Predict: the word whose distribution is closest to b - a + c
    target = context_distribution[question[1]] - context_distribution[question[0]] + context_distribution[question[2]]

    # The three question words must never be predicted.  Mask them with -inf
    # rather than 0: a cosine can be negative, and a 0 entry would then
    # outrank every real candidate.
    sim_vector = np.full(n_word, -np.inf)
    for i in range(n_word):
        if i not in question[:3]:
            sim_vector[i] = cosine(target, context_distribution[i])

    pred = np.argmax(sim_vector)
    result.append(1 if pred == answer else 0)

print('Acc: ', np.mean(result))

Acc:  1.0


In [109]:
# Sample training set: n_data rows of [word id, context id]
data = []
for _ in range(n_data):
    # Pick a word uniformly at random ...
    word = random.randrange(n_word)

    # ... then draw one context from that word's context distribution; the
    # position of the single 1 in the multinomial draw is the context id.
    one_hot = np.random.multinomial(1, context_distribution[word])
    data.append([word, np.argmax(one_hot)])

In [110]:
# make directories (exist_ok avoids the check-then-create race)
os.makedirs(output_dict, exist_ok=True)

# Save data
print('Writing processed data back to file...')
# The context manager closes the file even if writing fails part-way.
with open(output_dict + 'data.csv', "w", newline='') as output:
    writer = csv.writer(output)
    writer.writerows(data)

# Save dictionaries
save_pkl(int_to_word, output_dict + 'dict/int_to_vocab.dict')
save_pkl(word_to_int, output_dict + 'dict/vocab_to_int.dict')
save_pkl(cont_to_int, output_dict + 'dict/cont_to_int.dict')
save_pkl(int_to_cont, output_dict + 'dict/int_to_cont.dict')
print('Done!')

# Save questions as words, in the analogy file format (one category header
# followed by one question per line)
with open(filepath + 'test-{}-questions.txt'.format(n_question), 'w') as f:
    f.write(': test-category\n')
    for question in questions:
        f.write('{} {} {} {}\n'.format(int_to_word[question[0]], int_to_word[question[1]], int_to_word[question[2]], int_to_word[question[3]]))

Writing processed data back to file...
Done!


In [111]:
# List the sampled questions as words, one question per line
for question in questions:
    print('{}-{}-{}-{}'.format(*(int_to_word[word_id] for word_id in question[:4])))

antananarivo-madagascar-dushanbe-tajikistan
dushanbe-tajikistan-minsk-belarus
minsk-belarus-santiago-chile
santiago-chile-antananarivo-madagascar
high-higher-safe-safer
safe-safer-high-higher
cool-coolest-short-shortest
short-shortest-cool-coolest
decreasing-decreased-paying-paid
paying-paid-decreasing-decreased
