In [4]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import random
import os, csv, pickle

In [5]:
# Hyper parameters
n_context = 100
n_data = 10000
n_pairs = 100
noise_rate = 0.05
filename = '../evaluation/datasets/word_analogy/google_analogy.txt'
filepath = '../evaluation/datasets/word_analogy/'

In [24]:
def read_word_analogy(filename):
    with open(filename, "r") as f:
        L = f.read().splitlines()

    questions = []
    for l in L:
        l = l.lower()
        if not l.startswith(":"):
            words = l.split()
            questions.append(words)
            
    return questions

def sample_questions(filename, pair_size):
    questions = read_word_analogy(filename)
    
    # Random question pairs
    # Exclude pairs such as : (w1, w2), (w1, w3)
    pairs = {}
    words = set()
    while True:
        idx = random.randrange(len(questions))
        new_word = True
        for word in questions[idx]:
            if word in words:
                new_word = False
                break
        
        if new_word:
            word1 = questions[idx][0]
            word2 = questions[idx][1]
            word3 = questions[idx][2]
            word4 = questions[idx][3]
            pair1 = make_pair(word1, word2)
            pair2 = make_pair(word3, word4)
            
            if pair1 not in pairs and pair2 not in pairs:
                words.add(word1)
                words.add(word2)
                words.add(word3)
                words.add(word4)
                pairs[pair1] = len(pairs)
                pairs[pair2] = len(pairs)
            
        if len(pairs) >= pair_size:
            break
    
    # Generate questions
    output = []
    print(pairs)
    for question in questions:
        pair1 = make_pair(question[0], question[1])
        pair2 = make_pair(question[2], question[3])
        
        if pair1 in pairs and pair2 in pairs:
            output.append(question)
            # print questions
#             print('{} {} {} {}'.format(question[0], question[1], question[2], question[3]))
        
    return output, pairs

def make_pair(word1, word2):
    if word1 < word2:
        return (word1, word2)
    else:
        return (word2, word1)

def initialize_dict(sample_set):
    words = set()
    for question in sample_set:
        for word in question:
            words.add(word)
    
    int_to_word = {ii: word for ii, word in enumerate(words)}
    word_to_int = {word: ii for ii, word in int_to_word.items()}
    
    return int_to_word, word_to_int

def question_to_int(questions, word_to_int):
    int_question = [[word_to_int[word] for word in question] for question in questions]
    return int_question

def cosine(vec1, vec2):
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)

    return vec1.dot(vec2) / (norm1 * norm2)

def save_pkl(data, filename, local=False):
    """ Save data to file """
    # create path
    parent_dir = os.path.dirname(filename)
    if not os.path.exists(parent_dir):
        os.makedirs(parent_dir)

    # save file
    output = open(filename, 'wb')
    pickle.dump(data, output, pickle.HIGHEST_PROTOCOL)
    output.close()
    
def generate_word_level(questions):
    same_level_word = []
    word_dis_dict = {}
    for question in questions:
        for i in range(2):
            if question[i] in word_dis_dict and question[i+2] in word_dis_dict:
                if word_dis_dict[question[i]] != word_dis_dict[question[i+2]]:
                    # merge two level
                    merge_idx = word_dis_dict[question[i]]
                    level2 = same_level_word[word_dis_dict[question[i+2]]]
                    level = same_level_word[merge_idx]|level2
                    same_level_word[merge_idx] = level
                    for word in level2:
                        word_dis_dict[word] = merge_idx
                    continue

            if question[i] in word_dis_dict:
                idx = word_dis_dict[question[i]]
                level = same_level_word[idx]
                same_level_word[idx].add(question[i+2])
                
            elif question[i+2] in word_dis_dict:
                idx = word_dis_dict[question[i+2]]
                level = same_level_word[idx]
                same_level_word[idx].add(question[i])
                
            else:
                idx = len(same_level_word)
                level = {question[i], question[i+2]}
                same_level_word.append(level)
            
            word_dis_dict[question[i]] = word_dis_dict[question[i+2]] = idx
            
    return same_level_word, word_dis_dict

def generate_original_distribution(questions, context_size):
    same_level_word, word_dis_dict = generate_word_level(questions)
    original_distribution = []
    for i in range(len(same_level_word)):
        dis = np.random.rand(context_size)
        original_distribution.append(dis)
    
    return original_distribution, word_dis_dict

def generate_noise_distribution(n_pairs, context_size):
    noise_distribution = []
    for i in range(n_pairs):
        dis = np.random.rand(context_size)
        noise_distribution.append(dis)
    
    return noise_distribution

In [25]:
# Initiialize
questions, pairs = sample_questions(filename, n_pairs)
int_to_word, word_to_int = initialize_dict(questions)
questions = question_to_int(questions, word_to_int)
n_question = len(questions)

# Change new parameters
n_word = len(int_to_word)
output_dict = 'output/{}-context-{}-data-{}-questions/'.format(n_context, n_data, n_question)

# Initialize dummy contexts
int_to_cont = {context: context for context in range(n_context)}
cont_to_int = {word: ii for ii, word in int_to_cont.items()}

{('writing', 'wrote'): 0, ('enhanced', 'enhancing'): 1, ('laredo', 'texas'): 2, ('florida', 'hialeah'): 3, ('provide', 'provides'): 4, ('listen', 'listens'): 5, ('flew', 'flying'): 6, ('selling', 'sold'): 7, ('copenhagen', 'denmark'): 8, ('lusaka', 'zambia'): 9, ('bright', 'brighter'): 10, ('weak', 'weaker'): 11, ('cheap', 'cheaper'): 12, ('easier', 'easy'): 13, ('road', 'roads'): 14, ('elephant', 'elephants'): 15, ('heavier', 'heavy'): 16, ('old', 'older'): 17, ('france', 'french'): 18, ('cambodia', 'cambodian'): 19, ('albania', 'tirana'): 20, ('algeria', 'algiers'): 21, ('father', 'mother'): 22, ('aunt', 'uncle'): 23, ('tunis', 'tunisia'): 24, ('beijing', 'china'): 25, ('canada', 'dollar'): 26, ('brazil', 'real'): 27, ('hit', 'hitting'): 28, ('sat', 'sitting'): 29, ('kathmandu', 'nepal'): 30, ('kenya', 'nairobi'): 31, ('convenient', 'inconvenient'): 32, ('certain', 'uncertain'): 33, ('jamaica', 'kingston'): 34, ('greenland', 'nuuk'): 35, ('quiet', 'quietly'): 36, ('happily', 'happy')

In [26]:
# Sample context distribution for each words
original_distribution, word_dis_dict = generate_original_distribution(questions, n_context)
noise_distribution = generate_noise_distribution(n_pairs, n_context)
context_distribution = {}

for pair, idx in pairs.items():
    word1, word2 = pair
    word1 = word_to_int[word1]
    word2 = word_to_int[word2]
    
    # word1
    dis = (1- noise_rate) * original_distribution[word_dis_dict[word1]].copy() + noise_rate * noise_distribution[idx]
    dis = dis / np.sum(dis)
    context_distribution[word1] = dis
    
    # word2
    dis = (1- noise_rate) * original_distribution[word_dis_dict[word2]].copy() + noise_rate * noise_distribution[idx]
    dis = dis / np.sum(dis)
    context_distribution[word2] = dis

In [48]:
# Test for word analogy property
for i in range(n_question):
    question = questions[i]
    pair1_dis = cosine(context_distribution[question[0]], context_distribution[question[1]])
    pair2_dis = cosine(context_distribution[question[2]], context_distribution[question[3]])
    print('Pair 1 {}-{}: {}, Pair 2 {}-{}: {}, diff: {}'.format(int_to_word[question[0]], int_to_word[question[1]], pair1_dis,
                                                                int_to_word[question[2]], int_to_word[question[3]], pair2_dis,
                                                                abs(pair1_dis - pair2_dis)))

Pair 1 manila-philippines: 0.7654407703943393, Pair 2 ottawa-canada: 0.7654407703943393, diff: 0.0
Pair 1 invent-inventing: 0.7768114965702747, Pair 2 listen-listening: 0.7768114965702747, diff: 0.0
Pair 1 listen-listening: 0.7768114965702747, Pair 2 invent-inventing: 0.7768114965702747, diff: 0.0
Pair 1 israel-israeli: 0.7524785736928562, Pair 2 peru-peruvian: 0.7524785736928562, diff: 0.0
Pair 1 peru-peruvian: 0.7524785736928562, Pair 2 israel-israeli: 0.7524785736928562, diff: 0.0


In [43]:
# Random pairs
pair1_dis = cosine(context_distribution[questions[0][0]], context_distribution[questions[1][3]])
pair2_dis = cosine(context_distribution[questions[2][1]], context_distribution[questions[3][2]])
print('Pair 1 {}-{}: {}, Pair 2 {}-{}: {}, diff: {}'.format(int_to_word[questions[0][0]], int_to_word[questions[1][3]], pair1_dis,
                                                            int_to_word[questions[2][1]], int_to_word[questions[3][2]], pair2_dis,
                                                            abs(pair1_dis - pair2_dis)))

Pair 1 manila-listening: 0.7468246958433704, Pair 2 listening-peru: 0.836128336664007, diff: 0.08930364082063658


In [45]:
# Test word analogy score
result = []
for question in questions:
    # Predict
    answer = question[3]
    pred = context_distribution[question[1]] - context_distribution[question[0]] + context_distribution[question[2]]
    sim_vector = np.zeros(n_word)
    for i in range(n_word):
        if i in question[:3]:
            sim_vector[i] = 0
        else:
            sim_vector[i] = cosine(pred, context_distribution[i])
    pred = np.argsort(sim_vector)[-1]
    print(sim_vector)
    result.append(1 if pred == answer else 0)

print('Acc: ', np.mean(result))

[0.82236467 0.         0.76675514 0.         0.82236467 0.76780085
 0.76675514 1.         0.76780085 0.         0.74120291 0.74120291]
[0.83612834 0.7468247  0.         0.7468247  0.83612834 0.
 1.         0.76675514 0.         0.76675514 0.74363747 0.74363747]
[0.83612834 0.7468247  1.         0.7468247  0.83612834 0.
 0.         0.76675514 0.         0.76675514 0.74363747 0.74363747]
[0.         0.77935171 0.74363747 0.77935171 0.         0.78587478
 0.74363747 0.74120291 0.78587478 0.74120291 1.         0.        ]
[0.         0.77935171 0.74363747 0.77935171 0.         0.78587478
 0.74363747 0.74120291 0.78587478 0.74120291 0.         1.        ]
Acc:  1.0


In [19]:
# Sample training set
data = []
for i in range(n_data):
    # Sample word
    word = random.randrange(n_word)
    
    # Sample context
    dis = context_distribution[word]
    context = np.argmax(np.random.multinomial(1, dis))
    
    data.append([word, context])

In [20]:
# make directories
if not os.path.exists(output_dict):
    os.makedirs(output_dict)

# Save data
print('Writing processed data back to file...')
output = open(output_dict + 'data.csv', "w", newline='')
writer = csv.writer(output)
writer.writerows(data)
output.close()

# Save dictionaries
save_pkl(int_to_word, output_dict + 'dict/int_to_vocab.dict')
save_pkl(word_to_int, output_dict + 'dict/vocab_to_int.dict')
save_pkl(cont_to_int, output_dict + 'dict/cont_to_int.dict')
save_pkl(int_to_cont, output_dict + 'dict/int_to_cont.dict')
print('Done!')

# Save questions
with open(filepath + 'test-{}-questions.txt'.format(n_question), 'w') as f:
    f.write(': test-category\n')
    for i in range(n_question):
        question = questions[i]
        f.write('{} {} {} {}\n'.format(int_to_word[question[0]], int_to_word[question[1]], int_to_word[question[2]], int_to_word[question[3]]))

Writing processed data back to file...
Done!


In [21]:
# Test for word analogy property
for i in range(n_question):
    question = questions[i]
    print('{}-{}-{}-{}'.format(int_to_word[question[0]], int_to_word[question[1]], int_to_word[question[2]], int_to_word[question[3]]))

doha-qatar-jakarta-indonesia
havana-cuba-kampala-uganda
yerevan-armenia-copenhagen-denmark
irvine-california-chicago-illinois
happy-happily-obvious-obviously
immediate-immediately-typical-typically
obvious-obviously-happy-happily
typical-typically-immediate-immediately
loud-louder-low-lower
low-lower-loud-louder
norway-norwegian-sweden-swedish
sweden-swedish-norway-norwegian
banana-bananas-man-men
cloud-clouds-computer-computers
computer-computers-cloud-clouds
man-men-banana-bananas
