In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import timeit
import time
import json

In [2]:
class BATS_Analogy:
    '''An analogy from the BATS test set, which may allow multiple correct answers.

    While most analogies are of the form a:b::c:d (read as "a is to b as
    c is to d"), a BATS analogy is of the form
    a:b1,b2,...,bn::c:d1,d2,...,dm (read as "a is to b1 or b2 or ... or bn
    as c is to d1 or d2 or ... or dm"). An example would be:
    "snake is to nest/pit/aquarium as tiger is to den/cage."

    Attributes:
        a: the source word of the first pair.
        b_set: the acceptable partners of ``a``.
        c: the source word of the second pair (the question word).
        solution_set: the acceptable answers for ``c`` (passed in as ``d_set``).
    '''
    def __init__(self, a, b_set, c, d_set):
        self.a = a
        self.b_set = b_set
        self.c = c
        # Stored under a different name than the parameter: these are the answers
        # a prediction is checked against.
        self.solution_set = d_set
        

In [3]:
def cosine_similarity_normalized(A, B):
    '''Return the matrix product A@B.

    If the rows of A and the columns of B are normalized to unit length, this
    is equivalent to calculating the cosine similarity of each row of A with
    each column of B, i.e. the cosine similarity of the ith row of A with the
    jth column of B is given by (A@B)[i,j]. No normalization is performed
    here; for unnormalized inputs the result is the plain dot product.
    '''
    return A @ B

In [15]:
def calculate_batch_accuracy(analogies, embeddings, words_to_indices, indices_to_words, sim):
    '''Count how many analogies of a batch are solved by vector offset (b + c - a).

    For every analogy one prediction vector is built per word in its b_set.
    All predictions of the batch are scored against the whole vocabulary with
    a single call to ``sim``; the analogy's own a, b and c words are excluded
    from the candidates, and the analogy counts as solved if the best-scoring
    word of at least one of its predictions is in its solution_set.

    Args:
        analogies: sequence of objects with attributes a, b_set, c, solution_set.
            Every a, c and b_set word must be a key of ``words_to_indices``
            (this function does not check for missing words).
        embeddings: 2-D array with one row per vocabulary word.
        words_to_indices: dict mapping a word to its row in ``embeddings``.
        indices_to_words: dict mapping a row index to its word.
        sim: callable ``sim(embeddings, predictions.T)`` returning an array with
            one row per vocabulary word and one column per prediction, where a
            larger value means more similar. The returned array is modified.

    Returns:
        The number of analogies in the batch that were solved (int).
    '''
    if len(analogies) == 0:
        return 0

    # Each analogy owns a contiguous [start, end) run of prediction columns, one
    # per b word. Spans are stored by position so that the same analogy object
    # appearing twice in a batch still gets two independent runs.
    spans = []
    cursor = 0
    for analogy in analogies:
        spans.append((cursor, cursor + len(analogy.b_set)))
        cursor += len(analogy.b_set)

    all_predictions = np.empty((cursor, embeddings.shape[1]), dtype=np.float32)
    for analogy, (start, end) in zip(analogies, spans):
        for row, b in enumerate(analogy.b_set, start):
            all_predictions[row] = embeddings[words_to_indices.get(b)]
        # prediction = b + c - a, applied to all rows of this analogy at once
        all_predictions[start:end] += embeddings[words_to_indices.get(analogy.c)]
        all_predictions[start:end] -= embeddings[words_to_indices.get(analogy.a)]

    all_distances = sim(embeddings, all_predictions.T)

    # The words that make up the question may not be returned as its answer.
    # -np.inf is used because the np.NINF alias no longer exists in NumPy 2.0.
    for analogy, (start, end) in zip(analogies, spans):
        all_distances[words_to_indices.get(analogy.a), start:end] = -np.inf
        all_distances[words_to_indices.get(analogy.c), start:end] = -np.inf
        for column, b in enumerate(analogy.b_set, start):
            # only the b that produced this particular prediction is excluded
            all_distances[words_to_indices.get(b), column] = -np.inf

    prediction_indices = np.argmax(all_distances, axis=0)

    num_correct = 0
    for analogy, (start, end) in zip(analogies, spans):
        # an analogy is counted once, no matter how many of its predictions hit
        if any(indices_to_words.get(int(index)) in analogy.solution_set
               for index in prediction_indices[start:end]):
            num_correct += 1
    return num_correct
    

In [16]:

def construct_analogies(lines):
    '''Build every ordered analogy that can be formed from the lines of a BATS file.

    Each line holds a pair, e.g. "cat cats" or "dog dogs", and the
    corresponding analogy would be "cat is to cats as dog is to dogs". The
    second field of a line may list several alternatives separated by "/".
    Every line is combined with every other line (in both orders), but never
    with itself.

    Args:
        lines: list of strings, one pair per line.

    Returns:
        A list of BATS_Analogy objects.
    '''
    # Parse each line exactly once. Blank lines (e.g. a trailing newline at the
    # end of the file) and lines without a second field are skipped, since they
    # carry no pair.
    pairs = []
    for line in lines:
        fields = line.split()
        if len(fields) < 2:
            continue
        pairs.append((fields[0], fields[1].split("/")))

    analogies = []
    for i, (a, b_set) in enumerate(pairs):
        for j, (c, solution_set) in enumerate(pairs):
            if i == j:
                continue
            # copy the lists so that no two analogies share a mutable word list
            analogies.append(BATS_Analogy(a, list(b_set), c, list(solution_set)))
    return analogies

In [6]:
def get_analogies(filepath):
    '''Read a BATS test file and return the analogies built from its lines.

    Args:
        filepath: path to a text file with one word pair per line.

    Returns:
        The list produced by construct_analogies for the file's lines.
    '''
    # the with-block guarantees the handle is closed even if reading fails
    with open(filepath, 'r') as file:
        lines = file.readlines()
    return construct_analogies(lines)

In [7]:
def load_vectors(filepath, normalize=1):
    '''Load word vectors from a .txt file.

    Each line has the format "word [vector components]", e.g. for a
    3-dimensional vector one line could be "the 0.004 -10.499 0.000 \\n".
    Blank lines are ignored. The vector length is taken from the first
    non-blank line.

    Args:
        filepath: path to the UTF-8 encoded vector file.
        normalize: if truthy, every vector is scaled to unit length. A vector
            whose norm is zero is left unchanged instead of becoming NaN.

    Returns:
        A tuple (embeddings, word_to_index, index_to_word): a float32 numpy
        array of size vxd, where v is the size of the vocabulary and d is the
        length of each vector, a dictionary with words as keys and row indices
        as values, and a dictionary with row indices as keys and words as values.

    Raises:
        ValueError: if the file contains no vectors.
    '''
    with open(filepath, 'r', encoding='utf-8') as file:
        # keep only lines that actually contain something
        lines = [line for line in file if line.strip()]
    if not lines:
        raise ValueError("no word vectors found in file: {0}".format(filepath))

    word_to_index = dict()
    index_to_word = dict()
    embeddings = np.ones((len(lines), len(lines[0].split()) - 1), dtype=np.float32)
    for i, line in enumerate(lines):
        vec_info = line.split()
        word = vec_info[0]
        vec = np.array([np.float32(component) for component in vec_info[1:]])
        if normalize:
            norm = np.linalg.norm(vec)
            # dividing a zero vector by its norm would fill the row with NaN
            if norm > 0:
                vec = vec / norm
        embeddings[i] = vec
        word_to_index[word] = i
        index_to_word[i] = word
    return embeddings, word_to_index, index_to_word
    

In [8]:
def remove_unsolvable(analogies, word_to_index):
    '''Keep only the analogies that can be posed with the given vocabulary.

    An analogy is dropped when its a word or its c word is missing from
    word_to_index, or when none of its b words is present. Surviving analogies
    are rebuilt with only the b words that are in the vocabulary; their
    solution_set is passed through unchanged.

    Args:
        analogies: iterable of objects with attributes a, b_set, c, solution_set.
        word_to_index: dict mapping known words to their index.

    Returns:
        A new list of BATS_Analogy objects.
    '''
    kept = []
    for candidate in analogies:
        source_known = word_to_index.get(candidate.a) is not None
        question_known = word_to_index.get(candidate.c) is not None
        if not (source_known and question_known):
            continue

        known_bs = [b for b in candidate.b_set if word_to_index.get(b) is not None]
        if not known_bs:
            # no usable b word: there is nothing to build a prediction from
            continue

        kept.append(BATS_Analogy(candidate.a, known_bs, candidate.c, candidate.solution_set))
    return kept

In [9]:
# Location of the pretrained vector file, relative to the notebook's working directory.
vector_path = "./Pretrained_Vectors/glove.6B/glove.6B.50d/glove.6B.50d.txt"
# Load the embedding matrix and both lookup tables; load_vectors normalizes
# every vector by default (normalize=1).
embeddings, word_to_index, index_to_word = load_vectors(vector_path)

In [10]:
# BATS test file used for the experiments below, relative to the working directory.
bats_path = "./BATS_3.0/BATS_3.0/4_Lexicographic_semantics/L03 [hyponyms - misc].txt"
# Build every ordered analogy from the pairs in that file.
analogies = get_analogies(bats_path)

In [11]:
# Drop analogies whose words are missing from the loaded vocabulary.
# NOTE: this rebinds `analogies`, so re-running this cell filters the already
# filtered list rather than the freshly loaded one.
analogies = remove_unsolvable(analogies, word_to_index)

In [17]:
def compute_accuracy_on_file(filepath, embeddings, word_to_index, index_to_word, sim, batch_size=100):
    '''Evaluate the embeddings on all solvable analogies of one BATS file.

    The analogies are processed in consecutive batches of at most
    ``batch_size`` analogies, and one progress line is printed per batch.

    Args:
        filepath: path to the BATS test file.
        embeddings: 2-D array with one row per vocabulary word.
        word_to_index: dict mapping a word to its row in ``embeddings``.
        index_to_word: dict mapping a row index to its word.
        sim: similarity callable forwarded to calculate_batch_accuracy.
        batch_size: maximum number of analogies per batch; converted with
            int(), so a whole-valued float such as 256.0 is accepted.

    Returns:
        A tuple (total_correct, total_attempted).

    Raises:
        ValueError: if batch_size is smaller than 1 (which would never advance).
    '''
    # Validate first so that a bad batch size fails before any file is read.
    batch_size = int(batch_size)
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    analogies = get_analogies(filepath)
    analogies = remove_unsolvable(analogies, word_to_index)
    total_correct = 0
    for start_ind in range(0, len(analogies), batch_size):
        end_ind = min(start_ind + batch_size, len(analogies))
        batch = analogies[start_ind:end_ind]
        total_correct += calculate_batch_accuracy(batch, embeddings, word_to_index, index_to_word, sim)
        # report the range that was just processed (start inclusive, end exclusive)
        print("Completed batch: ", start_ind, "through ", end_ind)
    return total_correct, len(analogies)

In [18]:
def record_analogy_tests(test_filepaths, model_name, embeddings, word_to_index, index_to_word, sim):
    '''Run the analogy test on several files and collect one result record per file.

    Each file is first attempted with a batch size of 512. Whenever the
    evaluation raises, the batch size is halved and the file is retried; if it
    still fails with a batch size of 1, a message is printed and the file is
    left out of the results.

    Args:
        test_filepaths: iterable of "/"-separated paths to BATS test files.
        model_name: label stored in every result record.
        embeddings, word_to_index, index_to_word, sim: forwarded unchanged to
            compute_accuracy_on_file.

    Returns:
        A list of dicts with the keys 'Model', 'Test', 'Total Correct' and
        'Total Attempted'.
    '''
    results = []
    for filepath in test_filepaths:
        filename = filepath.split("/")[-1]
        batch_size = 512
        while True:
            try:
                # the current batch size has to be passed on, otherwise every
                # retry would repeat exactly the same call
                test_correct, test_attempted = compute_accuracy_on_file(
                    filepath, embeddings, word_to_index, index_to_word, sim, batch_size=batch_size)
            except Exception as error:
                # `Exception` (not a bare except) so Ctrl-C still interrupts the run
                if batch_size == 1:
                    print("Failed to Solve Analogies with batch_size = 1 on file", filename, "-", repr(error))
                    break
                batch_size //= 2  # integer halving keeps the batch size usable for slicing
            else:
                results.append({'Model': model_name, 'Test': filename, 'Total Correct': test_correct, 'Total Attempted': test_attempted})
                break
    return results

In [None]:
# Time one full evaluation of the selected BATS file.
start = time.time()
correct, attempted = compute_accuracy_on_file(bats_path, embeddings, word_to_index, index_to_word, cosine_similarity_normalized)
diff = float(time.time() - start)
# Each placeholder refers to its own argument: correct, attempted, elapsed seconds.
print("solved {0} out of {1} analogies in {2:.4f} seconds".format(correct, attempted, diff))

In [15]:
# Number of analogies currently bound to the notebook-level `analogies` list.
print(len(analogies))

2401


In [None]:
# Scratch check: vocabulary index of the first b word of the first analogy
# (prints None if that word is not in word_to_index).
print(word_to_index.get(analogies[0].b_set[0]))

In [None]:
# Scratch check: the first b word of the first analogy.
print(analogies[0].b_set[0])

In [None]:
# Vocabulary indices of all b words of the first analogy (None for unknown words).
# The lookup goes through word_to_index because b_set holds words;
# index_to_word is keyed by integer row indices and would return None for every word.
test = [word_to_index.get(b) for b in analogies[0].b_set]

In [None]:
# Scratch check on the fifth analogy.
# NOTE(review): b_set[0] is a word (a string) when the analogies come from
# construct_analogies, so b_set[0][0] is its first character - presumably a
# leftover from an earlier data layout; confirm whether b_set[0] was intended.
print(word_to_index.get(analogies[4].b_set[0][0]))

In [None]:
# Scratch cell: try out str.format with a positional integer and a float
# rendered with four decimal places.
var1 = int(10)
var2 = 0.003
print("test{0} solved {1:.4f}".format(var1, var2))

In [None]:
# Scratch cell: splitting on "/" is how the alternatives of a BATS pair are separated.
string = "ab/cdef/fdfg"
# NOTE: this rebinds the notebook-level name `test`.
test = string.split("/")
print(test)

In [None]:
# Hand-written sample records with the same keys that record_analogy_tests
# produces, used below to try out JSON serialization.
results = []
results.append({ 'Model': 'glove.6B.50d', 'Test': 'L04 [meronyms - substance]', 'Total Correct': 10, 'Total Attempted': 100})
results.append({ 'Model': 'glove.6B.50d', 'Test': 'L05 [meronyms - substance]', 'Total Correct': 10, 'Total Attempted': 100})



In [None]:
# Write the records to disk as JSON. The with-block closes the file when it
# exits, so no explicit close() is needed afterwards.
with open('test.txt', 'w') as json_file:
    json.dump(results, json_file)

In [None]:
# Read the records back. The with-block closes the file when it exits, so no
# explicit close() is needed afterwards.
with open('test.txt', 'r') as json_file:
    test_open = json.load(json_file)

In [None]:
# Show what was read back from test.txt.
# NOTE(review): the saved output of this cell ([1, 2, 3]) does not match the
# records written by the cells above, so it appears to stem from an earlier
# run - re-run the notebook top to bottom to refresh it.
print(test_open)

[1, 2, 3]
