In [1]:
import numpy as np
import os
import time
import json

In [2]:
'''A BATS_Analogy is an analogy from the BATS test set. It may allow
multiple correct answers! While most analogies are of the form
a:b::c:d (read as "a is to be as c is to d") a BATS Analogy is of the form
a:b1,b2,...,bn::c:d,d1,d2,...,dn (read as "a is to b1 or b2 or ... or bn as 
c is to d1 or d2 or ... or dn") an example would be:
"snake is to nest/pit/acquarium as tiger is to den/cage."'''
class BATS_Analogy:
    def __init__(self, a, b_set, c, d_set):
        self.a = a
        self.b_set = b_set
        self.c = c
        self.solution_set = d_set
        

In [3]:
'''takes two matrices A and B and returns 
A@B. If A and B are normalized then this is equivalent
to calculating the cosine similarity of each row of A with
each column of B. i.e. the cosine similarity of the ith row of A
with the jth column of B is given by (A@B)[i,j]'''
def cosine_similarity_normalized(A, B):
    return A@B

In [4]:
'''This function takes a list of analogies, a numpy array of word embeddings, 2 dictionaries
which are used to get the embedding matrix index corresponding to a specific word and vice versa,
and the last argument sim is a similarity function which takes two numpy arrays as arguments and
calculates a "similarity matrix" whose i,j, component corresponds to the similarity of the ith row of the first matrix
with the jth column of the second matrix. An example of a sim function is cosine similarity.
The function returns the number of correctly answered analogies over the entire batch.'''
def calculate_batch_accuracy(analogies, embeddings, words_to_indices, indices_to_words, sim):
    num_correct = 0
    analogy_predictions_indices = dict()
    start = 0
    all_prediction_matrices = []
    num_predictions = 0
    #loop figures out how large matrix needs to be and which indices will correspond to which vectors
    for analogy in analogies:
        num_predictions += len(analogy.b_set)
        end = start + len(analogy.b_set)
        analogy_predictions_indices[analogy] = (start,end)
        start = end
        
    all_predictions = np.empty((num_predictions, embeddings.shape[1]), dtype=np.float32)
    #after allocating space for matrix, loop is used to fill in values needed to make predictions
    for analogy in analogies:
        offset = analogy_predictions_indices.get(analogy)[0]
        for i in range(analogy_predictions_indices.get(analogy)[1] - offset):
            all_predictions[i + offset] = embeddings[words_to_indices.get(analogy.b_set[i])]
        start = analogy_predictions_indices.get(analogy)[0]
        end = analogy_predictions_indices.get(analogy)[1]
        all_predictions[range(start, end)] += embeddings[words_to_indices.get(analogy.c)]
        all_predictions[range(start, end)] -= embeddings[words_to_indices.get(analogy.a)]
     
    #all_distances calculates the value of sim(prediction, word) for all predictions made and words in vocabulary
    all_distances = sim(embeddings, all_predictions.T)
    
    #next nested loop sets values for a,b,c for each analogy to negative infinity. This prevents model from answering
    #the analogy of the form a:b::c:d with any of a,b, or c. 
    for analogy in analogies:
        start_index = analogy_predictions_indices.get(analogy)[0]
        a_ind = words_to_indices.get(analogy.a)
        b_indices = [words_to_indices.get(b) for b in analogy.b_set]
        c_ind = words_to_indices.get(analogy.c)
        for i in range(len(b_indices)):
            all_distances[a_ind,  start_index + i] = np.NINF 
            all_distances[b_indices[i], start_index + i] = np.NINF
            all_distances[c_ind, start_index + i] = np.NINF
    
    #prediction_indices has index of each prediction made by the model. Used in last loop to find number answered correctly.
    prediction_indices = np.argmax(all_distances, axis=0)
    for analogy in analogies:
        start_ind = analogy_predictions_indices[analogy][0]
        end_ind = analogy_predictions_indices[analogy][1]
        for i in range(start_ind, end_ind):
            if indices_to_words.get(prediction_indices[i]) in analogy.solution_set:
                num_correct += 1
                break
    return num_correct
    

In [5]:

'''construct_analogies takes a list of lines from a BATS file and constructs the corresponding analogy lines.
Each of the original line has a pair, e.g. "cat cats" or "dog dogs" and the corresponding analogy would be 
"cat is to cats as dog is to dogs" '''
def construct_analogies(lines):
    analogies = []
    for i in range(len(lines)):
        for j in range(len(lines)):
            if i == j:
                pass 
            else:
                split_first_line = lines[i].split()
                a = split_first_line[0]
                b_set = split_first_line[1].split("/")
                split_second_line = lines[j].split()
                c = split_second_line[0]
                solution_set = split_second_line[1].split("/")
                analogies.append(BATS_Analogy(a, b_set, c, solution_set))
    return analogies

In [6]:
#given a filepath get_analogies returns a list of BATS_Analogy's 
def get_analogies(filepath):
    file = open(filepath, 'r')
    lines = file.readlines()
    analogies = construct_analogies(lines)
    return analogies

In [7]:
'''load_vectors takes a filepath to a file containing word vectors as a .txt file
of the format word [vector components] e.g. for a 4dimensional vector one line could be
"the 0.004 -10.499 0.000 \n"
the function returns a numpy array of size vxd where v is the size of the vocabulary and 
d is the length of each vector. The function also returns two dictionaries, one of which
has words as keys and indices as values and the other has indices as keys and words as vectors.
'''
def load_vectors(filepath, normalize=1):
    num_failed = 0
    word_to_index = dict()
    index_to_word = dict()
    file = open(filepath, 'r', encoding='utf-8')
    lines = file.readlines()
    embeddings = np.ones((len(lines), len(lines[0].split())-1), dtype=np.float32)
    for i in range(len(lines)):
        try:
            vec_info = lines[i].split()
            word = vec_info[0]
            vec = np.array(vec_info[1:], dtype=np.float32)
            #vec_nums = [np.float32(component) for component in vec_info[1:]]
           # vec = np.array(vec_nums)
            if normalize:
                #normalize vectors
                vec = vec/(np.linalg.norm(vec))
            embeddings[i] = vec
            word_to_index[word] = i 
            index_to_word[i] = word
        except:
            num_failed += 1
            print("ERROR with word ", word)
    print("Successfully loaded ", 100* (len(lines) - num_failed)/float(len(lines)), "% of vectors in file: ", filepath)
    return embeddings, word_to_index, index_to_word
    

In [8]:
#analogies which require vocabulary which the model doesn't know are removed. 
def remove_unsolvable(analogies, word_to_index):
    solvable_analogies = []
    for analogy in analogies:
        a_ind = word_to_index.get(analogy.a)
        c_ind = word_to_index.get(analogy.c)
        if a_ind == None or c_ind == None:
            pass
        else:
            b_set_known = []
            b_all_null = True
            for b in analogy.b_set:
                if word_to_index.get(b) != None:
                    b_all_null = False
                    b_set_known.append(b)
                else:
                    continue
            if b_all_null:
                pass
            else:
                solvable_analogies.append(BATS_Analogy(analogy.a, b_set_known, analogy.c, analogy.solution_set))
    return solvable_analogies

In [9]:
#vector_path = "./Pretrained_Vectors/glove.6B/glove.6B.50d/glove.6B.50d.txt"
#embeddings, word_to_index, index_to_word = load_vectors(vector_path)

In [10]:
#bats_path = "./BATS_3.0/BATS_3.0/4_Lexicographic_semantics/L03 [hyponyms - misc].txt"
#analogies = get_analogies(bats_path)

In [11]:
#analogies = remove_unsolvable(analogies, word_to_index)

In [12]:
'''This function takes a list of analogies, an embeddings array, 2 dictionaries used for translating between words
and indexes and vice versa, a similarity function used to pick the best prediction, and a batch_size which defines how
many analogies to attempt at a time. It returns the number of correctly solved analogies and the number of analogies attempted.'''
def compute_accuracy_on_analogies(analogies, embeddings, word_to_index, index_to_word, sim, batch_size=100):
    i = 0
    total_correct = 0
    while i < len(analogies):
        end_ind = min(i+batch_size, len(analogies))
        batch = analogies[i:end_ind]
        total_correct += calculate_batch_accuracy(batch, embeddings, word_to_index, index_to_word, sim)
        print("Completed batch: ", i, "through ", end_ind )
        i += batch_size
    return total_correct, len(analogies)

In [13]:
'''This function takes a list of filepaths to test files, a model name and the corresponding
embeddings matrix, 2 dictionaries used to translate between words and indexes, and a similarity function used to pick 
optimal predictions.'''
def record_analogy_tests(test_filepaths, model_name, embeddings, word_to_index, index_to_word, sim):
    results = []
    for filepath in test_filepaths:
        temp = filepath.split("/")
        filename = temp[-1]
        recorded_file = False
        batch_size = 4096
        analogies = get_analogies(filepath)
        analogies = remove_unsolvable(analogies, word_to_index)
        while not recorded_file:
            #try except loop enables us to solve as many tests as possible and ignore ones which fail
            try:
                test_correct, test_attempted = compute_accuracy_on_analogies(analogies, embeddings, word_to_index, index_to_word, sim, batch_size)
                results.append({'Model': model_name, 'Test': filename, 'Total Correct': test_correct, 'Total Attempted': test_attempted})
                recorded_file = True
                print("Successfully ran analogies from", filename, "\n")
            except:
                if batch_size == 1:
                    print("Failed to Solve Analogies with batch_size = 1 on file", filename)
                    break
                print("\nFailed to solve with batch_size = ", batch_size, "\n")
                batch_size = batch_size/2
    return results

In [14]:
'''function takes a filepath to a root directory and returns a list of filepaths to all files in the root
directory and subdirectories.'''
def get_list_of_filepaths(root_directory):
    file_paths = []
    for root, dirs, files in os.walk(root_directory):
        for file in files:
            full_path = os.path.join(root, file)
            file_paths.append(full_path)
    return file_paths

In [15]:
'''function takes a filepath to the file which stores the word embedding matrix, a list of filepaths to the test files,
a similarity function, a model name, and a path to an output directory in which to save the results. Note that the 
output directory must be created before this function is called. Otherwise it will not write the experimental results
to a file, it will only return them as a dictionary. When it does successfully write to a file it uses JSON format.'''
def run_one_model(vector_path, test_paths, sim, model_name, output_dir):
    print("Attempting to load embeddings from ", vector_path)
    embeddings, word_to_index, index_to_word = load_vectors(vector_path)
    results = record_analogy_tests(test_paths, model_name, embeddings, word_to_index, index_to_word, sim)
    output_loc = output_dir + model_name
    try:
        with open(output_loc, 'w') as json_file:
            json.dump(results, json_file)
        json_file.close()
        return True
    except:
        return results

In [16]:
'''function accepts a filepath to a root directory in which all the word embedding files are stored (can also be stored
in subdirectories), a filepath to a root directory which holds all the test files (again tests can be stored in subdirectories),
a similarity function used to choose predictions, and a path to an output directory in which to save the results. Note the 
output directory must be created before it is written to. If output file does not exist then no files are written and the 
function returns a list of dictionaries which contain the results. Dictionaries are used because they can be easily written
to JSON files (which is the format used throughout this project to store experimental results.)
'''
def run_all_models(vector_root_dir, test_root_dir, sim, output_dir):
    test_paths = get_list_of_filepaths(test_root_dir)[1:]
    vector_paths = get_list_of_filepaths(vector_root_dir)[5:]   #temp hardcode,cuz alreaady did 5 experiment
    failed_to_test = []
    results = []
    for vector_path in vector_paths:
        print("Current Model at", vector_path, "\n")
        temp = vector_path.split('\\')
        model_name = temp[-1]
        try:
            vec_results = run_one_model(vector_path, test_paths, sim, model_name, output_dir)
            if vec_results != True:
                results.append(vec_results)
                print("Did not save results, but did append to results list.\n")
            else:
                print("Successfully saved results for ", model_name, "to directory ", output_dir, "\n")
        except:
            print("\nFAILED TO TEST EMBEDDINGS FROM", vector_path, "\n")
    return results

In [17]:
#Define path to BATS test files
BATS_loc = "./BATS_3.0/BATS_3.0"


In [18]:
#define path to Vector files
vector_loc = "./Pretrained_Vectors"

In [19]:
#define path to output location. Directory must exist already in order to be written to!!!
output_location = "./Experimental_Results/"

In [None]:
#run all experiments and store results (WARNING THIS MAY TAKE SEVERAL DAYS TO FINISH)
results_all = run_all_models(vector_loc, BATS_loc, cosine_similarity_normalized, output_location)

Current Model at ./Pretrained_Vectors\glove.840B.300d\glove.840B.300d.txt 

Attempting to load embeddings from  ./Pretrained_Vectors\glove.840B.300d\glove.840B.300d.txt
ERROR with word  .
ERROR with word  at
ERROR with word  0.20785
ERROR with word  .
ERROR with word  to
ERROR with word  .
ERROR with word  .
ERROR with word  email
ERROR with word  0.39511
ERROR with word  or
ERROR with word  0.13211
ERROR with word  contact
ERROR with word  -0.38024
ERROR with word  -0.0033421
ERROR with word  Email
ERROR with word  on
ERROR with word  0.14608
ERROR with word  -0.36288
ERROR with word  At
ERROR with word  by
ERROR with word  in
ERROR with word  0.5478
ERROR with word  emailing
ERROR with word  Contact
ERROR with word  0.59759
ERROR with word  at
ERROR with word  •
ERROR with word  at
ERROR with word  is
Successfully loaded  99.99867942734505 % of vectors in file:  ./Pretrained_Vectors\glove.840B.300d\glove.840B.300d.txt
