# lexsub: default program

In [1]:
from default import *
import os

## Run the default solution on dev

In [2]:
# Baseline run: load the stock GloVe vectors and collect one line of guesses
# per dev example.
lexsub = LexSub(os.path.join('data', 'glove.6B.100d.magnitude'))
dev_path = os.path.join('data', 'input', 'dev.txt')
output = []
with open(dev_path) as dev_file:
    for raw_line in dev_file:
        # Each record is "<target index>\t<space-separated sentence>".
        columns = raw_line.strip().split('\t')
        target_index = int(columns[0].strip())
        tokens = columns[1].strip().split()
        guesses = lexsub.substitutes(target_index, tokens)
        output.append(" ".join(guesses))
# Preview only the first ten prediction lines.
print("\n".join(output[:10]))

sides edge bottom front club line both back place corner
sides edge bottom front club line both back place corner
sides edge bottom front club line both back place corner
sides edge bottom front club line both back place corner
sides edge bottom front club line both back place corner
sides edge bottom front club line both back place corner
sides edge bottom front club line both back place corner
sides edge bottom front club line both back place corner
sides edge bottom front club line both back place corner
sides edge bottom front club line both back place corner


## Evaluate the default output

In [3]:
from lexsub_check import precision
# Read the reference file for the dev set, one stripped line per example,
# then report the precision of `output` against it as a percentage.
reference_path = os.path.join('data', 'reference', 'dev.out')
with open(reference_path, 'rt') as refh:
    reference_lines = refh.read().splitlines()
ref_data = [str(entry).strip() for entry in reference_lines]
score = 100 * precision(ref_data, output)
print("Score={:.2f}".format(score))

Score=27.89


## Documentation

#### Write some beautiful documentation of your program here.

In [None]:
class LexSub:
    """Suggest lexical substitutes from nearest neighbours in a word-vector space."""

    def __init__(self, wvec_file, topn=10):
        """Open the vectors stored at ``wvec_file`` and remember the guess count.

        Args:
            wvec_file: path handed to ``pymagnitude.Magnitude``.
            topn: number of substitutes returned by :meth:`substitutes`.
        """
        self.wvecs = pymagnitude.Magnitude(wvec_file)
        self.topn = topn

    def substitutes(self, index, sentence):
        """Return ``self.topn`` guesses for replacing the word at ``sentence[index]``.

        Only the target word itself is looked up; the rest of ``sentence``
        is not consulted.

        Args:
            index: position of the target word inside ``sentence``.
            sentence: indexable sequence of tokens.

        Returns:
            A list holding the first element of every entry returned by
            ``self.wvecs.most_similar`` (presumably ``(word, score)`` pairs,
            so this is the list of neighbour words).
        """
        target = sentence[index]
        neighbours = self.wvecs.most_similar(target, topn=self.topn)
        return [entry[0] for entry in neighbours]

def retrofit(wvecs, lexicon, iteration=10, alpha=0.8):
    """Retrofit word vectors towards their lexicon neighbours.

    For every word that has both a vector and a lexicon entry, the vector is
    repeatedly replaced by

        (n * original[word] + sum((cos(new[j], original[word]) + alpha) * new[j])) / (2 * n)

    where the sum runs over the ``n`` lexicon neighbours ``j`` of the word
    that also have a vector. Updates are applied in place within a sweep, so
    words visited later in the same sweep already see updated neighbours.

    Args:
        wvecs: mapping from word to vector. It is not modified. Vectors must
            support scalar multiplication and in-place addition (the caller
            in this file passes numpy arrays).
        lexicon: mapping from word to an iterable of neighbour words.
        iteration: number of sweeps over the shared vocabulary.
        alpha: constant added to the cosine similarity to form each
            neighbour's weight. Defaults to 0.8, the value that used to be
            hard-coded.

    Returns:
        A new mapping with the retrofitted vectors. Words without a usable
        lexicon entry keep a deep copy of their original vector.
    """
    # Work on a copy: the untouched originals are the anchor term of every update.
    new_wvecs = deepcopy(wvecs)

    vocab = set(new_wvecs.keys())

    # Only words present in both the lexicon and the embedding can be updated.
    loop_vocab = vocab.intersection(set(lexicon.keys()))

    # The usable neighbour sets never change between sweeps, so build them
    # once and drop the words that have no neighbour with a vector.
    neighbours_of = {}
    for word in loop_vocab:
        word_neighbours = set(lexicon[word]).intersection(vocab)
        if word_neighbours:
            neighbours_of[word] = word_neighbours

    for _ in range(iteration):
        for word, word_neighbours in neighbours_of.items():
            num_neighbours = len(word_neighbours)
            original_vec = wvecs[word]

            # Anchor term: the original vector weighted by the neighbour count.
            new_vec = num_neighbours * original_vec
            for pp_word in word_neighbours:
                neighbour_vec = new_wvecs[pp_word]
                # Neighbours that point the same way as the original vector
                # pull harder; alpha sets the baseline weight.
                sim = calculate_cosine_sim(neighbour_vec, original_vec)
                new_vec += (sim + alpha) * neighbour_vec
            new_wvecs[word] = new_vec / (2 * num_neighbours)
    return new_wvecs

# Helper function
def calculate_cosine_sim(vect1, vect2):
    """Return the cosine similarity of ``vect1`` and ``vect2``.

    Args:
        vect1: first vector (anything ``dot`` and ``norm`` accept).
        vect2: second vector of the same length.

    Returns:
        ``dot(vect1, vect2) / (norm(vect1) * norm(vect2))``, or ``0.0`` when
        either vector has zero length.
    """
    denominator = norm(vect1) * norm(vect2)
    if denominator == 0:
        # A zero vector has no direction, and 0/0 would yield NaN. retrofit()
        # uses this value as a weight, so a NaN would spread into the updated
        # vector; treat the pair as having no similarity instead.
        return 0.0
    return dot(vect1, vect2) / denominator


def load_Glove_to_dict(file_glove):
    """Load a GloVe text file into a dictionary (key=word, value=vector).

    Each non-blank line is split on whitespace: the first token is the word
    and the remaining tokens are parsed as floats into a numpy array.

    Args:
        file_glove: path of the UTF-8 encoded GloVe ``.txt`` file.

    Returns:
        dict mapping each word to its numpy vector. If a word appears more
        than once, the last occurrence wins.

    Raises:
        ValueError: if a component after the word cannot be parsed as float.
    """
    print("Start loading Glove Model from Stanford Glove.txt")

    glove_dict = {}
    # The context manager closes the handle even if parsing fails midway.
    with open(file_glove, 'r', encoding='utf-8') as glove_file:
        for line in tqdm.tqdm(glove_file):
            split_lines = line.split()
            if not split_lines:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            word = split_lines[0]
            word_embedding = np.array([float(value) for value in split_lines[1:]])
            glove_dict[word] = word_embedding

    print(len(glove_dict)," words of Glove loaded successful!")
    return glove_dict

if __name__ == '__main__':
    # NOTE(review): every default path below lives under one developer's home
    # directory; on any other machine pass -i / -w / -L / -r explicitly.
    project_dir = os.path.join('/Users', 'wusiyu', 'Desktop', 'nlp-class-hw', 'lexsub')

    optparser = optparse.OptionParser()
    optparser.add_option("-i", "--inputfile", dest="input",
                         default=os.path.join(project_dir, 'data', 'input', 'dev.txt'),
                         help="input file with target word in context")
    optparser.add_option("-w", "--wordvecfile", dest="wordvecfile",
                         default=os.path.join(project_dir, 'answer', 'data', 'glove.6B.100d.txt'),
                         help="word vectors file")
    optparser.add_option("-n", "--topn", dest="topn", default=10, help="produce these many guesses")
    optparser.add_option("-l", "--logfile", dest="logfile", default=None, help="log file for debugging")
    optparser.add_option("-L", "--lexiconfile", dest="lexicon",
                         default=os.path.join(project_dir, 'data', 'lexicons', 'wordnet-synonyms.txt'),
                         help="lexicon file")
    optparser.add_option("-r", "--retrofitted_vecfile", dest="retrofitted_vecfile",
                         default=os.path.join(project_dir, 'data', 'retrofitted.glove.magnitude'),
                         help="load retrofited embedding")

    (opts, _) = optparser.parse_args()

    if opts.logfile is not None:
        logging.basicConfig(filename=opts.logfile, filemode='w', level=logging.DEBUG)

    # Read the lexicon (read_lexicon is defined elsewhere in this module).
    lexicon = read_lexicon(opts.lexicon)

    # Two-phase workflow: when the retrofitted .magnitude file is missing,
    # retrofit the GloVe vectors and save them as text for a manual
    # conversion step; otherwise load it and produce the substitutes.
    if not os.path.isfile(opts.retrofitted_vecfile):

        print("\nRetrofitted embedding file does not exist. Let's retrofit !\n")

        glove_dict = load_Glove_to_dict(opts.wordvecfile)
        retrofitted_vec = retrofit(glove_dict, lexicon, iteration=10)

        # The text output location is fixed and relative to the working directory.
        file_loc = 'data/retrofitted_glove.txt'
        save_embedding(retrofitted_vec, file_loc)

        print('\nSuccessfully retrofitting embedding! and save to {}.'.format(file_loc))
        print('-'*50)
        print("PLEASE run pymagnitude.converter to convert .txt file to .magnitude file")
        print('-'*50)

    else:
        # optparse hands topn back as a string when given on the command
        # line, hence the int() conversion.
        lexsub = LexSub(opts.retrofitted_vecfile, int(opts.topn))

        # utf-8-sig strips a leading BOM, which would otherwise break int()
        # on the first line's index field.
        with open(opts.input, encoding='utf-8-sig') as f:
            for line in f:
                fields = line.strip().split('\t')
                print(" ".join(lexsub.substitutes(int(fields[0].strip()), fields[1].strip().split())))


## Procedures to run this code
#### 1. We need the Glove.txt file, which can be downloaded from the Stanford website; alternatively, we can convert Glove.magnitude to a .txt file.
#### 2. First, the program checks whether a retrofitted Glove embedding already exists.
   
####   If not, the program will:
  
   2.1) load up Glove.txt, 
   
   2.2) retrofit those embedding vectors (we can specify the number of iterations, embedding vectors other than Glove, and lexicon files other than the defaults), and 
   
   2.3) save into new file (retrofitted_glove.txt).
    AND the program will EXIT.

#### 3. If we don't have Retrofitted embedding in the format of Pymagnitude yet (since we have retrofitted embedding in form .txt from procedure 2):

     We can convert it using : 
     
     python3 -m pymagnitude.converter -i data/retrofitted_glove.txt -o data/retrofitted.glove.magnitude
     
#### 4. Rerun this code again, and the program will use our retrofitted embedding to operate on our input text file. (dev/test.txt)

The program loads the embedding vectors into a dictionary whose keys are the unique words and whose values are the corresponding vectors.

## Run our solution on the dev.txt

In [3]:
from lexsub import *
import os

# lexsub = LexSub(os.path.join('/Users', 'wusiyu', 'Desktop', 'nlp-class-hw', 'lexsub', 'data',
#                                               'retrofitted.glove.magnitude'))
# output = []
# with open(os.path.join('/Users', 'wusiyu', 'Desktop', 'nlp-class-hw', 'lexsub', 'data', 'input',
#                                               'dev.txt')) as f:
# lexsub = LexSub(os.path.join('/d','Coding','SFU_CA','CMPT-825','nlpclass-1207-g-oasis','hw2','lexsub','data','retrofitted_glove.magnitude'))

# Re-run the dev set, this time with the retrofitted vectors.
lexsub = LexSub('data/retrofitted_glove.magnitude')

output = []
with open('data/input/dev.txt') as dev_file:
    for record in dev_file:
        # Tab-separated record: target index first, then the sentence.
        parts = record.strip().split('\t')
        position, context = int(parts[0].strip()), parts[1].strip().split()
        output.append(" ".join(lexsub.substitutes(position, context)))
# Show just the first ten prediction lines.
print("\n".join(output[:10]))

english edge position line place point way front while face
english edge position line place point way front while face
english edge position line place point way front while face
english edge position line place point way front while face
english edge position line place point way front while face
english edge position line place point way front while face
english edge position line place point way front while face
english edge position line place point way front while face
english edge position line place point way front while face
english edge position line place point way front while face


## Evaluate our output

In [4]:
from lexsub_check import precision
# with open(os.path.join('/Users', 'wusiyu', 'Desktop', 'nlp-class-hw', 'lexsub', 'data', 'reference','dev.out'), 'rt') as refh:
# Load the dev reference lines and print the precision of `output` as a percentage.
with open('data/reference/dev.out', 'rt') as refh:
    reference_rows = refh.read().splitlines()
ref_data = [str(row).strip() for row in reference_rows]
percent = 100 * precision(ref_data, output)
print("Score={:.2f}".format(percent))

True positive:  842.0  False positive: 861.0
Score=49.44


## Analysis

#### Do some analysis of the results. What ideas did you try? What worked and what did not?

    
   The most important thing we did in this task was to implement the retrofitting function and then modify it to improve overall performance. After applying the retrofitting function with wordnet-synonyms.txt and 10 iterations, the score of our new code on dev.txt reached around 46.10. 
    
   To improve the performance of the original retrofitting function, we modified it by weighting each lexicon synonym of the target word with the cosine similarity between that synonym's vector and the target word's vector. 

   We selected the cosine similarity between two vectors as the criterion in our model. The calculated values can also be adjusted by an arbitrary weight (ALPHA), which is a hyperparameter. We found that ALPHA = 0.8 achieves a score of 49.44 with the same number of iterations and the same lexicon file.



In [1]:
def retrofit(wvecs, lexicon, iteration=10, alpha=0.8):
    """Retrofit word vectors towards their lexicon neighbours.

    For every word that has both a vector and a lexicon entry, the vector is
    repeatedly replaced by

        (n * original[word] + sum((cos(new[j], original[word]) + alpha) * new[j])) / (2 * n)

    where the sum runs over the ``n`` lexicon neighbours ``j`` of the word
    that also have a vector. Updates are applied in place within a sweep, so
    words visited later in the same sweep already see updated neighbours.

    Args:
        wvecs: mapping from word to vector. It is not modified. Vectors must
            support scalar multiplication and in-place addition
            (presumably numpy arrays).
        lexicon: mapping from word to an iterable of neighbour words.
        iteration: number of sweeps over the shared vocabulary.
        alpha: constant added to the cosine similarity to form each
            neighbour's weight. Defaults to 0.8, the value that used to be
            hard-coded.

    Returns:
        A new mapping with the retrofitted vectors. Words without a usable
        lexicon entry keep a deep copy of their original vector.
    """
    # Work on a copy: the untouched originals are the anchor term of every update.
    new_wvecs = deepcopy(wvecs)

    vocab = set(new_wvecs.keys())

    # Only words present in both the lexicon and the embedding can be updated.
    loop_vocab = vocab.intersection(set(lexicon.keys()))

    # The usable neighbour sets never change between sweeps, so build them
    # once and drop the words that have no neighbour with a vector.
    neighbours_of = {}
    for word in loop_vocab:
        word_neighbours = set(lexicon[word]).intersection(vocab)
        if word_neighbours:
            neighbours_of[word] = word_neighbours

    for _ in range(iteration):
        for word, word_neighbours in neighbours_of.items():
            num_neighbours = len(word_neighbours)
            original_vec = wvecs[word]

            # Anchor term: the original vector weighted by the neighbour count.
            new_vec = num_neighbours * original_vec
            for pp_word in word_neighbours:
                neighbour_vec = new_wvecs[pp_word]
                # Neighbours that point the same way as the original vector
                # pull harder; alpha sets the baseline weight.
                sim = calculate_cosine_sim(neighbour_vec, original_vec)
                new_vec += (sim + alpha) * neighbour_vec
            new_wvecs[word] = new_vec / (2 * num_neighbours)
    return new_wvecs

We also tried to take context words into account by applying a window of size 5 around the target word and using models such as AddBal to generate new vectors. However, this turned out to have a negative impact on our results and added substantial training time; therefore, we stuck to our simple approach. For future work, different approaches to selecting context words and to setting hyperparameters such as the window size could affect the results.