In [None]:
import numpy as np
import pandas as pd
import gc
from gensim.models import Word2Vec
import time
import json

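- How do I calculate accuracy for the word similarity task?
- A method commonly mentioned in forums is:
$$
Accuracy = \frac{\text{Number of correct predictions}}{\text{Total number of predictions}}
$$
- Here a prediction counts as correct when the cosine similarity of the word pair is above the chosen threshold.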

In [None]:
def load_glove_model(file_path,embedding_size=50):
    """
    Load GloVe vectors from a whitespace-separated text file into a dict.

    Every usable line holds a word followed by its vector components. The
    last `embedding_size` tokens of the line are parsed as float64 and stored
    under the first token of the line.

    Blank or malformed lines (too few tokens, non-numeric components) are
    skipped individually instead of aborting the whole load, so one bad line
    cannot silently truncate the vocabulary.

    Parameters
    ----------
    file_path : str
        Path of the GloVe text file. It is read as UTF-8 and undecodable
        bytes are ignored.
    embedding_size : int, default 50
        Number of trailing tokens on each line that make up the vector.

    Returns
    -------
    dict
        Maps each word to a float64 ``np.ndarray`` with `embedding_size` values.
    """
    glove_model = {}
    skipped_lines = 0
    with open(file_path,'r',encoding="utf-8",errors='ignore') as f:
        for line_number,line in enumerate(f):
            split_line = line.split()
            # A valid line needs the word itself plus `embedding_size` numbers;
            # anything shorter (including blank lines) cannot be parsed safely.
            if len(split_line) < embedding_size + 1:
                skipped_lines += 1
                continue
            try:
                embedding = np.array(split_line[-embedding_size:], dtype=np.float64)
            except ValueError as e:
                # Non-numeric component: report it and carry on with the next line.
                print(e)
                print(line,line_number)
                skipped_lines += 1
                continue
            glove_model[split_line[0]] = embedding
    if skipped_lines:
        print(f"skipped {skipped_lines} blank or malformed lines.")
    print(f"total {len(glove_model)} words loaded from glove model, each of size {embedding_size}.")
    return glove_model

In [None]:
def get_accuracy(predict_values,thershold):
    """
    Return the fraction of similarity scores that are strictly above a threshold.

    Parameters
    ----------
    predict_values : sequence of float
        Predicted similarity scores, one per word pair.
    thershold : float
        Cut-off; a score counts as a correct prediction only if it is
        strictly greater than this value.

    Returns
    -------
    float
        correct predictions / total predictions, or 0.0 when
        `predict_values` is empty (avoids a ZeroDivisionError).
    """
    total_predictions=len(predict_values)
    if total_predictions==0:
        return 0.0
    correct_predictions=sum(1 for prediction_score in predict_values if prediction_score>thershold)
    return correct_predictions/total_predictions

In [None]:
def cosine_similarity(a,b):
    """
    Return the cosine similarity of two 1-D vectors.

    Computed as dot(a, b) / (||a|| * ||b||). If either vector has zero norm
    the similarity is undefined; 0.0 is returned in that case instead of
    dividing by zero and producing NaN with a RuntimeWarning.
    """
    norm_product=np.linalg.norm(a)*np.linalg.norm(b)
    if norm_product==0:
        return 0.0
    return np.dot(a,b)/norm_product

In [None]:
"""
NOTE: these paths need to match exactly. Follow these steps to ensure that:
create a 'data' dir and change into it
download the zipped pre-trained embeddings
unzip them in the data dir
"""
'''
NOTE: as instructed in class, results are reported only for the 50- and 100-dimensional embeddings
'''
# Embedding files to evaluate, in reporting order: cbow, sg, glove and fasttext
# for each embedding size.
MODEL_PATHS=[
    'data/hi/50/cbow/hi-d50-m2-cbow.model',
    'data/hi/50/sg/hi-d50-m2-sg.model',
    'data/hi/50/glove/hi-d50-glove.txt',
    'data/hi/50/fasttext/hi-d50-m2-fasttext.model',
    'data/hi/100/cbow/hi-d100-m2-cbow.model',
    'data/hi/100/sg/hi-d100-m2-sg.model',
    'data/hi/100/glove/hi-d100-glove.txt',
    'data/hi/100/fasttext/hi-d100-m2-fasttext.model',
]
# Currently excluded 200- and 300-dimensional models:
#,'data/hi/200/cbow/hi-d200-m2-cbow.model','data/hi/200/sg/hi-d200-m2-sg.model','data/hi/200/glove/hi-d200-glove.txt','data/hi/200/fasttext/hi-d200-m2-fasttext.model','data/hi/300/cbow/hi-d300-m2-cbow.model','data/hi/300/sg/hi-d300-m2-sg.model','data/hi/300/glove/hi-d300-glove.txt','data/hi/300/fasttext/hi-d300-m2-fasttext.model']

# Result table: the thresholds tried, plus one list of accuracies per model
# added under the model's file name.
accuracy_data=dict(thershold=[0.4,0.5,0.6,0.7,0.8])
def main():
    """
    Score every model in MODEL_PATHS on the Hindi word-similarity pairs.

    Steps:
      1. Read 'data/iiith_wordsim/hindi.txt' (tab-separated; the first two
         fields of each row are the word pair).
      2. For each model path, load the embeddings: paths containing 'glove'
         go through load_glove_model, everything else through Word2Vec.load.
      3. Compute the cosine similarity of every word pair.
      4. Store one accuracy per threshold in accuracy_data[<model file name>].

    Word pairs for which the embedding lookup raises KeyError (word not in
    the vocabulary) get a similarity of 0.0, so the number of predictions
    stays the same for every model, and the count of such pairs is printed.

    Raises
    ------
    ValueError
        If the word-similarity file contains no usable word pairs.
    """
    # data file path has to be the path to the similar word file.
    # Explicit UTF-8: the file holds Hindi text and must not depend on the
    # platform's default encoding.
    with open('data/iiith_wordsim/hindi.txt','r',encoding='utf-8') as file:
        word_pairs=[]
        for line in file:
            fields=line.strip().split('\t')
            # drop empty lines and rows that do not contain two words
            if len(fields)>=2:
                word_pairs.append(fields)
    if not word_pairs:
        raise ValueError("no word pairs found in 'data/iiith_wordsim/hindi.txt'")

    for model_path in MODEL_PATHS:
        model_name=model_path.split("/")[-1]
        print(f'working on -- {model_name}')
        if 'glove' in model_path:
            # the embedding size is the third path component,
            # e.g. 'data/hi/50/glove/...' -> 50 (requires the documented layout)
            embedding_size=int(model_path.split('/')[2])
            word_embeddings=load_glove_model(model_path,embedding_size)
        else:
            word_embeddings=Word2Vec.load(model_path).wv

        # Scores are kept in a per-model list so the shared word-pair rows
        # are never mutated between models.
        predict_values=[]
        missing_pairs=0
        for fields in word_pairs:
            try:
                first_vector=word_embeddings[fields[0]]
                second_vector=word_embeddings[fields[1]]
            except KeyError:
                # out-of-vocabulary word: count the pair as "not similar"
                missing_pairs+=1
                predict_values.append(0.0)
                continue
            predict_values.append(cosine_similarity(first_vector,second_vector))
        if missing_pairs:
            print(f'{missing_pairs} of {len(word_pairs)} word pairs had an out-of-vocabulary word')

        accuracy_data[model_name]=[
            get_accuracy(predict_values,thers) for thers in accuracy_data['thershold']
        ]

        # release the embeddings before loading the next model
        del word_embeddings
        gc.collect()

In [None]:
# Run the full evaluation and report the elapsed wall-clock time,
# rounded to the nearest whole second.
tic=time.time()
main()
toc=time.time()
print('time taken',round(toc-tic))

In [None]:
# Tabulate the results (the 'thershold' column plus one accuracy column per
# model key added by main()) and save them as CSV without the index column.
df=pd.DataFrame(accuracy_data)
df.to_csv('accuracy-data.csv',index=False)

In [None]:
# Also save the accuracy_data dict itself as JSON.
with open('accuracy-data.json','w') as file:
    json.dump(accuracy_data,file)

In [None]:
# Run one more garbage-collection pass at the end of the notebook.
gc.collect()