# Similar Question finder

In [1]:
import gensim
import numpy as np
from gensim.models.keyedvectors import KeyedVectors



In [28]:
def load_data(filename):
    """Read a tab-separated file into a list of rows.

    Each line is stripped of surrounding whitespace and split on tab
    characters, so every row is a list of strings.

    Args:
        filename: path of the UTF-8 encoded TSV file to read.

    Returns:
        A list with one list of fields per line of the file.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        return [row.strip().split('\t') for row in handle]

In [29]:
# Read the validation split; each element of `data` is the list of
# tab-separated fields taken from one line of the file.
data = load_data(filename='data/validation.tsv')

In [35]:
from sklearn.metrics.pairwise import cosine_similarity

In [38]:
def rank_candidates(question, candidates, embeddings, dim=300):
    """Rank candidate questions by cosine similarity to ``question``.

    Args:
        question: the query question as a whitespace-separated string.
        candidates: sequence of candidate question strings.
        embeddings: word -> vector lookup handed to ``question_embedding``.
        dim: embedding dimensionality. Kept for backward compatibility with
            existing callers; it is not needed by this function.

    Returns:
        A list of ``(index, candidate)`` pairs ordered from most to least
        similar, where ``index`` is the candidate's position in
        ``candidates``. Returns an empty list when there are no candidates.
    """
    if len(candidates) == 0:
        return []

    # Use the embeddings that were passed in, not a module-level global.
    question_embed = question_embedding(question, embeddings)
    # One row per candidate: shape (n_candidates, embedding_dim).
    candidates_embed = np.array([question_embedding(candidate, embeddings)
                                 for candidate in candidates])

    # Compare the single question row against every candidate row; the
    # candidate matrix must stay 2-D so each candidate gets its own score.
    candidates_sim = cosine_similarity(question_embed.reshape(1, -1),
                                       candidates_embed)[0]

    # Most similar first; the sort is stable, so ties keep input order.
    ranked_indices = sorted(range(len(candidates)),
                            key=lambda index: candidates_sim[index],
                            reverse=True)
    return [(index, candidates[index]) for index in ranked_indices]

In [2]:
# Load the pretrained GoogleNews word2vec vectors (binary format).
# `limit` caps how many word vectors are read from the file -- presumably
# to keep memory use and load time manageable.
word_embeddings = KeyedVectors.load_word2vec_format(
                    'embeddings/GoogleNews-vectors-negative300.bin',
                    binary=True, limit=500000)

In [4]:
# sanity checking: confirm a known word is in the vocabulary, then run the
# analogy query king - man + woman and print its nearest neighbours
print('neural' in word_embeddings)
print(word_embeddings.most_similar(positive=['woman', 'king'], negative=['man']))

True
[('queen', 0.7118192911148071), ('monarch', 0.6189674139022827), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321243286133), ('kings', 0.5236844420433044), ('queens', 0.518113374710083), ('sultan', 0.5098593235015869), ('monarchy', 0.5087411999702454), ('royal_palace', 0.5087165832519531)]


In [3]:
# inspect the shape of a single word vector to confirm the embedding size
word_embeddings['apple'].shape

(300,)

In [3]:
# for representing the question as vector of embeddings for 
# known words in question. We take the mean of all the embeddings for the 
# words of the question
def question_embedding(question, word_embeddings, dim=None):
    """Embed a question as the mean of its known word vectors.

    Args:
        question: the question text; it is tokenized on whitespace.
        word_embeddings: word -> vector lookup supporting ``in`` and
            ``[]`` (for example a gensim ``KeyedVectors`` or a dict).
        dim: length of the zero vector returned when no word of the
            question is known. Defaults to ``word_embeddings.vector_size``
            when that attribute exists, otherwise 300 (the size of the
            GoogleNews vectors used in this notebook).

    Returns:
        A 1-D numpy array: the element-wise mean of the vectors of all
        in-vocabulary words, or ``np.zeros(dim)`` if there are none.
    """
    if dim is None:
        dim = getattr(word_embeddings, 'vector_size', 300)

    # keep only the words the pretrained embedding knows about
    known_vectors = [word_embeddings[word] for word in question.split()
                     if word in word_embeddings]

    # no known word at all: fall back to an all-zero vector
    if not known_vectors:
        return np.zeros(dim)

    # stack to shape (no. of known words, embedding size) and average
    # over the words (axis=0) to get one vector for the whole question
    return np.mean(np.array(known_vectors), axis=0)

## <u>Evaluation metrics</u>
We will use two evaluation metrics for this task.
    
***1. Hits@k***

***2. DCG@k***
    

In [11]:
def hits_k(k, ranks):
    """Compute Hits@k: the fraction of ranks that fall within the top k.

    Args:
        k: (int) stating the window size to scan.
        ranks: list of ranks for each duplicate.

    Returns:
        The mean of the indicator ``rank <= k`` over ``ranks``, or 0.0
        when ``ranks`` is empty (avoids a NaN and a numpy warning).
    """
    ranks = np.asarray(ranks)
    if ranks.size == 0:
        return 0.0
    return (ranks <= k).mean()

In [24]:
def dcg_k(k, ranks):
    """Compute DCG@k averaged over all ranks.

    Each rank contributes ``1 / log2(1 + rank)`` when ``rank <= k`` and 0
    otherwise; the contributions are then averaged.

    Args:
        k: (int) stating the window size to scan.
        ranks: list of ranks for each duplicate.

    Returns:
        The mean discounted gain over ``ranks``, or 0.0 when ``ranks`` is
        empty (avoids a NaN and a numpy warning).
    """
    ranks = np.asarray(ranks)
    if ranks.size == 0:
        return 0.0
    # the boolean mask zeroes the gain of every rank outside the window
    gains = (ranks <= k) / np.log2(1 + ranks)
    return gains.mean()