In [11]:
from __future__ import print_function
import gensim
import numpy as np
# Print the installed gensim version so the environment is recorded in the cell output.
print(gensim.__version__)


4.2.0


In [12]:
from gensim.models import KeyedVectors

# Load pre-trained word vectors stored in word2vec format into a KeyedVectors object.
# NOTE(review): the file name suggests 300-dimensional vectors for ~1M tokens — confirm against the file.
emb_model = KeyedVectors.load_word2vec_format('./embeddings/wiki-news-300d-1M.vec')

In [13]:
# Collect every token in the model's vocabulary. `list(...)` copies the keys
# in one step instead of appending them one at a time.
words = list(emb_model.index_to_key)

# Report how many tokens are available
print("Number of Tokens: {}".format(len(words)))

Number of Tokens: 999994


In [14]:
def get_sample_words_embeddings(emb_model, size=100, dim=None):
    """Draw a random sample of vocabulary words and stack their vectors.

    Words are drawn with ``np.random.choice`` using its defaults, so the same
    word may appear more than once in the sample. Call ``np.random.seed``
    beforehand if a reproducible sample is needed.

    Parameters
    ----------
    emb_model : object
        Embedding model exposing ``index_to_key`` (iterable of words),
        ``emb_model[word]`` vector lookup and, when ``dim`` is omitted, a
        ``vector_size`` attribute.
    size : int, default 100
        Number of words to sample.
    dim : int, optional
        Length of each word vector. Defaults to ``emb_model.vector_size`` so
        the function is not tied to 300-dimensional models.

    Returns
    -------
    sample_words : numpy.ndarray
        The ``size`` sampled words.
    embedding : numpy.ndarray
        Float64 array of shape ``(size, dim)``; row ``i`` holds the vector of
        ``sample_words[i]``.

    Raises
    ------
    ValueError
        If a looked-up vector cannot be stored in a row of length ``dim``.
    """
    if dim is None:
        dim = emb_model.vector_size

    sample_words = np.random.choice(list(emb_model.index_to_key), size=size)

    # Preallocate the result once; growing an array with np.append inside the
    # loop would re-copy the whole buffer on every iteration.
    embedding = np.empty((size, dim), dtype=np.float64)
    for row, word in enumerate(sample_words):
        embedding[row] = emb_model[word]

    return sample_words, embedding


# Sampling configuration: vector length of the loaded embeddings and the
# number of words to draw.
dim = 300
size = 100
# Pass the settings explicitly so that editing them above actually changes
# the sample (previously they were defined but never used by the call).
sample_words, embeddings = get_sample_words_embeddings(emb_model, size=size, dim=dim)


In [15]:
def get_analogy(example, query, emb_model):
    """Answer the analogy ``example[0] : example[1] :: query : ?``.

    Asks ``emb_model.most_similar`` for the single best match, using
    ``query`` and ``example[1]`` as positive words and ``example[0]`` as the
    negative word.

    Parameters
    ----------
    example : sequence
        Pair of words illustrating the relation.
    query : str
        Word to which the relation is applied.
    emb_model : object
        Model providing ``most_similar(positive=..., negative=..., topn=...)``.

    Returns
    -------
    The first element of the top-ranked result returned by ``most_similar``.
    """
    ranked = emb_model.most_similar(
        positive=[query, example[1]],
        negative=[example[0]],
        topn=1,
    )
    top_hit = ranked[0]
    return top_hit[0]


In [16]:
# Word analogies
# boy : man :: girl : ?
example = ['boy', 'man']
query = 'girl'

get_analogy(example=example, query=query, emb_model=emb_model)

'woman'

In [17]:
# man : woman :: king : ?
example = ['man', 'woman']
query = 'king'

get_analogy(example=example, query=query, emb_model=emb_model)

'queen'

In [18]:
# Japan : France :: Tokyo : ?
example = ['Japan', 'France']
query = 'Tokyo'

get_analogy(example=example, query=query, emb_model=emb_model)

'Paris'

In [19]:
# big : biggest :: old : ?
example = ['big', 'biggest']
query = 'old'

get_analogy(example=example, query=query, emb_model=emb_model)

'oldest'

### Validation <a class="anchor" id="analogy-validate"></a>

In [None]:
import pandas as pd

In [None]:
# Read the space-separated analogy file; it has no header row, so column
# names are assigned explicitly afterwards.
val_data = pd.read_csv('../data/Analogy_dataset.txt', sep=" ", header=None)
# e1, e2: the example pair; q: the query word; a: the expected answer.
val_data.columns = ["e1", "e2", "q", "a"]

In [None]:
# Preview the first rows of the loaded analogy data.
val_data.head()

In [None]:
def get_analogy_by_row(row):
    """Predict the analogy answer for one row of the validation data.

    Reads the example pair from ``row['e1']`` / ``row['e2']`` and the query
    word from ``row['q']``, then delegates to ``get_analogy`` using the
    notebook-level ``emb_model``.

    Returns
    -------
    The word predicted by ``get_analogy`` for this row.
    """
    pair = [row['e1'], row['e2']]
    return get_analogy(pair, row['q'], emb_model)

# Run the analogy prediction on every row (axis=1 passes each row to the
# function) and store the result in a new 'pred' column.
val_data['pred'] = val_data.apply(get_analogy_by_row, axis = 1)

In [None]:
# Preview the first rows again, now including the 'pred' column.
val_data.head()

In [None]:
# Flag each row with 1 when the predicted word equals the expected answer
# and 0 otherwise. The comparison is done column-wise in one vectorised
# step rather than calling a Python lambda per row.
val_data['is_accurate'] = val_data['a'].eq(val_data['pred']).astype('int64')

In [None]:
# Persist the validation data together with predictions and accuracy flags.
# The DataFrame index is written as the first CSV column (to_csv default).
val_data.to_csv('../data/analogy_test_set_with_predictions.csv')

In [None]:
# Reload the predictions file written by the `to_csv` cell above, so the
# accuracy below is computed on this run's output. The previous path pointed
# at a differently named file under './data/' that this notebook never
# creates, which breaks a fresh Restart & Run All.
val_data = pd.read_csv('../data/analogy_test_set_with_predictions.csv')

In [None]:
# Share of rows whose prediction matched the expected answer, as a percentage.
n_correct = val_data['is_accurate'].sum()
n_total = len(val_data)
accuracy = n_correct / n_total * 100
print('% of CORRECT predictions: ', accuracy, ' %')