# Making word2vec model
In this IPython notebook we train word2vec models (CBOW and skip-gram) on the question text and save them for future use.
We also take a closer look at the text and analyze the data.

In [1]:
import pandas as pd # dataframe
import numpy as np # matrix maths
from tqdm import tqdm # progress bar
from helper import text_to_wordlist # NLP
import gensim.models.word2vec as word2vec # word2vec model
import multiprocessing # cpu_count
import os # saving the model

Using TensorFlow backend.


In [2]:
# read the Quora question-pair training set into a dataframe
train_csv_path = '~/Kaggle_Quora/train.csv'
train = pd.read_csv(train_csv_path)

In [3]:
# report (rows, columns) first, then preview the first ten question pairs
print(train.shape)
train.head(n = 10)

(404290, 6)


Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0
5,5,11,12,Astrology: I am a Capricorn Sun Cap moon and c...,"I'm a triple Capricorn (Sun, Moon and ascendan...",1
6,6,13,14,Should I buy tiago?,What keeps childern active and far from phone ...,0
7,7,15,16,How can I be a good geologist?,What should I do to be a great geologist?,1
8,8,17,18,When do you use シ instead of し?,"When do you use ""&"" instead of ""and""?",0
9,9,19,20,Motorola (company): Can I hack my Charter Moto...,How do I hack Motorola DCX3400 for free internet?,0


In [4]:
# count the missing values in every column
nan_per_column = train.isnull().sum()
print('[*]Total nan values:', nan_per_column)
# only question2 has missing entries (2 rows); the dataset is large enough
# that those rows can simply be discarded
train = train.dropna()
print('[*]train.shape:', train.shape)

[*]Total nan values: id              0
qid1            0
qid2            0
question1       0
question2       2
is_duplicate    0
dtype: int64
[*]train.shape: (404288, 6)


In [5]:
# pull both question columns out of the dataframe as plain python lists
q1 = train['question1'].tolist()
q2 = train['question2'].tolist()
# adding up the is_duplicate column gives the number of pairs flagged as duplicates
dupli_true = train['is_duplicate'].sum()
print('[*]Number of duplicate questions:', dupli_true)

[*]Number of duplicate questions: 149263


## Cleaning the questions
Now we tokenize the questions and see what the total number of unique tokens is.

In [6]:
# tokenize every question pair with the helper; a single progress bar covers
# both columns because each step handles one (question1, question2) pair
q1_sent, q2_sent = [], []
for first_q, second_q in tqdm(zip(q1, q2), total = len(q1)):
    q1_sent.append(text_to_wordlist(first_q))
    q2_sent.append(text_to_wordlist(second_q))

100%|██████████| 404288/404288 [00:51<00:00, 7891.23it/s]


In [7]:
# sanity check: each tokenized question is a list whose items are strings
first_sentence = q1_sent[0]
first_token = first_sentence[0]
print(first_sentence)
print(type(first_sentence))
print(first_token)
print(type(first_token))

['what', 'is', 'the', 'step', 'by', 'step', 'guide', 'to', 'invest', 'in', 'share', 'market', 'in', 'india']
<class 'list'>
what
<class 'str'>


In [8]:
# Combined training corpus: every tokenized question1 followed by every
# tokenized question2.
# Concatenate into a NEW list. Binding q_total to q1_sent and then using
# `+=` would extend q1_sent itself in place, so q1_sent would silently hold
# both columns and re-running this cell would keep growing the corpus.
q_total = q1_sent + q2_sent

In [9]:
# show the final sentence of the combined corpus and the corpus size
last_sentence = q_total[-1]
print(last_sentence)
print(len(q_total))

['what', 'is', 'it', 'like', 'to', 'have', 'sex', 'with', 'your', 'cousin']
808576


## Making our own Word2Vec model

In [13]:
# word2vec hyper-parameters shared by the models trained below

# --- model shape ---
e_dim = 300 # size of each word vector
context_size = 10 # window: words considered on either side of the focus word,
# i.e. the maximum distance between the current and the predicted word in a sentence
sg = 0 # architecture switch: 0 -> CBOW, 1 -> skip-gram (library default is 0)

# --- vocabulary / sampling ---
min_word_count = 1 # a word must occur at least this many times to be kept
downsampling = 1e-5 # threshold above which frequent words are randomly downsampled

# --- training run ---
epochs = 5 # number of passes over the corpus
seed = 1 # random seed, so the model can be rebuilt
# NOTE(review): with more than one worker, thread scheduling may still make runs differ - confirm
num_workers = multiprocessing.cpu_count() # one worker per available CPU

#### CBOW Model

In [150]:
# Making the word2vec model --> CBOW
# sg is pinned to 0 so this cell always trains the CBOW model that it saves
# under the CBOW file name (the skip-gram cell pins sg = 1 the same way).
w2vector_cbow = word2vec.Word2Vec(sg = 0, seed = seed, size = e_dim, workers = num_workers,
                             min_count = min_word_count, window = context_size,
                             sample = downsampling, iter = epochs)
# building the vocabulary from the combined question corpus
w2vector_cbow.build_vocab(q_total)
print("Word2Vec vocabulary length:", len(w2vector_cbow.wv.vocab))

# training the model
w2vector_cbow.train(q_total, total_examples = w2vector_cbow.corpus_count, epochs = w2vector_cbow.iter)

# saving the model
# exist_ok makes this safe to re-run; the private os._exists helper does not
# look at the filesystem, so it cannot be used to guard the directory creation
os.makedirs('trained', exist_ok = True)
w2vector_cbow.save(os.path.join('trained', 'w2vector_CBOW.w2v'))

#### Skip-gram model

In [14]:
# Making the word2vec model --> skip_grams (sg = 1 selects the skip-gram architecture)
w2vector_sg = word2vec.Word2Vec(sg = 1, seed = seed, size = e_dim, workers = num_workers,
                             min_count = min_word_count, window = context_size,
                             sample = downsampling, iter = epochs)

# building the vocabulary from the combined question corpus
w2vector_sg.build_vocab(q_total)
print("Word2Vec vocabulary length:", len(w2vector_sg.wv.vocab))

# training the model
w2vector_sg.train(q_total, total_examples = w2vector_sg.corpus_count, epochs = w2vector_sg.iter)

# saving the model
# exist_ok makes this safe to re-run; the private os._exists helper does not
# look at the filesystem, so it cannot be used to guard the directory creation
os.makedirs('trained', exist_ok = True)
w2vector_sg.save(os.path.join('trained', 'w2vector_sg.w2v'))

Word2Vec vocabulary length: 86005


## Looking at the data
Now that we have saved the models, we are going to play around with the learned word vectors.

In [15]:
# nearest neighbours of 'country' in the skip-gram embedding space;
# queried through the model's keyed vectors (.wv) rather than the deprecated
# query method on the model object itself
w2vector_sg.wv.most_similar('country')

[('countries', 0.893531322479248),
 ('nation', 0.855096697807312),
 ('asia', 0.8382056355476379),
 ('china', 0.837762713432312),
 ('indonesia', 0.8323123455047607),
 ('foreigner', 0.8274473547935486),
 ('european', 0.8260392546653748),
 ('superpower', 0.8230191469192505),
 ('africa', 0.8229018449783325),
 ('decades', 0.8225271105766296)]

In [16]:
def nearest_similarity_cosmul(start1, end1, end2):
    """Complete the analogy "start1 is to end1 as ? is to end2" with the skip-gram model.

    The missing word is the top result of a most_similar_cosmul query on the
    skip-gram model's word vectors, with `end2` and `start1` as positive
    words and `end1` as the negative word.

    Args:
        start1: first word of the known pair.
        end1: second word of the known pair.
        end2: second word of the pair to complete.

    Returns:
        The best-scoring word (start2). The full analogy sentence is also printed.
    """
    # query the keyed vectors (.wv); calling the query on the model object is deprecated
    similarities = w2vector_sg.wv.most_similar_cosmul(
        positive=[end2, start1],
        negative=[end1]
    )
    # results are (word, score) pairs, best first
    start2 = similarities[0][0]
    # pass the fields explicitly instead of unpacking locals(), so unrelated
    # local variables can never leak into (or break) the message
    print("{start1} is related to {end1}, as {start2} is related to {end2}".format(
        start1=start1, end1=end1, start2=start2, end2=end2))
    return start2

In [22]:
# Simple tasks for word2vec model
nearest_similarity_cosmul("eat", "eating", "drinking")
nearest_similarity_cosmul("drink", "drinking", "driving")
nearest_similarity_cosmul("man", "king", "queen")
nearest_similarity_cosmul("quit", "india", "india")

eat is related to eating, as drink is related to drinking
drink is related to drinking, as license is related to driving
man is related to king, as woman is related to queen
quit is related to india, as quitting is related to india


'quitting'