In [1]:
import pickle
import multiprocessing

from time import time

from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser

## Load Data

In [2]:
def _read_pickle(path):
    """Return the object deserialized from the pickle file at ``path``."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Training and validation splits (features X, targets y) prepared upstream.
X_train = _read_pickle('../../data/x_train.pickle')
y_train = _read_pickle('../../data/y_train.pickle')
X_val = _read_pickle('../../data/x_val.pickle')
y_val = _read_pickle('../../data/y_val.pickle')

## Variables

In [3]:
# Notebook-level settings.
# NOTE(review): none of these are referenced by the cells visible in this
# notebook; presumably they are kept for a downstream tokenizer/model step
# — confirm before removing.
MAX_SEQ_LEN = 200
MAX_NUM_WORDS = 20000
BATCH_SIZE = 32  # previously 20
maxLen = 150

## Bigrams

In [4]:
# Pool the three text columns into a single corpus: each entry of `sent`
# is the whitespace-split token list of one cell of text.
# Assumes every value in these columns is a string (``.split()`` is called
# on each one) — TODO confirm there are no missing values.
sent = []
for col in ('question', 'question_body', 'answer'):
    sent.extend(text.split() for text in X_train[col])

In [6]:
# Fit a gensim Phrases model on the tokenized corpus `sent`.
# NOTE(review): `sentences` holds the Phrases model here, but the same name
# is re-bound further down to the output of `bigram[sent]`; re-running cells
# out of order will therefore see a different kind of object.
sentences = Phrases(sent)

In [6]:
# Wrap the fitted Phrases model (currently bound to `sentences`) in a Phraser,
# which is the object used below to transform the corpus.
bigram = Phraser(sentences)

In [7]:
# Apply the phrase transformation to the whole corpus. From this point on
# `sentences` is the transformed corpus passed to Word2Vec, no longer the
# Phrases model assigned to the same name earlier.
sentences = bigram[sent]

## Gensim word to vector

In [8]:
# Number of CPUs reported by the OS; used below to size the Word2Vec worker pool.
cores = multiprocessing.cpu_count() 
cores

12

In [9]:
# Sanity check: size of the phrase-transformed corpus.
len(sentences)

In [10]:
# Configure the Word2Vec model. No corpus is passed here, so nothing is
# trained yet: the vocabulary is built and training is run in later cells.
#
#   min_count : ignore words whose total frequency is below this value
#   window    : maximum distance between the current and the predicted word
#   size      : dimensionality of the word vectors (gensim 3.x keyword name)
#   sample    : threshold for down-sampling high-frequency words
#   alpha     : initial learning rate, decayed towards `min_alpha`
#   negative  : number of "noise" words drawn for negative sampling
#   workers   : number of training threads
#
# Keep one core free for the rest of the system, but never go below one
# worker: on a single-core machine `cores - 1` would request zero threads.
w2v_model = Word2Vec(min_count=50,
                     window=2,
                     size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=50,
                     workers=max(1, cores - 1))

In [11]:
# Build the vocabulary from the corpus (progress is reported every 10000
# sentences) and print the elapsed wall-clock time in minutes, using the same
# message format as the training cell.
t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
print('Time to build vocabulary: {} mins'.format(round((time() - t) / 60, 2)))

## Training Model

In [12]:
# Train the embeddings for 50 epochs over the corpus counted during
# build_vocab, then report how long the run took (in minutes).
train_start = time()
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=50,
    report_delay=1,
)
elapsed_mins = round((time() - train_start) / 60, 2)
print('Time to train the model: {} mins'.format(elapsed_mins))

In [13]:
# Embedding matrix learned by the model; its shape is unpacked as
# (number of vocabulary entries, vector dimensionality).
# `wv.vectors` is used instead of the deprecated `wv.syn0` alias, which emits
# a DeprecationWarning in gensim 3.x and no longer exists in gensim 4.
pretrained_weights = w2v_model.wv.vectors
vocab_size, embedding_size = pretrained_weights.shape

  """Entry point for launching an IPython kernel.


In [14]:
# Persist the trained model to disk with pickle (highest available protocol).
# NOTE(review): despite the "pretrained_weights" file name, the object written
# is the whole `w2v_model`, not the `pretrained_weights` matrix — confirm this
# is what the consumers of this file expect.
with open('../../data/w2v_pretrained_weights.pickle', 'wb') as handle:
    pickle.dump(w2v_model, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [16]:
# Embedding matrix of the trained model and its dimensions
# (vocabulary size, vector dimensionality).
# Read through `wv.vectors`; the `wv.syn0` alias is deprecated in gensim 3.x
# and removed in gensim 4.
pretrained_weights = w2v_model.wv.vectors
vocab_size, embedding_size = pretrained_weights.shape

  """Entry point for launching an IPython kernel.


In [17]:
# Qualitative check of the embeddings: nearest neighbours of "python".
# The query goes through the KeyedVectors (`.wv`); calling `most_similar`
# directly on the model is deprecated in gensim 3.x and removed in gensim 4.
w2v_model.wv.most_similar(['python'])

  """Entry point for launching an IPython kernel.


[('Python', 0.6817797422409058),
 ('install', 0.4983155131340027),
 ('using', 0.44793903827667236),
 ('library', 0.4355594515800476),
 ('module', 0.42699405550956726),
 ('installed', 0.41153961420059204),
 ('linux', 0.40717512369155884),
 ('libraries', 0.40403202176094055),
 ('virtualenv', 0.390068382024765),
 ('distutils', 0.3893689215183258)]

In [None]:
# © Laëtitia CONSTANTIN & Axel CHENU 2021

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=e80043e2-6875-4b65-a196-a0ffb97a1282' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>