# Import libraries

In [7]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing

import logging  # Used to surface gensim's progress messages in the notebook
# Emit INFO-and-above records formatted as "LEVEL - HH:MM:SS: message".
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

from gensim.models.phrases import Phrases, Phraser

# Reading Dataset

In [4]:
# Load the dataset from a path relative to the notebook's working directory.
df = pd.read_csv('dataset/english_1.csv')
# Show the loaded frame as the cell output.
df

Unnamed: 0,question,answer
0,"выберите слово, сходное по значению со словом...",profession.
1,"выберите слово, близкое по значению: believe.",suppose.
2,выберите правильное слово this is … way to sc...,the
3,выберите правильный вариант притяжательной ф...,student’s opportunity.
4,выберите правильный вариант сказуемого. there...,is.
...,...,...
1045,выберите правильную форму глагола в страдател...,will be paid.
1046,замените выделенное модальное выражение его э...,could
1047,задайте общий вопрос к предложению: my brothe...,did my brother use to be a taxi-driver?
1048,"выберите предложение, в котором инфинитив исп...",i want to have a rest.


In [12]:
import en_core_web_sm
# Load the packaged `en_core_web_sm` spaCy pipeline; no components are disabled here.
nlp = en_core_web_sm.load()
# Alternative kept for reference: load by shortcut name with Named Entity
# Recognition and the parser disabled for speed.
#nlp = spacy.load('en', disable=['ner', 'parser'])

def cleaning(doc):
    """Lemmatize a parsed document and drop its stop words.

    Parameters
    ----------
    doc : iterable of tokens (a spaCy ``Doc`` in this notebook)
        Every token must expose ``lemma_`` and ``is_stop``.

    Returns
    -------
    str or None
        The remaining lemmas joined by single spaces, or ``None`` when two or
        fewer lemmas are left.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Word2Vec learns a word from its context words, so a sentence of only one
    # or two words contributes next to nothing to training and is skipped.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [166]:
# Keep only ASCII letters and apostrophes (every other run of characters becomes
# a single space) and lowercase the result.
# Built as a list rather than a one-shot generator: a generator is exhausted after
# its first pass, so re-running the cell that consumes `brief_cleaning` would
# silently receive an empty input.
brief_cleaning = [re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['question']]

In [167]:
t = time()

# Lemmatize every pre-cleaned question, streaming them through spaCy in small batches.
# The former `n_threads=-1` argument was dropped: it is a deprecated no-op in
# spaCy 2.1+ (where -1 is already the default) and is not an accepted keyword of
# `Language.pipe` in spaCy 3.
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=10)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 0.06 mins


In [168]:
# One row per cleaned question; rows where `cleaning` returned None and exact
# duplicates are removed.
df_clean = (
    pd.DataFrame({'clean': txt})
    .dropna()
    .drop_duplicates()
)
df_clean.shape

(330, 1)

In [169]:
# Display the cleaned, de-duplicated sentences.
df_clean

Unnamed: 0,clean
2,way school
6,east west good
9,pete say apple favorite fruit
10,book read year
11,meet way home
...,...
833,big state usa
869,s use worry
918,know husband
977,arman bolatovich friend


In [200]:
# Tokenize the raw questions on whitespace.
# NOTE(review): whitespace splitting leaves punctuation attached to tokens.
# The lemmatized alternative would be: [row.split() for row in df_clean['clean']]
sent = [question.split() for question in df['question']]

In [201]:
# Collect bigram statistics over the tokenized questions, passing min_count=2 as
# the frequency cut-off; progress is logged every 100 sentences.
phrases = Phrases(sent, min_count=2, progress_per=100)
# Wrap the collected statistics in a Phraser, which is what gets applied to the corpus.
bigram = Phraser(phrases)

INFO - 18:16:11: collecting all words and their counts
INFO - 18:16:11: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 18:16:11: PROGRESS: at sentence #100, processed 1397 words and 1643 word types
INFO - 18:16:11: PROGRESS: at sentence #200, processed 2635 words and 2774 word types
INFO - 18:16:11: PROGRESS: at sentence #300, processed 3982 words and 3879 word types
INFO - 18:16:11: PROGRESS: at sentence #400, processed 5336 words and 4747 word types
INFO - 18:16:11: PROGRESS: at sentence #500, processed 6727 words and 5392 word types
INFO - 18:16:11: PROGRESS: at sentence #600, processed 8113 words and 5846 word types
INFO - 18:16:11: PROGRESS: at sentence #700, processed 9435 words and 6089 word types
INFO - 18:16:11: PROGRESS: at sentence #800, processed 10827 words and 6143 word types
INFO - 18:16:11: PROGRESS: at sentence #900, processed 12069 words and 6194 word types
INFO - 18:16:11: PROGRESS: at sentence #1000, processed 13347 words and 6222 word types
INF

In [202]:
# Apply the bigram model to the tokenized questions; detected pairs appear as a
# single token joined with "_".
sentences = bigram[sent]

In [203]:
# Count how often each token (merged bigrams included) occurs in the corpus.
# The loop variables are deliberately not called `sent`: reusing that name would
# rebind the notebook-level `sent` corpus to the last sentence, so re-running the
# cells that build `phrases` or `sentences` afterwards would use the wrong input.
word_freq = defaultdict(int)
for sentence in sentences:
    for token in sentence:
        word_freq[token] += 1
len(word_freq)

2120

In [204]:
# The ten most frequent tokens, most frequent first.
sorted(word_freq, key=word_freq.get, reverse=True)[:10]

['the',
 '…',
 'выберите',
 'to',
 'in',
 'a',
 'is',
 'правильный_вариант',
 'i',
 'of']

In [205]:
import multiprocessing

from gensim.models import Word2Vec

In [206]:
# Number of CPU cores on this machine; used below to size the training worker pool.
cores = multiprocessing.cpu_count()

In [207]:
# Configure the model only; the vocabulary is built and training is run in later cells.
w2v_model = Word2Vec(min_count=1,        # keep every token, even ones seen once
                     window=2,
                     size=100,           # dimensionality of the word vectors
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     # Leave one core free, but never request zero workers
                     # (cores - 1 == 0 on a single-core machine).
                     workers=max(1, cores - 1))

In [208]:
t = time()

# Scan the corpus once to build the model's vocabulary.
w2v_model.build_vocab(sentences, progress_per=10000)

vocab_minutes = round((time() - t) / 60, 2)
print('Time to build vocab: {} mins'.format(vocab_minutes))

INFO - 18:16:28: collecting all words and their counts
INFO - 18:16:28: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 18:16:28: collected 2120 word types from a corpus of 10872 raw words and 1050 sentences
INFO - 18:16:28: Loading a fresh vocabulary
INFO - 18:16:28: effective_min_count=1 retains 2120 unique words (100% of original 2120, drops 0)
INFO - 18:16:28: effective_min_count=1 leaves 10872 word corpus (100% of original 10872, drops 0)
INFO - 18:16:28: deleting the raw counts dictionary of 2120 items
INFO - 18:16:28: sample=6e-05 downsamples 1714 most-common words
INFO - 18:16:28: downsampling leaves estimated 4359 word corpus (40.1% of prior 10872)
INFO - 18:16:28: estimated required memory for 2120 words and 100 dimensions: 2756000 bytes
INFO - 18:16:28: resetting layer weights


Time to build vocab: 0.01 mins


In [209]:
t = time()

# Train for 200 epochs over the corpus counted during vocabulary building.
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=200,
    report_delay=1,
)

train_minutes = round((time() - t) / 60, 2)
print('Time to train the model: {} mins'.format(train_minutes))

INFO - 18:16:29: training model with 7 workers on 2120 vocabulary and 100 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2
INFO - 18:16:29: worker thread finished; awaiting finish of 6 more threads
INFO - 18:16:29: worker thread finished; awaiting finish of 5 more threads
INFO - 18:16:29: worker thread finished; awaiting finish of 4 more threads
INFO - 18:16:29: worker thread finished; awaiting finish of 3 more threads
INFO - 18:16:29: worker thread finished; awaiting finish of 2 more threads
INFO - 18:16:29: worker thread finished; awaiting finish of 1 more threads
INFO - 18:16:29: worker thread finished; awaiting finish of 0 more threads
INFO - 18:16:29: EPOCH - 1 : training on 10872 raw words (4273 effective words) took 0.0s, 95100 effective words/s
INFO - 18:16:29: worker thread finished; awaiting finish of 6 more threads
INFO - 18:16:29: worker thread finished; awaiting finish of 5 more threads
INFO - 18:16:29: worker thread finished; awaiting finish of 4 more threads
I

INFO - 18:16:30: worker thread finished; awaiting finish of 1 more threads
INFO - 18:16:30: worker thread finished; awaiting finish of 0 more threads
INFO - 18:16:30: EPOCH - 13 : training on 10872 raw words (4336 effective words) took 0.0s, 108226 effective words/s
INFO - 18:16:30: worker thread finished; awaiting finish of 6 more threads
INFO - 18:16:30: worker thread finished; awaiting finish of 5 more threads
INFO - 18:16:30: worker thread finished; awaiting finish of 4 more threads
INFO - 18:16:30: worker thread finished; awaiting finish of 3 more threads
INFO - 18:16:30: worker thread finished; awaiting finish of 2 more threads
INFO - 18:16:30: worker thread finished; awaiting finish of 1 more threads
INFO - 18:16:30: worker thread finished; awaiting finish of 0 more threads
INFO - 18:16:30: EPOCH - 14 : training on 10872 raw words (4304 effective words) took 0.0s, 110963 effective words/s
INFO - 18:16:30: worker thread finished; awaiting finish of 6 more threads
INFO - 18:16:30:

INFO - 18:16:30: worker thread finished; awaiting finish of 3 more threads
INFO - 18:16:30: worker thread finished; awaiting finish of 2 more threads
INFO - 18:16:30: worker thread finished; awaiting finish of 1 more threads
INFO - 18:16:30: worker thread finished; awaiting finish of 0 more threads
INFO - 18:16:30: EPOCH - 26 : training on 10872 raw words (4454 effective words) took 0.0s, 113056 effective words/s
INFO - 18:16:30: worker thread finished; awaiting finish of 6 more threads
INFO - 18:16:30: worker thread finished; awaiting finish of 5 more threads
INFO - 18:16:30: worker thread finished; awaiting finish of 4 more threads
INFO - 18:16:30: worker thread finished; awaiting finish of 3 more threads
INFO - 18:16:30: worker thread finished; awaiting finish of 2 more threads
INFO - 18:16:30: worker thread finished; awaiting finish of 1 more threads
INFO - 18:16:30: worker thread finished; awaiting finish of 0 more threads
INFO - 18:16:30: EPOCH - 27 : training on 10872 raw words 

INFO - 18:16:31: worker thread finished; awaiting finish of 5 more threads
INFO - 18:16:31: worker thread finished; awaiting finish of 4 more threads
INFO - 18:16:31: worker thread finished; awaiting finish of 3 more threads
INFO - 18:16:31: worker thread finished; awaiting finish of 2 more threads
INFO - 18:16:31: worker thread finished; awaiting finish of 1 more threads
INFO - 18:16:31: worker thread finished; awaiting finish of 0 more threads
INFO - 18:16:31: EPOCH - 39 : training on 10872 raw words (4376 effective words) took 0.0s, 109354 effective words/s
INFO - 18:16:31: worker thread finished; awaiting finish of 6 more threads
INFO - 18:16:31: worker thread finished; awaiting finish of 5 more threads
INFO - 18:16:31: worker thread finished; awaiting finish of 4 more threads
INFO - 18:16:31: worker thread finished; awaiting finish of 3 more threads
INFO - 18:16:31: worker thread finished; awaiting finish of 2 more threads
INFO - 18:16:31: worker thread finished; awaiting finish o

INFO - 18:16:32: worker thread finished; awaiting finish of 6 more threads
INFO - 18:16:32: worker thread finished; awaiting finish of 5 more threads
INFO - 18:16:32: worker thread finished; awaiting finish of 4 more threads
INFO - 18:16:32: worker thread finished; awaiting finish of 3 more threads
INFO - 18:16:32: worker thread finished; awaiting finish of 2 more threads
INFO - 18:16:32: worker thread finished; awaiting finish of 1 more threads
INFO - 18:16:32: worker thread finished; awaiting finish of 0 more threads
INFO - 18:16:32: EPOCH - 52 : training on 10872 raw words (4292 effective words) took 0.0s, 103238 effective words/s
INFO - 18:16:32: worker thread finished; awaiting finish of 6 more threads
INFO - 18:16:32: worker thread finished; awaiting finish of 5 more threads
INFO - 18:16:32: worker thread finished; awaiting finish of 4 more threads
INFO - 18:16:32: worker thread finished; awaiting finish of 3 more threads
INFO - 18:16:32: worker thread finished; awaiting finish o

INFO - 18:16:32: EPOCH - 64 : training on 10872 raw words (4335 effective words) took 0.0s, 107081 effective words/s
INFO - 18:16:32: worker thread finished; awaiting finish of 6 more threads
INFO - 18:16:32: worker thread finished; awaiting finish of 5 more threads
INFO - 18:16:32: worker thread finished; awaiting finish of 4 more threads
INFO - 18:16:32: worker thread finished; awaiting finish of 3 more threads
INFO - 18:16:32: worker thread finished; awaiting finish of 2 more threads
INFO - 18:16:32: worker thread finished; awaiting finish of 1 more threads
INFO - 18:16:32: worker thread finished; awaiting finish of 0 more threads
INFO - 18:16:32: EPOCH - 65 : training on 10872 raw words (4373 effective words) took 0.0s, 102572 effective words/s
INFO - 18:16:32: worker thread finished; awaiting finish of 6 more threads
INFO - 18:16:32: worker thread finished; awaiting finish of 5 more threads
INFO - 18:16:32: worker thread finished; awaiting finish of 4 more threads
INFO - 18:16:32:

INFO - 18:16:33: worker thread finished; awaiting finish of 1 more threads
INFO - 18:16:33: worker thread finished; awaiting finish of 0 more threads
INFO - 18:16:33: EPOCH - 77 : training on 10872 raw words (4425 effective words) took 0.0s, 104011 effective words/s
INFO - 18:16:33: worker thread finished; awaiting finish of 6 more threads
INFO - 18:16:33: worker thread finished; awaiting finish of 5 more threads
INFO - 18:16:33: worker thread finished; awaiting finish of 4 more threads
INFO - 18:16:33: worker thread finished; awaiting finish of 3 more threads
INFO - 18:16:33: worker thread finished; awaiting finish of 2 more threads
INFO - 18:16:33: worker thread finished; awaiting finish of 1 more threads
INFO - 18:16:33: worker thread finished; awaiting finish of 0 more threads
INFO - 18:16:33: EPOCH - 78 : training on 10872 raw words (4386 effective words) took 0.0s, 109973 effective words/s
INFO - 18:16:33: worker thread finished; awaiting finish of 6 more threads
INFO - 18:16:33:

INFO - 18:16:33: worker thread finished; awaiting finish of 3 more threads
INFO - 18:16:33: worker thread finished; awaiting finish of 2 more threads
INFO - 18:16:33: worker thread finished; awaiting finish of 1 more threads
INFO - 18:16:33: worker thread finished; awaiting finish of 0 more threads
INFO - 18:16:33: EPOCH - 90 : training on 10872 raw words (4365 effective words) took 0.1s, 73058 effective words/s
INFO - 18:16:33: worker thread finished; awaiting finish of 6 more threads
INFO - 18:16:33: worker thread finished; awaiting finish of 5 more threads
INFO - 18:16:33: worker thread finished; awaiting finish of 4 more threads
INFO - 18:16:33: worker thread finished; awaiting finish of 3 more threads
INFO - 18:16:33: worker thread finished; awaiting finish of 2 more threads
INFO - 18:16:33: worker thread finished; awaiting finish of 1 more threads
INFO - 18:16:33: worker thread finished; awaiting finish of 0 more threads
INFO - 18:16:33: EPOCH - 91 : training on 10872 raw words (

INFO - 18:16:34: worker thread finished; awaiting finish of 5 more threads
INFO - 18:16:34: worker thread finished; awaiting finish of 4 more threads
INFO - 18:16:34: worker thread finished; awaiting finish of 3 more threads
INFO - 18:16:34: worker thread finished; awaiting finish of 2 more threads
INFO - 18:16:34: worker thread finished; awaiting finish of 1 more threads
INFO - 18:16:34: worker thread finished; awaiting finish of 0 more threads
INFO - 18:16:34: EPOCH - 103 : training on 10872 raw words (4294 effective words) took 0.0s, 113880 effective words/s
INFO - 18:16:34: worker thread finished; awaiting finish of 6 more threads
INFO - 18:16:34: worker thread finished; awaiting finish of 5 more threads
INFO - 18:16:34: worker thread finished; awaiting finish of 4 more threads
INFO - 18:16:34: worker thread finished; awaiting finish of 3 more threads
INFO - 18:16:34: worker thread finished; awaiting finish of 2 more threads
INFO - 18:16:34: worker thread finished; awaiting finish 

INFO - 18:16:35: worker thread finished; awaiting finish of 6 more threads
INFO - 18:16:35: worker thread finished; awaiting finish of 5 more threads
INFO - 18:16:35: worker thread finished; awaiting finish of 4 more threads
INFO - 18:16:35: worker thread finished; awaiting finish of 3 more threads
INFO - 18:16:35: worker thread finished; awaiting finish of 2 more threads
INFO - 18:16:35: worker thread finished; awaiting finish of 1 more threads
INFO - 18:16:35: worker thread finished; awaiting finish of 0 more threads
INFO - 18:16:35: EPOCH - 116 : training on 10872 raw words (4423 effective words) took 0.0s, 89127 effective words/s
INFO - 18:16:35: worker thread finished; awaiting finish of 6 more threads
INFO - 18:16:35: worker thread finished; awaiting finish of 5 more threads
INFO - 18:16:35: worker thread finished; awaiting finish of 4 more threads
INFO - 18:16:35: worker thread finished; awaiting finish of 3 more threads
INFO - 18:16:35: worker thread finished; awaiting finish o

INFO - 18:16:35: EPOCH - 128 : training on 10872 raw words (4332 effective words) took 0.0s, 111336 effective words/s
INFO - 18:16:35: worker thread finished; awaiting finish of 6 more threads
INFO - 18:16:35: worker thread finished; awaiting finish of 5 more threads
INFO - 18:16:35: worker thread finished; awaiting finish of 4 more threads
INFO - 18:16:35: worker thread finished; awaiting finish of 3 more threads
INFO - 18:16:35: worker thread finished; awaiting finish of 2 more threads
INFO - 18:16:35: worker thread finished; awaiting finish of 1 more threads
INFO - 18:16:35: worker thread finished; awaiting finish of 0 more threads
INFO - 18:16:35: EPOCH - 129 : training on 10872 raw words (4371 effective words) took 0.0s, 107133 effective words/s
INFO - 18:16:35: worker thread finished; awaiting finish of 6 more threads
INFO - 18:16:35: worker thread finished; awaiting finish of 5 more threads
INFO - 18:16:35: worker thread finished; awaiting finish of 4 more threads
INFO - 18:16:3

INFO - 18:16:36: worker thread finished; awaiting finish of 1 more threads
INFO - 18:16:36: worker thread finished; awaiting finish of 0 more threads
INFO - 18:16:36: EPOCH - 141 : training on 10872 raw words (4300 effective words) took 0.0s, 104824 effective words/s
INFO - 18:16:36: worker thread finished; awaiting finish of 6 more threads
INFO - 18:16:36: worker thread finished; awaiting finish of 5 more threads
INFO - 18:16:36: worker thread finished; awaiting finish of 4 more threads
INFO - 18:16:36: worker thread finished; awaiting finish of 3 more threads
INFO - 18:16:36: worker thread finished; awaiting finish of 2 more threads
INFO - 18:16:36: worker thread finished; awaiting finish of 1 more threads
INFO - 18:16:36: worker thread finished; awaiting finish of 0 more threads
INFO - 18:16:36: EPOCH - 142 : training on 10872 raw words (4343 effective words) took 0.0s, 108574 effective words/s
INFO - 18:16:36: worker thread finished; awaiting finish of 6 more threads
INFO - 18:16:3

INFO - 18:16:37: worker thread finished; awaiting finish of 3 more threads
INFO - 18:16:37: worker thread finished; awaiting finish of 2 more threads
INFO - 18:16:37: worker thread finished; awaiting finish of 1 more threads
INFO - 18:16:37: worker thread finished; awaiting finish of 0 more threads
INFO - 18:16:37: EPOCH - 154 : training on 10872 raw words (4395 effective words) took 0.0s, 112048 effective words/s
INFO - 18:16:37: worker thread finished; awaiting finish of 6 more threads
INFO - 18:16:37: worker thread finished; awaiting finish of 5 more threads
INFO - 18:16:37: worker thread finished; awaiting finish of 4 more threads
INFO - 18:16:37: worker thread finished; awaiting finish of 3 more threads
INFO - 18:16:37: worker thread finished; awaiting finish of 2 more threads
INFO - 18:16:37: worker thread finished; awaiting finish of 1 more threads
INFO - 18:16:37: worker thread finished; awaiting finish of 0 more threads
INFO - 18:16:37: EPOCH - 155 : training on 10872 raw word

INFO - 18:16:38: EPOCH - 179 : training on 10872 raw words (4432 effective words) took 0.0s, 105911 effective words/s
INFO - 18:16:38: worker thread finished; awaiting finish of 6 more threads
INFO - 18:16:38: worker thread finished; awaiting finish of 5 more threads
INFO - 18:16:38: worker thread finished; awaiting finish of 4 more threads
INFO - 18:16:38: worker thread finished; awaiting finish of 3 more threads
INFO - 18:16:38: worker thread finished; awaiting finish of 2 more threads
INFO - 18:16:38: worker thread finished; awaiting finish of 1 more threads
INFO - 18:16:38: worker thread finished; awaiting finish of 0 more threads
INFO - 18:16:38: EPOCH - 180 : training on 10872 raw words (4377 effective words) took 0.0s, 108878 effective words/s
INFO - 18:16:38: worker thread finished; awaiting finish of 6 more threads
INFO - 18:16:38: worker thread finished; awaiting finish of 5 more threads
INFO - 18:16:38: worker thread finished; awaiting finish of 4 more threads
INFO - 18:16:3

INFO - 18:16:38: worker thread finished; awaiting finish of 1 more threads
INFO - 18:16:38: worker thread finished; awaiting finish of 0 more threads
INFO - 18:16:38: EPOCH - 192 : training on 10872 raw words (4405 effective words) took 0.0s, 104873 effective words/s
INFO - 18:16:38: worker thread finished; awaiting finish of 6 more threads
INFO - 18:16:38: worker thread finished; awaiting finish of 5 more threads
INFO - 18:16:38: worker thread finished; awaiting finish of 4 more threads
INFO - 18:16:38: worker thread finished; awaiting finish of 3 more threads
INFO - 18:16:38: worker thread finished; awaiting finish of 2 more threads
INFO - 18:16:38: worker thread finished; awaiting finish of 1 more threads
INFO - 18:16:38: worker thread finished; awaiting finish of 0 more threads
INFO - 18:16:38: EPOCH - 193 : training on 10872 raw words (4364 effective words) took 0.0s, 106994 effective words/s
INFO - 18:16:38: worker thread finished; awaiting finish of 6 more threads
INFO - 18:16:3

Time to train the model: 0.17 mins


In [210]:
# Precompute the L2 norms of the word vectors ahead of the similarity queries.
# NOTE(review): replace=True presumably discards the un-normalised vectors, so the
# model should not be trained further after this call — confirm against the
# installed gensim version (init_sims is reportedly deprecated in gensim 4).
w2v_model.init_sims(replace=True)

INFO - 18:16:39: precomputing L2-norms of word weight vectors


In [215]:
# Look up one raw question by its row label to pick query words from it.
df.loc[103, 'question']

' определите функцию герундия в следующем предложении: the meeting was planned for discussing and creating.  '

In [217]:
# Nearest neighbours of the three query words taken together; the words come from
# the English sentence in question 103.
w2v_model.wv.most_similar(positive=["the", "meeting", "was"])

[('recover.', 0.809019148349762),
 ('planned', 0.795873761177063),
 ('creating.', 0.7929407358169556),
 ('discussing', 0.7864329218864441),
 ('come.', 0.7704119086265564),
 ('homework', 0.7401517629623413),
 ('arman', 0.7391875982284546),
 ('следующем_предложении:', 0.7327815294265747),
 ('battle', 0.7293156385421753),
 ('forgot', 0.7284096479415894)]

In [213]:
# Nearest neighbours of a single query word.
w2v_model.wv.most_similar(positive=["city"])

[('was_called', 0.9857474565505981),
 ('land_round', 0.9845600724220276),
 ('district_of', 0.9678270816802979),
 ('site_of', 0.9589980244636536),
 ('columbia,_after', 0.9424110651016235),
 ('far_as', 0.9172682166099548),
 ('businessmen', 0.9110463857650757),
 ('christopher_columbus;', 0.9107587337493896),
 ('come_as', 0.8841084837913513),
 ('city_on', 0.8792195320129395)]