<a href="https://colab.research.google.com/github/yashugupta786/word_vec_similarity/blob/master/word_vec_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing

import logging  # gensim reports its progress through the standard logging module

# Emit INFO-level records formatted as "LEVEL - HH:MM:SS: message".
logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt='%H:%M:%S',
)

In [3]:
# Read the raw dialogue CSV (the path lives under /content, as on Colab)
# and display its (rows, columns) shape.
DATASET_CSV = '/content/simpsons_dataset.csv'
df = pd.read_csv(DATASET_CSV)
df.shape

(158314, 2)

In [4]:
# Remove every row that has a missing value and renumber the index from 0.
df = (
    df
    .dropna()
    .reset_index(drop=True)
)
# Per-column count of remaining nulls (expected to be all zeros).
df.isna().sum()

raw_character_text    0
spoken_words          0
dtype: int64

In [0]:
# Load the English pipeline by its package name: the 'en' shortcut link only
# exists on spaCy 2.x and was removed in spaCy 3.
# NOTE(review): assumes 'en' pointed at en_core_web_sm (the default link) - confirm.
# Named Entity Recognition and the dependency parser are disabled for speed.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

def cleaning(doc):
    """Lemmatise a spaCy ``Doc`` and drop its stop words.

    Args:
        doc: An iterable of tokens exposing ``lemma_`` and ``is_stop``
            (a spaCy ``Doc`` in this notebook).

    Returns:
        The lemmas of the non-stop-word tokens joined by single spaces, or
        ``None`` when two or fewer lemmas remain. Word2Vec learns a word's
        vector from its context words, so such short sentences add very
        little to training.
    """
    lemmas = []
    for token in doc:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)

    # Too short to provide useful context: signal "discard" to the caller.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [0]:
# Lazily normalise each spoken line: every run of characters other than
# ASCII letters and apostrophes becomes a single space, then the text is
# lower-cased. This is a one-shot generator; rebuild it before iterating again.
non_letters = re.compile("[^A-Za-z']+")
brief_cleaning = (non_letters.sub(' ', str(line)).lower() for line in df['spoken_words'])

In [7]:
t = time()

# Stream the pre-cleaned lines through spaCy in batches of 5000 and reduce
# each resulting Doc to its lemmatised, stop-word-free text (or None).
# `n_threads` is no longer passed: spaCy >= 2.1 ignores it and spaCy 3's
# Language.pipe does not accept it.
txt = [cleaning(doc) for doc in nlp.pipe(brief_cleaning, batch_size=5000)]

print('Time to clean up everything: {} mins'.format(round((time() - t) / 60, 2)))

Time to clean up everything: 0.96 mins


In [8]:
# One cleaned sentence per row; discard the None entries produced for
# too-short sentences, then discard repeated sentences.
df_clean = (
    pd.DataFrame({'clean': txt})
    .dropna()
    .drop_duplicates()
)
df_clean.shape

(85964, 1)

In [9]:
# Peek at the first cleaned sentences. The index is not reset after the
# dropna/drop_duplicates step, so row labels may have gaps.
df_clean.head()

Unnamed: 0,clean
0,actually little disease magazine news show nat...
2,know sure like talk touch lesson plan teach
3,life worth live
4,poll open end recess case decide thought final...
7,victory party slide


In [10]:
from gensim.models.phrases import Phrases, Phraser

INFO - 08:01:43: 'pattern' package not found; tag filters are not available for English


In [0]:
# Tokenise every cleaned sentence on whitespace -> list of token lists.
sent = list(map(str.split, df_clean['clean']))

In [12]:
# Collect unigram and bigram counts over the tokenised corpus so frequent
# word pairs can be detected; min_count=30 is the minimum count threshold
# passed to gensim, and progress is logged every 10000 sentences.
phrases = Phrases(sent, min_count=30, progress_per=10000)

INFO - 08:02:09: collecting all words and their counts
INFO - 08:02:09: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 08:02:09: PROGRESS: at sentence #10000, processed 63561 words and 52816 word types
INFO - 08:02:10: PROGRESS: at sentence #20000, processed 130943 words and 99866 word types
INFO - 08:02:10: PROGRESS: at sentence #30000, processed 192972 words and 138532 word types
INFO - 08:02:10: PROGRESS: at sentence #40000, processed 249842 words and 172659 word types
INFO - 08:02:10: PROGRESS: at sentence #50000, processed 311265 words and 208566 word types
INFO - 08:02:10: PROGRESS: at sentence #60000, processed 373588 words and 243702 word types
INFO - 08:02:10: PROGRESS: at sentence #70000, processed 436441 words and 278740 word types
INFO - 08:02:10: PROGRESS: at sentence #80000, processed 497829 words and 311886 word types
INFO - 08:02:10: collected 330804 word types from a corpus of 537160 words (unigram + bigrams) and 85964 sentences
INFO - 08:02:10: us

In [14]:
# Export the collected phrase statistics into a Phraser, the object used
# below to transform sentences.
bigram = Phraser(phrases)


INFO - 08:02:55: source_vocab length 330804
INFO - 08:02:58: Phraser built with 126 phrasegrams


In [0]:
# Apply the phraser to the tokenised corpus; detected bigrams appear as
# single tokens joined with '_' (e.g. 'homer_simpson').
sentences = bigram[sent]


In [0]:
# Preview only the first few transformed sentences: printing the whole
# corpus (tens of thousands of sentences) floods the notebook output.
for idx, sentence in enumerate(sentences):
    if idx >= 5:
        break
    print(sentence)

In [22]:
# Count how often each token (unigram or detected bigram) occurs.
# The loop variables are deliberately not named `sent`: that name holds the
# tokenised corpus, and rebinding it here would break a later re-run of
# `sentences = bigram[sent]`.
word_freq = defaultdict(int)
for sentence in sentences:
    for token in sentence:
        word_freq[token] += 1
# Number of distinct tokens in the corpus.
len(word_freq)

30178

In [23]:
# Ten most frequent tokens, most frequent first (ties keep insertion order).
[word for word, _ in sorted(word_freq.items(), key=lambda pair: pair[1], reverse=True)[:10]]

['oh', 'like', 'know', 'get', 'hey', 'think', 'right', 'look', 'want', 'come']

In [0]:
import multiprocessing

from gensim.models import Word2Vec

In [0]:
# Hyper-parameters for the (not yet trained) Word2Vec model.
# NOTE: `size` is the gensim 3.x keyword for the vector dimensionality.
w2v_params = dict(
    min_count=20,      # minimum word count for a word to enter the vocabulary
    window=2,          # context window size
    size=300,          # dimensionality of the word vectors
    sample=6e-5,       # down-sampling threshold for frequent words
    alpha=0.03,        # initial learning rate
    min_alpha=0.0007,  # final learning rate
    negative=20,       # number of negative samples
)
w2v_model = Word2Vec(**w2v_params)

In [27]:
t = time()

# Scan the corpus once to build the model vocabulary, logging progress
# every 10000 sentences.
w2v_model.build_vocab(sentences, progress_per=10000)

elapsed_mins = round((time() - t) / 60, 2)
print('Time to build vocab: {} mins'.format(elapsed_mins))

INFO - 08:09:41: collecting all words and their counts
INFO - 08:09:41: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 08:09:41: PROGRESS: at sentence #10000, processed 61718 words, keeping 9558 word types
INFO - 08:09:41: PROGRESS: at sentence #20000, processed 127351 words, keeping 14506 word types
INFO - 08:09:41: PROGRESS: at sentence #30000, processed 187829 words, keeping 17619 word types
INFO - 08:09:42: PROGRESS: at sentence #40000, processed 243332 words, keeping 20385 word types
INFO - 08:09:42: PROGRESS: at sentence #50000, processed 303182 words, keeping 22878 word types
INFO - 08:09:42: PROGRESS: at sentence #60000, processed 363940 words, keeping 25200 word types
INFO - 08:09:42: PROGRESS: at sentence #70000, processed 425408 words, keeping 27401 word types
INFO - 08:09:42: PROGRESS: at sentence #80000, processed 485464 words, keeping 29275 word types
INFO - 08:09:42: collected 30178 word types from a corpus of 523700 raw words and 85964 sentence

Time to build vocab: 0.04 mins


In [28]:
t = time()

# Train for 30 passes over the corpus, using the sentence count stored on
# the model and reporting progress roughly every second.
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)

elapsed_mins = round((time() - t) / 60, 2)
print('Time to train the model: {} mins'.format(elapsed_mins))

INFO - 08:10:03: training model with 3 workers on 3319 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2
INFO - 08:10:04: EPOCH 1 - PROGRESS: at 37.40% examples, 74816 words/s, in_qsize 0, out_qsize 0
INFO - 08:10:05: EPOCH 1 - PROGRESS: at 78.53% examples, 76880 words/s, in_qsize 0, out_qsize 0
INFO - 08:10:06: worker thread finished; awaiting finish of 2 more threads
INFO - 08:10:06: worker thread finished; awaiting finish of 1 more threads
INFO - 08:10:06: worker thread finished; awaiting finish of 0 more threads
INFO - 08:10:06: EPOCH - 1 : training on 523700 raw words (198820 effective words) took 2.6s, 77786 effective words/s
INFO - 08:10:07: EPOCH 2 - PROGRESS: at 37.40% examples, 75235 words/s, in_qsize 0, out_qsize 0
INFO - 08:10:08: EPOCH 2 - PROGRESS: at 74.71% examples, 72498 words/s, in_qsize 0, out_qsize 0
INFO - 08:10:09: worker thread finished; awaiting finish of 2 more threads
INFO - 08:10:09: worker thread finished; awaiting finish of 1 mo

Time to train the model: 1.33 mins


In [29]:
# Precompute the L2 norms of the word vectors for similarity queries.
# NOTE(review): with replace=True gensim is documented to keep only the
# normalised vectors, so the model should not be trained further afterwards;
# this method is deprecated in gensim 4 - confirm against the installed version.
w2v_model.init_sims(replace=True)

INFO - 08:12:43: precomputing L2-norms of word weight vectors


In [30]:
# Nearest neighbours of the bigram token 'homer_simpson' in the trained vector space.
w2v_model.wv.most_similar(positive=["homer_simpson"])

  if np.issubdtype(vec.dtype, np.int):


[('pleased', 0.7633680701255798),
 ('congratulation', 0.7631620168685913),
 ('recent', 0.7597476243972778),
 ('governor', 0.7590283155441284),
 ('easily', 0.7576010823249817),
 ('council', 0.7408592700958252),
 ('hutz', 0.7347520589828491),
 ('robert', 0.7318389415740967),
 ('simon', 0.7308964729309082),
 ('committee', 0.7258784770965576)]

In [31]:
# Nearest neighbours of the unigram 'homer', for comparison with the bigram query.
w2v_model.wv.most_similar(positive=["homer"])

  if np.issubdtype(vec.dtype, np.int):


[('marge', 0.7794296145439148),
 ('rude', 0.7755630016326904),
 ('bongo', 0.7644156813621521),
 ('snuggle', 0.7563874125480652),
 ('wife', 0.7388522624969482),
 ('gee', 0.7364005446434021),
 ('sorry', 0.7309480309486389),
 ('worry', 0.727098822593689),
 ('hammock', 0.7240004539489746),
 ('sweetheart', 0.7224789261817932)]

In [32]:
# Nearest neighbours of the bigram token 'dr_hibbert'.
w2v_model.wv.most_similar(positive=["dr_hibbert"])

  if np.issubdtype(vec.dtype, np.int):


[('convince', 0.8436061143875122),
 ('rabbi', 0.831490159034729),
 ('hearing', 0.8250008821487427),
 ('catholic', 0.7999840378761292),
 ('bitch', 0.7841911911964417),
 ('rude', 0.7727895379066467),
 ('attract', 0.7696405649185181),
 ('stress', 0.7687424421310425),
 ('technically', 0.7678347229957581),
 ('crisis', 0.7677313089370728)]

In [33]:
# Nearest neighbours of the token 'bart'.
w2v_model.wv.most_similar(positive=["bart"])

  if np.issubdtype(vec.dtype, np.int):


[('lisa', 0.8431563377380371),
 ('surprised', 0.7916699051856995),
 ('homework', 0.7898970246315002),
 ('mom', 0.7895394563674927),
 ('convince', 0.7756944894790649),
 ('upset', 0.7745141386985779),
 ('mom_dad', 0.77126145362854),
 ('strangle', 0.7669622898101807),
 ('substitute', 0.7586274147033691),
 ('typical', 0.7566486597061157)]

In [37]:
from gensim.models import Phrases
# Toy demonstration of bigram detection on four short documents.
# The model and the probe sentence use `demo_*` names so this cell no longer
# overwrites the `bigram` phraser and the `sent` corpus built for training.
documents = ["the mayor of new york was there", "machine learning can be useful sometimes","new york mayor was present","My name is yashu gupta"]

sentence_stream = [doc.split(" ") for doc in documents]
# Low min_count/threshold so that 'new york' is detected in this tiny corpus.
demo_bigram = Phrases(sentence_stream, min_count=1, threshold=2)
demo_sent = ['the', 'mayor', 'of', 'new', 'york', 'was', 'there']
print(demo_bigram[demo_sent])

INFO - 08:40:58: collecting all words and their counts
INFO - 08:40:58: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 08:40:58: collected 37 word types from a corpus of 23 words (unigram + bigrams) and 4 sentences
INFO - 08:40:58: using 37 counts as vocab in Phrases<0 vocab, min_count=1, threshold=2, max_vocab_size=40000000>


['the', 'mayor', 'of', 'new_york', 'was', 'there']


