**Import Modules**

In [1]:
import re
import pandas as pd
import nltk
import multiprocessing
from gensim.models import Word2Vec



**Load data**

In [2]:
df = pd.read_csv('D:/MEGA/7.research meeting/TM_lectures/examples/simpsons_dataset.csv')
print(df.shape)

(158314, 2)


**Sample**

In [3]:
df.head()

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...


**Preprocessing**

In [4]:
brief_cleaning = (re.sub("[^A-Za-z']+", ' ', str(row)).lower() for row in df['spoken_words'])

**Sentence split**

In [5]:
sentences = []
for idx, s in enumerate(brief_cleaning):
    tokens = nltk.word_tokenize(s)
    tags = nltk.pos_tag(tokens)
    sent = [x for x, _ in tags]
    sentences.append(sent)
    if idx % 100 == 0:
        print('.', end='')

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

**Define model**

In [6]:
w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     workers=2)

**Make Vocab.**

In [7]:
w2v_model.build_vocab(sentences, progress_per=10000)

**Train**

In [14]:
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=50, report_delay=1)

(21539392, 71186800)

In [9]:
w2v_model.init_sims(replace=True)  # terminate training

**Use word2vec**

In [10]:
w2v_model.wv.most_similar(positive=["homer"])

[('lisa', 0.9954259395599365),
 ('homie', 0.9935392141342163),
 ('listen', 0.9933094382286072),
 ('bart', 0.9931513071060181),
 ('thinking', 0.9918088316917419),
 ('okay', 0.9913548827171326),
 ('um', 0.9911307692527771),
 ('sweetie', 0.9909362196922302),
 ('hear', 0.9902991056442261),
 ('doing', 0.9901739358901978)]

In [11]:
w2v_model.wv.most_similar(positive=["simpson"])

[('krabappel', 0.9785841703414917),
 ('hello', 0.9777948260307312),
 ('edna', 0.9726208448410034),
 ('dear', 0.9710251092910767),
 ('gay', 0.9703148603439331),
 ('goodbye', 0.9695864915847778),
 ('lame', 0.9674407839775085),
 ('flanders', 0.9668707847595215),
 ('sir', 0.9666049480438232),
 ('mrs', 0.9663475155830383)]

In [12]:
w2v_model.wv.most_similar(positive=["marge"])

[('anymore', 0.9972898960113525),
 ('dad', 0.9972525835037231),
 ('honey', 0.9969974756240845),
 ('homie', 0.9967259168624878),
 ('else', 0.9966236352920532),
 ('really', 0.9965067505836487),
 ('say', 0.9960315823554993),
 ('listen', 0.9955224990844727),
 ('understand', 0.9954319000244141),
 ('much', 0.9952390789985657)]

In [13]:
w2v_model.wv.most_similar(positive=["bart"])

[('lisa', 0.9966187477111816),
 ('sweetie', 0.9959205389022827),
 ('homie', 0.9951599836349487),
 ('listen', 0.9942280650138855),
 ('doing', 0.993816614151001),
 ('thinking', 0.9932003021240234),
 ('homer', 0.9931513071060181),
 ('hear', 0.99308180809021),
 ('am', 0.9929856061935425),
 ('okay', 0.992475152015686)]