In [132]:
# Installs
!pip3 install pandas
!pip3 install -U spacy
!python -m spacy download en
!pip3 install gensim

Collecting en-core-web-sm==3.1.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.1.0/en_core_web_sm-3.1.0-py3-none-any.whl (13.6 MB)
[!] As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the full
pipeline package name 'en_core_web_sm' instead.
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


Collecting gensim
  Downloading gensim-4.1.2-cp39-cp39-win_amd64.whl (24.0 MB)
Collecting scipy>=0.18.1
  Using cached scipy-1.7.1-cp39-cp39-win_amd64.whl (33.8 MB)
Collecting Cython==0.29.23
  Using cached Cython-0.29.23-cp39-cp39-win_amd64.whl (1.7 MB)
Installing collected packages: scipy, Cython, gensim
Successfully installed Cython-0.29.23 gensim-4.1.2 scipy-1.7.1


In [233]:
# Imports
import multiprocessing
import pandas as pd
import re 
import spacy

from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser

In [204]:
# Prepare df
df_file = pd.read_csv("simpsons_script_lines.csv", low_memory=False)
df = df_file[["raw_character_text", "spoken_words"]].copy()

In [205]:
# df details
df_shape = df.shape
df_null = df.isna().sum()

print(f"Shape: {df_shape}")
print(f"Null: {df_null}")

Shape: (158271, 2)
Null: raw_character_text    17522
spoken_words          26159
dtype: int64


In [206]:
# Drop null df
df = df.dropna().reset_index(drop=True)

In [207]:
# Create NLP
nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser']) 

In [220]:
# Create clean df
df_clean = df["spoken_words"].copy()
df_clean = [(re.sub("[^A-Za-z']+", ' ', line)).lower() for line in df_clean]
df_clean = pd.Series(df_clean)

In [221]:
# Drop duplicates of clean df
df_clean = df_clean.drop_duplicates().reset_index(drop=True)

In [222]:
# Split sentences
df_clean = [line.split() for line in df_clean]

[['no',
  'actually',
  'it',
  'was',
  'a',
  'little',
  'of',
  'both',
  'sometimes',
  'when',
  'a',
  'disease',
  'is',
  'in',
  'all',
  'the',
  'magazines',
  'and',
  'all',
  'the',
  'news',
  'shows',
  "it's",
  'only',
  'natural',
  'that',
  'you',
  'think',
  'you',
  'have',
  'it'],
 ["where's", 'mr', 'bergstrom'],
 ['i',
  "don't",
  'know',
  'although',
  "i'd",
  'sure',
  'like',
  'to',
  'talk',
  'to',
  'him',
  'he',
  "didn't",
  'touch',
  'my',
  'lesson',
  'plan',
  'what',
  'did',
  'he',
  'teach',
  'you'],
 ['that', 'life', 'is', 'worth', 'living'],
 ['the',
  'polls',
  'will',
  'be',
  'open',
  'from',
  'now',
  'until',
  'the',
  'end',
  'of',
  'recess',
  'now',
  'just',
  'in',
  'case',
  'any',
  'of',
  'you',
  'have',
  'decided',
  'to',
  'put',
  'any',
  'thought',
  'into',
  'this',
  "we'll",
  'have',
  'our',
  'final',
  'statements',
  'martin'],
 ['i', "don't", 'think', "there's", 'anything', 'left', 'to', 'say']

In [234]:
# Create relevant phrases
phrases = Phrases(df_clean, min_count=30, progress_per=10000)

In [235]:
# Dicard model state not needed for bigram detection
bigram = Phraser(phrases)

In [237]:
# Transorm the corpus based on the bigrams detected
sentences = bigram[df_clean]

In [243]:
# Create Word2Vec model
model_w2v = Word2Vec(
    min_count=20,
    window=2,
    vector_size=300,
    sample=6e-5, 
    alpha=0.03, 
    min_alpha=0.0007, 
    negative=20,
    workers=multiprocessing.cpu_count()-1
)

In [244]:
# Build vocabulary table
model_w2v.build_vocab(sentences, progress_per=10000)

In [245]:
# Train Word2Vec
model_w2v.train(sentences, total_examples=model_w2v.corpus_count, epochs=30, report_delay=1)

(12669601, 37774140)

In [260]:
# model_w2v.wv.most_similar(positive=["homer"])
# model_w2v.wv.most_similar(positive=["homer_simpson"])
# model_w2v.wv.most_similar(positive=["marge"])
# model_w2v.wv.most_similar(positive=["bart"])
model_w2v.wv.similarity('maggie', 'baby')
model_w2v.wv.similarity('bart', 'nelson')
model_w2v.wv.doesnt_match(['jimbo', 'milhouse', 'kearney'])
model_w2v.wv.doesnt_match(["nelson", "bart", "milhouse"])
model_w2v.wv.doesnt_match(['homer', 'patty', 'selma'])
model_w2v.wv.most_similar(positive=["woman", "homer"], negative=["marge"], topn=3)
model_w2v.wv.most_similar(positive=["woman", "bart"], negative=["man"], topn=3)

[('lisa', 0.5496333837509155),
 ('mom', 0.47789591550827026),
 ('maggie', 0.4347086548805237)]