# Library

In [None]:
!pip install gensim



In [None]:
import pandas as pd
import gensim
from gensim.models import word2vec
import time
import multiprocessing
from datetime import timedelta

# Load Data

In [None]:
# Colab-only: attach Google Drive so files under /content/drive are accessible.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Load the Simpsons script lines CSV from the mounted Drive folder and show (rows, columns).
csv_path = '/content/drive/MyDrive/Tugas Kampus/simpsons_script_lines.csv'
df = pd.read_csv(csv_path)
df.shape

(158271, 2)

In [None]:
# Preview the first five rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,raw_character_text,normalized_text
0,Miss Hoover,no actually it was a little of both sometimes ...
1,Lisa Simpson,wheres mr bergstrom
2,Miss Hoover,i dont know although id sure like to talk to h...
3,Lisa Simpson,that life is worth living
4,Edna Krabappel-Flanders,the polls will be open from now until the end ...


# Data Cleaning

In [None]:
# Count missing values in each column before cleaning.
df.isna().sum()

raw_character_text    17522
normalized_text       26184
dtype: int64

In [None]:
# Discard every row with a missing character name or missing text, renumber
# the index from 0, then re-count missing values to confirm none are left.
df = df.dropna(axis=0, how='any')
df = df.reset_index(drop=True)
df.isna().sum()

raw_character_text    0
normalized_text       0
dtype: int64

# Building the Corpus


In [None]:
# Build the training corpus: one dialogue line per text line, which is the
# one-sentence-per-line layout that word2vec.LineSentence reads back later.
# astype(str) guards against non-string cells, which would make str.join raise TypeError.
corpus_text = "\n".join(df['normalized_text'].astype(str))

# Save the corpus to the file 'corpus.txt'.
# Write as UTF-8 explicitly rather than relying on the platform's
# locale-dependent default encoding.
corpus_path = 'corpus.txt'
with open(corpus_path, 'w', encoding='utf-8') as f:
    f.write(corpus_text)

# Training Model

In [None]:
# Train a 300-dimensional Word2Vec model on the corpus file, save it to disk,
# and report how long the whole step took.
start_time = time.time()
print('Training Word2Vec Model...')
# LineSentence iterates over the corpus file, treating each line as one sentence.
sentences = word2vec.LineSentence(corpus_path)
w2v_model = word2vec.Word2Vec(
    sentences,
    vector_size=300,
    workers=multiprocessing.cpu_count(),  # one worker thread per CPU reported by the OS
)
w2v_model.save('model_word2vec_300_model')
finish_time = time.time()
# Previously the timestamps were recorded but never shown; report the elapsed
# wall-clock time (training + saving), rounded to whole seconds.
print(f'Finished. Elapsed time: {timedelta(seconds=round(finish_time - start_time))}')

Training Word2Vec Model...


# Test

In [None]:
# Similarity score between the learned vectors for 'woman' and 'man'.
w2v_model.wv.similarity(w1='woman', w2='man')

0.7685295

In [None]:
# Nearest neighbours of 'chicken' in the embedding space (default: top 10).
w2v_model.wv.most_similar(positive=['chicken'])

[('grilled', 0.8598350882530212),
 ('frozen', 0.8556227684020996),
 ('hairy', 0.8531014919281006),
 ('mexican', 0.8504650592803955),
 ('ruthless', 0.8499812483787537),
 ('canadian', 0.8488740921020508),
 ('jackie', 0.847919225692749),
 ('routine', 0.8433297276496887),
 ('hoot', 0.8408330678939819),
 ('dry', 0.8406938314437866)]

In [None]:
# Words whose vectors are closest to 'homer'.
w2v_model.wv.most_similar(positive=['homer'])

[('bart', 0.8471477031707764),
 ('marge', 0.8243023157119751),
 ('lisa', 0.8107187747955322),
 ('grampa', 0.6950166821479797),
 ('abe', 0.6775633692741394),
 ('moe', 0.6680812835693359),
 ('genes', 0.6554840207099915),
 ('dad', 0.6307328343391418),
 ('mrs', 0.6284840106964111),
 ('milhouse', 0.6198838949203491)]

In [None]:
# Words whose vectors are closest to 'marge'.
w2v_model.wv.most_similar(positive=['marge'])

[('homer', 0.8243023157119751),
 ('lisa', 0.7972437143325806),
 ('bart', 0.7665897607803345),
 ('honey', 0.7270253896713257),
 ('homie', 0.7102665901184082),
 ('moe', 0.696895956993103),
 ('dad', 0.6883558630943298),
 ('milhouse', 0.6762144565582275),
 ('son', 0.6656292676925659),
 ('maggie', 0.6571481227874756)]

In [None]:
# Similarity score between the learned vectors for 'maggie' and 'baby'.
w2v_model.wv.similarity(w1='maggie', w2='baby')

0.63048804

In [None]:
# Analogy query: woman + homer - marge, keeping only the three best matches.
w2v_model.wv.most_similar(positive=['woman', 'homer'], negative=['marge'], topn=3)

[('man', 0.712121844291687),
 ('person', 0.6678289771080017),
 ('bear', 0.6650072336196899)]

In [None]:
# Analogy query: woman + king - homer.
# NOTE(review): the classic form of this analogy subtracts 'man'; confirm 'homer' is intended here.
w2v_model.wv.most_similar(positive=['woman','king'], negative=['homer'])

[('star', 0.7295176982879639),
 ('evil', 0.7033276557922363),
 ('poet', 0.6975265741348267),
 ('birth', 0.694437563419342),
 ('mystery', 0.6919242143630981),
 ('group', 0.6870516538619995),
 ('player', 0.6857457756996155),
 ('famous', 0.6849767565727234),
 ('american', 0.6841424703598022),
 ('ancient', 0.6794134378433228)]

In [None]:
# Ask the model which of the five words fits least well with the others.
w2v_model.wv.doesnt_match('homer drink milk sleep angry'.split())

'angry'