In [1]:
import pandas as pd
from tqdm.auto import tqdm
import spacy

In [2]:
# Read only the three columns used in this notebook from the movies CSV.
df = pd.read_csv(
    "movies_data.csv",
    usecols=["title", "overview", "movie_type"],
)

In [3]:
# Per-column count of missing values (`isna` is the pandas alias of `isnull`).
df.isna().sum()

title         0
movie_type    0
overview      1
dtype: int64

In [4]:
# Show the rows whose overview is missing. The boolean mask is used directly;
# comparing it with `== True` is redundant.
df[df["overview"].isna()]

Unnamed: 0,title,movie_type,overview
6445,Ret,"['Drama', 'History']",


In [5]:
# Remove rows without an overview by condition instead of by the hard-coded
# index label 6445: the cell can then be re-run without a KeyError and keeps
# working if the rows in the CSV change. The index is left as-is (not reset).
df = df.dropna(subset=["overview"])
df.shape

(9999, 3)

In [6]:
# Load the `en_core_web_sm` pipeline with the parser and NER components
# disabled; only tokenisation, lemmas and stop-word/punctuation flags are used.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["parser", "ner"],
)

In [7]:
def preprocessing(text, batch_size=1000):
    """Turn a pandas Series of raw strings into lists of lemmas.

    Each value is cast to ``str``, lower-cased and stripped, then run through
    the module-level spaCy pipeline ``nlp``. Punctuation, stop words and
    whitespace-only tokens are discarded; the lemma of every remaining token
    is kept.

    Parameters
    ----------
    text : pandas.Series
        Raw texts. Values are cast with ``astype(str)``, so missing values
        would be processed as the literal string ``"nan"`` -- drop them
        beforehand.
    batch_size : int, default 1000
        Number of texts spaCy buffers per batch in ``nlp.pipe``.

    Returns
    -------
    list[list[str]]
        One list of lemmas per input row, in the original row order.
    """
    cleaned = text.astype(str).str.lower().str.strip().tolist()

    # Wrap the lazy pipeline generator so a progress bar is shown.
    docs = tqdm(nlp.pipe(cleaned, batch_size=batch_size), total=len(cleaned))

    return [
        [
            token.lemma_
            for token in doc
            # `is_space` drops the whitespace tokens spaCy emits for runs of
            # spaces/newlines inside a text; they are neither punctuation nor
            # stop words and would otherwise end up in the token lists.
            if not (token.is_punct or token.is_stop or token.is_space)
        ]
        for doc in docs
    ]

In [8]:
# Replace both text columns with their token lists (title first, then overview).
for column in ("title", "overview"):
    df[column] = preprocessing(df[column])

  0%|          | 0/9999 [00:00<?, ?it/s]

  0%|          | 0/9999 [00:00<?, ?it/s]

In [9]:
# Preview the first five rows after tokenisation.
df.head(n=5)

Unnamed: 0,title,movie_type,overview
0,"[shawshank, redemption]","['Drama', 'Crime']","[imprison, 1940, double, murder, wife, lover, ..."
1,[godfather],"['Drama', 'Crime']","[span, year, 1945, 1955, chronicle, fictional,..."
2,"[godfather, ii]","['Drama', 'Crime']","[continue, saga, corleone, crime, family, youn..."
3,"[schindler, list]","['Drama', 'History', 'War']","[true, story, businessman, oskar, schindler, s..."
4,"[12, angry, man]",['Drama'],"[defense, prosecution, rest, jury, file, jury,..."


In [10]:
# Training corpus: every title token list followed by every overview token list.
total_sentences = [*df["title"], *df["overview"]]
len(total_sentences)

19998

In [11]:
from gensim.models import Word2Vec

# Train word embeddings on the combined title/overview token lists.
model = Word2Vec(
    sentences=total_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [12]:
# Sanity check: the two nearest neighbours of "american" in the trained space.
# NOTE(review): the recorded similarities are ~0.9997, which looks saturated --
# possibly under-training on a small corpus; confirm before relying on them.
print("\nWord Similarity Test:")
model.wv.most_similar(positive=["american"], topn=2)


Word Similarity Test:


[('italian', 0.9997267723083496), ('local', 0.9996961951255798)]