In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
df = pd.read_csv("movies_data.csv")

In [3]:
df.shape

(10000, 4)

In [4]:
df.head()

Unnamed: 0,title,rating,movie_type,overview
0,The Shawshank Redemption,8.712,"['Drama', 'Crime']",Imprisoned in the 1940s for the double murder ...
1,The Godfather,8.686,"['Drama', 'Crime']","Spanning the years 1945 to 1955, a chronicle o..."
2,The Godfather Part II,8.571,"['Drama', 'Crime']",In the continuing saga of the Corleone crime f...
3,Schindler's List,8.566,"['Drama', 'History', 'War']",The true story of how businessman Oskar Schind...
4,12 Angry Men,8.549,['Drama'],The defense and the prosecution have rested an...


In [5]:
import ast
df['movie_type'] = df['movie_type'].apply(ast.literal_eval)

In [6]:
df = df.drop(columns="rating")
df.head()

Unnamed: 0,title,movie_type,overview
0,The Shawshank Redemption,"[Drama, Crime]",Imprisoned in the 1940s for the double murder ...
1,The Godfather,"[Drama, Crime]","Spanning the years 1945 to 1955, a chronicle o..."
2,The Godfather Part II,"[Drama, Crime]",In the continuing saga of the Corleone crime f...
3,Schindler's List,"[Drama, History, War]",The true story of how businessman Oskar Schind...
4,12 Angry Men,[Drama],The defense and the prosecution have rested an...


In [7]:
df[df['overview'].isnull() == True]

Unnamed: 0,title,movie_type,overview
6445,Ret,"[Drama, History]",


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       10000 non-null  object
 1   movie_type  10000 non-null  object
 2   overview    9999 non-null   object
dtypes: object(3)
memory usage: 234.5+ KB


In [9]:
df = df.drop(6445)
df.shape

(9999, 3)

## text cleaning:

In [10]:
def lower_casing(text_string):
    if isinstance(text_string, str):
        return text_string.lower()
    return text_string

In [11]:
for column in df.columns:
    df[column] = df[column].apply(lower_casing)

In [12]:
df.head()

Unnamed: 0,title,movie_type,overview
0,the shawshank redemption,"[Drama, Crime]",imprisoned in the 1940s for the double murder ...
1,the godfather,"[Drama, Crime]","spanning the years 1945 to 1955, a chronicle o..."
2,the godfather part ii,"[Drama, Crime]",in the continuing saga of the corleone crime f...
3,schindler's list,"[Drama, History, War]",the true story of how businessman oskar schind...
4,12 angry men,[Drama],the defense and the prosecution have rested an...


## punctuation removal:

In [13]:
import string

In [14]:
def remove_puncuation(text_string):
    if not isinstance(text_string, str):
        return text_string
    return text_string.translate(str.maketrans("", "", string.punctuation))

In [15]:
for column in df.columns:
    df[column] = df[column].apply(lower_casing)

In [16]:
df.head()

Unnamed: 0,title,movie_type,overview
0,the shawshank redemption,"[Drama, Crime]",imprisoned in the 1940s for the double murder ...
1,the godfather,"[Drama, Crime]","spanning the years 1945 to 1955, a chronicle o..."
2,the godfather part ii,"[Drama, Crime]",in the continuing saga of the corleone crime f...
3,schindler's list,"[Drama, History, War]",the true story of how businessman oskar schind...
4,12 angry men,[Drama],the defense and the prosecution have rested an...


## Tokenization:

In [17]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [18]:
def tokenization(text_series):
    return [[token.text for token in nlp(doc)] for doc in text_series]

In [19]:
for column in tqdm(df.columns, colour="green"):
    if column != "movie_type":
        df[column] = tokenization(df[column])

100%|[32m███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████[0m| 3/3 [02:41<00:00, 53.77s/it][0m


In [20]:
df.head()

Unnamed: 0,title,movie_type,overview
0,"[the, shawshank, redemption]","[Drama, Crime]","[imprisoned, in, the, 1940s, for, the, double,..."
1,"[the, godfather]","[Drama, Crime]","[spanning, the, years, 1945, to, 1955, ,, a, c..."
2,"[the, godfather, part, ii]","[Drama, Crime]","[in, the, continuing, saga, of, the, corleone,..."
3,"[schindler, 's, list]","[Drama, History, War]","[the, true, story, of, how, businessman, oskar..."
4,"[12, angry, men]",[Drama],"[the, defense, and, the, prosecution, have, re..."


## stopword removal:

In [21]:
def stop_words_removal(token_lists):
    return [[token for token in list_n if token not in nlp.Defaults.stop_words] for list_n in token_lists]

In [22]:
for column in tqdm(df.columns, colour="green"):
    if column != "movie_type":
        df[column] = stop_words_removal(df[column])

100%|[32m███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████[0m| 3/3 [00:00<00:00, 13.53it/s][0m


In [2]:
df.head()

NameError: name 'df' is not defined

## limmitization(spacy)(stemming is same as limmitization but has very low accuracy)

In [30]:
def lemmatization(token_lists):
    return [[token.lemma_ for token in nlp(" ".join(tokens))] for tokens in token_lists]

In [31]:
for column in tqdm(df.columns, colour="green"):
    if column != "movie_type":
        df[column] = lemmatization(df[column])

100%|[32m███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████[0m| 3/3 [02:07<00:00, 42.44s/it][0m


In [32]:
df.head()

Unnamed: 0,title,movie_type,overview
0,"[shawshank, redemption]","[Drama, Crime]","[imprison, 1940, double, murder, wife, lover, ..."
1,[godfather],"[Drama, Crime]","[span, year, 1945, 1955, ,, chronicle, fiction..."
2,"[godfather, ii]","[Drama, Crime]","[continue, saga, corleone, crime, family, ,, y..."
3,"[schindler, list]","[Drama, History, War]","[true, story, businessman, oskar, schindler, s..."
4,"[12, angry, man]",[Drama],"[defense, prosecution, rest, jury, filing, jur..."
