# Steps :
- 1. Load data
- 2. Clean data
- 3. EDA
- 4. Text preprocessing
- 5. Generating word representations
- 6. Vectorization and cosine similarity
- 7. Model creation and testing
- 8. Recommendation

In [554]:
import numpy as np
import pandas as pd
import json
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import warnings
warnings.filterwarnings("ignore")

# Load data

In [555]:
movies = pd.read_csv("data/movies_metadata.csv")
keywords= pd.read_csv("data/keywords.csv")
credits = pd.read_csv("data/credits.csv")

In [556]:
pd.set_option("display.max_columns", None)

In [557]:
movies.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [558]:
keywords.head(3)

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."


In [559]:
credits.head(3)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602


# Cleaning data

### Selecting columns from movies csv file

In [560]:
data = pd.DataFrame()

In [561]:
data["title"] = movies.title
data["id"] = movies.id
data["language"] = movies.original_language
data["imdb_id"] = movies.imdb_id
data["adult"] = movies.adult
data["genres"] = movies.genres

In [562]:
data.shape

(45466, 6)

In [563]:
data.head()

Unnamed: 0,title,id,language,imdb_id,adult,genres
0,Toy Story,862,en,tt0114709,False,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,Jumanji,8844,en,tt0113497,False,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."
2,Grumpier Old Men,15602,en,tt0113228,False,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ..."
3,Waiting to Exhale,31357,en,tt0114885,False,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam..."
4,Father of the Bride Part II,11862,en,tt0113041,False,"[{'id': 35, 'name': 'Comedy'}]"


In [564]:
# data.id.astype("int") gives error because it contains noise data so we will clean it

In [565]:
data[data["id"]=="1997-08-20"]

Unnamed: 0,title,id,language,imdb_id,adult,genres
19730,,1997-08-20,104.0,0,- Written by Ørnås,"[{'name': 'Carousel Productions', 'id': 11176}..."


In [566]:
id_errors = []
for index, row in data.iterrows() :
    row["id"] = row["id"].split("-")
    if (len(row["id"])>1) :
        id_errors.append(index)

data = data.drop(id_errors)
data = data.reset_index(drop=True)

for index, row in data.iterrows() :
    row["id"] = int(row["id"][0])

In [567]:
id_errors

[19730, 29503, 35587]

In [568]:
data["id"] = data.id.astype("int")

In [569]:
data.shape

(45463, 6)

### Merging data dataframe with keywords and credits dataframe

In [570]:
keywords.shape

(46419, 2)

In [571]:
data_merged_keywords = data.merge(keywords, on="id")

In [572]:
data_merged_keywords_and_credits = data_merged_keywords.merge(credits, on="id")

### Removing na values

In [573]:
data_merged_keywords_and_credits.isna().sum()

title        4
id           0
language    11
imdb_id     17
adult        0
genres       0
keywords     0
cast         0
crew         0
dtype: int64

In [574]:
data_merged_keywords_and_credits = data_merged_keywords_and_credits.dropna().reset_index(drop=True)

In [575]:
data_merged_keywords_and_credits.isna().sum()

title       0
id          0
language    0
imdb_id     0
adult       0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [576]:
data_merged_keywords_and_credits.to_csv("merged_data.csv")

# Text preprocessing

In [537]:
data1 = data_merged_keywords_and_credits.copy()

In [538]:
data1.head(3)

Unnamed: 0,title,id,language,imdb_id,adult,genres,keywords,cast,crew
0,Toy Story,862,en,tt0114709,False,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,Jumanji,8844,en,tt0113497,False,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,Grumpier Old Men,15602,en,tt0113228,False,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."


In [539]:
data1.genres[0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

### here we have to extract the important keywords from the  stringified json text

In [540]:
a = data1.genres[0]
a = re.sub("[^a-zA-Z]", " ", a)
a = a.lower()
a = a.replace("id", "")
a = a.replace("name", "")
a = nltk.word_tokenize(a)
a

['animation', 'comedy', 'family']

In [541]:
## for keywords and genres columns
def keyword_extraction(text) :
    text = re.sub("[^a-zA-Z]", " ", text)
    text = text.lower()
    text = text.replace("id", "")
    text = text.replace("name", "")
    text = nltk.word_tokenize(text)
    text = " ".join(text)
    
    return text

In [542]:
keyword_extraction(data1["genres"][0])

'animation comedy family'

In [543]:
keyword_extraction(data1["keywords"][0])

'jealousy toy boy friendship friends rivalry boy next door new toy toy comes to life'

In [544]:
data1["genres_data"] = data1["genres"].apply(keyword_extraction)
data1["keywords_data"] = data1["keywords"].apply(keyword_extraction)

### Remove genres and keywords columns

In [545]:
data1 = data1.drop(["genres","keywords"], axis=1)

In [546]:
data1.head(3)

Unnamed: 0,title,id,language,imdb_id,adult,cast,crew,genres_data,keywords_data
0,Toy Story,862,en,tt0114709,False,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",animation comedy family,jealousy toy boy friendship friends rivalry bo...
1,Jumanji,8844,en,tt0113497,False,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",adventure fantasy family,board game disappearance based on children s b...
2,Grumpier Old Men,15602,en,tt0113228,False,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",romance comedy,fishing best friend duringcreditsstinger old men


### Extract top 3 cast name

In [548]:
def top_3_crew(text) :
    cast = []
    texts = text.split("},")[:]
    for i in range(3) :
        cast.append((texts[i].split("name': '")[1].split("'")[0].replace(" ","")).lower())
        
    cast = " ".join(cast)
    return cast

In [549]:
top_3_crew(data1["cast"][0])

'tomhanks timallen donrickles'

In [551]:
top_3_crew(data1["crew"][456])

'rosetroche rosetroche guinevereturner'

In [552]:
data1["cast_data"] = ""
data1["crew_data"] = ""
data1.head(1)

Unnamed: 0,title,id,language,imdb_id,adult,cast,crew,genres_data,keywords_data,cast_data,crew_data
0,Toy Story,862,en,tt0114709,False,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",animation comedy family,jealousy toy boy friendship friends rivalry bo...,,


In [553]:
for index, row in data1.iterrows() :
    try :
        text1 = row.cast
        casts = top_3_crew(text1)
        data1["cast_data"].iloc[index] = casts
    except IndexError:
        data1["cast_data"].iloc[index] = ""


KeyboardInterrupt



In [None]:
for index, row in data1.iterrows() :
    try :
        text2 = row.crew
        casts = top_3_crew(text2)
        data1["crew_data"].iloc[index] = casts
    except IndexError:
        data1["crew_data"].iloc[index] = ""

#### Remove cast and crew columns

In [315]:
data1 = data1.drop(["cast", "crew"], axis=1)

In [319]:
data1.head()

Unnamed: 0,title,id,language,imdb_id,adult,genres_data,keywords_data,cast_data
0,Toy Story,862,en,tt0114709,False,animation comedy family,jealousy toy boy friendship friends rivalry bo...,tomhanks timallen donrickles
1,Jumanji,8844,en,tt0113497,False,adventure fantasy family,board game disappearance based on children s b...,robinwilliams jonathanhyde kirstendunst
2,Grumpier Old Men,15602,en,tt0113228,False,romance comedy,fishing best friend duringcreditsstinger old men,waltermatthau jacklemmon ann-margret
3,Waiting to Exhale,31357,en,tt0114885,False,comedy drama romance,based on novel interracial relationship single...,whitneyhouston angelabassett lorettadevine
4,Father of the Bride Part II,11862,en,tt0113041,False,comedy,baby mlife crisis confence aging daughter moth...,stevemartin dianekeaton martinshort


## Label encoding categorical data

In [423]:
data2 = data1.copy()

In [424]:
language_counts = pd.DataFrame(data2.language.value_counts())
language_counts = language_counts.reset_index()
language_counts.columns = ["language", "counts"]
language_counts.head()

Unnamed: 0,language,counts
0,en,32921
1,fr,2523
2,it,1565
3,ja,1418
4,de,1111


In [425]:
import plotly.express as px

In [426]:
fig = px.line(language_counts, x="language", y="counts")
# fig.show()

![count_language.png](attachment:count_language.png)

In [427]:
top_5_lang = list(language_counts.language[:5])
top_5_lang

['en', 'fr', 'it', 'ja', 'de']

In [429]:
for index, row in data2.iterrows():
    if row["language"] not in top_5_lang :
        data2 = data2.replace(row["language"], "other_lang")

In [479]:
language_counts2 = pd.DataFrame(data2.language.value_counts())
language_counts2 = language_counts2.reset_index()
language_counts2.columns = ["language", "counts"]
fig = px.line(language_counts2, x="language", y="counts")
fig.show()

In [440]:
le_lang = LabelEncoder()
le_lang.fit(data2.language)
data2["le_language"] = le_lang.transform(data2["language"])

In [447]:
data2.adult.value_counts()

False    46587
True         9
Name: adult, dtype: int64

In [449]:
data2 = data2.replace({"False" : 0, "True" : 1})
data2.adult.value_counts()

0    46587
1        9
Name: adult, dtype: int64

In [451]:
data2["text_data"] = data2["genres_data"] + data2["keywords_data"] + data2["cast_data"]

In [452]:
data2.head()

Unnamed: 0,title,id,language,imdb_id,adult,genres_data,keywords_data,cast_data,le_language,text_data
0,Toy Story,862,en,tt0114709,0,animation comedy family,jealousy toy boy friendship friends rivalry bo...,tomhanks timallen donrickles,1,animation comedy familyjealousy toy boy friend...
1,Jumanji,8844,en,tt0113497,0,adventure fantasy family,board game disappearance based on children s b...,robinwilliams jonathanhyde kirstendunst,1,adventure fantasy familyboard game disappearan...
2,Grumpier Old Men,15602,en,tt0113228,0,romance comedy,fishing best friend duringcreditsstinger old men,waltermatthau jacklemmon ann-margret,1,romance comedyfishing best friend duringcredit...
3,Waiting to Exhale,31357,en,tt0114885,0,comedy drama romance,based on novel interracial relationship single...,whitneyhouston angelabassett lorettadevine,1,comedy drama romancebased on novel interracial...
4,Father of the Bride Part II,11862,en,tt0113041,0,comedy,baby mlife crisis confence aging daughter moth...,stevemartin dianekeaton martinshort,1,comedybaby mlife crisis confence aging daughte...


# Vectorization

In [456]:
data3 = data2[["title", "adult", "le_language", "text_data"]]

In [457]:
data3.head()

Unnamed: 0,title,adult,le_language,text_data
0,Toy Story,0,1,animation comedy familyjealousy toy boy friend...
1,Jumanji,0,1,adventure fantasy familyboard game disappearan...
2,Grumpier Old Men,0,1,romance comedyfishing best friend duringcredit...
3,Waiting to Exhale,0,1,comedy drama romancebased on novel interracial...
4,Father of the Bride Part II,0,1,comedybaby mlife crisis confence aging daughte...


In [459]:
data3.shape

(46596, 4)

- remove duplicates

In [462]:
data3 = data3.drop_duplicates()
data3 = data3.reset_index(drop=True)
data3.shape

(45401, 4)

In [487]:
cv = CountVectorizer()
cv.fit(data3["text_data"])
X = cv.transform(data3["text_data"])

In [488]:
X

<45401x94907 sparse matrix of type '<class 'numpy.int64'>'
	with 371657 stored elements in Compressed Sparse Row format>

In [489]:
cosine_sim = cosine_similarity(X)
print(cosine_sim)

[[1.         0.04811252 0.         ... 0.         0.         0.        ]
 [0.04811252 1.         0.         ... 0.         0.         0.        ]
 [0.         0.         1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [493]:
indices = pd.Series(data3['title'])
indices[:5]

0                      Toy Story
1                        Jumanji
2               Grumpier Old Men
3              Waiting to Exhale
4    Father of the Bride Part II
Name: title, dtype: object

In [497]:
indices[indices=="Toy Story"].index[0]

0

In [502]:
def recommend(title, cosine_sim = cosine_sim):
    recommended_movies = []
    idx = indices[indices == title].index[0]
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    top_5_indices = list(score_series.iloc[1:6].index)
    
    for i in top_5_indices:
        recommended_movies.append(list(data3['title'])[i])
        
    return recommended_movies

In [510]:
recommend("Thor", cosine_sim)

['Thor: The Dark World',
 'Thor: Ragnarok',
 'Iron Man 2',
 'Captain America: The Winter Soldier',
 'Doctor Strange']

In [513]:
pickle.dump(cv, open("vectorizer.pkl", "wb"))
pickle.dump(cosine_sim, open("cosine_similarity.pkl", "wb"))
pickle.dump(recommender, open("recommender.pkl", "wb"))

MemoryError: 

In [514]:
pickle.dump(cosine_sim, open("cosine_similarity.pkl", "wb"))

MemoryError: 