# Import libraries

In [1]:
import numpy as np
import pandas as pd
import json
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import scipy
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import plotly.express as px
import pickle
import warnings
warnings.filterwarnings("ignore")

In [2]:
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Load data

In [3]:
!pip install -U -q PyDrive

In [4]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
 
 
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [95]:
link1 = "https://drive.google.com/file/d/1X-e8LQCi_98tMI-nLPSg0O7KF8GW3vnA/view?usp=sharing"
 
id1 = link1.split("/")[-2]
 
downloaded1 = drive.CreateFile({'id':id1})
downloaded1.GetContentFile('text_preprocessed_data.csv') 
 
text_preprocessed_data = pd.read_csv('text_preprocessed_data.csv')

link2 = "https://drive.google.com/file/d/13jCgCxLVimOpUvL7J0g2AytY-8_mFoOk/view?usp=sharing"
 
id2 = link2.split("/")[-2]
 
downloaded2 = drive.CreateFile({'id':id2})
downloaded2.GetContentFile('credits.csv') 
 
credits = pd.read_csv('credits.csv')

# Get director's name from credits dataset and replace it with crew_data in text_preprocessed_data

In [96]:
text_preprocessed_data = text_preprocessed_data.drop("Unnamed: 0", axis=1)

In [98]:
movies_indices = text_preprocessed_data["id"].values

In [99]:
credits = credits[credits["id"].isin(movies_indices)].reset_index(drop=True)

In [100]:
def get_director_name(text) :
  text = text.split("'job': 'Director', 'name': '")[1].split("',")[0].replace(" ", "")
  text = text.lower()
  return text

In [101]:
credits["director"] = ""

In [102]:
for index, row in credits.iterrows() :
    try :
        text = row.crew
        director = get_director_name(text)
        credits["director"].iloc[index] = director
    except IndexError:
        credits["director"].iloc[index] = ""

In [103]:
credits[credits["id"]==10195]

Unnamed: 0,cast,crew,id,director
6081,"[{'cast_id': 1, 'character': 'Thor Odinson', '...","[{'credit_id': '52fe433f9251416c7500923d', 'de...",10195,kennethbranagh


In [104]:
credits = credits.drop_duplicates().reset_index(drop=True)

In [105]:
data = text_preprocessed_data.merge(credits, on="id")

In [106]:
data = data.drop(["crew_data", "cast", "crew"], axis=1)

In [107]:
for index, row in data.iterrows() :
  text = row["cast_data"].split(" ")[:3]
  text = " ".join(text)
  data["cast_data"].iloc[index] = text

In [108]:
data1 = data.copy()

In [109]:
data1["full_data"] = ""

In [110]:
data1["full_data"] = data["cast_data"] + " " + data["director"]+ " " + data["genres_data"] + " " + data["keywords_data"]

In [111]:
data1 = data1.drop(["cast_data", "director", "genres_data", "keywords_data"], axis=1)

In [112]:
data1.head(2)

Unnamed: 0,id,title,imdb_link,release_year,full_data
0,862,Toy Story,https://www.imdb.com/title/tt0114709/,1995,tomhanks timallen donrickles johnlasseter anim...
1,8844,Jumanji,https://www.imdb.com/title/tt0113497/,1995,robinwilliams jonathanhyde kirstendunst joejoh...


In [113]:
data1["full_data"][1]

'robinwilliams jonathanhyde kirstendunst joejohnston adventure fantasy family board game disappearance based on children s book new home recluse giant insect'

In [114]:
data1 = data1.drop_duplicates().reset_index(drop=True)

# Vectorization

In [140]:
data2 = data1.copy()
data2.head(1)

Unnamed: 0,id,title,imdb_link,release_year,full_data
0,862,Toy Story,https://www.imdb.com/title/tt0114709/,1995,tomhanks timallen donrickles johnlasseter anim...


In [141]:
data2.isna().sum()

id                 0
title              0
imdb_link          0
release_year       0
full_data       2291
dtype: int64

In [142]:
data2 = data2.fillna("")

In [143]:
# data2 = data2.dropna().reset_index(drop=True)

In [144]:
data2.head(1)

Unnamed: 0,id,title,imdb_link,release_year,full_data
0,862,Toy Story,https://www.imdb.com/title/tt0114709/,1995,tomhanks timallen donrickles johnlasseter anim...


In [145]:
cv = CountVectorizer()
cv.fit(data2["full_data"])
X = cv.transform(data2["full_data"])

In [146]:
X

<12750x24836 sparse matrix of type '<class 'numpy.int64'>'
	with 161206 stored elements in Compressed Sparse Row format>

## Converting sparse matrix to sparse dataframe to convert int64 into int8

In [147]:
df = pd.DataFrame.sparse.from_spmatrix(X)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12750 entries, 0 to 12749
Columns: 24836 entries, 0 to 24835
dtypes: Sparse[int64, 0](24836)
memory usage: 1.8 MB


In [148]:
## finding the max value in dataframe
max_vals = df.max()
max(max_vals)

9

In [149]:
## since the max value in dataframe is 7 hence we can change data type from int64 to int8
df = df.astype("int8")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12750 entries, 0 to 12749
Columns: 24836 entries, 0 to 24835
dtypes: Sparse[int8, 0](24836)
memory usage: 787.3 KB


## Converting sparse dataframe to sparse matrix

In [150]:
sdf = df.astype(pd.SparseDtype("int8", 0))
sdf_matrix = scipy.sparse.csr_matrix(sdf.values)
sdf_matrix

<12750x24836 sparse matrix of type '<class 'numpy.int8'>'
	with 161206 stored elements in Compressed Sparse Row format>

# Cosine Similarity

In [152]:
data3 = data2.copy()
data3["title"] = data3["title"].apply(lambda x: x.lower())

In [154]:
cosine_sim = cosine_similarity(sdf_matrix)

In [155]:
indices = pd.Series(data3['title'])
indices[:5]

0                      toy story
1                        jumanji
2               grumpier old men
3              waiting to exhale
4    father of the bride part ii
Name: title, dtype: object

In [162]:
indices[indices=="thor"].index[0]

6079

# Recommendations

In [134]:
# def recommend(title, cosine_sim = cosine_sim):
#     recommended_movies = []
#     idx = indices[indices == title].index[0]
#     score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
#     top_10_indices = list(score_series.iloc[1:11].index)
    
#     for i in top_10_indices:
#         recommended_movies.append(list(data2['title'])[i])
        
#     return recommended_movies

In [171]:
data3.head()

Unnamed: 0,id,title,imdb_link,release_year,full_data
0,862,toy story,https://www.imdb.com/title/tt0114709/,1995,tomhanks timallen donrickles johnlasseter anim...
1,8844,jumanji,https://www.imdb.com/title/tt0113497/,1995,robinwilliams jonathanhyde kirstendunst joejoh...
2,15602,grumpier old men,https://www.imdb.com/title/tt0113228/,1995,waltermatthau jacklemmon ann-margret howarddeu...
3,31357,waiting to exhale,https://www.imdb.com/title/tt0114885/,1995,whitneyhouston angelabassett lorettadevine for...
4,11862,father of the bride part ii,https://www.imdb.com/title/tt0113041/,1995,stevemartin dianekeaton martinshort charlesshy...


In [195]:
def recommend(title):
    title = title.lower()
    recommended_movies_id = []
    idx = indices[indices == title].index[0]
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    top_10_indices = list(score_series.iloc[1:11].index)
    
    for i in top_10_indices:
        recommended_movies_id.append(list(data2['id'])[i])
    
    df = data2[data2["id"].isin(recommended_movies_id)]
    df = df[["title", "release_year", "imdb_link"]].reset_index(drop=True)
    df = df.set_index(pd.Index([1,2,3,4,5,6,7,8,9,10]))
    df.columns = ["Title", "Release Year", "IMDB Link"]
    
    return df 

In [196]:
recommend("titanic")

Unnamed: 0,Title,Release Year,IMDB Link
1,Angels and Insects,1995,https://www.imdb.com/title/tt0112365/
2,My Best Friend's Wedding,1997,https://www.imdb.com/title/tt0119738/
3,The Object of My Affection,1998,https://www.imdb.com/title/tt0120772/
4,Onegin,1999,https://www.imdb.com/title/tt0119079/
5,Down to You,2000,https://www.imdb.com/title/tt0186975/
6,Lilies,1996,https://www.imdb.com/title/tt0116882/
7,Persuasion,2007,https://www.imdb.com/title/tt0844330/
8,The Deep Blue Sea,2011,https://www.imdb.com/title/tt1700844/
9,Titanic,1996,https://www.imdb.com/title/tt0115392/
10,Just a Question of Love,2000,https://www.imdb.com/title/tt0231844/


In [198]:
recommend("Thor")

Unnamed: 0,Title,Release Year,IMDB Link
1,Iron Man 2,2010,https://www.imdb.com/title/tt1228705/
2,The Avengers,2012,https://www.imdb.com/title/tt0848228/
3,Thor: The Dark World,2013,https://www.imdb.com/title/tt1981115/
4,Captain America: The Winter Soldier,2014,https://www.imdb.com/title/tt1843866/
5,Avengers: Age of Ultron,2015,https://www.imdb.com/title/tt2395427/
6,Ant-Man,2015,https://www.imdb.com/title/tt0478970/
7,Thor: Ragnarok,2017,https://www.imdb.com/title/tt3501632/
8,Doctor Strange,2016,https://www.imdb.com/title/tt1211837/
9,Marvel One-Shot: Item 47,2012,https://www.imdb.com/title/tt2247732/
10,Team Thor,2016,https://www.imdb.com/title/tt6016776/


# Import vectorizer and cosine similarity

In [201]:
data2.to_csv("final_data.csv")

In [200]:
pickle.dump(cv, open("vectorizer.pkl", "wb"))
pickle.dump(cosine_sim, open("cosine_similarity.pkl", "wb"), protocol=4)
pickle.dump(recommend, open("recommender.pkl", "wb"))