## Import libraries

In [1]:
import numpy as np
import pandas as pd
import json
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import warnings
warnings.filterwarnings("ignore")

## Load data

In [None]:
!pip install -U -q PyDrive

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
 
 
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
link = "https://drive.google.com/file/d/1a_vGanNqjLy97Ii5leYyHXFB_0Ovzety/view?usp=sharing"
 
id = link.split("/")[-2]
 
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('text_preprocessed_data.csv') 
 
data = pd.read_csv('text_preprocessed_data.csv')

## Read data

In [None]:
data.head(3)

Unnamed: 0.1,Unnamed: 0,title,id,language,imdb_id,adult,genres_data,keywords_data,cast_data,crew_data
0,0,Toy Story,862,en,tt0114709,False,animation comedy family,jealousy toy boy friendship friends rivalry bo...,tomhanks timallen donrickles,johnlasseter josswhedon andrewstanton
1,1,Jumanji,8844,en,tt0113497,False,adventure fantasy family,board game disappearance based on children s b...,robinwilliams jonathanhyde kirstendunst,larryj.franco jonathanhensleigh jameshorner
2,2,Grumpier Old Men,15602,en,tt0113228,False,romance comedy,fishing best friend duringcreditsstinger old men,waltermatthau jacklemmon ann-margret,howarddeutch markstevenjohnson markstevenjohnson


In [None]:
data1 = data.drop(["Unnamed: 0", "language", "adult"], axis=1)
data1.head(1)

Unnamed: 0,title,id,imdb_id,genres_data,keywords_data,cast_data,crew_data
0,Toy Story,862,tt0114709,animation comedy family,jealousy toy boy friendship friends rivalry bo...,tomhanks timallen donrickles,johnlasseter josswhedon andrewstanton


In [None]:
data1["text_data"] = data1["genres_data"] + data1["keywords_data"] + data1["cast_data"] + data1["crew_data"]

In [None]:
data1.isna().sum()

title                0
id                   0
imdb_id              0
genres_data       2518
keywords_data    14884
cast_data         5674
crew_data        12893
text_data        22524
dtype: int64

In [None]:
data1 = data1.drop(["genres_data", "keywords_data", "cast_data", "crew_data"], axis=1)

In [None]:
data1.head(1)

Unnamed: 0,title,id,imdb_id,text_data
0,Toy Story,862,tt0114709,animation comedy familyjealousy toy boy friend...


In [None]:
imdb_link = "https://www.imdb.com/"
data1["imdb_link"] = ""

In [None]:
for index, row in data1.iterrows() :
  data1["imdb_link"].iloc[index] = imdb_link + "title/" + str(data1["imdb_id"][index]) + "/"

In [None]:
data1.head()

Unnamed: 0,title,id,imdb_id,text_data,imdb_link
0,Toy Story,862,tt0114709,animation comedy familyjealousy toy boy friend...,https://www.imdb.com/title/tt0114709/
1,Jumanji,8844,tt0113497,adventure fantasy familyboard game disappearan...,https://www.imdb.com/title/tt0113497/
2,Grumpier Old Men,15602,tt0113228,romance comedyfishing best friend duringcredit...,https://www.imdb.com/title/tt0113228/
3,Waiting to Exhale,31357,tt0114885,comedy drama romancebased on novel interracial...,https://www.imdb.com/title/tt0114885/
4,Father of the Bride Part II,11862,tt0113041,comedybaby mlife crisis confence aging daughte...,https://www.imdb.com/title/tt0113041/


In [None]:
shape_before_remov_duplicates = data1.shape[0]
data1 = data1.drop_duplicates().reset_index(drop=True)
shape_after_remov_duplicates = data1.shape[0]
print(f"Duplicates dropped : {shape_before_remov_duplicates-shape_after_remov_duplicates}")

Duplicates dropped : 1190


In [None]:
final_data = data1.copy()

In [None]:
final_data.title = final_data.title.str.lower()

In [None]:
final_data.to_csv("Final data.csv")

## Vectorization

In [3]:
final_data = pd.read_csv("Final data.csv")

In [4]:
final_data.isna().sum()

Unnamed: 0        0
title             0
id                0
imdb_id           0
text_data     21800
imdb_link         0
dtype: int64

In [5]:
final_data = final_data.fillna("")

In [6]:
cv = CountVectorizer()
cv.fit(final_data["text_data"])
X = cv.transform(final_data["text_data"])

In [7]:
X

<45406x100553 sparse matrix of type '<class 'numpy.int64'>'
	with 317643 stored elements in Compressed Sparse Row format>

In [None]:
X_sparse = csr_matrix(X)
# print(X_sparse)
tsvd = TruncatedSVD(n_components=500)
X_sparse_tsvd = tsvd.fit(X_sparse).transform(X_sparse)

In [None]:
X_sparse_tsvd.shape

(45406, 500)

## Cosine similarity

In [None]:
cosine_sim = cosine_similarity(X_sparse_tsvd)

In [None]:
indices = pd.Series(final_data['title'])
indices[:5]

0                      toy story
1                        jumanji
2               grumpier old men
3              waiting to exhale
4    father of the bride part ii
Name: title, dtype: object

In [None]:
indices[indices=="toy story"].index[0]

0

## Recommendations

In [None]:
def recommend(title, cosine_sim = cosine_sim):
    recommended_movies = []
    idx = indices[indices == title].index[0]
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    top_10_indices = list(score_series.iloc[1:11].index)
    
    for i in top_10_indices:
        recommended_movies.append(list(final_data['title'])[i])
        
    return recommended_movies

In [None]:
recommend("thor", cosine_sim)

['thor: the dark world',
 'doctor strange',
 'iron man 2',
 'thor: ragnarok',
 'captain america: the winter soldier',
 'x-men',
 'ant-man',
 'iron man 3',
 'x2',
 'captain america: civil war']

## Import vectorizer and cosine similarity as pickle files

In [None]:
pickle.dump(cv, open("vectorizer.pkl", "wb"))

In [None]:
pickle.dump(cosine_sim, open("cosine_similarity.pkl", "wb"), protocol=4)