## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import json
import re
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pickle
import warnings
# NOTE(review): this silences every warning for the whole session, including pandas
# warnings that could point at real problems in later cells — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Load data

In [2]:
# Quietly install/upgrade PyDrive; it is imported in the next cell to build the Google Drive client.
!pip install -U -q PyDrive

In [3]:
# Colab-specific imports: PyDrive client classes, the Colab auth helper and oauth2client credentials.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
 
 
# Authenticate and create the PyDrive client.
auth.authenticate_user()  # presumably triggers the interactive Colab sign-in — confirm
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()  # hand the default credentials to PyDrive
drive = GoogleDrive(gauth)  # `drive` is the client used by the download cell

In [4]:
def _fetch_from_drive(share_link, local_name):
    """Download the Drive file behind `share_link` into `local_name`.

    Returns the file id parsed from the link together with the PyDrive file handle.
    """
    file_id = share_link.split("/")[-2]  # the id is the second-to-last path segment of the link
    handle = drive.CreateFile({'id': file_id})
    handle.GetContentFile(local_name)
    return file_id, handle


# Shareable Google Drive links of the three input CSV files.
link1 = "https://drive.google.com/file/d/1TkuZlwFxmoIgjqH8sL8tTFCrLzs8s2gz/view?usp=sharing"
link2 = "https://drive.google.com/file/d/1-t6D2qFyFgSR4W6MB8VMqjRQ0Kk-P6sZ/view?usp=sharing"
link3 = "https://drive.google.com/file/d/13jCgCxLVimOpUvL7J0g2AytY-8_mFoOk/view?usp=sharing"

id1, downloaded1 = _fetch_from_drive(link1, 'movies_metadata.csv')
id2, downloaded2 = _fetch_from_drive(link2, 'keywords.csv')
id3, downloaded3 = _fetch_from_drive(link3, 'credits.csv')

# Read the downloaded files into DataFrames.
movies, keywords, credits = (
    pd.read_csv(csv_name)
    for csv_name in ('movies_metadata.csv', 'keywords.csv', 'credits.csv')
)

In [5]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

# Merging data

In [9]:
# Preview the first three rows of the raw movies metadata.
movies.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [11]:
## columns that are not used in the rest of this notebook
_unused_columns = [
    "belongs_to_collection", "homepage", "original_title", "overview",
    "poster_path", "production_companies", "production_countries",
    "runtime", "spoken_languages", "status", "tagline", "video",
]
movies1 = movies.drop(columns=_unused_columns)

In [13]:
# Preview the frame after the unused columns were dropped.
movies1.head(3)

Unnamed: 0,adult,budget,genres,id,imdb_id,original_language,popularity,release_date,revenue,title,vote_average,vote_count
0,False,30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,tt0114709,en,21.946943,1995-10-30,373554033.0,Toy Story,7.7,5415.0
1,False,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,tt0113497,en,17.015539,1995-12-15,262797249.0,Jumanji,6.9,2413.0
2,False,0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",15602,tt0113228,en,11.7129,1995-12-22,0.0,Grumpier Old Men,6.5,92.0


In [15]:
# (rows, columns) of the trimmed movies frame.
movies1.shape

(45466, 12)

In [17]:
# Casting the id column straight to int raises an error; the row below shows why:
# its id field holds a date instead of a numeric id.
movies1.loc[movies1["id"] == "1997-08-20"]

Unnamed: 0,adult,budget,genres,id,imdb_id,original_language,popularity,release_date,revenue,title,vote_average,vote_count
19730,- Written by Ørnås,/ff9qCepilowshEtG2GYWwzt2bs4.jpg,"[{'name': 'Carousel Productions', 'id': 11176}...",1997-08-20,0,104.0,,1,,,,


In [18]:
# Drop the rows whose id field holds a date (e.g. "1997-08-20") instead of a numeric id.
# A "-" inside the value is the marker, the same criterion as the former split("-") check.
#
# The earlier implementation assigned to the `row` objects yielded by iterrows(); those are
# temporary copies, so nothing was written back to the frame and the per-row int conversion
# was a no-op. The real conversion happens with astype in a separate cell. The row loop is
# replaced by a vectorised mask, which also tolerates non-string ids.
_id_has_dash = movies1["id"].astype(str).str.contains("-", regex=False)

# index labels of the malformed rows (kept as a plain list so it can be inspected later)
id_errors = movies1.index[_id_has_dash].tolist()

movies1 = movies1.drop(id_errors).reset_index(drop=True)

In [21]:
## index labels of the rows that contained a date in their id field
id_errors

[19730, 29503, 35587]

In [22]:
# With the malformed rows gone, the id strings can be cast to integers.
movies1["id"] = movies1["id"].astype("int")

In [23]:
## join the movies with their keywords on the shared id column
merge1 = pd.merge(movies1, keywords, on="id")

In [24]:
## join the result with the cast/crew credits on the same id column
merge2 = pd.merge(merge1, credits, on="id")

# Data cleaning

In [77]:
# Clean a separate copy so that merge2 itself stays untouched.
data1 = merge2.copy(deep=True)

In [78]:
# (rows, columns) of the merged frame before cleaning.
data1.shape

(46628, 15)

## Handling missing values

In [79]:
# Number of missing values per column.
data1.isnull().sum()

adult                 0
budget                0
genres                0
id                    0
imdb_id              17
original_language    11
popularity            4
release_date         88
revenue               4
title                 4
vote_average          4
vote_count            4
keywords              0
cast                  0
crew                  0
dtype: int64

In [80]:
## discard every row that has a missing value in any column
data1 = data1.dropna(axis="index", how="any")

In [81]:
# Renumber the rows after the drop; drop=True discards the old index instead of keeping it as a column.
data1 = data1.reset_index(drop=True)

## Handling duplicates

In [82]:
# Row count before duplicate removal, kept for the comparison below.
before_rem_duplicates_shape = len(data1)
before_rem_duplicates_shape

46515

In [83]:
# Remove fully identical rows, then renumber the remaining ones.
_deduplicated = data1.drop_duplicates()
data1 = _deduplicated.reset_index(drop=True)

In [84]:
# Row count after duplicate removal.
after_rem_duplicates_shape = len(data1)
after_rem_duplicates_shape

45350

In [85]:
## number of duplicate rows removed (row count before minus row count after)
print(f"Duplicates removed : {before_rem_duplicates_shape-after_rem_duplicates_shape}")

Duplicates removed : 1165


## Change imdb id to imdb link

In [86]:
# Base URL used to build each movie's IMDb page link.
imdb_link = "https://www.imdb.com/"
# Create the column with an empty placeholder; the following cell fills it in.
data1["imdb_link"] = ""

In [87]:
# Build all links at once: "<base>title/<imdb_id>/".
# The former row loop wrote through `data1["imdb_link"].iloc[index] = ...`, which is chained
# assignment: pandas may apply it to a temporary copy (SettingWithCopyWarning, suppressed by
# the global warnings filter), and it costs one Python-level write per row. A single
# vectorised concatenation is reliable and much faster.
data1["imdb_link"] = imdb_link + "title/" + data1["imdb_id"].astype(str) + "/"

In [88]:
# The raw id is no longer needed once the link column exists.
data1 = data1.drop(columns="imdb_id")

# Data exploration

## Adult column

In [89]:
# Frequency table of the adult flag as a two-column frame.
adult_counts = data1["adult"].value_counts().reset_index()
adult_counts.columns = ["Adult", "Counts"]
adult_counts

Unnamed: 0,Adult,Counts
0,False,45342
1,True,8


In [90]:
# Pie chart of the adult / non-adult split.
fig = px.pie(
    data_frame=adult_counts,
    names="Adult",
    values="Counts",
    title="Adult movie counts",
)
fig.show()

In [91]:
## adult titles are a tiny fraction of the rows, so the column carries almost no information and is dropped
data1 = data1.drop(columns="adult")

## Language column

In [92]:
# Number of movies per original language, most frequent first.
language_counts = data1["original_language"].value_counts().reset_index()
language_counts.columns = ["language", "counts"]

fig = px.line(
    data_frame=language_counts,
    x="language",
    y="counts",
    title="Language count of movies",
)
fig.show()

In [93]:
## most movies are in English, so keep only those rows; the language column itself is dropped afterwards
_is_english = data1["original_language"] == "en"
data1 = data1[_is_english].reset_index(drop=True)

In [96]:
## every remaining row is English, so the language column is constant and can go
data1 = data1.drop(columns="original_language")

## Revenue column

In [216]:
# Work on a separate copy for the revenue exploration.
data2 = data1.copy(deep=True)

In [217]:
# (rows, columns) at the start of the revenue exploration.
data2.shape

(32193, 13)

In [218]:
# The 15 largest revenue values and the index labels of their rows.
top_15_max_revenue_movies = data2.revenue.nlargest(n=15)
top_15_max_revenue_movies_indices = top_15_max_revenue_movies.index

In [219]:
# Index labels of the 15 highest-revenue rows.
top_15_max_revenue_movies_indices

Int64Index([11771, 20385,  1451, 13974, 19257, 21979, 20388, 13716, 17097,
            30260, 30914, 16114, 23203, 20397, 13614],
           dtype='int64')

In [220]:
# nlargest() returns index *labels*, so select by label with .loc. The former .iloc lookup
# only worked because the index had just been reset to 0..n-1.
max_revenue_movies = data2.loc[top_15_max_revenue_movies_indices]

In [221]:
# Bar-style histogram of the highest-revenue movies, ordered by total revenue.
# The former plt.figure(figsize=(20,15)) call was removed: plotly does not draw on matplotlib
# figures, so it only produced an empty "<Figure ... with 0 Axes>" output.
fig = px.histogram(
    max_revenue_movies,
    x="title",
    y="revenue",
    color="revenue",
    title="Maximum revenue generated by movies",
    labels=dict(title="Movie", revenue="Revenue"),
)
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})
# fixed canvas size instead of automatic sizing
fig.update_layout(autosize=False, width=1400, height=700)
fig.show()

<Figure size 1440x1080 with 0 Axes>

In [222]:
## revenue was only needed for the chart above
data2 = data2.drop(columns="revenue")

## Budget column

In [152]:
# Work on a separate copy for the budget exploration.
data3 = data2.copy(deep=True)

In [154]:
# Check how budget is stored before ranking it numerically.
data3.budget.dtype

dtype('O')

In [155]:
## budget is stored as object; convert it to int so it can be ranked numerically
data3["budget"] = data3["budget"].astype("int")

In [163]:
# The 15 largest budgets and the index labels of their rows.
top_15_max_budget_movies = data3["budget"].nlargest(15)
top_15_max_budget_movies_indices = top_15_max_budget_movies.index
# nlargest() returns index *labels*, so select by label with .loc. The former .iloc lookup
# only worked because the index happened to be 0..n-1.
max_budget_movies = data3.loc[top_15_max_budget_movies_indices]

# The former plt.figure(figsize=(20,15)) call was removed: plotly does not draw on matplotlib
# figures, so it only produced an empty "<Figure ... with 0 Axes>" output.
fig = px.histogram(
    max_budget_movies,
    x="title",
    y="budget",
    color="budget",
    title="Maximum budget movies",
    labels=dict(title="Movie", budget="Budget"),
)
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})
fig.show()

<Figure size 1440x1080 with 0 Axes>

In [165]:
## budget was only needed for the chart above
data3 = data3.drop(columns="budget")

In [166]:
# Look at one row to confirm which columns remain.
data3.head(1)

Unnamed: 0,genres,id,popularity,release_date,title,vote_average,vote_count,keywords,cast,crew,imdb_link
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,21.946943,1995-10-30,Toy Story,7.7,5415.0,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",https://www.imdb.com/title/tt0114709/


## Popularity column

In [203]:
# Work on a separate copy for the popularity exploration.
data4 = data3.copy(deep=True)

In [204]:
# Check how popularity is stored before comparing it numerically.
data4.popularity.dtype

dtype('O')

In [205]:
# popularity is stored as object; convert it to float for numeric comparisons.
data4["popularity"] = data4["popularity"].astype("float")

In [208]:
# The 20 most popular movies and the index labels of their rows.
# NOTE(review): the variable names say "top_15" but 20 rows are selected; names and count are
# kept as they were in case other cells rely on them.
top_15_max_popularity_movies = data4["popularity"].nlargest(20)
top_15_max_popularity_movies_indices = top_15_max_popularity_movies.index
# Take the rows from data4, where popularity has been cast to float. The previous code read
# them from data3, whose popularity column is still object dtype. Select by label with .loc,
# because nlargest() returns index labels, not positions.
max_popularity_movies = data4.loc[top_15_max_popularity_movies_indices]

# The former plt.figure(figsize=(20,15)) call was removed: plotly does not draw on matplotlib
# figures, so it only produced an empty "<Figure ... with 0 Axes>" output.
fig = px.histogram(
    max_popularity_movies,
    x="title",
    y="popularity",
    color="popularity",
    title="Most popular movies",
    labels=dict(title="Movie", popularity="Popularity"),
)
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})
# fixed canvas size instead of automatic sizing
fig.update_layout(autosize=False, width=1400, height=700)
fig.show()

<Figure size 1440x1080 with 0 Axes>

In [209]:
# How many movies have a popularity score below 1?
data4.loc[data4["popularity"] < 1].shape

(14260, 11)

In [210]:
## 14260 movies have popularity less than 1 hence we'll remove those movies
## (keep popularity >= 1, the exact complement of the "< 1" count above; the former "> 1"
## test would also have discarded movies whose popularity is exactly 1)
data4 = data4[data4["popularity"] >= 1]
data4 = data4.reset_index(drop=True)

In [211]:
## popularity was only needed for the chart and the filter above
data4 = data4.drop(columns="popularity")

In [212]:
# (rows, columns) after the popularity filter.
data4.shape

(17933, 10)

## Votes column

In [214]:
# Work on a separate copy for the vote exploration.
data5 = data4.copy(deep=True)

In [215]:
# The 20 movies with the most votes and the index labels of their rows.
# NOTE(review): the variable names say "top_15" but 20 rows are selected; names and count are
# kept as they were in case other cells rely on them.
top_15_max_vote_count_movies = data5["vote_count"].nlargest(20)
top_15_max_vote_count_movies_indices = top_15_max_vote_count_movies.index
# nlargest() returns index *labels*, so select by label with .loc. The former .iloc lookup
# only worked because the index had just been reset to 0..n-1.
max_vote_count_movies = data5.loc[top_15_max_vote_count_movies_indices]

# The former plt.figure(figsize=(20,15)) call was removed: plotly does not draw on matplotlib
# figures, so it only produced an empty "<Figure ... with 0 Axes>" output.
fig = px.histogram(
    max_vote_count_movies,
    x="title",
    y="vote_count",
    color="vote_count",
    title="Maximum voted movies",
    labels=dict(title="Movie", vote_count="Vote counts"),
)
fig.update_layout(barmode='stack', xaxis={'categoryorder':'total descending'})
# fixed canvas size instead of automatic sizing
fig.update_layout(autosize=False, width=1400, height=700)
fig.show()

<Figure size 1440x1080 with 0 Axes>

In [223]:
# vote_count was only needed for the chart above.
data5 = data5.drop(columns="vote_count")

In [224]:
# Look at one row to confirm which columns remain.
data5.head(1)

Unnamed: 0,genres,id,release_date,title,vote_average,keywords,cast,crew,imdb_link
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,1995-10-30,Toy Story,7.7,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",https://www.imdb.com/title/tt0114709/


## Date column

In [225]:
## release_date is reduced to a release_year column below; start from a separate copy
data6 = data5.copy(deep=True)

In [226]:
# The year is the part of the date string before the first "-".
data6["release_year"] = [
    release_date.split("-")[0] for release_date in data6["release_date"]
]

In [229]:
## the full date is superseded by release_year
data6 = data6.drop(columns="release_date")

In [230]:
# (rows, columns) after swapping release_date for release_year.
data6.shape

(17933, 9)

In [232]:
# The year was produced by string splitting, so check how it is stored.
data6["release_year"].dtype

dtype('O')

In [233]:
## convert the year strings to integers so they can be compared numerically
data6 = data6.astype({"release_year": "int"})

In [235]:
# How many movies were released before 1990?
data6.loc[data6["release_year"] < 1990].shape

(5054, 9)

In [238]:
# Check that vote_average is numeric before filtering on it.
data6["vote_average"].dtype

dtype('float64')

In [249]:
# How many pre-1990 movies have an average vote below 8?
_released_before_1990 = data6["release_year"] < 1990
_rated_below_8 = data6["vote_average"] < 8
data6[_released_before_1990 & _rated_below_8].shape

(4972, 9)

In [250]:
# Index labels of the pre-1990 movies whose average vote is below 8.
_old_and_low_rated = (data6["release_year"] < 1990) & (data6["vote_average"] < 8)
rows_to_be_dropped = data6.loc[_old_and_low_rated].index

Int64Index([  299,   424,   428,   457,   459,   461,   463,   471,   475,
              504,
            ...
            17794, 17795, 17805, 17830, 17881, 17910, 17911, 17912, 17914,
            17916],
           dtype='int64', length=4972)

In [251]:
## remove the rows with release_year < 1990 and vote_average < 8
data6 = data6.drop(index=rows_to_be_dropped)

In [253]:
# Renumber the rows after the drop; drop=True discards the old index instead of keeping it as a column.
data6 = data6.reset_index(drop=True)

In [254]:
# (rows, columns) after removing the old, low-rated movies.
data6.shape

(12961, 9)

In [256]:
## vote_average was only needed for the filter above
data6 = data6.drop(columns="vote_average")

In [257]:
# Final check of the cleaned columns before saving.
data6.head(1)

Unnamed: 0,genres,id,title,keywords,cast,crew,imdb_link,release_year
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,Toy Story,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",https://www.imdb.com/title/tt0114709/,1995


## Save clean data for next step

In [258]:
# Write the cleaned frame to disk for the next step.
# NOTE(review): to_csv writes the index as an extra unnamed first column by default; pass
# index=False if the downstream reader does not expect that column — confirm against the consumer.
data6.to_csv("clean_data.csv")