Problem Statement:❓
Using different techniques of Machine Learning, we need to build a Recommender System that recommends movies based on “ Cast, Genre, Reviews, TMDB/IMDB ratings”

Using different types of recommendation techniques like:
1. Popularity based recommender system
2. Content based Recommender System
3. Collaborative Recommender System


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing dependencies 

In [2]:
import pandas as pd 
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

# Ignore a specific warning by category
warnings.filterwarnings("ignore")

In [3]:
credits=pd.read_csv(r"/content/drive/MyDrive/Colab Notebooks/credits.csv")
movies=pd.read_csv(r"/content/drive/MyDrive/Colab Notebooks/movies_metadata.csv",low_memory=False)
keywords=pd.read_csv(r"/content/drive/MyDrive/Colab Notebooks/keywords.csv")
links=pd.read_csv(r'/content/drive/MyDrive/Colab Notebooks/links.csv')
links_small=pd.read_csv(r'/content/drive/MyDrive/Colab Notebooks/links_small.csv')
ratings=pd.read_csv(r'/content/drive/MyDrive/Colab Notebooks/ratings_small.csv')

In [4]:
print('Shape of Credits:',credits.shape)
print('Shape of Movies:',movies.shape)


Shape of Credits: (45476, 3)
Shape of Movies: (45466, 24)


In [5]:
credits.head()


Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [6]:
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    45476 non-null  object
 1   crew    45476 non-null  object
 2   id      45476 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [7]:
credits.describe()

Unnamed: 0,id
count,45476.0
mean,108345.997537
std,112443.796536
min,2.0
25%,26443.25
50%,60002.5
75%,157302.0
max,469172.0


In [8]:
movies.head().T

Unnamed: 0,0,1,2,3,4
adult,False,False,False,False,False
belongs_to_collection,"{'id': 10194, 'name': 'Toy Story Collection', ...",,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",,"{'id': 96871, 'name': 'Father of the Bride Col..."
budget,30000000,65000000,0,16000000,0
genres,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...","[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","[{'id': 35, 'name': 'Comedy'}]"
homepage,http://toystory.disney.com/toy-story,,,,
id,862,8844,15602,31357,11862
imdb_id,tt0114709,tt0113497,tt0113228,tt0114885,tt0113041
original_language,en,en,en,en,en
original_title,Toy Story,Jumanji,Grumpier Old Men,Waiting to Exhale,Father of the Bride Part II
overview,"Led by Woody, Andy's toys live happily in his ...",When siblings Judy and Peter discover an encha...,A family wedding reignites the ancient feud be...,"Cheated on, mistreated and stepped on, the wom...",Just when George Banks has recovered from his ...


In [9]:
movies.describe()

Unnamed: 0,revenue,runtime,vote_average,vote_count
count,45460.0,45203.0,45460.0,45460.0
mean,11209350.0,94.128199,5.618207,109.897338
std,64332250.0,38.40781,1.924216,491.310374
min,0.0,0.0,0.0,0.0
25%,0.0,85.0,5.0,3.0
50%,0.0,95.0,6.0,10.0
75%,0.0,107.0,6.8,34.0
max,2787965000.0,1256.0,10.0,14075.0


In [10]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [11]:
#changing datatype of id column
movies = movies[movies['id'].str.isnumeric()]  # Keep only rows with numeric 'id' values
movies['id'] = movies['id'].astype(int)  # Convert 'id' column to integer

In [12]:
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [13]:
keywords.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46419 entries, 0 to 46418
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        46419 non-null  int64 
 1   keywords  46419 non-null  object
dtypes: int64(1), object(1)
memory usage: 725.4+ KB


In [14]:
print(movies.columns)
print(keywords.columns)
print(credits.columns)

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')
Index(['id', 'keywords'], dtype='object')
Index(['cast', 'crew', 'id'], dtype='object')


In [15]:
# Merge the two dataframes based on the 'id' column
df = pd.merge(credits, movies,  on='id')
df.head()


Unnamed: 0,cast,crew,id,adult,belongs_to_collection,budget,genres,homepage,imdb_id,original_language,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,tt0114709,en,...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,tt0113497,en,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,tt0113228,en,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,tt0114885,en,...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,tt0113041,en,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [16]:
new_df = pd.merge(df, keywords, on='id')

 

In [17]:
new_df.columns

Index(['cast', 'crew', 'id', 'adult', 'belongs_to_collection', 'budget',
       'genres', 'homepage', 'imdb_id', 'original_language', 'original_title',
       'overview', 'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'keywords'],
      dtype='object')

In [18]:
new_df.shape

(46628, 27)

In [19]:
#creating new feature year
# Convert the 'release_date' to datetime if it's not already in datetime format
new_df['release_date'] = pd.to_datetime(new_df['release_date'])

# Extract the year from the date
new_df['release_year'] = new_df['release_date'].dt.year

In [20]:
#create a new feature weighted rating
# Calculate the mean and total vote count for all movies
mean_vote = new_df['vote_average'].mean()
total_votes = new_df['vote_count'].sum()

# Define the minimum vote count threshold for considering a movie
min_vote_count = 1000

# Compute the weighted rating for each movie
new_df['weighted_rating'] = (new_df['vote_count'] / (new_df['vote_count'] + min_vote_count)) * new_df['vote_average'] + (min_vote_count / (new_df['vote_count'] + min_vote_count)) * mean_vote

In [21]:
# Create new column rating based on the weighted rating
new_df['raiting'] = np.where(new_df['weighted_rating'] >= 8, 'High', np.where(new_df['weighted_rating'] >= 6, 'Average', 'Low'))

In [22]:
new_df.columns

Index(['cast', 'crew', 'id', 'adult', 'belongs_to_collection', 'budget',
       'genres', 'homepage', 'imdb_id', 'original_language', 'original_title',
       'overview', 'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'keywords', 'release_year',
       'weighted_rating', 'raiting'],
      dtype='object')

# Feature selection

In [23]:
#selected features
movie=new_df[['id','genres','title','keywords','cast','overview','crew','adult','budget','original_language','popularity','revenue','runtime','status','video','weighted_rating','raiting','tagline','release_year']]

In [24]:
movie.isnull().sum()

id                       0
genres                   0
title                    4
keywords                 0
cast                     0
overview               995
crew                     0
adult                    0
budget                   0
original_language       11
popularity               4
revenue                  4
runtime                268
status                  86
video                    4
weighted_rating          4
raiting                  0
tagline              25845
release_year            88
dtype: int64

In [25]:
movie.duplicated().sum()

1172

In [26]:
movie = movie.drop_duplicates()

In [27]:
movie.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45456 entries, 0 to 46627
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 45456 non-null  int64  
 1   genres             45456 non-null  object 
 2   title              45453 non-null  object 
 3   keywords           45456 non-null  object 
 4   cast               45456 non-null  object 
 5   overview           44502 non-null  object 
 6   crew               45456 non-null  object 
 7   adult              45456 non-null  object 
 8   budget             45456 non-null  object 
 9   original_language  45445 non-null  object 
 10  popularity         45453 non-null  object 
 11  revenue            45453 non-null  float64
 12  runtime            45196 non-null  float64
 13  status             45372 non-null  object 
 14  video              45453 non-null  object 
 15  weighted_rating    45453 non-null  float64
 16  raiting            454

In [28]:
movie['overview'].mode()[0]

'No overview found.'

In [29]:
#deal with null value in over view column 
most_common_overview = movie['overview'].mode()[0]
movie['overview'].fillna(most_common_overview, inplace=True)


In [30]:
# Replace NaN values in 'original_language' column with 'Unknown'
movie['original_language'].fillna('Unknown', inplace=True)

In [31]:
mean_runtime = movies['runtime'].mean()
# Fill NaN values in 'runtime' column with mean value
movie['runtime'].fillna(mean_runtime, inplace=True)

In [32]:
movie['status'].unique()

array(['Released', nan, 'Rumored', 'Post Production', 'In Production',
       'Planned', 'Canceled'], dtype=object)

In [33]:
# Replace NaN values in 'original_language' column with 'Unknown'
movie['status'].fillna('Unknown', inplace=True)

In [34]:
movie.dropna(subset=['title','release_year'], inplace=True)

In [35]:
movie.dropna(subset=['release_year'], inplace=True)

In [36]:
movie.shape

(45369, 19)

In [37]:
movie.isnull().sum()

id                       0
genres                   0
title                    0
keywords                 0
cast                     0
overview                 0
crew                     0
adult                    0
budget                   0
original_language        0
popularity               0
revenue                  0
runtime                  0
status                   0
video                    0
weighted_rating          0
raiting                  0
tagline              24973
release_year             0
dtype: int64

In [38]:
movie['popularity'].unique()

array(['21.946943', '17.015539', '11.7129', ..., '0.903007', '0.003503',
       '0.163015'], dtype=object)

In [39]:
movie['popularity'] = pd.to_numeric(movie['popularity'], errors='coerce')

# Calculate the mean of 'popularity' column


In [40]:
movie['popularity'] = movie['popularity'].round(2)

In [41]:
import ast

In [42]:
def convert(text):
  l=[]
  for i in ast.literal_eval(text):
    l.append(i['name'])
  return l

In [43]:
movie["genres"]=movie["genres"].apply(convert)

In [44]:
movie["keywords"]=movie["keywords"].apply(convert)

In [45]:
movie['cast'][0]

"[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender': 2, 'id': 31, 'name': 'Tom Hanks', 'order': 0, 'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'}, {'cast_id': 15, 'character': 'Buzz Lightyear (voice)', 'credit_id': '52fe4284c3a36847f8024f99', 'gender': 2, 'id': 12898, 'name': 'Tim Allen', 'order': 1, 'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'}, {'cast_id': 16, 'character': 'Mr. Potato Head (voice)', 'credit_id': '52fe4284c3a36847f8024f9d', 'gender': 2, 'id': 7167, 'name': 'Don Rickles', 'order': 2, 'profile_path': '/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg'}, {'cast_id': 17, 'character': 'Slinky Dog (voice)', 'credit_id': '52fe4284c3a36847f8024fa1', 'gender': 2, 'id': 12899, 'name': 'Jim Varney', 'order': 3, 'profile_path': '/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg'}, {'cast_id': 18, 'character': 'Rex (voice)', 'credit_id': '52fe4284c3a36847f8024fa5', 'gender': 2, 'id': 12900, 'name': 'Wallace Shawn', 'order': 4, 'profile_path': '/oGE6JqPP2xH4t

In [46]:
def convert_cast(obj):
  c=[]
  counter=0
  for i in ast.literal_eval(obj):
    if counter < 3:
      c.append(i['name'])
    counter+=1
  return c
    
  


In [47]:
movie["cast"]=movie["cast"].apply(convert_cast)

In [48]:
movie

Unnamed: 0,id,genres,title,keywords,cast,overview,crew,adult,budget,original_language,popularity,revenue,runtime,status,video,weighted_rating,raiting,tagline,release_year
0,862,"[Animation, Comedy, Family]",Toy Story,"[jealousy, toy, boy, friendship, friends, riva...","[Tom Hanks, Tim Allen, Don Rickles]","Led by Woody, Andy's toys live happily in his ...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",False,30000000,en,21.95,373554033.0,81.0,Released,False,7.374470,Average,,1995.0
1,8844,"[Adventure, Fantasy, Family]",Jumanji,"[board game, disappearance, based on children'...","[Robin Williams, Jonathan Hyde, Kirsten Dunst]",When siblings Judy and Peter discover an encha...,"[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",False,65000000,en,17.02,262797249.0,104.0,Released,False,6.522540,Average,Roll the dice and unleash the excitement!,1995.0
2,15602,"[Romance, Comedy]",Grumpier Old Men,"[fishing, best friend, duringcreditsstinger, o...","[Walter Matthau, Jack Lemmon, Ann-Margret]",A family wedding reignites the ancient feud be...,"[{'credit_id': '52fe466a9251416c75077a89', 'de...",False,0,en,11.71,0.0,101.0,Released,False,5.686564,Low,Still Yelling. Still Fighting. Still Ready for...,1995.0
3,31357,"[Comedy, Drama, Romance]",Waiting to Exhale,"[based on novel, interracial relationship, sin...","[Whitney Houston, Angela Bassett, Loretta Devine]","Cheated on, mistreated and stepped on, the wom...","[{'credit_id': '52fe44779251416c91011acb', 'de...",False,16000000,en,3.86,81452156.0,127.0,Released,False,5.627783,Low,Friends are the people who let you be yourself...,1995.0
4,11862,[Comedy],Father of the Bride Part II,"[baby, midlife crisis, confidence, aging, daug...","[Steve Martin, Diane Keaton, Martin Short]",Just when George Banks has recovered from his ...,"[{'credit_id': '52fe44959251416c75039ed7', 'de...",False,0,en,8.39,76578911.0,106.0,Released,False,5.624747,Low,Just When His World Is Back To Normal... He's ...,1995.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46622,30840,"[Drama, Action, Romance]",Robin Hood,[],"[Patrick Bergin, Uma Thurman, David Morrissey]","Yet another version of the classic epic, with ...","[{'credit_id': '52fe44439251416c9100a899', 'de...",False,0,en,5.68,0.0,104.0,Released,False,5.613965,Low,,1991.0
46624,111109,[Drama],Century of Birthing,"[artist, play, pinoy]","[Angel Aquino, Perry Dizon, Hazel Orencio]",An artist struggles to finish his work while a...,"[{'credit_id': '52fe4af1c3a36847f81e9b15', 'de...",False,0,tl,0.18,0.0,360.0,Released,False,5.621862,Low,,2011.0
46625,67758,"[Action, Drama, Thriller]",Betrayal,[],"[Erika Eleniak, Adam Baldwin, Julie du Page]","When one of her hits goes wrong, a professiona...","[{'credit_id': '52fe4776c3a368484e0c8387', 'de...",False,0,en,0.90,0.0,90.0,Released,False,5.600922,Low,A deadly game of wits.,2003.0
46626,227506,[],Satan Triumphant,[],"[Iwan Mosschuchin, Nathalie Lissenko, Pavel Pa...","In a small town live two brothers, one a minis...","[{'credit_id': '533bccebc3a36844cf0011a7', 'de...",False,0,en,0.00,0.0,87.0,Released,False,5.611728,Low,,1917.0


In [49]:
movie["crew"][0]

'[{\'credit_id\': \'52fe4284c3a36847f8024f49\', \'department\': \'Directing\', \'gender\': 2, \'id\': 7879, \'job\': \'Director\', \'name\': \'John Lasseter\', \'profile_path\': \'/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f4f\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12891, \'job\': \'Screenplay\', \'name\': \'Joss Whedon\', \'profile_path\': \'/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f55\', \'department\': \'Writing\', \'gender\': 2, \'id\': 7, \'job\': \'Screenplay\', \'name\': \'Andrew Stanton\', \'profile_path\': \'/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f5b\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12892, \'job\': \'Screenplay\', \'name\': \'Joel Cohen\', \'profile_path\': \'/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f61\', \'department\': \'Writing\', \'gender\': 0, \'id\': 12893, \'job\': \'Screenplay\', \'name\': \'A

In [50]:
def fetch_director(text):
  d=[]
  for i in ast.literal_eval(text):
    if i["job"]=='Director':
      d.append(i["name"])
    return d


In [51]:
movie["crew"]=movie["crew"].apply(fetch_director)

In [52]:
movie

Unnamed: 0,id,genres,title,keywords,cast,overview,crew,adult,budget,original_language,popularity,revenue,runtime,status,video,weighted_rating,raiting,tagline,release_year
0,862,"[Animation, Comedy, Family]",Toy Story,"[jealousy, toy, boy, friendship, friends, riva...","[Tom Hanks, Tim Allen, Don Rickles]","Led by Woody, Andy's toys live happily in his ...",[John Lasseter],False,30000000,en,21.95,373554033.0,81.0,Released,False,7.374470,Average,,1995.0
1,8844,"[Adventure, Fantasy, Family]",Jumanji,"[board game, disappearance, based on children'...","[Robin Williams, Jonathan Hyde, Kirsten Dunst]",When siblings Judy and Peter discover an encha...,[],False,65000000,en,17.02,262797249.0,104.0,Released,False,6.522540,Average,Roll the dice and unleash the excitement!,1995.0
2,15602,"[Romance, Comedy]",Grumpier Old Men,"[fishing, best friend, duringcreditsstinger, o...","[Walter Matthau, Jack Lemmon, Ann-Margret]",A family wedding reignites the ancient feud be...,[Howard Deutch],False,0,en,11.71,0.0,101.0,Released,False,5.686564,Low,Still Yelling. Still Fighting. Still Ready for...,1995.0
3,31357,"[Comedy, Drama, Romance]",Waiting to Exhale,"[based on novel, interracial relationship, sin...","[Whitney Houston, Angela Bassett, Loretta Devine]","Cheated on, mistreated and stepped on, the wom...",[Forest Whitaker],False,16000000,en,3.86,81452156.0,127.0,Released,False,5.627783,Low,Friends are the people who let you be yourself...,1995.0
4,11862,[Comedy],Father of the Bride Part II,"[baby, midlife crisis, confidence, aging, daug...","[Steve Martin, Diane Keaton, Martin Short]",Just when George Banks has recovered from his ...,[],False,0,en,8.39,76578911.0,106.0,Released,False,5.624747,Low,Just When His World Is Back To Normal... He's ...,1995.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46622,30840,"[Drama, Action, Romance]",Robin Hood,[],"[Patrick Bergin, Uma Thurman, David Morrissey]","Yet another version of the classic epic, with ...",[John Irvin],False,0,en,5.68,0.0,104.0,Released,False,5.613965,Low,,1991.0
46624,111109,[Drama],Century of Birthing,"[artist, play, pinoy]","[Angel Aquino, Perry Dizon, Hazel Orencio]",An artist struggles to finish his work while a...,[Lav Diaz],False,0,tl,0.18,0.0,360.0,Released,False,5.621862,Low,,2011.0
46625,67758,"[Action, Drama, Thriller]",Betrayal,[],"[Erika Eleniak, Adam Baldwin, Julie du Page]","When one of her hits goes wrong, a professiona...",[Mark L. Lester],False,0,en,0.90,0.0,90.0,Released,False,5.600922,Low,A deadly game of wits.,2003.0
46626,227506,[],Satan Triumphant,[],"[Iwan Mosschuchin, Nathalie Lissenko, Pavel Pa...","In a small town live two brothers, one a minis...",[Yakov Protazanov],False,0,en,0.00,0.0,87.0,Released,False,5.611728,Low,,1917.0


In [53]:
def replace_space(l):
  if l is None:
    return []
  l1=[]
  for i in l:
    l1.append(i.replace(" ",""))
  return l1

In [54]:
movie["genres"]=movie["genres"].apply(replace_space)
movie["keywords"]=movie["keywords"].apply(replace_space)
movie["crew"]=movie["crew"].apply(replace_space)
movie["cast"]=movie["cast"].apply(replace_space)

In [55]:
movie["overview"]=movie["overview"].apply(lambda x:str(x).split())

# EDA

# univariate Data Analysis

In [56]:
# plt.hist(movie['id'], bins=100)
# plt.title('Histogram of id column')
# plt.xlabel('id')
# plt.ylabel('Number of movies')
# plt.show()

In [57]:
# plt.hist(movie['revenue'], bins=100)
# plt.title('Histogram of movie revenues')
# plt.xlabel('Revenue')
# plt.ylabel('Number of movies')
# plt.show()

In [58]:
# plt.hist(movie['runtime'], bins=100)
# plt.title('Histogram of movie runtimes')
# plt.xlabel('Runtime')
# plt.ylabel('Number of movies')
# plt.show()

In [59]:
import plotly as px

In [60]:
movie.columns

Index(['id', 'genres', 'title', 'keywords', 'cast', 'overview', 'crew',
       'adult', 'budget', 'original_language', 'popularity', 'revenue',
       'runtime', 'status', 'video', 'weighted_rating', 'raiting', 'tagline',
       'release_year'],
      dtype='object')

In [61]:
# plt.figure(figsize=(8, 6))
# sns.lineplot(data=movie, x='release_year', y='popularity')
# plt.title('Popularity Over the Years')
# plt.xlabel('Release Year')
# plt.ylabel('Popularity')
# plt.show()

In [62]:
# plt.figure(figsize=(8, 6))
# sns.catplot(data=movie, x='budget', y='revenue')
# plt.title('Budget vs. Revenue')
# plt.xlabel('Budget')
# plt.ylabel('Revenue')
# plt.show()


In [63]:
# plt.figure(figsize=(8, 6))
# sns.histplot(data=movie, x='runtime', bins=10)
# plt.title('Distribution of Movie Runtime')
# plt.xlabel('Runtime')
# plt.ylabel('Count')
# plt.show()


In [64]:
# plt.figure(figsize=(10, 6))
# sns.countplot(data=movie, y='original_language')
# plt.title('Distribution of Original Languages')
# plt.xlabel('Original Language')
# plt.ylabel('Count')

# plt.xticks(rotation=45)
# plt.show()


In [65]:
# plt.figure(figsize=(12, 6))
# sns.lineplot(data=movie, x='release_year', y='weighted_rating')
# plt.title('Movie Rating Trends over the Years')
# plt.xlabel('Release Year')
# plt.ylabel('Weighted Rating')
# plt.show()


In [66]:
# import plotly.graph_objects as go

# status_counts = movie['status'].value_counts()

# labels = status_counts.index
# values = status_counts.values

# fig = go.Figure(data=[go.Pie(labels=labels, values=values)])

# fig.update_layout(
#     title='Movie Status Distribution',
    
# )

# fig.show()


# Bivariate Data Analysis

# Multivariate Data Analysis

In [67]:
#prudvi

In [68]:
movie['movie_tags']=movie['overview'] + movie['genres']+ movie['keywords']+movie['cast'] + movie['crew']

In [69]:
movie['movie_tags']=movie['movie_tags'].apply( lambda x: " ".join(x))

In [70]:
df_movie=movie[['id','title','movie_tags']]

In [71]:
df_movie['movie_tags'][0]

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences. Animation Comedy Family jealousy toy boy friendship friends rivalry boynextdoor newtoy toycomestolife TomHanks TimAllen DonRickles JohnLasseter"

In [72]:
df_movie.head()

Unnamed: 0,id,title,movie_tags
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...


In [73]:
import nltk
nltk.download('stopwords')
import nltk
nltk.download('wordnet')
import nltk
nltk.download('omw-1.4')
import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [74]:
lemmatizer = WordNetLemmatizer()

In [75]:
def preprocess(raw_text, flag):
    # Removing special characters and digits
    sentence = re.sub("[^a-zA-Z]", " ", raw_text)
    
    # change sentence to lower case
    sentence = sentence.lower()

    # tokenize into words
    tokens = sentence.split()
    
    # remove stop words                
    clean_tokens = [t for t in tokens if not t in stopwords.words("english")]
    
    # Stemming/Lemmatization
    if(flag == 'stem'):
        clean_tokens = [stemmer.stem(word) for word in clean_tokens]
    else:
        clean_tokens = [lemmatizer.lemmatize(word) for word in clean_tokens]
    
    return pd.Series([" ".join(clean_tokens), len(clean_tokens)])

In [76]:
from tqdm import tqdm, tqdm_notebook
tqdm.pandas()

In [79]:
temp_df1 = df_movie['movie_tags'].progress_apply(lambda x: preprocess(x, 'lemma'))

temp_df1

100%|██████████| 45369/45369 [06:20<00:00, 119.11it/s]


Unnamed: 0,0,1
0,led woody andy toy live happily room andy birt...,49
1,sibling judy peter discover enchanted board ga...,50
2,family wedding reignites ancient feud next doo...,46
3,cheated mistreated stepped woman holding breat...,39
4,george bank recovered daughter wedding receive...,35
...,...,...
46622,yet another version classic epic enough variat...,35
46624,artist struggle finish work storyline cult pla...,16
46625,one hit go wrong professional assassin end sui...,22
46626,small town live two brother one minister one h...,66


In [80]:
temp_df1.columns = ['movie_tag', 'text_length_lem']

temp_df1.head()

Unnamed: 0,movie_tag,text_length_lem
0,led woody andy toy live happily room andy birt...,49
1,sibling judy peter discover enchanted board ga...,50
2,family wedding reignites ancient feud next doo...,46
3,cheated mistreated stepped woman holding breat...,39
4,george bank recovered daughter wedding receive...,35


In [81]:
df_movie = pd.concat([df_movie, temp_df1], axis=1)

df_movie.head()

Unnamed: 0,id,title,movie_tags,movie_tag,text_length_lem
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",led woody andy toy live happily room andy birt...,49
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,sibling judy peter discover enchanted board ga...,50
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,family wedding reignites ancient feud next doo...,46
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",cheated mistreated stepped woman holding breat...,39
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,george bank recovered daughter wedding receive...,35


In [82]:
df_movie['movie_tag'][0]

'led woody andy toy live happily room andy birthday brings buzz lightyear onto scene afraid losing place andy heart woody plot buzz circumstance separate buzz woody owner duo eventually learns put aside difference animation comedy family jealousy toy boy friendship friend rivalry boynextdoor newtoy toycomestolife tomhanks timallen donrickles johnlasseter'

In [83]:
df_movie.shape

(45369, 5)

In [84]:
df_movie['title']=df_movie['title'].str.lower()

In [85]:
df_movie.head()

Unnamed: 0,id,title,movie_tags,movie_tag,text_length_lem
0,862,toy story,"Led by Woody, Andy's toys live happily in his ...",led woody andy toy live happily room andy birt...,49
1,8844,jumanji,When siblings Judy and Peter discover an encha...,sibling judy peter discover enchanted board ga...,50
2,15602,grumpier old men,A family wedding reignites the ancient feud be...,family wedding reignites ancient feud next doo...,46
3,31357,waiting to exhale,"Cheated on, mistreated and stepped on, the wom...",cheated mistreated stepped woman holding breat...,39
4,11862,father of the bride part ii,Just when George Banks has recovered from his ...,george bank recovered daughter wedding receive...,35


In [86]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [87]:
vectorizer = TfidfVectorizer()
tfidf_vector = vectorizer.fit_transform(df_movie['movie_tag'])


In [89]:
tfidf_vector.shape

(45369, 137619)

In [90]:
tfidf_array = tfidf_vector.toarray()

In [96]:
from sklearn.neighbors import NearestNeighbors

In [97]:
k = 5  # Number of neighbors to consider
nn_model = NearestNeighbors(n_neighbors=k, metric='cosine')
nn_model.fit(tfidf_vector)

In [101]:
def recommend_movies(movie_title, top_n=5):
    # Find the index of the given movie title in the dataset
    movie_index = df_movie[df_movie['title'] == movie_title].index[0]

    # Get the feature vector for the given movie
    movie_vector = tfidf_vector[movie_index]

    # Find the k nearest neighbors
    distances, indices = nn_model.kneighbors(movie_vector)

    # Get the top N recommendations (excluding the given movie itself)
    top_recommendations = []
    for index in indices[0]:
        if df_movie.iloc[index]['title'] != movie_title:
            top_recommendations.append(index)
        if len(top_recommendations) == top_n:
            break

    return top_recommendations

In [105]:
recommended_movie_indices = recommend_movies('toy story', top_n=5)
recommended_movies = df_movie.loc[recommended_movie_indices]['title'].tolist()
print(recommended_movies)

['four lions', 'bicycle thieves', 'crimes of the future', "the devil's rejects"]


In [106]:
recommended_movie_indices = recommend_movies('jumanji', top_n=5)
recommended_movies = df_movie.loc[recommended_movie_indices]['title'].tolist()
print(recommended_movies)

['portrait in black', 'seymour: an introduction', 'a family affair', 'huge']
