# Linking drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# imports

In [2]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import os

# Loading the dataset

In [3]:
ratings = pd.read_csv('/content/drive/MyDrive/recommendation system/movie recommendation/ml-latest/ratings.csv')
movies = pd.read_csv('/content/drive/MyDrive/recommendation system/movie recommendation/ml-latest/movies.csv')
tags = pd.read_csv('/content/drive/MyDrive/recommendation system/movie recommendation/ml-latest/tags.csv')
genome_tags = pd.read_csv('/content/drive/MyDrive/recommendation system/movie recommendation/ml-latest/genome-tags.csv')
genome_scores = pd.read_csv('/content/drive/MyDrive/recommendation system/movie recommendation/ml-latest/genome-scores.csv')
links = pd.read_csv('/content/drive/MyDrive/recommendation system/movie recommendation/ml-latest/links.csv')

In [4]:
# ratings_other = pd.read_csv('/content/drive/MyDrive/recommendation system/movie recommendation/archive/ratings.csv')
movies_other = pd.read_csv('/content/drive/MyDrive/recommendation system/movie recommendation/archive/movies_metadata.csv')
keywords_other = pd.read_csv('/content/drive/MyDrive/recommendation system/movie recommendation/archive/keywords.csv')
credits_other = pd.read_csv('/content/drive/MyDrive/recommendation system/movie recommendation/archive/credits.csv')
# ratings_small_other = pd.read_csv('/content/drive/MyDrive/recommendation system/movie recommendation/archive/ratings_small.csv')

  movies_other = pd.read_csv('/content/drive/MyDrive/recommendation system/movie recommendation/archive/movies_metadata.csv')


# EDA

# For 1st dataset

## ratings

In [5]:
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119


In [6]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33832162 entries, 0 to 33832161
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 1.0 GB


### Dropping irrelevant columns

In [7]:
ratings.drop(columns = ["timestamp"], inplace = True)

### Checking for null values in ratings

In [8]:
for i in ratings.columns:
  print(i, ratings[i].isnull().any())

userId False
movieId False
rating False


### checking for duplicate values

In [9]:
ratings[ratings.duplicated(subset = ['userId', "movieId"])].count()

Unnamed: 0,0
userId,0
movieId,0
rating,0


### Printing the unique users and unique movies

In [10]:
print(ratings["userId"].nunique())
print(ratings["movieId"].nunique())

330975
83239


### Removing users who have rated less than 50 movies

In [11]:
df = ratings.groupby("userId")["movieId"].count()

df = df[df >= 50]
ratings = ratings[ratings["userId"].isin(df.index)]
ratings.reset_index(drop = True, inplace = True)

In [12]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,110,4.0
2,1,158,4.0
3,1,260,4.5
4,1,356,5.0


### Droping movies which have been rated by less than 50 users

In [13]:
df = ratings.groupby("movieId")["userId"].count()
df = df[df >= 50]
ratings = ratings[ratings["movieId"].isin(df.index)]
ratings.reset_index(drop = True, inplace = True)

### Printing the unique users and unique movies

In [14]:
print(ratings["userId"].nunique())
print(ratings["movieId"].nunique())

130561
15899


## movies

In [15]:
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Removing date from movie title

In [16]:
def remove_date(input):
  for i in input:
    if i == "(":
      return input[:input.index(i)].strip()
  return input

movies["title"] = movies["title"].apply(remove_date).reset_index(drop = True)
movies["title"]

Unnamed: 0,title
0,Toy Story
1,Jumanji
2,Grumpier Old Men
3,Waiting to Exhale
4,Father of the Bride Part II
...,...
86532,State of Siege: Temple Attack
86533,Ouija Japan
86534,The Men Who Made the Movies: Howard Hawks
86535,Skinford: Death Sentence


In [17]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86537 entries, 0 to 86536
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  86537 non-null  int64 
 1   title    86537 non-null  object
 2   genres   86537 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.0+ MB


## making the title order from shawshank redemption, the to the shawshank redemption

In [18]:
def change_title(input):
  for i in input:
    if i == ",":
      first = input[:input.index(i)].strip()
      second = input[input.index(i)+2:].strip()
      return second + " " + first
  return input

movies["title"] = movies["title"].apply(change_title).reset_index(drop = True)
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji,Adventure|Children|Fantasy
2,3,Grumpier Old Men,Comedy|Romance
3,4,Waiting to Exhale,Comedy|Drama|Romance
4,5,Father of the Bride Part II,Comedy
...,...,...,...
86532,288967,State of Siege: Temple Attack,Action|Drama
86533,288971,Ouija Japan,Action|Horror
86534,288975,The Men Who Made the Movies: Howard Hawks,Documentary
86535,288977,Skinford: Death Sentence,Crime|Thriller


### splitting genres to make a list

In [19]:
def split_genre(input):
  return input.split("|")

movies['genres'] = movies['genres'].apply(split_genre)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji,"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men,"[Comedy, Romance]"
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II,[Comedy]


### Removing null values

In [20]:
movies.dropna(subset = ["movieId", "title"], inplace = True)

In [21]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86537 entries, 0 to 86536
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  86537 non-null  int64 
 1   title    86537 non-null  object
 2   genres   86537 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.0+ MB


### Removing duplicate values

In [22]:
movies.drop_duplicates(subset=["title"],inplace = True)

In [23]:
print(movies['title'].nunique())
print(movies['movieId'].nunique())

78818
78818


In [24]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 78818 entries, 0 to 86536
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  78818 non-null  int64 
 1   title    78818 non-null  object
 2   genres   78818 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.4+ MB


## Tags

In [25]:
tags.head(5)

Unnamed: 0,userId,movieId,tag,timestamp
0,10,260,good vs evil,1430666558
1,10,260,Harrison Ford,1430666505
2,10,260,sci-fi,1430666538
3,14,1221,Al Pacino,1311600756
4,14,1221,mafia,1311600746


In [26]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2328315 entries, 0 to 2328314
Data columns (total 4 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   userId     int64 
 1   movieId    int64 
 2   tag        object
 3   timestamp  int64 
dtypes: int64(3), object(1)
memory usage: 71.1+ MB


## genome_tags and genome_scores

In [27]:
genome_tags.head(5)

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


In [28]:
genome_tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1128 entries, 0 to 1127
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tagId   1128 non-null   int64 
 1   tag     1128 non-null   object
dtypes: int64(1), object(1)
memory usage: 17.8+ KB


In [29]:
genome_scores.head(5)

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.032
1,1,2,0.02225
2,1,3,0.07
3,1,4,0.059
4,1,5,0.123


In [30]:
genome_scores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18472128 entries, 0 to 18472127
Data columns (total 3 columns):
 #   Column     Dtype  
---  ------     -----  
 0   movieId    int64  
 1   tagId      int64  
 2   relevance  float64
dtypes: float64(1), int64(2)
memory usage: 422.8 MB


### Merging genome tags and genome scores

In [31]:
genome = pd.merge(genome_scores, genome_tags, on = "tagId")
genome

Unnamed: 0,movieId,tagId,relevance,tag
0,1,1,0.03200,007
1,1,2,0.02225,007 (series)
2,1,3,0.07000,18th century
3,1,4,0.05900,1920s
4,1,5,0.12300,1930s
...,...,...,...,...
18472123,288167,1124,0.09875,writing
18472124,288167,1125,0.02950,wuxia
18472125,288167,1126,0.02275,wwii
18472126,288167,1127,0.11225,zombie


### Keeping only those tags which have a relevance score of > 0.05

In [32]:
genome = genome[genome["relevance"] > 0.05]
genome

Unnamed: 0,movieId,tagId,relevance,tag
2,1,3,0.07000,18th century
3,1,4,0.05900,1920s
4,1,5,0.12300,1930s
5,1,6,0.13100,1950s
6,1,7,0.06175,1960s
...,...,...,...,...
18472117,288167,1118,0.08100,workplace
18472118,288167,1119,0.08075,world politics
18472122,288167,1123,0.11450,writers
18472123,288167,1124,0.09875,writing


### Dropping relevance and making the tags and thier names as a list

In [33]:
tag_list = genome.groupby("movieId")["tag"].apply(list).sort_values().reset_index()
tag_list

Unnamed: 0,movieId,tag
0,72178,"[007, 007 (series), 18th century, 1920s, 1930s..."
1,219003,"[007, 007 (series), 18th century, 1920s, 1930s..."
2,207670,"[007, 007 (series), 18th century, 1920s, 1930s..."
3,125545,"[007, 007 (series), 18th century, 1920s, 1930s..."
4,38046,"[007, 007 (series), 18th century, 1920s, 1930s..."
...,...,...
16371,106624,"[absurd, action, adaptation, adapted from:book..."
16372,33021,"[absurd, action packed, adaptation, adapted fr..."
16373,141950,"[absurd, adaptation, adapted from:book, adapte..."
16374,45100,"[absurd, adaptation, adapted from:book, adapte..."


# For second dataset

## Keywords

In [34]:
keywords_other.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [35]:
keywords_other.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46419 entries, 0 to 46418
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        46419 non-null  int64 
 1   keywords  46419 non-null  object
dtypes: int64(1), object(1)
memory usage: 725.4+ KB


### Rename id to movieID and convert to integer

In [36]:
keywords_other.rename(columns = {"id": "movieId"}, inplace = True)

In [37]:
keywords_other["movieId"] = keywords_other["movieId"].astype("Int64")

### Dropping null and duplicated values

In [38]:
keywords_other.dropna(subset = ["movieId"], inplace = True)

In [39]:
keywords_other.drop_duplicates(subset = ["movieId"], inplace = True)

In [40]:
keywords_other.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45432 entries, 0 to 46418
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movieId   45432 non-null  Int64 
 1   keywords  45432 non-null  object
dtypes: Int64(1), object(1)
memory usage: 1.1+ MB


### Converting keywords into a list

In [41]:
def convert_keywords(input):
  input = eval(input)
  keywords = []
  for i in input:
    keywords.append(i['name'])
  return keywords

keywords_other["keywords"]= keywords_other['keywords'].apply(convert_keywords)
keywords_other.head()

Unnamed: 0,movieId,keywords
0,862,"[jealousy, toy, boy, friendship, friends, riva..."
1,8844,"[board game, disappearance, based on children'..."
2,15602,"[fishing, best friend, duringcreditsstinger, o..."
3,31357,"[based on novel, interracial relationship, sin..."
4,11862,"[baby, midlife crisis, confidence, aging, daug..."


## Credits

In [42]:
credits_other.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [43]:
credits_other.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   cast    45476 non-null  object
 1   crew    45476 non-null  object
 2   id      45476 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


### Rename id to movieId and convert it to int64

In [44]:
credits_other.rename(columns = {"id" : "movieId"}, inplace = True)

In [45]:
credits_other["movieId"] = credits_other["movieId"].astype("int64")

### extracting top 5 actors in the cast

In [46]:
def get_cast(input):
  input = eval(input)
  cast = []
  for i in input:
    cast.append(i['name'])
  return cast[:5]

credits_other["cast"] =  credits_other['cast'].apply(get_cast)
credits_other.head()

Unnamed: 0,cast,crew,movieId
0,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[Whitney Houston, Angela Bassett, Loretta Devi...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


### Extracting director from the crew

In [47]:
def get_director(input):
  input = eval(input)
  for i in input:
    if(i['job'] == 'Director'):
      return i['name']
  return np.nan

credits_other["director"] =  credits_other['crew'].apply(get_director)
credits_other.head()

Unnamed: 0,cast,crew,movieId,director
0,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862,John Lasseter
1,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844,Joe Johnston
2,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602,Howard Deutch
3,"[Whitney Houston, Angela Bassett, Loretta Devi...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357,Forest Whitaker
4,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862,Charles Shyer


In [48]:
credits_other.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   cast      45476 non-null  object
 1   crew      45476 non-null  object
 2   movieId   45476 non-null  int64 
 3   director  44589 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.4+ MB


### Dropping irrelevant columns

In [49]:
credits_other.drop(columns = ["crew"], inplace = True)
credits_other.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45476 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   cast      45476 non-null  object
 1   movieId   45476 non-null  int64 
 2   director  44589 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


### Dropping null and duplicated values

In [50]:
credits_other.drop_duplicates(subset = ["movieId"], inplace = True)

In [51]:
credits_other.dropna(subset = ["movieId"], inplace = True)

In [52]:
credits_other.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45432 entries, 0 to 45475
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   cast      45432 non-null  object
 1   movieId   45432 non-null  int64 
 2   director  44545 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


## Movies

In [53]:
movies_other.head(5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [54]:
movies_other.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

### Drop irrelevant columns

In [55]:
movies_other.drop(columns = ["adult", "popularity", "revenue","vote_average", "vote_count","belongs_to_collection", "budget", "homepage", "imdb_id", "original_language", "original_title", "production_companies", "production_countries", "release_date",  "runtime", "spoken_languages", "status", "video"], inplace = True)

In [56]:
movies_other.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   genres       45466 non-null  object
 1   id           45466 non-null  object
 2   overview     44512 non-null  object
 3   poster_path  45080 non-null  object
 4   tagline      20412 non-null  object
 5   title        45460 non-null  object
dtypes: object(6)
memory usage: 2.1+ MB


### Reformating the columns to be of correct types

In [57]:
movies_other['id'] = pd.to_numeric(movies_other['id'], errors='coerce').astype('Int64')
movies_other["poster_path"] = movies_other["poster_path"].astype("string")
movies_other["tagline"] = movies_other["tagline"].astype("string")
movies_other["title"] = movies_other["title"].astype("string")

In [58]:
movies_other.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   genres       45466 non-null  object
 1   id           45463 non-null  Int64 
 2   overview     44512 non-null  object
 3   poster_path  45080 non-null  string
 4   tagline      20412 non-null  string
 5   title        45460 non-null  string
dtypes: Int64(1), object(2), string(3)
memory usage: 2.1+ MB


### Combining overview and tagline as description and dropping overview and tagline

In [59]:
movies_other["overview"] = movies_other["overview"].fillna("")
movies_other["tagline"] = movies_other["tagline"].fillna("")

In [60]:
movies_other["description"] = movies_other["overview"] + " " + movies_other["tagline"]

In [61]:
movies_other.drop(columns = ["overview", "tagline"], inplace = True)

### splitting genres to make a list

In [62]:
def split_genre(input):
  input = eval(input)
  genre = []
  for i in input:
    genre.append(i['name'])
  return genre

movies_other['genres'] = movies_other['genres'].apply(split_genre)
movies_other.head()

Unnamed: 0,genres,id,poster_path,title,description
0,"[Animation, Comedy, Family]",862,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,"[Adventure, Fantasy, Family]",8844,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,Jumanji,When siblings Judy and Peter discover an encha...
2,"[Romance, Comedy]",15602,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,"[Comedy, Drama, Romance]",31357,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,[Comedy],11862,/e64sOI48hQXyru7naBFyssKFxVd.jpg,Father of the Bride Part II,Just when George Banks has recovered from his ...


### Renaming id to movieId

In [63]:
movies_other.rename(columns = {"id": "movieId"}, inplace = True)

## Combining movies, credits, keywords

In [64]:
movies_credits_keywords_other = pd.merge(movies_other, keywords_other, on = "movieId")
movies_credits_keywords_other = pd.merge(movies_credits_keywords_other, credits_other, on = "movieId")
movies_credits_keywords_other.head()

Unnamed: 0,genres,movieId,poster_path,title,description,keywords,cast,director
0,"[Animation, Comedy, Family]",862,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[jealousy, toy, boy, friendship, friends, riva...","[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",John Lasseter
1,"[Adventure, Fantasy, Family]",8844,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,Jumanji,When siblings Judy and Peter discover an encha...,"[board game, disappearance, based on children'...","[Robin Williams, Jonathan Hyde, Kirsten Dunst,...",Joe Johnston
2,"[Romance, Comedy]",15602,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[fishing, best friend, duringcreditsstinger, o...","[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...",Howard Deutch
3,"[Comedy, Drama, Romance]",31357,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[based on novel, interracial relationship, sin...","[Whitney Houston, Angela Bassett, Loretta Devi...",Forest Whitaker
4,[Comedy],11862,/e64sOI48hQXyru7naBFyssKFxVd.jpg,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[baby, midlife crisis, confidence, aging, daug...","[Steve Martin, Diane Keaton, Martin Short, Kim...",Charles Shyer


In [65]:
movies_credits_keywords_other.drop(columns = ["movieId"], inplace = True)

# Combining the first and second dataset

In [66]:
combined_dataset = pd.merge(movies, movies_credits_keywords_other, on = "title")
combined_dataset

Unnamed: 0,movieId,title,genres_x,genres_y,poster_path,description,keywords,cast,director
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]","[Animation, Comedy, Family]",/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"Led by Woody, Andy's toys live happily in his ...","[jealousy, toy, boy, friendship, friends, riva...","[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",John Lasseter
1,2,Jumanji,"[Adventure, Children, Fantasy]","[Adventure, Fantasy, Family]",/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,When siblings Judy and Peter discover an encha...,"[board game, disappearance, based on children'...","[Robin Williams, Jonathan Hyde, Kirsten Dunst,...",Joe Johnston
2,3,Grumpier Old Men,"[Comedy, Romance]","[Romance, Comedy]",/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,A family wedding reignites the ancient feud be...,"[fishing, best friend, duringcreditsstinger, o...","[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...",Howard Deutch
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]","[Comedy, Drama, Romance]",/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,"Cheated on, mistreated and stepped on, the wom...","[based on novel, interracial relationship, sin...","[Whitney Houston, Angela Bassett, Loretta Devi...",Forest Whitaker
4,5,Father of the Bride Part II,[Comedy],[Comedy],/e64sOI48hQXyru7naBFyssKFxVd.jpg,Just when George Banks has recovered from his ...,"[baby, midlife crisis, confidence, aging, daug...","[Steve Martin, Diane Keaton, Martin Short, Kim...",Charles Shyer
...,...,...,...,...,...,...,...,...,...
39119,283405,A Bunch of Amateurs,[Documentary],[Comedy],/uj3Mz9XGvC17mJKuUatsh4NWUML.jpg,"Burt Reynolds stars as Jefferson Steel, a wash...",[],"[Burt Reynolds, Derek Jacobi, Samantha Bond, I...",Andy Cadiff
39120,284081,Leave,"[Horror, Thriller]","[Thriller, Mystery]",/3EPAv34wESOSGdMBMc0xHW37Seu.jpg,Henry Harper is a successful novelist who has ...,[],"[Ron Livingston, Vinessa Shaw, Rick Gomez, Fra...",Robert Celestino
39121,285921,Timescape,"[Adventure, Children, Sci-Fi]","[Mystery, Science Fiction]",/qtx6jCryMocj7wWCnjOZHOTFG4r.jpg,Before they can complete renovations on their ...,[],"[Jeff Daniels, Ariana Richards, Emilia Crow, J...",David Twohy
39122,285939,¡Que Viva México!,[Comedy],"[Documentary, History]",/yzx5OL3AvzxmwCuiXtHZiQCkvLs.jpg,¡Qué viva México! was a film project undertake...,[film history],"[Sergey Bondarchuk, Grigori Aleksandrov]",Sergei M. Eisenstein


In [67]:
combined_dataset = combined_dataset[combined_dataset["movieId"].isin(ratings["movieId"].unique())]
combined_dataset

Unnamed: 0,movieId,title,genres_x,genres_y,poster_path,description,keywords,cast,director
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]","[Animation, Comedy, Family]",/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"Led by Woody, Andy's toys live happily in his ...","[jealousy, toy, boy, friendship, friends, riva...","[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",John Lasseter
1,2,Jumanji,"[Adventure, Children, Fantasy]","[Adventure, Fantasy, Family]",/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,When siblings Judy and Peter discover an encha...,"[board game, disappearance, based on children'...","[Robin Williams, Jonathan Hyde, Kirsten Dunst,...",Joe Johnston
2,3,Grumpier Old Men,"[Comedy, Romance]","[Romance, Comedy]",/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,A family wedding reignites the ancient feud be...,"[fishing, best friend, duringcreditsstinger, o...","[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...",Howard Deutch
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]","[Comedy, Drama, Romance]",/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,"Cheated on, mistreated and stepped on, the wom...","[based on novel, interracial relationship, sin...","[Whitney Houston, Angela Bassett, Loretta Devi...",Forest Whitaker
4,5,Father of the Bride Part II,[Comedy],[Comedy],/e64sOI48hQXyru7naBFyssKFxVd.jpg,Just when George Banks has recovered from his ...,"[baby, midlife crisis, confidence, aging, daug...","[Steve Martin, Diane Keaton, Martin Short, Kim...",Charles Shyer
...,...,...,...,...,...,...,...,...,...
39059,232131,The Marksman,"[Action, Thriller]","[Action, Adventure, Thriller]",/vNWU0AFgXn8riAdDB569VPQdMJJ.jpg,Chechen rebels take over a Russian nuclear pla...,[helicopter],"[Wesley Snipes, Emma Samms, William Hope, Anth...",Marcus Adams
39063,241346,Doctor Who: The Time of the Doctor,"[Drama, Sci-Fi]","[Drama, Science Fiction]",/pEFGnBK6Kd4sqbkz7nMo3s4oyi2.jpg,"Orbiting a quiet backwater planet, the massed ...",[],"[Matt Smith, Jenna Coleman, Peter Capaldi, Kar...",James Payne
39085,257281,The Tragedy of Macbeth,"[Drama, Fantasy, Thriller, War]","[War, Drama, History]",/zghViMv9tpKJEfLIxDevI2x7bKd.jpg,A ruthlessly ambitious Scottish lord siezes th...,"[scotland, shakespeare, ambush, showdown, quee...","[Jon Finch, Francesca Annis, Martin Shaw, Tere...",Roman Polanski
39098,270946,Deep Water,"[Drama, Romance, Thriller]","[Documentary, Adventure, Drama, Action]",/oSOMJMDOef9AuviQCzc3S1iTAji.jpg,DEEP WATER is the stunning true story of the f...,"[boat, woman director]","[Tilda Swinton, Simon Russell Beale, Jean Badi...",Louise Osmond


In [68]:
combined_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14016 entries, 0 to 39099
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   movieId      14016 non-null  int64 
 1   title        14016 non-null  object
 2   genres_x     14016 non-null  object
 3   genres_y     14016 non-null  object
 4   poster_path  14007 non-null  string
 5   description  14016 non-null  string
 6   keywords     14016 non-null  object
 7   cast         14016 non-null  object
 8   director     13975 non-null  object
dtypes: int64(1), object(6), string(2)
memory usage: 1.1+ MB


In [69]:
combined_dataset["movieId"].nunique()

12157

In [70]:
combined_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14016 entries, 0 to 39099
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   movieId      14016 non-null  int64 
 1   title        14016 non-null  object
 2   genres_x     14016 non-null  object
 3   genres_y     14016 non-null  object
 4   poster_path  14007 non-null  string
 5   description  14016 non-null  string
 6   keywords     14016 non-null  object
 7   cast         14016 non-null  object
 8   director     13975 non-null  object
dtypes: int64(1), object(6), string(2)
memory usage: 1.1+ MB


## total unique movies

In [71]:
combined_dataset["movieId"].nunique()

12157

## combining the genres from both the datasets

In [72]:
def combine_genres(genrea, genreb):
  genrea.extend(genreb)
  return list(set(genrea))

combined_dataset["genres"] = combined_dataset.apply(lambda x: combine_genres(x["genres_x"], x["genres_y"]), axis = 1)
combined_dataset.drop(columns = ["genres_x", "genres_y"], inplace = True)
combined_dataset

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_dataset["genres"] = combined_dataset.apply(lambda x: combine_genres(x["genres_x"], x["genres_y"]), axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_dataset.drop(columns = ["genres_x", "genres_y"], inplace = True)


Unnamed: 0,movieId,title,poster_path,description,keywords,cast,director,genres
0,1,Toy Story,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"Led by Woody, Andy's toys live happily in his ...","[jealousy, toy, boy, friendship, friends, riva...","[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",John Lasseter,"[Comedy, Fantasy, Animation, Adventure, Childr..."
1,2,Jumanji,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,When siblings Judy and Peter discover an encha...,"[board game, disappearance, based on children'...","[Robin Williams, Jonathan Hyde, Kirsten Dunst,...",Joe Johnston,"[Children, Family, Fantasy, Adventure]"
2,3,Grumpier Old Men,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,A family wedding reignites the ancient feud be...,"[fishing, best friend, duringcreditsstinger, o...","[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...",Howard Deutch,"[Romance, Comedy]"
3,4,Waiting to Exhale,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,"Cheated on, mistreated and stepped on, the wom...","[based on novel, interracial relationship, sin...","[Whitney Houston, Angela Bassett, Loretta Devi...",Forest Whitaker,"[Romance, Drama, Comedy]"
4,5,Father of the Bride Part II,/e64sOI48hQXyru7naBFyssKFxVd.jpg,Just when George Banks has recovered from his ...,"[baby, midlife crisis, confidence, aging, daug...","[Steve Martin, Diane Keaton, Martin Short, Kim...",Charles Shyer,[Comedy]
...,...,...,...,...,...,...,...,...
39059,232131,The Marksman,/vNWU0AFgXn8riAdDB569VPQdMJJ.jpg,Chechen rebels take over a Russian nuclear pla...,[helicopter],"[Wesley Snipes, Emma Samms, William Hope, Anth...",Marcus Adams,"[Adventure, Action, Thriller]"
39063,241346,Doctor Who: The Time of the Doctor,/pEFGnBK6Kd4sqbkz7nMo3s4oyi2.jpg,"Orbiting a quiet backwater planet, the massed ...",[],"[Matt Smith, Jenna Coleman, Peter Capaldi, Kar...",James Payne,"[Sci-Fi, Drama, Science Fiction]"
39085,257281,The Tragedy of Macbeth,/zghViMv9tpKJEfLIxDevI2x7bKd.jpg,A ruthlessly ambitious Scottish lord siezes th...,"[scotland, shakespeare, ambush, showdown, quee...","[Jon Finch, Francesca Annis, Martin Shaw, Tere...",Roman Polanski,"[Fantasy, War, Drama, History, Thriller]"
39098,270946,Deep Water,/oSOMJMDOef9AuviQCzc3S1iTAji.jpg,DEEP WATER is the stunning true story of the f...,"[boat, woman director]","[Tilda Swinton, Simon Russell Beale, Jean Badi...",Louise Osmond,"[Documentary, Romance, Adventure, Drama, Actio..."


# general EDA

### Merging movies with tag and ratings information

In [73]:
merged_df = pd.merge(movies, ratings, on = "movieId")
merged_df = merged_df[merged_df["movieId"].isin(combined_dataset["movieId"].unique())]
merged_df

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fanta...",1,4.0
1,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fanta...",2,5.0
2,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fanta...",7,4.0
3,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fanta...",10,3.0
4,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fanta...",12,5.0
...,...,...,...,...,...
28513853,271865,X,"[Horror, Mystery, Thriller, Action, Thriller, ...",328744,4.0
28513854,271865,X,"[Horror, Mystery, Thriller, Action, Thriller, ...",329018,3.5
28513855,271865,X,"[Horror, Mystery, Thriller, Action, Thriller, ...",329954,3.0
28513856,271865,X,"[Horror, Mystery, Thriller, Action, Thriller, ...",330235,2.5


### calculating unique movies and users

In [74]:
n_users = merged_df["userId"].nunique()
n_movies = merged_df["movieId"].nunique()
print(f"Number of unique users: {n_users}")
print(f"Number of unique movies: {n_movies}")

Number of unique users: 130560
Number of unique movies: 12157


In [75]:
users_mean_ratings = merged_df.groupby("userId")["rating"].mean()
movies_mean_ratings = merged_df.groupby("movieId")["rating"].mean()
print(f"Mean rating given by users: {users_mean_ratings.mean()}")
print(f"Mean rating of movies: {movies_mean_ratings.mean()}")

Mean rating given by users: 3.6806714390952955
Mean rating of movies: 3.2581841385414725


### Calculating baysian averaage


we calculate the average rating using **Bayesian Average**, given by:

$$
\text{avg_rating} = w_i \cdot A_i + (1 - w_i) \cdot S
$$

Where:
$$
 w_i = \frac{n_i}{n_i + n_{\text{avg}}}
$$
- \( n_i \): Total number of ratings given to movie \( i \)
- \( n_avg \): Average number of ratings given to a movie
- \( A_i \): Average rating of movie \( i \)
- \( S \): Mean average rating


In [76]:
n_i = merged_df.groupby("movieId")["rating"].count()
print("\nn_i: \n" , n_i)
n_avg = merged_df.groupby("movieId")["rating"].count().mean()
print("\nnavg: \n",n_avg)
w = n_i / (n_i + n_avg)
print("\nw: \n", w)
A = merged_df.groupby("movieId")["rating"].mean()
print("\n A: \n",A)
S = merged_df["rating"].mean()
print("\nS: \n", S)
baysian_avg = w * A + (1 - w) * S
print("\n baysian_avg: \n", baysian_avg)


n_i: 
 movieId
1         57944
2         27299
3          9711
4          2643
5          9837
          ...  
232131      109
241346       54
257281      268
270946      105
271865      482
Name: rating, Length: 12157, dtype: int64

navg: 
 2145.3176770584846

w: 
 movieId
1         0.964298
2         0.927140
3         0.819057
4         0.551968
5         0.820960
            ...   
232131    0.048352
241346    0.024553
257281    0.111050
270946    0.046660
271865    0.183457
Name: rating, Length: 12157, dtype: float64

 A: 
 movieId
1         3.885415
2         3.252170
3         3.084543
4         2.823874
5         3.010471
            ...   
232131    2.908257
241346    3.555556
257281    3.505597
270946    2.814286
271865    3.272822
Name: rating, Length: 12157, dtype: float64

S: 
 3.5324386948212556

 baysian_avg: 
 movieId
1         3.872813
2         3.272591
3         3.165587
4         3.141334
5         3.103924
            ...   
232131    3.502258
241346    3.533006
2

In [77]:
baysian_avg_df = baysian_avg.reset_index()
baysian_avg_df.columns = ['movieId', 'baysian_avg']
merged_df = merged_df.merge(baysian_avg_df, on='movieId')

In [78]:
merged_df.head()

Unnamed: 0,movieId,title,genres,userId,rating,baysian_avg
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fanta...",1,4.0,3.872813
1,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fanta...",2,5.0,3.872813
2,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fanta...",7,4.0,3.872813
3,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fanta...",10,3.0,3.872813
4,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fanta...",12,5.0,3.872813


In [79]:
merged_df.groupby("title")["baysian_avg"].mean().reset_index().sort_values("baysian_avg", ascending=False).head(10)

Unnamed: 0,title,baysian_avg
10605,The Shawshank Redemption,4.371141
9468,The Godfather,4.281378
10900,The Usual Suspects,4.2353
9469,The Godfather: Part II,4.216578
7530,Schindler's List,4.203771
3220,Fight Club,4.196518
25,12 Angry Men,4.193469
6954,Pulp Fiction,4.190418
7092,Rear Window,4.166583
6449,One Flew Over the Cuckoo's Nest,4.164495


In [80]:
merged_df.groupby("title")["baysian_avg"].mean().reset_index().sort_values("baysian_avg", ascending=True).head(10)

Unnamed: 0,title,baysian_avg
1097,Battlefield Earth,2.162683
8047,Speed 2: Cruise Control,2.341153
1077,Batman & Robin,2.360734
710,Anaconda,2.434073
8317,Super Mario Bros.,2.455074
4146,Home Alone 3,2.485684
11925,Wild Wild West,2.48659
1857,Catwoman,2.526508
7765,Showgirls,2.527825
3782,Grease 2,2.54167


#making pivot matrix

## creating the pivot table

In [None]:
pivot = merged_df.pivot_table(index='title', columns='userId', values='rating', )
pivot.fillna(0, inplace=True)
pivot

# making the scipy matrix
this is used for efficient storage of sparse data

In [None]:
from scipy.sparse import csr_matrix
matrix = csr_matrix(pivot)
matrix

# movie based collaborative filltering model:
## predict similar movies based on similar user ratings
this content based recommender calculates the cosine similarity between the pivot vector for given movie name and rest of the movies in the pivot table. it then returns the most similar vectors. this means if a movies A and B have both been rated highly by say users x, y, z, then upon prediction for movie A, movie B will be returned and vice versa.
the intuition behind this model is that, we try to find movies rated similarly by users. if movie A was liked by user 1 through 10, and there aslso exists a movie B, liked by 9 of these users, we should recommend movie B to the 10th user. he will most likely like it aas well.

## defining the knn model

In [None]:
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(n_neighbors = 11, algorithm='auto', metric="cosine")
model.fit(matrix)

## making recommendations:

In [None]:
def movie_collaborative_model(pivot, model, movie_title, num_pred):
  if movie_title not in pivot.index:
    return "Movie not found"

  movie_idx = pivot.index.get_loc(movie_title)
  movie_value_vector = pivot.iloc[movie_idx]
  dist, ind = model.kneighbours(movie_value_vector, n_neighbours = num_pred+1)
  recommendations = [pivot.index[i] for i in ind if i != movie_idx]
  return recommendations

In [None]:
title = movies['title'].loc(0)
recommendations = movie_collaborative_model(pivot, model, title, 10)
print(recommendations)

# User based collaborative filltering model
## predict similar users based on ratings
in this model we are making the predicitons based on finding similar users. the intuition is once you find 2 users who have similar tastes in movies, any movie user A likes will also be liked by user B

In [None]:
pivot_t = pivot.T
matrix_t = csr_matrix(pivot_t)

## defining the knn model

In [None]:
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(n_neighbors = 11, algorithm='auto', metric="cosine")
model.fit(matrix_t)

## making recommendations:

In [None]:
def user_collaborative_model(pivot, df, model, userId, num_pred):
  if userId not in pivot.index:
    return "User not found"

  user_idx = pivot.index.get_loc(userId)
  user_value_vector = pivot.iloc[user_idx]
  dist, ind = model.kneighbours(user_value_vector, n_neighbours = 5)
  recommendations = []
  for i in ind[0]:
    u_idx = pivot.index[i]
    user_movies = df[df["user_id"] == u_idx]
    top_movies = user_movies.sort_values(by="rating", ascending=False).head(num_pred/5)
    recommendations.extend(top_movies["title"].tolist())
  return recommendations

In [None]:
user = 23
recommendations = user_collaborative_model(pivot_t, movie_ratings_df, model, title, 10)
print(recommendations)

# Content based recommendation system
use nlp to get movies with similar summary

## Load the spacy model

In [None]:
!pip install spacy
!python -m spacy download en_core_web_md

In [None]:
import spacy
nlp = spacy.load("en_core_web_md")

In [None]:
def preprocess_text(text):
      if isinstance(text, str):
        doc = nlp(text)
        tokens = [token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha]
        processed_text = ' '.join(tokens)
        return processed_text
      return np.nan

## Summary based system
using overview and tagline of the movie to get similar predictions

### Apply preprocessing on description

In [None]:
combined_dataset["description"]

In [None]:
combined_dataset["description"] = combined_dataset["description"].apply(preprocess_text)
combined_dataset["description"]

### making a tfidf model

In [None]:
combined_dataset["description"] = combined_dataset["description"].fillna('')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer()
X = tv.fit_transform(combined_dataset["description"]).toarray()

### Using cosine similarity to predict similar movies

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
def get_similar_article(text, vectorizer, X, df, top_n=5):
  if isinstance(text, str):
    preprocessed_text = preprocess_text(text)
    X_text = vectorizer.transform([preprocessed_text])
    similarity_scores = cosine_similarity(X, X_text).flatten()
    top_indices = similarity_scores.argsort()[-top_n:][::-1]
    return df.iloc[top_indices], similarity_scores[top_indices], similarity_scores

In [None]:
similar_movies, scores, similarity_scores = get_similar_article(combined_dataset["description"][1], tv, X, combined_dataset,10)
print(scores)
similar_movies

## cast based content recommendation system
using cast, director, genres and keywords to predict similar movies

### Making a seperate column consisting of cast, directors, genre and keywords.
including director 5 times to give it more weightage

In [None]:
def get_string_from_list(list):
  return ' '.join(list)

combined_dataset["cast"] = combined_dataset["cast"].apply(get_string_from_list)
combined_dataset["genres"] = combined_dataset["genres"].apply(get_string_from_list)
combined_dataset["keywords"] = combined_dataset["keywords"].apply(get_string_from_list)

combined_dataset["cast_director_genre_keywords"] = combined_dataset["cast"] + combined_dataset["director"] * 5 + combined_dataset["genres"] + combined_dataset["keywords"]


In [None]:
combined_dataset["cast_director_genre_keywords"][0]

In [None]:
combined_dataset['cast_director_genre_keywords'].apply(preprocess_text)
combined_dataset["cast_director_genre_keywords"][0]

### Creating a count vectorizer model


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(combined_dataset["cast_director_genre_keywords"]).toarray()

In [None]:
similar_movies, scores, similarity_scores = get_similar_article(combined_dataset["cast_director_genre_keywords"][0],cv, X, combined_dataset)
print(scores)
similar_movies

## Hybrid content recommendation system

In [None]:
combined_dataset["overall_detail"] = combined_dataset["description"] + combined_dataset["cast_director_genre_keywords"]*2

### Creating a tfidf vectorizer model


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tv_full = TfidfVectorizer()
X = tv_full.fit_transform(combined_dataset["cast_director_genre_keywords"]).toarray()

In [None]:
similar_movies, scores, similarity_scores = get_similar_article(combined_dataset["overall_details"][1], tv_full, X, combined_dataset)
print(scores)
similar_movies

# Use suprise library (collaborative filtering models) to save space (do not run now)

In [None]:
!pip install surprise

In [None]:
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from surprise import KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, SVD
from surprise import accuracy
import random

## Building the test and train dataset

In [None]:
reader = Reader(rating_scale=(0, 5))
data   = Dataset.load_from_df(df[['userId','title','rating']], reader)
raw_ratings = data.raw_ratings

random.shuffle(raw_ratings)

threshold   = int(len(raw_ratings)*0.8)

train_raw_ratings = raw_ratings[:threshold]
test_raw_ratings  = raw_ratings[threshold:]

data.raw_ratings = train_raw_ratings        # data is now the trainset
trainset = data.build_full_trainset()
testset = data.construct_testset(test_raw_ratings)

## Training 4 models

In [None]:
models=[KNNBasic(),KNNWithMeans(),KNNWithZScore(),SVD()]
results = {}

for model in models:
    CV_scores = cross_validate(model, data, measures=["MAE","RMSE"], cv=5, n_jobs=-1)

    result = pd.DataFrame.from_dict(CV_scores).mean(axis=0)
    results[str(model).split("algorithms.")[1].split("object ")[0]] = result

In [None]:
performance_df = pd.DataFrame.from_dict(results)
print("Model Performance: \n")
performance_df.T.sort_values(by='test_rmse')

### grid search for hyperparameter tuning for knn with mmeans

In [None]:
param_grid = { 'sim_options' : {'min_support': [3,5],
                                'user_based': [False, True]}
             }

gridsearchKNNWithMeans = GridSearchCV(KNNWithMeans,
                                      param_grid = param_grid,
                                      measures=['mae', 'rmse'],
                                      cv=5,
                                      n_jobs=-1)

gridsearchKNNWithMeans.fit(data)

print(f'MAE Best Parameters:  {gridsearchKNNWithMeans.best_params["mae"]}')
print(f'MAE Best Score:       {gridsearchKNNWithMeans.best_score["mae"]}\n')

print(f'RMSE Best Parameters: {gridsearchKNNWithMeans.best_params["rmse"]}')
print(f'RMSE Best Score:      {gridsearchKNNWithMeans.best_score["rmse"]}\n')
best_params_knn = gridsearchKNNWithMeans.best_params["rmse"]

### hyperparameter tuning for svd model using gridsearch

In [None]:
param_grid = {"n_factors": range(10,100,20),
              "n_epochs" : [5, 10, 20],
              "lr_all"   : [0.002, 0.005],
              "reg_all"  : [0.2, 0.5]}

gridsearchSVD = GridSearchCV(SVD, param_grid, measures=['mae', 'rmse'], cv=5, n_jobs=-1)

gridsearchSVD.fit(data)

print(f'MAE Best Parameters:  {gridsearchSVD.best_params["mae"]}')
print(f'MAE Best Score:       {gridsearchSVD.best_score["mae"]}\n')

print(f'RMSE Best Parameters: {gridsearchSVD.best_params["rmse"]}')
print(f'RMSE Best Score:      {gridsearchSVD.best_score["rmse"]}\n')
best_params_svd = gridsearchSVD.best_params["rmse"]

## Getting predictions from svd algorithm

In [None]:
def generate_recommendationsSVD(userID, get_recommend =10):
    model = SVD(**best_params_svd)
    model.fit(trainset)
    testset = trainset.build_anti_testset()
    predictions = model.test(testset)
    predictions_df = pd.DataFrame(predictions)
    predictions_userID = predictions_df[predictions_df['uid'] == userID].sort_values(by="est", ascending = False).head(get_recommend)

    recommendations = []
    recommendations.append(list(predictions_userID['iid']))
    recommendations = recommendations[0]

    return(recommendations)

In [None]:
recom = generate_recommendationsSVD(11676)
recom