In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from sklearn.neighbors import NearestNeighbors

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv', na_values="NA")
credits = pd.read_csv('tmdb_5000_credits.csv', na_values="NA")

In [3]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [4]:
movies.shape

(4803, 20)

In [5]:
movies.isnull().sum()

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
dtype: int64

In [6]:
movies.duplicated().sum()

0

In [7]:
movies.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [8]:
credits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB


In [9]:
credits.shape

(4803, 4)

In [10]:
credits.isnull().sum()

movie_id    0
title       0
cast        0
crew        0
dtype: int64

In [11]:
credits.duplicated().sum()

0

In [12]:
credits.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [13]:
credits.columns = ['id','tittle','cast','crew']
movies=movies.merge(credits,on='id')

In [14]:
credits.head()

Unnamed: 0,id,tittle,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [15]:
#Mean vote across the whole report
C= movies['vote_average'].mean()
C

6.092171559442011

In [16]:
#m is the minimum votes required. Considering 90% percentile
m= movies['vote_count'].quantile(0.9)
m

1838.4000000000015

In [17]:
### Filter out movies that qualify for the chart

final_movies = movies.copy().loc[movies['vote_count'] >= m]
final_movies.shape

(481, 23)

# 1. Simple Recommender - IMDB Weighted Rating

IMDb originally used the following formula to calculate their weighted rating:

W =   (v/(v+m) * R) + (m/(m+v) * C)


    W is the weighted rating;
    R is the mean rating for the movie, from 1 to 10;
    v is the number of votes for the movie;
    m is the minimum votes required to be listed in the Top 250; 
    C is the mean vote across the whole report.

In [18]:
### Weighted Rating as per IMDB formula
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [19]:
final_movies['score'] = final_movies.apply(weighted_rating, axis=1)

In [20]:
### Sorting movies based on score

final_movies = final_movies.sort_values('score', ascending=False)

In [21]:
final_movies.head(20)[['title', 'vote_count', 'vote_average', 'score']]

Unnamed: 0,title,vote_count,vote_average,score
1881,The Shawshank Redemption,8205,8.5,8.059258
662,Fight Club,9413,8.3,7.939256
65,The Dark Knight,12002,8.2,7.92002
3232,Pulp Fiction,8428,8.3,7.904645
96,Inception,13752,8.1,7.863239
3337,The Godfather,5893,8.4,7.851236
95,Interstellar,10867,8.1,7.809479
809,Forrest Gump,7927,8.2,7.803188
329,The Lord of the Rings: The Return of the King,8064,8.1,7.727243
1990,The Empire Strikes Back,5879,8.2,7.697884


# 2. Simple Recommender - Trending Movies 

In [22]:
### Finding top 20 popular movies with popularity column from the movies dataset

popular_movies= movies.sort_values('popularity', ascending=False)
popular_movies.head(10)[['title','popularity']]

Unnamed: 0,title,popularity
546,Minions,875.581305
95,Interstellar,724.247784
788,Deadpool,514.569956
94,Guardians of the Galaxy,481.098624
127,Mad Max: Fury Road,434.278564
28,Jurassic World,418.708552
199,Pirates of the Caribbean: The Curse of the Bla...,271.972889
82,Dawn of the Planet of the Apes,243.791743
200,The Hunger Games: Mockingjay - Part 1,206.227151
88,Big Hero 6,203.73459


# 3. Content Based Filtering - Movie Plot / Overview Based Recommender

In [23]:
movies['overview'].head()

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: overview, dtype: object

In [24]:

tfidf = TfidfVectorizer(stop_words='english')
movies['overview'] = movies['overview'].fillna('')
tfidf_matrix = tfidf.fit_transform(movies['overview'])
tfidf_matrix.shape

(4803, 20978)

In [25]:
###Finding cosine similarity matrix

cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [26]:
### Construct a reverse map of indices and movie titles

indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()

In [27]:
len(indices)

4803

In [28]:
indices

title
Avatar                                         0
Pirates of the Caribbean: At World's End       1
Spectre                                        2
The Dark Knight Rises                          3
John Carter                                    4
                                            ... 
El Mariachi                                 4798
Newlyweds                                   4799
Signed, Sealed, Delivered                   4800
Shanghai Calling                            4801
My Date with Drew                           4802
Length: 4803, dtype: int64

We are going to define a function that takes in a movie title as an input and outputs a list of the 10 most similar movies.

In [29]:
def get_recommendations(title, cosine_sim=cosine_sim):
    index = indices[title]
    sim_scores = list(enumerate(cosine_sim[index]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return movies['title'].iloc[movie_indices]

In [30]:
get_recommendations('The Dark Knight Rises')

65                              The Dark Knight
299                              Batman Forever
428                              Batman Returns
1359                                     Batman
3854    Batman: The Dark Knight Returns, Part 2
119                               Batman Begins
2507                                  Slow Burn
9            Batman v Superman: Dawn of Justice
1181                                        JFK
210                              Batman & Robin
Name: title, dtype: object

In [31]:
get_recommendations('The Avengers')

7               Avengers: Age of Ultron
3144                            Plastic
1715                            Timecop
4124                 This Thing of Ours
3311              Thank You for Smoking
3033                      The Corruptor
588     Wall Street: Money Never Sleeps
2136         Team America: World Police
1468                       The Fountain
1286                        Snowpiercer
Name: title, dtype: object

In [32]:
get_recommendations('The Shawshank Redemption')

4531               Civil Brand
3785                    Prison
609                Escape Plan
2868                  Fortress
4727              Penitentiary
1779    The 40 Year Old Virgin
2667          Fatal Attraction
3871         A Christmas Story
434           The Longest Yard
42                 Toy Story 3
Name: title, dtype: object

# 4. Content Based Filtering - Credits, Genres and Keywords Based Recommender

In [33]:
columns = ['cast', 'crew', 'keywords', 'genres']
for i in columns:
    movies[i] = movies[i].apply(literal_eval)

In [34]:
def get_director(x):
    for crew_member in x:
        if crew_member['job'] == 'Director':
            return crew_member['name']
    return ''

In [35]:
def get_top3(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        if len(names) > 3:
            names = names[:3]
        return names
    return []

In [36]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [37]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + str(x['director']) + ' ' + ' '.join(x['genres'])

In [38]:
movies['director'] = movies['crew'].apply(get_director)

columns = ['cast', 'keywords', 'genres']
for col in columns:
    movies[col] = movies[col].apply(get_top3)

In [39]:
movies[['title', 'cast', 'director', 'keywords', 'genres']].head(5)

Unnamed: 0,title,cast,director,keywords,genres
0,Avatar,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",James Cameron,"[culture clash, future, space war]","[Action, Adventure, Fantasy]"
1,Pirates of the Caribbean: At World's End,"[Johnny Depp, Orlando Bloom, Keira Knightley]",Gore Verbinski,"[ocean, drug abuse, exotic island]","[Adventure, Fantasy, Action]"
2,Spectre,"[Daniel Craig, Christoph Waltz, Léa Seydoux]",Sam Mendes,"[spy, based on novel, secret agent]","[Action, Adventure, Crime]"
3,The Dark Knight Rises,"[Christian Bale, Michael Caine, Gary Oldman]",Christopher Nolan,"[dc comics, crime fighter, terrorist]","[Action, Crime, Drama]"
4,John Carter,"[Taylor Kitsch, Lynn Collins, Samantha Morton]",Andrew Stanton,"[based on novel, mars, medallion]","[Action, Adventure, Science Fiction]"


In [40]:
features = ['cast', 'keywords', 'director', 'genres']

for col in columns:
    movies[col] = movies[col].apply(clean_data)

In [41]:
movies['soup'] = movies.apply(create_soup, axis=1)

In [42]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(movies['soup'])

In [43]:
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [44]:
movies = movies.reset_index()
indices = pd.Series(movies.index, index=movies['title'])

In [45]:
get_recommendations('The Dark Knight Rises', cosine_sim2)

65               The Dark Knight
119                Batman Begins
4638    Amidst the Devil's Wings
1196                The Prestige
3073           Romeo Is Bleeding
3180          The Way of the Gun
2972              September Dawn
3326              Black November
303                     Catwoman
1503                      Takers
Name: title, dtype: object

In [46]:
get_recommendations('The Godfather', cosine_sim2)

867      The Godfather: Part III
2731      The Godfather: Part II
1525              Apocalypse Now
1018             The Cotton Club
1209               The Rainmaker
3012               The Outsiders
4209            The Conversation
2333       Peggy Sue Got Married
2600            New York Stories
4638    Amidst the Devil's Wings
Name: title, dtype: object

In [47]:
get_recommendations('The Avengers', cosine_sim2)

7                   Avengers: Age of Ultron
26               Captain America: Civil War
79                               Iron Man 2
169      Captain America: The First Avenger
174                     The Incredible Hulk
1294                               Serenity
85      Captain America: The Winter Soldier
31                               Iron Man 3
33                    X-Men: The Last Stand
68                                 Iron Man
Name: title, dtype: object

# 5. Collaborative Filtering - Singular Value Decomposition :

In [48]:
ratings = pd.read_csv('ratings_small.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [49]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [50]:
ratings.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [51]:
ratings.duplicated().sum()

0

In [52]:
ratings.userId.value_counts()

547    2391
564    1868
624    1735
15     1700
73     1610
       ... 
296      20
289      20
249      20
221      20
1        20
Name: userId, Length: 671, dtype: int64

In [53]:
ratings.movieId.value_counts()

356       341
296       324
318       311
593       304
260       291
         ... 
98604       1
103659      1
104419      1
115927      1
6425        1
Name: movieId, Length: 9066, dtype: int64

In [54]:
ratings.rating.value_counts()

4.0    28750
3.0    20064
5.0    15095
3.5    10538
4.5     7723
2.0     7271
2.5     4449
1.0     3326
1.5     1687
0.5     1101
Name: rating, dtype: int64

In [55]:
print(f"Total ratings: {len(ratings)}")
print(f"Total users: {ratings['userId'].nunique()}")
print(f"Total movies: {ratings['movieId'].nunique()}")

Total ratings: 100004
Total users: 671
Total movies: 9066


In [56]:
movie_ratings = ratings.groupby('movieId')['rating'].agg(['mean', 'count'])

# Filter movies with at least 10 ratings for reliability
filtered_movies = movie_ratings[movie_ratings['count'] >= 10]

# Top 5 highest rated movies
top_5_movies = filtered_movies.sort_values(by='mean', ascending=False).head(5)
print("Top 5 Highest Rated Movies:")
print(top_5_movies)

# Top 1 worst rated movie
worst_movie = filtered_movies.sort_values(by='mean', ascending=True).head(1)
print("Top Worst Rated Movie:")
print(worst_movie)

Top 5 Highest Rated Movies:
             mean  count
movieId                 
1939     4.636364     11
3469     4.541667     12
858      4.487500    200
318      4.487138    311
1948     4.458333     12
Top Worst Rated Movie:
             mean  count
movieId                 
3593     1.210526     19


In [58]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
svd=SVD()
cross_validate(svd, data, measures=['RMSE','MAE'], cv=3)

{'test_rmse': array([0.90632909, 0.89615407, 0.90573961]),
 'test_mae': array([0.69805593, 0.69138438, 0.69789635]),
 'fit_time': (5.292846441268921, 5.433105230331421, 5.171314477920532),
 'test_time': (0.2682924270629883, 0.2228074073791504, 0.283660888671875)}

In [59]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2087b1da160>

In [60]:
ratings[ratings['userId'] == 1]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
5,1,1263,2.0,1260759151
6,1,1287,2.0,1260759187
7,1,1293,2.0,1260759148
8,1,1339,3.5,1260759125
9,1,1343,2.0,1260759131


In [61]:
### Predicted rating for user with user_id 1 and movie No : 302

svd.predict(1, 302, None,verbose=True)

user: 1          item: 302        r_ui = None   est = 2.78   {'was_impossible': False}


Prediction(uid=1, iid=302, r_ui=None, est=2.7843038469453942, details={'was_impossible': False})

In [62]:
all_movie_ids = ratings['movieId'].unique()

# Predict ratings for all movies for user 1
predictions = [svd.predict(1, movie_id) for movie_id in all_movie_ids]

# Convert predictions to DataFrame
predictions_df = pd.DataFrame(predictions, columns=['uid', 'iid', 'r_ui', 'est', 'details'])

user_1_ratings = ratings[ratings['userId'] == 1]

# Sort the ratings in descending order and get the top 10
top_10_user_1_ratings = user_1_ratings.sort_values(by='rating', ascending=False).head(10)
print("Top 10 Movies Rated by User 1:")
print(top_10_user_1_ratings)

# Get top 10 recommended movies for user 1
top_10_recommendations = predictions_df.sort_values(by='est', ascending=False).head(10)
print("Top 10 Recommended Movies for User 1:")
print(top_10_recommendations)

Top 10 Movies Rated by User 1:
    userId  movieId  rating   timestamp
4        1     1172     4.0  1260759205
13       1     2105     4.0  1260759139
12       1     1953     4.0  1260759191
8        1     1339     3.5  1260759125
19       1     3671     3.0  1260759117
1        1     1029     3.0  1260759179
2        1     1061     3.0  1260759182
14       1     2150     3.0  1260759194
17       1     2455     2.5  1260759113
0        1       31     2.5  1260759144
Top 10 Recommended Movies for User 1:
      uid    iid  r_ui       est                    details
99      1    318  None  3.638173  {'was_impossible': False}
177     1   1136  None  3.534353  {'was_impossible': False}
505     1    608  None  3.510174  {'was_impossible': False}
731     1    899  None  3.501469  {'was_impossible': False}
24      1     50  None  3.494400  {'was_impossible': False}
747     1   1060  None  3.483415  {'was_impossible': False}
4       1   1172  None  3.479910  {'was_impossible': False}
782     1  