# Movie Recommendation System

In [37]:
import numpy as np
import pandas as pd

In [38]:
!pip install surprise



Please make sure to add proper dataset path while running the cells.

# Recommendation System using K-Nearest Neighbors algorithm

Load Datasets and libraries

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [41]:
data = pd.merge(movies, ratings)
data = data.dropna()

Normalize the Data using Min Max Scalar and split train test

In [42]:
scaler = MinMaxScaler()
data['rating'] = scaler.fit_transform(data[['rating']])

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42) 
print(train_data.head(5))

       movieId                                            title  \
80568    35836                   40-Year-Old Virgin, The (2005)   
50582     3000         Princess Mononoke (Mononoke-hime) (1997)   
8344       308  Three Colors: White (Trzy kolory: Bialy) (1994)   
99603   148881                         World of Tomorrow (2015)   
71701     6593                             Freaky Friday (2003)   

                                         genres  userId    rating   timestamp  
80568                            Comedy|Romance     306  0.444444  1518381028  
50582  Action|Adventure|Animation|Drama|Fantasy     182  1.000000  1054781963  
8344                               Comedy|Drama      40  1.000000   832059658  
99603                          Animation|Comedy     567  1.000000  1525282206  
71701                   Children|Comedy|Fantasy     426  0.777778  1451080526  


Vectorize the data using tfidf and find the tfidf matrix, 

In [43]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(train_data['title'] + " " + train_data['genres'])
movie_id_to_index = {movie_id: index for index, movie_id in enumerate(train_data['movieId'])}

KNN algorithm to find k nearest neighbors and cosine similarity

In [44]:
def knn(query_index, tfidf_matrix, k=25):
    cosine_similarities = linear_kernel(tfidf_matrix[query_index], tfidf_matrix).flatten()
    neighbor_indices = cosine_similarities.argsort()[:-k-1:-1]
    return neighbor_indices

Predict rating for test data and find rmse and mae

In [45]:
def predict_rating(user_id, movie_id, train_data, tfidf_matrix, k=25):
    user_ratings = train_data[train_data['userId'] == user_id]
    if len(user_ratings) == 0:
        return 0
    query_index = movie_id_to_index.get(movie_id)
    if query_index is None:
        return 0

    neighbors = knn(query_index, tfidf_matrix, k)
    sum_similarities = 0
    weighted_sum = 0

    for neighbor in neighbors:
        similarity = np.dot(tfidf_matrix[query_index], tfidf_matrix[neighbor].T).toarray()[0][0]
        neighbor_rating = user_ratings.loc[user_ratings['movieId'] == neighbor, 'rating'].values
        if len(neighbor_rating) > 0:
            weighted_sum += similarity * neighbor_rating[0]
            sum_similarities += similarity

    if sum_similarities == 0:
        return 0
    else:
        return weighted_sum / sum_similarities
    
    
    
test_predictions = []
test_actual = []

for index, row in test_data.iterrows():
    prediction = predict_rating(row['userId'], row['movieId'], train_data, tfidf_matrix)
    test_predictions.append(prediction)
    test_actual.append(row['rating'])

test_rmse = np.sqrt(mean_squared_error(test_actual, test_predictions))
test_mae = mean_absolute_error(test_actual, test_predictions)
print(f"\nTest RMSE: {test_rmse}")
print(f"\nTest MAE: {test_mae}")


Test RMSE: 0.6794391046834429

Test MAE: 0.6242411944902655


# Recommendation System with Collaborative Filtering using User Ratings

In [47]:
#reading data
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')

## Preprocessing

In [48]:
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [49]:
movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))',expand=False)
#Removing paranthesis
movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)',expand=False)
#Removing the years from the 'title' column
movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')
#Drop genres
movies_df = movies_df.drop('genres', axis = 1)
movies_df.head()

  movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')


Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [50]:
# Dropping timestamp from ratings dataframe
ratings_df = ratings_df.drop('timestamp', axis = 1)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


### Taking User Input 

In [51]:
userInput = [
            {'movieId':1968, 'rating':5},
            {'movieId':1, 'rating':3.5},
            {'movieId':2, 'rating':2},
            {'movieId':296, 'rating':5},
            {'movieId':1274, 'rating':4.5}
         ] 
user_df = pd.DataFrame(userInput)

In [52]:
# Custom Input

# def takeUserInput():
#   userIp = []
#   while True:
#     movieId = input("Enter the watched movieID or done to exit:\t")
#     if movieId == 'done':
#       break
#     rating = float(input(f"Enter the rating for {movieId} out of 5:\t"))
#     userIp.append({'movieId':int(movieId), 'rating':rating})
#   return pd.DataFrame(userIp)

# user_df = takeUserInput()
# user_df

In [53]:
# Preprocessing the User input
user_df = pd.merge(user_df, movies_df, on="movieId", how="left")
user_df.drop('year', axis = 1, inplace = True)
user_df.sort_values(by=['movieId'])

Unnamed: 0,movieId,rating,title
1,1,3.5,Toy Story
2,2,2.0,Jumanji
3,296,5.0,Pulp Fiction
4,1274,4.5,Akira
0,1968,5.0,"Breakfast Club, The"


### The users who has seen the same movies

In [54]:
#Filtering out users that have watched movies that the input has watched and storing it
selectedUser = ratings_df[ratings_df['movieId'].isin(user_df['movieId'].tolist())]
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
selectedUserGroup = selectedUser.groupby(['userId'])
#Sorting it so users with movie most in common with the input will have priority
selectedUserGroup = sorted(selectedUserGroup,  key=lambda x: len(x[1]), reverse=True)
selectedUserGroup = selectedUserGroup[0:100]

In [55]:
pcd = {}
for name, group in selectedUserGroup:
    group = group.sort_values(by='movieId')
    user_df = user_df.sort_values(by='movieId')
    nRatings = len(group)
    temp_df = user_df[user_df['movieId'].isin(group['movieId'].tolist())]
    
    tempRatingList = temp_df['rating'].tolist()
    tempGroupList = group['rating'].tolist()
    
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)
    
    if Sxx != 0 and Syy != 0:
        pcd[name] = Sxy/np.sqrt(Sxx*Syy)
    else:
        pcd[name] = 0

In [56]:
r_df = pd.DataFrame.from_dict(pcd, orient='index')
r_df.columns = ['similarityIndex']
r_df['userId'] = r_df.index
r_df.index = range(len(r_df))
topUsers=r_df.sort_values(by='similarityIndex', ascending=False)[0:50]

### Weighted Average
#### Rating of selected users to all movies



In [57]:
selectedUsersRating = topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
#Multiplies the similarity by the user's ratings
selectedUsersRating['weightedRating'] = selectedUsersRating['similarityIndex']*selectedUsersRating['rating']

In [58]:
#Applies a sum to the topUsers after grouping it up by userId
temp_Rating = selectedUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
temp_Rating.columns = ['sum_similarityIndex','sum_weightedRating']

In [59]:
#Creates an empty dataframe
recommend_df = pd.DataFrame()
#Now we take the weighted average
recommend_df['weighted average recommendation score'] = temp_Rating['sum_weightedRating']/temp_Rating['sum_similarityIndex']
recommend_df['movieId'] = temp_Rating.index
recommend_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.66308,1
2,3.060905,2
3,3.003402,3
4,2.0,4
5,2.75985,5


### Recommended movies
#### Top 10 recommendated movies using Collaborative Filtering

In [60]:
recommendation_df = recommend_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head(10)
movies_df.loc[movies_df['movieId'].isin(recommendation_df.head(10)['movieId'].tolist())]

Unnamed: 0,movieId,title,year
536,633,Denise Calls Up,1995
687,905,It Happened One Night,1934
912,1211,"Wings of Desire (Himmel über Berlin, Der)",1987
2484,3310,"Kid, The",1921
3189,4298,Rififi (Du rififi chez les hommes),1955
3905,5485,Tadpole,2002
3936,5537,Satin Rouge,2002
4969,7579,Pride and Prejudice,1940
9022,140627,Battle For Sevastopol,2015
9234,152711,Who Killed Chea Vichea?,2010


# Recommendation System using Singular Value Decomposition

In [61]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from ast import literal_eval
from surprise import SVD, BaselineOnly, CoClustering, Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import accuracy
from surprise import NormalPredictor
from surprise.prediction_algorithms import KNNBaseline
from surprise.model_selection import GridSearchCV
import math
from collections import defaultdict
import csv
from sklearn.metrics import ndcg_score
import time

In [62]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [63]:
#preprocessing dataframe
dataf = pd.merge(ratings, movies, on='movieId' , how = 'left')
dataf = dataf.drop('title', axis = 1)
dataf['genres'] = dataf['genres'].str.split('|')
#preprocessing tags dataframe
tags['tag'] = tags['tag'].str.split('|')
tags.drop('timestamp', axis=1, inplace=True)
tags = tags.groupby(['userId','movieId'])['tag'].apply(lambda x: ','.join(x.astype(str))).reset_index()
dataf = pd.merge(dataf, tags, on=['userId','movieId'], how='left')
dataf['tag'] = dataf['tag'].apply(lambda d: d if isinstance(d, list) else [])
dataf['genres'] = dataf['genres'].apply(lambda d: d if isinstance(d, list) else [])

In [64]:
#train test split
training_data, testing_data = train_test_split(dataf, test_size=0.2, stratify = dataf.userId)
training_data = training_data.sort_values(['userId', 'movieId'])
testing_data = testing_data.sort_values(['userId','movieId'])
training_data.to_csv('training_data.csv', index = False)
testing_data.to_csv('testing_data.csv', index = False)
dataset = Dataset.load_builtin('ml-1m')

In [65]:
from surprise.model_selection import train_test_split
train_set, test_set = train_test_split(dataset, test_size=.20)

In [66]:
def measure_accuracy(algorthm, train_set, test_set):
  algorthm.fit(train_set)
  pred = algorthm.test(test_set)
  accuracy.rmse(pred)
  accuracy.mae(pred)
  return
  
def data_processing(train, test):
  reader = Reader(rating_scale=(0, 5))
  train_set = Dataset.load_from_df(train[['userId', 'movieId', 'rating']], reader)
  test_set = Dataset.load_from_df(test[['userId', 'movieId', 'rating']], reader)
  train_set = train_set.construct_trainset(train_set.raw_ratings)
  test_set = test_set.construct_testset(test_set.raw_ratings)
  return train_set, test_set

def measure_accuracy(algorthm, train_set, test_set):
  algorthm.fit(train_set)
  pred = algorthm.test(test_set)
  rmse = accuracy.rmse(pred)
  mae = accuracy.mae(pred)
  return rmse, mae, pred

In [67]:
training_data_csv = 'training_data.csv'
testing_data_csv = 'testing_data.csv'
dataf1 = pd.read_csv(training_data_csv)
dataf2 = pd.read_csv(testing_data_csv)
trainset, testset = data_processing(dataf1, dataf2)
print("BaselineOnly")
algorthm = BaselineOnly()
test_base_rmse, test_base_mae, test_base_pred  = measure_accuracy(algorthm, trainset, testset)
print("SVD")
algorthm = SVD()
test_svd_rmse, test_svd_mae, test_svd_pred  = measure_accuracy(algorthm, trainset, testset)

BaselineOnly
Estimating biases using als...
RMSE: 0.8734
MAE:  0.6725
SVD
RMSE: 0.8789
MAE:  0.6746


In [68]:
testing_pred = pd.DataFrame(columns= ['U_id', 'I_id', 'Og_Rating', 'SVD_Rating', 'Baseline_Rating'])
for i in range(len(test_base_pred)): 
  svd = test_svd_pred[i]
  baseline = test_base_pred[i]
  dataf = pd.DataFrame([[svd.uid, svd.iid, svd.r_ui, svd.est, baseline.est]], columns=['U_id', 'I_id', 'Og_Rating', 'SVD_Rating', 'Baseline_Rating'])
  testing_pred = pd.concat([dataf, testing_pred], ignore_index=True)
testing_pred.to_csv('testing_pred.csv')
pred_t = pd.read_csv('testing_pred.csv')
T = pred_t.shape[0]
svd_weight = 0.05
baseline_weight = 0

In [69]:
def process_data(train, test):
    reader = Reader(rating_scale=(0, 5))
    train_set = Dataset.load_from_df(train[['userId', 'movieId', 'rating']], reader)
    test_set = Dataset.load_from_df(test[['userId', 'movieId', 'rating']], reader)
    train_set = train_set.construct_trainset(train_set.raw_ratings)
    test_set = test_set.construct_testset(test_set.raw_ratings)
    return train_set, test_set

training_data_csv = 'training_data.csv'
testing_data_csv = 'testing_data.csv'
dataf1 = pd.read_csv(training_data_csv)
dataf2 = pd.read_csv(testing_data_csv)
trainset, testset = process_data(dataf1, dataf2)

def func_topn_items(pred, n):
    topn_items = defaultdict(list)
    orgnl_ratngs = defaultdict(list)
    for uid, iid, true_r, est, _ in pred:
        topn_items[uid].append((iid, est))
        orgnl_ratngs[uid].append((iid, true_r))
    for uid, user_ratings in topn_items.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        topn_items[uid] = user_ratings[:n]
    return topn_items, orgnl_ratngs

def discounted_cg(s):
    return s[0] + sum(sc/math.log(ind, 2) for sc, ind in zip(s[1:], range(2, len(s) + 1)))

def normalized_discounted_cg(s):
    idcg = discounted_cg(sorted(s, reverse=True))
    return (discounted_cg(s)/idcg) if idcg > 0.0 else 0.0

def precision_and_recall(pred, k=5, threshold=3.5):
    u_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in pred:
        u_est_true[uid].append((est, true_r))
    precisions_vals = dict()
    recalls_vals = dict()
    for uid, user_ratings in u_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])
        precisions_vals[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        recalls_vals[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1
    precision = (sum(prec for prec in precisions_vals.values()) / len(precisions_vals))
    recall = (sum(rec for rec in recalls_vals.values()) / len(recalls_vals))
    return precision, recall

def recmndation(algorthm, trainset, testset):
  algorthm.fit(trainset)
  testing_preds = algorthm.test(testset)
  Rmse = accuracy.rmse(testing_preds)
  Mae = accuracy.mae(testing_preds)
  topn_items, orgnl_ratngs = func_topn_items(testing_preds, 5)
  precision, recall = precision_and_recall(testing_preds)
  f_measure = (2*precision*recall)/(precision+recall)
  ndcg_scores = dict()
  for uid, user_ratings in topn_items.items():
    s = []
    for iid, est_r in user_ratings:
        iid_found = False
        org_user_ratings = orgnl_ratngs[uid]
        for i, r in org_user_ratings:
            if iid == i:
                s.append(r)
                iid_found = True
                break
        if not iid_found:
            s.append(0)
    ndcg_scores[uid] = normalized_discounted_cg(s)
  ndcg_score = sum(ndcg for ndcg in ndcg_scores.values())/len(ndcg_scores)
  return (Rmse, Mae, precision, recall, f_measure, ndcg_score, testing_preds)

surprise_df = pd.DataFrame(columns= ['Algorithm', 'Rmse', 'Mae', 'Precision', 'Recall', 'F-measure', 'NDCG'])

In [70]:
for algorithm in [SVD(), BaselineOnly()]:
    results = recmndation(algorithm,trainset,testset) 
    
    name =str(algorithm).split(' ')[0].split('.')[-1]
    print("Algorithm:", name)
    dataf = pd.DataFrame([[name, results[0], results[1], results[2], results[3], results[4], results[5]]], columns= ['Algorithm', 'Rmse', 'Mae', 'Precision', 'Recall', 'F-measure', 'NDCG'])
    surprise_df = pd.concat([dataf, surprise_df], ignore_index=True)
surprise_df.sort_values(by='Rmse', ascending=False) 

RMSE: 0.8770
MAE:  0.6720
Algorithm: SVD
Estimating biases using als...
RMSE: 0.8734
MAE:  0.6725
Algorithm: BaselineOnly


Unnamed: 0,Algorithm,Rmse,Mae,Precision,Recall,F-measure,NDCG
1,SVD,0.876977,0.672045,0.823852,0.405421,0.543422,0.960318
0,BaselineOnly,0.873406,0.672481,0.825519,0.407513,0.545662,0.960388
