# Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from surprise import Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from surprise.prediction_algorithms import SVD, KNNBasic, NMF, SlopeOne, CoClustering
from surprise.accuracy import rmse, mae, mse

import pickle

# Data Read

In [3]:
df = pd.read_csv("ratings_small.csv")

df.drop(columns=["timestamp"], inplace=True)

df.head()

Unnamed: 0,userId,movieId,rating
0,1,31,2.5
1,1,1029,3.0
2,1,1061,3.0
3,1,1129,2.0
4,1,1172,4.0


In [4]:
df_copy = df.copy()

min_rating = df_copy['rating'].min()
max_rating = df_copy['rating'].max()

reader = Reader(rating_scale=(min_rating, max_rating))
data = Dataset.load_from_df(df_copy[['userId','movieId','rating']], reader)

trainset, testset = train_test_split(data, test_size=0.20)

# Matrix-Based 

## Optimal SVD

In [5]:
best_svd = SVD(n_factors=150, lr_all=0.01, reg_all=0.1)

best_svd.fit(trainset)

best_svd_predictions = best_svd.test(testset)

mse(best_svd_predictions)
rmse(best_svd_predictions)
mae(best_svd_predictions)

MSE: 0.7895
RMSE: 0.8885
MAE:  0.6836


0.6835787917565951

# Item-Based

## Optimal KNN

In [6]:
sim_options = {
    'name': 'msd',  
    'user_based': False
}

best_knn = KNNBasic(sim_options=sim_options, k=60, min_k=5, shrinkage=10)
best_knn.fit(trainset)

best_knn_predictions = best_knn.test(testset)

mse(best_knn_predictions)
rmse(best_knn_predictions)
mae(best_knn_predictions)

Computing the msd similarity matrix...
Done computing similarity matrix.
MSE: 0.8691
RMSE: 0.9322
MAE:  0.7186


0.7185882449284935

# Content Based

## TF-IDF + Numerical, K-Neighbours Cosine Similarity

In [16]:
model_filename = 'nearest_neighbors_model_v2.pkl'

with open(model_filename, 'rb') as model_file:
    content_based = pickle.load(model_file)

array_filename = 'combined_features_array_v2.pkl'

with open(array_filename, 'rb') as array_file:
    combined_features = pickle.load(array_file)

In [9]:
def get_recommendations(movie_index):
    distances, indices = content_based.kneighbors([combined_features[movie_index]])
    return indices[0][1:].tolist()

# Hybrid Recommender

## KNN + SVD

In [10]:
knn_preds = best_knn_predictions
svd_preds = best_svd_predictions

knn_weight = 0.7
svd_weight = 0.3

hybrid_predictions = []

for knn_pred, svd_pred in zip(knn_preds, svd_preds):
    combined_rating = knn_weight * knn_pred.est + svd_weight * svd_pred.est  
    hybrid_predictions.append((knn_pred.uid, knn_pred.iid, knn_pred.r_ui, combined_rating, knn_pred.details))


mse(hybrid_predictions)
rmse(hybrid_predictions)
mae(hybrid_predictions)

MSE: 0.8237
RMSE: 0.9076
MAE:  0.6990


0.6989897435284329

## KNN + SVD + Popularity

In [11]:
knn_preds = best_knn_predictions
svd_preds = best_svd_predictions
popularity_df = df.groupby('movieId')['rating'].mean().reset_index()

knn_weight = 0.3
svd_weight = 0.5
popularity_weight = 0.2

hybrid_predictions = []

for knn_pred, svd_pred in zip(knn_preds, svd_preds):
    item_id = knn_pred.iid
    # Get popularity score for the item
    popularity_score = popularity_df.loc[popularity_df['movieId'] == item_id, 'rating'].values[0] if item_id in popularity_df['movieId'].values else 0
    # Combine ratings with weights
    combined_rating = knn_weight * knn_pred.est + svd_weight * svd_pred.est + popularity_weight * popularity_score
    hybrid_predictions.append((knn_pred.uid, knn_pred.iid, knn_pred.r_ui, combined_rating, knn_pred.details))

# Evaluate the hybrid model
mse(hybrid_predictions)
rmse(hybrid_predictions)
mae(hybrid_predictions)

MSE: 0.7602
RMSE: 0.8719
MAE:  0.6743


0.6743161610816854

## SVD + Popularity

In [12]:
svd_preds = best_svd_predictions
popularity_df = df.groupby('movieId')['rating'].mean().reset_index()

svd_weight = 0.8
popularity_weight = 0.2

hybrid_predictions = []

for svd_pred in svd_preds:
    item_id = svd_pred.iid
    # Get popularity score for the item
    popularity_score = popularity_df.loc[popularity_df['movieId'] == item_id, 'rating'].values[0] if item_id in popularity_df['movieId'].values else 0
    # Combine ratings with weights
    combined_rating =  svd_weight * svd_pred.est + popularity_weight * popularity_score
    hybrid_predictions.append((svd_pred.uid, svd_pred.iid, svd_pred.r_ui, combined_rating, svd_pred.details))

# Evaluate the hybrid model
mse(hybrid_predictions)
rmse(hybrid_predictions)
mae(hybrid_predictions)

MSE: 0.7612
RMSE: 0.8724
MAE:  0.6741


0.6741474669336665

# Content + KNN

In [17]:
def content_knn_hybrid(predictions, user_id, base_item, top_n=5):
    
    desired_item_ids = get_recommendations(base_item) 
    
    user_predictions = [pred for pred in predictions if pred.uid == user_id]

    user_item_predictions = [pred for pred in user_predictions if pred.iid in desired_item_ids]

    sorted_predictions = sorted(user_item_predictions, key=lambda x: x.est, reverse=True)
    
    top_iids = [pred.iid for pred in sorted_predictions[:top_n]]
        
    return top_iids



content_knn_hybrid(predictions=best_knn_predictions,
                   user_id=547,
                   base_item=20,
                   top_n=5)

[32, 161, 1, 36, 57]