In [12]:
!pip install "numpy<2" && pip install git+https://github.com/NicolasHug/Surprise.git

Collecting git+https://github.com/NicolasHug/Surprise.git
  Cloning https://github.com/NicolasHug/Surprise.git to /tmp/pip-req-build-h2ovgbf1
  Running command git clone --filter=blob:none --quiet https://github.com/NicolasHug/Surprise.git /tmp/pip-req-build-h2ovgbf1
  Resolved https://github.com/NicolasHug/Surprise.git to commit 2381fb11d0c4bf917cc4b9126f205d0013649966
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [13]:
import pandas as pd
from surprise import Dataset, Reader, SVD, KNNWithMeans
from surprise.model_selection import train_test_split
from surprise import accuracy
import pickle
from collections import defaultdict

In [14]:
df = pd.read_csv("/content/u.data", sep="\t", names=["user_id","item_id","rating","timestamp"])
reader = Reader(rating_scale=(1,5))
data = Dataset.load_from_df(df[["user_id","item_id","rating"]], reader)

In [15]:
movie_titles = {}
try:
    item_df = pd.read_csv("/content/u.item", sep="|", header=None, encoding="latin-1")
    item_df = item_df[[0,1]]
    item_df.columns = ["item_id","title"]
    movie_titles = dict(zip(item_df.item_id, item_df.title))
except Exception as e:
    print("u.item not found or failed to load, movie titles will be empty.", e)

In [16]:
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
print('Trainset number of ratings:', trainset.n_ratings)

Trainset number of ratings: 80000


In [17]:
svd = SVD(n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02, random_state=42)
svd.fit(trainset)
preds_svd = svd.test(testset)
rmse_svd = accuracy.rmse(preds_svd, verbose=False)
mae_svd = accuracy.mae(preds_svd, verbose=False)
print(f"SVD - RMSE: {rmse_svd:.4f}, MAE: {mae_svd:.4f}")

SVD - RMSE: 0.9352, MAE: 0.7375


In [18]:
sim_options = {'name': 'cosine', 'user_based': True}
knn = KNNWithMeans(k=40, sim_options=sim_options)
knn.fit(trainset)
preds_knn = knn.test(testset)
rmse_knn = accuracy.rmse(preds_knn, verbose=False)
mae_knn = accuracy.mae(preds_knn, verbose=False)
print(f"KNNWithMeans - RMSE: {rmse_knn:.4f}, MAE: {mae_knn:.4f}")

Computing the cosine similarity matrix...
Done computing similarity matrix.
KNNWithMeans - RMSE: 0.9538, MAE: 0.7526


In [19]:
def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n

In [22]:
all_items = [trainset.to_raw_iid(i) for i in trainset.all_items()]

sample_user = str(testset[0][0])
print("Sample user:", sample_user)

try:
    inner_uid = trainset.to_inner_uid(sample_user)
except:
    print("User not in training set")
    inner_uid = None

user_rated_items = set([trainset.to_raw_iid(i)
                        for i in trainset.ur[inner_uid]])

anti_testset = [
    (sample_user, item, 0.0)
    for item in all_items
    if item not in user_rated_items
]

user_preds = svd.test(anti_testset)
top_n = get_top_n(user_preds, 10)
top_n[sample_user][:10]

Sample user: 907
User not in training set


[(408, 4.583729068698259),
 (318, 4.572227960716579),
 (64, 4.547786306625786),
 (483, 4.538799175409448),
 (513, 4.525553263950404),
 (169, 4.4972624472977465),
 (12, 4.4333082232654775),
 (603, 4.420326071192217),
 (178, 4.415626241479128),
 (50, 4.395147323607444)]

In [23]:
best_model = svd if rmse_svd <= rmse_knn else knn
model_name = 'svd_model.pkl' if best_model is svd else 'knn_model.pkl'

with open(model_name, 'wb') as f:
    pickle.dump(best_model, f)
with open('movie_titles.pkl', 'wb') as f:
    pickle.dump(movie_titles, f)