In [1]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("prajitdatta/movielens-100k-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/prajitdatta/movielens-100k-dataset?dataset_version_number=1...


100%|██████████| 4.77M/4.77M [00:00<00:00, 6.65MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/prajitdatta/movielens-100k-dataset/versions/1


In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds

DATA_DIR = Path("/root/.cache/kagglehub/datasets/prajitdatta/movielens-100k-dataset/versions/1/ml-100k")          # unzip the 100K archive here
u_cols = ['user_id','age','gender','occupation','zip']
users  = pd.read_csv(DATA_DIR / "u.user", sep='|', names=u_cols)

r_cols = ['user_id','movie_id','rating','timestamp']
ratings = pd.read_csv(DATA_DIR / "u.data", sep='\t', names=r_cols)

m_cols = ['movie_id','title','release_date','video_release_date','imdb_url']
movies = pd.read_csv(DATA_DIR / "u.item", sep='|', encoding='latin-1', usecols=range(5), names=m_cols)

In [3]:
user_item = ratings.pivot(index='user_id', columns='movie_id', values='rating')
user_item = user_item.fillna(0)      # unseen = 0

In [4]:
def user_based_recommend(user_id, k_users=50, top_n=10):
    """Return top-N unseen movies for user_id using user-user CF."""
    user_sim = cosine_similarity(user_item)
    sim_df   = pd.DataFrame(user_sim,
                            index=user_item.index,
                            columns=user_item.index)

    # neighbours: most similar users (exclude self)
    neighbours = sim_df[user_id].drop(user_id).sort_values(ascending=False)[:k_users]

    # weighted sum of neighbour ratings
    neighbour_ratings = user_item.loc[neighbours.index]
    weighted = neighbour_ratings.T.dot(neighbours) / neighbours.sum()

    # mask already-seen movies
    already_seen = user_item.loc[user_id] > 0
    recs = weighted[~already_seen].sort_values(ascending=False).head(top_n)
    return recs.index.to_list()

In [5]:
def precision_at_k(user_id, k_rec=10, k_test=10):
    """Compute precision@k_rec for one user."""
    # hidden test set = user's last k_test ratings chronologically
    user_ratings = ratings[ratings.user_id == user_id].sort_values('timestamp')
    if len(user_ratings) < k_test:
        return None
    test_movies = user_ratings.movie_id.iloc[-k_test:].values

    # recommend
    rec_movies = user_based_recommend(user_id, top_n=k_rec)

    # precision
    hits = len(set(rec_movies) & set(test_movies))
    return hits / k_rec

# Average precision across 200 random users
import random
sample_users = random.sample(list(ratings.user_id.unique()), 200)
precisions = [precision_at_k(u) for u in sample_users]
avg_precision = np.nanmean(precisions)
print("Average precision@10 =", round(avg_precision, 3))

Average precision@10 = 0.0


In [6]:
def item_based_recommend(user_id, k_items=30, top_n=10):
    item_sim = cosine_similarity(user_item.T)
    sim_item_df = pd.DataFrame(item_sim,
                               index=movies.movie_id,
                               columns=movies.movie_id)

    user_ratings = user_item.loc[user_id]
    rated = user_ratings[user_ratings > 0]

    # aggregate similarity scores
    scores = pd.Series(dtype='float64')
    for m, r in rated.items():
        sims = sim_item_df[m].drop(m)
        scores = scores.add(sims * r, fill_value=0)

    already_seen = user_ratings > 0
    recs = scores[~already_seen].sort_values(ascending=False).head(top_n)
    return recs.index.to_list()

In [7]:
def svd_recommend(user_id, k_latent=50, top_n=10):
    # center matrix
    R = user_item.values
    user_mean = np.mean(R, axis=1)
    R_demean = R - user_mean.reshape(-1, 1)

    # perform SVD
    U, sigma, Vt = svds(R_demean, k=k_latent)
    sigma = np.diag(sigma)
    pred = np.dot(np.dot(U, sigma), Vt) + user_mean.reshape(-1, 1)
    preds_df = pd.DataFrame(pred, index=user_item.index, columns=user_item.columns)

    # unseen mask
    unseen = user_item.loc[user_id] == 0
    recs = preds_df.loc[user_id][unseen].sort_values(ascending=False).head(top_n)
    return recs.index.to_list()

In [8]:
def show_titles(movie_ids):
    return movies.set_index('movie_id').loc[movie_ids]['title'].tolist()

user = 196
print("User-based :", show_titles(user_based_recommend(user)))
print("Item-based :", show_titles(item_based_recommend(user)))
print("SVD        :", show_titles(svd_recommend(user)))

User-based : ['Fargo (1996)', 'Star Wars (1977)', 'When Harry Met Sally... (1989)', 'Back to the Future (1985)', 'Pulp Fiction (1994)', 'Silence of the Lambs, The (1991)', 'Amadeus (1984)', 'Sense and Sensibility (1995)', 'Jerry Maguire (1996)', 'Indiana Jones and the Last Crusade (1989)']
Item-based : ['Back to the Future (1985)', 'When Harry Met Sally... (1989)', 'Star Wars (1977)', 'Raiders of the Lost Ark (1981)', 'Monty Python and the Holy Grail (1974)', 'Fargo (1996)', 'Amadeus (1984)', 'Forrest Gump (1994)', 'Silence of the Lambs, The (1991)', 'Pulp Fiction (1994)']
SVD        : ['When Harry Met Sally... (1989)', 'Back to the Future (1985)', 'Grease (1978)', 'Dave (1993)', 'Sleepless in Seattle (1993)', 'Postino, Il (1994)', 'Sabrina (1995)', 'M*A*S*H (1970)', 'Clueless (1995)', "Ulee's Gold (1997)"]
