## Loading the data

In [2]:
import os
import requests
import zipfile
import io
import pandas as pd

# 1. Download the ZIP
url = "https://files.grouplens.org/datasets/movielens/ml-100k.zip"
r = requests.get(url)
z = zipfile.ZipFile(io.BytesIO(r.content))

# 2. Extract (in memory or to disk)
data_folder = "ml-100k"
if not os.path.exists(data_folder):
    z.extractall(data_folder)

# 3. Load ratings file
ratings_path = os.path.join(data_folder, "u.data")
ratings = pd.read_csv(ratings_path, sep="\t",
                      names=["user_id","item_id","rating","timestamp"],
                      engine="python")

# 4. Load movie metadata (optional)
movies_path = os.path.join(data_folder, "u.item")
# u.item uses '|' as separator — adapt encoding if needed
movies = pd.read_csv(movies_path, sep="|", 
                     names=["item_id","title","release_date",
                            "video_release_date","IMDb_URL"] + [f"genre_{i}" for i in range(19)],
                     encoding="latin-1", engine="python")

print("Ratings sample:")
ratings.head()

Ratings sample:


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
print("\nMovies sample:")
movies.head()


Movies sample:


Unnamed: 0,item_id,title,release_date,video_release_date,IMDb_URL,genre_0,genre_1,genre_2,genre_3,genre_4,...,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14,genre_15,genre_16,genre_17,genre_18
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
print("Data Info:")
print(f"Number of ratings: {len(ratings)}")
print(f"Number of movies: {len(movies)}")
print(f"Number of unique users: {ratings['user_id'].nunique()}")

Data Info:
Number of ratings: 100000
Number of movies: 1682
Number of unique users: 943


## Matrix Factorization 

In [20]:
import numpy as np
from sklearn.model_selection import train_test_split


# 1. Split the data
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

# 2. Build MF training data
num_users = ratings['user_id'].nunique()
num_items = ratings['item_id'].nunique()

# Zero-index user and item IDs for training set
train_user_ids = train_data['user_id'].values - 1
train_item_ids = train_data['item_id'].values - 1
train_ratings_vals = train_data['rating'].values

train_ratings = list(zip(train_user_ids, train_item_ids, train_ratings_vals))
n_train = len(train_ratings)

# 3. MF SGD Implementation

def train_mf_sgd(num_users, num_items, data, k=20, epochs=25, lr=0.01, reg=0.02, seed=42):
    rng = np.random.default_rng(seed)
    P = rng.normal(scale=0.1, size=(num_users, k))
    Q = rng.normal(scale=0.1, size=(num_items, k))

    for epoch in range(epochs):
        rng.shuffle(data)

        for u, i, r in data:
            pred = np.dot(P[u], Q[i])
            err = r - pred

            # Gradient update
            P[u] += lr * (err * Q[i] - reg * P[u])
            Q[i] += lr * (err * P[u] - reg * Q[i])

        # Compute MSE on training set only
        se = 0
        for u, i, r in data:
            pred = np.dot(P[u], Q[i])
            se += (r - pred) ** 2
        mse = se / len(data)

        print(f"Epoch {epoch+1:02d}/{epochs} — MSE: {mse:.4f}")

    return P, Q


P, Q = train_mf_sgd(
    num_users=num_users,
    num_items=num_items,
    data=train_ratings,
    k=20,
    epochs=25,
    lr=0.01,
    reg=0.02
)

# 4. Evaluate on test set

def predict(P, Q, u, i):
    return np.dot(P[u], Q[i])

se = 0
for row in test_data.itertuples():
    u = row.user_id - 1
    i = row.item_id - 1
    r = row.rating
    pred = predict(P, Q, u, i)
    se += (r - pred)**2

test_mse = se / len(test_data)
print("\nTest MSE:", test_mse)


Epoch 01/25 — MSE: 5.8003
Epoch 02/25 — MSE: 1.4519
Epoch 03/25 — MSE: 1.0439
Epoch 04/25 — MSE: 0.9269
Epoch 05/25 — MSE: 0.8664
Epoch 06/25 — MSE: 0.8228
Epoch 07/25 — MSE: 0.7862
Epoch 08/25 — MSE: 0.7513
Epoch 09/25 — MSE: 0.7192
Epoch 10/25 — MSE: 0.6879
Epoch 11/25 — MSE: 0.6598
Epoch 12/25 — MSE: 0.6321
Epoch 13/25 — MSE: 0.6077
Epoch 14/25 — MSE: 0.5831
Epoch 15/25 — MSE: 0.5610
Epoch 16/25 — MSE: 0.5420
Epoch 17/25 — MSE: 0.5236
Epoch 18/25 — MSE: 0.5070
Epoch 19/25 — MSE: 0.4928
Epoch 20/25 — MSE: 0.4799
Epoch 21/25 — MSE: 0.4680
Epoch 22/25 — MSE: 0.4573
Epoch 23/25 — MSE: 0.4458
Epoch 24/25 — MSE: 0.4373
Epoch 25/25 — MSE: 0.4296

Test MSE: 0.9438673871144647


In [26]:
# example prediction
user_id = 8  # Example user ID
item_id = 11  # Example item ID
predicted_rating = predict(P, Q, user_id - 1, item_id - 1)
print(f"Predicted rating for user {user_id} on item {item_id}: {predicted_rating:.2f}")
print("Item title:", movies[movies['item_id'] == item_id]['title'].values[0])

Predicted rating for user 8 on item 11: 3.84
Item title: Seven (Se7en) (1995)


## Neural Collaborative Filtering (NCF)

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split


# 1. DATASET CLASS

class RatingDataset(Dataset):
    def __init__(self, df):
        self.users = torch.tensor(df['user_id'].values - 1, dtype=torch.long)
        self.items = torch.tensor(df['item_id'].values - 1, dtype=torch.long)
        self.ratings = torch.tensor(df['rating'].values, dtype=torch.float32)

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.ratings[idx]

# 2. DATA LOADERS

train_loader = DataLoader(RatingDataset(train_data), batch_size=256, shuffle=True)
test_loader  = DataLoader(RatingDataset(test_data),  batch_size=256, shuffle=False)

# 3. NCF MODEL

class NCF(nn.Module):
    def __init__(self, num_users, num_items, emb_dim=64):  # increased emb size
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_dim)
        self.item_emb = nn.Embedding(num_items, emb_dim)

        # Optional user/item bias terms help capture global tendencies
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_bias = nn.Embedding(num_items, 1)

        self.mlp = nn.Sequential(
            nn.Linear(emb_dim * 2, 128),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(64, 1)
        )

        # initialize embeddings/biases
        nn.init.normal_(self.user_emb.weight, std=0.05)
        nn.init.normal_(self.item_emb.weight, std=0.05)
        nn.init.zeros_(self.user_bias.weight)
        nn.init.zeros_(self.item_bias.weight)

    def forward(self, users, items):
        u = self.user_emb(users)
        i = self.item_emb(items)
        x = torch.cat([u, i], dim=1)
        base = self.mlp(x)
        # add user/item bias
        return base + self.user_bias(users) + self.item_bias(items)

# 4. TRAINING SETUP

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = NCF(num_users, num_items, emb_dim=64).to(device)
criterion = nn.MSELoss()

# Add weight decay for embedding regularization and lower LR with scheduler
optimizer = torch.optim.Adam(model.parameters(), lr=0.0008, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

# 5. TRAINING LOOP

epochs = 20  # train longer

# Scale ratings to 0-1 range to stabilize training
#  ML-100k ratings are 1-5; normalize before training and denormalize for reporting
def scale_ratings(r): return (r - 1.0) / 4.0
def unscale_ratings(r): return r * 4.0 + 1.0

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for users, items, ratings in train_loader:
        users = users.to(device)
        items = items.to(device)
        ratings = scale_ratings(ratings).to(device).unsqueeze(1)

        preds = torch.sigmoid(model(users, items))  # constrain to 0-1
        loss = criterion(preds, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs} — Train Loss: {avg_train_loss:.4f}")
    scheduler.step(avg_train_loss)

# 6. EVALUATION

model.eval()
test_loss = 0
rmse_sum = 0
n_batches = 0

with torch.no_grad():
    for users, items, ratings in test_loader:
        users = users.to(device)
        items = items.to(device)
        ratings = ratings.to(device).unsqueeze(1)

        preds01 = torch.sigmoid(model(users, items))
        preds = unscale_ratings(preds01)  # back to 1-5
        loss = criterion(preds, ratings)
        test_loss += loss.item()

        # RMSE for more interpretable metric
        rmse_sum += torch.mean((preds - ratings) ** 2).item()
        n_batches += 1

print("Test MSE:", test_loss / n_batches)
print("Test RMSE:", (rmse_sum / n_batches) ** 0.5)

Epoch 1/20 — Train Loss: 0.0645
Epoch 2/20 — Train Loss: 0.0547
Epoch 3/20 — Train Loss: 0.0532
Epoch 4/20 — Train Loss: 0.0524
Epoch 5/20 — Train Loss: 0.0517
Epoch 6/20 — Train Loss: 0.0511
Epoch 7/20 — Train Loss: 0.0504
Epoch 8/20 — Train Loss: 0.0495
Epoch 9/20 — Train Loss: 0.0487
Epoch 10/20 — Train Loss: 0.0479
Epoch 11/20 — Train Loss: 0.0471
Epoch 12/20 — Train Loss: 0.0464
Epoch 13/20 — Train Loss: 0.0457
Epoch 14/20 — Train Loss: 0.0449
Epoch 15/20 — Train Loss: 0.0442
Epoch 16/20 — Train Loss: 0.0434
Epoch 17/20 — Train Loss: 0.0427
Epoch 18/20 — Train Loss: 0.0419
Epoch 19/20 — Train Loss: 0.0412
Epoch 20/20 — Train Loss: 0.0405
Test MSE: 0.8904026573217367
Test RMSE: 0.9436114970271063


In [33]:
# make example prediction
user_id = 8  # Example user ID
item_id = 11  # Example item ID
user_tensor = torch.tensor([user_id - 1], dtype=torch.long).to(device)
item_tensor = torch.tensor([item_id - 1], dtype=torch.long).to(device)
pred01 = torch.sigmoid(model(user_tensor, item_tensor))
pred_rating = unscale_ratings(pred01).item()
print(f"Predicted rating for user {user_id} on item {item_id}: {pred_rating:.2f}")
print("Item title:", movies[movies['item_id'] == item_id]['title'].values[0])

Predicted rating for user 8 on item 11: 4.07
Item title: Seven (Se7en) (1995)


## Content-Based Filtering

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Build a lightweight content string: title + year + active genre names
genre_names = [
    'unknown','Action','Adventure','Animation','Children','Comedy','Crime',
    'Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery',
    'Romance','Sci-Fi','Thriller','War','Western'
]
genre_cols = [f'genre_{i}' for i in range(19)]

def _row_to_content(row):
    title = str(row['title']) if row['title'] is not None else ''
    rd = row['release_date']
    year = f"year_{str(rd)[-4:]}" if isinstance(rd, str) and len(rd) >= 4 else ''
    genres = [genre_names[i] for i in range(19) if row[genre_cols[i]] == 1]
    return " ".join([title, year] + genres)

if 'content' not in movies.columns:
    movies['content'] = movies.apply(_row_to_content, axis=1)

# TF-IDF and cosine similarity matrix
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['content'])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Helper to fetch top-N similar items by item_id
def get_similar_items(item_id, top_n=10):
    idx_arr = movies.index[movies['item_id'] == item_id].to_numpy()
    if len(idx_arr) == 0:
        raise ValueError(f"item_id {item_id} not found")
    idx = int(idx_arr[0])
    sims = cosine_sim[idx]
    top_idx = sims.argsort()[::-1]  # descending
    top_idx = [i for i in top_idx if i != idx][:top_n]
    result = movies.loc[top_idx, ['item_id', 'title']].copy()
    result['similarity'] = [float(sims[i]) for i in top_idx]
    return result.reset_index(drop=True)

In [43]:
# Example: show similar movies to the existing item_id variable
item_id = 1  # Example item ID
print("Movie name:", movies[movies['item_id'] == item_id]['title'].values[0])
print(get_similar_items(item_id, top_n=10))

Movie name: Toy Story (1995)
   item_id                              title  similarity
0     1066                       Balto (1995)    0.391444
1     1072  Pyromaniac's Love Story, A (1995)    0.390892
2     1219              Goofy Movie, A (1995)    0.353428
3      542                  Pocahontas (1995)    0.350211
4     1470            Gumby: The Movie (1995)    0.337008
5     1053                Now and Then (1995)    0.299264
6      548  NeverEnding Story III, The (1994)    0.296255
7      308     FairyTale: A True Story (1997)    0.289664
8        8                        Babe (1995)    0.280017
9     1344       Story of Xinghua, The (1993)    0.276277


## Build the Hybrid Recommender

In [44]:
def hybrid_score(user_id, item_id, alpha=0.5):
    """
    Compute a hybrid score for a given user and item.
    alpha: weight for collaborative (MF) score, (1-alpha) for content-based score.
    """
    # Matrix Factorization prediction (collaborative)
    mf_pred = np.dot(P[user_id - 1], Q[item_id - 1])

    # Content-based similarity: average similarity to user's rated items
    user_rated_items = train_data[train_data['user_id'] == user_id]['item_id'].values
    if len(user_rated_items) == 0:
        content_score = 0
    else:
        idx_target = movies.index[movies['item_id'] == item_id][0]
        idx_rated = [movies.index[movies['item_id'] == iid][0] for iid in user_rated_items if iid in movies['item_id'].values]
        sim_scores = cosine_sim[idx_target, idx_rated] if idx_rated else np.array([0])
        content_score = np.mean(sim_scores)

    # Combine scores (normalize content_score to 1-5 scale)
    content_score_scaled = content_score * 4 + 1
    hybrid = alpha * mf_pred + (1 - alpha) * content_score_scaled
    return hybrid

# Example usage:
user_id = 8
item_id = 11
score = hybrid_score(user_id, item_id, alpha=0.7)
print(f"Hybrid score for user {user_id} and item {item_id}: {score:.2f}")
print("Item title:", movies[movies['item_id'] == item_id]['title'].values[0])

Hybrid score for user 8 and item 11: 3.06
Item title: Seven (Se7en) (1995)
