In [None]:
!pip install torch_geometric

In [None]:
from datetime import datetime
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn import preprocessing as pp
from sklearn.metrics import make_scorer
from torch_geometric.data import HeteroData, download_url, extract_zip
from keras.layers import Dropout, Flatten, Activation, Input, Embedding, BatchNormalization, Dense, dot
from keras.optimizers import Adam
from pylab import rcParams
from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score, accuracy_score
from scipy.stats import randint, uniform
import os
import pandas as pd
import numpy as np
import keras
import torch
import torch.nn as nn
import scipy.sparse as sp
import matplotlib.pyplot as plt
import time
import random
import xgboost as xgb

In [None]:
def download(url, root=None) -> str:
    """Download a zip archive, extract it under ``root`` and delete the archive.

    ref: https://pytorch-geometric.readthedocs.io/en/stable/_modules/torch_geometric/datasets/movie_lens_100k.html#MovieLens100K

    Args:
        url: Location of the ``.zip`` archive to fetch.
        root: Directory to download and extract into. Defaults to the current
            working directory at call time.

    Returns:
        ``root`` joined with the archive's file name up to its first dot. The
        archive is assumed to extract into a folder of that name.
    """
    if root is None:
        # Resolved lazily: ``os.getcwd()`` as a signature default would be
        # evaluated once, when the function is defined.
        root = os.getcwd()

    path = download_url(url, root)
    extract_zip(path, root)
    os.remove(path)

    folder_name = url.split("/")[-1].split(".")[0]
    return os.path.join(root, folder_name)

# MovieLens 100K zip archive hosted on files.grouplens.org.
url = "https://files.grouplens.org/datasets/movielens/ml-100k.zip"

# Files read from the extracted archive. Order matters: the path list derived
# from this one is indexed positionally (0 = items, 1 = users, 2 = ratings).
# Alternative considered: ['u.item', 'u.user', 'u1.base', 'u1.test']
raw_file_names = ["u.item", "u.user", "u.data"]

In [None]:
# Column names applied to u.user.
USER_HEADERS = ["user_id", "age", "gender", "occupation", "zip_code"]

# Column names applied to u.item: five descriptive fields, then the genre columns.
MOVIE_HEADERS = [
    "item_id", "title", "release_date", "video_release_date", "IMDb URL",
    "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]

# Column names applied to u.data.
RATING_HEADERS = ["user_id", "item_id", "rating", "timestamp"]


# Fetch and unpack the archive, then build the full path of every raw file.
folder_path = download(url)
raw_paths = [os.path.join(folder_path, file_name) for file_name in raw_file_names]

In [None]:
# All three files are headerless, so column names are supplied explicitly.
# No index_col is set: every frame keeps pandas' default positional index.

# Users: pipe-delimited, ISO-8859-1 encoded.
user_df = pd.read_csv(raw_paths[1], sep="|", names=USER_HEADERS, header=None, encoding="ISO-8859-1")

# Movies: pipe-delimited, ISO-8859-1 encoded.
item_df = pd.read_csv(raw_paths[0], sep="|", names=MOVIE_HEADERS, header=None, encoding="ISO-8859-1")

# Ratings: tab-delimited.
rating_df = pd.read_csv(raw_paths[2], sep="\t", names=RATING_HEADERS, header=None)

In [None]:
# Display the ratings table (columns: RATING_HEADERS).
rating_df

In [None]:
# Display the users table (columns: USER_HEADERS).
user_df

In [None]:
# Display the movies table (columns: MOVIE_HEADERS).
item_df

## Data Preparation

In [None]:
# A rating of 4 or more counts as a positive interaction (a "like").
liked_mask = rating_df['rating'] >= 4
positive_ratings = rating_df[liked_mask]
positive_ratings

In [None]:
# Pick item_df's genre columns by name prefix. "unknown" is not in the list,
# so that column is left out of the genre features.
item_column_names = item_df.columns
genre_cols = item_column_names[
    item_column_names.str.startswith((
        'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
        'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
    ))
]
genre_cols

In [None]:
# Attach each liked movie's metadata and genre columns to its rating row
# (inner join on the shared item_id column).
merged_df = positive_ratings.merge(item_df, on='item_id')
merged_df

In [None]:
# A user's taste profile is the per-genre mean over the movies they rated
# positively; the result is indexed by user_id.
liked_by_user = merged_df.groupby('user_id')
user_profiles = liked_by_user[genre_cols].agg('mean')
user_profiles

In [None]:
# Pool of movie ids that negatives are drawn from, and how many negatives
# accompany each positive interaction.
all_item_ids = pd.unique(item_df['item_id'])
neg_sample_ratio = 5

print(f"Negative sample ratio set to: {neg_sample_ratio}")
print(f"Total unique movie IDs available: {len(all_item_ids)}")

In [None]:
# For every user, the set of movie ids they have rated (any rating value).
# Used to make sure sampled negatives are movies the user never rated.
item_ids_per_user = rating_df.groupby('user_id')['item_id']
rated_movies_by_user = item_ids_per_user.apply(set)

print("Sample for User 1:", rated_movies_by_user.get(1))
print("Sample for User 2:", rated_movies_by_user.get(2))

In [None]:
# Build the labelled interaction list: every positive rating contributes one
# target=1 row plus `neg_sample_ratio` target=0 rows for movies the user has
# never rated. Negatives inherit the timestamp of the positive they accompany.

# Private, seeded RNG: sampling is reproducible across runs and the global
# `random` state is left untouched.
NEG_SAMPLING_SEED = 42
neg_rng = random.Random(NEG_SAMPLING_SEED)

candidate_item_ids = [int(i) for i in all_item_ids]
unrated_by_user = {}  # user_id -> movie ids the user has not rated (built on first use)
training_data = []

for row in tqdm(positive_ratings.itertuples(index=False), total=positive_ratings.shape[0]):
    user_id = int(row.user_id)
    item_id = int(row.item_id)
    timestamp = int(row.timestamp)  # Keep the original timestamp

    # Positive sample
    training_data.append({'user_id': user_id, 'item_id': item_id, 'target': 1, 'timestamp': timestamp})

    # Drawing from the precomputed unrated pool replaces the original
    # retry-until-unrated loop, which could not terminate for a user who had
    # rated every movie.
    if user_id not in unrated_by_user:
        already_rated = rated_movies_by_user.get(user_id, set())
        unrated_by_user[user_id] = [i for i in candidate_item_ids if i not in already_rated]
    unrated_pool = unrated_by_user[user_id]

    if not unrated_pool:
        continue  # nothing left to use as a negative for this user

    # Sampling with replacement, as before: the same negative may repeat.
    for neg_item_id in neg_rng.choices(unrated_pool, k=neg_sample_ratio):
        training_data.append({'user_id': user_id, 'item_id': neg_item_id, 'target': 0, 'timestamp': timestamp})

In [None]:
# One row per sampled (user, movie) pair, positives and negatives together.
training_df = pd.DataFrame(training_data)

print(f"Training set created with {len(training_df)} samples.")

# Ten rows are enough to see a positive followed by its sampled negatives.
preview_rows = training_df.head(10)
print("Sample of the training set:")
print(preview_rows)

target_counts = training_df['target'].value_counts()
print("\nValue counts for 'target' column:")
print(target_counts)

In [None]:
# Assemble the model's feature table: each (user, movie) sample gets the
# user's taste profile and the movie's genre columns.

# Start from the four base columns so that re-running this cell does not stack
# duplicate feature columns onto an already-merged frame.
base_columns = ['user_id', 'item_id', 'target', 'timestamp']

# User taste, prefixed with 'user_' to keep it distinct from the movie's own
# genre columns.
user_taste = user_profiles.rename(columns={g: f'user_{g}' for g in genre_cols})

# Movie genres, joined on the item_id COLUMN. item_df keeps its default
# positional index (starting at 0), which is not the movie id (MovieLens item
# ids start at 1), so joining on the index would attach another movie's genres
# to each sample.
movie_genres = item_df[['item_id', *genre_cols]]

training_df = (
    training_df[base_columns]
    .merge(user_taste, on='user_id', how='left')
    .merge(movie_genres, on='item_id', how='left')
    # Rows left without a match (e.g. a user with no profile) get zeros.
    .fillna(0)
)

training_df

## Modelling

In [None]:
# Model inputs: every column that is not an identifier, the label or the timestamp.
non_feature_cols = ['user_id', 'item_id', 'target', 'timestamp']
features = [col for col in training_df.columns if col not in non_feature_cols]

# Order each user's samples chronologically so the split below is temporal.
training_df = training_df.sort_values(by=['user_id', 'timestamp'])

# Share of each user's most recent samples held out for testing.
test_ratio_per_user = 0.2

# NOTE(review): user_profiles is computed from all positive ratings, including
# those that land in the test portion here — confirm whether that leakage is acceptable.
train_indices, test_indices = [], []

for user_id, group in tqdm(training_df.groupby('user_id'), desc="Splitting per user"):
    # The earliest (1 - ratio) share goes to train, the remainder to test.
    split_point = int(len(group) * (1 - test_ratio_per_user))
    row_labels = group.index

    train_indices += row_labels[:split_point].tolist()
    test_indices += row_labels[split_point:].tolist()

# Materialise the feature matrices and label vectors from the collected row labels.
X_train, y_train = training_df.loc[train_indices, features], training_df.loc[train_indices, 'target']
X_test, y_test = training_df.loc[test_indices, features], training_df.loc[test_indices, 'target']

In [None]:
# Initialize and train the XGBoost classifier
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=200,
    learning_rate=0.1,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False # Suppress a warning
)


xgb_model.fit(X_train, y_train)
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f"ROC AUC Score: {roc_auc:.4f}")

In [None]:
# Ranking-based evaluation metrics
def precision_at_k(y_true, y_scores, k):
    """Return the share of the k highest-scored samples that are relevant.

    The sum of the selected labels is divided by ``k`` itself, regardless of
    how many samples were actually available.
    """
    # Positions of all samples, best score first, truncated to k.
    best_first = np.argsort(y_scores)[::-1]
    leaders = best_first[:k]

    # pandas objects need positional lookup; arrays are indexed directly.
    if hasattr(y_true, 'iloc'):
        leader_labels = y_true.iloc[leaders]
    else:
        leader_labels = y_true[leaders]

    return np.sum(leader_labels) / k

def recall_at_k(y_true, y_scores, k):
    """Return the share of all relevant samples found among the k highest-scored.

    Returns 0.0 when ``y_true`` contains no relevant samples at all.
    """
    # Positions of all samples, best score first, truncated to k.
    best_first = np.argsort(y_scores)[::-1]
    leaders = best_first[:k]

    # pandas objects need positional lookup; arrays are indexed directly.
    if hasattr(y_true, 'iloc'):
        leader_labels = y_true.iloc[leaders]
    else:
        leader_labels = y_true[leaders]

    relevant_total = np.sum(y_true)
    if relevant_total == 0:
        return 0.0
    return np.sum(leader_labels) / relevant_total

def average_precision_at_k(y_true, y_scores, k):
    """Calculate average precision@k (AP@k).

    Samples are ranked by descending score. For every position within the top
    k that holds a relevant sample (label == 1), the precision up to that
    position is computed; the result is the mean of those precisions.

    Args:
        y_true: Relevance labels as a list, NumPy array or pandas Series.
        y_scores: Predicted scores, aligned positionally with ``y_true``.
        k: Number of top-ranked samples to consider. Values larger than the
            number of samples are handled by using all samples.

    Returns:
        The AP@k value, or 0.0 when no relevant sample is in the top k.
    """
    top_k_idx = np.argsort(y_scores)[::-1][:k]

    # Work on a plain array: indexing a pandas Series with [i] is label-based,
    # which looked up the wrong element (or raised KeyError) in the ranking.
    top_k_true = np.asarray(y_true)[top_k_idx]

    if np.sum(top_k_true) == 0:
        return 0.0

    relevant = top_k_true == 1
    if not relevant.any():
        return 0.0

    # Precision at each rank = relevant labels seen so far / rank. Using the
    # real length keeps k > len(y_true) from running past the end.
    hits_so_far = np.cumsum(top_k_true)
    ranks = np.arange(1, len(top_k_true) + 1)
    return np.mean(hits_so_far[relevant] / ranks[relevant])

def mean_average_precision_at_k(y_true_per_user, y_scores_per_user, k):
    """Return mAP@k: the mean of per-user AP@k values.

    Users with no samples are skipped. For each remaining user, k is capped at
    that user's sample count. Returns 0.0 when no user contributes a score.
    """
    per_user_ap = [
        average_precision_at_k(user_true, user_scores, min(k, len(user_true)))
        for user_true, user_scores in zip(y_true_per_user, y_scores_per_user)
        if len(user_true) > 0
    ]

    if not per_user_ap:
        return 0.0
    return np.mean(per_user_ap)

def evaluate_ranking_metrics(model, X_test, y_test, test_indices, training_df, k_values=(5, 10, 20)):
    """Evaluate precision@k, recall@k and mAP@k, averaged over users.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        X_test: Feature rows to score, in the same order as ``test_indices``.
        y_test: Not read by this function; labels come from
            ``training_df['target']``. Kept so existing calls keep working.
        test_indices: Row labels of the test samples in ``training_df``.
        training_df: Frame holding at least ``user_id`` and ``target``.
        k_values: Cut-offs to evaluate. The default is an immutable tuple so
            it cannot be altered between calls.

    Returns:
        Dict mapping each k to a dict with the keys ``precision``, ``recall``,
        ``mAP`` and ``num_users_evaluated``. A user counts towards a given k
        only when they have at least k test samples.
    """
    # Score every test sample once (column 1 = positive-class probability).
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    test_df = training_df.loc[test_indices].copy()
    test_df['prediction_score'] = y_pred_proba

    # Group once; the per-user label/score arrays are the same for every k.
    per_user = [
        (user_group['target'].values, user_group['prediction_score'].values)
        for _, user_group in test_df.groupby('user_id')
    ]

    results = {}

    for k in k_values:
        # Only users with enough test samples are evaluated at this cut-off.
        eligible = [(y_true_user, y_scores_user)
                    for y_true_user, y_scores_user in per_user
                    if len(y_true_user) >= k]

        precision_scores = [precision_at_k(t, s, k) for t, s in eligible]
        recall_scores = [recall_at_k(t, s, k) for t, s in eligible]
        ap_scores = [average_precision_at_k(t, s, k) for t, s in eligible]

        results[k] = {
            'precision': np.mean(precision_scores) if precision_scores else 0.0,
            'recall': np.mean(recall_scores) if recall_scores else 0.0,
            'mAP': np.mean(ap_scores) if ap_scores else 0.0,
            'num_users_evaluated': len(precision_scores)
        }

    return results

# Evaluate the fitted model at three cut-offs and report the per-user averages.
k_values = [5, 10, 20]
ranking_results = evaluate_ranking_metrics(xgb_model, X_test, y_test, test_indices, training_df, k_values)

# The result dict is keyed by k, in the order the cut-offs were evaluated.
for k, results in ranking_results.items():
    print(f"Results for K={k}:")
    print(f"   Precision@{k}: {results['precision']:.4f}")
    print(f"   Recall@{k}:    {results['recall']:.4f}")
    print(f"   mAP@{k}:       {results['mAP']:.4f}")
    print(f"   Users evaluated: {results['num_users_evaluated']}")

In [None]:
def get_recommendations(user_id, model, user_profiles, item_df, rating_df, top_n=10, feature_cols=None):
    """Rank the movies a user has not rated yet by predicted like-probability.

    Args:
        user_id: Id of the user to recommend for.
        model: Fitted classifier exposing ``predict_proba``.
        user_profiles: Per-user genre profile, indexed by user id.
        item_df: Movie table with ``item_id``, ``title`` and the genre columns.
        rating_df: Ratings table with ``user_id`` and ``item_id``.
        top_n: Maximum number of recommendations to return.
        feature_cols: Names of the model's input columns, in training order.
            When omitted, the notebook-level ``features`` list is used.

    Returns:
        DataFrame with the columns ``item_id``, ``title`` and
        ``recommendation_score``, best score first, at most ``top_n`` rows.
        Empty when the user has already rated every movie.
    """
    if feature_cols is None:
        # Backward-compatible fallback to the list built when the training
        # frame was split.
        feature_cols = features

    # Candidates are all movies the user has not rated yet.
    rated_item_ids = rating_df.loc[rating_df['user_id'] == user_id, 'item_id'].unique()
    candidate_movies = item_df[~item_df['item_id'].isin(rated_item_ids)].copy()
    candidate_movies['user_id'] = user_id

    if candidate_movies.empty:
        # Nothing to score; return an empty frame of the usual shape instead
        # of calling the model with zero rows.
        candidate_movies['recommendation_score'] = 0.0
        return candidate_movies[['item_id', 'title', 'recommendation_score']]

    # Prefix the profile columns to match the training feature names
    # (e.g. 'Action' -> 'user_Action'); rename returns a new frame, so the
    # caller's user_profiles is not modified.
    user_profile_to_merge = user_profiles.rename(columns=lambda col: f'user_{col}')

    # user_profiles is indexed by user id, so join the candidates' user_id
    # column against that index.
    candidate_movies = pd.merge(candidate_movies, user_profile_to_merge, left_on='user_id', right_index=True, how='left')

    # A user with no positive ratings has no profile row; use zeros instead.
    candidate_movies = candidate_movies.fillna(0)

    # Column 1 of predict_proba is the probability of the positive class.
    candidate_features = candidate_movies[list(feature_cols)]
    candidate_movies['recommendation_score'] = model.predict_proba(candidate_features)[:, 1]

    recommendations = candidate_movies.sort_values('recommendation_score', ascending=False).head(top_n)

    return recommendations[['item_id', 'title', 'recommendation_score']]

# Show the top recommendations for one example user.
sample_user_id = 50
# "\n" (not "\T", an invalid escape that printed a literal backslash) starts the heading on a fresh line.
print(f"\nTop 10 Recommendations for User ID {sample_user_id}:")
recommendations = get_recommendations(sample_user_id, xgb_model, user_profiles, item_df, rating_df)
print(recommendations)