In [None]:
!pip install torch_geometric

In [None]:
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn import preprocessing as pp
from torch_geometric.data import HeteroData, download_url, extract_zip
from keras.layers import Dropout, Flatten, Activation, Input, Embedding, BatchNormalization, Dense, dot
from keras.optimizers import Adam
from pylab import rcParams
from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score, accuracy_score
import os
import pandas as pd
import numpy as np
import keras
import torch
import torch.nn as nn
import scipy.sparse as sp
import matplotlib.pyplot as plt
import time
import random
import xgboost as xgb

In [None]:
def download(url, root=None, force=False) -> str:
    """Download a zip archive, unpack it, and return the extracted folder path.

    Adapted from torch_geometric's MovieLens100K dataset:
    https://pytorch-geometric.readthedocs.io/en/stable/_modules/torch_geometric/datasets/movie_lens_100k.html#MovieLens100K

    Args:
        url: Location of the ``.zip`` archive.
        root: Directory to download into and extract under. Defaults to the
            current working directory, resolved when the function is called.
        force: When True, download and extract again even if the target
            folder already exists.

    Returns:
        ``root`` joined with the archive's file name minus its extension.
        The archive is assumed to unpack into a folder of that name.
    """
    if root is None:
        root = os.getcwd()

    archive_name = url.rstrip("/").split("/")[-1]
    folder_name = os.path.splitext(archive_name)[0]
    folder = os.path.join(root, folder_name)

    # Skip the network round-trip on re-runs when the data is already there.
    if force or not os.path.isdir(folder):
        path = download_url(url, root)
        extract_zip(path, root)
        os.remove(path)  # the archive is not needed once extracted

    return folder

# Location of the MovieLens 100K zip archive.
url = "https://files.grouplens.org/datasets/movielens/ml-100k.zip"

# Files read from the extracted folder. The order matters: the loading cell
# picks them by position (0 = items, 1 = users, 2 = ratings).
raw_file_names = [
    "u.item",
    "u.user",
    "u.data",
]
# Alternative file list (presumably the archive's pre-split train/test files):
# ['u.item', 'u.user', 'u1.base', 'u1.test']

In [None]:
# Column names for the raw files, which are read without a header row.
USER_HEADERS = ["user_id", "age", "gender", "occupation", "zip_code"]
# Movie columns: five metadata fields followed by one flag column per genre
# (presumably 0/1 indicators -- confirm against the dataset README).
MOVIE_HEADERS = [
    "item_id",
    "title",
    "release_date",
    "video_release_date",
    "IMDb URL",
    "unknown",
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]
RATING_HEADERS = ["user_id", "item_id", "rating", "timestamp"]


# Fetch and extract the archive, then build the full path of every raw file
# (same order as raw_file_names).
folder_path = download(url)
raw_paths = [os.path.join(folder_path, i) for i in raw_file_names]

In [None]:
# Users: pipe-separated, no header row.
user_df = pd.read_csv(
    raw_paths[1],
    sep="|",
    header=None,
    names=USER_HEADERS,
    # No index_col: user_id stays a regular column and the index is the
    # default 0-based RangeIndex.
    encoding="ISO-8859-1",
)

# Movies: pipe-separated, no header row.
item_df = pd.read_csv(
    raw_paths[0],
    sep="|",
    header=None,
    names=MOVIE_HEADERS,
    # No index_col: item_id stays a regular column, so joins must use the
    # item_id column and not the (0-based) index.
    encoding="ISO-8859-1",
)

# Ratings: tab-separated, no header row.
rating_df = pd.read_csv(
    raw_paths[2],
    sep="\t",
    header=None,
    names=RATING_HEADERS,
)

In [None]:
rating_df

In [None]:
user_df

In [None]:
item_df

# XGBoost

## Data Preparation

In [None]:
# --- User Profile Creation ---

# A rating of 4 or more counts as the user liking the movie.
positive_ratings = rating_df.loc[rating_df['rating'] >= 4]

# Genre flag columns of item_df, selected by name prefix ('unknown' is left out).
genre_cols = item_df.columns[
    item_df.columns.str.startswith((
        'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
        'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
        'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
    ))
]

# Attach each liked movie's columns (including its genre flags) to the rating row.
merged_df = positive_ratings.merge(item_df, on='item_id')

# A user's profile is the per-genre mean over the movies they liked
# (with 0/1 flags this is the share of liked movies carrying that genre).
user_profiles = merged_df.groupby('user_id')[genre_cols].mean()

print("✅ User profiles created. Sample:")
print(user_profiles.head())

In [None]:
# --- Negative Sampling and Dataset Creation ---

def create_training_set(ratings, all_item_ids, neg_sample_ratio=3, seed=None):
    """
    Creates a training dataset with positive and negative samples.

    Every rating >= 4 becomes a positive row (target 1). For each positive,
    up to ``neg_sample_ratio`` items the same user has never rated are drawn
    uniformly, with replacement, as negative rows (target 0).

    Args:
        ratings: DataFrame with 'user_id', 'item_id' and 'rating' columns.
        all_item_ids: Iterable of candidate item IDs to draw negatives from.
        neg_sample_ratio: Number of negatives generated per positive.
        seed: Optional seed for a private RNG. When None, the global
            ``random`` module state is used.

    Returns:
        DataFrame with columns 'user_id', 'item_id', 'target'. A user who has
        rated every candidate item contributes positives only.
    """
    rng = random if seed is None else random.Random(seed)
    item_pool = list(all_item_ids)

    # Items each user has already rated, for quick membership checks.
    rated_movies_by_user = {
        user: set(items) for user, items in ratings.groupby('user_id')['item_id']
    }

    positive_samples = ratings[ratings['rating'] >= 4]

    # Per-user list of items still available as negatives, built on first use.
    # Drawing from this list cannot loop forever the way retry-until-unrated
    # sampling does when a user has rated everything.
    unrated_by_user = {}
    training_data = []

    print("Generating training data with negative sampling...")
    user_item_pairs = zip(positive_samples['user_id'], positive_samples['item_id'])
    for user_id, item_id in tqdm(user_item_pairs, total=positive_samples.shape[0]):
        user_id = int(user_id)
        item_id = int(item_id)

        # 1. Add the positive sample
        training_data.append({'user_id': user_id, 'item_id': item_id, 'target': 1})

        # 2. Add negative samples
        unrated = unrated_by_user.get(user_id)
        if unrated is None:
            already_rated = rated_movies_by_user.get(user_id, set())
            unrated = [i for i in item_pool if i not in already_rated]
            unrated_by_user[user_id] = unrated
        if not unrated:
            continue  # nothing left to sample for this user

        for _ in range(neg_sample_ratio):
            training_data.append(
                {'user_id': user_id, 'item_id': rng.choice(unrated), 'target': 0}
            )

    # Explicit columns keep the schema stable even when there are no rows.
    return pd.DataFrame(training_data, columns=['user_id', 'item_id', 'target'])

# Seed Python's RNG so the randomly drawn negatives are identical on every run.
random.seed(42)

all_item_ids = item_df['item_id'].unique()
training_df = create_training_set(rating_df, all_item_ids)

print(f"\n✅ Training set created with {len(training_df)} samples.")
print("Sample of the training set:")
print(training_df.head())

In [None]:
# --- Step 3: Combine Features into a Single DataFrame ---
#
# item_df keeps item_id as a regular column and its index is the default
# 0-based RangeIndex. Movie genres are therefore joined on the item_id column;
# joining on the index would attach a different movie's genres to every row.

# Start from the bare (user, item, target) triples so that re-running this
# cell never stacks a second copy of the feature columns.
training_samples = training_df[['user_id', 'item_id', 'target']]

# User taste features, prefixed so they do not collide with the movie genres.
user_features = user_profiles.rename(columns={g: f'user_{g}' for g in genre_cols})

training_df = (
    training_samples
    # Merge user profiles (user's taste); user_profiles is indexed by user_id.
    .merge(user_features, left_on='user_id', right_index=True, how='left')
    # Merge item (movie) genre flags on the item_id column.
    .merge(item_df[['item_id', *genre_cols]], on='item_id', how='left')
    # Users without any positive rating have no profile: treat as all zeros.
    .fillna(0)
)

# Left joins on unique keys must not add or drop rows.
assert len(training_df) == len(training_samples)

print("\n✅ Corrected - Final training DataFrame with all features:")
print(training_df.head())

## Modelling

In [None]:
# --- XGBoost Model Training ---

# Model inputs are every column except the two identifiers and the label.
features = [col for col in training_df.columns if col not in ('user_id', 'item_id', 'target')]
X, y = training_df[features], training_df['target']

# Stratified 80/20 hold-out split.
# NOTE(review): the user_* features come from user_profiles, which an earlier
# cell builds from every positive rating, held-out rows included -- the score
# reported below is probably optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"\nTraining data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# Gradient-boosted tree classifier with fixed hyperparameters.
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=200,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    # NOTE(review): meant to silence a warning; confirm the installed xgboost
    # release still accepts this argument.
    use_label_encoder=False,
)

print("\nTraining XGBoost model...")
xgb_model.fit(X_train, y_train)

# --- Evaluation ---
print("\nEvaluating model...")
# Column 1 of predict_proba is the probability of the positive class.
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f"✅ Model training complete. ROC AUC Score: {roc_auc:.4f}")

In [None]:
def get_recommendations(user_id, model, user_profiles, item_df, rating_df, top_n=10, feature_cols=None):
    """
    Generates top N movie recommendations for a given user.

    Args:
        user_id: ID of the user to recommend for.
        model: Fitted classifier exposing ``predict_proba``.
        user_profiles: DataFrame indexed by user_id, one column per genre.
        item_df: Movie table with 'item_id', 'title' and the genre columns.
        rating_df: Ratings table with 'user_id' and 'item_id' columns.
        top_n: Number of recommendations to return.
        feature_cols: Ordered feature names the model was trained on. When
            None, the notebook-level ``features`` list is used.

    Returns:
        DataFrame with 'item_id', 'title' and 'recommendation_score', sorted
        by descending score and limited to ``top_n`` rows. Empty when the
        user has already rated every movie.
    """
    if feature_cols is None:
        # Backward-compatible fallback to the list defined by the training cell.
        feature_cols = features
    feature_cols = list(feature_cols)
    output_cols = ['item_id', 'title', 'recommendation_score']

    # Candidates are all movies the user has not rated yet.
    rated_item_ids = rating_df.loc[rating_df['user_id'] == user_id, 'item_id'].unique()
    candidate_movies = item_df.loc[~item_df['item_id'].isin(rated_item_ids)].copy()
    candidate_movies['user_id'] = user_id

    # Only this user's profile row is needed; a user without positive ratings
    # has no profile and is treated as all zeros.
    if user_id in user_profiles.index:
        profile = user_profiles.loc[user_id]
    else:
        profile = pd.Series(0.0, index=user_profiles.columns)

    # Broadcast the profile onto every candidate, using the training-time
    # naming scheme (e.g. 'Action' -> 'user_Action').
    candidate_movies = candidate_movies.assign(
        **{f'user_{genre}': weight for genre, weight in profile.items()}
    )

    if candidate_movies.empty:
        # Nothing to score; return the expected schema without calling the model.
        candidate_movies['recommendation_score'] = pd.Series(dtype='float64')
        return candidate_movies[output_cols]

    # Fill gaps in the model inputs only, leaving metadata columns untouched.
    candidate_features = candidate_movies[feature_cols].fillna(0)

    # Predict the probability of liking each candidate movie
    candidate_movies['recommendation_score'] = model.predict_proba(candidate_features)[:, 1]

    # Sort by score and return the top N
    recommendations = candidate_movies.sort_values('recommendation_score', ascending=False).head(top_n)

    return recommendations[output_cols]

# --- Get recommendations for a sample user ---
# Scores the movies this user has not rated yet and prints the best ones
# (get_recommendations returns 10 rows by default).
sample_user_id = 50
print(f"\n🚀 Top 10 Recommendations for User ID {sample_user_id}:")
recommendations = get_recommendations(sample_user_id, xgb_model, user_profiles, item_df, rating_df)
print(recommendations)