In [25]:
!pip install torch_geometric



In [44]:
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn import preprocessing as pp
from torch_geometric.data import HeteroData, download_url, extract_zip
from keras.layers import Dropout, Flatten, Activation, Input, Embedding, BatchNormalization, Dense, dot
from keras.optimizers import Adam
from pylab import rcParams
from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score, accuracy_score
import os
import pandas as pd
import numpy as np
import keras
import torch
import torch.nn as nn
import scipy.sparse as sp
import matplotlib.pyplot as plt
import time
import random
import xgboost as xgb

In [27]:
def download(url, root=os.getcwd()) -> None:
    # ref: https://pytorch-geometric.readthedocs.io/en/stable/_modules/torch_geometric/datasets/movie_lens_100k.html#MovieLens100K
    path = download_url(url, root)
    extract_zip(path, root)
    os.remove(path)

    folder_name = url.split("/")[-1].split(".")[0]
    # folder = os.path.join(root, folder_name)
    # fs.rm(raw_dir)
    # os.rename(folder, raw_dir)
    return os.path.join(root, folder_name)

url = "https://files.grouplens.org/datasets/movielens/ml-100k.zip"

raw_file_names = [
    "u.item",
    "u.user",
    "u.data",
]
# ['u.item', 'u.user', 'u1.base', 'u1.test']

In [28]:
USER_HEADERS = ["user_id", "age", "gender", "occupation", "zip_code"]
MOVIE_HEADERS = [
    "item_id",
    "title",
    "release_date",
    "video_release_date",
    "IMDb URL",
    "unknown",
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]
RATING_HEADERS = ["user_id", "item_id", "rating", "timestamp"]


folder_path = download(url)
raw_paths = [os.path.join(folder_path, i) for i in raw_file_names]

Downloading https://files.grouplens.org/datasets/movielens/ml-100k.zip
Extracting /content/ml-100k.zip


In [33]:
user_df = pd.read_csv(
    raw_paths[1],
    sep="|",
    header=None,
    names=USER_HEADERS,
    # index_col='user_id',
    encoding="ISO-8859-1",
)

item_df = pd.read_csv(
    raw_paths[0],
    sep="|",
    header=None,
    names=MOVIE_HEADERS,
    # index_col='item_id',
    encoding="ISO-8859-1",
)

rating_df = pd.read_csv(
    raw_paths[2],
    sep="\t",
    header=None,
    names=RATING_HEADERS,
)

In [35]:
rating_df

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [34]:
user_df

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [36]:
item_df

Unnamed: 0,item_id,title,release_date,video_release_date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,Mat' i syn (1997),06-Feb-1998,,http://us.imdb.com/M/title-exact?Mat%27+i+syn+...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1678,1679,B. Monkey (1998),06-Feb-1998,,http://us.imdb.com/M/title-exact?B%2E+Monkey+(...,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1679,1680,Sliding Doors (1998),01-Jan-1998,,http://us.imdb.com/Title?Sliding+Doors+(1998),0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1680,1681,You So Crazy (1994),01-Jan-1994,,http://us.imdb.com/M/title-exact?You%20So%20Cr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# XGBoost

## Data Preparation

In [37]:
# --- User Profile Creation ---

# Consider ratings of 4 or higher as a positive interaction.
positive_ratings = rating_df[rating_df['rating'] >= 4]

# Get the genre columns from the item_df
genre_cols = item_df.columns[item_df.columns.str.startswith(('Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'))]

# Merge positive ratings with movie genres
merged_df = pd.merge(positive_ratings, item_df, left_on='item_id', right_on='item_id')

# Create user profiles by averaging the genres of movies they liked
user_profiles = merged_df.groupby('user_id')[genre_cols].mean()

print("✅ User profiles created. Sample:")
print(user_profiles.head())

✅ User profiles created. Sample:
           Action  Adventure  Animation  Children's    Comedy     Crime  \
user_id                                                                   
1        0.239264   0.104294   0.030675    0.030675  0.300613  0.092025   
2        0.175000   0.075000   0.025000    0.050000  0.300000  0.150000   
3        0.200000   0.133333   0.000000    0.000000  0.200000  0.266667   
4        0.263158   0.105263   0.000000    0.000000  0.210526  0.210526   
5        0.396552   0.241379   0.137931    0.086207  0.568966  0.120690   

         Documentary     Drama   Fantasy  Film-Noir    Horror   Musical  \
user_id                                                                   
1           0.030675  0.478528  0.006135   0.006135  0.042945  0.036810   
2           0.000000  0.575000  0.000000   0.050000  0.000000  0.000000   
3           0.066667  0.533333  0.000000   0.000000  0.000000  0.000000   
4           0.052632  0.263158  0.000000   0.000000  0.052632  0.0

In [54]:
# --- Negative Sampling and Dataset Creation ---

def create_training_set(ratings, all_item_ids, neg_sample_ratio=3):
    """
    Creates a training dataset with positive and negative samples.
    """
    # Get the set of movies each user has already rated for quick lookups
    rated_movies_by_user = ratings.groupby('user_id')['item_id'].apply(set)

    positive_samples = ratings[ratings['rating'] >= 4]

    training_data = []

    print("Generating training data with negative sampling...")
    # Create positive and negative samples
    for _, row in tqdm(positive_samples.iterrows(), total=positive_samples.shape[0]):
        user_id = int(row['user_id'])
        item_id = int(row['item_id'])

        # 1. Add the positive sample
        training_data.append({'user_id': user_id, 'item_id': item_id, 'target': 1})

        # 2. Add negative samples
        for _ in range(neg_sample_ratio):
            while True:
                # Randomly pick a movie ID
                random_item_id = random.choice(all_item_ids)
                # Check if it's a true negative (user hasn't rated it)
                if random_item_id not in rated_movies_by_user.get(user_id, set()):
                    training_data.append({'user_id': user_id, 'item_id': random_item_id, 'target': 0})
                    break

    return pd.DataFrame(training_data)

all_item_ids = item_df['item_id'].unique()
training_df = create_training_set(rating_df, all_item_ids)

print(f"\n✅ Training set created with {len(training_df)} samples.")
print("Sample of the training set:")
print(training_df.head())

Generating training data with negative sampling...


  0%|          | 0/55375 [00:00<?, ?it/s]


✅ Training set created with 221500 samples.
Sample of the training set:
   user_id  item_id  target
0      298      474       1
1      298      375       0
2      298     1329       0
3      298     1617       0
4      253      465       1


In [55]:
# --- Corrected Step 3: Combine Features into a Single DataFrame ---

# The user profile creation is correct, but let's ensure the column name is 'item_id'
user_profiles = merged_df.groupby('user_id')[genre_cols].mean()


# The training_df creation is correct, but ensure the column names are consistent
all_item_ids = item_df.index.unique() # Get IDs from the index now
training_df = create_training_set(rating_df, all_item_ids) # This function is still correct
training_df.rename(columns={'item_id': 'item_id'}, inplace=True) # Rename for consistency

# --- Merge all features into the training DataFrame ---

# Merge user profiles (user's taste)
training_df = pd.merge(training_df, user_profiles, on='user_id', how='left')
# Rename user profile genres to distinguish them from movie genres
training_df.rename(columns={g: f'user_{g}' for g in genre_cols}, inplace=True)


# Merge item (movie) features using the index of item_df
# This is the corrected line:
training_df = pd.merge(training_df, item_df[genre_cols], left_on='item_id', right_index=True, how='left')

# Fill any potential NaNs (for users who might not have a profile yet)
training_df.fillna(0, inplace=True)

print("\n✅ Corrected - Final training DataFrame with all features:")
print(training_df.head())

Generating training data with negative sampling...


  0%|          | 0/55375 [00:00<?, ?it/s]


✅ Corrected - Final training DataFrame with all features:
   user_id  item_id  target  user_Action  user_Adventure  user_Animation  \
0      298      474       1     0.241758        0.197802        0.087912   
1      298       37       0     0.241758        0.197802        0.087912   
2      298      691       0     0.241758        0.197802        0.087912   
3      298      676       0     0.241758        0.197802        0.087912   
4      253      465       1     0.318841        0.144928        0.043478   

   user_Children's  user_Comedy  user_Crime  user_Documentary  ...  Fantasy  \
0         0.186813     0.285714    0.043956          0.010989  ...        0   
1         0.186813     0.285714    0.043956          0.010989  ...        0   
2         0.186813     0.285714    0.043956          0.010989  ...        0   
3         0.186813     0.285714    0.043956          0.010989  ...        0   
4         0.086957     0.275362    0.072464          0.000000  ...        0   

   Film-N

### Modelling

In [59]:
# --- XGBoost Model Training ---

# Define features (X) and target (y)
features = [col for col in training_df.columns if col not in ['user_id', 'item_id', 'target']]
X = training_df[features]
y = training_df['target']

# Split data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"\nTraining data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

# Initialize and train the XGBoost classifier
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    n_estimators=200,
    learning_rate=0.1,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    use_label_encoder=False # Suppress a warning
)

print("\nTraining XGBoost model...")
xgb_model.fit(X_train, y_train)

# --- Evaluation ---
print("\nEvaluating model...")
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f"✅ Model training complete. ROC AUC Score: {roc_auc:.4f}")


Training data shape: (177200, 36)
Test data shape: (44300, 36)

Training XGBoost model...


Parameters: { "use_label_encoder" } are not used.




Evaluating model...
✅ Model training complete. ROC AUC Score: 0.6707


In [58]:
def get_recommendations(user_id, model, user_profiles, item_df, rating_df, top_n=10):
    """
    Generates top N movie recommendations for a given user.
    """
    # Get movies the user has already rated
    rated_item_ids = rating_df[rating_df['user_id'] == user_id]['item_id'].unique()

    # Create a DataFrame of candidate movies (all movies not yet rated)
    candidate_movies = item_df[~item_df['item_id'].isin(rated_item_ids)].copy()
    candidate_movies['user_id'] = user_id

    # --- FIX STARTS HERE ---

    # 1. Create a copy of user profiles to avoid modifying the original DataFrame.
    user_profile_to_merge = user_profiles.copy()

    # 2. Rename the columns to match the feature names used in training (e.g., 'Action' -> 'user_Action').
    user_profile_to_merge.columns = [f'user_{col}' for col in user_profile_to_merge.columns]

    # 3. Merge the prepared user profile data. Since user_profiles is indexed by user_id,
    #    we merge on the index. This avoids column name collisions.
    candidate_movies = pd.merge(candidate_movies, user_profile_to_merge, left_on='user_id', right_index=True, how='left')

    # 4. Fill any NaNs that might result from the merge (e.g., a user with no positive ratings).
    candidate_movies.fillna(0, inplace=True)

    # `features` is the list of column names the model was trained on.
    # It is captured from the global scope when this function is called.
    # Now candidate_movies has all the necessary columns with the correct names.
    candidate_features = candidate_movies[features]

    # --- FIX ENDS HERE ---

    # Predict the probability of liking each candidate movie
    candidate_movies['recommendation_score'] = model.predict_proba(candidate_features)[:, 1]

    # Sort by score and return the top N
    recommendations = candidate_movies.sort_values('recommendation_score', ascending=False).head(top_n)

    return recommendations[['item_id', 'title', 'recommendation_score']]

# --- Get recommendations for a sample user ---
# This part of your code remains the same.
sample_user_id = 50
print(f"\n🚀 Top 10 Recommendations for User ID {sample_user_id}:")
recommendations = get_recommendations(sample_user_id, xgb_model, user_profiles, item_df, rating_df)
print(recommendations)


🚀 Top 10 Recommendations for User ID 50:
      item_id                                 title  recommendation_score
100       101                    Heavy Metal (1981)              0.856144
134       135          2001: A Space Odyssey (1968)              0.712451
50         51            Legends of the Fall (1994)              0.687882
238       239                       Sneakers (1992)              0.653742
183       184               Army of Darkness (1993)              0.615234
269       270                        Gattaca (1997)              0.611994
213       214          Pink Floyd - The Wall (1982)              0.593492
216       217          Bram Stoker's Dracula (1992)              0.585999
559       560  Kid in King Arthur's Court, A (1995)              0.574304
1484     1485            Colonel Chabert, Le (1994)              0.512189
