# Content-Based Filtering Model

In [57]:
pip install xgboost lightgbm catboost

Note: you may need to restart the kernel to use updated packages.


In [58]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import GroupKFold
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor, Pool

In [59]:
content_df_path = '../data/prepared-data/content-based-features.csv'
content_df = pd.read_csv(content_df_path)

print(content_df.head())

   UserID  MovieID  Rating  Avg_Rating  Rating_Count    Decade  Genre_Mystery  \
0       1     1193       5    0.847681      0.503064  0.666667              0   
1       1      661       3    0.616190      0.152903  0.888889              0   
2       1      914       3    0.788522      0.185293  0.555556              0   
3       1     3408       4    0.715970      0.383426  1.000000              0   
4       1     2355       5    0.713594      0.496644  0.888889              0   

   Genre_Thriller  Genre_Action  Genre_Western  ...  Genre_Animation  \
0               0             0              0  ...                0   
1               0             0              0  ...                1   
2               0             0              0  ...                0   
3               0             0              0  ...                0   
4               0             0              0  ...                1   

   Genre_Horror  Genre_Fantasy  Genre_Romance  Genre_Documentary  \
0           

## XGBoost, LightGBM and CatBoost (Train - Validation - Test Split)

All of them gave very similar and very good results. Around 0.975 all of them.

In [60]:
print(content_df.columns)

Index(['UserID', 'MovieID', 'Rating', 'Avg_Rating', 'Rating_Count', 'Decade',
       'Genre_Mystery', 'Genre_Thriller', 'Genre_Action', 'Genre_Western',
       'Genre_War', 'Genre_Musical', 'Genre_Children's', 'Genre_Drama',
       'Genre_Animation', 'Genre_Horror', 'Genre_Fantasy', 'Genre_Romance',
       'Genre_Documentary', 'Genre_Film-Noir', 'Genre_Adventure',
       'Genre_Comedy', 'Genre_Crime', 'Genre_Sci-Fi'],
      dtype='object')


In [61]:
# Datasets
train_list = []
val_list = []
test_list = []

for user_id, group in content_df.groupby('UserID'):
    if len(group) >= 5:  # only split users with enough ratings
        train, temp = train_test_split(group, test_size=0.4, random_state=42)
        val, test = train_test_split(temp, test_size=0.5, random_state=42)
    else:
        # If too few ratings, keep them all in train
        train = group
        val = pd.DataFrame()
        test = pd.DataFrame()
    
    train_list.append(train)
    val_list.append(val)
    test_list.append(test)

train_df = pd.concat(train_list)
val_df = pd.concat(val_list)
test_df = pd.concat(test_list)

# Rating will be the target
# Dropping Rating, UserID and MovieID
X_train = train_df.drop(columns=['Rating', 'UserID', 'MovieID']) 
y_train = train_df['Rating']

X_val = val_df.drop(columns=['Rating', 'UserID', 'MovieID'])
y_val = val_df['Rating']

X_test = test_df.drop(columns=['Rating', 'UserID', 'MovieID'])
y_test = test_df['Rating']


In [62]:
#XGBoost

xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1)
xgb_model.fit(X_train, y_train)

y_pred_val = xgb_model.predict(X_val)

# Calculate MSE 
mse = mean_squared_error(y_val, y_pred_val)
# Calculate RMSE
rmse = mse ** 0.5  # Equivalent to sqrt(mse)
print(f'Validation RMSE: {rmse:.4f}')


Validation RMSE: 0.9758


In [63]:
y_pred_test = xgb_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred_test)
test_rmse = test_mse ** 0.5
print(f'Test RMSE: {test_rmse:.4f}')


Test RMSE: 0.9753


In [64]:
#LightGBM

# Create dataset
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)

# Train
lgb_model = lgb.train(
    {
        "objective": "regression",
        "metric": "rmse",
        "verbosity": -1,
        "early_stopping_rounds": 20
    },
    train_data,
    valid_sets=[val_data],
    num_boost_round=1000
)

# Predict
y_pred = lgb_model.predict(X_test)


In [65]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Test RMSE: {rmse:.4f}")

Test RMSE: 0.9746


In [66]:
X_train.columns

Index(['Avg_Rating', 'Rating_Count', 'Decade', 'Genre_Mystery',
       'Genre_Thriller', 'Genre_Action', 'Genre_Western', 'Genre_War',
       'Genre_Musical', 'Genre_Children's', 'Genre_Drama', 'Genre_Animation',
       'Genre_Horror', 'Genre_Fantasy', 'Genre_Romance', 'Genre_Documentary',
       'Genre_Film-Noir', 'Genre_Adventure', 'Genre_Comedy', 'Genre_Crime',
       'Genre_Sci-Fi'],
      dtype='object')

In [67]:
# CatBoost

# Categorical features
cat_features = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20] 

cat_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    eval_metric="RMSE",
    early_stopping_rounds=20,
    verbose=100
)

cat_model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    cat_features=cat_features
)

# Predict
y_pred = cat_model.predict(X_test)


0:	learn: 1.0924461	test: 1.0940082	best: 1.0940082 (0)	total: 40.4ms	remaining: 40.3s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.9747512833
bestIteration = 59

Shrink model to first 60 iterations.


In [68]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Test RMSE: {rmse:.4f}")

Test RMSE: 0.9743


## XGBoost, LightGBM and CatBoost (K-Fold Cross-Validation)

These ones also gave similarly great results as before but XGBoost K-Fold Cross-Validation gave the best results from the bunch at 0.9789.

In [69]:
X = content_df.drop(columns=['Rating', 'UserID', 'MovieID'])
y = content_df['Rating']

In [70]:
#XGBoost
# Initialize KFold
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

sum = 0
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\nFold {fold + 1}")
    
    # Split data
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Create DMatrix (XGBoost's optimized data structure)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    
    # Train model
    params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "early_stopping_rounds": 20,
        "verbosity": 1
    }
    xgb_model = xgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        evals=[(dval, "validation")],
        verbose_eval=100
    )
    
    # Evaluate
    val_pred = xgb_model.predict(dval)
    rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    print(f"Validation RMSE: {rmse:.4f}")
    print(f"Sample predictions: {val_pred[:5]} vs actual {y_val.values[:5]}")
    sum = sum + rmse

print(f"\nAverage RMSE across the folds: {(sum / 5):.4f}")


Fold 1


Parameters: { "early_stopping_rounds" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[0]	validation-rmse:1.05001
[100]	validation-rmse:0.98017
[200]	validation-rmse:0.98098
[300]	validation-rmse:0.98140
[400]	validation-rmse:0.98160
[500]	validation-rmse:0.98173
[600]	validation-rmse:0.98179
[700]	validation-rmse:0.98184
[800]	validation-rmse:0.98186
[900]	validation-rmse:0.98188
[999]	validation-rmse:0.98188
Validation RMSE: 0.9819
Sample predictions: [4.386253  4.235983  3.9535725 3.9476895 3.7131262] vs actual [5 5 4 5 4]

Fold 2


Parameters: { "early_stopping_rounds" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[0]	validation-rmse:1.04492
[100]	validation-rmse:0.97473
[200]	validation-rmse:0.97566
[300]	validation-rmse:0.97608
[400]	validation-rmse:0.97632
[500]	validation-rmse:0.97646
[600]	validation-rmse:0.97655
[700]	validation-rmse:0.97660
[800]	validation-rmse:0.97664
[900]	validation-rmse:0.97666
[999]	validation-rmse:0.97666
Validation RMSE: 0.9767
Sample predictions: [4.1125    4.2315583 3.8936195 3.2649279 3.265034 ] vs actual [5 4 5 3 4]

Fold 3


Parameters: { "early_stopping_rounds" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[0]	validation-rmse:1.04586
[100]	validation-rmse:0.97509
[200]	validation-rmse:0.97593
[300]	validation-rmse:0.97635
[400]	validation-rmse:0.97659
[500]	validation-rmse:0.97673
[600]	validation-rmse:0.97681
[700]	validation-rmse:0.97686
[800]	validation-rmse:0.97689
[900]	validation-rmse:0.97691
[999]	validation-rmse:0.97691
Validation RMSE: 0.9769
Sample predictions: [4.170225  3.6548083 3.9719706 4.5190063 3.459523 ] vs actual [3 4 4 3 4]

Fold 4


Parameters: { "early_stopping_rounds" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[0]	validation-rmse:1.04784
[100]	validation-rmse:0.97739
[200]	validation-rmse:0.97829
[300]	validation-rmse:0.97871
[400]	validation-rmse:0.97894
[500]	validation-rmse:0.97907
[600]	validation-rmse:0.97914
[700]	validation-rmse:0.97919
[800]	validation-rmse:0.97922
[900]	validation-rmse:0.97923
[999]	validation-rmse:0.97923
Validation RMSE: 0.9792
Sample predictions: [3.4529464 3.8469877 3.6798549 3.8659863 3.9875278] vs actual [3 5 3 4 5]

Fold 5


Parameters: { "early_stopping_rounds" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[0]	validation-rmse:1.04654
[100]	validation-rmse:0.97781
[200]	validation-rmse:0.97868
[300]	validation-rmse:0.97913
[400]	validation-rmse:0.97934
[500]	validation-rmse:0.97946
[600]	validation-rmse:0.97953
[700]	validation-rmse:0.97959
[800]	validation-rmse:0.97962
[900]	validation-rmse:0.97964
[999]	validation-rmse:0.97964
Validation RMSE: 0.9796
Sample predictions: [3.878255  4.3162627 3.8641188 4.1130733 3.9926395] vs actual [4 3 4 4 4]

Average RMSE across the folds: 0.9789


In [71]:
# Initialize KFold
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

sum = 0
# Iterate through folds
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\nFold {fold + 1}")
    
    # Split data
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Create LightGBM datasets
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val)
    
    # Train model (using train(), not fit())
    lgb_model = lgb.train(
        params={
            "objective": "regression",
            "metric": "rmse",
            "verbosity": -1,
            "early_stopping_rounds" : 20,
            "verbose_eval" : 100
        },
        train_set=train_data,
        valid_sets=[val_data],
        num_boost_round=1000
    )
    
    # Evaluate
    val_pred = lgb_model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    print(f"Validation RMSE: {rmse:.4f}")
    print(f"Sample predictions: {val_pred[:5]} vs actual {y_val.values[:5]}")
    sum = sum + rmse

print(f"\nAverage RMSE across the folds: {(sum / 5):.4f}")


Fold 1
Validation RMSE: 0.9783
Sample predictions: [4.3745711  4.22033207 3.95634153 3.93095756 3.7355865 ] vs actual [5 5 4 5 4]

Fold 2
Validation RMSE: 0.9728
Sample predictions: [4.10437926 4.23084931 3.87303459 3.2480329  3.27614592] vs actual [5 4 5 3 4]

Fold 3
Validation RMSE: 0.9733
Sample predictions: [4.14289689 3.62537978 3.96033353 4.49715742 3.48280563] vs actual [3 4 4 3 4]

Fold 4
Validation RMSE: 0.9755
Sample predictions: [3.47216258 3.83796219 3.66812804 3.85326313 3.98113498] vs actual [3 5 3 4 5]

Fold 5
Validation RMSE: 0.9759
Sample predictions: [3.86091149 4.29847422 3.83605812 4.11040277 3.96719181] vs actual [4 3 4 4 4]

Average RMSE across the folds: 0.9752


In [72]:
#CatBoost
#Categorical features
cat_features = [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20] 

# Initialize KFold
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

sum = 0
for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"\nFold {fold + 1}")
    
    # Split data
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    # Create Pool (CatBoost's data structure)
    train_pool = Pool(X_train, y_train, cat_features=cat_features)
    val_pool = Pool(X_val, y_val, cat_features=cat_features)
    
    # Train model
    cat_model = CatBoostRegressor(
        iterations=1000,
        learning_rate=0.1,
        loss_function='RMSE',
        early_stopping_rounds=20,
        verbose=100
    )
    cat_model.fit(
        train_pool,
        eval_set=val_pool,
        use_best_model=True
    )
    
    # Evaluate
    val_pred = cat_model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    print(f"Validation RMSE: {rmse:.4f}")
    print(f"Sample predictions: {val_pred[:5]} vs actual {y_val.values[:5]}")
    sum = sum + rmse

print(f"\nAverage RMSE across the folds: {(sum / 5):.4f}")


Fold 1
0:	learn: 1.0919708	test: 1.0953487	best: 1.0953487 (0)	total: 65.3ms	remaining: 1m 5s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.9780945476
bestIteration = 61

Shrink model to first 62 iterations.
Validation RMSE: 0.9781
Sample predictions: [4.38892676 4.21589141 3.96471983 3.9612391  3.72608709] vs actual [5 5 4 5 4]

Fold 2
0:	learn: 1.0932162	test: 1.0906041	best: 1.0906041 (0)	total: 64ms	remaining: 1m 3s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.9725355933
bestIteration = 64

Shrink model to first 65 iterations.
Validation RMSE: 0.9725
Sample predictions: [4.12009994 4.24968673 3.8739992  3.23261747 3.30318346] vs actual [5 4 5 3 4]

Fold 3
0:	learn: 1.0928871	test: 1.0916627	best: 1.0916627 (0)	total: 65.5ms	remaining: 1m 5s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.9730624353
bestIteration = 55

Shrink model to first 56 iterations.
Validation RMSE: 0.9731
Sample predictions: [4.15417248 3.63116