In [1]:
!pip install numpy pandas torch scikit-learn

Defaulting to user installation because normal site-packages is not writeable
[0m

In [4]:
! pip install torch


Defaulting to user installation because normal site-packages is not writeable
[0m

In [None]:

import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD
import warnings
warnings.filterwarnings("ignore")



def process_fold(train, test, n_components=8, alpha=20):

    global_mean = train["rating"].mean()

    user_grp = train.groupby("user")["rating"]
    item_grp = train.groupby("item")["rating"]

    user_mean = user_grp.mean()
    user_count = user_grp.count()
    user_std = user_grp.std().fillna(0)

    item_mean = item_grp.mean()
    item_count = item_grp.count()
    item_std = item_grp.std().fillna(0)

    user_bayes = (user_mean * user_count + alpha * global_mean) / (user_count + alpha)
    item_bayes = (item_mean * item_count + alpha * global_mean) / (item_count + alpha)

    user_stats = pd.DataFrame({
        "u_mean": user_bayes,
        "u_count": user_count,
        "u_std": user_std
    })

    item_stats = pd.DataFrame({
        "i_mean": item_bayes,
        "i_count": item_count,
        "i_std": item_std
    })

    train = train.merge(user_stats, on="user", how="left")
    train = train.merge(item_stats, on="item", how="left")

    test = test.merge(user_stats, on="user", how="left")
    test = test.merge(item_stats, on="item", how="left")

    test["u_mean"].fillna(global_mean, inplace=True)
    test["i_mean"].fillna(global_mean, inplace=True)

    for col in ["u_count","u_std","i_count","i_std"]:
        test[col].fillna(0, inplace=True)

    
    pivot = train.pivot_table(
        index="user",
        columns="item",
        values="rating",
        fill_value=0
    )

    svd = TruncatedSVD(n_components=n_components, random_state=42)
    user_factors = svd.fit_transform(pivot)
    item_factors = svd.components_.T

    u_map = dict(zip(pivot.index, user_factors))
    i_map = dict(zip(pivot.columns, item_factors))

    def add_latent(df):

        zero_vec = np.zeros(n_components)

        u_vecs = np.vstack(
            df["user"].map(u_map)
            .apply(lambda x: x if isinstance(x, np.ndarray) else zero_vec)
        )

        i_vecs = np.vstack(
            df["item"].map(i_map)
            .apply(lambda x: x if isinstance(x, np.ndarray) else zero_vec)
        )

        for i in range(n_components):
            df[f"u_svd_{i}"] = u_vecs[:, i]
            df[f"i_svd_{i}"] = i_vecs[:, i]

        mult = u_vecs * i_vecs
        df["svd_dot"] = np.sum(mult, axis=1)

        for i in range(n_components):
            df[f"svd_mult_{i}"] = mult[:, i]

        return df

    train = add_latent(train)
    test = add_latent(test)

    return train, test




xgb_params = {
    "n_estimators": 1200,
    "max_depth": 6,
    "learning_rate": 0.03,
    "subsample": 0.9,
    "colsample_bytree": 0.8,
    "min_child_weight": 3,
    "objective": "multi:softprob",
    "num_class": 5,
    "eval_metric": "mlogloss",
    "device": "cuda",
    "predictor": "gpu_predictor",
    "random_state": 42
}

lgb_params = {
    "n_estimators": 1200,
    "learning_rate": 0.03,
    "max_depth": -1,
    "num_leaves": 63,
    "subsample": 0.9,
    "colsample_bytree": 0.8,
    "objective": "multiclass",
    "num_class": 5,
    "random_state": 42
}

cat_params = {
    "iterations": 1200,
    "depth": 6,
    "learning_rate": 0.03,
    "loss_function": "MultiClass",
    "verbose": False,
    "random_seed": 42
}

scores = []



for fold in range(1, 6):

    print(f"\nðŸš€ Fold {fold}")

    col = ["user","item","rating","timestamp"]
    train_raw = pd.read_csv(f"ml-100k/u{fold}.base", sep="\t", names=col)
    test_raw  = pd.read_csv(f"ml-100k/u{fold}.test", sep="\t", names=col)

    train, test = process_fold(train_raw, test_raw)

    drop_cols = ["user","item","rating","timestamp"]
    features = [c for c in train.columns if c not in drop_cols]

    X_train = train[features].astype(np.float32)
    X_test  = test[features].astype(np.float32)

    y_train = train["rating"] - 1
    y_test  = test["rating"] - 1

   

    model_xgb = xgb.XGBClassifier(**xgb_params)
    model_lgb = lgb.LGBMClassifier(**lgb_params)
    model_cat = CatBoostClassifier(**cat_params)

    model_xgb.fit(X_train, y_train, verbose=False)
    model_lgb.fit(X_train, y_train)
    model_cat.fit(X_train, y_train)

    

    proba_xgb = model_xgb.predict_proba(X_test)
    proba_lgb = model_lgb.predict_proba(X_test)
    proba_cat = model_cat.predict_proba(X_test)

    avg_proba = (proba_xgb + proba_lgb + proba_cat) / 3

    preds = np.argmax(avg_proba, axis=1)

    acc = accuracy_score(y_test, preds)
    print(f"Fold Accuracy: {acc:.4f}")

    scores.append(acc)

print("\n==============================")
print(f"FINAL AVERAGE ACCURACY: {np.mean(scores):.4f}")
print("==============================")



ðŸš€ Fold 1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.047612 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7794
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 31
[LightGBM] [Info] Start training from score -2.830430
[LightGBM] [Info] Start training from score -2.165217
[LightGBM] [Info] Start training from score -1.292667
[LightGBM] [Info] Start training from score -1.071630
[LightGBM] [Info] Start training from score -1.563987
Fold Accuracy: 0.4642

ðŸš€ Fold 2
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005780 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7795
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 31
[LightGBM] [Info] Start training from score -2.802430
[LightGBM] [Info] Start training from score -2.164