In [5]:

import pandas as pd
import pickle
import numpy as np

# Training and evaluation
import optuna
import lightgbm as lgb
import xgboost as xgb
from scipy.sparse import load_npz
from sklearn.metrics import mean_squared_error

# Relative path to the dataset root. A raw string keeps the Windows backslashes
# literal; in a normal string '\.', '\d' and '\m' are invalid escape sequences
# that newer Python versions warn about (and may reject in the future).
base_path = r'..\..\..\data\main'

In [14]:
# Raw test sets for the four warm/cold user-item scenarios.
# Only the columns listed below are read from each CSV.
_test_usecols = ['RatingID', 'UserID', 'WineID', 'Rating']

test_uwarm_iwarm = pd.read_csv(
    f'{base_path}\\testset_warm_user_warm_item.csv',
    usecols=_test_usecols,
)
test_uwarm_icold = pd.read_csv(
    f'{base_path}\\testset_warm_user_cold_item.csv',
    usecols=_test_usecols,
)
test_ucold_iwarm = pd.read_csv(
    f'{base_path}\\testset_cold_user_warm_item.csv',
    usecols=_test_usecols,
)
test_ucold_icold = pd.read_csv(
    f'{base_path}\\testset_cold_user_cold_item.csv',
    usecols=_test_usecols,
)

In [11]:
# Load the preprocessed data split by split: the feature matrix comes from a
# SciPy sparse .npz file, the matching target from a CSV file.

# Training split
X_train_transformed = load_npz(f'{base_path}\\preprocessed\\X_train_transformed.npz')
y_train = pd.read_csv(f'{base_path}\\preprocessed\\y_train.csv')

# Validation split
X_val_transformed = load_npz(f'{base_path}\\preprocessed\\X_val_transformed.npz')
y_val = pd.read_csv(f'{base_path}\\preprocessed\\y_val.csv')

# Test split: warm user, warm item
X_test_uwarm_iwarm_transformed = load_npz(f'{base_path}\\preprocessed\\X_test_uwarm_iwarm_transformed.npz')
y_test_uwarm_iwarm = pd.read_csv(f'{base_path}\\preprocessed\\y_test_uwarm_iwarm.csv')

# Test split: warm user, cold item
X_test_uwarm_icold_transformed = load_npz(f'{base_path}\\preprocessed\\X_test_uwarm_icold_transformed.npz')
y_test_uwarm_icold = pd.read_csv(f'{base_path}\\preprocessed\\y_test_uwarm_icold.csv')

# Test split: cold user, warm item
X_test_ucold_iwarm_transformed = load_npz(f'{base_path}\\preprocessed\\X_test_ucold_iwarm_transformed.npz')
y_test_ucold_iwarm = pd.read_csv(f'{base_path}\\preprocessed\\y_test_ucold_iwarm.csv')

# Test split: cold user, cold item
X_test_ucold_icold_transformed = load_npz(f'{base_path}\\preprocessed\\X_test_ucold_icold_transformed.npz')
y_test_ucold_icold = pd.read_csv(f'{base_path}\\preprocessed\\y_test_ucold_icold.csv')

# LightGBM

In [None]:
# LightGBM hyperparameter tuning with Optuna.
# (pickle is already imported in the notebook's import cell.)

def objective(trial):
    """Train one LightGBM booster for an Optuna trial and score it.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean squared error of the booster's predictions on the validation set.
    """
    params = {
        "objective": "regression",
        "metric": "mse",
        "boosting_type": "gbdt",
        "verbosity": -1,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "num_leaves": trial.suggest_int("num_leaves", 30, 300),
        "max_depth": trial.suggest_int("max_depth", 3, 16),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        # LightGBM only applies bagging_fraction when bagging_freq is non-zero
        # (its default is 0), so the fraction above would otherwise be tuned
        # without having any effect. Sampling it through the trial also puts it
        # into study.best_params, so the final model uses the same setting.
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 10.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 10.0),
    }

    lgb_train = lgb.Dataset(X_train_transformed, y_train)
    lgb_valid = lgb.Dataset(X_val_transformed, y_val, reference=lgb_train)

    # Stop when the validation metric has not improved for 50 rounds;
    # log_evaluation(period=0) silences the per-iteration log.
    model = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_valid],
        callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(period=0)],
    )
    preds = model.predict(X_val_transformed)
    return mean_squared_error(y_val, preds)

# Seed the sampler so that re-running the search proposes the same trials.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)  # 1 hour

# Persist the whole Optuna study (not a fitted model) so the best
# hyperparameters can be reloaded later without repeating the search.
with open(f'{base_path}\\lightgbm\\lgbm_model.pkl', 'wb') as f:
    pickle.dump(study, f)

# Unfitted regressor configured with the best hyperparameters found.
best_model = lgb.LGBMRegressor(**study.best_params)


In [12]:
# Read the pickled tuning result back from disk. The tuning cell writes the
# Optuna study to this file, so `best_model` holds a study (with .best_params),
# not a fitted model.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(f'{base_path}\\lightgbm\\lgbm_model.pkl', mode='rb') as study_file:
    best_model = pickle.load(study_file)

In [15]:
# Fit a LightGBM regressor with the tuned hyperparameters and predict all four
# test scenarios. `best_model` must expose `.best_params` (the loaded Optuna study).
model = lgb.LGBMRegressor(**best_model.best_params)

# y_train comes from pd.read_csv and is therefore a DataFrame; the recorded cell
# output shows scikit-learn's column_or_1d warning for a column-vector y.
# Flattening to a 1-D array passes the same values without the warning.
model.fit(X_train_transformed, y_train.to_numpy().ravel())

y_pred_uwarm_iwarm = model.predict(X_test_uwarm_iwarm_transformed)
y_pred_uwarm_icold = model.predict(X_test_uwarm_icold_transformed)
y_pred_ucold_iwarm = model.predict(X_test_ucold_iwarm_transformed)
y_pred_ucold_icold = model.predict(X_test_ucold_icold_transformed)


  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 8.770508 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2963
[LightGBM] [Info] Number of data points in the train set: 13534315, number of used features: 753
[LightGBM] [Info] Start training from score 3.858934






In [16]:
# Write the LightGBM predictions to disk: one CSV per test scenario, each with
# the columns RatingID (taken from the raw test set) and Rating (the prediction).

# Scenario: warm user, warm item
result_uwarm_iwarm = test_uwarm_iwarm[['RatingID']].assign(Rating=y_pred_uwarm_iwarm)
result_uwarm_iwarm.to_csv(
    f'{base_path}\\lightgbm\\lightgbm_warm_user_warm_item.csv',
    header=['RatingID', 'Rating'],
    index=False,
)

# Scenario: warm user, cold item
result_uwarm_icold = test_uwarm_icold[['RatingID']].assign(Rating=y_pred_uwarm_icold)
result_uwarm_icold.to_csv(
    f'{base_path}\\lightgbm\\lightgbm_warm_user_cold_item.csv',
    header=['RatingID', 'Rating'],
    index=False,
)

# Scenario: cold user, warm item
result_ucold_iwarm = test_ucold_iwarm[['RatingID']].assign(Rating=y_pred_ucold_iwarm)
result_ucold_iwarm.to_csv(
    f'{base_path}\\lightgbm\\lightgbm_cold_user_warm_item.csv',
    header=['RatingID', 'Rating'],
    index=False,
)

# Scenario: cold user, cold item
result_ucold_icold = test_ucold_icold[['RatingID']].assign(Rating=y_pred_ucold_icold)
result_ucold_icold.to_csv(
    f'{base_path}\\lightgbm\\lightgbm_cold_user_cold_item.csv',
    header=['RatingID', 'Rating'],
    index=False,
)

# XGBoost

In [None]:
# XGBoost hyperparameter tuning with Optuna.

def objective_xgb(trial):
    """Train one XGBoost booster for an Optuna trial and score it.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean squared error of the booster's predictions on the validation set.
    """
    params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "seed": 42,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "max_depth": trial.suggest_int("max_depth", 3, 16),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "lambda": trial.suggest_float("lambda", 0.0, 10.0),
        "alpha": trial.suggest_float("alpha", 0.0, 10.0),
    }

    xgb_train = xgb.DMatrix(X_train_transformed, label=y_train)
    xgb_valid = xgb.DMatrix(X_val_transformed, label=y_val)

    # At most 100 boosting rounds; stop early when the validation metric has
    # not improved for 50 rounds.
    model = xgb.train(
        params,
        xgb_train,
        num_boost_round=100,
        evals=[(xgb_valid, "validation")],
        early_stopping_rounds=50,
        verbose_eval=False,
    )
    preds = model.predict(xgb_valid)
    return mean_squared_error(y_val, preds)


# Seed the sampler so that re-running the search proposes the same trials
# (the booster itself is already seeded through params["seed"]).
study_xgb = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_xgb.optimize(objective_xgb, n_trials=50)

# Persist the whole Optuna study (not a fitted model) so the best
# hyperparameters can be reloaded later without repeating the search.
with open(f'{base_path}\\xgboost\\xgboost_model.pkl', 'wb') as f:
    pickle.dump(study_xgb, f)

# Unfitted regressor configured with the best hyperparameters found.
best_model_xgb = xgb.XGBRegressor(**study_xgb.best_params)

In [19]:
# Read the pickled tuning result back from disk. The tuning cell writes the
# Optuna study to this file, so `best_model` holds a study (with .best_params),
# not a fitted model.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(f'{base_path}\\xgboost\\xgboost_model.pkl', mode='rb') as study_file:
    best_model = pickle.load(study_file)

In [20]:
# Fit an XGBoost regressor with the tuned hyperparameters (fit returns the
# estimator itself), then predict each of the four test scenarios in turn.
model = xgb.XGBRegressor(**best_model.best_params).fit(X_train_transformed, y_train)

(
    y_pred_uwarm_iwarm,
    y_pred_uwarm_icold,
    y_pred_ucold_iwarm,
    y_pred_ucold_icold,
) = (
    model.predict(features)
    for features in (
        X_test_uwarm_iwarm_transformed,
        X_test_uwarm_icold_transformed,
        X_test_ucold_iwarm_transformed,
        X_test_ucold_icold_transformed,
    )
)

In [21]:
# Write the XGBoost predictions to disk: one CSV per test scenario, each with
# the columns RatingID (taken from the raw test set) and Rating (the prediction).

# Scenario: warm user, warm item
result_uwarm_iwarm = test_uwarm_iwarm[['RatingID']].assign(Rating=y_pred_uwarm_iwarm)
result_uwarm_iwarm.to_csv(
    f'{base_path}\\xgboost\\xgboost_warm_user_warm_item.csv',
    header=['RatingID', 'Rating'],
    index=False,
)

# Scenario: warm user, cold item
result_uwarm_icold = test_uwarm_icold[['RatingID']].assign(Rating=y_pred_uwarm_icold)
result_uwarm_icold.to_csv(
    f'{base_path}\\xgboost\\xgboost_warm_user_cold_item.csv',
    header=['RatingID', 'Rating'],
    index=False,
)

# Scenario: cold user, warm item
result_ucold_iwarm = test_ucold_iwarm[['RatingID']].assign(Rating=y_pred_ucold_iwarm)
result_ucold_iwarm.to_csv(
    f'{base_path}\\xgboost\\xgboost_cold_user_warm_item.csv',
    header=['RatingID', 'Rating'],
    index=False,
)

# Scenario: cold user, cold item
result_ucold_icold = test_ucold_icold[['RatingID']].assign(Rating=y_pred_ucold_icold)
result_ucold_icold.to_csv(
    f'{base_path}\\xgboost\\xgboost_cold_user_cold_item.csv',
    header=['RatingID', 'Rating'],
    index=False,
)
