In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from matplotlib import pyplot as plt
from collections import defaultdict
import random
import csv
import os
import xgboost as xgb
from sklearn.metrics import mean_squared_error

In [3]:
train_file = 'train_set.csv'
valid_file = 'valid_set.csv'
seen_test_file = 'seen_test_set.csv'
unseen_test_file = 'unseen_test_set.csv'
test_file = 'test_set.csv'

def get_all_data(file_name):
    df = pd.read_csv(file_name, sep=',')
    features = []
    labels = []

    # iterate through the whole dataset
    for _, row in df.iterrows():
        user_avg_rating = row['user_avg_rating']
        item_avg_rating = row['item_avg_rating']
        rating = row['rating']

        size = int(row['size_idx'])
        fit = int(row['fit_idx'])
        user_attr = int(row['user_attr_idx'])
        model_attr = int(row['model_attr_idx'])
        category = int(row['category_idx'])
        brand = int(row['brand_idx'])
        year = int(row['year_idx'])
        split = int(row['split_idx'])

        features.append([user_avg_rating, item_avg_rating,
                        size, fit,
                        user_attr, model_attr,
                        category, brand,
                        year, split])
        
        labels.append(rating)

    return features, labels

X_train, y_train = get_all_data(train_file)
X_valid, y_valid = get_all_data(valid_file)

In [4]:
print(len(X_train), len(X_train[0]))
print(len(X_valid), len(X_valid[0]))

84811 10
4547 10


In [7]:
# Convert the data into DMatrix format, which is optimized for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)

# define the evaluation sets: training and validation sets
evals = [(dtrain, 'train'), (dvalid, 'eval')]

In [10]:
# tunable parameters of XGBoost
params = {
    'objective': 'reg:squarederror',  # For regression tasks
    'eval_metric': 'rmse',  # Use RMSE as evaluation metric
    'eta': 0.1,  # Learning rate
    'max_depth': 6,  # Maximum depth of trees
    'subsample': 0.8,  # Subsampling ratio
    'colsample_bytree': 0.8  # Column sampling ratio
}

num_round = 1000  # Max number of boosting rounds

In [11]:
# Train the model with early stopping
bst = xgb.train(
    params,
    dtrain,
    num_round,
    evals=evals,
    early_stopping_rounds=50,  # Stop after 50 rounds of no improvement
    verbose_eval=True  # Print evaluation results during training
)

[0]	train-rmse:1.01505	eval-rmse:1.06769
[1]	train-rmse:0.95868	eval-rmse:1.05005
[2]	train-rmse:0.91247	eval-rmse:1.03925
[3]	train-rmse:0.87058	eval-rmse:1.02642
[4]	train-rmse:0.83423	eval-rmse:1.01391
[5]	train-rmse:0.80751	eval-rmse:1.01068
[6]	train-rmse:0.79451	eval-rmse:0.99930
[7]	train-rmse:0.76930	eval-rmse:0.99002
[8]	train-rmse:0.74837	eval-rmse:0.98292
[9]	train-rmse:0.74041	eval-rmse:0.97572
[10]	train-rmse:0.73796	eval-rmse:0.97350
[11]	train-rmse:0.73595	eval-rmse:0.97146
[12]	train-rmse:0.72115	eval-rmse:0.96834
[13]	train-rmse:0.70877	eval-rmse:0.96582
[14]	train-rmse:0.69714	eval-rmse:0.96117
[15]	train-rmse:0.68763	eval-rmse:0.95775
[16]	train-rmse:0.68056	eval-rmse:0.95622
[17]	train-rmse:0.67387	eval-rmse:0.95340
[18]	train-rmse:0.66815	eval-rmse:0.95015
[19]	train-rmse:0.66358	eval-rmse:0.94819
[20]	train-rmse:0.66167	eval-rmse:0.94621
[21]	train-rmse:0.65794	eval-rmse:0.94467
[22]	train-rmse:0.65486	eval-rmse:0.94307
[23]	train-rmse:0.65222	eval-rmse:0.94169
[2

In [None]:
# after training, get the best iteration
best_iteration = bst.best_iteration
print(f"best iteration: {best_iteration}")

Best iteration: 85


In [19]:
# evaluate on test set
print("evalute on test set (seen test + unseen test)")
X_test, y_test = get_all_data(test_file)
# convert to DMatrix format
dtest = xgb.DMatrix(X_test, label=y_test)

y_pred = bst.predict(dtest)

test_mse = mean_squared_error(y_test, y_pred)
print(f"test_mse: {test_mse:.4f}")


evalute on test set (seen test + unseen test)
test_mse: 1.0201
