In [2]:
import csv
import random
import pickle
import os
from time import time
from tqdm import tqdm
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from collections import defaultdict
import torch
import torch.nn as nn

import xgboost as xgb
from sklearn.metrics import mean_squared_error


In [3]:
def get_data(csv_file):
    df = pd.read_csv(csv_file, sep=',')
    features = []
    labels = []

    for _, row in df.iterrows():
        rating = row['rating']
        user_avg_rating = row['user_avg_rating']
        recipe_avg_rating = row['recipe_avg_rating']

        n_steps = row['n_steps']
        n_ingredients = row['n_ingredients']
        minutes = row['minutes']
        n_recipe_words = row['n_recipe_words']

        n_review_words = row['n_review_words']
        n_positive = row['n_positive']
        n_negative = row['n_negative']
        n_exclamation = row['n_exclamation']

        features.append([user_avg_rating, recipe_avg_rating,
                       n_steps, n_ingredients, minutes, n_recipe_words,
                       n_review_words, n_positive, n_negative, n_exclamation])
        
        labels.append(rating)
    
    return features, labels

X_train, y_train = get_data('train.csv')
X_valid, y_valid = get_data('valid.csv')

In [4]:
print(len(X_train), len(X_train[0]))
print(len(X_valid), len(X_valid[0]))

20000 10
7023 10


In [5]:
# Convert the data into DMatrix format, which is optimized for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)

# define the evaluation sets: training and validation sets
evals = [(dtrain, 'train'), (dvalid, 'eval')]

In [6]:
# tunable parameters of XGBoost
params = {
    'objective': 'reg:squarederror',  # For regression tasks
    'eval_metric': 'rmse',  # Use RMSE as evaluation metric
    'eta': 0.1,  # Learning rate
    'max_depth': 6,  # Maximum depth of trees
    'subsample': 0.8,  # Subsampling ratio
    'colsample_bytree': 0.8  # Column sampling ratio
}

num_round = 1000  # Max number of boosting rounds

In [7]:
# Train the model with early stopping
start_time = time()

bst = xgb.train(
    params,
    dtrain,
    num_round,
    evals=evals,
    early_stopping_rounds=50,  # Stop after 50 rounds of no improvement
    verbose_eval=True  # Print evaluation results during training
)

end_time = time()

[0]	train-rmse:0.87734	eval-rmse:1.34280
[1]	train-rmse:0.80033	eval-rmse:1.33541
[2]	train-rmse:0.73204	eval-rmse:1.33053
[3]	train-rmse:0.67085	eval-rmse:1.32522
[4]	train-rmse:0.61652	eval-rmse:1.32488
[5]	train-rmse:0.58657	eval-rmse:1.32383
[6]	train-rmse:0.54643	eval-rmse:1.31190
[7]	train-rmse:0.50625	eval-rmse:1.31122
[8]	train-rmse:0.47025	eval-rmse:1.31109
[9]	train-rmse:0.44326	eval-rmse:1.30943
[10]	train-rmse:0.42001	eval-rmse:1.30443
[11]	train-rmse:0.40033	eval-rmse:1.30252
[12]	train-rmse:0.37819	eval-rmse:1.30410
[13]	train-rmse:0.35955	eval-rmse:1.30477
[14]	train-rmse:0.34297	eval-rmse:1.30653
[15]	train-rmse:0.32819	eval-rmse:1.30753
[16]	train-rmse:0.31649	eval-rmse:1.30922
[17]	train-rmse:0.30558	eval-rmse:1.31138
[18]	train-rmse:0.29612	eval-rmse:1.31315
[19]	train-rmse:0.28817	eval-rmse:1.31511
[20]	train-rmse:0.28301	eval-rmse:1.31450
[21]	train-rmse:0.27592	eval-rmse:1.31610
[22]	train-rmse:0.26935	eval-rmse:1.31735
[23]	train-rmse:0.26498	eval-rmse:1.31864
[2

In [8]:
# after training, get the best iteration
best_iteration = bst.best_iteration
print(f"best iteration: {best_iteration}")
print(f"training time: {end_time - start_time}")

best iteration: 11
training time: 0.16177916526794434


In [9]:
# evaluate on test set
print("evalute on test set")
X_test, y_test = get_data('test.csv')
# convert to DMatrix format
dtest = xgb.DMatrix(X_test, label=y_test)

start_time = time()

y_pred = bst.predict(dtest)

end_time = time()

test_mse = mean_squared_error(y_test, y_pred)
print(f"test_mse: {test_mse:.4f}")
print(f"evaluation time: {end_time - start_time}")

evalute on test set
test_mse: 1.9453
evaluation time: 0.0


In [None]:
# try some new hyper-parameters
# smaller learning rate and smaller tree depths

# tunable parameters of XGBoost
params = {
    'objective': 'reg:squarederror',  # For regression tasks
    'eval_metric': 'rmse',  # Use RMSE as evaluation metric
    'eta': 0.01,  # Learning rate
    'max_depth': 3,  # Maximum depth of trees
    'subsample': 0.8,  # Subsampling ratio
    'colsample_bytree': 0.8  # Column sampling ratio
}

num_round = 1000  # Max number of boosting rounds

# Train the model with early stopping
start_time = time()

bst = xgb.train(
    params,
    dtrain,
    num_round,
    evals=evals,
    early_stopping_rounds=50,  # Stop after 50 rounds of no improvement
    verbose_eval=True  # Print evaluation results during training
)

end_time = time()

[0]	train-rmse:0.95580	eval-rmse:1.34616
[1]	train-rmse:0.94803	eval-rmse:1.34473
[2]	train-rmse:0.94037	eval-rmse:1.34337
[3]	train-rmse:0.93274	eval-rmse:1.34372
[4]	train-rmse:0.92526	eval-rmse:1.34400
[5]	train-rmse:0.92055	eval-rmse:1.34256
[6]	train-rmse:0.91351	eval-rmse:1.34227
[7]	train-rmse:0.90614	eval-rmse:1.34235
[8]	train-rmse:0.89888	eval-rmse:1.34247
[9]	train-rmse:0.89211	eval-rmse:1.34257
[10]	train-rmse:0.88542	eval-rmse:1.34287
[11]	train-rmse:0.87885	eval-rmse:1.34292
[12]	train-rmse:0.87191	eval-rmse:1.34296
[13]	train-rmse:0.86506	eval-rmse:1.34191
[14]	train-rmse:0.85826	eval-rmse:1.34197
[15]	train-rmse:0.85151	eval-rmse:1.34204
[16]	train-rmse:0.84489	eval-rmse:1.34213
[17]	train-rmse:0.83829	eval-rmse:1.34223
[18]	train-rmse:0.83179	eval-rmse:1.34121
[19]	train-rmse:0.82536	eval-rmse:1.34154
[20]	train-rmse:0.81945	eval-rmse:1.34164
[21]	train-rmse:0.81315	eval-rmse:1.34047
[22]	train-rmse:0.80698	eval-rmse:1.34080
[23]	train-rmse:0.80081	eval-rmse:1.34085
[2

In [12]:
# after training, get the best iteration
best_iteration = bst.best_iteration
print(f"best iteration: {best_iteration}")
print(f"training time: {end_time - start_time}")

best iteration: 172
training time: 0.5092024803161621


In [13]:
# evaluate on test set
print("evalute on test set")
X_test, y_test = get_data('test.csv')
# convert to DMatrix format
dtest = xgb.DMatrix(X_test, label=y_test)

start_time = time()

y_pred = bst.predict(dtest)

end_time = time()

test_mse = mean_squared_error(y_test, y_pred)
print(f"test_mse: {test_mse:.4f}")
print(f"evaluation time: {end_time - start_time}")

evalute on test set
test_mse: 1.9158
evaluation time: 0.0
