In [32]:
import numpy as np
import pandas as pd
import time

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

from xgboost import XGBClassifier
import xgboost as xgb
from lightgbm import log_evaluation, early_stopping

from utils import save_zip_file

# Notebook display configuration: raise pandas' truncation limits so that
# large frames are rendered without being elided in a cell's output.
pd.set_option("display.max_rows", 1000)       # maximum rows shown per cell output
pd.set_option("display.max_columns", 10000)   # maximum columns shown per cell output

## chatgpt

In [9]:
# Load the preprocessed training set and shift `rating` down by one in a
# single chained expression. The shift makes the labels 0-based (presumably
# the raw ratings are 1..5 -- TODO confirm against the data); the prediction
# cell adds the 1 back before writing the submission.
train_data = (
    pd.read_csv('./data/train_processed.csv')
    .assign(rating=lambda frame: frame['rating'] - 1)
)

In [25]:
# Run configuration.
# When True, the feature matrix is min-max normalised before training and
# before prediction.
# NOTE(review): the name is a misspelling of "enable_norm"; it is kept as-is
# because the training and prediction cells reference it by this exact name.
enbale_norm: bool = True

In [30]:
# Define the feature columns and the target variable.
# Features are per-user aggregates followed by the product id and
# per-product aggregates.
features = ['user_rating_mean', 'user_rating_count', 'user_votes_mean', 'user_votes_max', 'user_helpful_votes_mean', 'user_helpful_votes_max'] \
         + ['product_id', 'product_rating_mean', 'product_rating_count', 'product_votes_mean', 'product_votes_max','product_helpful_votes_mean', 'product_helpful_votes_max']
# Smaller alternative feature set kept for experimentation:
# features = ['user_rating_mean', 'user_votes_mean', 'product_rating_mean', 'product_votes_mean']

X = train_data[features]
y = train_data['rating']

# Split into training and validation sets BEFORE any scaling, so that the
# scaler statistics are computed from training rows only and nothing about
# the validation rows leaks into preprocessing.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

if enbale_norm:
    print("============训练数据进行归一化============")
    # Fit the scaler on the training split only, then apply the same fitted
    # transform to the validation split. `scaler` is kept at notebook scope
    # so the prediction cell can apply the identical mapping to test data.
    scaler = MinMaxScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_val, label=y_val)  # validation split (name kept for compatibility)

# Booster parameters.
params = {
    'objective': 'multi:softmax',  # predict() returns the class index directly
    'num_class': 5,
    'max_depth': 3,
    'learning_rate': 0.1,
    'eval_metric': 'mlogloss'  # evaluation metric
}

# Train the model.
# Both splits are passed as evaluation sets so train/validation loss can be
# compared round by round to spot over-fitting. Early stopping monitors the
# LAST entry of `evals`, i.e. the validation split.
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    early_stopping_rounds=20,
    num_boost_round=10000,
    evals=[(dtrain, 'train'), (dtest, 'test')]
)

# Predict on the validation split.
# xgb.train returns the booster from the last round, not the best one, so
# restrict prediction to the trees up to and including the best iteration.
result = bst.predict(
    dtest,
    iteration_range=(0, bst.best_iteration + 1)
)
print(result)
# RMSE between predicted and true class indices; both are shifted by the same
# constant relative to the original ratings, so the value is unaffected.
print("RMSE:", np.sqrt(mean_squared_error(y_val, result)))



(596711, 13) (596711,) (149178, 13) (149178,)
[0]	train-mlogloss:1.50801	test-mlogloss:1.50835
[1]	train-mlogloss:1.42696	test-mlogloss:1.42760
[2]	train-mlogloss:1.35917	test-mlogloss:1.35998
[3]	train-mlogloss:1.30146	test-mlogloss:1.30244
[4]	train-mlogloss:1.25148	test-mlogloss:1.25260
[5]	train-mlogloss:1.20851	test-mlogloss:1.20975
[6]	train-mlogloss:1.17023	test-mlogloss:1.17154
[7]	train-mlogloss:1.13679	test-mlogloss:1.13818
[8]	train-mlogloss:1.10685	test-mlogloss:1.10832
[9]	train-mlogloss:1.08020	test-mlogloss:1.08173
[10]	train-mlogloss:1.05687	test-mlogloss:1.05849
[11]	train-mlogloss:1.03559	test-mlogloss:1.03727
[12]	train-mlogloss:1.01644	test-mlogloss:1.01817
[13]	train-mlogloss:0.99950	test-mlogloss:1.00131
[14]	train-mlogloss:0.98393	test-mlogloss:0.98578
[15]	train-mlogloss:0.97015	test-mlogloss:0.97207
[16]	train-mlogloss:0.95739	test-mlogloss:0.95938
[17]	train-mlogloss:0.94601	test-mlogloss:0.94808
[18]	train-mlogloss:0.93551	test-mlogloss:0.93762
[19]	train-mlo

<xgboost.core.DMatrix at 0x16b42a760>

In [22]:
# Predict ratings for the test set.
test_data = pd.read_csv('./data/test_processed.csv')
X_test = test_data[features]

if enbale_norm:
    print("============测试数据进行归一化============")
    # Reuse the scaler fitted in the training cell. Fitting a fresh
    # MinMaxScaler on the test data would map the features onto a different
    # scale from the one the model's split thresholds were learned on.
    X_test = scaler.transform(X_test)
X_test = xgb.DMatrix(X_test)

# Predict with the trees up to the best early-stopping iteration (consistent
# with validation), then undo the "rating - 1" shift applied to the training
# labels so the submitted ratings are on the original scale.
test_data['rating'] = bst.predict(
    X_test,
    iteration_range=(0, bst.best_iteration + 1)
).astype('int32') + 1
submission = test_data[['ID', 'rating']]

In [23]:
# Write the submission CSV and zip it.
# The timestamped path is built exactly once and reused: calling
# time.strftime separately for each path can yield different names if the
# minute rolls over between calls, leaving the zip step pointing at a file
# that was never written.
file_name = f"./data/submit_example_A{time.strftime('%Y%m%d%H%M', time.localtime())}.csv"
submission.to_csv(file_name, index=False, encoding='utf-8')
save_zip_file(file_name, f"{file_name}.zip")
