In [55]:
import numpy as np
import pandas as pd
import time

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.metrics import roc_auc_score

import xgboost as xgb
import lightgbm as lgb
from lightgbm import log_evaluation, early_stopping


# jupyter配置
pd.options.display.max_rows=1000 #Notebook 的一个cell的显示行数
pd.options.display.max_columns=10000#Notebook 的一个cell的显示列数

In [45]:
train_data = pd.read_csv('./data/train.csv')
train_data_y= train_data.loc[:, 'rating'] - 1
train_data_x = train_data.loc[:, ["user_id", "product_id"]] # "product_name", "votes", "helpful_votes"
test_data = pd.read_csv('./data/test.csv')
test_data = test_data.loc[:, ["user_id", "product_id"]]
X_train, X_test, y_train, y_test = train_test_split(train_data_x, train_data_y, test_size=0.3)


In [46]:
print(train_data.shape, X_train.shape, X_test.shape, y_train.shape, y_test.shape, test_data.shape)

(745889, 7) (522122, 2) (223767, 2) (522122,) (223767,) (223553, 2)


In [53]:
train_data_x.describe()
train_data_x.dtypes
test_data.dtypes

user_id       int64
product_id    int64
dtype: object

In [59]:
y_train

324619    4
583405    1
469608    1
137583    2
250603    4
         ..
347863    2
17546     4
548894    3
347802    4
219883    2
Name: rating, Length: 522122, dtype: int64

In [62]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
test_x_to_predict = xgb.DMatrix(test_data)

# 设置参数
params = {
    'objective': 'multi:softmax',
    'num_class': 5,
    'eta': 0.1,
    'reg_alpha': 0.01,
    'reg_lambda': 0.01,
    'max_depth': 8,
    "enable_categorical" : True
}

# 训练模型
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    early_stopping_rounds=10,
    num_boost_round=1000,
    evals=[(dtrain, 'train'), (dtest, 'test')] # 将训练数据和测试数据都作为验证集，可以实时监督训练情况，是否过拟合
)

# 预测结果
result = bst.predict(
    dtest
)
print('Accuracy of prediction on dataset:', accuracy_score(y_test, result))

# 提交submit
model = bst
res = model.predict(test_x_to_predict)
# res = np.array([np.argmax(l) for l in res])
submit = pd.DataFrame({'ID':range(len(res)), 'rating':res}).astype('int32')
submit.to_csv(f"./data/submit_example_A{time.strftime('%Y%m%d%H%M', time.localtime())}.csv",index=False, encoding='utf-8')

[0]	train-mlogloss:1.54732	test-mlogloss:1.54743
[1]	train-mlogloss:1.49565	test-mlogloss:1.49593


Parameters: { "enable_categorical" } are not used.



[2]	train-mlogloss:1.45189	test-mlogloss:1.45232
[3]	train-mlogloss:1.41450	test-mlogloss:1.41506
[4]	train-mlogloss:1.38223	test-mlogloss:1.38295
[5]	train-mlogloss:1.35412	test-mlogloss:1.35499
[6]	train-mlogloss:1.32963	test-mlogloss:1.33066
[7]	train-mlogloss:1.30821	test-mlogloss:1.30940
[8]	train-mlogloss:1.28941	test-mlogloss:1.29078
[9]	train-mlogloss:1.27285	test-mlogloss:1.27439
[10]	train-mlogloss:1.25817	test-mlogloss:1.25990
[11]	train-mlogloss:1.24519	test-mlogloss:1.24710
[12]	train-mlogloss:1.23367	test-mlogloss:1.23577
[13]	train-mlogloss:1.22338	test-mlogloss:1.22569
[14]	train-mlogloss:1.21422	test-mlogloss:1.21674
[15]	train-mlogloss:1.20606	test-mlogloss:1.20879
[16]	train-mlogloss:1.19871	test-mlogloss:1.20163
[17]	train-mlogloss:1.19221	test-mlogloss:1.19531
[18]	train-mlogloss:1.18632	test-mlogloss:1.18960
[19]	train-mlogloss:1.18105	test-mlogloss:1.18453
[20]	train-mlogloss:1.17629	test-mlogloss:1.17996
[21]	train-mlogloss:1.17196	test-mlogloss:1.17584
[22]	tra