In [3]:
import numpy as np
import pandas as pd
import time

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.metrics import roc_auc_score

import xgboost as xgb
import lightgbm as lgb
from lightgbm import log_evaluation, early_stopping


# jupyter配置
pd.options.display.max_rows=1000 #Notebook 的一个cell的显示行数
pd.options.display.max_columns=10000#Notebook 的一个cell的显示列数

In [42]:
train_data = pd.read_csv('./data/train.csv')
train_data_y= train_data.loc[:, 'rating'] - 1
train_data_x = train_data.loc[:, ["user_id", "product_id", "votes", "helpful_votes"]] # "product_name", 
test_data = pd.read_csv('./data/test.csv')
test_data = test_data.loc[:, ["user_id", "product_id"]]
X_train, X_test, y_train, y_test = train_test_split(train_data_x, train_data_y, test_size=0.3)


In [None]:
print(train_data.shape, X_train.shape, X_test.shape, y_train.shape, y_test.shape, test_data.shape)

(745889, 7) (522122, 4) (223767, 4) (522122,) (223767,) (223553, 2)


In [34]:
train_data_x.describe()
train_data_x.dtypes
test_data.dtypes

user_id       int64
product_id    int64
dtype: object

In [41]:
train_data_y - 1

0         4
1         4
2         4
3         3
4         4
         ..
745884    0
745885    4
745886    3
745887    4
745888    4
Name: rating, Length: 745889, dtype: int64

In [43]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
test_x_to_predict = xgb.DMatrix(test_data)

# 设置参数
params = {
    'objective': 'multi:softmax',
    'num_class': 5,
    'eta': 0.1,
    'reg_alpha': 0.01,
    'reg_lambda': 0.01,
    'max_depth': 8,
    "enable_categorical" : True
}

# 训练模型
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    early_stopping_rounds=10,
    num_boost_round=200,
    evals=[(dtrain, 'train'), (dtest, 'test')] # 将训练数据和测试数据都作为验证集，可以实时监督训练情况，是否过拟合
)

# 预测结果
result = bst.predict(
    dtest
)
print('Accuracy of prediction on dataset:', accuracy_score(y_test, result))

# 提交submit
model = bst
res = model.predict(test_x_to_predict)
# res = np.array([np.argmax(l) for l in res])
submit = pd.DataFrame({'id':range(len(res)), 'label':res}).astype('int32')
submit.to_csv(f"./data/submit_example_A{time.strftime('%Y%m%d%H%M', time.localtime())}.csv",index=False, encoding='utf-8')

[0]	train-mlogloss:1.54141	test-mlogloss:1.54187
[1]	train-mlogloss:1.48484	test-mlogloss:1.48572


Parameters: { "enable_categorical" } are not used.



[2]	train-mlogloss:1.43680	test-mlogloss:1.43808
[3]	train-mlogloss:1.39560	test-mlogloss:1.39724
[4]	train-mlogloss:1.35989	test-mlogloss:1.36191
[5]	train-mlogloss:1.32873	test-mlogloss:1.33110
[6]	train-mlogloss:1.30148	test-mlogloss:1.30419
[7]	train-mlogloss:1.27742	test-mlogloss:1.28048
[8]	train-mlogloss:1.25618	test-mlogloss:1.25956
[9]	train-mlogloss:1.23726	test-mlogloss:1.24096
[10]	train-mlogloss:1.22036	test-mlogloss:1.22442
[11]	train-mlogloss:1.20532	test-mlogloss:1.20972
[12]	train-mlogloss:1.19179	test-mlogloss:1.19651
[13]	train-mlogloss:1.17962	test-mlogloss:1.18466
[14]	train-mlogloss:1.16873	test-mlogloss:1.17409
[15]	train-mlogloss:1.15892	test-mlogloss:1.16459
[16]	train-mlogloss:1.15004	test-mlogloss:1.15603
[17]	train-mlogloss:1.14191	test-mlogloss:1.14820
[18]	train-mlogloss:1.13461	test-mlogloss:1.14122
[19]	train-mlogloss:1.12785	test-mlogloss:1.13478
[20]	train-mlogloss:1.12174	test-mlogloss:1.12898
[21]	train-mlogloss:1.11618	test-mlogloss:1.12372
[22]	tra

ValueError: feature_names mismatch: ['user_id', 'product_id', 'votes', 'helpful_votes'] ['user_id', 'product_id']
expected helpful_votes, votes in input data

In [14]:
train_x

Unnamed: 0,user_id,product_id,product_name,votes,helpful_votes
0,1813,154533,Beautiful Thing,10,8
1,1944,192838,Almost Famous,4,2
2,534,202590,A Clockwork Orange,5,5
3,1811,140456,Great Expectations (Wordsworth Classics),1,0
4,102,154278,Phenomenon,0,0
...,...,...,...,...,...
745884,1853,25651,"Star by Star (Star Wars: The New Jedi Order, B...",1,1
745885,1556,19657,"NIV Study Bible, Personal Size Indexed",13,11
745886,1737,157371,An Ideal Husband,1,1
745887,1916,24941,The Autobiography of Malcolm X (As Told to Ale...,1,1
