In [55]:
import numpy as np
import pandas as pd
import time
import zipfile
import os

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.metrics import roc_auc_score

from xgboost import XGBRegressor
import xgboost as xgb
import lightgbm as lgb
from lightgbm import log_evaluation, early_stopping


# jupyter配置
pd.options.display.max_rows=1000 #Notebook 的一个cell的显示行数
pd.options.display.max_columns=10000#Notebook 的一个cell的显示列数

In [45]:
train_data = pd.read_csv('./data/train.csv')
train_data_y= train_data.loc[:, 'rating'] - 1
train_data_x = train_data.loc[:, ["user_id", "product_id"]] # "product_name", "votes", "helpful_votes"
test_data = pd.read_csv('./data/test.csv')
test_data = test_data.loc[:, ["user_id", "product_id"]]
X_train, X_test, y_train, y_test = train_test_split(train_data_x, train_data_y, test_size=0.3)


In [46]:
print(train_data.shape, X_train.shape, X_test.shape, y_train.shape, y_test.shape, test_data.shape)

(745889, 7) (522122, 2) (223767, 2) (522122,) (223767,) (223553, 2)


user_id       int64
product_id    int64
dtype: object

In [None]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
test_x_to_predict = xgb.DMatrix(test_data)

# 设置参数
params = {
    'objective': 'multi:softmax',
    'num_class': 5,
    'eta': 0.1,
    'reg_alpha': 0.01,
    'reg_lambda': 0.01,
    'max_depth': 8,
    "enable_categorical" : True
}

# 训练模型
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    early_stopping_rounds=10,
    num_boost_round=1000,
    evals=[(dtrain, 'train'), (dtest, 'test')] # 将训练数据和测试数据都作为验证集，可以实时监督训练情况，是否过拟合
)

# 预测结果
result = bst.predict(
    dtest
)
print('Accuracy of prediction on dataset:', accuracy_score(y_test, result))

# 提交submit
model = bst
res = model.predict(test_x_to_predict)
# res = np.array([np.argmax(l) for l in res])
submit = pd.DataFrame({'ID':range(len(res)), 'rating':res}).astype('int32')
submit.to_csv(f"./data/submit_example_A{time.strftime('%Y%m%d%H%M', time.localtime())}.csv",index=False, encoding='utf-8')

## chatgpt

In [3]:
## 数据处理
train_data = pd.read_csv('./data/train.csv')

# 计算用户特征
user_features = train_data.groupby('user_id').agg({
    'rating': ['mean', 'count'],
    'votes': ['mean', 'max'],
    'helpful_votes': ['mean', 'max']
}).reset_index()
user_features.columns = ['user_id', 'user_rating_mean', 'user_rating_count', 'user_votes_mean', 'user_votes_max', 
                         'user_helpful_votes_mean', 'user_helpful_votes_max']

# 计算产品特征
product_features = train_data.groupby('product_id').agg({
    'rating': ['mean', 'count'],
    'votes': ['mean', 'max'],
    'helpful_votes': ['mean', 'max']
}).reset_index()
product_features.columns = ['product_id', 'product_rating_mean', 'product_rating_count', 'product_votes_mean', 'product_votes_max',
                            'product_helpful_votes_mean', 'product_helpful_votes_max']

train_data = train_data.merge(user_features, on='user_id').merge(product_features, on='product_id')


In [56]:
# 定义特征和目标变量
# features = ['user_rating_mean', 'user_rating_count', 'user_votes_mean', 'user_votes_max', 'user_helpful_votes_mean', 'user_helpful_votes_max'] \
#          + ['product_id', 'product_rating_mean', 'product_rating_count', 'product_votes_mean', 'product_votes_max','product_helpful_votes_mean', 'product_helpful_votes_max']
features = ['user_rating_mean', 'user_votes_mean', 'product_rating_mean', 'product_votes_mean']

X = train_data[features]
y = train_data['rating']

# 划分训练集和验证集
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape)

# # 训练模型
# model = XGBRegressor()
# model.fit(X_train, y_train)

# # 验证模型
# y_pred = model.predict(X_val)
# print("MSE:", np.sqrt(mean_squared_error(y_val, y_pred)))

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_val, label=y_val)

# 设置参数
params = {
    'objective': 'reg:squarederror',  # 回归任务的损失函数
    'max_depth': 3,
    'learning_rate': 0.01,
    'n_estimators': 100,
    'eval_metric': 'rmse'  # 评估指标
}

# 训练模型
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    early_stopping_rounds=10,
    num_boost_round=10000,
    evals=[(dtrain, 'train'), (dtest, 'test')] # 将训练数据和测试数据都作为验证集，可以实时监督训练情况，是否过拟合
)

# 预测结果
result = bst.predict(
    dtest
)

print("RMSE:", np.sqrt(mean_squared_error(y_val, result)))

test_data = pd.read_csv('./data/test.csv')

# 生成用户和产品特征
test_data = test_data.merge(user_features, on='user_id', how='left').merge(product_features, on='product_id', how='left')

# 填充缺失值
test_data.fillna(0, inplace=True)  # 或者使用其他合适的填充方法

# 预测评分
X_test = test_data[features]
X_test = xgb.DMatrix(X_test)

test_data['rating'] = bst.predict(X_test).astype('int32')
submission = test_data[['ID', 'rating']]
file_name = f"./data/submit_example_A{time.strftime('%Y%m%d%H%M', time.localtime())}.csv"
file_name_zip = f"./data/submit_example_A{time.strftime('%Y%m%d%H%M', time.localtime())}.csv.zip"
with zipfile.ZipFile(file_name_zip, 'w') as z:
    z.write(file_name, os.path.basename(file_name))

submission.to_csv(f"./data/submit_example_A{time.strftime('%Y%m%d%H%M', time.localtime())}.csv",index=False, encoding='utf-8')

(596711, 4) (596711,) (149178, 4) (149178,)
[0]	train-rmse:1.06771	test-rmse:1.06925
[1]	train-rmse:1.06344	test-rmse:1.06498
[2]	train-rmse:1.05924	test-rmse:1.06077
[3]	train-rmse:1.05510	test-rmse:1.05664
[4]	train-rmse:1.05103	test-rmse:1.05255
[5]	train-rmse:1.04707	test-rmse:1.04858
[6]	train-rmse:1.04312	test-rmse:1.04463
[7]	train-rmse:1.03927	test-rmse:1.04076
[8]	train-rmse:1.03543	test-rmse:1.03692
[9]	train-rmse:1.03167	test-rmse:1.03315
[10]	train-rmse:1.02795	test-rmse:1.02943
[11]	train-rmse:1.02432	test-rmse:1.02578
[12]	train-rmse:1.02071	test-rmse:1.02217
[13]	train-rmse:1.01714	test-rmse:1.01861
[14]	train-rmse:1.01367	test-rmse:1.01512
[15]	train-rmse:1.01023	test-rmse:1.01169
[16]	train-rmse:1.00686	test-rmse:1.00831
[17]	train-rmse:1.00352	test-rmse:1.00497


Parameters: { "n_estimators" } are not used.



[18]	train-rmse:1.00024	test-rmse:1.00168
[19]	train-rmse:0.99701	test-rmse:0.99845
[20]	train-rmse:0.99382	test-rmse:0.99525
[21]	train-rmse:0.99067	test-rmse:0.99211
[22]	train-rmse:0.98759	test-rmse:0.98901
[23]	train-rmse:0.98455	test-rmse:0.98598
[24]	train-rmse:0.98156	test-rmse:0.98298
[25]	train-rmse:0.97860	test-rmse:0.98003
[26]	train-rmse:0.97570	test-rmse:0.97712
[27]	train-rmse:0.97283	test-rmse:0.97426
[28]	train-rmse:0.97001	test-rmse:0.97143
[29]	train-rmse:0.96722	test-rmse:0.96865
[30]	train-rmse:0.96448	test-rmse:0.96591
[31]	train-rmse:0.96179	test-rmse:0.96323
[32]	train-rmse:0.95913	test-rmse:0.96057
[33]	train-rmse:0.95652	test-rmse:0.95795
[34]	train-rmse:0.95394	test-rmse:0.95538
[35]	train-rmse:0.95142	test-rmse:0.95287
[36]	train-rmse:0.94892	test-rmse:0.95037
[37]	train-rmse:0.94646	test-rmse:0.94791
[38]	train-rmse:0.94405	test-rmse:0.94552
[39]	train-rmse:0.94166	test-rmse:0.94312
[40]	train-rmse:0.93932	test-rmse:0.94079
[41]	train-rmse:0.93702	test-rmse:

In [21]:
test_data = pd.read_csv('./data/test.csv')

# 生成用户和产品特征
test_data = test_data.merge(user_features, on='user_id', how='left').merge(product_features, on='product_id', how='left')

# 填充缺失值
test_data.fillna(0, inplace=True)  # 或者使用其他合适的填充方法

# 预测评分
X_test = test_data[features]
test_data['rating'] = model.predict(X_test).astype('int32')

In [22]:
# submission = test_data[['ID', 'rating']]
# submission.to_csv(f"./data/submit_example_A{time.strftime('%Y%m%d%H%M', time.localtime())}.csv",index=False, encoding='utf-8')

In [29]:

# 提交submit
model = bst
res = model.predict(test_x_to_predict)
# res = np.array([np.argmax(l) for l in res])
submit = pd.DataFrame({'ID':range(len(res)), 'rating':res}).astype('int32')
submit.to_csv(f"./data/submit_example_A{time.strftime('%Y%m%d%H%M', time.localtime())}.csv",index=False, encoding='utf-8')


In [30]:
a

Unnamed: 0,A,B,C,D
2013-01-01,0.278168,0.291133,0.5955,0.313076
2013-01-02,0.539227,0.100815,0.507844,0.687841
2013-01-03,0.884767,0.014835,0.471557,0.517234
2013-01-04,0.660014,0.732694,0.53235,0.810849
2013-01-05,0.049899,0.896792,0.585958,0.967934
2013-01-06,0.78748,0.615577,0.382548,0.041803


In [34]:
a[['A','B']].astype('int32')

Unnamed: 0,A,B
2013-01-01,0,0
2013-01-02,0,0
2013-01-03,0,0
2013-01-04,0,0
2013-01-05,0,0
2013-01-06,0,0
