# 终于用这个脚本进行训练

In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import StandardScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.metrics import roc_auc_score




import xgboost as xgb
import lightgbm as lgb
from lightgbm import log_evaluation, early_stopping


# Jupyter display configuration: raise pandas' output limits for notebook cells.
pd.set_option('display.max_rows', 1000)      # max rows shown in one cell output
pd.set_option('display.max_columns', 10000)  # max columns shown in one cell output

In [27]:
# Load the training features and labels from the local .npy files.
train_data_x, train_data_y = (
    np.load(npy_path)
    for npy_path in ('./train_data/train_x.npy', './train_data/train_y.npy')
)
# Load the test set "A" features that predictions are later generated for.
test_x_A = np.load("./test_data/test_x_A.npy")

(37549,)

## 数据处理

In [19]:
# Simplified first pass: flatten every sample into a single row so that its two
# observed signals are concatenated into one feature vector (per the original
# note, 180 + 180 values per sample).
train_data_x_reshape = np.reshape(train_data_x, (len(train_data_x), -1))
test_x_A_reshape = np.reshape(test_x_A, (len(test_x_A), -1))

In [62]:
# Feature engineering: append per-sample summary statistics of each signal to
# the flattened raw values, wrap everything in DataFrames, and carve out a
# hold-out split for validation.

# Width of one signal inside a flattened row: columns [0, SIGNAL_LEN) are
# treated as the oxygen signal and columns [SIGNAL_LEN, end) as the heart signal.
SIGNAL_LEN = 180


def _row_stats(signal):
    """Return per-row (max, min, mean, std, var) of a 2-D array.

    Every statistic is reduced along axis 1 and reshaped to an (n_rows, 1)
    column vector so the results can be concatenated as feature columns.
    """
    reducers = (np.max, np.min, np.mean, np.std, np.var)
    return tuple(fn(signal, axis=1).reshape(-1, 1) for fn in reducers)


# Extremes, mean, standard deviation and variance of each signal, per sample.
(train_max_oxygen, train_min_oxygen, train_mean_oxygen,
 train_std_oxygen, train_var_oxygen) = _row_stats(train_data_x_reshape[:, :SIGNAL_LEN])
(train_max_heart, train_min_heart, train_mean_heart,
 train_std_heart, train_var_heart) = _row_stats(train_data_x_reshape[:, SIGNAL_LEN:])

(test_max_oxygen, test_min_oxygen, test_mean_oxygen,
 test_std_oxygen, test_var_oxygen) = _row_stats(test_x_A_reshape[:, :SIGNAL_LEN])
(test_max_heart, test_min_heart, test_mean_heart,
 test_std_heart, test_var_heart) = _row_stats(test_x_A_reshape[:, SIGNAL_LEN:])

# Column order here must stay in sync with train_statistic_name below.
train_statistic = np.concatenate((train_max_oxygen, train_min_oxygen, train_max_heart, train_min_heart,
                                  train_mean_oxygen, train_mean_heart, train_std_oxygen, train_std_heart,
                                  train_var_oxygen, train_var_heart), axis=1)
train_statistic_name = ["max_oxygen", "min_oxygen", "max_heart", "min_heart",
                        "mean_oxygen", "mean_heart", "std_oxygen", "std_heart",
                        "var_oxygen", "var_heart"]
test_statistic = np.concatenate((test_max_oxygen, test_min_oxygen, test_max_heart, test_min_heart,
                                 test_mean_oxygen, test_mean_heart, test_std_oxygen, test_std_heart,
                                 test_var_oxygen, test_var_heart), axis=1)

# Final feature matrices: raw flattened signals followed by the statistics.
train_data = np.concatenate((train_data_x_reshape, train_statistic), axis=1)
test_submit = np.concatenate((test_x_A_reshape, test_statistic), axis=1)

feat_name = (
    [f'oxygen_{x}' for x in range(SIGNAL_LEN)]
    + [f'heart_{x}' for x in range(SIGNAL_LEN)]
    + train_statistic_name
)
df_train_x = pd.DataFrame(train_data, columns=feat_name)
df_train_y = pd.DataFrame(train_data_y.reshape(-1, 1), columns=['label'])
df_test = pd.DataFrame(test_submit)

# Hold out 30% of the labelled data for validation. random_state is fixed so
# the split (and every metric computed from it) is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_train_x.values, df_train_y.values, test_size=0.3, random_state=42
)
print("train_x.shape", X_train.shape)
print("test_x.shape", X_test.shape)
print("train_y.shape", y_train.shape)
print("test_y.shape", y_test.shape)
print("test_x_A.shape", df_test.shape)

train_x.shape (26284, 370)
test_x.shape (11265, 370)
train_y.shape (26284, 1)
test_y.shape (11265, 1)
test_x_A.shape (1155, 370)


## 模型训练

### xgboost

In [64]:
# Train a 3-class XGBoost model on the hold-out split, report its accuracy,
# collect gain-based feature importances, and write a submission CSV.

# Wrap the arrays in DMatrix objects, attaching feature names so importances
# are reported by name.
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=feat_name)
dtest = xgb.DMatrix(X_test, label=y_test, feature_names=feat_name)
test_x_to_predict = xgb.DMatrix(df_test.values, feature_names=feat_name)

# Booster parameters.
params = {
    'objective': 'multi:softmax',
    'num_class': 3,
    'eta': 0.1,
    'reg_alpha': 0.01,
    'reg_lambda': 0.01,
    'max_depth': 8,
}

# Train the model. Both the training and the hold-out data are passed as
# evaluation sets so the per-round log shows whether the model is overfitting;
# XGBoost uses the LAST entry of `evals` (here 'test') for early stopping.
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    early_stopping_rounds=10,
    num_boost_round=200,
    evals=[(dtrain, 'train'), (dtest, 'test')],
)

# xgb.train returns the booster from the last round, not the best one, so the
# trees added after the best round would otherwise take part in prediction.
# Restrict every prediction to the rounds up to and including the best one.
best_rounds = (0, bst.best_iteration + 1)

# Hold-out predictions and accuracy.
# NOTE(review): this set also drove early stopping, so the score is optimistic.
result = bst.predict(dtest, iteration_range=best_rounds)
print('Accuracy of prediction on dataset:', accuracy_score(y_test, result))

# Gain-based feature importances as a two-column DataFrame.
feature_importance_gain = bst.get_score(importance_type='gain')
feature_importance_gain = pd.DataFrame(list(feature_importance_gain.items()), columns=['Feature', 'Importance by Gain'])

# Build the submission: one row per test sample with its predicted label.
# NOTE(review): the output path is a hard-coded absolute path on one machine;
# it must exist for to_csv to succeed.
model = bst
res = model.predict(test_x_to_predict, iteration_range=best_rounds)
submit = pd.DataFrame({'id':range(len(res)), 'label':res}).astype('int32')
submit.to_csv(f"/Users/wzq/Desktop/game/DetectionOfSleep/submit/submit_example_A{time.strftime('%Y%m%d%H%M', time.localtime())}.csv",index=False, encoding='utf-8')

[0]	train-mlogloss:0.99586	test-mlogloss:1.00274
[1]	train-mlogloss:0.90985	test-mlogloss:0.92282
[2]	train-mlogloss:0.83654	test-mlogloss:0.85561
[3]	train-mlogloss:0.77328	test-mlogloss:0.79848
[4]	train-mlogloss:0.71873	test-mlogloss:0.74959
[5]	train-mlogloss:0.67092	test-mlogloss:0.70719
[6]	train-mlogloss:0.62877	test-mlogloss:0.67036
[7]	train-mlogloss:0.59169	test-mlogloss:0.63842
[8]	train-mlogloss:0.55866	test-mlogloss:0.61051
[9]	train-mlogloss:0.52907	test-mlogloss:0.58588
[10]	train-mlogloss:0.50253	test-mlogloss:0.56431
[11]	train-mlogloss:0.47859	test-mlogloss:0.54535
[12]	train-mlogloss:0.45719	test-mlogloss:0.52877
[13]	train-mlogloss:0.43822	test-mlogloss:0.51416
[14]	train-mlogloss:0.42098	test-mlogloss:0.50122
[15]	train-mlogloss:0.40482	test-mlogloss:0.48969
[16]	train-mlogloss:0.39059	test-mlogloss:0.47974
[17]	train-mlogloss:0.37695	test-mlogloss:0.47084
[18]	train-mlogloss:0.36477	test-mlogloss:0.46300
[19]	train-mlogloss:0.35398	test-mlogloss:0.45632
[20]	train

In [65]:
# Display the gain-based feature-importance table (notebook cell output).
feature_importance_gain

Unnamed: 0,Feature,Importance by Gain
0,oxygen_0,1.537645
1,oxygen_1,2.501584
2,oxygen_2,2.666858
3,oxygen_3,2.387419
4,oxygen_4,2.351038
5,oxygen_5,2.979969
6,oxygen_6,2.418351
7,oxygen_7,1.913417
8,oxygen_8,2.186613
9,oxygen_9,4.066802
