In [205]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import StandardScaler
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.metrics import roc_auc_score




import xgboost as xgb
import lightgbm as lgb
from lightgbm import log_evaluation, early_stopping


# jupyter配置
pd.options.display.max_rows=None #Notebook 的一个cell的显示行数
pd.options.display.max_columns=None#Notebook 的一个cell的显示列数

In [206]:
# 读取训练数据
train_data_x = np.load('./train_data/train_x.npy')
train_data_y = np.load('./train_data/train_y.npy')
# 读取测试数据
test_x_A = np.load("./test_data/test_x_A.npy")


## 数据处理

In [207]:
# 这里先做做一个简易版，先把两个观测指标片接到一起，即每个样本特征是180+180长度
train_data_x_reshape = train_data_x.reshape(train_data_x.shape[0], -1)
test_x_A_reshape = test_x_A.reshape(test_x_A.shape[0], -1)

In [208]:
X_train, X_test, y_train, y_test = train_test_split(train_data_x_reshape, train_data_y, test_size=0.3)
print("train_x.shape", X_train.shape)
print("test_x.shape", X_test.shape)
print("train_y.shape", y_train.shape)
print("test_y.shape", y_test.shape)
print("test_x_A.shape", test_x_A.shape)

train_x.shape (26284, 360)
test_x.shape (11265, 360)
train_y.shape (26284,)
test_y.shape (11265,)
test_x_A.shape (1155, 2, 180)


## 模型训练

### 随机森林

In [201]:
# 随机深林模型训练
rfc = RandomForestClassifier(n_estimators=25)

# 交叉验证
# rfc_s = cross_val_score(rfc,train_x, train_y,cv=10)
# plt.plot(range(1,11),rfc_s,label = "RandomForest")
# plt.legend()
# plt.show()


# rfc.fit(train_x, train_y)
# res = rfc.predict(test_x)

# 直接利用全量数据训练
rfc.fit(train_data_x, train_data_y)
res = rfc.predict(test_x_A)

print("预测结果shape：",res.shape)

ValueError: Found array with dim 3. RandomForestClassifier expected <= 2.

In [102]:
# 预测结果验证
# 0.7数据训练 0.8458055925432756
score_r = rfc.score(X_test, y_test)
print("Random Forest:",score_r)


Random Forest: 0.9979582778517532


### xgboost

In [213]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
test_x_to_predict = xgb.DMatrix(test_x_A_reshape)

# 设置参数
params = {
    'objective': 'multi:softmax',
    'num_class': 3,
    'eta': 0.1,
    'reg_alpha': 0.01,
    'reg_lambda': 0.01,
    'max_depth': 8
}

# 训练模型
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    early_stopping_rounds=10,
    num_boost_round=200,
    evals=[(dtrain, 'train'), (dtest, 'test')] # 将训练数据和测试数据都作为验证集，可以实时监督训练情况，是否过拟合
)

# 预测结果
result = bst.predict(
    dtest
)
print('Accuracy of prediction on dataset:', accuracy_score(y_test, result))

# 提交submit
model = bst
res = model.predict(test_x_to_predict)
# res = np.array([np.argmax(l) for l in res])
submit = pd.DataFrame({'id':range(len(res)), 'label':res}).astype('int32')
submit.to_csv(f"/Users/wzq/Desktop/game/DetectionOfSleep/submit/submit_example_A{time.strftime('%Y%m%d%H%M', time.localtime())}.csv",index=False, encoding='utf-8')

[0]	train-mlogloss:1.00422	test-mlogloss:1.01076
[1]	train-mlogloss:0.92587	test-mlogloss:0.93840
[2]	train-mlogloss:0.85822	test-mlogloss:0.87630
[3]	train-mlogloss:0.80011	test-mlogloss:0.82335
[4]	train-mlogloss:0.74910	test-mlogloss:0.77787
[5]	train-mlogloss:0.70358	test-mlogloss:0.73813
[6]	train-mlogloss:0.66461	test-mlogloss:0.70426
[7]	train-mlogloss:0.62844	test-mlogloss:0.67358
[8]	train-mlogloss:0.59757	test-mlogloss:0.64778
[9]	train-mlogloss:0.56907	test-mlogloss:0.62371
[10]	train-mlogloss:0.54308	test-mlogloss:0.60285
[11]	train-mlogloss:0.51988	test-mlogloss:0.58440
[12]	train-mlogloss:0.50094	test-mlogloss:0.56940
[13]	train-mlogloss:0.48349	test-mlogloss:0.55570
[14]	train-mlogloss:0.46701	test-mlogloss:0.54310
[15]	train-mlogloss:0.45238	test-mlogloss:0.53219
[16]	train-mlogloss:0.43905	test-mlogloss:0.52263
[17]	train-mlogloss:0.42630	test-mlogloss:0.51405
[18]	train-mlogloss:0.41389	test-mlogloss:0.50567
[19]	train-mlogloss:0.40119	test-mlogloss:0.49710
[20]	train

### lightgbm

### 不使用 k折

In [None]:
# 不使用 k折
import time


callbacks = [log_evaluation(period=2), early_stopping(stopping_rounds=10)]
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
params = {'num_leaves': 491, # 叶节点数量
        'n_estimators': 1000, # 设置训练轮数
        'min_data_in_leaf': 106, # 每个叶子节点中的数据
        'objective': 'multiclass', # 任务：多分类
        'num_class': 3,
        'max_depth': -1, # -1 ： 不限制深度
        "boosting_type": "gbdt", # 'dart', 'goss', 'rf'
        "metric": 'multi_logloss', # 衡量标准
        "verbosity" : -1, # 不显示信息
        'random_state': 66, # 随机种子
        'learning_rate': 0.1,
        # "callbacks": callbacks, # 添加回调函数
        }
model = lgb.LGBMClassifier(**params, nthread = 4, n_jobs = -1)
model.fit(X_train, y_train,
                eval_set=[(X_train, y_train), (X_test, y_test)],
                eval_metric='multi_error',
                callbacks=callbacks)
y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
score = accuracy_score(y_test, y_pred)
print("准确度", score)

y_pred = model.predict(test_x_A_reshape, num_iteration=model.best_iteration_)
submit = pd.DataFrame({'id':range(len(y_pred)), 'label':y_pred})
submit.to_csv(f"/Users/wzq/Desktop/game/DetectionOfSleep/submit/submit_example_A{time.strftime('%Y%m%d%H%M', time.localtime())}.csv",index=False, encoding='utf-8')

In [173]:
import os
import time
import joblib



X = pd.DataFrame(train_data_x_reshape)
y = pd.DataFrame(train_data_y)


# 创建DataFrame保存特征重要性
feature_importances = pd.DataFrame(index=None)
feature_importances['features'] = X.columns


# 5折交叉验证

#将原始数据随机分为两组，一组做为训练集，一组做为验证集，利用训练集训练分类器，
#然后利用验证集验证模型，记录最后的分类准确率为此分类器的性能指标。

folds = KFold(n_splits=5)

splits = folds.split(X, y) # 分割成5份，前4份是训练集索引，最后1份是验证集索引

# next(iter(splits)) # 输出的是索引

best_score = 0
best_model = 0
best_k = 0



for k, (train_indices, val_indices) in enumerate(splits):
    print("第 %d 折\n" % k)
    
    # iloc：根据标签的所在位置，从0开始计数，先选取行再选取列
    X_train_data, X_val_data = X.iloc[train_indices], X.iloc[val_indices] # 训练集， 验证集
    y_train, y_val = y.iloc[train_indices], y.iloc[val_indices] # 训练标签，验证标签
    y_train = y_train.squeeze()
    y_val = y_val.squeeze()
    print(X_train_data.shape, X_val_data.shape,  y_train.shape, y_val.shape)

    print("X_train_data shape : \t", X_train_data.shape, "X_val_data shape : \t", X_val_data.shape)
    
    #这里调用了 lightGBM 算法 ，第一个是传入数据，第二个是数据的标签

    # params 超参数设置  https://blog.csdn.net/VariableX/article/details/107256149
    callbacks = [log_evaluation(period=2), early_stopping(stopping_rounds=10)]

    params = {'num_leaves': 491, # 叶节点数量
            'n_estimators': 1000, # 设置训练轮数
            'min_data_in_leaf': 106, # 每个叶子节点中的数据
            'objective': 'multiclass', # 任务：多分类
            'num_class': 3,
            'max_depth': -1, # -1 ： 不限制深度
            "boosting_type": "gbdt", # 'dart', 'goss', 'rf'
            "metric": 'multi_logloss', # 衡量标准
            "verbosity" : -1, # 不显示信息
            'random_state': 66, # 随机种子
            'learning_rate': 0.1,
            # "callbacks": callbacks, # 添加回调函数
            }

    # train_dataset = lgb.Dataset(X_train_data, label=y_train) # 训练集
    # val_dataset = lgb.Dataset(X_val_data, label=y_val) # 验证集
    
    model = lgb.LGBMClassifier(**params, nthread = 4, n_jobs = -1)
    model.fit(X_train_data, y_train,
                    eval_set=[(X_train_data, y_train), (X_val_data, y_val)],
                    eval_metric='multi_error',
                    callbacks=callbacks)
    y_pred = model.predict(X_val_data, num_iteration=model.best_iteration_)
    score = accuracy_score(y_val, y_pred)
    print("准确度", score)


    # 指定保存模型的文件夹
    model_folder = '/Users/wzq/Desktop/game/DetectionOfSleep/model'

    # 如果文件夹不存在，则创建文件夹
    if not os.path.exists(model_folder):
        os.makedirs(model_folder)
    # 判断是否是最优模型
    if score > best_score:
        best_k = k
        best_score = score
        best_model = model


# 指定保存模型的文件夹
# model_folder = '/Users/wzq/Desktop/game/DetectionOfSleep/model'
# joblib.dump(best_model, model_folder + f'/{time.strftime('%Y%m%d%H%M%S', time.localtime())}.pkl')
# 模型加载
# model = joblib.load('loan_model.pkl')
print(f"最好的结果是第{best_k}折叠，分数为{best_score}")


    # lgb_model = lgb.train(params=params, # 超参数设置
    #                     train_set=train_dataset, # 训练数据
    #                     num_boost_round=100, # 循环的轮数
    #                     valid_sets=val_dataset, # 验证数据
    #                     valid_names='validation',) # 验证集名称
    
    # 保存特征重要性
    # feature_importances[f'fold_{k+1}'] = lgb_model.feature_importance()
    #print("看一看有啥东西{}".format())
    # 对验证集进行预测
    # y_val_pred = lgb_model.predict(X_val_data)
    # y_val_pred = np.array([np.argmax(l) for l in y_val_pred])
    # print(y_val.squeeze(1).shape, y_val_pred.shape)
    # # 计算roc_auc  # 训练数据label类别分布不均衡 (0   29808) (2     4520) (1     3221) 利用ovo
    # score = accuracy_score(y_val, y_val_pred)
    # print(f'Fold {k + 1} Accuracy of prediction on dataset:', score)
    


第 0 折

(30039, 360) (7510, 360) (30039,) (7510,)
X_train_data shape : 	 (30039, 360) X_val_data shape : 	 (7510, 360)
Training until validation scores don't improve for 10 rounds
[2]	training's multi_error: 0.185725	training's multi_logloss: 0.514886	valid_1's multi_error: 0.287883	valid_1's multi_logloss: 0.716939
[4]	training's multi_error: 0.185725	training's multi_logloss: 0.463382	valid_1's multi_error: 0.287883	valid_1's multi_logloss: 0.66729
[6]	training's multi_error: 0.181231	training's multi_logloss: 0.425987	valid_1's multi_error: 0.285752	valid_1's multi_logloss: 0.633901
[8]	training's multi_error: 0.162755	training's multi_logloss: 0.397617	valid_1's multi_error: 0.2751	valid_1's multi_logloss: 0.612377
[10]	training's multi_error: 0.145644	training's multi_logloss: 0.373012	valid_1's multi_error: 0.260053	valid_1's multi_logloss: 0.594603
[12]	training's multi_error: 0.133893	training's multi_logloss: 0.351751	valid_1's multi_error: 0.250866	valid_1's multi_logloss: 0.5

FileNotFoundError: [Errno 2] No such file or directory: 'loan_model.pkl'

In [181]:
print(f"最好的结果是第{best_k}折叠，分数为{best_score}")
res = best_model.predict(test_x_A_reshape)
for i in res:
    print(i)

最好的结果是第4折叠，分数为0.9633772805966174
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0

## 结果保存

In [165]:
# 提交submit
mode = best_model
res = model.predict(test_x_A_reshape)
# res = np.array([np.argmax(l) for l in res])
submit = pd.DataFrame({'id':range(len(res)), 'label':res})
# submit.to_csv(f"/Users/wzq/Desktop/game/DetectionOfSleep/submit/submit_example_A{time.strftime('%Y%m%d%H%M', time.localtime())}.csv",index=False, encoding='utf-8')