## Created by <a href="https://github.com/yunsuxiaozi">yunsuxiaozi</a> 2024/6/18

### Libraries

In [1]:
import os#portable construction of file paths
import warnings#lets us silence warnings that can safely be ignored

import numpy as np#array / matrix operations
import pandas as pd#csv reading and tabular data
from tqdm import tqdm#progress bars
warnings.filterwarnings('ignore')#filterwarnings()方法是用于设置警告过滤器的方法，它可以控制警告信息的输出方式和级别。

import random#提供了一些用于生成随机数的函数
#设置随机种子,保证模型可以复现
def seed_everything(seed):
    np.random.seed(seed)#numpy的随机种子
    random.seed(seed)#python内置的随机种子
seed_everything(seed=2024)

### Read data

In [2]:
#样本数量*(血氧和心率)*采样为3hz,180个数据总共60秒
path=""#这里需要改成你自己的文件路径
train_X=np.load(path+"训练集\\train_x.npy")
print(f"train_X.shape:{train_X.shape}")
train_y=np.load(path+"训练集\\train_y.npy")
print(f"train_y.shape:{train_y.shape}")
test_X=np.load(path+"测试集A\\test_x_A.npy")
print(f"test_X.shape:{test_X.shape}")
submission=pd.read_csv(path+"测试集A\\submit_example_A.csv")
submission.head()

train_X.shape:(37549, 2, 180)
train_y.shape:(37549,)
test_X.shape:(1155, 2, 180)


Unnamed: 0,id,label
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [3]:
#为了和测试集保持一致,并测试模型对测试集的效果,label=0的样本随机选择4600个,比label1和label2稍微多一点。
zero_index=list(np.where(train_y==0)[0])
np.random.shuffle(zero_index)
total_index=zero_index[:4600]+list(np.where(train_y!=0)[0])
train_X=train_X[total_index]
train_y=train_y[total_index]
print(f"len(total_index):{len(total_index)}")
for i in range(3):
    print(f"label:{i},{np.sum(train_y==i)}")

len(total_index):12341
label:0,4600
label:1,3221
label:2,4520


### Feature engineering

In [4]:
#通过train_X和test_X来构造特征
def get_feats(data):
    feats=[]
    for i in tqdm(range(len(data))):
        #data[i]是2*180 血氧和心率
        data[i][0],data[i][1]
        #由于是3hz,所以按照秒来提取特征
        origin_feats=pd.DataFrame({"血氧/秒":data[i][0].reshape(-1,3).mean(axis=1),"心率/秒":data[i][1].reshape(-1,3).mean(axis=1)})
        for col in ['血氧/秒',"心率/秒"]:
            for gap in [1,2,4,8,16,30]:
                origin_feats[f"{col}_shift{gap}"]=origin_feats[col].shift(gap)
                origin_feats[f"{col}_gap{gap}"]=origin_feats[col]-origin_feats[f"{col}_shift{gap}"]
        feats.append(list(origin_feats.mean(axis=0).values)+list(origin_feats.max(axis=0).values)+\
                     list(origin_feats.min(axis=0).values)+list(origin_feats.std(axis=0).values)+\
                     list(origin_feats.median(axis=0).values)
                    )
    feats=pd.DataFrame(feats)
    origin_cols=list(origin_feats.columns)
    feats.columns=[f"mean_{col}"for col in origin_cols]+[f"max_{col}"for col in origin_cols]\
                   +[f"min_{col}"for col in origin_cols]+[f"std_{col}"for col in origin_cols]+\
                   [f"median_{col}"for col in origin_cols]
    return feats
train_feats=get_feats(train_X)
train_feats['label']=train_y
test_feats=get_feats(test_X)
train_feats.head()

100%|████████████████████████████████████████████████████████████████████████████| 12341/12341 [05:21<00:00, 38.40it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1155/1155 [00:29<00:00, 39.74it/s]


Unnamed: 0,mean_血氧/秒,mean_心率/秒,mean_血氧/秒_shift1,mean_血氧/秒_gap1,mean_血氧/秒_shift2,mean_血氧/秒_gap2,mean_血氧/秒_shift4,mean_血氧/秒_gap4,mean_血氧/秒_shift8,mean_血氧/秒_gap8,...,median_心率/秒_gap2,median_心率/秒_shift4,median_心率/秒_gap4,median_心率/秒_shift8,median_心率/秒_gap8,median_心率/秒_shift16,median_心率/秒_gap16,median_心率/秒_shift30,median_心率/秒_gap30,label
0,92.683333,49.383333,92.627119,0.084746,92.586207,0.155172,92.5,0.303571,92.346154,0.519231,...,0.0,50.0,0.0,50.666667,0.666667,49.833333,1.5,48.0,3.833333,0
1,96.0,63.011111,96.0,0.0,96.0,0.0,96.0,0.0,96.0,0.0,...,0.0,63.0,0.0,63.0,0.666667,63.0,1.0,62.166667,1.5,0
2,95.0,57.7,95.0,0.0,95.0,0.0,95.0,0.0,95.0,0.0,...,0.0,58.0,0.0,58.166667,0.0,58.333333,-1.166667,58.166667,0.0,0
3,95.4,53.25,95.40678,0.050847,95.413793,0.091954,95.428571,0.125,95.461538,0.083333,...,0.0,52.833333,0.0,52.666667,0.166667,52.333333,0.666667,52.0,2.166667,0
4,97.183333,70.033333,97.169492,0.033898,97.155172,0.068966,97.125,0.142857,97.038462,0.326923,...,-0.166667,70.0,-0.333333,70.0,0.0,68.833333,1.0,67.5,4.833333,0


### Model training

In [5]:
#model lgb分类模型,日志评估,早停防止过拟合
from  lightgbm import LGBMClassifier,log_evaluation,early_stopping
#metric:准确率
from sklearn.metrics import accuracy_score
#KFold是直接分成k折,StratifiedKFold还要考虑每种类别的占比
from sklearn.model_selection import StratifiedKFold
choose_cols=[col for col in test_feats.columns]
def fit_and_predict(train_feats=train_feats,test_feats=test_feats,model=None,num_folds=10,seed=2024,name='lgb'):
    X=train_feats[choose_cols].copy()
    y=train_feats['label'].copy()
    oof_pred=np.zeros((len(X)))
    test_X=test_feats[choose_cols].copy()
    test_pred_pro=np.zeros((num_folds,len(test_X),3))#3是num_classes
     
    #10折交叉验证
    skf = StratifiedKFold(n_splits=num_folds,shuffle=True)
    for fold, (train_index, valid_index) in (enumerate(skf.split(X,y))):
        print(f"name {name},fold:{fold}")

        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        
        model.fit(X_train,y_train,eval_set=[(X_valid, y_valid)],
                      callbacks=[log_evaluation(100),early_stopping(100)]
                     )
        
        oof_pred[valid_index]=model.predict(X_valid)
        test_pred_pro[fold]=model.predict_proba(test_X)
        
    print(f"accuracy_score:{accuracy_score(y.values,oof_pred)}")
    #(len(test_X),3)
    test_pred_pro=test_pred_pro.mean(axis=0)
    
    test_preds=np.argmax(test_pred_pro,axis=1)
    return oof_pred,test_preds
lgb_params={
    "boosting_type": "gbdt",
    "objective": "multi_class",
    "metric": "multi_logloss",
    "max_depth": 6,
    "learning_rate": 0.05,
    "n_estimators":10000,
    "colsample_bytree": 0.2,
    "colsample_bynode": 0.2,
    "verbose": -1,
    "random_state": 2024,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "extra_trees":True,
    'num_leaves':127,
    "verbose": -1,
    "max_bin":225,
    }

lgb_oof_pred_pro,lgb_test_pred=fit_and_predict(model=LGBMClassifier(**lgb_params),num_folds=10,seed=2024,name='lgb')
print(f"lgb_test_pred[:10]:{lgb_test_pred[:10]}")

name lgb,fold:0
Training until validation scores don't improve for 100 rounds
[100]	valid_0's multi_logloss: 0.704301
[200]	valid_0's multi_logloss: 0.686409
[300]	valid_0's multi_logloss: 0.678001
[400]	valid_0's multi_logloss: 0.673091
[500]	valid_0's multi_logloss: 0.668517
[600]	valid_0's multi_logloss: 0.665939
[700]	valid_0's multi_logloss: 0.663869
[800]	valid_0's multi_logloss: 0.662662
[900]	valid_0's multi_logloss: 0.661497
[1000]	valid_0's multi_logloss: 0.661133
[1100]	valid_0's multi_logloss: 0.660287
[1200]	valid_0's multi_logloss: 0.659414
[1300]	valid_0's multi_logloss: 0.659015
[1400]	valid_0's multi_logloss: 0.659026
[1500]	valid_0's multi_logloss: 0.658531
[1600]	valid_0's multi_logloss: 0.658765
Early stopping, best iteration is:
[1508]	valid_0's multi_logloss: 0.658277
name lgb,fold:1
Training until validation scores don't improve for 100 rounds
[100]	valid_0's multi_logloss: 0.72123
[200]	valid_0's multi_logloss: 0.702182
[300]	valid_0's multi_logloss: 0.691399
[4

[1600]	valid_0's multi_logloss: 0.667612
[1700]	valid_0's multi_logloss: 0.667209
[1800]	valid_0's multi_logloss: 0.667317
[1900]	valid_0's multi_logloss: 0.667087
Early stopping, best iteration is:
[1883]	valid_0's multi_logloss: 0.666779
accuracy_score:0.7106393323069443
lgb_test_pred[:10]:[2 2 2 0 1 0 1 1 2 1]


### Submission

In [6]:
submission['label']=lgb_test_pred
submission.to_csv(path+"baseline.csv",index=None)
submission.head()

Unnamed: 0,id,label
0,0,2
1,1,2
2,2,2
3,3,0
4,4,1


### 后续改进方向:

#### 1.可以用上全部的数据,这样的问题就是训练数据和测试数据分布不一致,线下CV不具有参考意义。

#### 2.构造统计特征的时候加上q25,q75,skew,kurt等特征。

#### 3.考虑构造血氧和心率的交叉特征(加减乘除),并对交叉特征采用统计方法建模。

#### 4.尝试融合模型(lgb,xgb,cat)

#### 5.采用深度学习的方法并结合赛题背景进行建模。