## 仅供交叉验证 LightGBM-按趋势分类

In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from utils import *
import lightgbm as lgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import StratifiedKFold

# Seed both random number sources up front so repeated runs are comparable.
random.seed(42)
np.random.seed(42)

# Number of leading time steps fed to the model; the remaining steps are targets.
n_input = 11

### 读取数据

In [6]:
# gene_arr_path = r'../output/gene_editing/es_with_decay.array'
# transplant_arr_path = r'../output/transplant/es_with_decay.array'

# gene_arr = pickle.load(open(gene_arr_path, mode='rb'))
# transplant_arr = pickle.load(open(transplant_arr_path, mode='rb'))

# print('Shape of the gene_editing array:',gene_arr.shape)
# print('Shape of the transplant array:',transplant_arr.shape)

Shape of the gene_editing array: (2643, 17, 10)
Shape of the transplant array: (5141, 17, 10)


### 截断数据
2019年为无效数据

In [7]:
# gene_arr = gene_arr[:, :-1, :]
# transplant_arr = transplant_arr[:, :-1, :]

# print('Shape of the gene_editing array:',gene_arr.shape)
# print('Shape of the transplant array:',transplant_arr.shape)

Shape of the gene_editing array: (2643, 16, 10)
Shape of the transplant array: (5141, 16, 10)


## 标准化数据并获取5折交叉验证所需的训练集和验证集

In [8]:
# scaler, data = scale_data(transplant_arr, 'standard')

# # 用预测第二年的类别变量作为分成Kfold的依据，不支持浮点数
# X, y, y_cat = data[:, :n_input, :], data[:, n_input:, -2],transplant_arr[:, n_input, -1]
# kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

### 按趋势划分数据

In [2]:
def split_data_by_trend(data, targets):
    """Partition samples into a rising group and a non-rising group.

    For each sample a first-degree polynomial is fitted with ``np.polyfit``
    to the second-to-last feature channel (``data[i, :, -2]``) against the
    step index. Samples whose fitted slope is strictly positive go to the
    "up" group; every other sample (zero, negative or NaN slope) goes to the
    "down" group.

    Args:
        data: Array with samples on axis 0, indexed as ``data[i, :, -2]``.
        targets: Array aligned with ``data`` on axis 0 (same length).

    Returns:
        Tuple ``(up_data, up_target, down_data, down_target)`` of NumPy
        arrays. Sample order is preserved within each group. An empty group
        keeps the trailing dimensions and dtype of its source array (for
        example ``(0, steps, features)``) instead of collapsing to shape
        ``(0,)``, so the groups can always be concatenated again.
    """
    data = np.asarray(data)
    targets = np.asarray(targets)

    if len(data) == 0:
        # Nothing to fit; an empty mask still yields correctly shaped outputs.
        is_up = np.zeros(0, dtype=bool)
    else:
        steps = np.arange(data.shape[1])
        # One fit per sample; only the slope (first coefficient) is needed.
        slopes = np.array([
            np.polyfit(steps, series.reshape(-1), 1)[0]
            for series in data[:, :, -2]
        ])
        is_up = slopes > 0

    # Boolean indexing copies the selected rows and keeps the trailing shape.
    return data[is_up], targets[is_up], data[~is_up], targets[~is_up]

### 构建模型，训练并评估

In [6]:
def cross_validation(X, y, y_cat, kfold, scaler):
    """Cross-validate one LightGBM multi-output regressor per trend group.

    For every fold produced by ``kfold.split(X, y_cat)`` the training and
    test samples are each divided with ``split_data_by_trend`` into a rising
    and a non-rising group. A separate
    ``MultiOutputRegressor(lgb.LGBMRegressor(n_estimators=100))`` is fitted
    on each training group and used to predict the matching test group. The
    predictions of both groups are concatenated and scored with
    ``eval_model`` for every metric.

    Args:
        X: Input array with samples on axis 0; each sample is flattened to a
            1-D feature vector before fitting.
        y: Target array aligned with ``X`` on axis 0.
        y_cat: Labels passed to ``kfold.split`` as the splitting criterion.
        kfold: Splitter object exposing ``split(X, y_cat)``.
        scaler: Passed through unchanged to ``eval_model``.

    Returns:
        Tuple ``(overall_metrics, annual_metrics)``. Both are dicts keyed by
        metric name whose values are lists with one entry per fold, holding
        the first and second return value of ``eval_model`` respectively.

    Raises:
        ValueError: If a trend group has test samples in a fold but no
            training samples, so no model can be fitted for it.
    """
    metric_names = ['mae', 'rmse', 'ndcg', 'mape', 'r2', 'pearson', 'acc']
    overall_metrics = {name: [] for name in metric_names}
    annual_metrics = {name: [] for name in metric_names}

    def flatten(samples):
        # The regressor expects 2-D input: one flat feature vector per sample.
        return samples.reshape(len(samples), -1)

    for train, test in kfold.split(X, y_cat):
        # Split both partitions by trend (rising first, non-rising second).
        X_train1, y_train1, X_train2, y_train2 = split_data_by_trend(X[train], y[train])
        X_test1, y_test1, X_test2, y_test2 = split_data_by_trend(X[test], y[test])
        train_groups = [(X_train1, y_train1), (X_train2, y_train2)]
        test_groups = [(X_test1, y_test1), (X_test2, y_test2)]

        # Train one model per trend group and predict its own test group.
        fold_true = []
        fold_pred = []
        for (X_tr, y_tr), (X_te, y_te) in zip(train_groups, test_groups):
            if len(X_te) == 0:
                # No test samples with this trend in this fold: nothing to score.
                continue
            if len(X_tr) == 0:
                raise ValueError(
                    'A trend group has test samples but no training samples in this fold'
                )
            model = MultiOutputRegressor(lgb.LGBMRegressor(n_estimators=100))
            model.fit(flatten(X_tr), y_tr)
            fold_true.append(y_te)
            fold_pred.append(model.predict(flatten(X_te)).reshape(y_te.shape))

        y_true_all = np.concatenate(fold_true)
        y_pred_all = np.concatenate(fold_pred)

        for m in metric_names:
            overall, annual = eval_model(m, y_true_all, y_pred_all, scaler)
            overall_metrics[m].append(overall)
            annual_metrics[m].append(annual)

    return overall_metrics, annual_metrics

In [7]:
def full_pipeline():
    """Run the cross-validation experiment on both datasets and save the results.

    Steps, per dataset ('gene' and 'transplant'):
      1. Load the pickled array and drop the last step along axis 1.
      2. Scale it with ``scale_data(dataset, 'standard')``.
      3. Use the first ``n_input`` steps as input and channel ``-2`` of the
         remaining steps as the target.
      4. Run ``cross_validation`` with a 5-fold ``StratifiedKFold``.
      5. Average each metric over the folds.

    The resulting nested dict is pickled to ``tree_metrics.dict`` in the
    current working directory.

    Returns:
        Dict of the form ``{dataset: {'overall': {metric: mean}, 'annual':
        {metric: mean over folds along axis 0}}}``.
    """
    def load_array(path):
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # files that come from a trusted source.
        with open(path, mode='rb') as fh:
            return pickle.load(fh)

    gene_arr_path = r'../output/gene_editing/es_with_decay.array'
    transplant_arr_path = r'../output/transplant/es_with_decay.array'

    # Drop the last step on axis 1 right after loading.
    gene_arr = load_array(gene_arr_path)[:, :-1, :]
    transplant_arr = load_array(transplant_arr_path)[:, :-1, :]

    print('Shape of the gene_editing array:',gene_arr.shape)
    print('Shape of the transplant array:',transplant_arr.shape)

    metrics = {
        'gene': {'overall': {}, 'annual': {}},
        'transplant': {'overall': {}, 'annual': {}},
    }

    for name, dataset in zip(['gene', 'transplant'], [gene_arr, transplant_arr]):
        scaler, data = scale_data(dataset, 'standard')

        # Folds are stratified on the last channel of the *unscaled* data at
        # step n_input, which the original note describes as a categorical
        # variable; per that note, floating-point values are not supported
        # as the stratification key.
        X, y, y_cat = data[:, :n_input, :], data[:, n_input:, -2], dataset[:, n_input, -1]
        kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        overall_metrics, annual_metrics = cross_validation(X, y, y_cat, kfold, scaler)

        # Collapse the per-fold lists to a single mean per metric.
        for metric, value in overall_metrics.items():
            metrics[name]['overall'][metric] = np.mean(value)

        for metric, value in annual_metrics.items():
            metrics[name]['annual'][metric] = np.mean(np.array(value), axis=0)

    # Use a context manager so the file is flushed and closed deterministically.
    with open('tree_metrics.dict', 'wb') as fh:
        pickle.dump(metrics, fh)

    return metrics

In [8]:
# Run the experiment for both datasets; as a side effect this also writes
# tree_metrics.dict to the current working directory.
metrics = full_pipeline()

Shape of the gene_editing array: (2643, 16, 10)
Shape of the transplant array: (5141, 16, 10)


In [9]:
# Show the fold-averaged metrics returned by full_pipeline().
metrics

{'gene': {'overall': {'mae': 0.7909075416065805,
   'rmse': 1.278444190003476,
   'ndcg': 0.4327606848927772,
   'mape': 5.925912255854935,
   'r2': 0.2528057800626929,
   'pearson': 0.5152516967477198,
   'acc': 0.3287914458791378},
  'annual': {'mae': array([0.4464986 , 0.60152146, 0.81395525, 0.96126659, 1.13129581]),
   'rmse': array([0.74973679, 0.95150734, 1.3175806 , 1.47291795, 1.65661901]),
   'ndcg': array([0.55789007, 0.37683653, 0.25575173, 0.28307893, 0.13834245]),
   'mape': array([3.95263941, 4.76965315, 4.97621227, 8.5047829 , 7.42627354]),
   'r2': array([0.4316227 , 0.30118635, 0.18684038, 0.06892368, 0.03043578]),
   'pearson': array([0.67685971, 0.57423375, 0.45518125, 0.32769033, 0.25514158]),
   'acc': array([0.572082  , 0.32803885, 0.32764939, 0.23380652, 0.18238047])}},
 'transplant': {'overall': {'mae': 0.815755789644913,
   'rmse': 1.2830816356245662,
   'ndcg': 0.48931385372540664,
   'mape': 3.686268269692983,
   'r2': 0.4121105079610429,
   'pearson': 0.646

In [8]:
# NOTE(review): this cell reuses execution count In [8] and its saved output only
# contains mae/rmse/ndcg — presumably stale output from an earlier version of the
# pipeline; confirm and remove.
metrics

{'gene': {'overall': {'mae': 0.8209942683708282,
   'rmse': 1.3337560891982014,
   'ndcg': 0.6096930177727906},
  'annual': {'mae': array([0.45741282, 0.62247792, 0.84728625, 1.00330621, 1.17448814]),
   'rmse': array([0.78222512, 0.99470613, 1.38349926, 1.52179826, 1.73490588]),
   'ndcg': array([0.66153224, 0.32535838, 0.27309157, 0.33757616, 0.10942672])}},
 'transplant': {'overall': {'mae': 0.820901655306564,
   'rmse': 1.3020615560087556,
   'ndcg': 0.5630817064248428},
  'annual': {'mae': array([0.80399247, 0.82789492, 0.79520697, 0.80899437, 0.86841955]),
   'rmse': array([1.28646968, 1.34488998, 1.26323874, 1.24789521, 1.3557784 ]),
   'ndcg': array([0.20565586, 0.02751808, 0.01730095, 0.09461803, 0.12595979])}}}

In [10]:
# NOTE(review): in the visible cells `overall_metrics` is only assigned inside
# cross_validation() and full_pipeline(), never at notebook scope, so this lookup
# presumably relies on leftover kernel state and would fail after
# Restart & Run All — confirm, then remove or recompute explicitly.
overall_metrics

{'mae': [0.8202441705852876,
  0.8393395042631376,
  0.8139514802719408,
  0.8104260177773122,
  0.8205471036351414],
 'rmse': [1.3036809517963384,
  1.3202631231575475,
  1.279857091440752,
  1.2569798618025954,
  1.3495267518465446],
 'ndcg': [0.14125640793492913,
  0.9003448482429512,
  0.6445959886326867,
  0.8642917375327137,
  0.2649195497809327]}