## 仅供交叉验证：线性回归（RidgeCV 岭回归）

In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from utils import *
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.model_selection import StratifiedKFold

# Seed both NumPy's and Python's global RNGs so repeated runs are reproducible.
np.random.seed(42)
random.seed(42)
# Number of leading steps along axis 1 fed to the model as input; the steps
# from index n_input onward are used as prediction targets (see full_pipeline).
n_input = 11

读取数据

In [2]:
# gene_arr_path = r'../output/gene_editing/es_with_decay.array'
# transplant_arr_path = r'../output/transplant/es_with_decay.array'

# gene_arr = pickle.load(open(gene_arr_path, mode='rb'))
# transplant_arr = pickle.load(open(transplant_arr_path, mode='rb'))

# print('Shape of the gene_editing array:',gene_arr.shape)
# print('Shape of the transplant array:',transplant_arr.shape)

Shape of the gene_editing array: (2643, 17, 10)
Shape of the transplant array: (5141, 17, 10)


### 截断数据
2019年为无效数据

In [3]:
# gene_arr = gene_arr[:, :-1, :]
# transplant_arr = transplant_arr[:, :-1, :]

# print('Shape of the gene_editing array:',gene_arr.shape)
# print('Shape of the transplant array:',transplant_arr.shape)

Shape of the gene_editing array: (2643, 16, 10)
Shape of the transplant array: (5141, 16, 10)


## 标准化数据并获取5折交叉验证所需的训练集和验证集

In [6]:
# scaler, data = scale_data(transplant_arr, 'standard')

# # Stratify the K-fold split on the categorical variable taken at step n_input of the unscaled array; float labels are not supported.
# X, y, y_cat = data[:, :n_input, :], data[:, n_input:, -2],transplant_arr[:, n_input, -1]
# kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

### 构建模型，训练并评估

In [2]:
def cross_validation(X, y, y_cat, kfold, scaler):
    """Fit and evaluate a RidgeCV model on every fold produced by ``kfold``.

    Args:
        X: Input array; each sample is flattened to one feature vector
            before fitting.
        y: Regression targets, indexed by sample on the first axis.
        y_cat: Per-sample labels passed to ``kfold.split`` to drive the split.
        kfold: Splitter exposing ``split(X, y_cat)`` that yields
            ``(train_indices, test_indices)`` pairs.
        scaler: Forwarded unchanged to ``eval_model``.

    Returns:
        A tuple ``(overall_metrics, annual_metrics, tests, preds)``. The two
        dicts map each metric name to a list with one entry per fold (the
        first and second value returned by ``eval_model`` respectively);
        ``tests`` and ``preds`` hold the per-fold targets and predictions.
    """
    metric_names = ['mae', 'rmse', 'ndcg', 'mape', 'r2', 'pearson', 'acc']

    # One independent per-fold accumulator list for every metric.
    overall_metrics = {metric: [] for metric in metric_names}
    annual_metrics = {metric: [] for metric in metric_names}

    tests, preds = [], []

    for train_idx, test_idx in kfold.split(X, y_cat):
        # Flatten all trailing axes so each sample becomes one feature row.
        X_train = X[train_idx].reshape(len(train_idx), -1)
        X_test = X[test_idx].reshape(len(test_idx), -1)

        model = RidgeCV(alphas=[0.1, 0.2, 0.3 ,0.5, 0.7, 1.0])
        model.fit(X_train, y[train_idx])

        y_test = y[test_idx]
        # Force the predictions into the same shape as the held-out targets.
        y_pred = model.predict(X_test).reshape(y_test.shape)

        tests.append(y_test)
        preds.append(y_pred)

        for metric in metric_names:
            fold_overall, fold_annual = eval_model(metric, y_test, y_pred, scaler)
            overall_metrics[metric].append(fold_overall)
            annual_metrics[metric].append(fold_annual)

    return overall_metrics, annual_metrics, tests, preds

In [3]:
def full_pipeline():
    """Run the cross-validated ridge regression on both datasets and save the results.

    For each of the ``gene`` and ``transplant`` arrays the function:

    1. loads the pickled array and drops the last step along axis 1,
    2. scales it with ``scale_data(dataset, 'standard')``,
    3. runs ``cross_validation`` on a 5-fold ``StratifiedKFold`` split,
    4. pickles the per-fold targets and predictions to
       ``lr_tests_<name>.list`` and ``lr_preds_<name>.list``,
    5. prints the fold-averaged ``'ndcg'`` score for several values of the
       extra argument passed to ``eval_model``,
    6. averages every metric over the folds.

    The aggregated metrics are also pickled to ``lr_metrics.dict``. All output
    files are written to the current working directory.

    Returns:
        dict: ``metrics[name]['overall'][metric]`` is the mean over folds of
        the overall score; ``metrics[name]['annual'][metric]`` is the mean
        over folds (axis 0) of the per-fold annual values.
    """
    gene_arr_path = r'../output/gene_editing/es_with_decay.array'
    transplant_arr_path = r'../output/transplant/es_with_decay.array'

    # NOTE(security): pickle.load can execute arbitrary code; only load
    # files produced by this project.
    # Context managers guarantee the file handles are closed after reading.
    with open(gene_arr_path, mode='rb') as gene_file:
        gene_arr = pickle.load(gene_file)
    with open(transplant_arr_path, mode='rb') as transplant_file:
        transplant_arr = pickle.load(transplant_file)

    # Drop the final step along axis 1 (the notebook notes that the 2019
    # data is invalid).
    gene_arr = gene_arr[:, :-1, :]
    transplant_arr = transplant_arr[:, :-1, :]

    print('Shape of the gene_editing array:',gene_arr.shape)
    print('Shape of the transplant array:',transplant_arr.shape)

    metrics = {
        'gene':{
            'overall':{},
            'annual':{}
        },
        'transplant':{
            'overall':{},
            'annual':{}
        }
    }

    for name, dataset in zip(['gene', 'transplant'], [gene_arr, transplant_arr]):
        scaler, data = scale_data(dataset, 'standard')

        # Stratify the folds on the categorical variable taken from the
        # unscaled array at step n_input (last feature); float labels are
        # not supported for stratification.
        X, y, y_cat = data[:, :n_input, :], data[:, n_input:, -2], dataset[:, n_input, -1]
        kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        overall_metrics, annual_metrics, tests, preds = cross_validation(X, y, y_cat, kfold, scaler)

        # Persist per-fold targets and predictions; `with` ensures the data
        # is flushed and the handles are closed.
        with open('lr_tests_{}.list'.format(name), 'wb') as tests_file:
            pickle.dump(tests, tests_file)
        with open('lr_preds_{}.list'.format(name), 'wb') as preds_file:
            pickle.dump(preds, preds_file)

        # Fold-averaged NDCG for several values of eval_model's last argument
        # (presumably the ranking cutoff -- confirm against utils.eval_model).
        for n in [1, 5, 10, 15, 20, 25, 30, 35, 40]:
            overall_total = []

            for test, pred in zip(tests, preds):
                overall, _ = eval_model('ndcg', test, pred, scaler, n)
                overall_total.append(overall)
            print(n, np.mean(overall_total))

        for metric, value in overall_metrics.items():
            metrics[name]['overall'][metric] = np.mean(value)

        for metric, value in annual_metrics.items():
            # Average across folds while keeping the per-step breakdown.
            metrics[name]['annual'][metric] = np.mean(np.array(value), axis=0)

        print('=====')

    with open('lr_metrics.dict', 'wb') as metrics_file:
        pickle.dump(metrics, metrics_file)

    return metrics

In [4]:
# Run the whole pipeline for both datasets: prints the array shapes and the
# fold-averaged NDCG values, and writes the result pickles to the working directory.
metrics = full_pipeline()

Shape of the gene_editing array: (2643, 16, 10)
Shape of the transplant array: (5141, 16, 10)
1 0.22223977128162797
5 0.14981584667329062
10 0.15038070685135352
15 0.2004395120663359
20 0.2492909291406297
25 0.2635951185595985
30 0.26411620928984597
35 0.264121647440036
40 0.2641232669689737
=====
1 0.043180314790711806
5 0.33429069958040786
10 0.3833263674581361
15 0.3844872491068555
20 0.38917158597614976
25 0.39075055083828797
30 0.3964470495795371
35 0.3974889929609235
40 0.39812225275149044
=====


In [7]:
metrics

{'gene': {'overall': {'mae': 0.8018815046986673,
   'rmse': 1.3595098885998171,
   'ndcg': 0.2492909291406297},
  'annual': {'mae': array([0.46522077, 0.62449025, 0.8229462 , 0.95881447, 1.13793583]),
   'rmse': array([0.91029942, 1.08764041, 1.40908608, 1.5134606 , 1.69634745]),
   'ndcg': array([0.43892515, 0.22356679, 0.18785385, 0.16005015, 0.11074381])}},
 'transplant': {'overall': {'mae': 0.8296704622461665,
   'rmse': 1.3081963723530239,
   'ndcg': 0.38917158597614976},
  'annual': {'mae': array([0.81569122, 0.82645236, 0.79235828, 0.83018397, 0.88366648]),
   'rmse': array([1.35811836, 1.31037518, 1.23673301, 1.26169358, 1.36091925]),
   'ndcg': array([0.03360669, 0.02482068, 0.01940027, 0.06732   , 0.09894005])}}}

In [5]:
metrics

{'gene': {'overall': {'mae': 0.8018815046986673,
   'rmse': 1.3595098885998171,
   'ndcg': 0.2492909291406297,
   'mape': 6.269173143466054,
   'r2': 0.14811191758153008,
   'pearson': 0.473244547996673,
   'acc': 0.32244047361717326},
  'annual': {'mae': array([0.46522077, 0.62449025, 0.8229462 , 0.95881447, 1.13793583]),
   'rmse': array([0.91029942, 1.08764041, 1.40908608, 1.5134606 , 1.69634745]),
   'ndcg': array([0.43892515, 0.22356679, 0.18785385, 0.16005015, 0.11074381]),
   'mape': array([4.4641149 , 4.49437097, 5.17637538, 8.46914721, 8.74185726]),
   'r2': array([ 0.05732926,  0.03848562,  0.06256008,  0.00980737, -0.01411822]),
   'pearson': array([0.62985521, 0.48130982, 0.4170647 , 0.29739416, 0.17606587]),
   'acc': array([0.56336835, 0.30915866, 0.31061595, 0.25689199, 0.17216743])}},
 'transplant': {'overall': {'mae': 0.8296704622461665,
   'rmse': 1.3081963723530239,
   'ndcg': 0.38917158597614976,
   'mape': 3.858641832508431,
   'r2': 0.3888450679732709,
   'pearson