## 仅供交叉验证 Encoder-Decoder with LSTM cell

In [2]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from utils import *
import tensorflow.keras as keras
from tensorflow.keras import regularizers
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf

# Seed every random number generator in use (NumPy, TensorFlow, Python's
# `random`) with the same fixed value.
np.random.seed(42)
tf.random.set_seed(42)
random.seed(42)
# Number of leading time steps used as model input; later cells slice the time
# axis at this index to separate the inputs from the prediction targets.
n_input = 11

### 读取数据

In [3]:
# gene_arr_path = r'../output/gene_editing/es_with_decay.array'
# transplant_arr_path = r'../output/transplant/es_with_decay.array'

# gene_arr = pickle.load(open(gene_arr_path, mode='rb'))
# transplant_arr = pickle.load(open(transplant_arr_path, mode='rb'))

# print('Shape of the gene_editing array:',gene_arr.shape)
# print('Shape of the transplant array:',transplant_arr.shape)

Shape of the gene_editing array: (2643, 17, 10)
Shape of the transplant array: (5141, 17, 10)


### 截断数据
2019年为无效数据

In [4]:
# gene_arr = gene_arr[:, :-1, :]
# transplant_arr = transplant_arr[:, :-1, :]

# print('Shape of the gene_editing array:',gene_arr.shape)
# print('Shape of the transplant array:',transplant_arr.shape)

Shape of the gene_editing array: (2643, 16, 10)
Shape of the transplant array: (5141, 16, 10)


### 规范数据并获取5折交叉检验所需的训练集和验证集

In [46]:
# scaler, data = scale_data(transplant_arr, 'standard')

# # Stratify the K folds on the class label taken at the first target time step
# # (index n_input); floating-point values are not supported as stratification labels.
# X, y, y_cat = data[:, :n_input, :], data[:, n_input:, -2],transplant_arr[:, n_input, -1]
# kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

### 构建模型

In [3]:
def root_mean_squared_error(y_true, y_pred):
    """Keras-compatible RMSE loss, reduced over the last axis.

    Args:
        y_true: Tensor of ground-truth values.
        y_pred: Tensor of predictions, broadcast-compatible with ``y_true``.

    Returns:
        Tensor holding the square root of the mean squared difference, where
        the mean is taken along the last axis only.
    """
    backend = keras.backend
    squared_error = backend.square(y_pred - y_true)
    mean_squared_error = backend.mean(squared_error, axis=-1)
    return backend.sqrt(mean_squared_error)

# def build_encoder_decoder_model(lstm_units, dense_units, lr=1e-4):
#     model = keras.models.Sequential()
#     model.add(LSTM(lstm_units, activation='tanh', input_shape=(11, 10), return_sequences=True))
#     model.add(LSTM(lstm_units, activation='tanh'))
#     model.add(RepeatVector(5))
#     model.add(LSTM(lstm_units, activation='tanh', return_sequences=True))
#     model.add(LSTM(lstm_units, activation='tanh', return_sequences=True))
#     model.add(TimeDistributed(Dense(dense_units, activation='relu')))
#     model.add(TimeDistributed(Dense(1)))
    
#     optimizer=keras.optimizers.Adam(learning_rate=lr)
#     model.compile(loss=root_mean_squared_error, optimizer=optimizer)
#     return model

def build_encoder_decoder_model(n_layers=2, n_units=256, lr=1e-4, input_shape=(11, 10), n_outputs=5):
    """Build and compile a stacked LSTM encoder-decoder regression model.

    The encoder is a stack of LSTM layers whose last layer emits only its
    final state; that state is repeated ``n_outputs`` times and decoded by
    ``n_layers`` sequence-returning LSTM layers followed by two time-distributed
    dense layers that produce one value per output step.

    Args:
        n_layers: Depth of the decoder stack and, for values above 2, of the
            encoder stack as well. The encoder always contains at least two
            LSTM layers (the input layer and the summarising layer), so
            ``n_layers`` of 1 or 2 yield the same encoder.
        n_units: Number of units in every LSTM layer and in the hidden
            time-distributed dense layer.
        lr: Learning rate passed to the Adam optimizer.
        input_shape: ``(time_steps, features)`` shape of a single input sample.
        n_outputs: Number of output time steps produced by the decoder.

    Returns:
        A compiled ``keras.models.Sequential`` model using
        ``root_mean_squared_error`` as its loss.
    """
    model = keras.models.Sequential()

    # Encoder: first layer declares the input shape and keeps the full sequence.
    model.add(LSTM(n_units, activation='tanh', input_shape=input_shape, return_sequences=True))
    # Only the layers beyond the two fixed encoder layers are added here, so the
    # encoder ends up with exactly n_layers LSTMs when n_layers > 2.
    for _ in range(max(n_layers - 2, 0)):
        model.add(LSTM(n_units, activation='tanh', return_sequences=True))
    # Last encoder layer returns only its final output (the context vector).
    model.add(LSTM(n_units, activation='tanh'))

    # Feed the same context vector to every decoder time step.
    model.add(RepeatVector(n_outputs))

    # Decoder: n_layers sequence-returning LSTMs, then a per-step regression head.
    for _ in range(n_layers):
        model.add(LSTM(n_units, activation='tanh', return_sequences=True))
    model.add(TimeDistributed(Dense(n_units, activation='relu')))
    model.add(TimeDistributed(Dense(1)))

    optimizer = keras.optimizers.Adam(learning_rate=lr)
    model.compile(loss=root_mean_squared_error, optimizer=optimizer)
    return model

### 进行训练和评估
使用EarlyStopping（restore_best_weights=True，训练结束后恢复验证损失最优的权重）作为训练停止方式；ModelCheckpoint已导入，但下面的训练代码中并未使用。

In [4]:
def cross_validation(X, y, y_cat, kfold, scaler, n_layers, n_units):
    """Train and evaluate a fresh encoder-decoder model on every fold.

    Args:
        X: Input array, indexed along its first axis by the fold indices.
        y: Target array, indexed the same way as ``X``.
        y_cat: Labels handed to ``kfold.split`` together with ``X``.
        kfold: Splitter object exposing ``split(X, y_cat)``.
        scaler: Forwarded unchanged to ``eval_model``.
        n_layers: Forwarded to ``build_encoder_decoder_model``.
        n_units: Forwarded to ``build_encoder_decoder_model``.

    Returns:
        A 4-tuple ``(overall_metrics, annual_metrics, tests, preds)``. The two
        dicts map each metric name to a list with one entry per fold (the first
        and second value returned by ``eval_model`` respectively); ``tests`` and
        ``preds`` are per-fold lists of targets and reshaped predictions.
    """
    metric_names = ['mae', 'rmse', 'ndcg', 'mape', 'r2', 'pearson', 'acc']
    overall_metrics = {name: [] for name in metric_names}
    annual_metrics = {name: [] for name in metric_names}
    tests, preds = [], []

    for train_idx, test_idx in kfold.split(X, y_cat):
        X_test, y_test = X[test_idx], y[test_idx]

        model = build_encoder_decoder_model(n_layers, n_units, 1e-4)
        # NOTE(review): the held-out fold doubles as the validation set that
        # drives early stopping, and is then scored below.
        early_stopping = EarlyStopping(
            monitor='val_loss',
            patience=10,
            verbose=0,
            mode='auto',
            restore_best_weights=True,
        )
        model.fit(
            X[train_idx],
            y[train_idx],
            epochs=100,
            batch_size=16,
            verbose=1,
            validation_data=(X_test, y_test),
            callbacks=[early_stopping],
        )

        # Bring the model output to the same shape as the targets.
        y_pred = model.predict(X_test).reshape(y_test.shape)

        tests.append(y_test)
        preds.append(y_pred)

        for name in metric_names:
            overall_score, annual_score = eval_model(name, y_test, y_pred, scaler)
            overall_metrics[name].append(overall_score)
            annual_metrics[name].append(annual_score)

    return overall_metrics, annual_metrics, tests, preds

In [5]:
def full_pipeline(n_layers=2, n_units=256):
    """Run 5-fold cross-validation on both datasets and persist the results.

    Loads the pickled gene-editing and transplant arrays, drops the last time
    step of each, scales each dataset, cross-validates an encoder-decoder model
    on it, and averages the per-fold metrics.

    Args:
        n_layers: Forwarded to ``cross_validation``. Defaults to 2, matching
            the default of ``build_encoder_decoder_model``.
        n_units: Forwarded to ``cross_validation``. Defaults to 256, matching
            the default of ``build_encoder_decoder_model``.

    Returns:
        Nested dict ``metrics[dataset][scope][metric]`` where ``dataset`` is
        ``'gene'`` or ``'transplant'`` and ``scope`` is ``'overall'`` (mean over
        folds) or ``'annual'`` (mean over folds along axis 0).

    Side effects:
        Writes ``rnn_tests_<dataset>.list``, ``rnn_preds_<dataset>.list`` and
        ``rnn_metrics.dict`` into the current working directory.
        NOTE(review): these file names do not include ``n_layers``/``n_units``,
        so successive calls with different settings overwrite each other.
    """
    gene_arr_path = r'../output/gene_editing/es_with_decay.array'
    transplant_arr_path = r'../output/transplant/es_with_decay.array'

    # Context managers guarantee the file handles are closed after reading.
    with open(gene_arr_path, mode='rb') as gene_file:
        gene_arr = pickle.load(gene_file)
    with open(transplant_arr_path, mode='rb') as transplant_file:
        transplant_arr = pickle.load(transplant_file)

    # Drop the final time step of every sample.
    gene_arr = gene_arr[:, :-1, :]
    transplant_arr = transplant_arr[:, :-1, :]

    print('Shape of the gene_editing array:',gene_arr.shape)
    print('Shape of the transplant array:',transplant_arr.shape)

    metrics = {
        'gene':{
            'overall':{},
            'annual':{}
        },
        'transplant':{
            'overall':{},
            'annual':{}
        }
    }

    for name, dataset in zip(['gene', 'transplant'], [gene_arr, transplant_arr]):
        scaler, data = scale_data(dataset, 'standard')

        # Stratify the folds on the unscaled last feature at the first target
        # time step; floating-point values are not supported as labels.
        X, y, y_cat = data[:, :n_input, :], data[:, n_input:, -2], dataset[:, n_input, -1]
        kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        overall_metrics, annual_metrics, tests, preds = cross_validation(X, y, y_cat, kfold, scaler, n_layers, n_units)

        # Closing the files explicitly ensures the pickles are fully flushed.
        with open('rnn_tests_{}.list'.format(name), 'wb') as tests_file:
            pickle.dump(tests, tests_file)
        with open('rnn_preds_{}.list'.format(name), 'wb') as preds_file:
            pickle.dump(preds, preds_file)

        for metric, value in overall_metrics.items():
            metrics[name]['overall'][metric] = np.mean(value)

        for metric, value in annual_metrics.items():
            metrics[name]['annual'][metric] = np.mean(np.array(value), axis=0)

    with open('rnn_metrics.dict', 'wb') as metrics_file:
        pickle.dump(metrics, metrics_file)

    return metrics

In [None]:
# Grid search over network depth and LSTM width; each result is stored under
# its (n_layers, n_units) pair.
para_tuning_metrics = {}
for n_layers in range(1, 6):
    for n_units in (32, 64, 128, 256, 512):
        config = (n_layers, n_units)
        print(*config)
        para_tuning_metrics[config] = full_pipeline(*config)
        print(para_tuning_metrics[config])
        # Individual scores can be read as, e.g.,
        # para_tuning_metrics[config]['gene']['overall']['mae'] (or 'rmse'),
        # and likewise under the 'transplant' key.
        print()

In [None]:
# Single end-to-end run over both datasets.
# NOTE(review): no arguments are supplied, so this call only succeeds if
# full_pipeline's n_layers / n_units parameters have default values.
metrics = full_pipeline()

In [6]:
# Run the whole pipeline once and keep the averaged metrics for inspection.
# NOTE(review): called without arguments; requires full_pipeline to provide
# defaults for n_layers and n_units.
metrics = full_pipeline()

Shape of the gene_editing array: (2643, 16, 10)
Shape of the transplant array: (5141, 16, 10)
Train on 2112 samples, validate on 531 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Train on 2113 samples, validate on 530 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/10

Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Train on 4111 samples, validate on 1030 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch

Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Train on 4114 samples, validate on 1027 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Train on 4115 samples, validate on 1026 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100

In [9]:
metrics

{'gene': {'overall': {'mae': 0.7297401469016269,
   'rmse': 1.2887244638659925,
   'ndcg': 0.3803953501729045},
  'annual': {'mae': array([0.41587097, 0.57677975, 0.75007012, 0.88360723, 1.02237268]),
   'rmse': array([0.77792397, 0.97254351, 1.31705204, 1.47657129, 1.66178059]),
   'ndcg': array([0.44694829, 0.37630686, 0.15230446, 0.19535201, 0.14445967])}},
 'transplant': {'overall': {'mae': 0.7538141078186955,
   'rmse': 1.251139174988568,
   'ndcg': 0.5317447321152349},
  'annual': {'mae': array([0.74064418, 0.7623244 , 0.72633263, 0.74707724, 0.79269209]),
   'rmse': array([1.27620115, 1.28425156, 1.18747271, 1.19815046, 1.29499351]),
   'ndcg': array([0.04100715, 0.01954889, 0.02220831, 0.07470384, 0.05093919])}}}

In [7]:
metrics

{'gene': {'overall': {'mae': 0.7297492556384867,
   'rmse': 1.2889003524345672,
   'ndcg': 0.3803953363115765,
   'mape': 4.244950962090654,
   'r2': 0.24038236527618642,
   'pearson': 0.5237988055359878,
   'acc': 0.3445377320027842},
  'annual': {'mae': array([0.4158578 , 0.57681987, 0.75021887, 0.88353171, 1.02231803]),
   'rmse': array([0.77802427, 0.97265524, 1.317182  , 1.47681347, 1.66203587]),
   'ndcg': array([0.44694329, 0.37630686, 0.15396231, 0.19492758, 0.14446005]),
   'mape': array([3.06064781, 2.80838312, 3.11369291, 5.9569929 , 6.28503807]),
   'r2': array([0.39500869, 0.27431165, 0.18671522, 0.06620267, 0.02536364]),
   'pearson': array([0.65211583, 0.56291987, 0.4766435 , 0.33350701, 0.27608457]),
   'acc': array([0.62432858, 0.22443223, 0.24055186, 0.32722977, 0.30614623])}},
 'transplant': {'overall': {'mae': 0.7544008497719823,
   'rmse': 1.2510707017068214,
   'ndcg': 0.47544825701727483,
   'mape': 3.5521715821966304,
   'r2': 0.44112280139615495,
   'pearson': 