<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Load-Data-and-Packages" data-toc-modified-id="Load-Data-and-Packages-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Load Data and Packages</a></span></li><li><span><a href="#Model" data-toc-modified-id="Model-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Model</a></span><ul class="toc-item"><li><span><a href="#Data-Selection-and-Data-Split" data-toc-modified-id="Data-Selection-and-Data-Split-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Data Selection and Data Split</a></span></li><li><span><a href="#XGB-&amp;-LGB" data-toc-modified-id="XGB-&amp;-LGB-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>XGB &amp; LGB</a></span></li></ul></li><li><span><a href="#output-Train-and-Test-Files" data-toc-modified-id="output-Train-and-Test-Files-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>output Train and Test Files</a></span><ul class="toc-item"><li><span><a href="#output-Train-predicted-result" data-toc-modified-id="output-Train-predicted-result-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>output Train predicted result</a></span></li><li><span><a href="#output-Test-predicted-result" data-toc-modified-id="output-Test-predicted-result-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>output Test predicted result</a></span></li></ul></li></ul></div>

# Load Data and Packages

In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from math import sqrt
import lightgbm as lgb
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from tqdm import tqdm_notebook as tqdm

from sklearn.linear_model import LinearRegression

In [2]:
df = pd.read_csv('./output/data_processed_0411.csv',index_col=None)
label = 'r_alsfrs_r_total'
df_pro = df[df.set==1]
df_test = df[df.set==0]
feature_selected = df_pro.columns.tolist()
feature_selected.remove('pid')
feature_selected.remove(label)
feature_selected.remove('set')

# Model

## Data Selection and Data Split

In [3]:
# used to split data based on patient id
pid_list = list(set(df_pro['pid']))
mylist = []
np.random.seed(10)
for k in range(0, (len(pid_list))):
    x = np.random.randint(0, 10)
    mylist.append(x)     
columns = ['pid', 'cv_cohort']
cohort = pd.DataFrame(columns=columns)
cohort['pid'] = pid_list
cohort['cv_cohort'] = mylist

## XGB & LGB

In [4]:
predicted_value_XGB = [ ]
predicted_value_LGB = [ ]
test_predicted_LGB = []
test_predicted_XGB = []

table = []
# temp_train = pd.DataFrame()
temp_test =pd.DataFrame()
for i in range(10):
    train_pid = cohort['pid'][cohort['cv_cohort'] != i]
    test_pid = cohort['pid'][cohort['cv_cohort'] == i]
    train = df_pro[df_pro['pid'].isin(train_pid)]
    test = df_pro[df_pro['pid'].isin(test_pid)]
    
    train_x = train[feature_selected][:]
    test_x = test[feature_selected][:]
    train_y = train[label][:]
    test_y = test[label][:]
    temp_test = pd.concat([temp_test, test[['pid', 't', 'month', label]]])
#     temp_train = pd.concat([temp_train, train[['pid', 't', 'month', label]]])
    # XGBoost
    dtrain=xgb.DMatrix(train_x,train_y)
    dtest=xgb.DMatrix(test_x, test_y)
    dvalid = xgb.DMatrix(df_test[feature_selected], df_test[label])
    params = {
    'booster': 'gbtree',
    'objective':'reg:linear',
    'max_depth':3, 
    'subsample': 0.6,
    'colsample_bytree': 0.6,
    'reg_alpha': 0.001,
    'min_child_weight': 11,
    'eta': 0.05,
    'seed': 42,
    'nthread': 4,
    'metris':'rmse'
    }

    watchlist = [(dtrain,'train'),(dtest,'test')]
    bst=xgb.train(params,dtrain,num_boost_round=10000, evals=watchlist, early_stopping_rounds=100, verbose_eval=50) 
    ypred_xgb=bst.predict(dtest)
    test_predicted_XGB.append(bst.predict(dvalid))
    # LGBRegressor
    lgb_model = lgb.LGBMRegressor(random_state=42, max_depth=7,
                                                            n_estimators=30000,
                                                            learning_rate=0.05,
                                                            num_leaves=7,
                                                            colsample_bytree=0.9,
                                                            subsample = 0.8,
                                                            reg_alpha = 0.5,
                                                            reg_lambda = 0.3,
                                                            n_jobs=-1)

    lgb_model.fit(train_x, train_y,
                  eval_metric='rmse', 
                  eval_set=[(train_x, train_y), (test_x, test_y)], 
                  verbose=100, early_stopping_rounds=100)

    ypred_lgb_sklearn = lgb_model.predict(test_x)
    test_predicted_LGB.append(lgb_model.predict(df_test[feature_selected]))
    
    predicted_value_XGB.extend(ypred_xgb)
    predicted_value_LGB.extend(ypred_lgb_sklearn)


[0]	train-rmse:31.9422	test-rmse:31.5494
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 100 rounds.
[50]	train-rmse:5.57846	test-rmse:5.55736
[100]	train-rmse:4.65995	test-rmse:4.98389
[150]	train-rmse:4.51428	test-rmse:4.96487
[200]	train-rmse:4.40998	test-rmse:4.97564
Stopping. Best iteration:
[148]	train-rmse:4.52031	test-rmse:4.96136

Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 4.65374	valid_1's rmse: 4.93203
[200]	training's rmse: 4.36249	valid_1's rmse: 4.8814
[300]	training's rmse: 4.17829	valid_1's rmse: 4.89554
Early stopping, best iteration is:
[208]	training's rmse: 4.34604	valid_1's rmse: 4.88093


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.9,
       learning_rate=0.05, max_depth=7, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=30000,
       n_jobs=-1, num_leaves=7, objective=None, random_state=42,
       reg_alpha=0.5, reg_lambda=0.3, silent=True, subsample=0.8,
       subsample_for_bin=200000, subsample_freq=1)

[0]	train-rmse:31.944	test-rmse:31.5519
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 100 rounds.
[50]	train-rmse:5.57178	test-rmse:5.6005
[100]	train-rmse:4.64653	test-rmse:4.90764
[150]	train-rmse:4.49262	test-rmse:4.88575
[200]	train-rmse:4.39203	test-rmse:4.87908
[250]	train-rmse:4.30928	test-rmse:4.87453
[300]	train-rmse:4.23038	test-rmse:4.87286
Stopping. Best iteration:
[230]	train-rmse:4.34133	test-rmse:4.86993

Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 4.64004	valid_1's rmse: 4.90622
[200]	training's rmse: 4.3569	valid_1's rmse: 4.88713
[300]	training's rmse: 4.17485	valid_1's rmse: 4.87742
Early stopping, best iteration is:
[255]	training's rmse: 4.24942	valid_1's rmse: 4.86737


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.9,
       learning_rate=0.05, max_depth=7, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=30000,
       n_jobs=-1, num_leaves=7, objective=None, random_state=42,
       reg_alpha=0.5, reg_lambda=0.3, silent=True, subsample=0.8,
       subsample_for_bin=200000, subsample_freq=1)

[0]	train-rmse:31.8805	test-rmse:32.0356
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 100 rounds.
[50]	train-rmse:5.56358	test-rmse:5.56728
[100]	train-rmse:4.6679	test-rmse:4.78142
[150]	train-rmse:4.52328	test-rmse:4.76216
[200]	train-rmse:4.42108	test-rmse:4.76105
Stopping. Best iteration:
[141]	train-rmse:4.54564	test-rmse:4.75711

Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 4.64909	valid_1's rmse: 4.82305
[200]	training's rmse: 4.36801	valid_1's rmse: 4.81147
Early stopping, best iteration is:
[152]	training's rmse: 4.47777	valid_1's rmse: 4.79446


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.9,
       learning_rate=0.05, max_depth=7, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=30000,
       n_jobs=-1, num_leaves=7, objective=None, random_state=42,
       reg_alpha=0.5, reg_lambda=0.3, silent=True, subsample=0.8,
       subsample_for_bin=200000, subsample_freq=1)

[0]	train-rmse:31.8772	test-rmse:32.0402
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 100 rounds.
[50]	train-rmse:5.53114	test-rmse:5.81416
[100]	train-rmse:4.62778	test-rmse:5.15189
[150]	train-rmse:4.48053	test-rmse:5.12952
[200]	train-rmse:4.37948	test-rmse:5.12342
[250]	train-rmse:4.30123	test-rmse:5.12205
[300]	train-rmse:4.22764	test-rmse:5.12101
[350]	train-rmse:4.16187	test-rmse:5.11325
[400]	train-rmse:4.10197	test-rmse:5.09968
[450]	train-rmse:4.04655	test-rmse:5.09844
[500]	train-rmse:3.98739	test-rmse:5.0988
[550]	train-rmse:3.93716	test-rmse:5.09684
[600]	train-rmse:3.88915	test-rmse:5.097
[650]	train-rmse:3.8431	test-rmse:5.10168
Stopping. Best iteration:
[555]	train-rmse:3.93261	test-rmse:5.09128

Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 4.61921	valid_1's rmse: 5.1659
[200]	training's rmse: 4.34634	valid_1's rmse: 5.13739
[300]	training's rm

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.9,
       learning_rate=0.05, max_depth=7, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=30000,
       n_jobs=-1, num_leaves=7, objective=None, random_state=42,
       reg_alpha=0.5, reg_lambda=0.3, silent=True, subsample=0.8,
       subsample_for_bin=200000, subsample_freq=1)

[0]	train-rmse:31.9726	test-rmse:31.1312
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 100 rounds.
[50]	train-rmse:5.56046	test-rmse:5.79114
[100]	train-rmse:4.64771	test-rmse:5.06384
[150]	train-rmse:4.49771	test-rmse:4.99564
[200]	train-rmse:4.39399	test-rmse:4.95502
[250]	train-rmse:4.30646	test-rmse:4.93919
[300]	train-rmse:4.23131	test-rmse:4.92892
[350]	train-rmse:4.1665	test-rmse:4.91736
[400]	train-rmse:4.10997	test-rmse:4.93623
Stopping. Best iteration:
[342]	train-rmse:4.1782	test-rmse:4.91207

Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 4.63015	valid_1's rmse: 5.05566
[200]	training's rmse: 4.3502	valid_1's rmse: 4.95487
[300]	training's rmse: 4.16554	valid_1's rmse: 4.93782
Early stopping, best iteration is:
[275]	training's rmse: 4.20953	valid_1's rmse: 4.93485


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.9,
       learning_rate=0.05, max_depth=7, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=30000,
       n_jobs=-1, num_leaves=7, objective=None, random_state=42,
       reg_alpha=0.5, reg_lambda=0.3, silent=True, subsample=0.8,
       subsample_for_bin=200000, subsample_freq=1)

[0]	train-rmse:31.8163	test-rmse:32.6977
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 100 rounds.
[50]	train-rmse:5.58454	test-rmse:5.7477
[100]	train-rmse:4.681	test-rmse:4.83489
[150]	train-rmse:4.53577	test-rmse:4.81501
[200]	train-rmse:4.43778	test-rmse:4.82866
Stopping. Best iteration:
[126]	train-rmse:4.59306	test-rmse:4.80952

Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 4.66878	valid_1's rmse: 4.82982
[200]	training's rmse: 4.39825	valid_1's rmse: 4.78098
Early stopping, best iteration is:
[173]	training's rmse: 4.45749	valid_1's rmse: 4.76532


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.9,
       learning_rate=0.05, max_depth=7, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=30000,
       n_jobs=-1, num_leaves=7, objective=None, random_state=42,
       reg_alpha=0.5, reg_lambda=0.3, silent=True, subsample=0.8,
       subsample_for_bin=200000, subsample_freq=1)

[0]	train-rmse:31.9086	test-rmse:31.7613
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 100 rounds.
[50]	train-rmse:5.56114	test-rmse:5.61131
[100]	train-rmse:4.66296	test-rmse:4.90994
[150]	train-rmse:4.51631	test-rmse:4.85984
[200]	train-rmse:4.41319	test-rmse:4.83417
[250]	train-rmse:4.33055	test-rmse:4.81448
[300]	train-rmse:4.25379	test-rmse:4.82265
Stopping. Best iteration:
[242]	train-rmse:4.34354	test-rmse:4.80695

Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 4.6436	valid_1's rmse: 4.98556
[200]	training's rmse: 4.35492	valid_1's rmse: 4.88784
[300]	training's rmse: 4.17068	valid_1's rmse: 4.90069
Early stopping, best iteration is:
[214]	training's rmse: 4.32484	valid_1's rmse: 4.88164


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.9,
       learning_rate=0.05, max_depth=7, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=30000,
       n_jobs=-1, num_leaves=7, objective=None, random_state=42,
       reg_alpha=0.5, reg_lambda=0.3, silent=True, subsample=0.8,
       subsample_for_bin=200000, subsample_freq=1)

[0]	train-rmse:31.898	test-rmse:31.9753
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 100 rounds.
[50]	train-rmse:5.58366	test-rmse:5.6461
[100]	train-rmse:4.68452	test-rmse:4.76663
[150]	train-rmse:4.54066	test-rmse:4.72271
[200]	train-rmse:4.44016	test-rmse:4.69883
[250]	train-rmse:4.35316	test-rmse:4.69703
[300]	train-rmse:4.28288	test-rmse:4.70026
Stopping. Best iteration:
[230]	train-rmse:4.39109	test-rmse:4.69079

Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 4.67796	valid_1's rmse: 4.78665
[200]	training's rmse: 4.39388	valid_1's rmse: 4.75503
Early stopping, best iteration is:
[168]	training's rmse: 4.46476	valid_1's rmse: 4.7507


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.9,
       learning_rate=0.05, max_depth=7, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=30000,
       n_jobs=-1, num_leaves=7, objective=None, random_state=42,
       reg_alpha=0.5, reg_lambda=0.3, silent=True, subsample=0.8,
       subsample_for_bin=200000, subsample_freq=1)

[0]	train-rmse:31.8823	test-rmse:32.0239
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 100 rounds.
[50]	train-rmse:5.55877	test-rmse:5.63389
[100]	train-rmse:4.65865	test-rmse:4.9039
[150]	train-rmse:4.49869	test-rmse:4.87986
[200]	train-rmse:4.40013	test-rmse:4.87733
[250]	train-rmse:4.31432	test-rmse:4.87759
[300]	train-rmse:4.23935	test-rmse:4.86252
[350]	train-rmse:4.17856	test-rmse:4.85258
[400]	train-rmse:4.11545	test-rmse:4.85304
Stopping. Best iteration:
[343]	train-rmse:4.18956	test-rmse:4.84903

Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 4.65604	valid_1's rmse: 4.89033
[200]	training's rmse: 4.36148	valid_1's rmse: 4.85312
Early stopping, best iteration is:
[156]	training's rmse: 4.46952	valid_1's rmse: 4.84555


LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.9,
       learning_rate=0.05, max_depth=7, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=30000,
       n_jobs=-1, num_leaves=7, objective=None, random_state=42,
       reg_alpha=0.5, reg_lambda=0.3, silent=True, subsample=0.8,
       subsample_for_bin=200000, subsample_freq=1)

[0]	train-rmse:31.8739	test-rmse:32.1443
Multiple eval metrics have been passed: 'test-rmse' will be used for early stopping.

Will train until test-rmse hasn't improved in 100 rounds.
[50]	train-rmse:5.5444	test-rmse:6.0459
[100]	train-rmse:4.64067	test-rmse:5.21947
[150]	train-rmse:4.49298	test-rmse:5.15278
[200]	train-rmse:4.38752	test-rmse:5.11686
[250]	train-rmse:4.30381	test-rmse:5.09881
[300]	train-rmse:4.22961	test-rmse:5.08827
[350]	train-rmse:4.16052	test-rmse:5.0831
[400]	train-rmse:4.09791	test-rmse:5.07887
[450]	train-rmse:4.04008	test-rmse:5.08042
[500]	train-rmse:3.98754	test-rmse:5.07394
[550]	train-rmse:3.93904	test-rmse:5.06993
Stopping. Best iteration:
[474]	train-rmse:4.01271	test-rmse:5.06671

Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 4.62408	valid_1's rmse: 5.17632
[200]	training's rmse: 4.33688	valid_1's rmse: 5.06492
[300]	training's rmse: 4.15637	valid_1's rmse: 5.03421
[400]	training's rmse: 4.01056	valid_1's rmse: 5

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=0.9,
       learning_rate=0.05, max_depth=7, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=30000,
       n_jobs=-1, num_leaves=7, objective=None, random_state=42,
       reg_alpha=0.5, reg_lambda=0.3, silent=True, subsample=0.8,
       subsample_for_bin=200000, subsample_freq=1)

# output Train and Test Files

## output Train predicted result

In [5]:
# obtain predicted train value table 
DNN_result = pd.read_csv('./output/train_predicted_value_DNN.csv')
DNN_result = DNN_result.sort_values(by=['pid','t']).reset_index(drop=True)
temp_test['mod1_XGB'] = predicted_value_XGB
temp_test['mod2_LGB'] = predicted_value_LGB
temp_test_reset = temp_test.sort_values(by=['pid','t']).reset_index(drop=True)
temp_test_reset['mod3_DNN'] = DNN_result['test_value']
temp_test_reset = temp_test_reset.rename(columns={'r_alsfrs_r_total':'true'})

In [6]:
# calculate best ratio for final result 
# output predicted data 
a = temp_test_reset.mod1_XGB
b = temp_test_reset.mod2_LGB
c = temp_test_reset.mod3_DNN

best_ensemble_value =[10,0,0,0,0,0]
best_ensemble_value ={'Best_RMSE':10}

for i in range(11):
    for j in range(11):
        k = 10-(i+j)
        if k>=0:
            ensemble_value = 0.1*(a*i+b*j+c*k)
            r2_temp = r2_score(temp_test_reset["true"], pd.DataFrame(ensemble_value))
            rmse_temp = np.sqrt(mean_squared_error(temp_test_reset["true"], pd.DataFrame(ensemble_value)))
            if rmse_temp<best_ensemble_value['Best_RMSE']:
                best_ensemble_value['Best_RMSE'] = rmse_temp
                best_ensemble_value['R_2_Score'] = r2_temp
                best_ensemble_value['DNN_ratio'] = k
                best_ensemble_value['XGB_ratio'] = i
                best_ensemble_value['LGB_ratio'] = j
                best_ensemble_value['mod4_Ensemble'] = ensemble_value
                
temp_test_reset['mod4_Ensemble'] = best_ensemble_value['mod4_Ensemble']
temp_test_reset.to_csv('./output/train_predicted_value_three_models.csv', index=None)
print('Best Ratio for LGB {}, Best Ratio for XGB {}, Best Ratio for DNN {}, '.format(
    best_ensemble_value['LGB_ratio'], best_ensemble_value['XGB_ratio'], best_ensemble_value['DNN_ratio']))

Best Ratio for LGB 6, Best Ratio for XGB 3, Best Ratio for DNN 1, 


In [7]:
def prediction_summary(df, filter_feature=None, lb=0, ub=10^10):
    if filter_feature is not None:
        df = df[(df[filter_feature]>=lb)&(df[filter_feature]<=ub)]
    
    cols  = [i for i in df.columns if 'mod' in i]

    from sklearn.metrics import r2_score
    from sklearn.metrics import mean_squared_error
    from scipy.stats import skew

    row_name = ['R^2', 'RMSE', 'Slope',  'Intercept' ,'Skewness']

    label = 'true'
    temp_value =[ [] for i in (cols)]
    for ind, feature in enumerate(cols):
        temp_value[ind].append(r2_score(df[label], df[feature]))
        temp_value[ind].append(np.sqrt(mean_squared_error(df[label], df[feature])))
        from sklearn.linear_model import LinearRegression
        reg = LinearRegression().fit(df[label].values.reshape(-1, 1), df[feature])
        temp_value[ind].append(reg.coef_[0])
        temp_value[ind].append(reg.intercept_ )
        temp_value[ind].append(skew(df[feature]))
    temp_table = pd.DataFrame(temp_value).T
    temp_table.columns=cols
    temp_table.index=row_name
    return temp_table

In [8]:
prediction_summary(temp_test_reset)
prediction_summary(temp_test_reset,'t', 0,388)

Unnamed: 0,mod1_XGB,mod2_LGB,mod3_DNN,mod4_Ensemble
R^2,0.702371,0.704045,0.60626,0.706234
RMSE,4.895509,4.881717,5.630729,4.863634
Slope,0.711287,0.703709,0.646191,0.70023
Intercept,9.410128,9.673814,11.087822,9.736109
Skewness,-0.537003,-0.522229,2.194413,-0.468072


Unnamed: 0,mod1_XGB,mod2_LGB,mod3_DNN,mod4_Ensemble
R^2,0.718968,0.719757,0.591151,0.721987
RMSE,4.318569,4.3125,5.208872,4.29531
Slope,0.72498,0.71944,0.686511,0.717809
Intercept,9.288792,9.472042,9.946837,9.464546
Skewness,-0.535323,-0.547914,2.682378,-0.482013


## output Test predicted result

In [9]:
test_result = pd.read_csv('./output/test_predicted_value_DNN.csv')
test_result = test_result.rename(columns={'TRUE':'true'})

In [10]:
# output test data for model XGBoost, LGBoost and DNN

test_table = df_test[['pid', 't', 'month', label]].copy()
test_table.reset_index(drop=True, inplace=True)
test_table.rename(columns={'r_alsfrs_r_total':'true'},inplace=True)

test_table['mod1_XGB'] = pd.DataFrame(test_predicted_XGB).T.mean(axis=1).values
test_table['mod2_LGB'] = pd.DataFrame(test_predicted_LGB).T.mean(axis=1).values
test_table['mod3_DNN'] = test_result['mod3_DNN']
test_table['mod4_Ensemble'] = test_table['mod1_XGB']*best_ensemble_value['XGB_ratio']/10.0+test_table['mod2_LGB']*best_ensemble_value['LGB_ratio']/10.0+test_table['mod3_DNN']*best_ensemble_value['DNN_ratio']/10.0

test_table.to_csv('./output/test_predicted_value_full_three_models.csv', index=None)

In [11]:
prediction_summary(test_table)
prediction_summary(test_table,'t', 0,388)

Unnamed: 0,mod1_XGB,mod2_LGB,mod3_DNN,mod4_Ensemble
R^2,0.708915,0.712676,0.716304,0.716652
RMSE,4.624555,4.594582,4.565478,4.562683
Slope,0.61576,0.628845,0.68155,0.63019
Intercept,13.799003,13.348846,11.068111,13.25582
Skewness,-0.496531,-0.455746,-0.043621,-0.425983


Unnamed: 0,mod1_XGB,mod2_LGB,mod3_DNN,mod4_Ensemble
R^2,0.710737,0.714819,0.717937,0.718768
RMSE,4.568425,4.536081,4.511212,4.504561
Slope,0.614346,0.627868,0.682713,0.629296
Intercept,13.896699,13.430359,11.063012,13.333526
Skewness,-0.489267,-0.449631,-0.036975,-0.419022
