In [89]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split

In [90]:
def smape(estimator, X, y):
    return -(100 * np.mean(2 * np.abs(estimator.predict(X) - y) / (np.abs(y) + np.abs(estimator.predict(X)))) + 1)

In [91]:
df = pd.read_csv('../data/2604-prots_pepts.csv', index_col=0)
df.isna().sum().sum()

0

In [92]:
df.columns

Index(['visit_id', 'visit_month', 'updrs_1', 'updrs_2', 'updrs_3', 'updrs_4',
       'on_med', 'O00391', 'O00533', 'O00584',
       ...
       'YSLTYIYTGLSK', 'YTTEIIK', 'YVGGQEHFAHLLILR', 'YVMLPVADQDQCIR',
       'YVMLPVADQDQCIR.1', 'YVNKEIQNAVNGVK', 'YWGVASFLQK', 'YYCFQGNQFLR',
       'YYTYLIMNK', 'YYWGGQYTWDMAK'],
      dtype='object', length=1205)

In [93]:
df['patient_id'] = df['visit_id'].apply(lambda x: x.split('_')[0])

In [94]:
cols = df.columns[:-1]
df = df[['patient_id'] + list(cols)]
df

Unnamed: 0,patient_id,visit_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,on_med,O00391,O00533,...,YSLTYIYTGLSK,YTTEIIK,YVGGQEHFAHLLILR,YVMLPVADQDQCIR,YVMLPVADQDQCIR.1,YVNKEIQNAVNGVK,YWGVASFLQK,YYCFQGNQFLR,YYTYLIMNK,YYWGGQYTWDMAK
0,55,55_0,0,10.0,6.0,15.0,1.0,0,11254.3,732430.0,...,201158.0,16492.3,3810270.0,106894.0,580667.0,131155.0,165851.0,437305.0,46289.2,14898.4
1,55,55_3,3,10.0,7.0,25.0,1.0,0,11254.3,732430.0,...,201158.0,16492.3,3810270.0,106894.0,580667.0,131155.0,165851.0,437305.0,46289.2,14898.4
2,55,55_6,6,8.0,10.0,34.0,1.0,0,13163.6,630465.0,...,171079.0,13198.8,4119520.0,113385.0,514861.0,103512.0,144607.0,457891.0,40047.7,20703.9
3,55,55_9,9,8.0,9.0,30.0,0.0,1,13163.6,630465.0,...,171079.0,13198.8,4119520.0,113385.0,514861.0,103512.0,144607.0,457891.0,40047.7,20703.9
4,55,55_12,12,10.0,10.0,41.0,0.0,1,15257.6,815083.0,...,231772.0,17873.8,5474140.0,116286.0,711815.0,136943.0,181763.0,452253.0,54725.1,21841.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2610,65043,65043_48,48,7.0,6.0,13.0,0.0,0,10589.6,902434.0,...,233567.0,14478.3,3185530.0,48793.0,501159.0,133992.0,170146.0,359045.0,45780.0,17370.6
2611,65043,65043_54,54,4.0,8.0,11.0,1.0,0,10589.6,902434.0,...,233567.0,14478.3,3185530.0,48793.0,501159.0,133992.0,170146.0,359045.0,45780.0,17370.6
2612,65043,65043_60,60,6.0,6.0,16.0,1.0,0,10589.6,902434.0,...,233567.0,14478.3,3185530.0,48793.0,501159.0,133992.0,170146.0,359045.0,45780.0,17370.6
2613,65043,65043_72,72,3.0,9.0,14.0,1.0,0,10589.6,902434.0,...,233567.0,14478.3,3185530.0,48793.0,501159.0,133992.0,170146.0,359045.0,45780.0,17370.6


In [95]:
df.shape

(2615, 1206)

In [96]:
df_w_lag = df.join(df.groupby('patient_id')
                    .shift(1)
                    .add_suffix('_prev')
                    ).dropna()

df_w_lag.shape

(2367, 2411)

In [97]:
X = df_w_lag.drop(['patient_id', 'visit_id', 'visit_id_prev', 
                   'updrs_1', 'updrs_2', 'updrs_3', 'updrs_4'], 
                   axis=1)

y = df_w_lag[['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']]
X.columns

Index(['visit_month', 'on_med', 'O00391', 'O00533', 'O00584', 'O14498',
       'O14773', 'O14791', 'O15240', 'O15394',
       ...
       'YSLTYIYTGLSK_prev', 'YTTEIIK_prev', 'YVGGQEHFAHLLILR_prev',
       'YVMLPVADQDQCIR_prev', 'YVMLPVADQDQCIR.1_prev', 'YVNKEIQNAVNGVK_prev',
       'YWGVASFLQK_prev', 'YYCFQGNQFLR_prev', 'YYTYLIMNK_prev',
       'YYWGGQYTWDMAK_prev'],
      dtype='object', length=2404)

In [108]:
num_cols = X.columns.tolist()
len(num_cols)

2404

In [109]:
cols2remove = ['patient_id', 'visit_id', 'visit_id_prev', 'on_med', 'on_med_prev',
               'visit_month', 'visit_month_prev', 'updrs_1_prev', 'updrs_2_prev', 
               'updrs_3_prev', 'updrs_4_prev']

num_cols = [col for col in num_cols if col not in cols2remove]
len(num_cols)

2396

In [110]:
'visit_month' in num_cols

False

In [112]:
X[num_cols] = X[num_cols].apply(lambda x: np.log10(x))
pd.DataFrame(X.describe()).to_csv('../data/2904_log10X_stats.csv')
X[num_cols] = X[num_cols].apply(lambda x: (x - np.mean(x)) / np.std(x))
X

Unnamed: 0,visit_month,on_med,O00391,O00533,O00584,O14498,O14773,O14791,O15240,O15394,...,YSLTYIYTGLSK_prev,YTTEIIK_prev,YVGGQEHFAHLLILR_prev,YVMLPVADQDQCIR_prev,YVMLPVADQDQCIR.1_prev,YVNKEIQNAVNGVK_prev,YWGVASFLQK_prev,YYCFQGNQFLR_prev,YYTYLIMNK_prev,YYWGGQYTWDMAK_prev
1,3,0,0.486069,0.385741,0.441990,0.441795,0.549940,0.483844,0.436785,0.376905,...,0.220297,0.577453,0.346282,0.574941,0.294734,0.387742,0.360997,0.218914,0.297496,0.369607
2,6,0,0.528832,0.316280,0.375471,0.439353,0.464295,0.506552,0.400774,0.374763,...,0.220297,0.577453,0.346282,0.574941,0.294734,0.387742,0.360997,0.218914,0.297496,0.369607
3,9,1,0.528832,0.316280,0.375471,0.439353,0.464295,0.506552,0.400774,0.374763,...,0.161053,0.497873,0.363638,0.590964,0.254740,0.292672,0.309299,0.234790,0.247417,0.462761
4,12,1,0.569117,0.435283,0.470943,0.422874,0.541501,0.498940,0.353902,0.402353,...,0.161053,0.497873,0.363638,0.590964,0.254740,0.292672,0.309299,0.234790,0.247417,0.462761
5,18,1,0.569117,0.435283,0.470943,0.422874,0.541501,0.498940,0.353902,0.402353,...,0.272115,0.606190,0.426867,0.597831,0.362444,0.405087,0.395551,0.230514,0.355381,0.477898
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2610,48,0,0.469456,0.482455,0.513593,0.411861,0.336442,0.349961,0.505576,0.502995,...,0.301599,0.615119,0.288574,0.371596,0.304441,0.436362,0.417897,0.177742,0.415411,0.463937
2611,54,0,0.469456,0.482455,0.513593,0.411861,0.336442,0.349961,0.505576,0.502995,...,0.274937,0.530926,0.306453,0.361775,0.245772,0.396337,0.370640,0.150864,0.293671,0.413067
2612,60,0,0.469456,0.482455,0.513593,0.411861,0.336442,0.349961,0.505576,0.502995,...,0.274937,0.530926,0.306453,0.361775,0.245772,0.396337,0.370640,0.150864,0.293671,0.413067
2613,72,0,0.469456,0.482455,0.513593,0.411861,0.336442,0.349961,0.505576,0.502995,...,0.274937,0.530926,0.306453,0.361775,0.245772,0.396337,0.370640,0.150864,0.293671,0.413067


In [113]:
X.isna().sum().sum()

0

In [119]:
from sklearn.feature_selection import mutual_info_regression

In [123]:
k_best = 50
best_cols = {'updrs_1': [],
            'updrs_2': [],
            'updrs_3': [],
            'updrs_4': []}

for col in ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']:
    mi = mutual_info_regression(X, y[col])
    
    best_cols[col] = X.columns[np.argsort(mi)[-k_best:]].tolist()

best_cols

{'updrs_1': ['ADRDQYELLCLDNTR_prev',
  'ITGYIIK',
  'EDCNELPPRR_prev',
  'KAADDTWEPFASGK_prev',
  'LGADMEDVCGR.1',
  'P05452_prev',
  'GSGGLNLGNFFASR',
  'Q9NYU2_prev',
  'FTILDSQGK_prev',
  'CPFPSRPDNGFVNYPAKPTLYYK_prev',
  'VEKPTADAEAYVFTPNMICAGGEK_prev',
  'SDDKVTLEER_prev',
  'GALQNIIPASTGAAK_prev',
  'P09871_prev',
  'HVEPGEPLAPSPQEPQAVGR',
  'SPFEQHIK_prev',
  'QALPQVR',
  'CFSGQCISK',
  'LVNEVTEFAK_prev',
  'EWVAIESDSVQPVPR_prev',
  'AATVGSLAGQPLQER_prev',
  'NPCQDPYILTPENR_prev',
  'VHVSEEGTEPEAMLQVLGPKPALPAGTEDTAKEDAANR',
  'SVPMVPPGIK_prev',
  'KQINDYVEKGTQGK_prev',
  'GSQTQSHPDLGTEGCWDQLSAPR_prev',
  'VHVSEEGTEPEAMLQVLGPKPALPAGTEDTAKEDAANR_prev',
  'YIVSGTPTFVPYLIK_prev',
  'HVEPGEPLAPSPQEPQAVGR_prev',
  'O00391_prev',
  'LQAEAFQAR_prev',
  'LVDGKGVPIPNKVIFIR',
  'Q99674_prev',
  'O00391',
  'P04406',
  'Q99674',
  'SPAINVAVHVFR_prev',
  'EIVLTQSPATLSLSPGER_prev',
  'P19652_prev',
  'YVNKEIQNAVNGVK_prev',
  'MDASLGNLFAR_prev',
  'P04406_prev',
  'NEQEQPLGQWHLS',
  'GALQNIIPA

In [126]:
xbgr_params = {'updrs_1': {'n_estimators': [300, 500], 
                            'max_depth': [1, 3],
                            'learning_rate': [0.1, 0.5],
                            'objective': ['reg:absoluteerror'],
                            'n_jobs': [2],
                            'random_state': [7656]},

                'updrs_2': {'n_estimators': [500, 750], 
                            'max_depth': [5, 7],
                            'learning_rate': [0.1, 0.5],
                            'objective': ['reg:absoluteerror'],
                            'n_jobs': [2],
                            'random_state': [7656]},

                'updrs_3': {'n_estimators': [500, 750], 
                            'max_depth': [5, 7],
                            'learning_rate': [0.1, 0.5],
                            'objective': ['reg:absoluteerror'],
                            'n_jobs': [2],
                            'random_state': [7656]},

                'updrs_4': {'n_estimators': [750, 1000, 1250], 
                            'max_depth': [1, 3],
                            'learning_rate': [0.1, 0.5],
                            'objective': ['reg:absoluteerror'],
                            'n_jobs': [2],
                            'random_state': [7656]},
                            }

grids = {}

for col in y.columns:
    X_train = X[best_cols[col]]
    grid_search = GridSearchCV(xgb.XGBRegressor(), 
                               param_grid=xbgr_params[col], 
                               cv=5, 
                               scoring=smape, 
                               n_jobs=2,
                               verbose=1)

    grid_search.fit(X_train, y[col])
    grids[col] = {
                    'best_params': grid_search.best_params_, 
                    'best_score': grid_search.best_score_
                 }

    grid_data = pd.DataFrame(grid_search.cv_results_).sort_values(by='rank_test_score')[:10]
    grid_data.to_csv(f'../data/gridsearch/xgbregr/2904-{k_best}best_log-norm-lag_grid_{col}_it2.csv')

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [127]:
grids

{'updrs_1': {'best_params': {'learning_rate': 0.1,
   'max_depth': 1,
   'n_estimators': 300,
   'n_jobs': 2,
   'objective': 'reg:absoluteerror',
   'random_state': 7656},
  'best_score': -47.897666559164016},
 'updrs_2': {'best_params': {'learning_rate': 0.1,
   'max_depth': 5,
   'n_estimators': 500,
   'n_jobs': 2,
   'objective': 'reg:absoluteerror',
   'random_state': 7656},
  'best_score': -68.88006070985095},
 'updrs_3': {'best_params': {'learning_rate': 0.1,
   'max_depth': 5,
   'n_estimators': 500,
   'n_jobs': 2,
   'objective': 'reg:absoluteerror',
   'random_state': 7656},
  'best_score': -54.74459749136351},
 'updrs_4': {'best_params': {'learning_rate': 0.1,
   'max_depth': 1,
   'n_estimators': 750,
   'n_jobs': 2,
   'objective': 'reg:absoluteerror',
   'random_state': 7656},
  'best_score': -105.8301660479311}}