In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
from pandas.plotting import scatter_matrix
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from scipy.optimize import minimize
from statsmodels.tsa.seasonal import seasonal_decompose
import re
import plotly.express as px
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found under it.
# The middle element of each os.walk tuple (sub-directory names) is not needed here.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/public_timeseries_testing_util.py
/kaggle/input/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/train_peptides.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/amp_pd_peptide/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/amp-parkinsons-disease-progression-prediction/amp_pd_peptide/__init__.py
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/sample_submission.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_proteins.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_peptides.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test

In [2]:
# --- Read the four raw competition tables ---------------------------------
train_proteins_df = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv')
train_peptides_df = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_peptides.csv')
train_clinical_data_df = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv')
supp_clinical_data_df = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv')

# --- Report shape, id cardinality and columns, one table at a time --------
print('proteins :', train_proteins_df.shape)
print('proteins unique visit_ids:', train_proteins_df['visit_id'].nunique())
print('proteins unique patient_ids:', train_proteins_df['patient_id'].nunique())
print(train_proteins_df.columns)

print('')
print('peptides :', train_peptides_df.shape)
print('peptides unique visit_ids:', train_peptides_df['visit_id'].nunique())
print('peptides unique patient_ids:', train_peptides_df['patient_id'].nunique())
print(train_peptides_df.columns)

print('')
print('clinical :', train_clinical_data_df.shape)
print('clinical unique visit_ids:', train_clinical_data_df['visit_id'].nunique())
print(train_clinical_data_df.columns)

print('')
print('supplemental :', supp_clinical_data_df.shape)
print('supplemental unique visit_ids:', supp_clinical_data_df['visit_id'].nunique())
print(supp_clinical_data_df.columns)

proteins : (232741, 5)
proteins unique visit_ids: 1113
proteins unique patient_ids: 248
Index(['visit_id', 'visit_month', 'patient_id', 'UniProt', 'NPX'], dtype='object')

peptides : (981834, 6)
peptides unique visit_ids: 1113
peptides unique patient_ids: 248
Index(['visit_id', 'visit_month', 'patient_id', 'UniProt', 'Peptide',
       'PeptideAbundance'],
      dtype='object')

clinical : (2615, 8)
clinical unique visit_ids: 2615
Index(['visit_id', 'patient_id', 'visit_month', 'updrs_1', 'updrs_2',
       'updrs_3', 'updrs_4', 'upd23b_clinical_state_on_medication'],
      dtype='object')

supplemental : (2223, 8)
supplemental unique visit_ids: 2223
Index(['visit_id', 'patient_id', 'visit_month', 'updrs_1', 'updrs_2',
       'updrs_3', 'updrs_4', 'upd23b_clinical_state_on_medication'],
      dtype='object')


In [3]:
# Outer-join the supplemental and main clinical tables on the visit keys.
# Columns present in both inputs come back suffixed: `_x` (supplemental, the
# left frame) and `_y` (train_clinical, the right frame).
clinical_data = supp_clinical_data_df.merge(
    train_clinical_data_df,
    on=['patient_id','visit_id','visit_month'],
    how='outer',
)

for i in [1,2,3,4]:
    score_col = 'updrs_'+str(i)
    left_col = score_col+'_x'
    right_col = score_col+'_y'
    # Coalesce: keep the left value when present, otherwise fall back to the right one.
    clinical_data[score_col] = clinical_data[left_col].fillna(clinical_data[right_col])
    clinical_data = clinical_data.drop(columns=[left_col, right_col])

# The medication-state columns are discarded entirely.
clinical_data = clinical_data.drop(
    columns=['upd23b_clinical_state_on_medication_x', 'upd23b_clinical_state_on_medication_y']
)

print(clinical_data.shape)
print(clinical_data['visit_id'].nunique())

# Release the two source frames; re-running this cell afterwards requires reloading them.
del supp_clinical_data_df
del train_clinical_data_df

clinical_data.head(10)

(4838, 7)
4838


Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4
0,35_0,35,0,5.0,3.0,16.0,0.0
1,35_36,35,36,6.0,4.0,20.0,0.0
2,75_0,75,0,4.0,6.0,26.0,0.0
3,75_36,75,36,1.0,8.0,38.0,0.0
4,155_0,155,0,,,0.0,
5,337_0,337,0,5.0,7.0,6.0,0.0
6,337_36,337,36,8.0,7.0,8.0,0.0
7,527_0,527,0,6.0,2.0,9.0,0.0
8,527_36,527,36,2.0,18.0,22.0,0.0
9,557_0,557,0,5.0,6.0,22.0,0.0


In [4]:
# Attach each peptide row to its parent protein row (matched on visit keys + UniProt id),
# keeping rows that appear on either side.
pro_pep_join = train_proteins_df.merge(
    train_peptides_df,
    on=['patient_id','visit_id','visit_month','UniProt'],
    how='outer',
)

print(pro_pep_join.shape)
print(pro_pep_join['visit_id'].nunique())

# The source frames are no longer needed once joined.
del train_proteins_df
del train_peptides_df
pro_pep_join.head(10)

(981834, 7)
1113


Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX,Peptide,PeptideAbundance
0,55_0,0,55,O00391,11254.3,NEQEQPLGQWHLS,11254.3
1,55_0,0,55,O00533,732430.0,GNPEPTFSWTK,102060.0
2,55_0,0,55,O00533,732430.0,IEIPSSVQQVPTIIK,174185.0
3,55_0,0,55,O00533,732430.0,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9
4,55_0,0,55,O00533,732430.0,SMEQNGPGLEYR,30838.7
5,55_0,0,55,O00533,732430.0,TLKIENVSYQDKGNYR,23216.5
6,55_0,0,55,O00533,732430.0,VIAVNEVGR,170878.0
7,55_0,0,55,O00533,732430.0,VMTPAVYAPYDVK,148771.0
8,55_0,0,55,O00533,732430.0,VNGSPVDNHPFAGDVVFPR,55202.1
9,55_0,0,55,O00584,39585.8,ELDLNSVLLK,27229.3


##### The amino acid abundance percentages used below are taken from the following Wikipedia table:
##### https://en.wikipedia.org/wiki/Amino_acid#Table_of_standard_amino_acid_abbreviations_and_properties

In [6]:
def remove_unimod(peptide_seqn):
    """Strip ``(UniMod_<digits>)`` modification tags from a peptide sequence.

    The tag is matched literally, parentheses included. The previous pattern
    ``[(UniMod_)0-9]+`` was a character class, so it also deleted every
    standalone ``M`` and ``U`` residue letter (and any ``n``/``i``/``o``/``d``)
    from the sequence itself.

    Parameters
    ----------
    peptide_seqn : str
        Peptide sequence, possibly containing one or more ``(UniMod_<n>)`` tags.

    Returns
    -------
    str
        The sequence with every tag removed and all residue letters intact.
    """
    return re.sub(r'\(UniMod_\d+\)', '', peptide_seqn)

#print(remove_unimod('KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK'))

# Overwrite the Peptide column with the result of remove_unimod applied to every row.
pro_pep_join['Peptide'] = pro_pep_join['Peptide'].apply(remove_unimod)
#pro_pep_join.head(10)

# The bare string below is a disabled earlier helper; it is evaluated and discarded.
"""def amino_acid_count(amino_seqn, amino_acid):
    return amino_seqn.count(amino_acid)"""

def amino_acid_abundance(amino_seqn, amino_acid, peptide_abundance, amino_acid_abundance):
    """Weight a peptide's abundance by how often one residue occurs in it.

    Computes ``count * (peptide_abundance * amino_acid_abundance / 100)``,
    rounded to 5 decimal places, where ``count`` is the number of
    non-overlapping occurrences of ``amino_acid`` in ``amino_seqn``.

    Note: the last parameter shares its name with this function and shadows
    it inside the body; the name is kept for caller compatibility.
    """
    occurrences = amino_seqn.count(amino_acid)
    per_occurrence = (peptide_abundance * amino_acid_abundance) / 100
    return round(occurrences * per_occurrence, 5)

# Element-wise wrapper so amino_acid_abundance can be called on whole columns.
vect_func = np.vectorize(amino_acid_abundance)

# Residue letter -> abundance percentage passed to amino_acid_abundance.
# Insertion order fixes the order in which the *_abundance columns are created.
# NOTE(review): the Wikipedia table cited for these values appears to list
# isoleucine (I) at 5.49 and valine (V) at 6.73, i.e. swapped relative to the
# numbers here - TODO confirm. Left unchanged because the test-time feature
# code must use exactly the same constants as training.
residue_pct = {
    'I': 6.73,
    'V': 5.49,
    'F': 3.87,
    'Y': 2.91,
    'W': 1.25,
    'A': 8.76,
    'L': 9.68,
}
for residue, pct in residue_pct.items():
    pro_pep_join[residue + '_abundance'] = vect_func(
        pro_pep_join['Peptide'], residue, pro_pep_join['PeptideAbundance'], pct
    )

# Sum every residue column per visit; output columns are named by residue letter.
visit_keys = ['patient_id','visit_id','visit_month']
summed_columns = {
    letter: (letter + '_abundance', 'sum')
    for letter in ['F', 'Y', 'W', 'A', 'L', 'V', 'I']
}
pro_pep_join_agg = (
    pro_pep_join
    .groupby(visit_keys)
    .agg(**summed_columns)
    .reset_index(level=visit_keys)
)

pro_pep_join_agg.head(10)

Unnamed: 0,patient_id,visit_id,visit_month,F,Y,W,A,L,V,I
0,55,55_0,0,11871860.0,7520846.0,478382.42875,57345610.0,73428200.0,55443800.0,8991024.0
1,55,55_12,12,12498370.0,7993317.0,484364.63337,57922940.0,76002350.0,53249860.0,9488818.0
2,55,55_36,36,13194760.0,8373198.0,533001.00401,63354650.0,80600060.0,57199200.0,8898225.0
3,55,55_6,6,11342110.0,7076986.0,450325.41649,54786530.0,68094480.0,48801910.0,8293410.0
4,942,942_12,12,10177700.0,6625098.0,445507.01004,45596310.0,61179320.0,40641280.0,6966456.0
5,942,942_24,24,9903217.0,6586329.0,452366.38213,50550810.0,60517210.0,41432080.0,7087011.0
6,942,942_48,48,10776500.0,6268919.0,449311.39699,54092100.0,62292910.0,42879190.0,6806131.0
7,942,942_6,6,9460291.0,6086457.0,438816.94926,45813460.0,58435740.0,38747510.0,7189919.0
8,1517,1517_0,0,9868730.0,6870025.0,402572.45311,51208000.0,60781890.0,38402560.0,7175566.0
9,1517,1517_24,24,9686343.0,6386043.0,398692.42975,46937290.0,59862000.0,39419390.0,7466161.0


In [7]:
# Combine the per-visit amino-acid features with the clinical scores.
# The outer join keeps visits that have only one of the two kinds of data,
# leaving NaN in the columns coming from the missing side.
full_training_data = pro_pep_join_agg.merge(
    clinical_data,
    on=['patient_id','visit_id','visit_month'],
    how='outer',
)

print(full_training_data.shape)
print(full_training_data['visit_id'].nunique())

gc.collect()

# Spot-check a single patient.
full_training_data.loc[full_training_data['patient_id']==55]

(4883, 14)
4883


Unnamed: 0,patient_id,visit_id,visit_month,F,Y,W,A,L,V,I,updrs_1,updrs_2,updrs_3,updrs_4
0,55,55_0,0,11871860.0,7520846.0,478382.42875,57345610.0,73428200.0,55443800.0,8991024.0,10.0,6.0,15.0,
1,55,55_12,12,12498370.0,7993317.0,484364.63337,57922940.0,76002350.0,53249860.0,9488818.0,10.0,10.0,41.0,0.0
2,55,55_36,36,13194760.0,8373198.0,533001.00401,63354650.0,80600060.0,57199200.0,8898225.0,17.0,18.0,51.0,0.0
3,55,55_6,6,11342110.0,7076986.0,450325.41649,54786530.0,68094480.0,48801910.0,8293410.0,8.0,10.0,34.0,
3336,55,55_3,3,,,,,,,,10.0,7.0,25.0,
3337,55,55_9,9,,,,,,,,8.0,9.0,30.0,0.0
3338,55,55_18,18,,,,,,,,7.0,13.0,38.0,0.0
3339,55,55_24,24,,,,,,,,16.0,9.0,49.0,0.0
3340,55,55_30,30,,,,,,,,14.0,13.0,49.0,0.0
3341,55,55_42,42,,,,,,,,12.0,20.0,41.0,0.0


In [8]:
# Impute missing values column by column:
#   * amino-acid feature columns -> 0 (the visit has no protein/peptide data)
#   * UPDRS score columns        -> that column's median, rounded to a whole number
#
# The filled column is assigned back explicitly. The previous form,
# `full_training_data[i].fillna(..., inplace=True)`, is chained in-place
# assignment: it emits a FutureWarning on recent pandas 2.x and does nothing
# under Copy-on-Write, which would leave the NaNs in place.
amino_acid_columns = ['F','Y','W','L','A','I','V']
columns_to_remove_nan_from = ['updrs_1','updrs_2','updrs_3','updrs_4'] + amino_acid_columns

for i in columns_to_remove_nan_from:
    if i in amino_acid_columns:
        fill_value = 0
    else:
        fill_value = full_training_data[i].median().round(decimals = 0)
    full_training_data[i] = full_training_data[i].fillna(fill_value)

In [9]:
full_training_data = full_training_data.drop_duplicates()

# Order visits chronologically within each patient so shift() yields neighbouring visits.
full_training_data_sorted = full_training_data.sort_values(by=['patient_id','visit_month'],  ascending=True)

# Previous / next visit month for the same patient; -1 marks "no such visit"
# (first visit has no lag, last visit has no lead).
# The fill is chained onto the shifted Series and assigned once, instead of
# calling fillna(inplace=True) on a column selection (chained in-place
# assignment, which is deprecated and a no-op under pandas Copy-on-Write).
full_training_data_sorted['visit_month_lag'] = (
    full_training_data_sorted.groupby(['patient_id'])['visit_month'].shift(1).fillna(-1)
)
full_training_data_sorted['visit_month_lead'] = (
    full_training_data_sorted.groupby(['patient_id'])['visit_month'].shift(-1).fillna(-1)
)

# Spot-check a single patient.
full_training_data_sorted[full_training_data_sorted['patient_id']==55]

Unnamed: 0,patient_id,visit_id,visit_month,F,Y,W,A,L,V,I,updrs_1,updrs_2,updrs_3,updrs_4,visit_month_lag,visit_month_lead
0,55,55_0,0,11871860.0,7520846.0,478382.42875,57345610.0,73428200.0,55443800.0,8991024.0,10.0,6.0,15.0,0.0,-1.0,3.0
3336,55,55_3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,7.0,25.0,0.0,0.0,6.0
3,55,55_6,6,11342110.0,7076986.0,450325.41649,54786530.0,68094480.0,48801910.0,8293410.0,8.0,10.0,34.0,0.0,3.0,9.0
3337,55,55_9,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,9.0,30.0,0.0,6.0,12.0
1,55,55_12,12,12498370.0,7993317.0,484364.63337,57922940.0,76002350.0,53249860.0,9488818.0,10.0,10.0,41.0,0.0,9.0,18.0
3338,55,55_18,18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,13.0,38.0,0.0,12.0,24.0
3339,55,55_24,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,9.0,49.0,0.0,18.0,30.0
3340,55,55_30,30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,13.0,49.0,0.0,24.0,36.0
2,55,55_36,36,13194760.0,8373198.0,533001.00401,63354650.0,80600060.0,57199200.0,8898225.0,17.0,18.0,51.0,0.0,30.0,42.0
3341,55,55_42,42,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,20.0,41.0,0.0,36.0,48.0


In [10]:
# Renumber rows 0..n-1 (discarding the old index) and drop patient_id.
# Not re-runnable as-is: patient_id is gone after the first run.
full_training_data_sorted = (
    full_training_data_sorted
    .reset_index(drop=True)
    .drop(columns=['patient_id'])
)

# The unsorted frame is no longer needed.
del full_training_data
gc.collect()
full_training_data_sorted.head(10)

Unnamed: 0,visit_id,visit_month,F,Y,W,A,L,V,I,updrs_1,updrs_2,updrs_3,updrs_4,visit_month_lag,visit_month_lead
0,35_0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,3.0,16.0,0.0,-1.0,36.0
1,35_36,36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,4.0,20.0,0.0,0.0,-1.0
2,55_0,0,11871860.0,7520846.0,478382.42875,57345610.0,73428200.0,55443800.0,8991024.0,10.0,6.0,15.0,0.0,-1.0,3.0
3,55_3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,7.0,25.0,0.0,0.0,6.0
4,55_6,6,11342110.0,7076986.0,450325.41649,54786530.0,68094480.0,48801910.0,8293410.0,8.0,10.0,34.0,0.0,3.0,9.0
5,55_9,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,9.0,30.0,0.0,6.0,12.0
6,55_12,12,12498370.0,7993317.0,484364.63337,57922940.0,76002350.0,53249860.0,9488818.0,10.0,10.0,41.0,0.0,9.0,18.0
7,55_18,18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,13.0,38.0,0.0,12.0,24.0
8,55_24,24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,9.0,49.0,0.0,18.0,30.0
9,55_30,30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,13.0,49.0,0.0,24.0,36.0


In [11]:
# Correlation matrix restricted to visits at months 0, 6, 12 and 24.
corr_columns = [
    'updrs_1','updrs_2','updrs_3','updrs_4',
    'visit_month','visit_month_lag','visit_month_lead',
    'F','Y','W','A','L','I','V',
]
is_selected_month = full_training_data_sorted['visit_month'].isin([0,6,12,24])
temp = full_training_data_sorted.loc[is_selected_month]
temp[corr_columns].corr()

Unnamed: 0,updrs_1,updrs_2,updrs_3,updrs_4,visit_month,visit_month_lag,visit_month_lead,F,Y,W,A,L,I,V
updrs_1,1.0,0.587245,0.276154,0.302327,0.046661,0.07247,-0.017046,0.027562,0.020676,0.023135,0.02455,0.024678,0.029252,0.027093
updrs_2,0.587245,1.0,0.588422,0.277829,0.026213,0.055592,-0.054681,-0.059867,-0.067454,-0.060091,-0.064476,-0.063799,-0.063025,-0.064526
updrs_3,0.276154,0.588422,1.0,0.193204,0.038724,0.05548,0.084157,-0.143829,-0.14687,-0.143623,-0.147247,-0.145119,-0.148623,-0.14718
updrs_4,0.302327,0.277829,0.193204,1.0,0.07516,0.080621,-0.020622,0.034805,0.030697,0.032843,0.032706,0.033419,0.03786,0.036105
visit_month,0.046661,0.026213,0.038724,0.07516,1.0,0.957017,0.543097,0.18364,0.180795,0.180165,0.182631,0.181963,0.183632,0.181346
visit_month_lag,0.07247,0.055592,0.05548,0.080621,0.957017,1.0,0.532109,0.216692,0.212963,0.213875,0.215819,0.214422,0.216355,0.213869
visit_month_lead,-0.017046,-0.054681,0.084157,-0.020622,0.543097,0.532109,1.0,-0.006442,-0.007334,-0.006942,-0.006946,-0.007396,-0.006112,-0.006876
F,0.027562,-0.059867,-0.143829,0.034805,0.18364,0.216692,-0.006442,1.0,0.993552,0.976171,0.996257,0.997518,0.992823,0.994122
Y,0.020676,-0.067454,-0.14687,0.030697,0.180795,0.212963,-0.007334,0.993552,1.0,0.976836,0.99595,0.996607,0.992233,0.989537
W,0.023135,-0.060091,-0.143623,0.032843,0.180165,0.213875,-0.006942,0.976171,0.976836,1.0,0.983377,0.976009,0.979174,0.973319


In [12]:
# Correlation matrix of the UPDRS scores, visit-timing columns and amino-acid features over every row.
full_training_data_sorted[['updrs_1','updrs_2','updrs_3','updrs_4','visit_month','visit_month_lag','visit_month_lead','F','Y','W','A','L','I','V']].corr()

Unnamed: 0,updrs_1,updrs_2,updrs_3,updrs_4,visit_month,visit_month_lag,visit_month_lead,F,Y,W,A,L,I,V
updrs_1,1.0,0.630341,0.303248,0.384606,0.156118,0.167924,0.08028,0.003302,-0.002158,0.005073,0.000846,0.00035,0.004926,0.002394
updrs_2,0.630341,1.0,0.594681,0.34265,0.122514,0.135629,0.031582,-0.077522,-0.083569,-0.072161,-0.079959,-0.080448,-0.078919,-0.079992
updrs_3,0.303248,0.594681,1.0,0.211337,0.046043,0.060763,-0.008986,-0.142694,-0.145407,-0.137828,-0.144748,-0.143617,-0.145787,-0.144171
updrs_4,0.384606,0.34265,0.211337,1.0,0.207707,0.225779,0.112113,0.03154,0.028263,0.031182,0.029818,0.030282,0.034025,0.03251
visit_month,0.156118,0.122514,0.046043,0.207707,1.0,0.93888,0.50229,0.07883,0.078995,0.08005,0.07805,0.077956,0.082193,0.076101
visit_month_lag,0.167924,0.135629,0.060763,0.225779,0.93888,1.0,0.573298,0.118905,0.118943,0.120012,0.118077,0.118076,0.122458,0.116096
visit_month_lead,0.08028,0.031582,-0.008986,0.112113,0.50229,0.573298,1.0,0.150708,0.150127,0.151317,0.149648,0.14935,0.153654,0.148305
F,0.003302,-0.077522,-0.142694,0.03154,0.07883,0.118905,0.150708,1.0,0.994303,0.978127,0.996756,0.99774,0.993318,0.994551
Y,-0.002158,-0.083569,-0.145407,0.028263,0.078995,0.118943,0.150127,0.994303,1.0,0.978139,0.99615,0.996846,0.992448,0.990144
W,0.005073,-0.072161,-0.137828,0.031182,0.08005,0.120012,0.151317,0.978127,0.978139,1.0,0.98442,0.977381,0.98069,0.974779


In [13]:
# Count the remaining missing values in each column.
full_training_data_sorted.isna().sum()

visit_id            0
visit_month         0
F                   0
Y                   0
W                   0
A                   0
L                   0
V                   0
I                   0
updrs_1             0
updrs_2             0
updrs_3             0
updrs_4             0
visit_month_lag     0
visit_month_lead    0
dtype: int64

In [14]:
def smape_plus_1(y_true, y_pred):
    """Symmetric mean absolute percentage error of ``y + 1``, in percent.

    Both inputs are shifted by +1 before the usual SMAPE formula
    ``|t - p| / ((|t| + |p|) / 2)`` is applied element-wise. Positions where
    both shifted values are 0 contribute 0 rather than dividing by zero.
    NaN entries are ignored by the final mean.

    Inputs are converted to float NumPy arrays first, so elements are always
    compared by position. Previously a plain list raised ``TypeError`` on
    ``y_true + 1``, and two pandas Series with different indexes were
    silently re-aligned by label before subtraction.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Actual and predicted values with broadcast-compatible shapes.

    Returns
    -------
    float
        ``100 * mean(per-element SMAPE)``.
    """
    y_true_plus_1 = np.asarray(y_true, dtype=float) + 1
    y_pred_plus_1 = np.asarray(y_pred, dtype=float) + 1

    numerator = np.abs(y_true_plus_1 - y_pred_plus_1)
    denominator = (np.abs(y_true_plus_1) + np.abs(y_pred_plus_1)) / 2

    # Start from zeros and divide only where the denominator is non-zero;
    # the denominator is 0 exactly when both shifted values are 0.
    metric = np.zeros_like(numerator)
    np.divide(numerator, denominator, out=metric, where=denominator != 0)

    return 100 * np.nanmean(metric)

In [15]:
ridge_model_dict = {}
list_of_updrs_tests = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']

# NOTE: `temp` is bound to the same DataFrame object as full_training_data_sorted
# (no copy), so the prediction column added below is added to that frame too.
temp = full_training_data_sorted

# First-stage model: updrs_3 from visit_month plus the seven amino-acid features.
X = temp[['visit_month','F','Y','W','A','L','I','V']]
print(X.shape)
y = temp['updrs_3']
print(y.shape)

# Degree-5 polynomial expansion of the raw features.
# The expander is fitted once, on X. The former extra call `poly.fit(X_poly, y)`
# re-fitted it on its own expanded output, leaving `poly` expecting the expanded
# column count as input and therefore unable to transform the original feature
# columns again; it has been removed.
poly = PolynomialFeatures(degree = 5)
X_poly = poly.fit_transform(X)
print(X_poly.shape)

trained = Ridge(alpha=1.00000000e+01).fit(X_poly, y)
ridge_model_dict['updrs_3'] = trained

# In-sample predictions, rounded up to whole numbers and floored at 0.
temp['predicted_updrs_3'] = np.ceil(ridge_model_dict['updrs_3'].predict(X_poly))
temp['predicted_updrs_3'] = temp['predicted_updrs_3'].clip(lower=0)

(4883, 8)
(4883,)
(4883, 1287)


In [16]:
# Actual vs. predicted updrs_3, restricted to the month-24 visits.
is_plotted_month = temp['visit_month'].isin([24])
pred_vs_true = temp.loc[is_plotted_month, ['visit_id','visit_month','updrs_3','predicted_updrs_3']]

# Long format: one row per (visit_id, variable) so each series is coloured separately.
pred_vs_true_melt = pred_vs_true.melt(
    id_vars=['visit_id'],
    value_vars=['updrs_3','predicted_updrs_3'],
)

fig = px.line(
    pred_vs_true_melt,
    x=pred_vs_true_melt['visit_id'],
    y=pred_vs_true_melt['value'],
    color='variable',
)

fig.show()

In [17]:
"""def function_to_minimize(x):    
    metric = smape_plus_1(
        y_true=temp['updrs_3'], 
        y_pred=x
    )
    return metric

test = minimize(fun=function_to_minimize, x0=temp['predicted_updrs_3'], method='Powell')
temp['smape_minned_updrs_3'] = test.x.tolist()
temp['predicted_updrs_3'] = temp['smape_minned_updrs_3']"""

"def function_to_minimize(x):    \n    metric = smape_plus_1(\n        y_true=temp['updrs_3'], \n        y_pred=x\n    )\n    return metric\n\ntest = minimize(fun=function_to_minimize, x0=temp['predicted_updrs_3'], method='Powell')\ntemp['smape_minned_updrs_3'] = test.x.tolist()\ntemp['predicted_updrs_3'] = temp['smape_minned_updrs_3']"

In [18]:
# In-sample SMAPE+1 for updrs_3: scored on the same rows the model was fitted on.
print(smape_plus_1(temp['updrs_3'],temp['predicted_updrs_3']))

52.65089538126861


In [19]:
# Second-stage model: updrs_2 from visit_month, the in-sample updrs_3
# predictions, and the L and V amino-acid features.
X = temp[['visit_month','predicted_updrs_3','L','V']]
print(X.shape)
y = temp['updrs_2']
print(y.shape)

# Degree-11 polynomial expansion of the four inputs.
# The expander is fitted once, on X. The former extra call `poly.fit(X_poly, y)`
# re-fitted it on its own expanded output, leaving `poly` expecting the expanded
# column count as input and therefore unable to transform the original feature
# columns again; it has been removed.
poly = PolynomialFeatures(degree = 11)
X_poly = poly.fit_transform(X)
print(X_poly.shape)

trained = Ridge(alpha=1.00000000e+6).fit(X_poly, y)
ridge_model_dict['updrs_2'] = trained

# In-sample predictions, rounded up to whole numbers and floored at 0.
temp['predicted_updrs_2'] = np.ceil(ridge_model_dict['updrs_2'].predict(X_poly))
temp['predicted_updrs_2'] = temp['predicted_updrs_2'].clip(lower=0)

(4883, 4)
(4883,)
(4883, 1365)


In [20]:
# Actual vs. predicted updrs_2 for visits at months 0, 6, 12 and 24.
is_plotted_month = temp['visit_month'].isin([0,6,12,24])
pred_vs_true = temp.loc[is_plotted_month, ['visit_id','visit_month','updrs_2','predicted_updrs_2']]

# Long format: one row per (visit_id, variable) so each series is coloured separately.
pred_vs_true_melt = pred_vs_true.melt(
    id_vars=['visit_id'],
    value_vars=['updrs_2','predicted_updrs_2'],
)

fig = px.line(
    pred_vs_true_melt,
    x=pred_vs_true_melt['visit_id'],
    y=pred_vs_true_melt['value'],
    color='variable',
)

fig.show()

In [21]:
# In-sample SMAPE+1 for updrs_2: scored on the same rows the model was fitted on.
print(smape_plus_1(temp['updrs_2'],temp['predicted_updrs_2']))

63.338032897911


In [22]:
# Third-stage model: updrs_1 from visit_month and the in-sample predictions
# of updrs_2 and updrs_3.
X = temp[['visit_month','predicted_updrs_2','predicted_updrs_3']]
print(X.shape)
y = temp['updrs_1']
print(y.shape)

# Degree-3 polynomial expansion of the three inputs.
# The expander is fitted once, on X. The former extra call `poly.fit(X_poly, y)`
# re-fitted it on its own expanded output, leaving `poly` expecting the expanded
# column count as input and therefore unable to transform the original feature
# columns again; it has been removed.
poly = PolynomialFeatures(degree = 3)
X_poly = poly.fit_transform(X)
print(X_poly.shape)

trained = Ridge(alpha=10).fit(X_poly, y)
ridge_model_dict['updrs_1'] = trained

# In-sample predictions, rounded up to whole numbers and floored at 0.
temp['predicted_updrs_1'] = np.ceil(ridge_model_dict['updrs_1'].predict(X_poly))
temp['predicted_updrs_1'] = temp['predicted_updrs_1'].clip(lower=0)

(4883, 3)
(4883,)
(4883, 20)


In [23]:
# Actual vs. predicted updrs_1 for visits at months 0, 6, 12 and 24.
is_plotted_month = temp['visit_month'].isin([0,6,12,24])
pred_vs_true = temp.loc[is_plotted_month, ['visit_id','visit_month','updrs_1','predicted_updrs_1']]

# Long format: one row per (visit_id, variable) so each series is coloured separately.
pred_vs_true_melt = pred_vs_true.melt(
    id_vars=['visit_id'],
    value_vars=['updrs_1','predicted_updrs_1'],
)

fig = px.line(
    pred_vs_true_melt,
    x=pred_vs_true_melt['visit_id'],
    y=pred_vs_true_melt['value'],
    color='variable',
)

fig.show()

In [24]:
# In-sample SMAPE+1 for updrs_1: scored on the same rows the model was fitted on.
print(smape_plus_1(temp['updrs_1'],temp['predicted_updrs_1']))

53.34869306001253


In [25]:
# Fourth-stage model: updrs_4 from visit_month and the in-sample predictions
# of the other three scores.
X = temp[['visit_month','predicted_updrs_1','predicted_updrs_2','predicted_updrs_3']]
print(X.shape)
y = temp['updrs_4']
print(y.shape)

# Degree-7 polynomial expansion of the four inputs.
# The expander is fitted once, on X. The former extra call `poly.fit(X_poly, y)`
# re-fitted it on its own expanded output, leaving `poly` expecting the expanded
# column count as input and therefore unable to transform the original feature
# columns again; it has been removed.
poly = PolynomialFeatures(degree = 7)
X_poly = poly.fit_transform(X)
print(X_poly.shape)

trained = Ridge(alpha=1.00000000e+05).fit(X_poly, y)
ridge_model_dict['updrs_4'] = trained

# In-sample predictions, rounded up to whole numbers and floored at 0.
temp['predicted_updrs_4'] = np.ceil(ridge_model_dict['updrs_4'].predict(X_poly))
temp['predicted_updrs_4'] = temp['predicted_updrs_4'].clip(lower=0)

(4883, 4)
(4883,)
(4883, 330)


In [26]:
# Actual vs. predicted updrs_4 for visits at months 0, 6, 12 and 24.
is_plotted_month = temp['visit_month'].isin([0,6,12,24])
pred_vs_true = temp.loc[is_plotted_month, ['visit_id','visit_month','updrs_4','predicted_updrs_4']]

# Long format: one row per (visit_id, variable) so each series is coloured separately.
pred_vs_true_melt = pred_vs_true.melt(
    id_vars=['visit_id'],
    value_vars=['updrs_4','predicted_updrs_4'],
)

fig = px.line(
    pred_vs_true_melt,
    x=pred_vs_true_melt['visit_id'],
    y=pred_vs_true_melt['value'],
    color='variable',
)

fig.show()

In [27]:
# In-sample SMAPE+1 for updrs_4: scored on the same rows the model was fitted on.
print(smape_plus_1(temp['updrs_4'],temp['predicted_updrs_4']))

69.35926449299173


In [28]:
# `temp` was bound to full_training_data_sorted without copying, so both names
# refer to the same DataFrame and report the same shape, predicted_* columns included.
print(full_training_data_sorted.shape)
print(temp.shape)

(4883, 19)
(4883, 19)


In [29]:
# Print the in-sample SMAPE+1 of every UPDRS target against its predicted_* column.
for u in list_of_updrs_tests:
    y_true, y_pred = temp[u], temp['predicted_'+u]
    score = smape_plus_1(y_true.to_numpy(), y_pred.to_numpy())
    print('SMAPE + 1 for', u, ':', score)

SMAPE + 1 for updrs_1 : 53.34869306001253
SMAPE + 1 for updrs_2 : 63.338032897911
SMAPE + 1 for updrs_3 : 52.65089538126861
SMAPE + 1 for updrs_4 : 69.35926449299173


In [30]:
# Columns that together identify one patient visit.
_VISIT_KEYS = ['patient_id', 'visit_id', 'visit_month']

# Residue letter -> fourth argument forwarded to vect_func for that residue.
# NOTE(review): these look like reference amino-acid frequencies (%) — confirm against vect_func.
_RESIDUE_WEIGHTS = {'I': 6.73, 'V': 5.49, 'F': 3.87, 'Y': 2.91, 'W': 1.25, 'A': 8.76, 'L': 9.68}

# Order of the per-visit peptide feature columns in the returned frame.
_PEPTIDE_FEATURES = ['F', 'Y', 'W', 'A', 'L', 'V', 'I']


def _with_visit_month_shift(visits, column, periods):
    """Sort by (patient_id, visit_month) and add `column`: the per-patient visit_month shifted by `periods`, -1 where absent."""
    ordered = visits.sort_values(by=['patient_id', 'visit_month'], ascending=True)
    shifted = ordered.groupby(['patient_id'])['visit_month'].shift(periods)
    # Plain assignment of the filled Series; `col.fillna(..., inplace=True)` is chained
    # assignment and does not update the frame under pandas Copy-on-Write.
    return ordered.assign(**{column: shifted.fillna(-1)})


def _dedupe_select_reindex(frame, columns):
    """Drop duplicate rows, keep only `columns` (in that order) and renumber the index from 0."""
    return frame.drop_duplicates()[columns].reset_index(drop=True)


def get_full_test_data(df_test, df_proteins=None, df_peptides=None):
    """Build the per-visit feature frame used at prediction time.

    The branch taken depends on which of `df_proteins` / `df_peptides` contain rows:

    * neither            -> visit_id, visit_month, visit_month_lag
    * peptides only      -> the above plus visit_month_lead and the peptide features
                            F, Y, W, A, L, V, I (per-visit sums of the values returned by
                            vect_func; visits without peptide rows get 0)
    * proteins only/both -> visit_id, visit_month, visit_month_lag (the protein/peptide
                            frames are only joined on the visit keys; no measurement
                            column is carried into the result)

    Parameters
    ----------
    df_test : pandas.DataFrame
        Must contain 'patient_id', 'visit_id' and 'visit_month'.
    df_proteins, df_peptides : pandas.DataFrame or None
        Optional measurement frames. None or a frame with zero rows means "absent".
        Neither frame is modified.

    Returns
    -------
    pandas.DataFrame
        One row per distinct visit, sorted by patient_id then visit_month, with a fresh
        0..n-1 index. visit_month_lag / visit_month_lead hold the same patient's
        previous / next visit_month, or -1 when there is none.
    """
    if df_proteins is None:
        df_proteins = pd.DataFrame()
    if df_peptides is None:
        df_peptides = pd.DataFrame()

    has_proteins = df_proteins.shape[0] != 0
    has_peptides = df_peptides.shape[0] != 0
    lag_columns = ['visit_id', 'visit_month', 'visit_month_lag']

    if not has_proteins and not has_peptides:

        print('only the test dataframe has data, proteins and peptides info absent')

        visits = df_test[_VISIT_KEYS].drop_duplicates()
        visits = _with_visit_month_shift(visits, 'visit_month_lag', 1)

        return _dedupe_select_reindex(visits, lag_columns)

    if has_peptides and not has_proteins:

        print('no proteins only peptides')
        print('df_test shape and number of unique visit_ids:', df_test.shape, df_test.visit_id.nunique())
        print('df_peptides shape and number of unique visit_ids:', df_peptides.shape, df_peptides.visit_id.nunique())
        visits = df_test[_VISIT_KEYS].drop_duplicates()

        # Work on a copy so the caller's peptide frame is left untouched.
        peptides = df_peptides.copy()
        peptides['Peptide'] = peptides['Peptide'].apply(remove_unimod)
        for residue, weight in _RESIDUE_WEIGHTS.items():
            peptides[residue + '_abundance'] = vect_func(
                peptides['Peptide'], residue, peptides['PeptideAbundance'], weight)

        # One row per visit: sum every residue's per-peptide value.
        per_visit = peptides.groupby(_VISIT_KEYS).agg(
            **{residue: (residue + '_abundance', 'sum') for residue in _PEPTIDE_FEATURES}
        ).reset_index()

        # Left join keeps visits that have no peptide rows; their features become 0.
        features = pd.merge(visits, per_visit, on=_VISIT_KEYS, how='left')
        features = features[_VISIT_KEYS + _PEPTIDE_FEATURES].fillna(dict.fromkeys(_PEPTIDE_FEATURES, 0))
        features = features.drop_duplicates()

        features = _with_visit_month_shift(features, 'visit_month_lag', 1)
        features = _with_visit_month_shift(features, 'visit_month_lead', -1)
        print('full_test_data shape and number of unique visit_ids:', features.shape, features.visit_id.nunique())

        return _dedupe_select_reindex(
            features,
            ['visit_id', 'visit_month', 'visit_month_lag', 'visit_month_lead'] + _PEPTIDE_FEATURES)

    if has_proteins and not has_peptides:

        print('no peptides only proteins')
        print('df_test shape and number of unique visit_ids:', df_test.shape, df_test.visit_id.nunique())
        print('df_proteins shape and number of unique visit_ids:', df_proteins.shape, df_proteins.visit_id.nunique())
        visits = df_test[_VISIT_KEYS].drop_duplicates()

        # Only the key columns survive the join, so proteins contribute no features here.
        joined = pd.merge(visits, df_proteins, on=_VISIT_KEYS, how='left')
        joined = joined[_VISIT_KEYS].drop_duplicates()

        joined = _with_visit_month_shift(joined, 'visit_month_lag', 1)
        print('full_test_data shape and number of unique visit_ids:', joined.shape, joined.visit_id.nunique())

        return _dedupe_select_reindex(joined, lag_columns)

    print('both proteins and peptides are present')

    visits = df_test[_VISIT_KEYS].drop_duplicates()

    # Outer join only to collect the set of visits that have any measurement.
    pro_pep_join = pd.merge(df_proteins, df_peptides, on=_VISIT_KEYS + ['UniProt'],
                            suffixes=('_left', '_right'), how='outer')
    pro_pep_join = pro_pep_join[_VISIT_KEYS].drop_duplicates()
    print('df_proteins shape and number of unique visit_ids:', df_proteins.shape, df_proteins.visit_id.nunique())
    print('df_peptides shape and number of unique visit_ids:', df_peptides.shape, df_peptides.visit_id.nunique())
    print('pro_pep_join shape and number of unique visit_ids:', pro_pep_join.shape, pro_pep_join.visit_id.nunique())

    joined = pd.merge(visits, pro_pep_join, on=_VISIT_KEYS, how='left')
    joined = _with_visit_month_shift(joined, 'visit_month_lag', 1)
    print('full_test_data shape and number of unique visit_ids:', joined.shape, joined.visit_id.nunique())

    return _dedupe_select_reindex(joined, lag_columns)

In [31]:
def get_predictions_v4(model_dict, df_test, df_proteins=None, df_peptides=None):
    """Predict UPDRS ratings with a chain of models that only needs visit_month.

    For every horizon in (0, 6, 12, 24) months the models are chained:
    updrs_1 <- visit_month; updrs_2 <- visit_month + updrs_1;
    updrs_3 <- updrs_1 + updrs_2; updrs_4 <- visit_month + updrs_1..3.
    Each prediction is rounded up with np.ceil.

    Parameters
    ----------
    model_dict : dict
        Maps 'updrs_1'..'updrs_4' to fitted estimators exposing `.predict`.
    df_test : pandas.DataFrame
        Test visits; forwarded to get_full_test_data.
    df_proteins, df_peptides : pandas.DataFrame or None
        Optional measurement frames forwarded to get_full_test_data (None means empty).

    Returns
    -------
    pandas.DataFrame
        Columns 'prediction_id' ('<visit_id>_updrs_<u>_plus_<m>_months') and 'rating'.
        Ratings for updrs_1..3 are clipped at 0; updrs_4 is always submitted as 0.
        Empty (with both columns) when there are no test visits.
    """
    horizons = [0, 6, 12, 24]
    output_columns = ['prediction_id', 'rating']

    # Normalise None to an empty frame so get_full_test_data always receives DataFrames.
    df_proteins = pd.DataFrame() if df_proteins is None else df_proteins
    df_peptides = pd.DataFrame() if df_peptides is None else df_peptides

    full_test_data = get_full_test_data(df_test=df_test, df_proteins=df_proteins, df_peptides=df_peptides)

    if full_test_data.shape[0] == 0:
        return pd.DataFrame(columns=output_columns)

    # NOTE(review): the horizon `m` only names the output columns; the model inputs do not
    # depend on it, so every horizon of a visit gets the same prediction. Confirm intended.
    for m in horizons:
        suffix = '_plus_' + str(m) + '_months'
        col_1, col_2, col_3, col_4 = ['predicted_updrs_' + str(u) + suffix for u in (1, 2, 3, 4)]

        full_test_data[col_1] = np.ceil(model_dict['updrs_1'].predict(full_test_data[['visit_month']]))
        full_test_data[col_2] = np.ceil(model_dict['updrs_2'].predict(full_test_data[['visit_month', col_1]]))
        full_test_data[col_3] = np.ceil(model_dict['updrs_3'].predict(full_test_data[[col_1, col_2]]))
        full_test_data[col_4] = np.ceil(
            model_dict['updrs_4'].predict(full_test_data[['visit_month', col_1, col_2, col_3]]))

    # Collect the pieces and concatenate once (DataFrame.append was removed in pandas 2.0).
    pieces = []
    for m in horizons:
        for u in [1, 2, 3, 4]:
            label = '_updrs_' + str(u) + '_plus_' + str(m) + '_months'
            if u == 4:
                # updrs_4 is deliberately submitted as 0 regardless of the model output.
                rating = 0
            else:
                # Clip negatives to 0; adding 0.0 turns the -0.0 left by np.ceil into 0.0.
                rating = full_test_data['predicted' + label].clip(lower=0) + 0.0
            pieces.append(pd.DataFrame({
                'prediction_id': full_test_data['visit_id'] + label,
                'rating': rating,
            }))

    result = pd.concat(pieces)
    result = result.drop_duplicates(subset=output_columns).reset_index(drop=True)

    return result

In [32]:
def get_predictions_v3(model_dict, df_test, df_proteins=None, df_peptides=None):
    """Predict UPDRS ratings with chained polynomial models on visit_month and peptide features.

    Prediction order is updrs_3 -> updrs_2 -> updrs_1 -> updrs_4; each later stage consumes
    earlier predictions as inputs. Every stage expands its inputs with PolynomialFeatures,
    rounds the model output up with np.ceil and clips negatives to 0.

    Parameters
    ----------
    model_dict : dict
        Maps 'updrs_1'..'updrs_4' to fitted estimators exposing `.predict`.
    df_test : pandas.DataFrame
        Test visits; forwarded to get_full_test_data.
    df_proteins, df_peptides : pandas.DataFrame or None
        Optional measurement frames forwarded to get_full_test_data (None means empty).

    Returns
    -------
    pandas.DataFrame
        Columns 'prediction_id' ('<visit_id>_updrs_<u>_plus_<m>_months' for m in
        0, 6, 12, 24) and 'rating'. Empty (with both columns) when there are no test visits.
    """
    output_columns = ['prediction_id', 'rating']
    peptide_features = ['F', 'Y', 'W', 'A', 'L', 'V', 'I']

    # (target, input columns, polynomial degree), in prediction order.
    # NOTE(review): each degree presumably has to equal the one used when the matching
    # model was fitted (degree 7 on 4 inputs gives the 330 columns printed for updrs_4) — confirm.
    stages = [
        ('updrs_3', ['visit_month', 'F', 'Y', 'W', 'A', 'L', 'I', 'V'], 5),
        ('updrs_2', ['visit_month', 'predicted_updrs_3', 'L', 'V'], 11),
        ('updrs_1', ['visit_month', 'predicted_updrs_2', 'predicted_updrs_3'], 3),
        ('updrs_4', ['visit_month', 'predicted_updrs_1', 'predicted_updrs_2', 'predicted_updrs_3'], 7),
    ]

    # Normalise None to an empty frame so get_full_test_data always receives DataFrames.
    df_proteins = pd.DataFrame() if df_proteins is None else df_proteins
    df_peptides = pd.DataFrame() if df_peptides is None else df_peptides

    full_test_data = get_full_test_data(df_test=df_test, df_proteins=df_proteins, df_peptides=df_peptides)

    if full_test_data.shape[0] == 0:
        return pd.DataFrame(columns=output_columns)

    # get_full_test_data only emits the peptide features on its "peptides only" branch.
    # Fall back to 0 (the value it uses for visits without peptide rows) instead of
    # failing with a KeyError when a batch arrives without peptide data.
    for feature in peptide_features:
        if feature not in full_test_data.columns:
            full_test_data[feature] = 0.0

    for target, input_columns, degree in stages:
        expanded = PolynomialFeatures(degree=degree).fit_transform(full_test_data[input_columns])
        predicted_column = 'predicted_' + target
        full_test_data[predicted_column] = np.ceil(model_dict[target].predict(expanded))
        full_test_data[predicted_column] = full_test_data[predicted_column].clip(lower=0)

    # NOTE(review): the horizon `m` only changes prediction_id; every horizon of a visit
    # receives the same rating. Confirm this is intended.
    # Collect the pieces and concatenate once (DataFrame.append was removed in pandas 2.0).
    pieces = []
    for m in [0, 6, 12, 24]:
        for u in [1, 2, 3, 4]:
            pieces.append(pd.DataFrame({
                'prediction_id': full_test_data['visit_id'] + '_updrs_' + str(u) + '_plus_' + str(m) + '_months',
                # Adding 0.0 turns the -0.0 that np.ceil can leave behind into 0.0.
                'rating': full_test_data['predicted_updrs_' + str(u)].clip(lower=0) + 0.0,
            }))

    result = pd.concat(pieces)
    result = result.drop_duplicates(subset=output_columns).reset_index(drop=True)

    return result

In [33]:
# Dry run on the competition's example test files before using the real submission API.
test = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test.csv')

# Loaded for completeness; the active call below does not pass it.
test_proteins = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_proteins.csv')

test_peptides = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_peptides.csv')

# Earlier variants, kept for reference:
#result = get_predictions(model_dict=random_forest_model_dict, df_test=test, df_proteins=test_proteins, df_peptides=test_peptides)
#result = get_predictions_v2(model_dict=ridge_model_dict, df_test=test)
#result = get_predictions(model_dict=random_forest_model_dict, df_test=test, df_proteins=test_proteins)
# Active variant: ridge models fed with visit_month and peptide-derived features only.
result = get_predictions_v3(model_dict=ridge_model_dict, df_test=test, df_peptides=test_peptides)

#result = get_predictions(model_dict=model_dict, sklearn_pipeline=full_pipeline)

# Display the resulting prediction_id / rating table as the cell output.
result

no proteins only peptides
df_test shape and number of unique visit_ids: (16, 6) 4
df_peptides shape and number of unique visit_ids: (2057, 7) 2
full_test_data shape and number of unique visit_ids: (4, 12) 4




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,prediction_id,rating
0,3342_0_updrs_1_plus_0_months,6.0
1,3342_6_updrs_1_plus_0_months,6.0
2,50423_0_updrs_1_plus_0_months,6.0
3,50423_6_updrs_1_plus_0_months,6.0
4,3342_0_updrs_2_plus_0_months,7.0
...,...,...
59,50423_6_updrs_3_plus_24_months,23.0
60,3342_0_updrs_4_plus_24_months,1.0
61,3342_6_updrs_4_plus_24_months,1.0
62,50423_0_updrs_4_plus_24_months,-0.0


In [34]:
# Submission loop driven by the amp_pd_peptide module
# (presumably the competition's time-series API shipped in the Kaggle input — confirm).
import amp_pd_peptide

env = amp_pd_peptide.make_env()   # initialize the environment for one run only

# Disabled snippet (a bare string, not executed) that appears to reset the API state
# so make_env() can be called again in the same kernel.
"""amp_pd_peptide.make_env.__called__ = False
type(env)._state = type(type(env)._state).__dict__['INIT']"""

iter_test = env.iter_test()

# Each iteration yields the test visits, peptide and protein frames plus a submission
# frame. The yielded submission is replaced by our own predictions; test_proteins is
# not used by the active model. Note that `test` and `test_peptides` are rebound here.
for (test, test_peptides, test_proteins, submission) in iter_test:
    #submission = get_predictions_v2(model_dict=ridge_model_dict, df_test=test)
    submission = get_predictions_v3(model_dict=ridge_model_dict, df_test=test, df_peptides=test_peptides)
    #submission = get_predictions(model_dict=decision_model_dict, df_test=test, df_proteins=test_proteins, df_peptides=test_peptides)
    env.predict(submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
no proteins only peptides
df_test shape and number of unique visit_ids: (8, 5) 2
df_peptides shape and number of unique visit_ids: (1021, 6) 1
full_test_data shape and number of unique visit_ids: (2, 12) 2
no proteins only peptides
df_test shape and number of unique visit_ids: (8, 5) 2
df_peptides shape and number of unique visit_ids: (1036, 6) 1
full_test_data shape and number of unique visit_ids: (2, 12) 2




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/