In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
from pandas.plotting import scatter_matrix
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

from statsmodels.tsa.seasonal import seasonal_decompose
import re
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/public_timeseries_testing_util.py
/kaggle/input/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/train_peptides.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/amp_pd_peptide/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/amp-parkinsons-disease-progression-prediction/amp_pd_peptide/__init__.py
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/sample_submission.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_proteins.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_peptides.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test

In [158]:
train_proteins_df = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv')
print('proteins :', train_proteins_df.shape)
print('proteins unique visit_ids:', train_proteins_df.visit_id.nunique())
print(train_proteins_df.columns)

train_peptides_df = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_peptides.csv')
print('')
print('peptides :', train_peptides_df.shape)
print('peptides unique visit_ids:', train_peptides_df.visit_id.nunique())
print(train_peptides_df.columns)

train_clinical_data_df = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv')
print('')
print('clinical :', train_clinical_data_df.shape)
print('clinical unique visit_ids:', train_clinical_data_df.visit_id.nunique())
print(train_clinical_data_df.columns)

supp_clinical_data_df = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv')
print('')
print('supplemental :', supp_clinical_data_df.shape)
print('supplemental unique visit_ids:', supp_clinical_data_df.visit_id.nunique())
print(supp_clinical_data_df.columns)

proteins : (232741, 5)
proteins unique visit_ids: 1113
Index(['visit_id', 'visit_month', 'patient_id', 'UniProt', 'NPX'], dtype='object')

peptides : (981834, 6)
peptides unique visit_ids: 1113
Index(['visit_id', 'visit_month', 'patient_id', 'UniProt', 'Peptide',
       'PeptideAbundance'],
      dtype='object')

clinical : (2615, 8)
clinical unique visit_ids: 2615
Index(['visit_id', 'patient_id', 'visit_month', 'updrs_1', 'updrs_2',
       'updrs_3', 'updrs_4', 'upd23b_clinical_state_on_medication'],
      dtype='object')

supplemental : (2223, 8)
supplemental unique visit_ids: 2223
Index(['visit_id', 'patient_id', 'visit_month', 'updrs_1', 'updrs_2',
       'updrs_3', 'updrs_4', 'upd23b_clinical_state_on_medication'],
      dtype='object')


In [159]:
clinical_data = pd.merge(supp_clinical_data_df, train_clinical_data_df, on=['patient_id','visit_id','visit_month'], how='outer')

for i in [1,2,3,4]:
    clinical_data['updrs_'+str(i)] = np.where(clinical_data['updrs_'+str(i)+'_x'].isna(), clinical_data['updrs_'+str(i)+'_y'], clinical_data['updrs_'+str(i)+'_x'])
    clinical_data.drop(columns=['updrs_'+str(i)+'_x', 'updrs_'+str(i)+'_y'], inplace=True)
    
clinical_data.drop(columns=['upd23b_clinical_state_on_medication_x', 'upd23b_clinical_state_on_medication_y'], inplace=True)

print(clinical_data.shape)
print(clinical_data.visit_id.nunique())

del supp_clinical_data_df, train_clinical_data_df

clinical_data.head(10)

(4838, 7)
4838


Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4
0,35_0,35,0,5.0,3.0,16.0,0.0
1,35_36,35,36,6.0,4.0,20.0,0.0
2,75_0,75,0,4.0,6.0,26.0,0.0
3,75_36,75,36,1.0,8.0,38.0,0.0
4,155_0,155,0,,,0.0,
5,337_0,337,0,5.0,7.0,6.0,0.0
6,337_36,337,36,8.0,7.0,8.0,0.0
7,527_0,527,0,6.0,2.0,9.0,0.0
8,527_36,527,36,2.0,18.0,22.0,0.0
9,557_0,557,0,5.0,6.0,22.0,0.0


In [160]:
pro_pep_join = pd.merge(train_proteins_df, train_peptides_df, on=['patient_id','visit_id','visit_month','UniProt'], how='outer')

print(pro_pep_join.shape)
print(pro_pep_join.visit_id.nunique())

del train_proteins_df, train_peptides_df
pro_pep_join.head(10)

(981834, 7)
1113


Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX,Peptide,PeptideAbundance
0,55_0,0,55,O00391,11254.3,NEQEQPLGQWHLS,11254.3
1,55_0,0,55,O00533,732430.0,GNPEPTFSWTK,102060.0
2,55_0,0,55,O00533,732430.0,IEIPSSVQQVPTIIK,174185.0
3,55_0,0,55,O00533,732430.0,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9
4,55_0,0,55,O00533,732430.0,SMEQNGPGLEYR,30838.7
5,55_0,0,55,O00533,732430.0,TLKIENVSYQDKGNYR,23216.5
6,55_0,0,55,O00533,732430.0,VIAVNEVGR,170878.0
7,55_0,0,55,O00533,732430.0,VMTPAVYAPYDVK,148771.0
8,55_0,0,55,O00533,732430.0,VNGSPVDNHPFAGDVVFPR,55202.1
9,55_0,0,55,O00584,39585.8,ELDLNSVLLK,27229.3


In [161]:
def remove_unimod(peptide_seqn):
    return re.sub('[(UniMod_)0-9]+','',peptide_seqn)

#print(remove_unimod('KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK'))

pro_pep_join['Peptide'] = pro_pep_join['Peptide'].apply(remove_unimod)
#pro_pep_join.head(10)

def amino_acid_count(amino_seqn, amino_acid):
    return amino_seqn.count(amino_acid)

vect_func = np.vectorize(amino_acid_count)

"""for alph in ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']:
    pro_pep_join[alph+'_count'] = vect_func(pro_pep_join['Peptide'], alph)"""

pro_pep_join['T_count'] = vect_func(pro_pep_join['Peptide'], 'T')
pro_pep_join['Y_count'] = vect_func(pro_pep_join['Peptide'], 'Y')
    
#pro_pep_join.head(10)

pro_pep_join_agg = pro_pep_join.groupby(['patient_id','visit_id','visit_month']).agg(T=('T_count', 'sum'),Y=('Y_count', 'sum'))\
.reset_index(level=['patient_id','visit_id','visit_month'])

pro_pep_join_agg.head(10)

Unnamed: 0,patient_id,visit_id,visit_month,T,Y
0,55,55_0,0,783,430
1,55,55_12,12,794,435
2,55,55_36,36,785,435
3,55,55_6,6,783,434
4,942,942_12,12,753,418
5,942,942_24,24,764,419
6,942,942_48,48,763,416
7,942,942_6,6,775,427
8,1517,1517_0,0,772,418
9,1517,1517_24,24,777,424


In [162]:
full_training_data = pd.merge(pro_pep_join_agg, clinical_data, on=['patient_id','visit_id','visit_month'], how='outer')
#full_training_data = full_training_data.drop(columns=['UniProt','Peptide','NPX','PeptideAbundance'])

#full_training_data['NPX_inverse'] = 1/full_training_data['NPX']
#full_training_data['PeptideAbundance_inverse'] = 1/full_training_data['PeptideAbundance']

#full_training_data = full_training_data.drop(columns=['UniProt'])

print(full_training_data.shape)
print(full_training_data.visit_id.nunique())

del pro_pep_join, clinical_data
gc.collect()

full_training_data.head(10)

(4883, 9)
4883


Unnamed: 0,patient_id,visit_id,visit_month,T,Y,updrs_1,updrs_2,updrs_3,updrs_4
0,55,55_0,0,783.0,430.0,10.0,6.0,15.0,
1,55,55_12,12,794.0,435.0,10.0,10.0,41.0,0.0
2,55,55_36,36,785.0,435.0,17.0,18.0,51.0,0.0
3,55,55_6,6,783.0,434.0,8.0,10.0,34.0,
4,942,942_12,12,753.0,418.0,5.0,2.0,25.0,0.0
5,942,942_24,24,764.0,419.0,2.0,3.0,23.0,
6,942,942_48,48,763.0,416.0,2.0,6.0,35.0,0.0
7,942,942_6,6,775.0,427.0,8.0,2.0,21.0,
8,1517,1517_0,0,772.0,418.0,11.0,6.0,25.0,5.0
9,1517,1517_24,24,777.0,424.0,19.0,11.0,28.0,3.0


In [163]:
#columns_to_remove_nan_from = ['updrs_1','updrs_2','updrs_3','updrs_4']
columns_to_remove_nan_from = ['updrs_1','updrs_2','updrs_3','updrs_4','T','Y']

for i in columns_to_remove_nan_from:
    if (i != 'T') & (i != 'Y'):
        full_training_data[i].fillna(full_training_data[i].median().round(decimals = 0), inplace=True)
    else:
        full_training_data[i].fillna(0, inplace=True)

In [164]:
full_training_data = full_training_data.drop_duplicates()

full_training_data_sorted = full_training_data.sort_values(by=['patient_id','visit_month'],  ascending=True)

full_training_data_sorted['visit_month_lag'] = full_training_data_sorted.groupby(['patient_id'])['visit_month'].shift(1)
full_training_data_sorted['visit_month_lag'].fillna(-1, inplace=True)

"""full_training_data_sorted['visit_month_lead'] = full_training_data_sorted.groupby(['patient_id'])['visit_month'].shift(-1)
full_training_data_sorted['visit_month_lead'].fillna(-1, inplace=True)"""

full_training_data_sorted[full_training_data_sorted['patient_id']==55]

Unnamed: 0,patient_id,visit_id,visit_month,T,Y,updrs_1,updrs_2,updrs_3,updrs_4,visit_month_lag
0,55,55_0,0,783.0,430.0,10.0,6.0,15.0,0.0,-1.0
3336,55,55_3,3,0.0,0.0,10.0,7.0,25.0,0.0,0.0
3,55,55_6,6,783.0,434.0,8.0,10.0,34.0,0.0,3.0
3337,55,55_9,9,0.0,0.0,8.0,9.0,30.0,0.0,6.0
1,55,55_12,12,794.0,435.0,10.0,10.0,41.0,0.0,9.0
3338,55,55_18,18,0.0,0.0,7.0,13.0,38.0,0.0,12.0
3339,55,55_24,24,0.0,0.0,16.0,9.0,49.0,0.0,18.0
3340,55,55_30,30,0.0,0.0,14.0,13.0,49.0,0.0,24.0
2,55,55_36,36,785.0,435.0,17.0,18.0,51.0,0.0,30.0
3341,55,55_42,42,0.0,0.0,12.0,20.0,41.0,0.0,36.0


In [165]:
full_training_data_sorted = full_training_data_sorted.reset_index()
full_training_data_sorted = full_training_data_sorted.drop(columns=['patient_id','index'])

del full_training_data
gc.collect()
full_training_data_sorted.head(10)

Unnamed: 0,visit_id,visit_month,T,Y,updrs_1,updrs_2,updrs_3,updrs_4,visit_month_lag
0,35_0,0,0.0,0.0,5.0,3.0,16.0,0.0,-1.0
1,35_36,36,0.0,0.0,6.0,4.0,20.0,0.0,0.0
2,55_0,0,783.0,430.0,10.0,6.0,15.0,0.0,-1.0
3,55_3,3,0.0,0.0,10.0,7.0,25.0,0.0,0.0
4,55_6,6,783.0,434.0,8.0,10.0,34.0,0.0,3.0
5,55_9,9,0.0,0.0,8.0,9.0,30.0,0.0,6.0
6,55_12,12,794.0,435.0,10.0,10.0,41.0,0.0,9.0
7,55_18,18,0.0,0.0,7.0,13.0,38.0,0.0,12.0
8,55_24,24,0.0,0.0,16.0,9.0,49.0,0.0,18.0
9,55_30,30,0.0,0.0,14.0,13.0,49.0,0.0,24.0


In [166]:
full_training_data_sorted[['updrs_1','updrs_2','updrs_3','updrs_4','visit_month','T','Y']].corr()

Unnamed: 0,updrs_1,updrs_2,updrs_3,updrs_4,visit_month,T,Y
updrs_1,1.0,0.630341,0.303248,0.384606,0.156118,0.004171,0.00394
updrs_2,0.630341,1.0,0.594681,0.34265,0.122514,-0.076512,-0.076782
updrs_3,0.303248,0.594681,1.0,0.211337,0.046043,-0.142929,-0.142946
updrs_4,0.384606,0.34265,0.211337,1.0,0.207707,0.033198,0.031424
visit_month,0.156118,0.122514,0.046043,0.207707,1.0,0.08267,0.082685
T,0.004171,-0.076512,-0.142929,0.033198,0.08267,1.0,0.999735
Y,0.00394,-0.076782,-0.142946,0.031424,0.082685,0.999735,1.0


In [167]:
full_training_data_sorted.isna().sum()

visit_id           0
visit_month        0
T                  0
Y                  0
updrs_1            0
updrs_2            0
updrs_3            0
updrs_4            0
visit_month_lag    0
dtype: int64

#### First train to predict UPDRS_1

In [168]:
#random_forest_model_dict = {}
#poly_model_dict = {}
#model_dict = {}
ridge_model_dict = {}
list_of_updrs_tests = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']

temp = full_training_data_sorted.dropna(subset=['updrs_1'])
#X = temp[['visit_month','visit_month_lag']]
X = temp[['visit_month']]
print(X.shape)
y = temp['updrs_1']
print(y.shape)
"""poly = PolynomialFeatures(degree = 5)
X_poly = poly.fit_transform(X)
poly.fit(X_poly, y)
print(X_poly.shape)
trained = LinearRegression().fit(X_poly, y)"""
#trained = LinearRegression().fit(X, y)
trained = Ridge(alpha=1.01).fit(X, y)
#trained = RandomForestRegressor().fit(X, y)
#random_forest_model_dict['updrs_1'] = trained
#poly_model_dict['updrs_1'] = trained
#model_dict['updrs_1'] = trained
ridge_model_dict['updrs_1'] = trained

"""model_dict = {}

for u in list_of_updrs_tests:
        
    # Drop NAs
    temp = full_training_data_sorted.dropna(subset=[u]) 
    print(u)
    # Train data
    X = temp[['visit_month','visit_month_lag']]
    #X = temp[['visit_month','NPX_standardised','Pep_standardised']]
    print(X.shape)
    y = temp[u]
    print(y.shape)
    trained = LinearRegression().fit(X, y)
    
    # Save model
    model_dict[u] = trained"""

"""poly_model_dict = {}
list_of_updrs_tests = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']

for u in list_of_updrs_tests:
        
    # Drop NAs
    temp = full_training_data_sorted.dropna(subset=[u]) 
    print(u)
    # Train data
    X = temp[['visit_month','visit_month_lag']]
    #X = temp[['visit_month','NPX_standardised','Pep_standardised']]
    print(X.shape)
    y = temp[u]
    print(y.shape)
    poly = PolynomialFeatures(degree = 5)
    X_poly = poly.fit_transform(X)
    poly.fit(X_poly, y)
    print(X_poly.shape)
    trained = LinearRegression().fit(X_poly, y)
    
    # Save model
    poly_model_dict[u] = trained"""

"""for u in list_of_updrs_tests:
        
    # Drop NAs
    temp = full_training_data_sorted.dropna(subset=[u]) 
    print(u)
    # Train data
    X = temp[['visit_month','visit_month_lag']]
    print(X.shape)
    y = temp[u]
    print(y.shape)
    trained = RandomForestRegressor().fit(X, y)
    
    # Save model
    random_forest_model_dict[u] = trained"""

(4883, 1)
(4883,)


"for u in list_of_updrs_tests:\n        \n    # Drop NAs\n    temp = full_training_data_sorted.dropna(subset=[u]) \n    print(u)\n    # Train data\n    X = temp[['visit_month','visit_month_lag']]\n    print(X.shape)\n    y = temp[u]\n    print(y.shape)\n    trained = RandomForestRegressor().fit(X, y)\n    \n    # Save model\n    random_forest_model_dict[u] = trained"

In [169]:
temp['predicted_updrs_1'] = np.ceil(ridge_model_dict['updrs_1'].predict(X))

In [170]:
#temp = temp[temp['updrs_2'] != 0]
X = temp[['visit_month','predicted_updrs_1']]
print(X.shape)
y = temp['updrs_2']
print(y.shape)
"""X_poly = poly.fit_transform(X)
poly.fit(X_poly, y)
print(X_poly.shape)
trained = LinearRegression().fit(X_poly, y)"""
trained = Ridge(alpha=1.01).fit(X, y)
#trained = LinearRegression().fit(X, y)
#trained = RandomForestRegressor().fit(X, y)
#random_forest_model_dict['updrs_2'] = trained
#poly_model_dict['updrs_2'] = trained
#model_dict['updrs_2'] = trained
ridge_model_dict['updrs_2'] = trained

(4883, 2)
(4883,)


In [171]:
temp['predicted_updrs_2'] = np.ceil(ridge_model_dict['updrs_2'].predict(X))

In [172]:
temp = temp[temp['updrs_3'] != 0]
X = temp[['predicted_updrs_1','predicted_updrs_2','T','Y']]
print(X.shape)
y = temp['updrs_3']
print(y.shape)
"""X_poly = poly.fit_transform(X)
poly.fit(X_poly, y)
print(X_poly.shape)
trained = LinearRegression().fit(X_poly, y)"""
trained = Ridge(alpha=1.01).fit(X, y)
#trained = LinearRegression().fit(X, y)
#trained = RandomForestRegressor().fit(X, y)
#random_forest_model_dict['updrs_3'] = trained
#poly_model_dict['updrs_3'] = trained
#model_dict['updrs_3'] = trained
ridge_model_dict['updrs_3'] = trained

(4517, 4)
(4517,)


In [173]:
temp['predicted_updrs_3'] = np.ceil(ridge_model_dict['updrs_3'].predict(X))

In [174]:
X = temp[['visit_month','predicted_updrs_1','predicted_updrs_2','predicted_updrs_3']]
print(X.shape)
y = temp['updrs_4']
print(y.shape)
"""X_poly = poly.fit_transform(X)
poly.fit(X_poly, y)
print(X_poly.shape)
trained = LinearRegression().fit(X_poly, y)"""
trained = Ridge(alpha=1.01).fit(X, y)
#trained = LinearRegression().fit(X, y)
#trained = RandomForestRegressor().fit(X, y)
#random_forest_model_dict['updrs_4'] = trained
#poly_model_dict['updrs_4'] = trained
#model_dict['updrs_4'] = trained
ridge_model_dict['updrs_4'] = trained

(4517, 4)
(4517,)


In [175]:
temp['predicted_updrs_4'] = np.ceil(ridge_model_dict['updrs_4'].predict(X))

In [93]:
def smape(y_true, y_pred):
    smap = np.zeros(len(y_true))

    num = np.abs(y_true - y_pred)
    dem = ((np.abs(y_true) + np.abs(y_pred)) / 2)

    pos_ind = dem != 0
    smap[pos_ind] = num[pos_ind] / dem[pos_ind]

    return 100 * np.mean(smap)

In [176]:
print(full_training_data_sorted.shape)
print(temp.shape)

(4883, 9)
(4517, 13)


In [177]:
for u in list_of_updrs_tests:
    
    #y_true = full_training_data_sorted[u]
    y_true = temp[u]
    y_pred = temp['predicted_'+u]
    
    print('SMAPE + 1 for', u, ':', smape(np.array(y_true), np.array(y_pred)))
    
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    print('RMSE ', rmse)

SMAPE + 1 for updrs_1 : 63.15451980069244
RMSE  4.963247703045738
SMAPE + 1 for updrs_2 : 67.78967003633409
RMSE  5.504521457450853
SMAPE + 1 for updrs_3 : 50.00025090862883
RMSE  12.904478682559386
SMAPE + 1 for updrs_4 : 176.52874336592058
RMSE  2.1863079471273825


In [None]:
"""for u in list_of_updrs_tests:
    
    X = full_training_data_sorted[['visit_month','visit_month_lag']]
    #X = full_training_data[['visit_month','NPX_standardised','Pep_standardised']]

    y_true = full_training_data_sorted[u]
    y_pred = np.ceil(model_dict[u].predict(X))
    
    print('Linear regression SMAPE + 1 for', u, ':', smape(np.array(y_true), np.array(y_pred)))
    
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    print('RMSE with Polynomial regression model', rmse)"""

In [146]:
def get_full_test_data(df_test, df_proteins=pd.DataFrame(), df_peptides=pd.DataFrame()):
    
    if  (df_proteins.shape[0] == 0) & (df_peptides.shape[0] == 0):
        
        print('only the test dataframe has data, proteins and peptides info absent')
        
        full_test_data = df_test[['patient_id','visit_id','visit_month']]
        full_test_data = full_test_data.drop_duplicates()
        full_test_data_sorted = full_test_data.sort_values(by=['patient_id','visit_month'],  ascending=True)
        full_test_data_sorted['visit_month_lag'] = full_test_data_sorted.groupby(['patient_id'])['visit_month'].shift(1)
        full_test_data_sorted['visit_month_lag'].fillna(-1, inplace=True)
        
        full_test_data_sorted = full_test_data_sorted.drop_duplicates()
        full_test_data_sorted = full_test_data_sorted[['visit_id','visit_month','visit_month_lag']]

        full_test_data_sorted = full_test_data_sorted.reset_index()
        full_test_data_sorted = full_test_data_sorted.drop(columns=['index'])
        
        return full_test_data_sorted
    
    elif (df_proteins.shape[0] == 0) & (df_peptides.shape[0] != 0):
        
        print('no proteins only peptides')
        print('df_test shape and number of unique visit_ids:', df_test.shape, df_test.visit_id.nunique())
        print('df_peptides shape and number of unique visit_ids:', df_peptides.shape, df_peptides.visit_id.nunique())
        df_test = df_test[['patient_id','visit_id','visit_month']]
        df_test = df_test.drop_duplicates()
        
        df_peptides['Peptide'] = df_peptides['Peptide'].apply(remove_unimod)
        df_peptides['T_count'] = vect_func(df_peptides['Peptide'], 'T')
    
        #pro_pep_join.head(10)

        df_peptides_agg = df_peptides.groupby(['patient_id','visit_id','visit_month']).agg(T=('T_count', 'sum'))\
        .reset_index(level=['patient_id','visit_id','visit_month'])
        
        df_peptides = pd.merge(df_test, df_peptides_agg, on=['patient_id','visit_id','visit_month'], suffixes=['_left','_right'], how='left')
        df_peptides = df_peptides[['patient_id','visit_id','visit_month','T']]
        df_peptides['T'].fillna(0, inplace=True)
        df_peptides = df_peptides.drop_duplicates()
        
        df_peptides_sorted = df_peptides.sort_values(by=['patient_id','visit_month'],  ascending=True)
        df_peptides_sorted['visit_month_lag'] = df_peptides_sorted.groupby(['patient_id'])['visit_month'].shift(1)
        df_peptides_sorted['visit_month_lag'].fillna(-1, inplace=True)

        """
        full_test_data = pd.DataFrame()
        for i in [0,6,12,24]:
            temp = df_peptides[df_peptides['visit_month']==i]
            full_test_data = full_test_data.append(temp)
        
        del df_proteins, df_peptides
        """
        full_test_data = df_peptides_sorted
        del df_peptides_sorted
        print('full_test_data shape and number of unique visit_ids:', full_test_data.shape, full_test_data.visit_id.nunique())
        
        full_test_data = full_test_data.drop_duplicates(keep='first')
        full_test_data = full_test_data[['visit_id','visit_month', 'visit_month_lag','T']]

        full_test_data = full_test_data.reset_index()
        full_test_data = full_test_data.drop(columns=['index'])
        
        return full_test_data
            
    elif (df_peptides.shape[0] == 0) & (df_proteins.shape[0] != 0):
        
        print('no peptides only proteins')
        print('df_test shape and number of unique visit_ids:', df_test.shape, df_test.visit_id.nunique())
        print('df_proteins shape and number of unique visit_ids:', df_proteins.shape, df_proteins.visit_id.nunique())
        df_test = df_test[['patient_id','visit_id','visit_month']]
        df_test = df_test.drop_duplicates()
        
        df_proteins = pd.merge(df_test, df_proteins, on=['patient_id','visit_id','visit_month'], suffixes=['_left','_right'], how='left')
        df_proteins = df_proteins[['patient_id','visit_id','visit_month']]
        df_proteins = df_proteins.drop_duplicates()
        
        df_proteins_sorted = df_proteins.sort_values(by=['patient_id','visit_month'],  ascending=True)
        df_proteins_sorted['visit_month_lag'] = df_proteins_sorted.groupby(['patient_id'])['visit_month'].shift(1)
        df_proteins_sorted['visit_month_lag'].fillna(-1, inplace=True)

        """
        full_test_data = pd.DataFrame()
        for i in [0,6,12,24]:
            temp = df_proteins[df_proteins['visit_month']==i]
            full_test_data = full_test_data.append(temp)
        """
        full_test_data = df_proteins_sorted
        print('full_test_data shape and number of unique visit_ids:', full_test_data.shape, full_test_data.visit_id.nunique())
        
        full_test_data = full_test_data.drop_duplicates(keep='first')
        full_test_data = full_test_data[['visit_id','visit_month', 'visit_month_lag']]

        full_test_data = full_test_data.reset_index()
        full_test_data = full_test_data.drop(columns=['index'])
        
        return full_test_data
        
    else:
        
        print('both proteins and peptides are present')
        
        df_test = df_test[['patient_id','visit_id','visit_month']]
        df_test = df_test.drop_duplicates()
        
        pro_pep_join = pd.merge(df_proteins, df_peptides, on=['patient_id','visit_id','visit_month','UniProt'], suffixes=['_left','_right'], how='outer')
        pro_pep_join = pro_pep_join[['patient_id','visit_id','visit_month']]
        pro_pep_join = pro_pep_join.drop_duplicates()
        #del df_proteins, df_peptides
        print('df_proteins shape and number of unique visit_ids:', df_proteins.shape, df_proteins.visit_id.nunique())
        print('df_peptides shape and number of unique visit_ids:', df_peptides.shape, df_peptides.visit_id.nunique())
        print('pro_pep_join shape and number of unique visit_ids:', pro_pep_join.shape, pro_pep_join.visit_id.nunique())
        
        pro_pep_join_test = pd.merge(df_test, pro_pep_join, on=['patient_id','visit_id','visit_month'], suffixes=['_left','_right'], how='left')
        
        pro_pep_join_test_sorted = pro_pep_join_test.sort_values(by=['patient_id','visit_month'],  ascending=True)
        pro_pep_join_test_sorted['visit_month_lag'] = pro_pep_join_test_sorted.groupby(['patient_id'])['visit_month'].shift(1)
        pro_pep_join_test_sorted['visit_month_lag'].fillna(-1, inplace=True)

        """
        full_test_data = pd.DataFrame()

        for i in [0,6,12,24]:
            temp = pro_pep_join_test[pro_pep_join_test['visit_month']==i]
            full_test_data = full_test_data.append(temp)
        """
        full_test_data = pro_pep_join_test_sorted 
        print('full_test_data shape and number of unique visit_ids:', full_test_data.shape, full_test_data.visit_id.nunique())

        full_test_data = full_test_data.drop_duplicates()
        full_test_data = full_test_data[['visit_id','visit_month', 'visit_month_lag']]

        full_test_data = full_test_data.reset_index()
        full_test_data = full_test_data.drop(columns=['index'])
        
        return full_test_data

In [139]:
def get_predictions_v2(model_dict, df_test, df_proteins=pd.DataFrame(), df_peptides=pd.DataFrame()):
    
    list_of_updrs_tests = ['updrs_1','updrs_2','updrs_3','updrs_4']
    result = pd.DataFrame()
    
    full_test_data = get_full_test_data(df_test=df_test, df_proteins=df_proteins, df_peptides=df_peptides)
    
    if full_test_data.shape[0] != 0:
        """df_prepared = sklearn_pipeline.transform(full_test_data)
        print(df_prepared.shape)"""

        ##for u in list_of_updrs_tests:
        X = full_test_data[['visit_month']]
        
        """poly = PolynomialFeatures(degree = 5)
        X_poly = poly.fit_transform(X)
        full_test_data['predicted_updrs_1'] = np.ceil(model_dict['updrs_1'].predict(X_poly))"""
        
        ## for all other models
        ##full_test_data['result_' + str(u)] = np.ceil(model_dict[u].predict(X))
        
        full_test_data['predicted_updrs_1'] = np.ceil(model_dict['updrs_1'].predict(X))
        
        X = full_test_data[['visit_month','predicted_updrs_1']]
        #X_poly = poly.fit_transform(X)
        full_test_data['predicted_updrs_2'] = np.ceil(model_dict['updrs_2'].predict(X))
        #full_test_data['predicted_updrs_2'] = np.ceil(model_dict['updrs_2'].predict(X_poly))
        
        X = full_test_data[['predicted_updrs_1','predicted_updrs_2','T']]
        #X_poly = poly.fit_transform(X)
        full_test_data['predicted_updrs_3'] = np.ceil(model_dict['updrs_3'].predict(X))
        #full_test_data['predicted_updrs_3'] = np.ceil(model_dict['updrs_3'].predict(X_poly))
        
        X = full_test_data[['visit_month','predicted_updrs_1','predicted_updrs_2','predicted_updrs_3']]
        #X_poly = poly.fit_transform(X)
        full_test_data['predicted_updrs_4'] = np.ceil(model_dict['updrs_4'].predict(X))
        #full_test_data['predicted_updrs_4'] = np.ceil(model_dict['updrs_4'].predict(X_poly))

        for m in [0, 6, 12, 24]:
            for u in [1, 2, 3, 4]:
                temp = full_test_data[['visit_id', 'visit_month', 'predicted_updrs_' + str(u)]]
                temp['prediction_id'] = temp['visit_id'] + '_updrs_' + str(u) + '_plus_' + str(m) + '_months'
                
                #if (u == 2) | (u == 4):
                if (u == 4):
                    temp["rating"] = 0
                else:
                    temp["rating"] = temp["predicted_updrs_" + str(u)].apply(lambda x: 0 if x < 0 else x)
                
                temp = temp[['prediction_id', 'rating']]

                result = result.append(temp)
                
        result = result.drop_duplicates(subset=['prediction_id', 'rating'])
        result = result.reset_index()
        result.drop(columns=['index'], inplace=True)
    
    return result

In [None]:
def get_predictions(model_dict, df_test, df_proteins=pd.DataFrame(), df_peptides=pd.DataFrame()):
    
    list_of_updrs_tests = ['updrs_1','updrs_2','updrs_3','updrs_4']
    result = pd.DataFrame()
    
    full_test_data = get_full_test_data(df_test=df_test, df_proteins=df_proteins, df_peptides=df_peptides)
    
    if full_test_data.shape[0] != 0:
        """df_prepared = sklearn_pipeline.transform(full_test_data)
        print(df_prepared.shape)"""

        for u in list_of_updrs_tests:
            X = full_test_data[['visit_month','visit_month_lag']]
            #X = full_test_data[['visit_month','NPX_standardised','Pep_standardised']]
            ## for poly model
            """poly = PolynomialFeatures(degree = 5)
            X_poly = poly.fit_transform(X)
            full_test_data['result_' + str(u)] = np.ceil(model_dict[u].predict(X_poly))"""
            #full_test_data['result_' + str(u)] = pd.DataFrame(model_dict[u].predict(df_prepared),columns=['rating']).round(decimals=0)
            ## for all other models
            full_test_data['result_' + str(u)] = np.ceil(model_dict[u].predict(X))

        for m in [0, 6, 12, 24]:
            for u in [1, 2, 3, 4]:
                temp = full_test_data[['visit_id', 'visit_month', 'result_updrs_' + str(u)]]
                temp['prediction_id'] = temp['visit_id'] + '_updrs_' + str(u) + '_plus_' + str(m) + '_months'
                
                #if (u == 2) | (u == 4):
                if (u == 4):
                    temp["rating"] = 0
                else:
                    temp["rating"] = temp["result_updrs_" + str(u)].apply(lambda x: 0 if x < 0 else x)
                
                #temp = temp[['visit_id', 'visit_month', 'prediction_id', 'rating']]
                temp = temp[['prediction_id', 'rating']]

                result = result.append(temp)

        #result.sort_values(by=['visit_id', 'visit_month'], inplace=True)
        #result = result[['prediction_id', 'rating']]
        #result['prediction_id'] = result['prediction_id'].astype('string') 
        #result['rating'] = result['rating'].astype('int') 
        result = result.drop_duplicates(subset=['prediction_id', 'rating'])
        result = result.reset_index()
        result.drop(columns=['index'], inplace=True)
    
    return result

In [147]:
test = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test.csv')

test_proteins = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_proteins.csv')

test_peptides = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_peptides.csv')

#result = get_predictions(model_dict=random_forest_model_dict, df_test=test, df_proteins=test_proteins, df_peptides=test_peptides)
#result = get_predictions_v2(model_dict=ridge_model_dict, df_test=test)
#result = get_predictions(model_dict=random_forest_model_dict, df_test=test, df_proteins=test_proteins)
result = get_predictions_v2(model_dict=ridge_model_dict, df_test=test, df_peptides=test_peptides)

#result = get_predictions(model_dict=model_dict, sklearn_pipeline=full_pipeline)

result

no proteins only peptides
df_test shape and number of unique visit_ids: (16, 6) 4
df_peptides shape and number of unique visit_ids: (2057, 7) 2
full_test_data shape and number of unique visit_ids: (4, 5) 4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,prediction_id,rating
0,3342_0_updrs_1_plus_0_months,6.0
1,3342_6_updrs_1_plus_0_months,6.0
2,50423_0_updrs_1_plus_0_months,6.0
3,50423_6_updrs_1_plus_0_months,6.0
4,3342_0_updrs_2_plus_0_months,6.0
...,...,...
59,50423_6_updrs_3_plus_24_months,24.0
60,3342_0_updrs_4_plus_24_months,0.0
61,3342_6_updrs_4_plus_24_months,0.0
62,50423_0_updrs_4_plus_24_months,0.0


In [149]:
import amp_pd_peptide

env = amp_pd_peptide.make_env()   # initialize the environment for one run only

"""amp_pd_peptide.make_env.__called__ = False
type(env)._state = type(type(env)._state).__dict__['INIT']"""

iter_test = env.iter_test()

for (test, test_peptides, test_proteins, submission) in iter_test:
    #submission = get_predictions_v2(model_dict=ridge_model_dict, df_test=test)
    submission = get_predictions_v2(model_dict=ridge_model_dict, df_test=test, df_peptides=test_peptides)
    #submission = get_predictions(model_dict=decision_model_dict, df_test=test, df_proteins=test_proteins, df_peptides=test_peptides)
    env.predict(submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
no proteins only peptides
df_test shape and number of unique visit_ids: (8, 5) 2
df_peptides shape and number of unique visit_ids: (1021, 6) 1
full_test_data shape and number of unique visit_ids: (2, 5) 2
no proteins only peptides
df_test shape and number of unique visit_ids: (8, 5) 2
df_peptides shape and number of unique visit_ids: (1036, 6) 1
full_test_data shape and number of unique visit_ids: (2, 5) 2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [None]:
"""submission = pd.read_csv('/kaggle/working/submission.csv')
submission.shape"""