In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
from pandas.plotting import scatter_matrix
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

from statsmodels.tsa.seasonal import seasonal_decompose

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/public_timeseries_testing_util.py
/kaggle/input/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/train_peptides.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/amp_pd_peptide/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/amp-parkinsons-disease-progression-prediction/amp_pd_peptide/__init__.py
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/sample_submission.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_proteins.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_peptides.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test

In [2]:
train_proteins_df = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv')
print('proteins :', train_proteins_df.shape)
print('proteins unique visit_ids:', train_proteins_df.visit_id.nunique())
print(train_proteins_df.columns)

train_peptides_df = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_peptides.csv')
print('')
print('peptides :', train_peptides_df.shape)
print('peptides unique visit_ids:', train_peptides_df.visit_id.nunique())
print(train_peptides_df.columns)

train_clinical_data_df = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv')
print('')
print('clinical :', train_clinical_data_df.shape)
print('clinical unique visit_ids:', train_clinical_data_df.visit_id.nunique())
print(train_clinical_data_df.columns)

supp_clinical_data_df = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv')
print('')
print('supplemental :', supp_clinical_data_df.shape)
print('supplemental unique visit_ids:', supp_clinical_data_df.visit_id.nunique())
print(supp_clinical_data_df.columns)

proteins : (232741, 5)
proteins unique visit_ids: 1113
Index(['visit_id', 'visit_month', 'patient_id', 'UniProt', 'NPX'], dtype='object')

peptides : (981834, 6)
peptides unique visit_ids: 1113
Index(['visit_id', 'visit_month', 'patient_id', 'UniProt', 'Peptide',
       'PeptideAbundance'],
      dtype='object')

clinical : (2615, 8)
clinical unique visit_ids: 2615
Index(['visit_id', 'patient_id', 'visit_month', 'updrs_1', 'updrs_2',
       'updrs_3', 'updrs_4', 'upd23b_clinical_state_on_medication'],
      dtype='object')

supplemental : (2223, 8)
supplemental unique visit_ids: 2223
Index(['visit_id', 'patient_id', 'visit_month', 'updrs_1', 'updrs_2',
       'updrs_3', 'updrs_4', 'upd23b_clinical_state_on_medication'],
      dtype='object')


In [3]:
clinical_data = pd.merge(supp_clinical_data_df, train_clinical_data_df, on=['patient_id','visit_id','visit_month'], how='outer')

for i in [1,2,3,4]:
    clinical_data['updrs_'+str(i)] = np.where(clinical_data['updrs_'+str(i)+'_x'].isna(), clinical_data['updrs_'+str(i)+'_y'], clinical_data['updrs_'+str(i)+'_x'])
    clinical_data.drop(columns=['updrs_'+str(i)+'_x', 'updrs_'+str(i)+'_y'], inplace=True)
    
clinical_data.drop(columns=['upd23b_clinical_state_on_medication_x', 'upd23b_clinical_state_on_medication_y'], inplace=True)

print(clinical_data.shape)
print(clinical_data.visit_id.nunique())

del supp_clinical_data_df, train_clinical_data_df

clinical_data.head(10)

(4838, 7)
4838


Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4
0,35_0,35,0,5.0,3.0,16.0,0.0
1,35_36,35,36,6.0,4.0,20.0,0.0
2,75_0,75,0,4.0,6.0,26.0,0.0
3,75_36,75,36,1.0,8.0,38.0,0.0
4,155_0,155,0,,,0.0,
5,337_0,337,0,5.0,7.0,6.0,0.0
6,337_36,337,36,8.0,7.0,8.0,0.0
7,527_0,527,0,6.0,2.0,9.0,0.0
8,527_36,527,36,2.0,18.0,22.0,0.0
9,557_0,557,0,5.0,6.0,22.0,0.0


In [4]:
pro_pep_join = pd.merge(train_proteins_df, train_peptides_df, on=['patient_id','visit_id','visit_month','UniProt'], how='outer')

print(pro_pep_join.shape)
print(pro_pep_join.visit_id.nunique())

del train_proteins_df, train_peptides_df
pro_pep_join.head(10)

(981834, 7)
1113


Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX,Peptide,PeptideAbundance
0,55_0,0,55,O00391,11254.3,NEQEQPLGQWHLS,11254.3
1,55_0,0,55,O00533,732430.0,GNPEPTFSWTK,102060.0
2,55_0,0,55,O00533,732430.0,IEIPSSVQQVPTIIK,174185.0
3,55_0,0,55,O00533,732430.0,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9
4,55_0,0,55,O00533,732430.0,SMEQNGPGLEYR,30838.7
5,55_0,0,55,O00533,732430.0,TLKIENVSYQDKGNYR,23216.5
6,55_0,0,55,O00533,732430.0,VIAVNEVGR,170878.0
7,55_0,0,55,O00533,732430.0,VMTPAVYAPYDVK,148771.0
8,55_0,0,55,O00533,732430.0,VNGSPVDNHPFAGDVVFPR,55202.1
9,55_0,0,55,O00584,39585.8,ELDLNSVLLK,27229.3


In [5]:
full_training_data = pd.merge(pro_pep_join, clinical_data, on=['patient_id','visit_id','visit_month'], how='outer')
full_training_data = full_training_data.drop(columns=['UniProt','Peptide','NPX','PeptideAbundance'])

print(full_training_data.shape)
print(full_training_data.visit_id.nunique())

del pro_pep_join, clinical_data
gc.collect()

full_training_data.head(10)

(985604, 7)
4883


Unnamed: 0,visit_id,visit_month,patient_id,updrs_1,updrs_2,updrs_3,updrs_4
0,55_0,0,55,10.0,6.0,15.0,
1,55_0,0,55,10.0,6.0,15.0,
2,55_0,0,55,10.0,6.0,15.0,
3,55_0,0,55,10.0,6.0,15.0,
4,55_0,0,55,10.0,6.0,15.0,
5,55_0,0,55,10.0,6.0,15.0,
6,55_0,0,55,10.0,6.0,15.0,
7,55_0,0,55,10.0,6.0,15.0,
8,55_0,0,55,10.0,6.0,15.0,
9,55_0,0,55,10.0,6.0,15.0,


In [6]:
columns_to_remove_nan_from = ['updrs_1','updrs_2','updrs_3','updrs_4']

for i in columns_to_remove_nan_from:
    full_training_data[i].fillna(full_training_data[i].median().round(decimals = 0), inplace=True)

In [7]:
full_training_data = full_training_data.drop_duplicates()

full_training_data_sorted = full_training_data.sort_values(by=['patient_id','visit_month'],  ascending=True)

full_training_data_sorted['visit_month_lag'] = full_training_data_sorted.groupby(['patient_id'])['visit_month'].shift(1)
full_training_data_sorted['visit_month_lag'].fillna(-1, inplace=True)

"""full_training_data_sorted['visit_month_lead'] = full_training_data_sorted.groupby(['patient_id'])['visit_month'].shift(-1)
full_training_data_sorted['visit_month_lead'].fillna(-1, inplace=True)"""

full_training_data_sorted[full_training_data_sorted['patient_id']==55]

Unnamed: 0,visit_id,visit_month,patient_id,updrs_1,updrs_2,updrs_3,updrs_4,visit_month_lag
0,55_0,0,55,10.0,6.0,15.0,0.0,-1.0
984057,55_3,3,55,10.0,7.0,25.0,0.0,0.0
173790,55_6,6,55,8.0,10.0,34.0,0.0,3.0
984058,55_9,9,55,8.0,9.0,30.0,0.0,6.0
284523,55_12,12,55,10.0,10.0,41.0,0.0,9.0
984059,55_18,18,55,7.0,13.0,38.0,0.0,12.0
984060,55_24,24,55,16.0,9.0,49.0,0.0,18.0
984061,55_30,30,55,14.0,13.0,49.0,0.0,24.0
602408,55_36,36,55,17.0,18.0,51.0,0.0,30.0
984062,55_42,42,55,12.0,20.0,41.0,0.0,36.0


In [8]:
full_training_data_sorted = full_training_data_sorted.reset_index()
full_training_data_sorted = full_training_data_sorted.drop(columns=['patient_id','index'])

del full_training_data
gc.collect()
full_training_data_sorted.head(10)

Unnamed: 0,visit_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,visit_month_lag
0,35_0,0,5.0,3.0,16.0,0.0,-1.0
1,35_36,36,6.0,4.0,20.0,0.0,0.0
2,55_0,0,10.0,6.0,15.0,0.0,-1.0
3,55_3,3,10.0,7.0,25.0,0.0,0.0
4,55_6,6,8.0,10.0,34.0,0.0,3.0
5,55_9,9,8.0,9.0,30.0,0.0,6.0
6,55_12,12,10.0,10.0,41.0,0.0,9.0
7,55_18,18,7.0,13.0,38.0,0.0,12.0
8,55_24,24,16.0,9.0,49.0,0.0,18.0
9,55_30,30,14.0,13.0,49.0,0.0,24.0


In [9]:
full_training_data_sorted[['updrs_1','updrs_2','updrs_3','updrs_4','visit_month','visit_month_lag']].corr()

Unnamed: 0,updrs_1,updrs_2,updrs_3,updrs_4,visit_month,visit_month_lag
updrs_1,1.0,0.630939,0.303645,0.384606,0.156118,0.167924
updrs_2,0.630939,1.0,0.592396,0.344035,0.129817,0.142221
updrs_3,0.303645,0.592396,1.0,0.212245,0.04592,0.060545
updrs_4,0.384606,0.344035,0.212245,1.0,0.207707,0.225779
visit_month,0.156118,0.129817,0.04592,0.207707,1.0,0.93888
visit_month_lag,0.167924,0.142221,0.060545,0.225779,0.93888,1.0


In [10]:
full_training_data_sorted.isna().sum()

visit_id           0
visit_month        0
updrs_1            0
updrs_2            0
updrs_3            0
updrs_4            0
visit_month_lag    0
dtype: int64

#### First train to predict UPDRS_1

In [25]:
random_forest_model_dict = {}
list_of_updrs_tests = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']

temp = full_training_data_sorted.dropna(subset=['updrs_1'])
#X = temp[['visit_month','visit_month_lag']]
X = temp[['visit_month']]
print(X.shape)
y = temp['updrs_1']
print(y.shape)
trained = RandomForestRegressor().fit(X, y)
random_forest_model_dict['updrs_1'] = trained

"""for u in list_of_updrs_tests:
        
    # Drop NAs
    temp = full_training_data_sorted.dropna(subset=[u]) 
    print(u)
    # Train data
    X = temp[['visit_month','visit_month_lag']]
    print(X.shape)
    y = temp[u]
    print(y.shape)
    trained = RandomForestRegressor().fit(X, y)
    
    # Save model
    random_forest_model_dict[u] = trained"""

(4883, 1)
(4883,)


"for u in list_of_updrs_tests:\n        \n    # Drop NAs\n    temp = full_training_data_sorted.dropna(subset=[u]) \n    print(u)\n    # Train data\n    X = temp[['visit_month','visit_month_lag']]\n    print(X.shape)\n    y = temp[u]\n    print(y.shape)\n    trained = RandomForestRegressor().fit(X, y)\n    \n    # Save model\n    random_forest_model_dict[u] = trained"

In [26]:
temp['predicted_updrs_1'] = np.ceil(random_forest_model_dict['updrs_1'].predict(X))

In [27]:
X = temp[['visit_month','predicted_updrs_1']]
print(X.shape)
y = temp['updrs_2']
print(y.shape)
trained = RandomForestRegressor().fit(X, y)
random_forest_model_dict['updrs_2'] = trained

(4883, 2)
(4883,)


In [28]:
temp['predicted_updrs_2'] = np.ceil(random_forest_model_dict['updrs_2'].predict(X))

In [29]:
X = temp[['visit_month','predicted_updrs_1','predicted_updrs_2']]
print(X.shape)
y = temp['updrs_3']
print(y.shape)
trained = RandomForestRegressor().fit(X, y)
random_forest_model_dict['updrs_3'] = trained

(4883, 3)
(4883,)


In [30]:
temp['predicted_updrs_3'] = np.ceil(random_forest_model_dict['updrs_3'].predict(X))

In [31]:
X = temp[['visit_month','predicted_updrs_1','predicted_updrs_2','predicted_updrs_3']]
print(X.shape)
y = temp['updrs_4']
print(y.shape)
trained = RandomForestRegressor().fit(X, y)
random_forest_model_dict['updrs_4'] = trained

(4883, 4)
(4883,)


In [32]:
temp['predicted_updrs_4'] = np.ceil(random_forest_model_dict['updrs_4'].predict(X))

In [22]:
def smape(y_true, y_pred):
    smap = np.zeros(len(y_true))

    num = np.abs(y_true - y_pred)
    dem = ((np.abs(y_true) + np.abs(y_pred)) / 2)

    pos_ind = dem != 0
    smap[pos_ind] = num[pos_ind] / dem[pos_ind]

    return 100 * np.mean(smap)

In [33]:
print(full_training_data_sorted.shape)
print(temp.shape)

(4883, 7)
(4883, 11)


In [34]:
for u in list_of_updrs_tests:
    
    y_true = full_training_data_sorted[u]
    y_pred = temp['predicted_'+u]
    
    print('Random Forest regression SMAPE + 1 for', u, ':', smape(np.array(y_true), np.array(y_pred)))
    
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    print('RMSE with Random Forest Regressor model', rmse)

Random Forest regression SMAPE + 1 for updrs_1 : 64.35594291779165
RMSE with Random Forest Regressor model 4.931249640672847
Random Forest regression SMAPE + 1 for updrs_2 : 75.47805916746475
RMSE with Random Forest Regressor model 5.579797384230739
Random Forest regression SMAPE + 1 for updrs_3 : 61.36995324311867
RMSE with Random Forest Regressor model 13.451332929010025
Random Forest regression SMAPE + 1 for updrs_4 : 178.56096393169793
RMSE with Random Forest Regressor model 2.1349610231924356


In [None]:
"""for u in list_of_updrs_tests:
    
    X = full_training_data_sorted[['visit_month','visit_month_lag']]

    y_true = full_training_data_sorted[u]
    y_pred = np.ceil(random_forest_model_dict[u].predict(X))
    
    print('Random Forest regression SMAPE + 1 for', u, ':', smape(np.array(y_true), np.array(y_pred)))
    
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    print('RMSE with Random Forest Regressor model', rmse)"""

In [None]:
"""poly_model_dict = {}
list_of_updrs_tests = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']

for u in list_of_updrs_tests:
        
    # Drop NAs
    temp = full_training_data_sorted.dropna(subset=[u]) 
    print(u)
    # Train data
    X = temp[['visit_month','visit_month_lag']]
    #X = temp[['visit_month','NPX_standardised','Pep_standardised']]
    print(X.shape)
    y = temp[u]
    print(y.shape)
    poly = PolynomialFeatures(degree = 5)
    X_poly = poly.fit_transform(X)
    poly.fit(X_poly, y)
    print(X_poly.shape)
    trained = LinearRegression().fit(X_poly, y)
    
    # Save model
    poly_model_dict[u] = trained"""

In [None]:
"""for u in list_of_updrs_tests:
    
    X = full_training_data_sorted[['visit_month','visit_month_lag']]
    #X = full_training_data[['visit_month','NPX_standardised','Pep_standardised']]
    poly = PolynomialFeatures(degree = 5)
    X_poly = poly.fit_transform(X)
    
    y_true = full_training_data_sorted[u]
    y_pred = np.ceil(poly_model_dict[u].predict(X_poly))
    
    print('Polynomial regression SMAPE + 1 for', u, ':', smape(np.array(y_true), np.array(y_pred)))
    
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    print('RMSE with Polynomial regression model', rmse)"""

In [None]:
"""model_dict = {}

for u in list_of_updrs_tests:
        
    # Drop NAs
    temp = full_training_data_sorted.dropna(subset=[u]) 
    print(u)
    # Train data
    X = temp[['visit_month','visit_month_lag']]
    #X = temp[['visit_month','NPX_standardised','Pep_standardised']]
    print(X.shape)
    y = temp[u]
    print(y.shape)
    trained = LinearRegression().fit(X, y)
    
    # Save model
    model_dict[u] = trained"""

In [None]:
"""for u in list_of_updrs_tests:
    
    X = full_training_data_sorted[['visit_month','visit_month_lag']]
    #X = full_training_data[['visit_month','NPX_standardised','Pep_standardised']]

    y_true = full_training_data_sorted[u]
    y_pred = np.ceil(model_dict[u].predict(X))
    
    print('Linear regression SMAPE + 1 for', u, ':', smape(np.array(y_true), np.array(y_pred)))
    
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    print('RMSE with Polynomial regression model', rmse)"""

In [None]:
"""for u in list_of_updrs_tests:
    X = full_training_data_sorted[['visit_month','visit_month_lag']]
    y = full_training_data_sorted[u]
    scores = cross_val_score(random_forest_model_dict[u], X, y, scoring="neg_mean_squared_error", cv=10)
    rmse_scores = np.sqrt(-scores)
    print('for ', u)
    print('Scores:', rmse_scores)
    print('Mean:', rmse_scores.mean())
    print('Standard Deviation:', rmse_scores.std())
    print('')"""

In [35]:
def get_full_test_data(df_test, df_proteins=pd.DataFrame(), df_peptides=pd.DataFrame()):
    
    if  (df_proteins.shape[0] == 0) & (df_peptides.shape[0] == 0):
        
        print('only the test dataframe has data, proteins and peptides info absent')
        
        full_test_data = df_test[['patient_id','visit_id','visit_month']]
        full_test_data = full_test_data.drop_duplicates()
        full_test_data_sorted = full_test_data.sort_values(by=['patient_id','visit_month'],  ascending=True)
        full_test_data_sorted['visit_month_lag'] = full_test_data_sorted.groupby(['patient_id'])['visit_month'].shift(1)
        full_test_data_sorted['visit_month_lag'].fillna(-1, inplace=True)
        
        full_test_data_sorted = full_test_data_sorted.drop_duplicates()
        full_test_data_sorted = full_test_data_sorted[['visit_id','visit_month','visit_month_lag']]

        full_test_data_sorted = full_test_data_sorted.reset_index()
        full_test_data_sorted = full_test_data_sorted.drop(columns=['index'])
        
        return full_test_data_sorted
    
    elif (df_proteins.shape[0] == 0) & (df_peptides.shape[0] != 0):
        
        print('no proteins only peptides')
        print('df_test shape and number of unique visit_ids:', df_test.shape, df_test.visit_id.nunique())
        print('df_peptides shape and number of unique visit_ids:', df_peptides.shape, df_peptides.visit_id.nunique())
        df_test = df_test[['patient_id','visit_id','visit_month']]
        df_test = df_test.drop_duplicates()
        
        df_peptides = pd.merge(df_test, df_peptides, on=['patient_id','visit_id','visit_month'], suffixes=['_left','_right'], how='left')
        df_peptides = df_peptides[['patient_id','visit_id','visit_month']]
        df_peptides = df_peptides.drop_duplicates()
        
        df_peptides_sorted = df_peptides.sort_values(by=['patient_id','visit_month'],  ascending=True)
        df_peptides_sorted['visit_month_lag'] = df_peptides_sorted.groupby(['patient_id'])['visit_month'].shift(1)
        df_peptides_sorted['visit_month_lag'].fillna(-1, inplace=True)

        """
        full_test_data = pd.DataFrame()
        for i in [0,6,12,24]:
            temp = df_peptides[df_peptides['visit_month']==i]
            full_test_data = full_test_data.append(temp)
        
        del df_proteins, df_peptides
        """
        full_test_data = df_peptides_sorted
        del df_peptides_sorted
        print('full_test_data shape and number of unique visit_ids:', full_test_data.shape, full_test_data.visit_id.nunique())
        
        full_test_data = full_test_data.drop_duplicates(keep='first')
        full_test_data = full_test_data[['visit_id','visit_month', 'visit_month_lag']]

        full_test_data = full_test_data.reset_index()
        full_test_data = full_test_data.drop(columns=['index'])
        
        return full_test_data
            
    elif (df_peptides.shape[0] == 0) & (df_proteins.shape[0] != 0):
        
        print('no peptides only proteins')
        print('df_test shape and number of unique visit_ids:', df_test.shape, df_test.visit_id.nunique())
        print('df_proteins shape and number of unique visit_ids:', df_proteins.shape, df_proteins.visit_id.nunique())
        df_test = df_test[['patient_id','visit_id','visit_month']]
        df_test = df_test.drop_duplicates()
        
        df_proteins = pd.merge(df_test, df_proteins, on=['patient_id','visit_id','visit_month'], suffixes=['_left','_right'], how='left')
        df_proteins = df_proteins[['patient_id','visit_id','visit_month']]
        df_proteins = df_proteins.drop_duplicates()
        
        df_proteins_sorted = df_proteins.sort_values(by=['patient_id','visit_month'],  ascending=True)
        df_proteins_sorted['visit_month_lag'] = df_proteins_sorted.groupby(['patient_id'])['visit_month'].shift(1)
        df_proteins_sorted['visit_month_lag'].fillna(-1, inplace=True)

        """
        full_test_data = pd.DataFrame()
        for i in [0,6,12,24]:
            temp = df_proteins[df_proteins['visit_month']==i]
            full_test_data = full_test_data.append(temp)
        """
        full_test_data = df_proteins_sorted
        print('full_test_data shape and number of unique visit_ids:', full_test_data.shape, full_test_data.visit_id.nunique())
        
        full_test_data = full_test_data.drop_duplicates(keep='first')
        full_test_data = full_test_data[['visit_id','visit_month', 'visit_month_lag']]

        full_test_data = full_test_data.reset_index()
        full_test_data = full_test_data.drop(columns=['index'])
        
        return full_test_data
        
    else:
        
        print('both proteins and peptides are present')
        
        df_test = df_test[['patient_id','visit_id','visit_month']]
        df_test = df_test.drop_duplicates()
        
        pro_pep_join = pd.merge(df_proteins, df_peptides, on=['patient_id','visit_id','visit_month','UniProt'], suffixes=['_left','_right'], how='outer')
        pro_pep_join = pro_pep_join[['patient_id','visit_id','visit_month']]
        pro_pep_join = pro_pep_join.drop_duplicates()
        #del df_proteins, df_peptides
        print('df_proteins shape and number of unique visit_ids:', df_proteins.shape, df_proteins.visit_id.nunique())
        print('df_peptides shape and number of unique visit_ids:', df_peptides.shape, df_peptides.visit_id.nunique())
        print('pro_pep_join shape and number of unique visit_ids:', pro_pep_join.shape, pro_pep_join.visit_id.nunique())
        
        pro_pep_join_test = pd.merge(df_test, pro_pep_join, on=['patient_id','visit_id','visit_month'], suffixes=['_left','_right'], how='left')
        
        pro_pep_join_test_sorted = pro_pep_join_test.sort_values(by=['patient_id','visit_month'],  ascending=True)
        pro_pep_join_test_sorted['visit_month_lag'] = pro_pep_join_test_sorted.groupby(['patient_id'])['visit_month'].shift(1)
        pro_pep_join_test_sorted['visit_month_lag'].fillna(-1, inplace=True)

        """
        full_test_data = pd.DataFrame()

        for i in [0,6,12,24]:
            temp = pro_pep_join_test[pro_pep_join_test['visit_month']==i]
            full_test_data = full_test_data.append(temp)
        """
        full_test_data = pro_pep_join_test_sorted 
        print('full_test_data shape and number of unique visit_ids:', full_test_data.shape, full_test_data.visit_id.nunique())

        full_test_data = full_test_data.drop_duplicates()
        full_test_data = full_test_data[['visit_id','visit_month', 'visit_month_lag']]

        full_test_data = full_test_data.reset_index()
        full_test_data = full_test_data.drop(columns=['index'])
        
        return full_test_data

In [38]:
def get_predictions_v2(model_dict, df_test, df_proteins=pd.DataFrame(), df_peptides=pd.DataFrame()):
    
    list_of_updrs_tests = ['updrs_1','updrs_2','updrs_3','updrs_4']
    result = pd.DataFrame()
    
    full_test_data = get_full_test_data(df_test=df_test, df_proteins=df_proteins, df_peptides=df_peptides)
    
    if full_test_data.shape[0] != 0:
        """df_prepared = sklearn_pipeline.transform(full_test_data)
        print(df_prepared.shape)"""

        #for u in list_of_updrs_tests:
        X = full_test_data[['visit_month']]
        
        """poly = PolynomialFeatures(degree = 5)
        X_poly = poly.fit_transform(X)
        full_test_data['result_' + str(u)] = np.ceil(model_dict[u].predict(X_poly))"""
        ## for all other models
        #full_test_data['result_' + str(u)] = np.ceil(model_dict[u].predict(X))
        full_test_data['predicted_updrs_1'] = np.ceil(model_dict['updrs_1'].predict(X))
        
        X = full_test_data[['visit_month','predicted_updrs_1']]
        full_test_data['predicted_updrs_2'] = np.ceil(model_dict['updrs_2'].predict(X))
        
        X = full_test_data[['visit_month','predicted_updrs_1','predicted_updrs_2']]
        full_test_data['predicted_updrs_3'] = np.ceil(model_dict['updrs_3'].predict(X))
        
        X = full_test_data[['visit_month','predicted_updrs_1','predicted_updrs_2','predicted_updrs_3']]
        full_test_data['predicted_updrs_4'] = np.ceil(model_dict['updrs_4'].predict(X))

        for m in [0, 6, 12, 24]:
            for u in [1, 2, 3, 4]:
                temp = full_test_data[['visit_id', 'visit_month', 'predicted_updrs_' + str(u)]]
                temp['prediction_id'] = temp['visit_id'] + '_updrs_' + str(u) + '_plus_' + str(m) + '_months'
                
                #if (u == 2) | (u == 4):
                if (u == 4):
                    temp["rating"] = 0
                else:
                    temp["rating"] = temp["predicted_updrs_" + str(u)].apply(lambda x: 0 if x < 0 else x)
                
                temp = temp[['prediction_id', 'rating']]

                result = result.append(temp)
                
        result = result.drop_duplicates(subset=['prediction_id', 'rating'])
        result = result.reset_index()
        result.drop(columns=['index'], inplace=True)
    
    return result

In [None]:
def get_predictions(model_dict, df_test, df_proteins=pd.DataFrame(), df_peptides=pd.DataFrame()):
    
    list_of_updrs_tests = ['updrs_1','updrs_2','updrs_3','updrs_4']
    result = pd.DataFrame()
    
    full_test_data = get_full_test_data(df_test=df_test, df_proteins=df_proteins, df_peptides=df_peptides)
    
    if full_test_data.shape[0] != 0:
        """df_prepared = sklearn_pipeline.transform(full_test_data)
        print(df_prepared.shape)"""

        for u in list_of_updrs_tests:
            X = full_test_data[['visit_month','visit_month_lag']]
            #X = full_test_data[['visit_month','NPX_standardised','Pep_standardised']]
            ## for poly model
            """poly = PolynomialFeatures(degree = 5)
            X_poly = poly.fit_transform(X)
            full_test_data['result_' + str(u)] = np.ceil(model_dict[u].predict(X_poly))"""
            #full_test_data['result_' + str(u)] = pd.DataFrame(model_dict[u].predict(df_prepared),columns=['rating']).round(decimals=0)
            ## for all other models
            full_test_data['result_' + str(u)] = np.ceil(model_dict[u].predict(X))

        for m in [0, 6, 12, 24]:
            for u in [1, 2, 3, 4]:
                temp = full_test_data[['visit_id', 'visit_month', 'result_updrs_' + str(u)]]
                temp['prediction_id'] = temp['visit_id'] + '_updrs_' + str(u) + '_plus_' + str(m) + '_months'
                
                #if (u == 2) | (u == 4):
                if (u == 4):
                    temp["rating"] = 0
                else:
                    temp["rating"] = temp["result_updrs_" + str(u)].apply(lambda x: 0 if x < 0 else x)
                
                #temp = temp[['visit_id', 'visit_month', 'prediction_id', 'rating']]
                temp = temp[['prediction_id', 'rating']]

                result = result.append(temp)

        #result.sort_values(by=['visit_id', 'visit_month'], inplace=True)
        #result = result[['prediction_id', 'rating']]
        #result['prediction_id'] = result['prediction_id'].astype('string') 
        #result['rating'] = result['rating'].astype('int') 
        result = result.drop_duplicates(subset=['prediction_id', 'rating'])
        result = result.reset_index()
        result.drop(columns=['index'], inplace=True)
    
    return result

In [39]:
test = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test.csv')

test_proteins = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_proteins.csv')

test_peptides = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_peptides.csv')

#result = get_predictions(model_dict=random_forest_model_dict, df_test=test, df_proteins=test_proteins, df_peptides=test_peptides)
result = get_predictions_v2(model_dict=random_forest_model_dict, df_test=test)
#result = get_predictions(model_dict=random_forest_model_dict, df_test=test, df_proteins=test_proteins)
#result = get_predictions(model_dict=random_forest_model_dict, df_test=test, df_peptides=test_peptides)

#result = get_predictions(model_dict=model_dict, sklearn_pipeline=full_pipeline)

result

only the test dataframe has data, proteins and peptides info absent


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,prediction_id,rating
0,3342_0_updrs_1_plus_0_months,6.0
1,3342_6_updrs_1_plus_0_months,6.0
2,50423_0_updrs_1_plus_0_months,6.0
3,50423_6_updrs_1_plus_0_months,6.0
4,3342_0_updrs_2_plus_0_months,6.0
...,...,...
59,50423_6_updrs_3_plus_24_months,22.0
60,3342_0_updrs_4_plus_24_months,0.0
61,3342_6_updrs_4_plus_24_months,0.0
62,50423_0_updrs_4_plus_24_months,0.0


In [40]:
import amp_pd_peptide

env = amp_pd_peptide.make_env()   # initialize the environment for one run only

"""amp_pd_peptide.make_env.__called__ = False
type(env)._state = type(type(env)._state).__dict__['INIT']
"""
iter_test = env.iter_test()

for (test, test_peptides, test_proteins, submission) in iter_test:
    submission = get_predictions_v2(model_dict=random_forest_model_dict, df_test=test)
    #result = get_predictions(model_dict=decision_model_dict, df_test=test, df_peptides=test_peptides)
    #submission = get_predictions(model_dict=decision_model_dict, df_test=test, df_proteins=test_proteins, df_peptides=test_peptides)
    env.predict(submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
only the test dataframe has data, proteins and peptides info absent
only the test dataframe has data, proteins and peptides info absent


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [None]:
"""submission = pd.read_csv('/kaggle/working/submission.csv')
submission.shape"""