In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
from pandas.plotting import scatter_matrix
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

from statsmodels.tsa.seasonal import seasonal_decompose
import re
import plotly.express as px
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/public_timeseries_testing_util.py
/kaggle/input/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/train_peptides.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/amp_pd_peptide/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/amp-parkinsons-disease-progression-prediction/amp_pd_peptide/__init__.py
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/sample_submission.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_proteins.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_peptides.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test

In [2]:
train_proteins_df = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv')
print('proteins :', train_proteins_df.shape)
print('proteins unique visit_ids:', train_proteins_df.visit_id.nunique())
print('proteins unique patient_ids:', train_proteins_df.patient_id.nunique())
print(train_proteins_df.columns)

train_peptides_df = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_peptides.csv')
print('')
print('peptides :', train_peptides_df.shape)
print('peptides unique visit_ids:', train_peptides_df.visit_id.nunique())
print('peptides unique patient_ids:', train_peptides_df.patient_id.nunique())
print(train_peptides_df.columns)

train_clinical_data_df = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv')
print('')
print('clinical :', train_clinical_data_df.shape)
print('clinical unique visit_ids:', train_clinical_data_df.visit_id.nunique())
print(train_clinical_data_df.columns)

supp_clinical_data_df = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv')
print('')
print('supplemental :', supp_clinical_data_df.shape)
print('supplemental unique visit_ids:', supp_clinical_data_df.visit_id.nunique())
print(supp_clinical_data_df.columns)

proteins : (232741, 5)
proteins unique visit_ids: 1113
proteins unique patient_ids: 248
Index(['visit_id', 'visit_month', 'patient_id', 'UniProt', 'NPX'], dtype='object')

peptides : (981834, 6)
peptides unique visit_ids: 1113
peptides unique patient_ids: 248
Index(['visit_id', 'visit_month', 'patient_id', 'UniProt', 'Peptide',
       'PeptideAbundance'],
      dtype='object')

clinical : (2615, 8)
clinical unique visit_ids: 2615
Index(['visit_id', 'patient_id', 'visit_month', 'updrs_1', 'updrs_2',
       'updrs_3', 'updrs_4', 'upd23b_clinical_state_on_medication'],
      dtype='object')

supplemental : (2223, 8)
supplemental unique visit_ids: 2223
Index(['visit_id', 'patient_id', 'visit_month', 'updrs_1', 'updrs_2',
       'updrs_3', 'updrs_4', 'upd23b_clinical_state_on_medication'],
      dtype='object')


In [3]:
clinical_data = pd.merge(supp_clinical_data_df, train_clinical_data_df, on=['patient_id','visit_id','visit_month'], how='outer')

for i in [1,2,3,4]:
    clinical_data['updrs_'+str(i)] = np.where(clinical_data['updrs_'+str(i)+'_x'].isna(), clinical_data['updrs_'+str(i)+'_y'], clinical_data['updrs_'+str(i)+'_x'])
    clinical_data.drop(columns=['updrs_'+str(i)+'_x', 'updrs_'+str(i)+'_y'], inplace=True)
    
clinical_data.drop(columns=['upd23b_clinical_state_on_medication_x', 'upd23b_clinical_state_on_medication_y'], inplace=True)

print(clinical_data.shape)
print(clinical_data.visit_id.nunique())

del supp_clinical_data_df, train_clinical_data_df

clinical_data.head(10)

(4838, 7)
4838


Unnamed: 0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4
0,35_0,35,0,5.0,3.0,16.0,0.0
1,35_36,35,36,6.0,4.0,20.0,0.0
2,75_0,75,0,4.0,6.0,26.0,0.0
3,75_36,75,36,1.0,8.0,38.0,0.0
4,155_0,155,0,,,0.0,
5,337_0,337,0,5.0,7.0,6.0,0.0
6,337_36,337,36,8.0,7.0,8.0,0.0
7,527_0,527,0,6.0,2.0,9.0,0.0
8,527_36,527,36,2.0,18.0,22.0,0.0
9,557_0,557,0,5.0,6.0,22.0,0.0


In [4]:
pro_pep_join = pd.merge(train_proteins_df, train_peptides_df, on=['patient_id','visit_id','visit_month','UniProt'], how='outer')

print(pro_pep_join.shape)
print(pro_pep_join.visit_id.nunique())

del train_proteins_df, train_peptides_df
pro_pep_join.head(10)

(981834, 7)
1113


Unnamed: 0,visit_id,visit_month,patient_id,UniProt,NPX,Peptide,PeptideAbundance
0,55_0,0,55,O00391,11254.3,NEQEQPLGQWHLS,11254.3
1,55_0,0,55,O00533,732430.0,GNPEPTFSWTK,102060.0
2,55_0,0,55,O00533,732430.0,IEIPSSVQQVPTIIK,174185.0
3,55_0,0,55,O00533,732430.0,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9
4,55_0,0,55,O00533,732430.0,SMEQNGPGLEYR,30838.7
5,55_0,0,55,O00533,732430.0,TLKIENVSYQDKGNYR,23216.5
6,55_0,0,55,O00533,732430.0,VIAVNEVGR,170878.0
7,55_0,0,55,O00533,732430.0,VMTPAVYAPYDVK,148771.0
8,55_0,0,55,O00533,732430.0,VNGSPVDNHPFAGDVVFPR,55202.1
9,55_0,0,55,O00584,39585.8,ELDLNSVLLK,27229.3


In [5]:
def remove_unimod(peptide_seqn):
    return re.sub('[(UniMod_)0-9]+','',peptide_seqn)

#print(remove_unimod('KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK'))

pro_pep_join['Peptide'] = pro_pep_join['Peptide'].apply(remove_unimod)
#pro_pep_join.head(10)

def amino_acid_count(amino_seqn, amino_acid):
    return amino_seqn.count(amino_acid)

vect_func = np.vectorize(amino_acid_count)

"""for alph in ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']:
    pro_pep_join[alph+'_count'] = vect_func(pro_pep_join['Peptide'], alph)"""

pro_pep_join['F_count'] = vect_func(pro_pep_join['Peptide'], 'F')
pro_pep_join['Y_count'] = vect_func(pro_pep_join['Peptide'], 'Y')
pro_pep_join['W_count'] = vect_func(pro_pep_join['Peptide'], 'W')
pro_pep_join['A_count'] = vect_func(pro_pep_join['Peptide'], 'A')
pro_pep_join['L_count'] = vect_func(pro_pep_join['Peptide'], 'L')
    
#pro_pep_join.head(10)

pro_pep_join_agg = pro_pep_join.groupby(['patient_id','visit_id','visit_month']).agg(F=('F_count', 'sum'),Y=('Y_count', 'sum'),W=('W_count', 'sum')\
                                                                                    ,A=('A_count', 'sum'),L=('L_count', 'sum'))\
.reset_index(level=['patient_id','visit_id','visit_month'])

pro_pep_join_agg.head(10)

Unnamed: 0,patient_id,visit_id,visit_month,F,Y,W,A,L
0,55,55_0,0,408,430,117,908,1019
1,55,55_12,12,410,435,115,923,1029
2,55,55_36,36,410,435,119,918,1017
3,55,55_6,6,413,434,116,914,1017
4,942,942_12,12,396,418,116,849,993
5,942,942_24,24,399,419,113,887,988
6,942,942_48,48,397,416,116,894,992
7,942,942_6,6,400,427,114,896,995
8,1517,1517_0,0,399,418,119,893,981
9,1517,1517_24,24,408,424,116,889,986


In [6]:
full_training_data = pd.merge(pro_pep_join_agg, clinical_data, on=['patient_id','visit_id','visit_month'], how='outer')
#full_training_data = full_training_data.drop(columns=['UniProt','Peptide','NPX','PeptideAbundance'])

#full_training_data['NPX_inverse'] = 1/full_training_data['NPX']
#full_training_data['PeptideAbundance_inverse'] = 1/full_training_data['PeptideAbundance']

#full_training_data = full_training_data.drop(columns=['UniProt'])

print(full_training_data.shape)
print(full_training_data.visit_id.nunique())

#del pro_pep_join, clinical_data
gc.collect()

full_training_data[full_training_data['patient_id']==55]

(4883, 12)
4883


Unnamed: 0,patient_id,visit_id,visit_month,F,Y,W,A,L,updrs_1,updrs_2,updrs_3,updrs_4
0,55,55_0,0,408.0,430.0,117.0,908.0,1019.0,10.0,6.0,15.0,
1,55,55_12,12,410.0,435.0,115.0,923.0,1029.0,10.0,10.0,41.0,0.0
2,55,55_36,36,410.0,435.0,119.0,918.0,1017.0,17.0,18.0,51.0,0.0
3,55,55_6,6,413.0,434.0,116.0,914.0,1017.0,8.0,10.0,34.0,
3336,55,55_3,3,,,,,,10.0,7.0,25.0,
3337,55,55_9,9,,,,,,8.0,9.0,30.0,0.0
3338,55,55_18,18,,,,,,7.0,13.0,38.0,0.0
3339,55,55_24,24,,,,,,16.0,9.0,49.0,0.0
3340,55,55_30,30,,,,,,14.0,13.0,49.0,0.0
3341,55,55_42,42,,,,,,12.0,20.0,41.0,0.0


In [7]:
#columns_to_remove_nan_from = ['updrs_1','updrs_2','updrs_3','updrs_4']
columns_to_remove_nan_from = ['updrs_1','updrs_2','updrs_3','updrs_4','F','Y','W','L','A']

for i in columns_to_remove_nan_from:
    #full_training_data[i].fillna(0, inplace=True)
    #full_training_data[i].fillna(full_training_data[i].expanding(1).max(), inplace=True)
    #full_training_data[i].fillna(0, inplace=True)
    #full_training_data[i].fillna(full_training_data[i].median().round(decimals = 0), inplace=True)
    if i in(['F','Y','W','L','A']): 
    #!= 'F') & (i != 'Y') & (i != 'W'):
        full_training_data[i].fillna(full_training_data[i].mean().round(decimals = 0), inplace=True)
        #full_training_data[i].fillna(0, inplace=True)
    elif i != 'updrs_4':
        #full_training_data[i].fillna(0, inplace=True)
        full_training_data[i].fillna(full_training_data[i].median().round(decimals = 0), inplace=True)
    else:
        full_training_data[i].fillna(0, inplace=True)

In [8]:
full_training_data = full_training_data.drop_duplicates()

full_training_data_sorted = full_training_data.sort_values(by=['patient_id','visit_month'],  ascending=True)

full_training_data_sorted['visit_month_lag'] = full_training_data_sorted.groupby(['patient_id'])['visit_month'].shift(1)
full_training_data_sorted['visit_month_lag'].fillna(-1, inplace=True)

full_training_data_sorted['visit_month_lead'] = full_training_data_sorted.groupby(['patient_id'])['visit_month'].shift(-1)
full_training_data_sorted['visit_month_lead'].fillna(-1, inplace=True)

"""full_training_data_sorted['visit_month_lead'] = full_training_data_sorted.groupby(['patient_id'])['visit_month'].shift(-1)
full_training_data_sorted['visit_month_lead'].fillna(-1, inplace=True)"""

full_training_data_sorted[full_training_data_sorted['patient_id']==55]

Unnamed: 0,patient_id,visit_id,visit_month,F,Y,W,A,L,updrs_1,updrs_2,updrs_3,updrs_4,visit_month_lag,visit_month_lead
0,55,55_0,0,408.0,430.0,117.0,908.0,1019.0,10.0,6.0,15.0,0.0,-1.0,3.0
3336,55,55_3,3,392.0,407.0,111.0,861.0,966.0,10.0,7.0,25.0,0.0,0.0,6.0
3,55,55_6,6,413.0,434.0,116.0,914.0,1017.0,8.0,10.0,34.0,0.0,3.0,9.0
3337,55,55_9,9,392.0,407.0,111.0,861.0,966.0,8.0,9.0,30.0,0.0,6.0,12.0
1,55,55_12,12,410.0,435.0,115.0,923.0,1029.0,10.0,10.0,41.0,0.0,9.0,18.0
3338,55,55_18,18,392.0,407.0,111.0,861.0,966.0,7.0,13.0,38.0,0.0,12.0,24.0
3339,55,55_24,24,392.0,407.0,111.0,861.0,966.0,16.0,9.0,49.0,0.0,18.0,30.0
3340,55,55_30,30,392.0,407.0,111.0,861.0,966.0,14.0,13.0,49.0,0.0,24.0,36.0
2,55,55_36,36,410.0,435.0,119.0,918.0,1017.0,17.0,18.0,51.0,0.0,30.0,42.0
3341,55,55_42,42,392.0,407.0,111.0,861.0,966.0,12.0,20.0,41.0,0.0,36.0,48.0


In [9]:
full_training_data_sorted = full_training_data_sorted.reset_index()
full_training_data_sorted = full_training_data_sorted.drop(columns=['patient_id','index'])

del full_training_data
gc.collect()
full_training_data_sorted.head(10)

Unnamed: 0,visit_id,visit_month,F,Y,W,A,L,updrs_1,updrs_2,updrs_3,updrs_4,visit_month_lag,visit_month_lead
0,35_0,0,392.0,407.0,111.0,861.0,966.0,5.0,3.0,16.0,0.0,-1.0,36.0
1,35_36,36,392.0,407.0,111.0,861.0,966.0,6.0,4.0,20.0,0.0,0.0,-1.0
2,55_0,0,408.0,430.0,117.0,908.0,1019.0,10.0,6.0,15.0,0.0,-1.0,3.0
3,55_3,3,392.0,407.0,111.0,861.0,966.0,10.0,7.0,25.0,0.0,0.0,6.0
4,55_6,6,413.0,434.0,116.0,914.0,1017.0,8.0,10.0,34.0,0.0,3.0,9.0
5,55_9,9,392.0,407.0,111.0,861.0,966.0,8.0,9.0,30.0,0.0,6.0,12.0
6,55_12,12,410.0,435.0,115.0,923.0,1029.0,10.0,10.0,41.0,0.0,9.0,18.0
7,55_18,18,392.0,407.0,111.0,861.0,966.0,7.0,13.0,38.0,0.0,12.0,24.0
8,55_24,24,392.0,407.0,111.0,861.0,966.0,16.0,9.0,49.0,0.0,18.0,30.0
9,55_30,30,392.0,407.0,111.0,861.0,966.0,14.0,13.0,49.0,0.0,24.0,36.0


In [10]:
temp = full_training_data_sorted[full_training_data_sorted['visit_month'].isin([0,6,12,24])]
temp[['updrs_1','updrs_2','updrs_3','updrs_4','visit_month','visit_month_lag','visit_month_lead','F','Y','W','A','L']].corr()

Unnamed: 0,updrs_1,updrs_2,updrs_3,updrs_4,visit_month,visit_month_lag,visit_month_lead,F,Y,W,A,L
updrs_1,1.0,0.587245,0.276154,0.302327,0.046661,0.07247,-0.017046,-0.08405,-0.071796,-0.058638,-0.092601,-0.091153
updrs_2,0.587245,1.0,0.588422,0.277829,0.026213,0.055592,-0.054681,-0.03598,-0.026398,-0.019071,-0.052415,-0.042523
updrs_3,0.276154,0.588422,1.0,0.193204,0.038724,0.05548,0.084157,-0.047381,-0.039756,-0.03647,-0.058194,-0.050791
updrs_4,0.302327,0.277829,0.193204,1.0,0.07516,0.080621,-0.020622,-0.122511,-0.11166,-0.052107,-0.089966,-0.120151
visit_month,0.046661,0.026213,0.038724,0.07516,1.0,0.957017,0.543097,-0.011828,-0.016497,-0.019294,-0.018782,-0.019465
visit_month_lag,0.07247,0.055592,0.05548,0.080621,0.957017,1.0,0.532109,-0.009734,-0.012687,-0.013036,-0.016576,-0.017096
visit_month_lead,-0.017046,-0.054681,0.084157,-0.020622,0.543097,0.532109,1.0,-0.013073,-0.019458,-0.023512,-0.01736,-0.018278
F,-0.08405,-0.03598,-0.047381,-0.122511,-0.011828,-0.009734,-0.013073,1.0,0.94528,0.797455,0.904346,0.966806
Y,-0.071796,-0.026398,-0.039756,-0.11166,-0.016497,-0.012687,-0.019458,0.94528,1.0,0.865656,0.935736,0.965424
W,-0.058638,-0.019071,-0.03647,-0.052107,-0.019294,-0.013036,-0.023512,0.797455,0.865656,1.0,0.868887,0.855128


In [11]:
full_training_data_sorted[['updrs_1','updrs_2','updrs_3','updrs_4','visit_month','visit_month_lag','visit_month_lead','F','Y','W','A','L']].corr()

Unnamed: 0,updrs_1,updrs_2,updrs_3,updrs_4,visit_month,visit_month_lag,visit_month_lead,F,Y,W,A,L
updrs_1,1.0,0.630341,0.303248,0.384606,0.156118,0.167924,0.08028,-0.069478,-0.063337,-0.043432,-0.075189,-0.072076
updrs_2,0.630341,1.0,0.594681,0.34265,0.122514,0.135629,0.031582,-0.040575,-0.034048,-0.014677,-0.048259,-0.042162
updrs_3,0.303248,0.594681,1.0,0.211337,0.046043,0.060763,-0.008986,-0.036336,-0.029821,-0.020593,-0.046208,-0.037189
updrs_4,0.384606,0.34265,0.211337,1.0,0.207707,0.225779,0.112113,-0.084474,-0.076475,-0.037598,-0.06593,-0.081077
visit_month,0.156118,0.122514,0.046043,0.207707,1.0,0.93888,0.50229,0.009104,0.006345,0.004376,0.008968,0.008997
visit_month_lag,0.167924,0.135629,0.060763,0.225779,0.93888,1.0,0.573298,0.006372,0.004787,0.00531,0.006515,0.006753
visit_month_lead,0.08028,0.031582,-0.008986,0.112113,0.50229,0.573298,1.0,0.025693,0.026059,0.009919,0.027895,0.027005
F,-0.069478,-0.040575,-0.036336,-0.084474,0.009104,0.006372,0.025693,1.0,0.954972,0.816626,0.920163,0.974005
Y,-0.063337,-0.034048,-0.029821,-0.076475,0.006345,0.004787,0.026059,0.954972,1.0,0.874055,0.945009,0.969324
W,-0.043432,-0.014677,-0.020593,-0.037598,0.004376,0.00531,0.009919,0.816626,0.874055,1.0,0.87572,0.862087


In [12]:
full_training_data_sorted.isna().sum()

visit_id            0
visit_month         0
F                   0
Y                   0
W                   0
A                   0
L                   0
updrs_1             0
updrs_2             0
updrs_3             0
updrs_4             0
visit_month_lag     0
visit_month_lead    0
dtype: int64

In [68]:
#random_forest_model_dict = {}
#poly_model_dict = {}
#model_dict = {}
#ridge_model_dict = {}
lasso_model_dict = {}
#sgd_model_dict = {}

list_of_updrs_tests = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']

"""ridge_model_dict = {}"""
temp = full_training_data_sorted
#temp = full_training_data_sorted.dropna(subset=['updrs_1'])
#X = temp[['visit_month','visit_month_lag']]
X = temp[['visit_month']]
print(X.shape)
y = temp['updrs_1']
print(y.shape)
"""poly = PolynomialFeatures(degree = 5)
X_poly = poly.fit_transform(X)
poly.fit(X_poly, y)
print(X_poly.shape)
trained = LinearRegression().fit(X_poly, y)"""
#trained = LinearRegression().fit(X, y)
#trained = RidgeCV(alphas=[0.001,0.0001,0.01], gcv_mode='svd', alpha_per_target=True).fit(X, y)
trained = Lasso(alpha=1.03, max_iter=4500).fit(X, y)
#trained = SGDRegressor(alpha=0.011, learning_rate='optimal', early_stopping=True, validation_fraction=0.1).fit(X, y)
#trained = RandomForestRegressor().fit(X, y)
#random_forest_model_dict['updrs_1'] = trained
#poly_model_dict['updrs_1'] = trained
#model_dict['updrs_1'] = trained
#ridge_model_dict['updrs_1'] = trained
#sgd_model_dict['updrs_1'] = trained
lasso_model_dict['updrs_1'] = trained

temp['predicted_updrs_1'] = np.ceil(lasso_model_dict['updrs_1'].predict(X)) + 3.6

#print(round(ridge_model_dict['updrs_1'].score(X, y),5))

(4883, 1)
(4883,)


In [69]:
temp = temp[temp['updrs_2'] != 0]
X = temp[['visit_month','predicted_updrs_1']]
print(X.shape)
y = temp['updrs_2']
print(y.shape)
"""X_poly = poly.fit_transform(X)
poly.fit(X_poly, y)
print(X_poly.shape)
trained = LinearRegression().fit(X_poly, y)"""
#trained = RidgeCV(alphas=[0.001,0.0001,0.01], gcv_mode='svd', alpha_per_target=True).fit(X, y)
trained = Lasso(alpha=1.03, max_iter=4500).fit(X, y)
#trained = SGDRegressor(alpha=0.011, learning_rate='optimal', early_stopping=True, validation_fraction=0.1).fit(X, y)
#trained = LinearRegression().fit(X, y)
#trained = RandomForestRegressor().fit(X, y)
#random_forest_model_dict['updrs_2'] = trained
#poly_model_dict['updrs_2'] = trained
#model_dict['updrs_2'] = trained
#ridge_model_dict['updrs_2'] = trained
#sgd_model_dict['updrs_2'] = trained
lasso_model_dict['updrs_2'] = trained

temp['predicted_updrs_2'] = np.ceil(lasso_model_dict['updrs_2'].predict(X))

(4335, 2)
(4335,)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [70]:
#ridge_model_dict = {}
#model_dict = {}
#list_of_updrs_tests = ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']

#temp = full_training_data_sorted.dropna(subset=['updrs_3'])
#temp = full_training_data_sorted[full_training_data_sorted['visit_month'].isin([0,6,12,24])]
#temp = full_training_data_sorted

temp = temp[temp['updrs_3'] != 0]
X = temp[['predicted_updrs_1','predicted_updrs_2']]
print(X.shape)
y = temp['updrs_3']
print(y.shape)
"""X_poly = poly.fit_transform(X)
poly.fit(X_poly, y)
print(X_poly.shape)
trained = LinearRegression().fit(X_poly, y)"""
#trained = RidgeCV(alphas=[0.001,0.0001,0.01], gcv_mode='svd', alpha_per_target=True).fit(X, y)
trained = Lasso(alpha=1.03, max_iter=4500).fit(X, y)
#trained = SGDRegressor(alpha=0.011, learning_rate='optimal', early_stopping=True, validation_fraction=0.1).fit(X, y)
#trained = LinearRegression().fit(X, y)
#trained = RandomForestRegressor().fit(X, y)
#random_forest_model_dict['updrs_3'] = trained
#poly_model_dict['updrs_3'] = trained
#model_dict['updrs_3'] = trained
#ridge_model_dict['updrs_3'] = trained
#sgd_model_dict['updrs_3'] = trained
lasso_model_dict['updrs_3'] = trained

temp['predicted_updrs_3'] = np.ceil(lasso_model_dict['updrs_3'].predict(X)) 

(4230, 2)
(4230,)


In [71]:
X = temp[['visit_month','predicted_updrs_1','predicted_updrs_2','predicted_updrs_3']]
print(X.shape)
y = temp['updrs_4']
print(y.shape)
"""X_poly = poly.fit_transform(X)
poly.fit(X_poly, y)
print(X_poly.shape)
trained = LinearRegression().fit(X_poly, y)"""
#trained = RidgeCV(alphas=[0.001,0.0001,0.01], gcv_mode='svd', alpha_per_target=True).fit(X, y)
trained = Lasso(alpha=1.03, max_iter=4500).fit(X, y)
#trained = SGDRegressor(alpha=0.051, learning_rate='optimal', early_stopping=True, validation_fraction=0.1).fit(X, y)
#trained = LinearRegression().fit(X, y)
#trained = RandomForestRegressor().fit(X, y)
#random_forest_model_dict['updrs_4'] = trained
#poly_model_dict['updrs_4'] = trained
#model_dict['updrs_4'] = trained
#ridge_model_dict['updrs_4'] = trained
#sgd_model_dict['updrs_4'] = trained
lasso_model_dict['updrs_4'] = trained

temp['predicted_updrs_4'] = np.ceil(lasso_model_dict['updrs_4'].predict(X))

(4230, 4)
(4230,)


In [17]:
def smape(y_true, y_pred):
    smap = np.zeros(len(y_true))

    num = np.abs(y_true - y_pred)
    dem = ((np.abs(y_true) + np.abs(y_pred)) / 2)

    pos_ind = dem != 0
    smap[pos_ind] = num[pos_ind] / dem[pos_ind]

    return 100 * np.mean(smap)

In [18]:
print(full_training_data_sorted.shape)
print(temp.shape)

(4883, 14)
(4230, 17)


In [72]:
for u in list_of_updrs_tests:
    
    #y_true = full_training_data_sorted[u]
    y_true = temp[u]
    y_pred = temp['predicted_'+u]
    
    print('SMAPE + 1 for', u, ':', smape(np.array(y_true), np.array(y_pred)))
    
    """mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    print('RMSE ', rmse)"""

SMAPE + 1 for updrs_1 : 71.90451055874517
SMAPE + 1 for updrs_2 : 60.251680674100115
SMAPE + 1 for updrs_3 : 43.880589268396136
SMAPE + 1 for updrs_4 : 173.60082804137872


In [61]:
pred_vs_true = temp[temp['visit_month'].isin([0,6,12,24])][['visit_id','visit_month','updrs_3','predicted_updrs_3']]

pred_vs_true['error'] = np.abs(pred_vs_true['updrs_3']-pred_vs_true['predicted_updrs_3'])

#pred_vs_true_melt = pd.melt(pred_vs_true, id_vars=['visit_id'], value_vars=['updrs_1', 'predicted_updrs_1'])

#pred_vs_true_melt

#fig = px.scatter(pred_vs_true_melt, x=pred_vs_true_melt['visit_id'], y=pred_vs_true_melt['value'], color='variable')
fig = px.scatter(pred_vs_true, x=pred_vs_true['visit_id'], y=pred_vs_true['error'], facet_row=pred_vs_true['visit_month']\
                 , trendline="ols", trendline_color_override="black")

fig.show()

In [73]:
pred_vs_true = temp[temp['visit_month'].isin([0,6,12,24])][['visit_id','visit_month','updrs_1','predicted_updrs_1']]

pred_vs_true['error'] = np.abs(pred_vs_true['updrs_1']-pred_vs_true['predicted_updrs_1'])

#pred_vs_true_melt = pd.melt(pred_vs_true, id_vars=['visit_id'], value_vars=['updrs_1', 'predicted_updrs_1'])

#pred_vs_true_melt

#fig = px.scatter(pred_vs_true_melt, x=pred_vs_true_melt['visit_id'], y=pred_vs_true_melt['value'], color='variable')
fig = px.scatter(pred_vs_true, x=pred_vs_true['visit_id'], y=pred_vs_true['error'], facet_row=pred_vs_true['visit_month']\
                 , trendline="ols", trendline_color_override="black")

fig.show()

In [74]:
pred_vs_true = temp[['visit_id','updrs_1','predicted_updrs_1']]

pred_vs_true_melt = pd.melt(pred_vs_true, id_vars=['visit_id'], value_vars=['updrs_1', 'predicted_updrs_1'])

#pred_vs_true_melt

fig = px.scatter(pred_vs_true_melt, x=pred_vs_true_melt['visit_id'], y=pred_vs_true_melt['value'], color='variable')
fig.show()

In [20]:
def get_full_test_data(df_test, df_proteins=pd.DataFrame(), df_peptides=pd.DataFrame()):
    
    if  (df_proteins.shape[0] == 0) & (df_peptides.shape[0] == 0):
        
        print('only the test dataframe has data, proteins and peptides info absent')
        
        full_test_data = df_test[['patient_id','visit_id','visit_month']]
        full_test_data = full_test_data.drop_duplicates()
        full_test_data_sorted = full_test_data.sort_values(by=['patient_id','visit_month'],  ascending=True)
        full_test_data_sorted['visit_month_lag'] = full_test_data_sorted.groupby(['patient_id'])['visit_month'].shift(1)
        full_test_data_sorted['visit_month_lag'].fillna(-1, inplace=True)
        
        full_test_data_sorted = full_test_data_sorted.drop_duplicates()
        full_test_data_sorted = full_test_data_sorted[['visit_id','visit_month','visit_month_lag']]

        full_test_data_sorted = full_test_data_sorted.reset_index()
        full_test_data_sorted = full_test_data_sorted.drop(columns=['index'])
        
        return full_test_data_sorted
    
    elif (df_proteins.shape[0] == 0) & (df_peptides.shape[0] != 0):
        
        print('no proteins only peptides')
        print('df_test shape and number of unique visit_ids:', df_test.shape, df_test.visit_id.nunique())
        print('df_peptides shape and number of unique visit_ids:', df_peptides.shape, df_peptides.visit_id.nunique())
        df_test = df_test[['patient_id','visit_id','visit_month']]
        df_test = df_test.drop_duplicates()
        
        df_peptides['Peptide'] = df_peptides['Peptide'].apply(remove_unimod)
        df_peptides['W_count'] = vect_func(df_peptides['Peptide'], 'W')
        df_peptides['Y_count'] = vect_func(df_peptides['Peptide'], 'Y')
        df_peptides['F_count'] = vect_func(df_peptides['Peptide'], 'F')
        df_peptides['L_count'] = vect_func(df_peptides['Peptide'], 'L')
        df_peptides['A_count'] = vect_func(df_peptides['Peptide'], 'A')
    
        #pro_pep_join.head(10)

        df_peptides_agg = df_peptides.groupby(['patient_id','visit_id','visit_month']).agg(W=('W_count', 'sum'),Y=('Y_count', 'sum'),F=('F_count', 'sum')\
                                                                                          ,A=('A_count', 'sum'),L=('L_count', 'sum'))\
        .reset_index(level=['patient_id','visit_id','visit_month'])
        
        df_peptides = pd.merge(df_test, df_peptides_agg, on=['patient_id','visit_id','visit_month'], suffixes=['_left','_right'], how='left')
        df_peptides = df_peptides[['patient_id','visit_id','visit_month','W','Y','F','A','L']]
        df_peptides['W'].fillna(0, inplace=True)
        df_peptides['Y'].fillna(0, inplace=True)
        df_peptides['F'].fillna(0, inplace=True)
        df_peptides['A'].fillna(0, inplace=True)
        df_peptides['L'].fillna(0, inplace=True)
        df_peptides = df_peptides.drop_duplicates()
        
        df_peptides_sorted = df_peptides.sort_values(by=['patient_id','visit_month'],  ascending=True)
        df_peptides_sorted['visit_month_lag'] = df_peptides_sorted.groupby(['patient_id'])['visit_month'].shift(1)
        df_peptides_sorted['visit_month_lag'].fillna(-1, inplace=True)
        
        df_peptides_sorted['visit_month_lead'] = df_peptides_sorted.groupby(['patient_id'])['visit_month'].shift(-1)
        df_peptides_sorted['visit_month_lead'].fillna(-1, inplace=True)

        """
        full_test_data = pd.DataFrame()
        for i in [0,6,12,24]:
            temp = df_peptides[df_peptides['visit_month']==i]
            full_test_data = full_test_data.append(temp)
        
        del df_proteins, df_peptides
        """
        full_test_data = df_peptides_sorted
        del df_peptides_sorted
        print('full_test_data shape and number of unique visit_ids:', full_test_data.shape, full_test_data.visit_id.nunique())
        
        full_test_data = full_test_data.drop_duplicates(keep='first')
        full_test_data = full_test_data[['visit_id','visit_month', 'visit_month_lag','visit_month_lead','W','Y','F','A','L']]

        full_test_data = full_test_data.reset_index()
        full_test_data = full_test_data.drop(columns=['index'])
        
        return full_test_data
            
    elif (df_peptides.shape[0] == 0) & (df_proteins.shape[0] != 0):
        
        print('no peptides only proteins')
        print('df_test shape and number of unique visit_ids:', df_test.shape, df_test.visit_id.nunique())
        print('df_proteins shape and number of unique visit_ids:', df_proteins.shape, df_proteins.visit_id.nunique())
        df_test = df_test[['patient_id','visit_id','visit_month']]
        df_test = df_test.drop_duplicates()
        
        df_proteins = pd.merge(df_test, df_proteins, on=['patient_id','visit_id','visit_month'], suffixes=['_left','_right'], how='left')
        df_proteins = df_proteins[['patient_id','visit_id','visit_month']]
        df_proteins = df_proteins.drop_duplicates()
        
        df_proteins_sorted = df_proteins.sort_values(by=['patient_id','visit_month'],  ascending=True)
        df_proteins_sorted['visit_month_lag'] = df_proteins_sorted.groupby(['patient_id'])['visit_month'].shift(1)
        df_proteins_sorted['visit_month_lag'].fillna(-1, inplace=True)

        """
        full_test_data = pd.DataFrame()
        for i in [0,6,12,24]:
            temp = df_proteins[df_proteins['visit_month']==i]
            full_test_data = full_test_data.append(temp)
        """
        full_test_data = df_proteins_sorted
        print('full_test_data shape and number of unique visit_ids:', full_test_data.shape, full_test_data.visit_id.nunique())
        
        full_test_data = full_test_data.drop_duplicates(keep='first')
        full_test_data = full_test_data[['visit_id','visit_month', 'visit_month_lag']]

        full_test_data = full_test_data.reset_index()
        full_test_data = full_test_data.drop(columns=['index'])
        
        return full_test_data
        
    else:
        
        print('both proteins and peptides are present')
        
        df_test = df_test[['patient_id','visit_id','visit_month']]
        df_test = df_test.drop_duplicates()
        
        pro_pep_join = pd.merge(df_proteins, df_peptides, on=['patient_id','visit_id','visit_month','UniProt'], suffixes=['_left','_right'], how='outer')
        pro_pep_join = pro_pep_join[['patient_id','visit_id','visit_month']]
        pro_pep_join = pro_pep_join.drop_duplicates()
        #del df_proteins, df_peptides
        print('df_proteins shape and number of unique visit_ids:', df_proteins.shape, df_proteins.visit_id.nunique())
        print('df_peptides shape and number of unique visit_ids:', df_peptides.shape, df_peptides.visit_id.nunique())
        print('pro_pep_join shape and number of unique visit_ids:', pro_pep_join.shape, pro_pep_join.visit_id.nunique())
        
        pro_pep_join_test = pd.merge(df_test, pro_pep_join, on=['patient_id','visit_id','visit_month'], suffixes=['_left','_right'], how='left')
        
        pro_pep_join_test_sorted = pro_pep_join_test.sort_values(by=['patient_id','visit_month'],  ascending=True)
        pro_pep_join_test_sorted['visit_month_lag'] = pro_pep_join_test_sorted.groupby(['patient_id'])['visit_month'].shift(1)
        pro_pep_join_test_sorted['visit_month_lag'].fillna(-1, inplace=True)

        """
        full_test_data = pd.DataFrame()

        for i in [0,6,12,24]:
            temp = pro_pep_join_test[pro_pep_join_test['visit_month']==i]
            full_test_data = full_test_data.append(temp)
        """
        full_test_data = pro_pep_join_test_sorted 
        print('full_test_data shape and number of unique visit_ids:', full_test_data.shape, full_test_data.visit_id.nunique())

        full_test_data = full_test_data.drop_duplicates()
        full_test_data = full_test_data[['visit_id','visit_month', 'visit_month_lag']]

        full_test_data = full_test_data.reset_index()
        full_test_data = full_test_data.drop(columns=['index'])
        
        return full_test_data

In [21]:
def get_predictions_v4(model_dict, df_test, df_proteins=pd.DataFrame(), df_peptides=pd.DataFrame()):
    
    list_of_updrs_tests = ['updrs_1','updrs_2','updrs_3','updrs_4']
    result = pd.DataFrame()
    
    full_test_data = get_full_test_data(df_test=df_test, df_proteins=df_proteins, df_peptides=df_peptides)
    
    if full_test_data.shape[0] != 0:
        """df_prepared = sklearn_pipeline.transform(full_test_data)
        print(df_prepared.shape)"""

        ##for u in list_of_updrs_tests:
        
        X = full_test_data[['visit_month']]
        for m in [0, 6, 12, 24]:
            full_test_data['predicted_updrs_1_plus_'+str(m)+'_months'] = np.ceil(model_dict['updrs_1'].predict(X))
        
        #X_poly = poly.fit_transform(X)
        for m in [0, 6, 12, 24]:
            X = full_test_data[['visit_month','predicted_updrs_1_plus_'+str(m)+'_months']]
            full_test_data['predicted_updrs_2_plus_'+str(m)+'_months'] = np.ceil(model_dict['updrs_2'].predict(X))
        #full_test_data['predicted_updrs_2'] = np.ceil(model_dict['updrs_2'].predict(X_poly))
        
        #X_poly = poly.fit_transform(X)
        for m in [0, 6, 12, 24]:
            X = full_test_data[['predicted_updrs_1_plus_'+str(m)+'_months','predicted_updrs_2_plus_'+str(m)+'_months']]
            full_test_data['predicted_updrs_3_plus_'+str(m)+'_months'] = np.ceil(model_dict['updrs_3'].predict(X))
        #full_test_data['predicted_updrs_3'] = np.ceil(model_dict['updrs_3'].predict(X_poly))
        
        #X_poly = poly.fit_transform(X)
        for m in [0, 6, 12, 24]:
            X = full_test_data[['visit_month','predicted_updrs_1_plus_'+str(m)+'_months',\
                                'predicted_updrs_2_plus_'+str(m)+'_months','predicted_updrs_3_plus_'+str(m)+'_months']]
            full_test_data['predicted_updrs_4_plus_'+str(m)+'_months'] = np.ceil(model_dict['updrs_4'].predict(X))
        #full_test_data['predicted_updrs_4'] = np.ceil(model_dict['updrs_4'].predict(X_poly))
        
        """poly = PolynomialFeatures(degree = 5)
        X_poly = poly.fit_transform(X)
        full_test_data['predicted_updrs_1'] = np.ceil(model_dict['updrs_1'].predict(X_poly))"""
        
        ## for all other models
        ##full_test_data['result_' + str(u)] = np.ceil(model_dict[u].predict(X))
        
        for m in [0, 6, 12, 24]:
            for u in [1, 2, 3, 4]:
                temp = full_test_data[['visit_id', 'visit_month', 'predicted_updrs_' + str(u) + '_plus_' + str(m) + '_months']]
                temp['prediction_id'] = temp['visit_id'] + '_updrs_' + str(u) + '_plus_' + str(m) + '_months'
                
                #if (u == 2) | (u == 4):
                if (u == 4):
                    temp["rating"] = 0
                else:
                    temp["rating"] = temp["predicted_updrs_" + str(u) + '_plus_' + str(m) + '_months'].apply(lambda x: 0 if x < 0 else x)
                
                temp = temp[['prediction_id', 'rating']]

                result = result.append(temp)
                
        result = result.drop_duplicates(subset=['prediction_id', 'rating'])
        result = result.reset_index()
        result.drop(columns=['index'], inplace=True)
    
    return result

In [75]:
def get_predictions_v3(model_dict, df_test, df_proteins=pd.DataFrame(), df_peptides=pd.DataFrame()):
    
    list_of_updrs_tests = ['updrs_1','updrs_2','updrs_3','updrs_4']
    result = pd.DataFrame()
    
    full_test_data = get_full_test_data(df_test=df_test, df_proteins=df_proteins, df_peptides=df_peptides)
    
    if full_test_data.shape[0] != 0:
        """df_prepared = sklearn_pipeline.transform(full_test_data)
        print(df_prepared.shape)"""

        ##for u in list_of_updrs_tests:
        
        X = full_test_data[['visit_month','visit_month_lag']]
        full_test_data['predicted_updrs_1'] = np.ceil(model_dict['updrs_1'].predict(X))
        
        X = full_test_data[['visit_month','visit_month_lag','predicted_updrs_1']]
        #X_poly = poly.fit_transform(X)
        full_test_data['predicted_updrs_2'] = np.ceil(model_dict['updrs_2'].predict(X))
        #full_test_data['predicted_updrs_2'] = np.ceil(model_dict['updrs_2'].predict(X_poly))
        
        X = full_test_data[['predicted_updrs_1','predicted_updrs_2','F']]
        #X_poly = poly.fit_transform(X)
        full_test_data['predicted_updrs_3'] = np.ceil(model_dict['updrs_3'].predict(X))
        #full_test_data['predicted_updrs_3'] = np.ceil(model_dict['updrs_3'].predict(X_poly))
        
        X = full_test_data[['visit_month','visit_month_lag','visit_month_lead','predicted_updrs_1','predicted_updrs_2','predicted_updrs_3']]
        #X_poly = poly.fit_transform(X)
        full_test_data['predicted_updrs_4'] = np.ceil(model_dict['updrs_4'].predict(X))
        #full_test_data['predicted_updrs_4'] = np.ceil(model_dict['updrs_4'].predict(X_poly))
        
        """poly = PolynomialFeatures(degree = 5)
        X_poly = poly.fit_transform(X)
        full_test_data['predicted_updrs_1'] = np.ceil(model_dict['updrs_1'].predict(X_poly))"""
        
        ## for all other models
        ##full_test_data['result_' + str(u)] = np.ceil(model_dict[u].predict(X))
        
        for m in [0, 6, 12, 24]:
            for u in [1, 2, 3, 4]:
                temp = full_test_data[['visit_id', 'visit_month', 'predicted_updrs_' + str(u)]]
                temp['prediction_id'] = temp['visit_id'] + '_updrs_' + str(u) + '_plus_' + str(m) + '_months'
                
                #if (u == 2) | (u == 4):
                if (u == 4):
                    temp["rating"] = 0
                else:
                    temp["rating"] = temp["predicted_updrs_" + str(u)].apply(lambda x: 0 if x < 0 else x)
                
                temp = temp[['prediction_id', 'rating']]

                result = result.append(temp)
                
        result = result.drop_duplicates(subset=['prediction_id', 'rating'])
        result = result.reset_index()
        result.drop(columns=['index'], inplace=True)
    
    return result

In [22]:
test = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test.csv')

test_proteins = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_proteins.csv')

test_peptides = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_peptides.csv')

#result = get_predictions(model_dict=random_forest_model_dict, df_test=test, df_proteins=test_proteins, df_peptides=test_peptides)
#result = get_predictions_v2(model_dict=ridge_model_dict, df_test=test)
#result = get_predictions(model_dict=random_forest_model_dict, df_test=test, df_proteins=test_proteins)
result = get_predictions_v4(model_dict=lasso_model_dict, df_test=test, df_peptides=test_peptides)

#result = get_predictions(model_dict=model_dict, sklearn_pipeline=full_pipeline)

result

no proteins only peptides
df_test shape and number of unique visit_ids: (16, 6) 4
df_peptides shape and number of unique visit_ids: (2057, 7) 2
full_test_data shape and number of unique visit_ids: (4, 10) 4


Feature names unseen at fit time:
- predicted_updrs_1_plus_0_months
Feature names seen at fit time, yet now missing:
- predicted_updrs_1

Feature names unseen at fit time:
- predicted_updrs_1_plus_6_months
Feature names seen at fit time, yet now missing:
- predicted_updrs_1

Feature names unseen at fit time:
- predicted_updrs_1_plus_12_months
Feature names seen at fit time, yet now missing:
- predicted_updrs_1

Feature names unseen at fit time:
- predicted_updrs_1_plus_24_months
Feature names seen at fit time, yet now missing:
- predicted_updrs_1

Feature names unseen at fit time:
- predicted_updrs_1_plus_0_months
- predicted_updrs_2_plus_0_months
Feature names seen at fit time, yet now missing:
- predicted_updrs_1
- predicted_updrs_2

Feature names unseen at fit time:
- predicted_updrs_1_plus_6_months
- predicted_updrs_2_plus_6_months
Feature names seen at fit time, yet now missing:
- predicted_updrs_1
- predicted_updrs_2

Feature names unseen at fit time:
- predicted_updrs_1_plus_12_

Unnamed: 0,prediction_id,rating
0,3342_0_updrs_1_plus_0_months,6.0
1,3342_6_updrs_1_plus_0_months,6.0
2,50423_0_updrs_1_plus_0_months,6.0
3,50423_6_updrs_1_plus_0_months,6.0
4,3342_0_updrs_2_plus_0_months,7.0
...,...,...
59,50423_6_updrs_3_plus_24_months,24.0
60,3342_0_updrs_4_plus_24_months,0.0
61,3342_6_updrs_4_plus_24_months,0.0
62,50423_0_updrs_4_plus_24_months,0.0


In [23]:
import amp_pd_peptide

env = amp_pd_peptide.make_env()   # initialize the environment for one run only

"""amp_pd_peptide.make_env.__called__ = False
type(env)._state = type(type(env)._state).__dict__['INIT']"""

iter_test = env.iter_test()

for (test, test_peptides, test_proteins, submission) in iter_test:
    #submission = get_predictions_v2(model_dict=ridge_model_dict, df_test=test)
    submission = get_predictions_v4(model_dict=lasso_model_dict, df_test=test, df_peptides=test_peptides)
    #submission = get_predictions(model_dict=decision_model_dict, df_test=test, df_proteins=test_proteins, df_peptides=test_peptides)
    env.predict(submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
no proteins only peptides
df_test shape and number of unique visit_ids: (8, 5) 2
df_peptides shape and number of unique visit_ids: (1021, 6) 1
full_test_data shape and number of unique visit_ids: (2, 10) 2
no proteins only peptides
df_test shape and number of unique visit_ids: (8, 5) 2
df_peptides shape and number of unique visit_ids: (1036, 6) 1
full_test_data shape and number of unique visit_ids: (2, 10) 2


Feature names unseen at fit time:
- predicted_updrs_1_plus_0_months
Feature names seen at fit time, yet now missing:
- predicted_updrs_1

Feature names unseen at fit time:
- predicted_updrs_1_plus_6_months
Feature names seen at fit time, yet now missing:
- predicted_updrs_1

Feature names unseen at fit time:
- predicted_updrs_1_plus_12_months
Feature names seen at fit time, yet now missing:
- predicted_updrs_1

Feature names unseen at fit time:
- predicted_updrs_1_plus_24_months
Feature names seen at fit time, yet now missing:
- predicted_updrs_1

Feature names unseen at fit time:
- predicted_updrs_1_plus_0_months
- predicted_updrs_2_plus_0_months
Feature names seen at fit time, yet now missing:
- predicted_updrs_1
- predicted_updrs_2

Feature names unseen at fit time:
- predicted_updrs_1_plus_6_months
- predicted_updrs_2_plus_6_months
Feature names seen at fit time, yet now missing:
- predicted_updrs_1
- predicted_updrs_2

Feature names unseen at fit time:
- predicted_updrs_1_plus_12_

In [None]:
"""submission = pd.read_csv('/kaggle/working/submission.csv')
submission.shape"""