In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
from pandas.plotting import scatter_matrix
%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/public_timeseries_testing_util.py
/kaggle/input/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/train_peptides.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/amp_pd_peptide/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/amp-parkinsons-disease-progression-prediction/amp_pd_peptide/__init__.py
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/sample_submission.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_proteins.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_peptides.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test

* #### Get the relevant training datasets and combine them to form the full training set with Proteins and Peptides
* #### Remove the NaNs from the UPDRS test scores, using the mean from within the same group of UPDRS test scores, e.g. NaNs in updrs_4 are filled with mean of updrs_4
* #### Create new mean and median based features and get rid of Proteins Expressions (NPX) and Peptide Abundance

In [2]:
train_proteins_df = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv')

train_peptides_df = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_peptides.csv')

pro_pep_join = pd.merge(train_proteins_df, train_peptides_df, on=['patient_id','visit_id','visit_month','UniProt'], how='inner')

del train_proteins_df, train_peptides_df
gc.collect()

train_clinical_data_df = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv')

merged_data = pd.merge(pro_pep_join, train_clinical_data_df, on=['patient_id','visit_id','visit_month'], how='inner')

del pro_pep_join, train_clinical_data_df
gc.collect()

full_training_data = pd.DataFrame()

for i in [0,6,12,24]:
    temp = merged_data[merged_data['visit_month']==i]
    full_training_data = full_training_data.append(temp)
    
full_training_data = full_training_data.drop(columns=['UniProt','Peptide','upd23b_clinical_state_on_medication','patient_id'])

list_of_updrs_tests = ['updrs_1','updrs_2','updrs_3','updrs_4']

for i in range(len(list_of_updrs_tests)):
    full_training_data[list_of_updrs_tests[i]].fillna(full_training_data[list_of_updrs_tests[i]].mean().round(decimals = 0), inplace=True)

new_features = full_training_data.groupby(['visit_id','visit_month']).agg(npx_mean=('NPX', 'mean'), npx_median=('NPX', 'median')
                                                                          , peptide_mean=('PeptideAbundance', 'mean'), peptide_median=('PeptideAbundance', 'median')
                                                          ).reset_index(level=['visit_id','visit_month'])

full_training_data = pd.merge(full_training_data, new_features, on=['visit_id','visit_month'], how='inner')
full_training_data = full_training_data.drop(columns=['NPX','PeptideAbundance'])
full_training_data = full_training_data.drop_duplicates(keep='first')
full_training_data = full_training_data[['visit_id','visit_month','npx_mean','npx_median','peptide_mean','peptide_median','updrs_1','updrs_2','updrs_3','updrs_4']]

full_training_data = full_training_data.reset_index()
full_training_data.head(10)

Unnamed: 0,index,visit_id,visit_month,npx_mean,npx_median,peptide_mean,peptide_median,updrs_1,updrs_2,updrs_3,updrs_4
0,0,55_0,0,20287890.0,1221530.0,748153.907014,93134.8,10.0,6.0,15.0,2.0
1,931,1517_0,0,15795760.0,897182.0,618823.15049,65253.6,11.0,6.0,25.0,5.0
2,1833,1923_0,0,18898000.0,1299230.0,763459.201459,92280.9,2.0,0.0,0.0,2.0
3,2710,2660_0,0,12989730.0,1078570.0,532250.381374,73461.05,2.0,0.0,0.0,2.0
4,3620,3636_0,0,13674220.0,776581.0,501743.93125,54660.2,1.0,2.0,9.0,2.0
5,4495,3863_0,0,22791340.0,885864.0,806928.584862,78719.9,8.0,13.0,36.0,4.0
6,5291,4161_0,0,16477340.0,1067620.0,699803.391029,77037.8,6.0,1.0,0.0,2.0
7,6191,4172_0,0,17294720.0,874171.0,620558.476752,68208.05,2.0,0.0,0.0,2.0
8,7115,5027_0,0,22816320.0,1134880.0,764893.568584,71828.35,1.0,0.0,0.0,2.0
9,8025,5178_0,0,18153100.0,998682.0,689200.799757,79225.5,9.0,1.0,3.0,2.0


#### Check for NaNs

In [3]:
full_training_data.isna().sum()

index             0
visit_id          0
visit_month       0
npx_mean          0
npx_median        0
peptide_mean      0
peptide_median    0
updrs_1           0
updrs_2           0
updrs_3           0
updrs_4           0
dtype: int64

#### We will attempt to train one model each for each of the UPDRS tests. To do that, we will also proceed to create a train-test split on the Training dataset specific to each of the UPDRS test, to test out the accuracy of the developed models
#### We will store all the models in a dictionary
#### The following cells (except the declaration of the model dict) will be repeated for all UPDRS tests

In [4]:
"""model_dict = {}

for i in [1,2,3,4]:
    if (i == 1) | (i == 2):
        full_training_data['updrs_'+str(i)+'_category'] = pd.cut(full_training_data['updrs_'+str(i)],bins=[-np.inf, 15., np.inf],labels=[1,2])
    elif i == 3:
        full_training_data['updrs_'+str(i)+'_category'] = pd.cut(full_training_data['updrs_'+str(i)],bins=[-np.inf, 20., 40., np.inf],labels=[1,2,3])
    else:
        full_training_data['updrs_'+str(i)+'_category'] = pd.cut(full_training_data['updrs_'+str(i)],bins=[-np.inf, 7.5, np.inf],labels=[1,2])
        
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    for train_index, test_index in split.split(full_training_data, full_training_data['updrs_'+str(i)+'_category']):
        strat_train_set = full_training_data.loc[train_index]
        strat_test_set = full_training_data.loc[test_index]
        
    if i == 1:
        strat_train_set = strat_train_set.drop(columns=['updrs_1_category'])
        strat_test_set = strat_test_set.drop(columns=['updrs_1_category'])
    elif i == 2:
        strat_train_set = strat_train_set.drop(columns=['updrs_1_category','updrs_2_category'])
        strat_test_set = strat_test_set.drop(columns=['updrs_1_category','updrs_2_category'])
    elif i == 3:
        strat_train_set = strat_train_set.drop(columns=['updrs_1_category','updrs_2_category','updrs_3_category'])
        strat_test_set = strat_test_set.drop(columns=['updrs_1_category','updrs_2_category','updrs_3_category'])
    else:
        strat_train_set = strat_train_set.drop(columns=['updrs_1_category','updrs_2_category','updrs_3_category','updrs_4_category'])
        strat_test_set = strat_test_set.drop(columns=['updrs_1_category','updrs_2_category','updrs_3_category','updrs_4_category'])
    
    strat_train_set_wo_labels = strat_train_set.drop(columns=['updrs_1','updrs_2','updrs_3','updrs_4','index'])
    strat_train_set_labels = strat_train_set['updrs_'+str(i)].copy()
    strat_train_set_numeric = strat_train_set_wo_labels.drop(columns=['visit_id','visit_month'])
    strat_train_set_cat = strat_train_set_wo_labels[['visit_month']]

    strat_test_set_wo_labels = strat_test_set.drop(columns=['updrs_1','updrs_2','updrs_3','updrs_4','index'])
    strat_test_set_labels = strat_test_set['updrs_'+str(i)].copy()
    
    numeric_pipeline = Pipeline([
        ('standard_scaler', StandardScaler())
    ])

    numeric_attributes = list(strat_train_set_numeric)
    categorical_attributes = list(strat_train_set_cat)

    full_pipeline = ColumnTransformer([
        ('num', numeric_pipeline, numeric_attributes),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_attributes)
    ])

    strat_train_set_prepared = full_pipeline.fit_transform(strat_train_set_wo_labels)
    print(strat_train_set_prepared.shape)
    strat_test_set_prepared = full_pipeline.transform(strat_test_set_wo_labels)
    print(strat_test_set_prepared.shape)
    
    lin_reg = LinearRegression()

    model_dict['updrs_'+str(i)] = lin_reg.fit(strat_train_set_prepared, strat_train_set_labels)
    print(model_dict)"""

"model_dict = {}\n\nfor i in [1,2,3,4]:\n    if (i == 1) | (i == 2):\n        full_training_data['updrs_'+str(i)+'_category'] = pd.cut(full_training_data['updrs_'+str(i)],bins=[-np.inf, 15., np.inf],labels=[1,2])\n    elif i == 3:\n        full_training_data['updrs_'+str(i)+'_category'] = pd.cut(full_training_data['updrs_'+str(i)],bins=[-np.inf, 20., 40., np.inf],labels=[1,2,3])\n    else:\n        full_training_data['updrs_'+str(i)+'_category'] = pd.cut(full_training_data['updrs_'+str(i)],bins=[-np.inf, 7.5, np.inf],labels=[1,2])\n        \n    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)\n    for train_index, test_index in split.split(full_training_data, full_training_data['updrs_'+str(i)+'_category']):\n        strat_train_set = full_training_data.loc[train_index]\n        strat_test_set = full_training_data.loc[test_index]\n        \n    if i == 1:\n        strat_train_set = strat_train_set.drop(columns=['updrs_1_category'])\n        strat_test_set = s

In [5]:
model_dict = {}

for u in list_of_updrs_tests:
        
    # Drop NAs
    temp = full_training_data.dropna(subset=[u]) 
    print(u)
    # Train data
    X = temp[['visit_month','npx_mean','npx_median','peptide_mean','peptide_median']]
    print(X.shape)
    y = temp[u]
    print(y.shape)
    trained = LinearRegression().fit(X, y)
    
    # Save model
    model_dict[u] = trained

updrs_1
(632, 5)
(632,)
updrs_2
(632, 5)
(632,)
updrs_3
(632, 5)
(632,)
updrs_4
(632, 5)
(632,)


#### The commands below are for debugging only, these were used to determine the bins corresponding to the train-test split specific to UPDRS tests

In [6]:
#full_training_data_melted[['updrs_test_score','PeptideAbundance','NPX','group_key']].corr()
#full_training_data[['updrs_1','updrs_2','updrs_3','updrs_4']].hist(figsize=(20,15))

In [7]:
"""full_training_data['updrs_2_category'] = pd.cut(full_training_data['updrs_2'],
                                                          bins=[-np.inf, 15., np.inf],
                                                          labels=[1,2])

full_training_data['updrs_3_category'] = pd.cut(full_training_data['updrs_3'],
                                                          bins=[-np.inf, 20., 40., np.inf],
                                                          labels=[1,2,3])

full_training_data['updrs_4_category'] = pd.cut(full_training_data['updrs_4'],
                                                          bins=[-np.inf, 7.5, np.inf],
                                                          labels=[1,2])

full_training_data['updrs_4_category'].hist()

full_training_data['updrs_4_category'].unique()
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(full_training_data, full_training_data['updrs_4_category']):
    strat_train_set = full_training_data.loc[train_index]
    strat_test_set = full_training_data.loc[test_index]
    
print(strat_test_set['updrs_4_category'].value_counts()/len(strat_test_set))
print(strat_train_set['updrs_4_category'].value_counts()/len(strat_train_set))

"""

"full_training_data['updrs_2_category'] = pd.cut(full_training_data['updrs_2'],\n                                                          bins=[-np.inf, 15., np.inf],\n                                                          labels=[1,2])\n\nfull_training_data['updrs_3_category'] = pd.cut(full_training_data['updrs_3'],\n                                                          bins=[-np.inf, 20., 40., np.inf],\n                                                          labels=[1,2,3])\n\nfull_training_data['updrs_4_category'] = pd.cut(full_training_data['updrs_4'],\n                                                          bins=[-np.inf, 7.5, np.inf],\n                                                          labels=[1,2])\n\nfull_training_data['updrs_4_category'].hist()\n\nfull_training_data['updrs_4_category'].unique()\nsplit = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)\nfor train_index, test_index in split.split(full_training_data, full_training_data['updrs_

### Testing the RMSE
#### RMSE <= 0.75 : Very good accuracy | 0.75 < RMSE <= 1.0 : Good accuracy

#### First on train data (train split from the full training dataset)
#### 2nd on test data (test split from the full training dataset)

In [8]:
"""train_df_predictions = lin_reg_model_updrs_4.predict(strat_train_set_prepared)
mse = mean_squared_error(strat_train_set_labels, train_df_predictions)
rmse = np.sqrt(mse)
print(rmse)

test_df_predictions = lin_reg_model_updrs_4.predict(strat_test_set_prepared)
mse = mean_squared_error(strat_test_set_labels, test_df_predictions)
rmse = np.sqrt(mse)
print(rmse)

model_dict['updrs_4'] = lin_reg_model_updrs_4
model_dict"""

"train_df_predictions = lin_reg_model_updrs_4.predict(strat_train_set_prepared)\nmse = mean_squared_error(strat_train_set_labels, train_df_predictions)\nrmse = np.sqrt(mse)\nprint(rmse)\n\ntest_df_predictions = lin_reg_model_updrs_4.predict(strat_test_set_prepared)\nmse = mean_squared_error(strat_test_set_labels, test_df_predictions)\nrmse = np.sqrt(mse)\nprint(rmse)\n\nmodel_dict['updrs_4'] = lin_reg_model_updrs_4\nmodel_dict"

#### The below cell is only for debugging the function get_predictions()

In [9]:
def get_full_test_data(df_test, df_proteins=pd.DataFrame(), df_peptides=pd.DataFrame()):
    
    if  (df_proteins.shape[0] == 0) & (df_peptides.shape[0] == 0):
        
        print('only the test dataframe has data, proteins and peptides info absent')
        full_test_data = df_test[['visit_id','visit_month']]
        full_test_data['npx_mean'] = 0
        full_test_data['npx_median'] = 0
        full_test_data['peptide_mean'] = 0
        full_test_data['peptide_median'] = 0
        
        full_test_data = full_test_data.drop_duplicates(keep='first')
        full_test_data = full_test_data[['visit_id','visit_month','npx_mean','npx_median','peptide_mean','peptide_median']]

        full_test_data = full_test_data.reset_index()
        full_test_data = full_test_data.drop(columns=['index'])
        return full_test_data
    
    elif (df_proteins.shape[0] == 0) & (df_peptides.shape[0] != 0):
        
        print('no proteins only peptides')
        full_test_data = pd.DataFrame()
        
        df_peptides = pd.merge(df_peptides, df_test, on=['patient_id','visit_id','visit_month'], suffixes=['_left','_right'], how='outer')
        df_peptides['PeptideAbundance'] = np.where(df_peptides['PeptideAbundance'].isna(), 0, df_peptides['PeptideAbundance'])

        for i in [0,6,12,24]:
            temp = df_peptides[df_peptides['visit_month']==i]
            full_test_data = full_test_data.append(temp)
        
        del df_proteins, df_peptides
            
        new_features = full_test_data.groupby(['visit_id','visit_month']).agg(peptide_mean=('PeptideAbundance', 'mean'), peptide_median=('PeptideAbundance', 'median')
                                                              ).reset_index(level=['visit_id','visit_month'])

        full_test_data = pd.merge(full_test_data, new_features, on=['visit_id','visit_month'], how='inner')
        full_test_data['npx_mean'] = 0
        full_test_data['npx_median'] = 0
        full_test_data = full_test_data.drop_duplicates(keep='first')
        full_test_data = full_test_data[['visit_id','visit_month','npx_mean','npx_median','peptide_mean','peptide_median']]

        full_test_data = full_test_data.reset_index()
        full_test_data = full_test_data.drop(columns=['index'])
        
        return full_test_data
            
    elif (df_peptides.shape[0] == 0) & (df_proteins.shape[0] != 0):
        
        print('no peptides only proteins')
        full_test_data = pd.DataFrame()
        
        df_proteins = pd.merge(df_proteins, df_test, on=['patient_id','visit_id','visit_month'], suffixes=['_left','_right'], how='outer')
        df_proteins['NPX'] = np.where(df_proteins['NPX'].isna(), 0, df_proteins['NPX'])

        for i in [0,6,12,24]:
            temp = df_proteins[df_proteins['visit_month']==i]
            full_test_data = full_test_data.append(temp)
            
        new_features = full_test_data.groupby(['visit_id','visit_month']).agg(npx_mean=('NPX', 'mean'), npx_median=('NPX', 'median')
                                                              ).reset_index(level=['visit_id','visit_month'])

        full_test_data = pd.merge(full_test_data, new_features, on=['visit_id','visit_month'], how='inner')
        full_test_data['peptide_mean'] = 0
        full_test_data['peptide_median'] = 0
        full_test_data = full_test_data.drop_duplicates(keep='first')
        full_test_data = full_test_data[['visit_id','visit_month','npx_mean','npx_median','peptide_mean','peptide_median']]

        full_test_data = full_test_data.reset_index()
        full_test_data = full_test_data.drop(columns=['index'])
        
        return full_test_data
        
    else:
        
        print('both proteins and peptides are present')
        
        pro_pep_join = pd.merge(df_proteins, df_peptides, on=['patient_id','visit_id','visit_month','UniProt'], suffixes=['_left','_right'], how='outer')
        del df_proteins, df_peptides
        
        pro_pep_join_test = pd.merge(pro_pep_join, df_test, on=['patient_id','visit_id','visit_month'], suffixes=['_left','_right'], how='outer')
        
        pro_pep_join_test['NPX'] = np.where(pro_pep_join_test['NPX'].isna(), 0, pro_pep_join_test['NPX'])
        pro_pep_join_test['PeptideAbundance'] = np.where(pro_pep_join_test['PeptideAbundance'].isna(), 0, pro_pep_join_test['PeptideAbundance'])
        
        del pro_pep_join
        gc.collect()

        full_test_data = pd.DataFrame()

        for i in [0,6,12,24]:
            temp = pro_pep_join_test[pro_pep_join_test['visit_month']==i]
            full_test_data = full_test_data.append(temp)

        new_features = full_test_data.groupby(['visit_id','visit_month']).agg(npx_mean=('NPX', 'mean'), npx_median=('NPX', 'median')
                                                                                  , peptide_mean=('PeptideAbundance', 'mean')
                                                                              , peptide_median=('PeptideAbundance', 'median')
                                                                  ).reset_index(level=['visit_id','visit_month'])

        full_test_data = pd.merge(full_test_data, new_features, on=['visit_id','visit_month'], how='inner')
        full_test_data = full_test_data.drop_duplicates(keep='first')
        full_test_data = full_test_data[['visit_id','visit_month','npx_mean','npx_median','peptide_mean','peptide_median']]

        full_test_data = full_test_data.reset_index()
        full_test_data = full_test_data.drop(columns=['index'])
        
        return full_test_data

In [10]:
#def get_predictions(model_dict, sklearn_pipeline, df_test, df_proteins=pd.DataFrame(), df_peptides=pd.DataFrame()):
def get_predictions(model_dict, df_test, df_proteins=pd.DataFrame(), df_peptides=pd.DataFrame()):
    
    list_of_updrs_tests = ['updrs_1','updrs_2','updrs_3','updrs_4']
    result = pd.DataFrame()
    
    full_test_data = get_full_test_data(df_test=df_test, df_proteins=df_proteins, df_peptides=df_peptides)
    
    if full_test_data.shape[0] != 0:
        """df_prepared = sklearn_pipeline.transform(full_test_data)
        print(df_prepared.shape)"""

        for u in list_of_updrs_tests:
            X = full_test_data[['visit_month','npx_mean','npx_median','peptide_mean','peptide_median']]
            #full_test_data['result_' + str(u)] = pd.DataFrame(model_dict[u].predict(df_prepared),columns=['rating']).round(decimals=0)
            full_test_data['result_' + str(u)] = np.ceil(model_dict[u].predict(X))

        for m in [0, 6, 12, 24]:
            for u in [1, 2, 3, 4]:
                temp = full_test_data[['visit_id', 'visit_month', 'result_updrs_' + str(u)]]
                temp['prediction_id'] = temp['visit_id'] + '_updrs_' + str(u) + '_plus_' + str(m) + '_months'
                temp["rating"] = temp["result_updrs_" + str(u)]
                
                #temp = temp[['visit_id', 'visit_month', 'prediction_id', 'rating']]
                temp = temp[['prediction_id', 'rating']]

                result = result.append(temp)

        #result.sort_values(by=['visit_id', 'visit_month'], inplace=True)
        #result = result[['prediction_id', 'rating']]
        #result['prediction_id'] = result['prediction_id'].astype('string') 
        #result['rating'] = result['rating'].astype('int') 
        result = result.drop_duplicates(subset=['prediction_id', 'rating'])
        result = result.reset_index()
        result.drop(columns=['index'], inplace=True)
    
    return result

In [11]:
"""df_test = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test.csv')

test_proteins = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_proteins.csv')
#result = get_predictions(model_dict=model_dict, sklearn_pipeline=full_pipeline, df_test=df_test, df_proteins=test_proteins, df_peptides=pd.DataFrame())

test_peptides = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_peptides.csv')
#result = get_predictions(model_dict=model_dict, sklearn_pipeline=full_pipeline, df_test=df_test, df_proteins=pd.DataFrame(), df_peptides=test_peptides)

result = get_predictions(model_dict=model_dict, sklearn_pipeline=full_pipeline, df_test=df_test, df_proteins=test_proteins, df_peptides=test_peptides)

#result = get_predictions(model_dict=model_dict, sklearn_pipeline=full_pipeline)

result"""

"df_test = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test.csv')\n\ntest_proteins = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_proteins.csv')\n#result = get_predictions(model_dict=model_dict, sklearn_pipeline=full_pipeline, df_test=df_test, df_proteins=test_proteins, df_peptides=pd.DataFrame())\n\ntest_peptides = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_peptides.csv')\n#result = get_predictions(model_dict=model_dict, sklearn_pipeline=full_pipeline, df_test=df_test, df_proteins=pd.DataFrame(), df_peptides=test_peptides)\n\nresult = get_predictions(model_dict=model_dict, sklearn_pipeline=full_pipeline, df_test=df_test, df_proteins=test_proteins, df_peptides=test_peptides)\n\n#result = get_predictions(model_dict=model_dict, sklearn_pipeline=full_pipeline)\n\nresult"

In [12]:
import amp_pd_peptide
env = amp_pd_peptide.make_env()   # initialize the environment
iter_test = env.iter_test()

for (test, test_peptides, test_proteins, submission) in iter_test:
    #print(test.visit_id.unique())
    #print(test_peptides.visit_id.unique())
    #print(test_proteins.visit_id.unique())
    #result = get_predictions(model_dict=model_dict, sklearn_pipeline=full_pipeline, df_test=test, df_proteins=test_proteins, df_peptides=test_peptides)
    #result = get_predictions(model_dict=model_dict, df_test=test, df_proteins=test_proteins, df_peptides=test_peptides)
    result = get_predictions(model_dict=model_dict, df_test=test)
    env.predict(result)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
only the test dataframe has data, proteins and peptides info absent
only the test dataframe has data, proteins and peptides info absent


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-

In [13]:
"""train = pd.read_csv("/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv")
model = {}
target = ["updrs_1", "updrs_2", "updrs_3", "updrs_4"]

for u in target:
        
    # Drop NAs
    temp = train.dropna(subset=[u]) 
    
    # Train data
    X = temp['visit_month']
    y = temp[u]
        
    trained = LinearRegression().fit(X.values.reshape(-1, 1), y)
    
    # Save model
    model[u] = trained"""

'train = pd.read_csv("/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv")\nmodel = {}\ntarget = ["updrs_1", "updrs_2", "updrs_3", "updrs_4"]\n\nfor u in target:\n        \n    # Drop NAs\n    temp = train.dropna(subset=[u]) \n    \n    # Train data\n    X = temp[\'visit_month\']\n    y = temp[u]\n        \n    trained = LinearRegression().fit(X.values.reshape(-1, 1), y)\n    \n    # Save model\n    model[u] = trained'

In [14]:
"""def get_predictions(my_train, pro, model):

    # Forecast
    my_train = my_train.fillna(0)
    
    for u in target:
        
        # Here is where we will save the final results
        my_train['result_' + str(u)] = 0
  
        # Predict    
        X = my_train["visit_month"]
        
        if u == 'updrs_4':
            my_train['result_' + str(u)] = 0
        else:
            my_train['result_' + str(u)] = np.ceil(model[u].predict(X.values.reshape(-1, 1)))

        
    # Format for final submission
    result = pd.DataFrame()

    for m in [0, 6, 12, 24]:
        for u in [1, 2, 3, 4]:

            temp = my_train[["visit_id", "result_updrs_" + str(u)]]
            temp["prediction_id"] = temp["visit_id"] + "_updrs_" + str(u) + "_plus_" + str(m) + "_months"
            temp["rating"] = temp["result_updrs_" + str(u)]
            temp = temp [['prediction_id', 'rating']]

            result = result.append(temp)            
    result = result.drop_duplicates(subset=['prediction_id', 'rating'])

    return result"""

'def get_predictions(my_train, pro, model):\n\n    # Forecast\n    my_train = my_train.fillna(0)\n    \n    for u in target:\n        \n        # Here is where we will save the final results\n        my_train[\'result_\' + str(u)] = 0\n  \n        # Predict    \n        X = my_train["visit_month"]\n        \n        if u == \'updrs_4\':\n            my_train[\'result_\' + str(u)] = 0\n        else:\n            my_train[\'result_\' + str(u)] = np.ceil(model[u].predict(X.values.reshape(-1, 1)))\n\n        \n    # Format for final submission\n    result = pd.DataFrame()\n\n    for m in [0, 6, 12, 24]:\n        for u in [1, 2, 3, 4]:\n\n            temp = my_train[["visit_id", "result_updrs_" + str(u)]]\n            temp["prediction_id"] = temp["visit_id"] + "_updrs_" + str(u) + "_plus_" + str(m) + "_months"\n            temp["rating"] = temp["result_updrs_" + str(u)]\n            temp = temp [[\'prediction_id\', \'rating\']]\n\n            result = result.append(temp)            \n    re

In [15]:
# Run once to check results
#get_predictions(train, None, model)

In [16]:
"""amp_pd_peptide.make_env.__called__ = False
type(env)._state = type(type(env)._state).__dict__['INIT']
iter_test = env.iter_test()"""

"amp_pd_peptide.make_env.__called__ = False\ntype(env)._state = type(type(env)._state).__dict__['INIT']\niter_test = env.iter_test()"

In [17]:
"""for (test, test_peptides, test_proteins, sample_submission) in iter_test:
        
    result = get_predictions(test, test_proteins, model)

    env.predict(result)"""   # register your predictions

'for (test, test_peptides, test_proteins, sample_submission) in iter_test:\n        \n    result = get_predictions(test, test_proteins, model)\n\n    env.predict(result)'

In [18]:
"""submission_copied = pd.read_csv("/kaggle/working/submission.csv")
print(submission_copied.shape)
print(submission_copied.memory_usage(deep=True).sum()/1024/1024, " MBs")"""

'submission_copied = pd.read_csv("/kaggle/working/submission.csv")\nprint(submission_copied.shape)\nprint(submission_copied.memory_usage(deep=True).sum()/1024/1024, " MBs")'

In [19]:
"""result_out = pd.DataFrame()

for (test, test_peptides, test_proteins, submission) in iter_test:
    print(test.visit_id.unique())
    print(test_peptides.visit_id.unique())
    print(test_proteins.visit_id.unique())
    result = get_predictions(model_dict=model_dict, sklearn_pipeline=full_pipeline, df_test=test, df_proteins=test_proteins, df_peptides=test_peptides)
    print(result.shape)
    result_out = result_out.append(result)
    print(result_out.shape)
    env.predict(result)"""

'result_out = pd.DataFrame()\n\nfor (test, test_peptides, test_proteins, submission) in iter_test:\n    print(test.visit_id.unique())\n    print(test_peptides.visit_id.unique())\n    print(test_proteins.visit_id.unique())\n    result = get_predictions(model_dict=model_dict, sklearn_pipeline=full_pipeline, df_test=test, df_proteins=test_proteins, df_peptides=test_peptides)\n    print(result.shape)\n    result_out = result_out.append(result)\n    print(result_out.shape)\n    env.predict(result)'