In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/public_timeseries_testing_util.py
/kaggle/input/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/train_peptides.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/amp_pd_peptide/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/amp-parkinsons-disease-progression-prediction/amp_pd_peptide/__init__.py
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/sample_submission.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_proteins.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_peptides.csv
/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test

In [2]:
# Loading libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import tqdm
import time
import re
from itertools import product
from functools import reduce

from sklearn.model_selection import KFold

from tqdm import tqdm
from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

pd.set_option('display.float_format',lambda x: '%.3f' % x)

In [3]:
# Reading the datasets
protein_data = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv')
peptides_data = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_peptides.csv')
target_data = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv')
sup_target_data = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv')
protein_data.shape, peptides_data.shape, target_data.shape

((232741, 5), (981834, 6), (2615, 8))

In [4]:
# Merging target data and sup_target_data since we will be using only clinical data for modelling
target_data = pd.concat([target_data,sup_target_data],axis = 0).reset_index(drop = True)
target_data = target_data[target_data.visit_month != 5].copy()

target_data.shape, target_data.visit_id.nunique(), target_data.patient_id.nunique(), target_data.visit_month.nunique()

((4720, 8), 4720, 1019, 17)

We have data for 1019 patients with max 18 visits 

In [5]:
# Filling misssing values in upd23b_clinical_state_on_medication by unknown
target_data.upd23b_clinical_state_on_medication.fillna('unknown',inplace = True)

In [6]:
# # Dropping missing values in the target dataset after filling na with 0 for updrs_4
# target_data.updrs_4 = target_data.updrs_4.fillna(0)

# target_data = target_data.dropna()
# target_data.shape

In [7]:
target_data.isna().sum()

visit_id                                  0
patient_id                                0
visit_month                               0
updrs_1                                  96
updrs_2                                  98
updrs_3                                  30
updrs_4                                1863
upd23b_clinical_state_on_medication       0
dtype: int64

In [8]:
# Category columns
id_cols = ['visit_id','patient_id','visit_month']
target_cols = ['updrs_1','updrs_2','updrs_3','updrs_4']
month_list  =  [0,6,12,24,36,48]

**Since medication is not available in test data. Filling medication to unknown in test data to see the impact**

In [9]:
from bayes_opt import BayesianOptimization
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GroupKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.svm import LinearSVR, SVR
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

In [10]:
# Creating features for target data 
def create_target_features(target_data,test=None):
    
    if test is None:
        test1 = target_data[['patient_id','visit_month','upd23b_clinical_state_on_medication']].copy()
        test1['upd23b_clinical_state_on_medication']  = test1['upd23b_clinical_state_on_medication'].fillna('unknown')
    else:
        test1 = test.copy()
        if 'upd23b_clinical_state_on_medication' not in test1.columns.tolist():
            test1['upd23b_clinical_state_on_medication'] = 'unknown'
        else:
            test1['upd23b_clinical_state_on_medication']  = test1['upd23b_clinical_state_on_medication'].fillna('unknown')
        test1 = test1[['patient_id','visit_month','upd23b_clinical_state_on_medication']].drop_duplicates().copy()
            
        patients = test1.patient_id.unique()
        visit_months=test1.visit_month.unique()
        
        test1['visit_month_orig'] = test1['visit_month']

        test_data = pd.DataFrame()
        for patient in patients:
            for month in month_list:
                p_data = test1[test1['patient_id'] == patient].copy()
                p_data['visit_month'] = p_data['visit_month'].values + month
                test_data = pd.concat([test_data,p_data],axis=0)
        test1 = test_data.copy()
        

    # Replacing missing values with median by visit_month & medication
    target_data1 = target_data.copy()
    tmeds = target_data1.groupby(['visit_month','upd23b_clinical_state_on_medication'],group_keys =False)[target_cols].apply(lambda x: x.fillna(x.median())).sort_index()
    target_data1 = target_data1[['patient_id','visit_month','upd23b_clinical_state_on_medication']].join(tmeds)

    # Building grp features
    all_grp_cols = [['visit_month'],['upd23b_clinical_state_on_medication']] + [['visit_month','upd23b_clinical_state_on_medication']]
    
    target_data2 = target_data1[['visit_month','upd23b_clinical_state_on_medication']].drop_duplicates()
    for grp_col in all_grp_cols:    
        temp = target_data1.groupby(grp_col)[target_cols].agg(['min','max','mean','median','sum','std'])
        temp.columns = [i+'_' + j + '_'+ '_'.join(grp_col) for i,j in temp.columns]
        target_data2 = target_data2.join(temp,on = grp_col)
        target_data2  = target_data2.join(target_data1.groupby(grp_col)['updrs_1'].count().rename('_'.join(grp_col)+'_count'),
                                          on = grp_col)
    
    # Flags for medication
    test1['med_unknown'] = (test1['upd23b_clinical_state_on_medication'] == 'unknown').astype(int)
    test1['med_off'] = (test1['upd23b_clinical_state_on_medication'] == 'Off').astype(int)
    test1['med_on'] = (test1['upd23b_clinical_state_on_medication'] == 'On').astype(int)
    
    model_data = test1.merge(target_data2,on = ['visit_month','upd23b_clinical_state_on_medication'],how = 'inner').drop(columns = 'upd23b_clinical_state_on_medication')
    feature_cols = model_data.drop(columns = ['patient_id']).columns.tolist()
    
    # Imputing model_data by median value by visit_month
    model_data = model_data.groupby('visit_month',group_keys = False).apply(lambda x: x.fillna(x.median()))

    if test is None:
        model_data = model_data.merge(target_data1[['patient_id','visit_month'] + target_cols],
                                      on = ['patient_id','visit_month'],
                                      how = 'inner')

    return model_data,feature_cols

In [11]:
def smape_plus_1(y_true, y_pred):
    y_true_plus_1 = y_true + 1
    y_pred_plus_1 = y_pred + 1
    metric = np.zeros(len(y_true_plus_1))
    
    numerator = np.abs(y_true_plus_1 - y_pred_plus_1)
    denominator = ((np.abs(y_true_plus_1) + np.abs(y_pred_plus_1)) / 2)
    
    mask_not_zeros = (y_true_plus_1 != 0) | (y_pred_plus_1 != 0)
    metric[mask_not_zeros] = numerator[mask_not_zeros] / denominator[mask_not_zeros]
    
    return 100 * np.nanmean(metric)

In [12]:
# target_data

In [13]:
# Getting the features 
model_data, feature_cols = create_target_features(target_data)
model_data.shape, len(feature_cols)

((4720, 84), 79)

In [14]:
model_data[['patient_id','visit_month']].drop_duplicates().shape, model_data.shape

((4720, 2), (4720, 84))

In [15]:
# feature_cols = ['visit_month','med_unknown','med_off','med_on','visit_month_count', 'upd23b_clinical_state_on_medication_count','visit_month_upd23b_clinical_state_on_medication_count']
feature_cols = ['visit_month','visit_month_count']


In [16]:
# ベイズ最適化によるパラメータチューニング
def train_linear_model2(model_data, feature_cols):
    def lgb_cross_val(learning_rate, num_leaves, max_depth, min_child_samples, reg_alpha, reg_lambda, n_estimators, colsample_bytree, subsample):
        params = {
            'objective': 'mape',
            'learning_rate': learning_rate,
            'max_depth': int(max_depth),
            'num_leaves': int(num_leaves),
            'min_child_samples': int(min_child_samples),
            'reg_alpha': reg_alpha,
            'reg_lambda': reg_lambda,
            'n_estimators': int(n_estimators),
            'colsample_bytree': colsample_bytree,
            'subsample': subsample,
            'random_state': 0
        }

        model = lgb.LGBMRegressor(**params)
        results = cross_validate(model, X, y, groups=groups, cv=gkf, scoring=smape_, return_train_score=True)
        return -results["test_score"].mean()
    
    lgb_params = {
        'objective': 'mape',
        'learning_rate': 0.01,
        'max_depth': 6,
        'num_leaves': 31,
        'min_child_samples': 20,
        'reg_alpha': 1,
        'reg_lambda': 1,
        'n_estimators': 1000,
        'colsample_bytree': 0.7,
        'subsample': 0.8,
        'random_state': 0
    }


    model_data_x = model_data.copy()
    models = {}

    n_folds = 10
    gkf = GroupKFold(n_splits=n_folds)

    score_trn = []
    score_test = []

    for target in target_cols:
        if target.find('updrs_3') >= 0:
            y = model_data[target].replace(0, pd.NA).copy()
            y = model_data[target].dropna().copy()
        else:
            y = model_data[target].dropna().copy()

        y = y.apply(np.log1p).copy()

        X = model_data_x.loc[y.index.tolist(), feature_cols]
        groups = model_data_x.loc[y.index.tolist(), 'patient_id']

        # Bayesian optimization
        optimizer = BayesianOptimization(lgb_cross_val, {
            'learning_rate': (1e-4, 1e-2),
            'num_leaves': (10, 100),
            'max_depth': (2, 15),
            'min_child_samples': (5, 100),
            'reg_alpha': (0, 10),
            'reg_lambda': (0, 100),
            'n_estimators': (500, 3000),
            'colsample_bytree': (0.1, 1),
            'subsample': (0.1, 1)
        },random_state=0)

        
        optimizer.maximize(init_points=20, n_iter=30)


        # Get the best parameters found by Bayesian optimization
        lgb_params.update(optimizer.max["params"])
        lgb_params["max_depth"] = int(lgb_params["max_depth"])
        lgb_params["num_leaves"] = int(lgb_params["num_leaves"])
        lgb_params["min_child_samples"] = int(lgb_params["min_child_samples"])
        lgb_params["n_estimators"] = int(lgb_params["n_estimators"])

        model5 = lgb.LGBMRegressor(**lgb_params)

        results = cross_validate(model5, X, y, groups=groups, cv=gkf, scoring=smape_, return_estimator=True, return_train_score=True)
        results = pd.DataFrame(results)
        score_trn += [results.train_score.abs().mean()]
        score_test += [results.test_score.abs().mean()]
        print(f'\nTrain score for {target} : {results.train_score.abs().mean()}')
        print(f'Test score for {target} : {results.test_score.abs().mean()}')
        models[target] = results
    print('\nTotal avg train score : ', np.mean(score_trn))
    print('Total avg test score : ', np.mean(score_test))

    return models

In [17]:
#  Metric
def smape(y_true, y_pred):
    y_true = np.expm1(y_true) + 1
    y_pred = np.expm1(y_pred) + 1
    
    return 100/len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

In [18]:
from sklearn.preprocessing import MinMaxScaler,  RobustScaler, StandardScaler
from sklearn.model_selection import GroupKFold, GroupShuffleSplit
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression,PoissonRegressor
from sklearn.model_selection import cross_validate,cross_val_predict
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor, StackingRegressor
import lightgbm as lgb

In [19]:
# Scorer for cross validation
smape_ = make_scorer(smape_plus_1,greater_is_better=False)

**Using LinearSVR model with groupshufflesplit with 5 splits**

In [20]:
def train_linear_model(model_data,feature_cols):
    lgb_params = {
        'objective': 'mape',
        'learning_rate': 0.01,
        'max_depth': 6,
        'num_leaves': 31,
        'min_child_samples': 20,
        'reg_alpha': 1,
        'reg_lambda': 1,
        'n_estimators': 1000,
        'colsample_bytree': 0.7,
        'subsample': 0.8,
        'random_state': 0
    }
        
    # Scaling the columns using MinMaxscaler
#     scaler = MinMaxScaler(feature_range = (0,1))
#     scaler.fit(model_data[feature_cols])
#     model_data_x = pd.DataFrame(scaler.transform(model_data[feature_cols]),columns = scaler.feature_names_in_)
    model_data_x = model_data.copy()
    models = {}  
    
    n_folds = 10
    gkf = GroupKFold(n_splits=n_folds)
    
    score_trn = [] ; score_test = []
    
    for target in target_cols:
        
        if target.find('updrs_3')>=0:
            y =  model_data[target].replace(0,pd.NA).copy()
            y =  model_data[target].dropna().copy()
        else:
            y =  model_data[target].dropna().copy() 
            
        y = y.apply(np.log1p).copy()
        
        X  = model_data_x.loc[y.index.tolist(), feature_cols]
        groups = model_data_x.loc[y.index.tolist(),'patient_id']

        model1 = Pipeline([('scaler',RobustScaler()),         
                          ('lsvr',LinearSVR(random_state=0))])
        
        model2 = Pipeline([('scaler',MinMaxScaler()),         
                          ('poisson',PoissonRegressor())])
        
        model3 = Pipeline([('scaler',RobustScaler()),         
                          ('svr_rbf',SVR())])
    
        model4 = Pipeline([('scaler',StandardScaler()),         
                          ('lr',LinearRegression())])
        
        model5 = lgb.LGBMRegressor(**lgb_params)
        
       
        
#         model = StackingRegressor([('lsvr',model1),
#                                    ('poisson',model2),
#                                    ('svr_rbf',model3),
#                                    ('lr',model4),
#                                    ('lgb',model5)],
                                                
#                                   final_estimator= ElasticNet(random_state=42),
#                                  passthrough=True
#                                  )
        
        
        model = Pipeline([('scaler',RobustScaler()),         
                          ('lsvr',LinearSVR(random_state=0))])
        results = cross_validate(model,
                                X,
                                y,
                                groups=groups,
                                cv=gkf,
                                scoring=smape_,
                                return_estimator=True,
                                return_train_score=True)
        results = pd.DataFrame(results)
        score_trn += [results.train_score.abs().mean()]
        score_test += [results.test_score.abs().mean()]
        print(f'\nTrain score for {target} : {results.train_score.abs().mean()}')
        print(f'Test score for {target} : {results.test_score.abs().mean()}')  
        models[target] = results
    print('\nTotal avg train score : ',np.mean(score_trn))
    print('Total avg test score : ',np.mean(score_test))
    
    return models

In [21]:
# Training the models
models = train_linear_model(model_data,feature_cols)


Train score for updrs_1 : 22.963221519361756
Test score for updrs_1 : 22.98432487865924

Train score for updrs_2 : 28.654506955756023
Test score for updrs_2 : 28.68029542513389

Train score for updrs_3 : 23.069811500979064
Test score for updrs_3 : 23.081887399412963

Train score for updrs_4 : 17.26865934131343
Test score for updrs_4 : 17.268659341312553

Total avg train score :  22.989049829352567
Total avg test score :  23.003791761129662


* Total avg train score :  50.49370520571464
* Total avg test score :  50.51135681640096
-----------------------------------------------------
* Total avg train score :  50.4888725371017
* Total avg test score :  50.52275313553273

-----------------------------------------------------
* Total avg train score :  50.48869941211159
* Total avg test score :  50.52903803554226

In [22]:
# Training the models
# models = train_linear_model2(model_data,feature_cols)

In [23]:
# model_data[['visit_month','updrs_1_median_visit_month','updrs_2_median_visit_month','updrs_3_median_visit_month','updrs_4_median_visit_month']].drop_duplicates()

In [24]:
target_meds = model_data[['visit_month','updrs_1_median_visit_month','updrs_2_median_visit_month','updrs_3_median_visit_month','updrs_4_median_visit_month']].drop_duplicates()
# target_meds

In [25]:
# Function to make predictions on test 
def get_predictions(target_data,test,sample_submission,models,target_meds, scaler = None):
    
    # Creating features
    model_data,feature_cols = create_target_features(target_data,test)
    
    feature_cols = ['visit_month','visit_month_count']

    # Making predictions from all the models
    pred_submission = pd.DataFrame(index = range(model_data.shape[0]))
    
    # Scaling the data
#     pred_feats = pd.DataFrame(scaler.transform(model_data.drop(columns = ['patient_id','visit_month_orig'])),columns = scaler.feature_names_in_)
    pred_feats = model_data[feature_cols].copy()
    
    for key,value in models.items():    
        model_fits = models[key]['estimator'].tolist()
        pred_train_temp = pd.DataFrame(np.mean(np.array([np.expm1(mod.predict(pred_feats)) for mod in value['estimator'].tolist()]),axis = 0))
        pred_train_temp.columns = [key]
        pred_submission = pred_submission.join(pred_train_temp)
#     return pred_submission
    pred_submission = pred_submission[models.keys()].copy() 
    model_data['visit_month_temp'] = model_data['visit_month'] - model_data['visit_month_orig']
    pred_submission.index = model_data['patient_id'].astype(str) + '_' +model_data['visit_month_orig'].astype(str) + '_'+model_data['visit_month_temp'].apply(lambda x: 'plus_'+str(x) +'_months')

#     pred_submission['updrs_4'] = 0
#     pred_submission['updrs_3'] += 2
#     pred_submission['updrs_2'] += 1.5 

    # Calculating medians
    ss_visit_months  = sample_submission.prediction_id.apply(lambda x: int(x.split('_')[1])+int(x.split('_')[5])).unique()
    missing_vm = list(set(ss_visit_months).difference(target_meds.visit_month.unique()))
    missing_vm = pd.DataFrame({'visit_month':missing_vm})
    target_meds = target_meds.append(missing_vm,ignore_index=True)
    target_meds = target_meds.fillna(method = 'ffill')    
    target_meds = target_meds.drop_duplicates()
    target_meds['visit_month'] = target_meds['visit_month'].astype(int)
    target_meds = target_meds[target_meds['visit_month']!=5].copy()
    target_meds = target_meds.set_index('visit_month')
    target_meds = target_meds.sort_index()
    target_meds = target_meds.expanding().max()
    target_meds = target_meds.stack().reset_index().rename(columns = {'level_1':'target',0:'rating'})

    target_meds['target'] = target_meds['target'].apply(lambda x: '_'.join(x.split('_')[:2]))
    target_meds['key'] = target_meds[['target','visit_month']].apply(lambda x: str(x[0])+'_'+str(x[1]),axis = 1)
    target_meds = target_meds.drop(columns = ['target','visit_month']).drop_duplicates().copy()
    target_meds = target_meds.set_index('key')
    
#     return pred_submission    
    pred_submission = pred_submission.stack().reset_index().rename(columns = {'level_0':'prediction_id','level_1':'target',0:'rating'})
    pred_submission['prediction_id'] = pred_submission[['prediction_id','target']].apply(lambda x: '_'.join(x[0].split('_')[:2] + [x[1]] + x[0].split('_')[2:]) ,axis= 1)
    pred_submission.drop(columns = ['target'],inplace= True)
#     pred_submission['group_key'] = pred_submission['prediction_id'].apply(lambda x: x.split('_')[1]).astype('str')
    pred_submission.reset_index(drop = True,inplace = True)
    sample_submission = sample_submission.drop(columns = ['rating']).copy()
    sample_submission = sample_submission.merge(pred_submission,on = ['prediction_id'],how = 'left')
    sample_submission['rating1'] = sample_submission.prediction_id.apply(lambda x: '_'.join(x.split('_')[2:4]) + '_'+ str(int(x.split('_')[1])+int(x.split('_')[5]))).map(target_meds.rating)
    
    sample_submission['rating'] = np.where(sample_submission.prediction_id.str.contains('updrs_4|updrs_3'),
                                           sample_submission['rating1'],
                                           sample_submission['rating']*0.55  +  sample_submission['rating1']*0.45
                                          )
    
    sample_submission = sample_submission.drop(columns = 'rating1')
    
#     return sample_submission


    
    # Calculating medians
#     temp  = pred_submission.copy()
#     temp['target'] = temp.prediction_id.apply(lambda x: '_'.join(x.split('_')[2:]))
#     pred_medians = temp.groupby('target')['rating'].median()

    sample_submission['rating'] = sample_submission['rating'].abs().fillna(
        
        sample_submission.prediction_id.apply(lambda x: '_'.join(x.split('_')[2:])).map(target_meds.rating)
    
    ).fillna(0)
    sample_submission['rating'] = sample_submission['rating'].apply(lambda x: np.clip(np.ceil(x),0,None))

    return sample_submission

**Testing the model for sample test data**

In [26]:
# Reading the test files
test_peptides = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_peptides.csv') 
test_proteins = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test_proteins.csv') 
sample_submission = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/sample_submission.csv')
test = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/example_test_files/test.csv')

In [27]:
get_predictions(target_data,test,sample_submission,models,target_meds).head(5)

Unnamed: 0,prediction_id,group_key,rating
0,3342_0_updrs_1_plus_0_months,0,6.0
1,3342_0_updrs_1_plus_6_months,0,5.0
2,3342_0_updrs_1_plus_12_months,0,6.0
3,3342_0_updrs_1_plus_24_months,0,6.0
4,3342_0_updrs_2_plus_0_months,0,4.0


In [28]:
# ss.head(50)

In [29]:
import sys
sys.path.append('/kaggle/input/amp-parkinsons-disease-progression-prediction/')

import amp_pd_peptide
amp_pd_peptide.make_env.func_dict['__called__'] = False
env = amp_pd_peptide.make_env()

iter_test = env.iter_test() 

In [30]:
# Saving test_peptides and test_proteins
test_peps= pd.DataFrame()
test_pros = pd.DataFrame()
test_1 = pd.DataFrame()

In [31]:
for (test, test_peptides, test_proteins, sample_submission) in iter_test:
    
#     test_peps = pd.concat([test_peps,test_peptides],axis = 0)
#     test_pros = pd.concat([test_pros,test_proteins],axis = 0)
    test_1 = pd.concat([test_1,test],axis = 0)
    
    submission = get_predictions(target_data,test_1,sample_submission,models, target_meds)
    submission = submission.drop_duplicates(subset=['prediction_id', 'rating'])
    env.predict(submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


In [32]:
# submission

In [33]:
# Set the total value 
bar = tqdm(total = 100)
# Add description
bar.set_description('Progress rate')
for i in range(100):
    # Set the progress
    bar.update(5)
    time.sleep(1)

  0%|          | 0/100 [00:00<?, ?it/s]