## Predict No Shows for Medical Appointments

In [4]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Common imports for data
import numpy as np
import pandas as pd
import os

# for metrics, preprocessing
import sklearn


# models
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot 
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt


In [5]:
from google.colab import drive
drive.mount("/content/gdrive")
#download the data
os.chdir('/content/gdrive/My Drive/Data_Edv_P_P6/')
os.listdir()

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


['Submi_P6_EdvPy_KSV_Colab_S02.csv',
 'test_share.csv',
 'train.csv',
 'medical_history.csv',
 'demographic_details.csv']

### when doing from desktop
import os
os.getcwd()
os.chdir('/Users/vivekkathula/Documents/Edv py projects/P6/')
os.listdir()

In [6]:
train = pd.read_csv("train.csv")
test =pd.read_csv("test_share.csv")
medical = pd.read_csv("medical_history.csv")
demographic=pd.read_csv("demographic_details.csv")

In [7]:
df_1=pd.merge(train,medical,on=['PatientId'],how='inner').copy()
df_train=pd.merge(df_1,demographic,on=['PatientId'],how='inner').copy()

In [8]:
df_2=pd.merge(test,medical,on=['PatientId'],how='inner').copy()
df_test=pd.merge(df_2,demographic,on=['PatientId'],how='inner').copy()

In [9]:
df_train.shape, df_test.shape

((99490, 14), (10854, 13))

In [10]:
df_train.head(2)

Unnamed: 0,PatientId,AppointmentID,ScheduledDay,AppointmentDay,SMS_received,No-show,Hipertension,Diabetes,Alcoholism,Handcap,Gender,Age,Neighbourhood,Scholarship
0,29872500000000.0,5642903,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,0,No,1,0,0,0,F,62,JARDIM DA PENHA,0
1,29872500000000.0,5639907,2016-04-29T09:47:47Z,2016-04-29T00:00:00Z,0,No,1,0,0,0,F,62,JARDIM DA PENHA,0


In [11]:
num_cols = ['Age']
cat_cols = ['Handcap','Neighbourhood']
binary_cols = ['SMS_received','Hipertension','Diabetes','Alcoholism','SMS_received','Gender','Scholarship']
date_cols =['ScheduledDay','AppointmentDay']

## Create Valid Set

In [12]:
df_train['No-show']=np.where(df_train['No-show']=='Yes',1,0)

In [13]:
# stratified split is what i prefer but there are cats that came only once in train
#from sklearn.model_selection import train_test_split
#train_set, valid_set = train_test_split(df_train, test_size=0.2, random_state=42)
train_set=df_train.copy()

## Custom Tranformers

In [14]:
from sklearn.base import BaseEstimator, TransformerMixin

ScheduledDay_ix, AppointmentDay_ix, = 2,3

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_Adow_sin = True): # no *args or **kargs
        self.add_Adow_sin = add_Adow_sin
        
    def fit(self, X, y=None):
        return self # nothing else to do 
    
    def transform(self, X):
        #Sdate = pd.to_datetime(X["ScheduledDay"], format='%Y-%m-%dT%H:%M:%SZ')
        #Adate = pd.to_datetime(X["AppointmentDay"], format='%Y-%m-%dT%H:%M:%SZ')
        Sdate = pd.to_datetime(X[:,2], format='%Y-%m-%dT%H:%M:%SZ')
        Adate = pd.to_datetime(X[:,3], format='%Y-%m-%dT%H:%M:%SZ')
        
        A_day=Adate.day
        A_dofweek=Adate.dayofweek
        A_month=Adate.month
        
        S_hr=Sdate.hour
        S_day=Sdate.day
        S_dofweek=Sdate.dayofweek
        S_month=Sdate.month
        
        sin_S_hr = np.sin(2 * np.pi * (S_hr-S_hr.min())/(S_hr.max()-S_hr.min()))
        cos_S_hr = np.cos(2 * np.pi * (S_hr-S_hr.min())/(S_hr.max()-S_hr.min()))
        
        sin_S_day = np.sin(2 * np.pi * (S_day-S_day.min())/(S_day.max()-S_day.min()))
        cos_S_day = np.cos(2 * np.pi * (S_day-S_day.min())/(S_day.max()-S_day.min()))
        
        sin_S_dofweek=np.sin(2 * np.pi * (S_dofweek-S_dofweek.min())/(S_dofweek.max()-S_dofweek.min()))
        cos_S_dofweek=np.cos(2 * np.pi * (S_dofweek-S_dofweek.min())/(S_dofweek.max()-S_dofweek.min()))
        
        sin_S_month = np.sin(2 * np.pi * (S_month-S_month.min())/(S_month.max()-S_month.min()))
        cos_S_month = np.cos(2 * np.pi * (S_month-S_month.min())/(S_month.max()-S_month.min()))
        
        sin_A_day= np.sin(2 * np.pi * (A_day-A_day.min())/(A_day.max()-A_day.min()))
        cos_A_day= np.sin(2 * np.pi * (A_day-A_day.min())/(A_day.max()-A_day.min()))
        
        sin_A_dofweek = np.sin(2 * np.pi * (A_dofweek-S_dofweek.min())/(A_dofweek.max()-A_dofweek.min()))
        cos_A_dofweek = np.cos(2 * np.pi * (A_dofweek-S_dofweek.min())/(A_dofweek.max()-A_dofweek.min()))
        
        sin_A_month = np.sin(2 * np.pi * (A_month-A_month.min())/(A_month.max()-A_month.min()))
        cos_A_month = np.cos(2 * np.pi * (A_month-A_month.min())/(A_month.max()-A_month.min()))
        
        return np.c_[sin_S_hr, sin_S_hr, sin_S_day, cos_S_day,sin_S_dofweek,cos_S_dofweek,
                     sin_S_month,cos_S_month,sin_A_day,cos_A_day,sin_A_dofweek,cos_A_dofweek,                    
                     sin_A_month,cos_A_month]

attr_adder = CombinedAttributesAdder()
data_extra_attribs = attr_adder.transform(train_set.values)
data_extra_attribs.shape

(99490, 14)

## Transformation Pipeline

In [15]:
# date cols
# for second attempt i opened std scaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

date_cols=["PatientId","AppointmentID","ScheduledDay","AppointmentDay"]
df_train_dates=train_set[date_cols].copy()

date_pipeline = Pipeline([            
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

df_train_dates_tr = date_pipeline.fit_transform(df_train_dates)
df_train_dates_tr.shape

(99490, 14)

In [16]:
# numcols
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_cols=["Age"]
df_train_num=train_set[num_cols].copy()

num_pipeline = Pipeline([            
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('std_scaler', StandardScaler()),
])

df_train_num_tr = num_pipeline.fit_transform(df_train_num)
df_train_num_tr.shape

(99490, 1)

In [17]:
#Binarycols
df_train_bin=train_set[binary_cols].copy()
from sklearn.preprocessing import OneHotEncoder
cat_encoder=OneHotEncoder(drop="first")

df_train_bin_tr = cat_encoder.fit_transform(df_train_bin)
df_train_bin_tr.shape

(99490, 7)

In [18]:
#catcols, not binary
df_train_cat=train_set[cat_cols].copy()
from sklearn.preprocessing import OneHotEncoder
cat_encoder=OneHotEncoder()

df_train_cat_tr = cat_encoder.fit_transform(df_train_cat)
df_train_cat_tr.shape

(99490, 86)

In [19]:
from sklearn.compose import ColumnTransformer 

date_attribs =["PatientId","AppointmentID",'ScheduledDay','AppointmentDay']
num_attribs = ["Age"]
cat_attribs = ['Handcap','Neighbourhood']
binary_attribs=['SMS_received','Hipertension','Diabetes','Alcoholism','SMS_received','Gender','Scholarship']
     
full_pipeline = ColumnTransformer([
    ("date.time", date_pipeline, date_attribs), #14
    ("num_cols", num_pipeline, num_attribs),    # 1
    ("binary_cols",OneHotEncoder(drop="first"),binary_attribs),#7
    ("cat_cols", OneHotEncoder(), cat_attribs), #86 (81+5)
])
# for second attempt i opened std scaler and ran

In [20]:
# for second attempt i opened std scaler
X_train = train_set.drop(["No-show"],axis=1).copy()
y_train = train_set["No-show"].copy()

#X_valid = valid_set.drop(["No-show"],axis=1).copy()
#y_valid = valid_set["No-show"].copy()

In [21]:
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_prepared = full_pipeline.transform(df_test)

X_train_prepared.shape, X_test_prepared.shape# X_valid_prepared.shape

((99490, 108), (10854, 108))

In [22]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(random_state=0,max_iter=2000,penalty="l2",C=10)
log_reg.fit(X_train_prepared, y_train)

from sklearn.model_selection import cross_val_score
scores = cross_val_score(log_reg, X_train_prepared, y_train,scoring="roc_auc", cv=10)
print("roc-auc:  ",scores)

#from sklearn.metrics import roc_auc_score
#y_pred=log_reg.predict_proba(X_valid_prepared)[:,1]
#y_true=y_valid

#roc_auc_score(y_true,y_pred)

roc-auc:   [0.61057993 0.63145909 0.68502372 0.65239398 0.6529254  0.62835543
 0.60677187 0.64605443 0.66686936 0.68437697]


## scale_pos_weight [default=1]
Control the balance of positive and negative weights, useful for unbalanced classes. A typical value to consider Freq of Neg/ Fre of Positive instances

In [24]:
display(df_train["No-show"].value_counts()/len(df_train))
0.797668/0.202332

0    0.797668
1    0.202332
Name: No-show, dtype: float64

3.942371943142953

In [None]:
xgb_clf=XGBClassifier(scale_pos_weight=4)

xgb_clf.fit(X_train_prepared, y_train)

from sklearn.model_selection import cross_val_score
scores = cross_val_score(xgb_clf, X_train_prepared, y_train,scoring="roc_auc", cv=10)
print("roc-auc:  ",scores)

#from sklearn.metrics import roc_auc_score
#y_pred=xgb_clf.predict_proba(X_valid_prepared)[:,1]
#y_true=y_valid

#roc_auc_score(y_true,y_pred)

In [None]:
XGBClassifier()

In [None]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidate = np.flatnonzero(results['rank_test_score'] == i)[0]
        print("Model with rank: {0}".format(i))
        print("Mean validation score: {0:.5f} (std: {1:.5f})".format(
            results['mean_test_score'][candidate],
            results['std_test_score'][candidate]))
        print("Parameters: {0}".format(results['params'][candidate]))
        print("")

from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV

xgb_params = {"n_estimators":[100,500,1000,1500],}

xgb1=XGBClassifier(subsample=0.8,colsample_bylevel=0.8,colsample_bytree=0.8)

grid_search=GridSearchCV(xgb1,cv=5,param_grid=xgb_params,

scoring='roc_auc',verbose=False,n_jobs=-1)

grid_search.fit(X_train_prepared, y_train)

report(grid_search.cv_results_,3)

In [25]:
xgb_params={"gamma":[0,2,5,8,10],
            "max_depth": [2,3,4,5,6,7,8],
            "min_child_weight":[0.5,1,2,5,10]
}

xgb2=XGBClassifier(n_estimators=500,subsample=0.8,
                   colsample_bylevel=0.8,colsample_bytree=0.8)

random_search=RandomizedSearchCV(xgb2,param_distributions=xgb_params,n_iter=10,
                                 cv=5,scoring='roc_auc',
                                 n_jobs=-1,verbose=False)

random_search.fit(X_train_prepared, y_train)
report(random_search.cv_results_,3)

NameError: ignored

In [26]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidate = np.flatnonzero(results['rank_test_score'] == i)[0]
        print("Model with rank: {0}".format(i))
        print("Mean validation score: {0:.5f} (std: {1:.5f})".format(
            results['mean_test_score'][candidate],
            results['std_test_score'][candidate]))
        print("Parameters: {0}".format(results['params'][candidate]))
        print("")
report(random_search.cv_results_,3)

Model with rank: 1
Mean validation score: 0.71415 (std: 0.02108)
Parameters: {'min_child_weight': 10, 'max_depth': 8, 'gamma': 8}

Model with rank: 2
Mean validation score: 0.71359 (std: 0.02211)
Parameters: {'min_child_weight': 1, 'max_depth': 6, 'gamma': 8}

Model with rank: 3
Mean validation score: 0.71318 (std: 0.02108)
Parameters: {'min_child_weight': 1, 'max_depth': 5, 'gamma': 10}



In [28]:
xgb_params={
    'max_delta_step':[0,1,3,6,10],
    'scale_pos_weight':[1,2,3,4]
}

xgb3=XGBClassifier(n_estimators=500,min_child_weight=10,gamma=8,max_depth=8,
                  subsample=0.8,colsample_bylevel=0.8,colsample_bytree=0.8)

from sklearn.model_selection import GridSearchCV
grid_search=GridSearchCV(xgb3,param_grid=xgb_params,
                         cv=5,scoring='roc_auc',n_jobs=-1,verbose=False)
grid_search.fit(X_train_prepared, y_train)
report(grid_search.cv_results_,3)

Model with rank: 1
Mean validation score: 0.71504 (std: 0.02056)
Parameters: {'max_delta_step': 1, 'scale_pos_weight': 1}

Model with rank: 2
Mean validation score: 0.71415 (std: 0.02108)
Parameters: {'max_delta_step': 0, 'scale_pos_weight': 1}



IndexError: ignored

In [29]:
final_xgb=XGBClassifier(n_estimators=500,min_child_weight=10,gamma=8,max_depth=8, scale_pos_weight=1,max_delta_step=1)
final_xgb.fit(X_train_prepared, y_train)

from sklearn.model_selection import cross_val_score
scores = cross_val_score(final_xgb, X_train_prepared, y_train,scoring="roc_auc", cv=10)
print("roc-auc:  ",scores)

roc-auc:   [0.69148522 0.68927156 0.74825197 0.71305992 0.71855626 0.70324638
 0.63557864 0.71876881 0.74262612 0.7468693 ]


## Predictions

In [30]:
y_test_pred=final_xgb.predict_proba(X_test_prepared)[:,1]

mysubmission=pd.DataFrame(y_test_pred,columns=["No-show"])
pd.DataFrame(mysubmission).to_csv("Submi_P6_EdvPy_KSV_Xgb_S05.csv",index=False)

In [31]:
from google.colab import files
files.download("Submi_P6_EdvPy_KSV_Xgb_S05.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>