In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_score, recall_score

# Import scikit-learn models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.svm import LinearSVC, SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Name of the label column that every model in this notebook predicts.
_TARGET = 'No-show'

# Raw predictor columns passed to every model; run_model appends the
# per-fold history features on top of these.
_FEATURES = [
    'Age',
    'booking_window_days',
    'Hipertension',
    'Scholarship',
    'Diabetes',
    'Handcap',
    'SMS_received',
    'ScheduledHour',
    'AppointmentDayOfWeek',
]

# Load the dataset used by all cells below (path is relative to the notebook).
data = pd.read_csv('../data/clean_data.csv')


### Cross Validation

In [3]:
def run_model(data, model, n_splits=5):
    """Cross-validate ``model`` with stratified folds and per-fold history features.

    For every fold two extra features are derived from the *training* rows
    only and then looked up for both the training and the test rows:

    * ``No-show_user_mean``          -- mean of the target per ``PatientId``
    * ``No-show_neighbourhood_mean`` -- mean of the target per ``Neighbourhood``

    Test rows whose patient / neighbourhood does not occur in the training
    fold receive the value 1 for the corresponding feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the ``_FEATURES`` columns, the ``_TARGET`` column and the
        ``PatientId`` and ``Neighbourhood`` columns. The target has to be
        numeric because its mean is taken.
    model : estimator
        Object exposing ``fit``, ``predict`` and ``score``. It is re-fitted
        on every fold, so it is left fitted on the last fold on return.
    n_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    scores : numpy.ndarray
        ``model.score`` on the test part of each fold.
    cms : numpy.ndarray
        Element-wise sum of the per-fold confusion matrices (rows are true
        labels, columns are predicted labels).
    precision : numpy.ndarray
        Weighted precision on the test part of each fold.
    recall : numpy.ndarray
        Weighted recall on the test part of each fold.
    """
    feature_columns = _FEATURES + ['No-show_user_mean', 'No-show_neighbourhood_mean']

    X = data[_FEATURES].values
    y = data[_TARGET].values

    # Fix the label set up front so every fold yields a confusion matrix of
    # the same shape, even if a class is absent from one fold's predictions.
    labels = np.unique(y)

    skf = StratifiedKFold(n_splits=n_splits)

    scores, precisions, recalls = [], [], []
    cms = np.zeros((len(labels), len(labels)), dtype=int)

    for train_index, test_index in skf.split(X, y):
        train = data.iloc[train_index]
        test = data.iloc[test_index]

        # History statistics are computed from the training fold only, so the
        # test labels never influence the features.
        user_mean = train.groupby('PatientId')[_TARGET].mean()
        neighbourhood_mean = train.groupby('Neighbourhood')[_TARGET].mean()

        # Look the statistics up through one-row-per-key Series. A merge
        # against the row-level training frame would duplicate every test row
        # once per training appointment of the same patient and thereby
        # change the test set being evaluated.
        #
        # NOTE(review): on the training rows the per-patient mean includes
        # the row's own label (for a patient with a single training
        # appointment the feature equals the target). Consider a
        # leave-one-out / out-of-fold encoding.
        train = train.assign(**{
            'No-show_user_mean': train['PatientId'].map(user_mean),
            'No-show_neighbourhood_mean': train['Neighbourhood'].map(neighbourhood_mean),
        })

        # NOTE(review): unseen patients / neighbourhoods are filled with 1,
        # as before; confirm this is the intended default.
        test = test.assign(**{
            'No-show_user_mean': test['PatientId'].map(user_mean).fillna(1),
            'No-show_neighbourhood_mean': test['Neighbourhood'].map(neighbourhood_mean).fillna(1),
        })

        X_train = train[feature_columns].values
        y_train = train[_TARGET].values

        X_test = test[feature_columns].values
        y_test = test[_TARGET].values

        model.fit(X_train, y_train)
        scores.append(model.score(X_test, y_test))

        y_pred = model.predict(X_test)

        # Keep one value per fold; callers average them with np.mean.
        precisions.append(precision_score(y_test, y_pred, average='weighted'))
        recalls.append(recall_score(y_test, y_pred, average='weighted'))

        cms = cms + confusion_matrix(y_test, y_pred, labels=labels)

    return np.array(scores), cms, np.array(precisions), np.array(recalls)


# Logistic Regression

In [None]:
%%capture

# Inverse regularisation strengths to evaluate for logistic regression.
param_grid = {
    'C': [0.001, 0.01, 0.1],
}

# Maps each C value to the (scores, cms, precision, recall) tuple
# returned by run_model.
lr_results = dict()

for C in param_grid['C']:
    model = LogisticRegression(C=C,
                               class_weight='balanced',
                               random_state=0)

    scores, cms, precision, recall = run_model(data, model)
    lr_results[C] = (scores, cms, precision, recall)

In [None]:
# Report the cross-validated metrics for every regularisation strength tried.
for C, (scores, cms, precision, recall) in lr_results.items():
    # Label each result set; without this the outputs for different C values
    # cannot be told apart.
    print('C = %g' % C)
    print('Score: %f' % (np.mean(scores)))
    print('Precision: %f' % (np.mean(precision)))
    print('Recall: %f' % (np.mean(recall)))

    # Rows of the confusion matrix are true labels, columns are predictions.
    df_cm = pd.DataFrame(cms, range(2), range(2))
    ax = sns.heatmap(df_cm, annot=True, fmt='g')
    ax.set(title='Logistic Regression (C = %g)' % C,
           xlabel='Predicted', ylabel='Actual')
    plt.show()

# Decision Trees

Hyperparameters:
- min_impurity_split
- max_depth
- min_samples_leaf
- max_features


In [None]:
%%capture

# Candidate hyperparameter values. The grid is defined but not iterated in
# this cell: only the (min_impurity_split=0.2, min_samples_leaf=2)
# combination is evaluated below.
param_grid = {
    'min_impurity_split': [0.1, 0.2, 0.3, 0.4],
    'min_samples_leaf': [2, 5, 10, 20, 50, 100, 250],
}

# Maps (min_impurity_split, min_samples_leaf) to the tuple returned by run_model.
dt_results = dict()

# NOTE(review): `min_impurity_split` was deprecated in scikit-learn 0.19 and
# removed in 1.0, where passing it raises a TypeError. Its replacement,
# `min_impurity_decrease`, has different semantics, so the value would need
# re-tuning rather than a rename.
model = DecisionTreeClassifier(
    min_samples_leaf=2,
    min_impurity_split=0.2,
    class_weight='balanced',
    random_state=0,
)

scores, cms, precision, recall = run_model(data, model)
dt_results[(0.2, 2)] = (scores, cms, precision, recall)

In [None]:
# Fetch the cross-validation output stored for the single configuration
# (min_impurity_split=0.2, min_samples_leaf=2) that was evaluated.
scores, cms, precision, recall = dt_results[(0.2, 2)]

# Averages over the values returned by run_model.
print('Score: %f' % np.mean(scores))
print('Precision: %f' % np.mean(precision))
print('Recall: %f' % np.mean(recall))

# Show the summed confusion matrix as an annotated heatmap.
df_cm = pd.DataFrame(data=cms, index=range(2), columns=range(2))
sns.heatmap(data=df_cm, annot=True, fmt='g')
plt.show()

# Random Forest

In [None]:
%%capture

param_grid = {'min_impurity_split': [0.1, 0.2, 0.3, 0.4],
              'min_samples_leaf': [2, 5, 10, 20, 50, 100, 250]}

dt_results = {}

#for min_impurity_split in tqdm(param_grid['min_impurity_split']):
#    for min_samples_leaf in param_grid['min_samples_leaf']:
    
model = RandomForestClassifier(n_jobs=2, 
                               random_state=0, 
                               class_weight='balanced',
                               min_impurity_split=0.2,
                               min_samples_leaf=2)

scores, cms, precision, recall = run_model(data, model)

dt_results[(0.2, 2)] = (scores, cms, precision, recall)

In [None]:
#for min_impurity_split, min_samples_leaf in dt_results.keys():
#    print(min_impurity_split, min_samples_leaf)
scores, cms, precision, recall = dt_results[(0.2, 2)]

print('Score: %f' % (np.mean(scores)))
print('Precision: %f' % (np.mean(precision)))
print('Recall: %f' % (np.mean(recall)))

df_cm = pd.DataFrame(cms, range(2), range(2))
sns.heatmap(df_cm, annot=True, fmt='g') # font size
plt.show() 