# Notebook: Healthcare No Shows Appointments

Deze notebook werd gemaakt in het kader van de opleiding Data Scientist    
en heeft tot doel de opgedane kennis in de praktijk te leren toepassen.   

Deze case hoort thuis bij <b>Supervised Learning</b> en <b>Logistic Regression</b>     

De dataset komt van Kaggle: <a href="https://www.kaggle.com/datasets/iamtanmayshukla/healthcare-no-shows-appointments-dataset"><i>iamtanmayshukla/healthcare-no-shows-appointments-dataset</i></a>

Deze notebook is opgedeeld in verschillende hoofdstukken, telkens aangeduid met een Markdown.

# Some initial settings


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.width', 800)
import matplotlib
matplotlib.use('TkAgg')
%matplotlib inline
seed=42

# remove future warnings
import warnings
warnings.simplefilter('ignore')
warnings.filterwarnings("ignore")

# Path settings: every folder is relative to the current working directory
from pathlib import Path
import os
p = Path()
download_path = p / 'data'
output_path = p / 'output'
images_path = p / 'images'
filename=""  # filled in by the download cell once the data file is known

# Create the output and images folders when they are missing.
# `exist_ok=True` makes the call idempotent, so no separate existence check
# is needed and re-running the cell is safe.
output_path.mkdir(parents=True, exist_ok=True)
images_path.mkdir(parents=True, exist_ok=True)

# Helper that stores the generated plots on disk
def save_fig(fig_name, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure into the ``images_path`` folder.

    Args:
        fig_name: File name without extension.
        tight_layout: When true, ``plt.tight_layout()`` is applied first.
        fig_extension: File extension, also passed to ``savefig`` as format.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    if tight_layout:
        plt.tight_layout()
    target = images_path / f"{fig_name}.{fig_extension}"
    plt.savefig(target, format=fig_extension, dpi=resolution)


from tqdm.notebook import tqdm
from datetime import datetime

# Reference timestamp; the last cell uses it to report the total run time
start_time = datetime.now()


def print_exec_time(exec_info):
    """Print the current timestamp; used as a ``post_run_cell`` callback.

    Args:
        exec_info: Object handed over by IPython when the event fires. It is
            not used here.
    """
    tqdm.write(f"✅ Uitgevoerd op: {datetime.now():%Y-%m-%d %H:%M:%S}")


# Re-running this cell creates a new function object every time. Remove the
# callbacks left behind by earlier runs before registering the new one,
# otherwise each earlier run would keep printing its own timestamp as well.
_ipython_events = get_ipython().events
for _registered in list(_ipython_events.callbacks["post_run_cell"]):
    if getattr(_registered, "__name__", None) == print_exec_time.__name__:
        _ipython_events.unregister("post_run_cell", _registered)
_ipython_events.register("post_run_cell", print_exec_time)

✅ Uitgevoerd op: 2025-02-15 20:55:22


# Get the data from Kaggle

In [None]:
import os
import zipfile

print(datetime.now())

# CSV files already present in the download folder.
# The previous check (`'data\\' in filename`) only matched Windows-style paths
# and, when it matched, skipped reading the file so `df_raw` was never set.
csv_paths = sorted(download_path.glob('*.csv'))

if csv_paths:
    print(f"Bestaat al: {csv_paths[0]}")
    print("Het bestand is al gedownload en moet niet opnieuw worden gedownload.\n")
else:
    # Imported only when a download is really needed, so cached data can be
    # loaded without going through the Kaggle API.
    from kaggle.api.kaggle_api_extended import KaggleApi

    api = KaggleApi()
    api.authenticate()

    # Dataset identifier, taken from the kaggle.com URL
    kaggle_dataset_name = 'iamtanmayshukla/healthcare-no-shows-appointments-dataset' # te halen van de Kaggle.com url
    api.dataset_download_files(kaggle_dataset_name, path=download_path, unzip=True)

    csv_paths = sorted(download_path.glob('*.csv'))
    if not csv_paths:
        raise FileNotFoundError(f"Geen CSV-bestand gevonden in {download_path} na de download.")

# Candidate 1: first CSV in alphabetical order (deterministic, unlike the
# arbitrary order of os.listdir)
filename1 = str(csv_paths[0])
# Candidate 2: the CSV with the most recent modification time
filename2 = str(max(csv_paths, key=os.path.getmtime))

# Show both candidates
print(f"\nHet bestand waarmme je verder werkt is: \n 1. {filename1} of \n 2. {filename2}")

# Choose which file to continue with
filename = filename1

# Always load the data, whether it was just downloaded or already cached
df_raw = pd.read_csv(filename)
display(df_raw.head())


### Uit te voeren om met *verse* data te starten

In [None]:
# Start over from an independent copy of the raw data (df_raw stays untouched)
dataset = df_raw.copy()
# Show the dimensions and the first rows of the working copy
print(dataset.shape, dataset.head(), sep="\n")

## Profiling

### ydata profiling

In [None]:
# Create a profiler HTML document that helps with the data exploration
# (to be uploaded together with the answers to the Syntra cloud).
##########################
# PROFILER HTML DOCUMENT #
##########################

from ydata_profiling import ProfileReport
import os
import webbrowser

# Absolute location of the report inside the output folder
profiler_file = 'HNSA_2_profiler.html'
profiler_path = os.path.abspath(os.path.join(output_path, profiler_file))

# Reuse an existing report; generate it only when it is missing
if os.path.exists(profiler_path):
	print(f"profiler betsaat al. Openen...")
else:
	# explorative=True was chosen by the author to also show the correlations
	profile = ProfileReport(dataset, title="Dataset Healthcare No Shows Appointments Report", explorative=True)

	# Save the report as an HTML file
	profile.to_file(profiler_path)
	print(f"profiler gemaakt: {profiler_path}")

# Open the report in the default browser. `Path.as_uri()` builds a valid
# file:// URL from the absolute path (drive letters, spaces, ...) instead of
# concatenating the raw path onto "file://".
webbrowser.open(Path(profiler_path).as_uri())



## Exploring

### Elementaire checks

In [None]:
# Visual divider printed between the individual checks
separator = "-----------------"

# Dimensions of the working frame
n_rows, n_cols = dataset.shape
print(f"Aantal rijen: {n_rows}")
print(f"Aantal kolommen: {n_cols}")
print(separator)

# Overview of the column labels
print(f"Kolommen: {dataset.columns}")
print(separator)

# Missing values: show the per-column counts only when at least one exists
missing_per_column = dataset.isnull().sum()
if missing_per_column.any():
    print(missing_per_column)
else:
    print("Missing values: GEEN !! ")
print(separator)

# Basic statistics (describe() defaults)
print("Statistics: (voor numerische kolommen) \n")
print(dataset.describe())
print(separator)

# Data type of every column
print("Data types :")
print(dataset.dtypes)
print(separator)

# Number of distinct values in each object-typed column
object_columns = dataset.select_dtypes(include=['object'])
for col, n_unique in object_columns.nunique().items():
    print(f"{col} unique values: {n_unique}")

print(separator)

# Cleaning     
In HNSA_2 voeg ik twee kolommen toe met de dag van de week voor scheduled en appointment

In [None]:
# Convert PatientId to a string and strip a trailing ".0".
# The pattern is anchored to the end of the string: the previous literal
# replace removed ".0" wherever it occurred, which would also alter ids that
# merely contain ".0" in the middle (e.g. "123.05" -> "1235").
dataset['PatientId'] = dataset['PatientId'].astype(str).str.replace(r'\.0$', '', regex=True)
# Convert ScheduledDay and AppointmentDay to datetime
dataset['ScheduledDay'] = pd.to_datetime(dataset['ScheduledDay'])
dataset['AppointmentDay'] = pd.to_datetime(dataset['AppointmentDay'])

# Add the weekday name of both dates as extra columns
dataset['ScheduledDayOfWeek'] = dataset['ScheduledDay'].dt.day_name()
dataset['AppointmentDayOfWeek'] = dataset['AppointmentDay'].dt.day_name()


In [None]:
# Check the result of the previous cell: print the column dtypes, then show
# the frame itself as the cell output
print(dataset.dtypes)
dataset


# Exploratory Data Analysis

In deze notebook maak ik een extra kolom: Day of Week voor de afspraken

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Weekday order on the x-axis. Saturday is included because `order` silently
# drops every category that is not listed.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

# Two panels side by side: appointment weekday and scheduling weekday,
# each split by the Showed_up flag
fig, (ax_appointment, ax_scheduled) = plt.subplots(1, 2, figsize=(12, 6))

sns.countplot(data=dataset, x='AppointmentDayOfWeek', hue='Showed_up', order=day_order, ax=ax_appointment)
ax_appointment.set_title('Appointment Showed up by Day of the Week')

sns.countplot(data=dataset, x='ScheduledDayOfWeek', hue='Showed_up', order=day_order, ax=ax_scheduled)
ax_scheduled.set_title('Scheduled Day Showed up by Day of the Week')

# Store the figure in the images folder before displaying it
save_fig('HNSA_2_day_of_week')
plt.show()

### Correlatie matrix   
Eerst op de originele data   
Daarna op de versie met de Bool-kolommen omgezet naar 0/1

In [None]:
# Correlation matrix of the original (raw) data

# Keep only the numeric columns and compute their pairwise correlations
numeric_df_raw = df_raw.select_dtypes(include=[np.number])
corr = numeric_df_raw.corr()

# Draw the annotated heatmap on an explicit axes object
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap')
save_fig('HNSA_2_correlation_heatmap')
plt.show()

In [None]:
"""ArithmeticError# Nieuwe Matrix met alle (numerische) kolommen
# wetende dat de Boolean omgezet zijn naar int 1 of 0

# Selectie kolommen met numerieke waarden
numeric_data = dataset.select_dtypes(include=[np.number])

# Bereken de correlatie matrix
corr = numeric_data.corr()

# Plot heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
save_fig('HNSA_2_correlation_heatmap2')
plt.show()
"""

# START Modelling  

### Split Train & Test    
X dit keer ALLE kolommen (behalve Target)

Maar eerst alle kolommen omzetten naar getallen

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns cast to bool and then to int (0 / 1); the last entry, Showed_up,
# is the TARGET
flag_columns = [
    'Scholarship', 'Hipertension', 'Diabetes', 'Alcoholism', 'Handcap',
    'SMS_received', 'Showed_up',
]
for flag in flag_columns:
    dataset[flag] = dataset[flag].astype(bool).astype(int)

# Encode Gender as integer codes with a LabelEncoder; the original column is
# kept and the codes go into Gender_encoded
encoder = LabelEncoder()
encoder.fit(dataset['Gender'])
dataset['Gender_encoded'] = encoder.transform(dataset['Gender'])



# verwijderen van de originele kolommen
#dataset.drop


Dagen van de week omzetten met ÉÉN van de twee onderstaande codeblokken

In [None]:
# One-hot encode both weekday columns with get_dummies.
# The columns are passed as a plain list: the former dicts only served as a
# source of column names (their day lists were never used). The scheduled-day
# dummies are appended first, followed by the appointment-day dummies, the
# same order the two separate calls produced.
weekday_columns = ['ScheduledDayOfWeek', 'AppointmentDayOfWeek']
dataset = pd.get_dummies(dataset, columns=weekday_columns)

In [None]:
""" UNCOMMENT OM DE OHE UIT TE VOEREN ipv get_dummies
from sklearn.preprocessing import OneHotEncoder

# Werken met OHE zelf ipv getdummies:
days = {'ScheduledDayOfWeek': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'],
        'AppointmentDayOfWeek': ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']}
enc1 = OneHotEncoder(sparse=False)
encoded_SDW = enc1.fit_transform(dataset[['ScheduledDayOfWeek']])
encoded_df1 = pd.DataFrame(encoded_SDW, columns=[f'SDW_{col}' for col in enc1.categories_[0]])
enc2 = OneHotEncoder(sparse=False)
encoded_ADW = enc1.fit_transform(dataset[['ScheduledDayOfWeek']])
encoded_df2 = pd.DataFrame(encoded_ADW, columns=[f'ADW_{col}' for col in enc2.categories_[0]])

#Voeg samen met originele dataset
dataset = pd.concat([dataset, encoded_df1, encoded_df2], axis=1)
"""

In [None]:
# Same approach for Neighbourhood: one-hot encode it with get_dummies.
# drop_first=True leaves out the dummy column of the first category.
dataset = pd.get_dummies(dataset, columns=['Neighbourhood'], drop_first=True)

In [None]:
# Inspect the column labels after the encoding steps
dataset.columns

In [None]:
# Inspect the data type of every column after the encoding steps
dataset.dtypes

## HNSA_2    
Hierin werk ik met alle kolommen, behalve ... (ipv een lijst features, zoals in HNSA_1)

In [None]:
from sklearn.model_selection import train_test_split

# Prepare data for modeling: use every column as a feature except the target,
# the identifiers, the raw date columns and the original Gender column
# (its encoded version Gender_encoded stays in).
target = 'Showed_up'
X = dataset.drop(columns=['Gender', 'PatientId', 'AppointmentID', 'ScheduledDay', 'AppointmentDay', target])
y = dataset[target]

# Split data into training and test sets (80 % / 20 %).
# stratify=y keeps the proportion of both target classes the same in the
# training and the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed, stratify=y
)


In [None]:
# Show the feature matrix as the cell output
X

### Predictive Modeling   Logistic Regression

In [None]:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fit a logistic regression (at most 1000 solver iterations) on the training set
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict the held-out test set
y_pred = model.predict(X_test)

# Evaluation metrics on the test set
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
con_matrix = confusion_matrix(y_test, y_pred)

# Report the metrics, each block on its own line(s)
print(f"Accuracy: {accuracy:.4f}\n")
print("Classification Report:", report, "Confusion Matrix:", con_matrix, sep="\n")



In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the current model on the complete feature set
scores = cross_val_score(model, X, y, cv=5)

# Model description, the per-fold scores and their mean
print(
    f"Model gebruikt: {model} \n",
    f"Cross-Validation Scores: {scores}",
    f"Mean CV Score: {scores.mean():.4f}",
    sep="\n",
)

### RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Random forest seeded with the notebook-wide seed
model = RandomForestClassifier(random_state=seed)
print(f"Model gebruikt: {model} \n")

# Fit on the training set and predict the test set in one chain
y_pred = model.fit(X_train, y_train).predict(X_test)

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Per-class report followed by the confusion matrix
print(classification_report(y_test, y_pred), confusion_matrix(y_test, y_pred), sep="\n")

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

# Second column of predict_proba: probabilities for the positive class
y_pred_proba = model.predict_proba(X_test)[:, 1]
auc_value = roc_auc_score(y_test, y_pred_proba)
print(f"ROC-AUC Score: {auc_value:.4f}\n")

# Points of the ROC curve, printed one array per line
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_parts = (
    ("False Positive Rate", fpr),
    ("True Positive Rate", tpr),
    ("Thresholds", thresholds),
)
for label, values in roc_parts:
    print(f"{label}: {values}")
# Lengths of the three arrays
print(len(fpr), len(tpr), len(thresholds))


In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the current model on the complete feature set
scores = cross_val_score(model, X, y, cv=5)
mean_score = scores.mean()

# Model description, the per-fold scores and their mean
summary_lines = [
    f"Model gebruikt: {model} \n",
    f"Cross-Validation Scores: {scores}",
    f"Mean CV Score: {mean_score:.4f}",
]
print("\n".join(summary_lines))

### SMOTE   

In [None]:
from imblearn.over_sampling import SMOTE

# Create the SMOTE over-sampler. It is seeded with the notebook-wide seed so
# the synthetic samples (and every metric derived from them) are reproducible
# between runs, in line with the other seeded steps of this notebook.
smote = SMOTE(random_state=seed)

# Over-sample the TRAINING data only; the test set is left untouched
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Random forest trained on the SMOTE-resampled training data. It is seeded
# with the notebook-wide seed, like the earlier random forest, so the results
# are reproducible between runs.
model = RandomForestClassifier(random_state=seed)
model.fit(X_resampled, y_resampled)

# Predict on the (not resampled) test data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))
# ROC-AUC from the predicted probabilities in the second predict_proba column
print("ROC-AUC Score:", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))

### Hoe lang heeft dit script erover gedaan?

In [None]:
# Total run time of the notebook; relies on start_time being set in the
# settings cell at the top
end_time = datetime.now()
elapsed = end_time - start_time
print(f"Gestart om: {start_time}\n")
print(f"Eindtijd: {end_time}\n")
# The elapsed time is printed as a timedelta
print(f"Uitvoertijd: {elapsed}\n")

# Print THE END in large ASCII-art characters
import pyfiglet
print(pyfiglet.figlet_format("THE  END"))