# Healthcare No Show Datasets Preparation

Prepare the datasets used for model training. By default, three independently shuffled train/validation splits are created and stored for cross-validation.

In [None]:
import os
import sys

# Extend the import path *before* the project-local imports below: appending it
# afterwards is too late for `from src...` to resolve on a fresh kernel.
sys.path.append("../..")  # add src to path to import custom modules

import dotenv
import numpy as np
import pandas as pd
import sqlalchemy
import torch
from torch.utils.data import random_split

from src.dataset import CustomDataset
from src.utils import sort_pd_column

dotenv.load_dotenv()  # load variables from a .env file into the process environment
pd.set_option('display.expand_frame_repr', False)  # print wide frames without wrapping columns

## Load data from SQL to pandas

In [2]:
# Build the connection URL from its parts instead of interpolating them into a
# string, so that credentials containing characters such as "@", ":" or "/"
# are escaped correctly.
# NOTE(review): USER is commonly pre-set by the operating system, and
# dotenv.load_dotenv() does not override existing variables by default —
# confirm the value picked up here is the intended database user.
_port = os.getenv('PORT')
engine = sqlalchemy.create_engine(
    sqlalchemy.engine.URL.create(
        drivername="mysql+mysqlconnector",
        username=os.getenv('USER'),
        password=os.getenv('PASSWORD'),
        host=os.getenv('HOST'),
        port=int(_port) if _port else None,  # URL.create takes an int; None leaves the driver default
        database=os.getenv('DATABASE'),
    )
)

In [3]:
# Reflect the existing database schema so the table can be referenced without
# declaring its columns by hand.
meta_data = sqlalchemy.MetaData()
meta_data.reflect(bind=engine)
HEALTHCARE = meta_data.tables['healthcare']

# Columns to load, in the order they should appear in the result.
selected_columns = (
    "gender",  # statistically no relationship
    "scheduled_day",
    "appointment_day",
    "age",
    # "neighbourhood",  # doesn't seem useful
    "scholarship",
    "hypertension",
    "diabetes",
    "alcoholism",  # statistically no relationship
    "handicap",
    "sms_received",
    "no_show",
)
query = sqlalchemy.select(
    *(HEALTHCARE.c[column_name] for column_name in selected_columns)
)

In [4]:
# Execute the SELECT and materialise the result set as a DataFrame.
df = pd.read_sql_query(sql=query, con=engine)
print(df)

       gender       scheduled_day appointment_day   age  scholarship  hypertension  diabetes  alcoholism  handicap  sms_received  no_show
0           F 2015-11-10 07:13:56      2016-05-04  51.0            0             0         0           0         0             1        0
1           M 2015-12-03 08:17:28      2016-05-02  34.0            0             1         0           0         0             1        1
2           F 2015-12-07 10:40:59      2016-06-03  27.0            1             0         0           0         0             1        1
3           F 2015-12-07 10:42:42      2016-06-03  48.0            0             1         1           0         0             1        0
4           F 2015-12-07 10:43:01      2016-06-03  80.0            0             1         1           0         0             1        0
...       ...                 ...             ...   ...          ...           ...       ...         ...       ...           ...      ...
110522      M 2016-06-08 19:32:25 

## Data cleaning

In [5]:
# Gender: encode as a binary flag (1 for "F", 0 for any other value)
df["gender"] = df["gender"].eq("F").astype("int64")
print(df)

        gender       scheduled_day appointment_day   age  scholarship  hypertension  diabetes  alcoholism  handicap  sms_received  no_show
0            1 2015-11-10 07:13:56      2016-05-04  51.0            0             0         0           0         0             1        0
1            0 2015-12-03 08:17:28      2016-05-02  34.0            0             1         0           0         0             1        1
2            1 2015-12-07 10:40:59      2016-06-03  27.0            1             0         0           0         0             1        1
3            1 2015-12-07 10:42:42      2016-06-03  48.0            0             1         1           0         0             1        0
4            1 2015-12-07 10:43:01      2016-06-03  80.0            0             1         1           0         0             1        0
...        ...                 ...             ...   ...          ...           ...       ...         ...       ...           ...      ...
110522       0 2016-06-08 1

In [6]:
# Lag days: whole days between the scheduling date and the appointment date
df["scheduled_day"] = df["scheduled_day"].dt.normalize()  # drop the time-of-day part
df["appointment_day"] = df["appointment_day"].dt.normalize()
df["lag_days"] = (df["appointment_day"] - df["scheduled_day"]).dt.days
# Remove negative lag days as they are noise. Take an explicit copy so that the
# column assignments in later cells operate on an independent frame rather than
# on a filtered slice (which pandas flags with SettingWithCopyWarning).
df = df[df["lag_days"] >= 0].copy()
print(df)

        gender scheduled_day appointment_day   age  scholarship  hypertension  diabetes  alcoholism  handicap  sms_received  no_show  lag_days
0            1    2015-11-10      2016-05-04  51.0            0             0         0           0         0             1        0       176
1            0    2015-12-03      2016-05-02  34.0            0             1         0           0         0             1        1       151
2            1    2015-12-07      2016-06-03  27.0            1             0         0           0         0             1        1       179
3            1    2015-12-07      2016-06-03  48.0            0             1         1           0         0             1        0       179
4            1    2015-12-07      2016-06-03  80.0            0             1         1           0         0             1        0       179
...        ...           ...             ...   ...          ...           ...       ...         ...       ...           ...      ...       ...

In [None]:
# Normalize lag days (min-max scaling: smallest value -> 0, largest -> 1)
# NOTE(review): the "Group lag days" cell bins this column with edges expressed
# in days, so it expects the un-scaled values — run one or the other, not both.
lag_days_raw = df["lag_days"]
min_value, max_value = lag_days_raw.min(), lag_days_raw.max()
df["lag_days"] = lag_days_raw.sub(min_value).div(max_value - min_value)
print(df)

In [7]:
# Group lag days into ordinal buckets
# Each entry maps a bucket label to the inclusive upper edge of that bucket, in
# days. The position of the label in this dict becomes the encoded value.
lag_days_ranges = {
    "same day": 0.99,  # lag is a whole number of days, so only 0 falls in this bucket
    "7 days": 7,
    "14 days": 14,
    "30 days": 30,
    "60 days": 60,
    "90 days": 90,
    # Open-ended top bucket: with a finite cap, a longer lag fell outside every
    # bin, became NaN, and made the index lookup below fail.
    ">90 days": float("inf"),
}
labels, bins = zip(*lag_days_ranges.items())
# include_lowest=True makes the first bucket contain the left edge 0 itself.
df["lag_days"] = pd.cut(df["lag_days"], bins=(0,)+bins, labels=labels, include_lowest=True)
lag_days_keys = list(lag_days_ranges.keys())
# Replace each label by its position in the ordered list of bucket labels.
df["lag_days"] = df["lag_days"].apply(lambda x: lag_days_keys.index(x))
print(df)

        gender scheduled_day appointment_day   age  scholarship  hypertension  diabetes  alcoholism  handicap  sms_received  no_show lag_days
0            1    2015-11-10      2016-05-04  51.0            0             0         0           0         0             1        0        6
1            0    2015-12-03      2016-05-02  34.0            0             1         0           0         0             1        1        6
2            1    2015-12-07      2016-06-03  27.0            1             0         0           0         0             1        1        6
3            1    2015-12-07      2016-06-03  48.0            0             1         1           0         0             1        0        6
4            1    2015-12-07      2016-06-03  80.0            0             1         1           0         0             1        0        6
...        ...           ...             ...   ...          ...           ...       ...         ...       ...           ...      ...      ...
110522

In [8]:
# Day of week: encode the appointment weekday as an integer, Monday=0 ... Sunday=6
df["dayofweek"] = df["appointment_day"].dt.day_name()
dayofweek_all = {
    "Monday": 0,
    "Tuesday": 1,
    "Wednesday": 2,
    "Thursday": 3,
    "Friday": 4,
    "Saturday": 5,
    "Sunday": 6,  # was missing: a Sunday appointment raised KeyError in the lookup below
}
df["dayofweek"] = df["dayofweek"].apply(lambda x: dayofweek_all[x])
print(df)

        gender scheduled_day appointment_day   age  scholarship  hypertension  diabetes  alcoholism  handicap  sms_received  no_show lag_days  dayofweek
0            1    2015-11-10      2016-05-04  51.0            0             0         0           0         0             1        0        6          2
1            0    2015-12-03      2016-05-02  34.0            0             1         0           0         0             1        1        6          0
2            1    2015-12-07      2016-06-03  27.0            1             0         0           0         0             1        1        6          4
3            1    2015-12-07      2016-06-03  48.0            0             1         1           0         0             1        0        6          4
4            1    2015-12-07      2016-06-03  80.0            0             1         1           0         0             1        0        6          4
...        ...           ...             ...   ...          ...           ...     

In [9]:
# Remove unused columns: the two date columns are not needed any more once
# lag_days and dayofweek have been derived from them.
date_columns = ["scheduled_day", "appointment_day"]
df = df.drop(columns=date_columns)
print(df)

        gender   age  scholarship  hypertension  diabetes  alcoholism  handicap  sms_received  no_show lag_days  dayofweek
0            1  51.0            0             0         0           0         0             1        0        6          2
1            0  34.0            0             1         0           0         0             1        1        6          0
2            1  27.0            1             0         0           0         0             1        1        6          4
3            1  48.0            0             1         1           0         0             1        0        6          4
4            1  80.0            0             1         1           0         0             1        0        6          4
...        ...   ...          ...           ...       ...         ...       ...           ...      ...      ...        ...
110522       0  54.0            0             0         0           0         0             0        0        0          2
110523       1  

In [10]:
# Age
# Remove rows with negative age, they are noise. Take an explicit copy so that
# the column assignments in later cells operate on an independent frame rather
# than on a filtered slice (which pandas flags with SettingWithCopyWarning).
df = df[df["age"] >= 0].copy()
print(df)

        gender   age  scholarship  hypertension  diabetes  alcoholism  handicap  sms_received  no_show lag_days  dayofweek
0            1  51.0            0             0         0           0         0             1        0        6          2
1            0  34.0            0             1         0           0         0             1        1        6          0
2            1  27.0            1             0         0           0         0             1        1        6          4
3            1  48.0            0             1         1           0         0             1        0        6          4
4            1  80.0            0             1         1           0         0             1        0        6          4
...        ...   ...          ...           ...       ...         ...       ...           ...      ...      ...        ...
110522       0  54.0            0             0         0           0         0             0        0        0          2
110523       1  

In [None]:
# Normalize age (min-max scaling: smallest value -> 0, largest -> 1)
# NOTE(review): the "Group age" cell bins this column with edges on the raw age
# scale (1 ... 120), so it expects the un-scaled values — run one or the other,
# not both.
age_raw = df["age"]
min_value, max_value = age_raw.min(), age_raw.max()
df["age"] = age_raw.sub(min_value).div(max_value - min_value)
print(df)

In [11]:
# Group age into ordinal life-stage buckets
# Bucket label -> inclusive upper age edge; the label's position in this dict
# becomes the encoded value.
age_ranges = dict(
    Infant=1,
    Toddler=4,
    Child=12,
    Teen=19,
    Adult=39,
    Middle=59,
    Senior=120,
)
labels = tuple(age_ranges)
bins = tuple(age_ranges.values())
# include_lowest=True puts age 0 into the first bucket together with age 1.
df["age"] = pd.cut(df["age"], bins=(0, *bins), labels=labels, include_lowest=True)
age_keys = list(age_ranges)
# Swap every label for its position in the ordered label list.
df["age"] = df["age"].apply(lambda label: age_keys.index(label))
print(df)

        gender age  scholarship  hypertension  diabetes  alcoholism  handicap  sms_received  no_show lag_days  dayofweek
0            1   5            0             0         0           0         0             1        0        6          2
1            0   4            0             1         0           0         0             1        1        6          0
2            1   4            1             0         0           0         0             1        1        6          4
3            1   5            0             1         1           0         0             1        0        6          4
4            1   6            0             1         1           0         0             1        0        6          4
...        ...  ..          ...           ...       ...         ...       ...           ...      ...      ...        ...
110522       0   5            0             0         0           0         0             0        0        0          2
110523       1   5            0 

In [None]:
# Neighbourhood: encode each name as its rank in the sorted list of distinct names.
# The column is commented out of the SQL query, so only encode it when it was
# actually loaded; unguarded, this cell stops a full run with a KeyError.
if "neighbourhood" in df.columns:
    cities = sorted(df["neighbourhood"].unique())
    # Dict lookup per row instead of a linear list.index() scan per row.
    city_codes = {city: code for code, city in enumerate(cities)}
    df["neighbourhood"] = df["neighbourhood"].map(city_codes)
    print(df)

In [12]:
# Handicap: collapse the value into a yes/no flag (1 when above zero, else 0)
df["handicap"] = df["handicap"].gt(0).astype("int64")
print(df)

        gender age  scholarship  hypertension  diabetes  alcoholism  handicap  sms_received  no_show lag_days  dayofweek
0            1   5            0             0         0           0         0             1        0        6          2
1            0   4            0             1         0           0         0             1        1        6          0
2            1   4            1             0         0           0         0             1        1        6          4
3            1   5            0             1         1           0         0             1        0        6          4
4            1   6            0             1         1           0         0             1        0        6          4
...        ...  ..          ...           ...       ...         ...       ...           ...      ...      ...        ...
110522       0   5            0             0         0           0         0             0        0        0          2
110523       1   5            0 

In [13]:
# Names of the columns to treat as continuous features. When the datasets are
# stored, a column listed here gets -1 in "feature_cats" instead of its number
# of distinct values. Empty: every feature is treated as categorical.
cont_features = []

In [None]:
# Sort columns
# Alternative to the empty list: declare age and lag_days as continuous features.
# NOTE(review): presumably only meaningful when the "Normalize" cells were run
# instead of the "Group" cells — confirm before enabling.
cont_features = ["age", "lag_days"]
# sort_pd_column is a project helper; presumably it moves the listed columns to
# the front of the frame — TODO confirm against src.utils.
df = sort_pd_column(df, cont_features)
print(df)

In [14]:
# Reorder the columns with first=False; in the recorded output this leaves
# no_show as the last column. The dataset-creation code reads the class labels
# from the last column of df.
df = sort_pd_column(df, ["no_show"], first=False)
print(df)

        gender age  scholarship  hypertension  diabetes  alcoholism  handicap  sms_received lag_days  dayofweek  no_show
0            1   5            0             0         0           0         0             1        6          2        0
1            0   4            0             1         0           0         0             1        6          0        1
2            1   4            1             0         0           0         0             1        6          4        1
3            1   5            0             1         1           0         0             1        6          4        0
4            1   6            0             1         1           0         0             1        6          4        0
...        ...  ..          ...           ...       ...         ...       ...           ...      ...        ...      ...
110522       0   5            0             0         0           0         0             0        0          2        0
110523       1   5            0 

In [16]:
# Reduced feature set: discard the features listed below before the datasets
# are built.
dropped_features = [
    "gender",
    "scholarship",
    "hypertension",
    "diabetes",
    "alcoholism",
    "handicap",
]
df = df.drop(columns=dropped_features)
print(df)

       age  sms_received lag_days  dayofweek  no_show
0        5             1        6          2        0
1        4             1        6          0        1
2        4             1        6          4        1
3        5             1        6          4        0
4        6             1        6          4        0
...     ..           ...      ...        ...      ...
110522   5             0        0          2        0
110523   5             0        0          2        0
110524   4             0        0          2        0
110525   4             0        0          2        0
110526   4             0        0          2        0

[110521 rows x 5 columns]


## Create and Store Dataset

In [17]:
def create_and_store_dataset(idx, seed=None):
    """Split the prepared frame into train/validation sets and save them to disk.

    Wraps the notebook-level ``df`` in a ``CustomDataset``, draws a random 80/20
    train/validation split and stores both subsets, together with the metadata
    listed below, in a single ``.pt`` file.

    Stored keys:
        train_dataset, val_dataset: the two subsets returned by ``random_split``.
        feature_cats: for every column except the last, the number of distinct
            values, or -1 when the column is listed in ``cont_features``.
        class_size: number of distinct values in the last column.
        class_weights: ``n_train / (2 * count)`` for the 0-class and the
            1-class of the training subset, in that order.

    Args:
        idx: Identifier of the split; used as the suffix of the output file name.
        seed: Seed for the random split. When omitted and ``idx`` is an int,
            ``idx`` is used, so each index gives a different but reproducible
            split. When no seed can be derived, the split uses torch's global
            random state.
    """
    full_dataset = CustomDataset(df)
    train_size = int(0.8 * len(full_dataset))
    val_size = len(full_dataset) - train_size

    # Seed the split so that re-running the notebook recreates the same files.
    if seed is None and isinstance(idx, int):
        seed = idx
    split_kwargs = {} if seed is None else {"generator": torch.Generator().manual_seed(seed)}
    train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size], **split_kwargs)

    # assumes each item is a (features, label) pair with a 0/1 label tensor — TODO confirm against CustomDataset
    labels = np.array([x[1].numpy() for x in train_dataset])
    n_train = len(train_dataset)
    # Vectorised counts; summing along axis 0 keeps the shape that builtin sum() produced.
    positive_count = labels.sum(axis=0)
    negative_count = (1 - labels).sum(axis=0)

    output_path = f"../../data/healthcare_no_show/healthcare_datasets_reduced_{idx}.pt"
    # Create the target directory if needed so the save does not fail on a fresh checkout.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    torch.save(
        {
            "train_dataset": train_dataset,
            "val_dataset": val_dataset,
            "feature_cats": [len(df[x].unique()) if x not in cont_features else -1 for x in df.columns[:-1]],
            "class_size": len(df[df.columns[-1]].unique()),
            "class_weights": [n_train / (2 * negative_count), n_train / (2 * positive_count)]
        },
        output_path
    )

In [18]:
# Number of train/validation splits to generate; one file is written per split,
# with the loop index as the file suffix.
# NOTE(review): every call draws its own random 80/20 split, so the validation
# sets of different files may overlap — this is not a k-fold partition.
n_cross_val = 3
for i in range(n_cross_val):
    create_and_store_dataset(i)
    print(f"Dataset {i+1}/{n_cross_val} created and stored.")

Dataset 1/3 created and stored.
Dataset 2/3 created and stored.
Dataset 3/3 created and stored.
