# Healthcare No Show Machine Learning

Use machine learning technique to forecast no show.

In [55]:
import os
import numpy as np
import dotenv
dotenv.load_dotenv()

import sqlalchemy
import pandas as pd
pd.set_option('display.expand_frame_repr', False)

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset, random_split

## Load data from SQL to pandas

In [2]:
engine = sqlalchemy.create_engine(
    f"mysql+mysqlconnector://{os.getenv('USER')}:{os.getenv('PASSWORD')}@{os.getenv('HOST')}:{os.getenv('PORT')}/{os.getenv('DATABASE')}"
)

In [3]:
meta_data = sqlalchemy.MetaData()
meta_data.reflect(bind=engine)
HEALTHCARE = meta_data.tables['healthcare']
query = sqlalchemy.select(
    HEALTHCARE.c.gender,
    HEALTHCARE.c.scheduled_day,
    HEALTHCARE.c.appointment_day,
    HEALTHCARE.c.age,
    HEALTHCARE.c.neighbourhood,
    HEALTHCARE.c.scholarship,
    HEALTHCARE.c.hypertension,
    HEALTHCARE.c.diabetes,
    HEALTHCARE.c.alcoholism,
    HEALTHCARE.c.handicap,
    HEALTHCARE.c.sms_received,
    HEALTHCARE.c.no_show
)

In [4]:
df = pd.read_sql_query(query, engine)
print(df)

       gender       scheduled_day appointment_day   age   neighbourhood  scholarship  hypertension  diabetes  alcoholism  handicap  sms_received  no_show
0           F 2015-11-10 07:13:56      2016-05-04  51.0     RESISTÊNCIA            0             0         0           0         0             1        0
1           M 2015-12-03 08:17:28      2016-05-02  34.0      VILA RUBIM            0             1         0           0         0             1        1
2           F 2015-12-07 10:40:59      2016-06-03  27.0   SÃO CRISTÓVÃO            1             0         0           0         0             1        1
3           F 2015-12-07 10:42:42      2016-06-03  48.0         MARUÍPE            0             1         1           0         0             1        0
4           F 2015-12-07 10:43:01      2016-06-03  80.0   SÃO CRISTÓVÃO            0             1         1           0         0             1        0
...       ...                 ...             ...   ...             ...     

## Data cleaning

Data representations for machine learning:
1. patient_id
    - no need
2. appointment_id
    - no need
3. gender
    - encode as a binary with 0 as male and 1 as female
4. scheduled_day
    - calculate lag days with appointment_day, then group them into ["same day", "7 days", "14 days", "30 days", "60 days", "90 days", ">90 days"], and use one-hot encoding
5. appointment_day
    - see scheduled_day
    - Also, convert to day of week
6. age
    - remove negatives
    - group into ["Infant", "Toddler", "Child", "Teen", "Adult", "Middle", "Senior"] and use one-hot encoding
7. neighbourhood
    - one-hot encoding
8. scholarship
    - binary
9. hypertension
    - binary
10. diabetes
    - binary
11. alcoholism
    - binary
12. handicap
    - one-hot encoding
13. sms_received
    - binary
14. no_show (this is the output)
    - binary

In [5]:
df["gender"] = df["gender"].apply(lambda x: 1 if x == "F" else 0)
print(df)

        gender       scheduled_day appointment_day   age   neighbourhood  scholarship  hypertension  diabetes  alcoholism  handicap  sms_received  no_show
0            1 2015-11-10 07:13:56      2016-05-04  51.0     RESISTÊNCIA            0             0         0           0         0             1        0
1            0 2015-12-03 08:17:28      2016-05-02  34.0      VILA RUBIM            0             1         0           0         0             1        1
2            1 2015-12-07 10:40:59      2016-06-03  27.0   SÃO CRISTÓVÃO            1             0         0           0         0             1        1
3            1 2015-12-07 10:42:42      2016-06-03  48.0         MARUÍPE            0             1         1           0         0             1        0
4            1 2015-12-07 10:43:01      2016-06-03  80.0   SÃO CRISTÓVÃO            0             1         1           0         0             1        0
...        ...                 ...             ...   ...             .

In [6]:
df["scheduled_day"] = df["scheduled_day"].dt.normalize()
df["appointment_day"] = df["appointment_day"].dt.normalize()
df["lag_days"] = (df["appointment_day"] - df["scheduled_day"]).dt.days
print(df)

        gender scheduled_day appointment_day   age   neighbourhood  scholarship  hypertension  diabetes  alcoholism  handicap  sms_received  no_show  lag_days
0            1    2015-11-10      2016-05-04  51.0     RESISTÊNCIA            0             0         0           0         0             1        0       176
1            0    2015-12-03      2016-05-02  34.0      VILA RUBIM            0             1         0           0         0             1        1       151
2            1    2015-12-07      2016-06-03  27.0   SÃO CRISTÓVÃO            1             0         0           0         0             1        1       179
3            1    2015-12-07      2016-06-03  48.0         MARUÍPE            0             1         1           0         0             1        0       179
4            1    2015-12-07      2016-06-03  80.0   SÃO CRISTÓVÃO            0             1         1           0         0             1        0       179
...        ...           ...             ...  

In [7]:
lag_days_ranges = {
    "same day": 0.99,
    "7 days": 7,
    "14 days": 14,
    "30 days": 30,
    "60 days": 60,
    "90 days": 90,
    ">90 days": 200
}
labels, bins = zip(*lag_days_ranges.items())
df["lag_days_bins"] = pd.cut(df["lag_days"], bins=(0,)+bins, labels=labels, include_lowest=True)
print(df)

        gender scheduled_day appointment_day   age   neighbourhood  scholarship  hypertension  diabetes  alcoholism  handicap  sms_received  no_show  lag_days lag_days_bins
0            1    2015-11-10      2016-05-04  51.0     RESISTÊNCIA            0             0         0           0         0             1        0       176      >90 days
1            0    2015-12-03      2016-05-02  34.0      VILA RUBIM            0             1         0           0         0             1        1       151      >90 days
2            1    2015-12-07      2016-06-03  27.0   SÃO CRISTÓVÃO            1             0         0           0         0             1        1       179      >90 days
3            1    2015-12-07      2016-06-03  48.0         MARUÍPE            0             1         1           0         0             1        0       179      >90 days
4            1    2015-12-07      2016-06-03  80.0   SÃO CRISTÓVÃO            0             1         1           0         0          

In [8]:
lag_days_keys = list(lag_days_ranges.keys())
df["lag_days_bins"] = df["lag_days_bins"].apply(lambda x: lag_days_keys.index(x))
print(df)

        gender scheduled_day appointment_day   age   neighbourhood  scholarship  hypertension  diabetes  alcoholism  handicap  sms_received  no_show  lag_days lag_days_bins
0            1    2015-11-10      2016-05-04  51.0     RESISTÊNCIA            0             0         0           0         0             1        0       176             6
1            0    2015-12-03      2016-05-02  34.0      VILA RUBIM            0             1         0           0         0             1        1       151             6
2            1    2015-12-07      2016-06-03  27.0   SÃO CRISTÓVÃO            1             0         0           0         0             1        1       179             6
3            1    2015-12-07      2016-06-03  48.0         MARUÍPE            0             1         1           0         0             1        0       179             6
4            1    2015-12-07      2016-06-03  80.0   SÃO CRISTÓVÃO            0             1         1           0         0          

In [9]:
df["dayofweek"] = df["appointment_day"].dt.day_name()
print(df)

        gender scheduled_day appointment_day   age   neighbourhood  scholarship  hypertension  diabetes  alcoholism  handicap  sms_received  no_show  lag_days lag_days_bins  dayofweek
0            1    2015-11-10      2016-05-04  51.0     RESISTÊNCIA            0             0         0           0         0             1        0       176             6  Wednesday
1            0    2015-12-03      2016-05-02  34.0      VILA RUBIM            0             1         0           0         0             1        1       151             6     Monday
2            1    2015-12-07      2016-06-03  27.0   SÃO CRISTÓVÃO            1             0         0           0         0             1        1       179             6     Friday
3            1    2015-12-07      2016-06-03  48.0         MARUÍPE            0             1         1           0         0             1        0       179             6     Friday
4            1    2015-12-07      2016-06-03  80.0   SÃO CRISTÓVÃO            0 

In [10]:
dayofweek_all = {"Monday": 0, "Tuesday": 1, "Wednesday": 2, "Thursday": 3, "Friday": 4, "Saturday": 5}
df["dayofweek"] = df["dayofweek"].apply(lambda x: dayofweek_all[x])
print(df)

        gender scheduled_day appointment_day   age   neighbourhood  scholarship  hypertension  diabetes  alcoholism  handicap  sms_received  no_show  lag_days lag_days_bins  dayofweek
0            1    2015-11-10      2016-05-04  51.0     RESISTÊNCIA            0             0         0           0         0             1        0       176             6          2
1            0    2015-12-03      2016-05-02  34.0      VILA RUBIM            0             1         0           0         0             1        1       151             6          0
2            1    2015-12-07      2016-06-03  27.0   SÃO CRISTÓVÃO            1             0         0           0         0             1        1       179             6          4
3            1    2015-12-07      2016-06-03  48.0         MARUÍPE            0             1         1           0         0             1        0       179             6          4
4            1    2015-12-07      2016-06-03  80.0   SÃO CRISTÓVÃO            0 

In [11]:
df.drop(columns=["scheduled_day", "appointment_day", "lag_days"], inplace=True)
print(df)

        gender   age   neighbourhood  scholarship  hypertension  diabetes  alcoholism  handicap  sms_received  no_show lag_days_bins  dayofweek
0            1  51.0     RESISTÊNCIA            0             0         0           0         0             1        0             6          2
1            0  34.0      VILA RUBIM            0             1         0           0         0             1        1             6          0
2            1  27.0   SÃO CRISTÓVÃO            1             0         0           0         0             1        1             6          4
3            1  48.0         MARUÍPE            0             1         1           0         0             1        0             6          4
4            1  80.0   SÃO CRISTÓVÃO            0             1         1           0         0             1        0             6          4
...        ...   ...             ...          ...           ...       ...         ...       ...           ...      ...           ...    

In [12]:
df = df[df["age"] >= 0]
print(df)

        gender   age   neighbourhood  scholarship  hypertension  diabetes  alcoholism  handicap  sms_received  no_show lag_days_bins  dayofweek
0            1  51.0     RESISTÊNCIA            0             0         0           0         0             1        0             6          2
1            0  34.0      VILA RUBIM            0             1         0           0         0             1        1             6          0
2            1  27.0   SÃO CRISTÓVÃO            1             0         0           0         0             1        1             6          4
3            1  48.0         MARUÍPE            0             1         1           0         0             1        0             6          4
4            1  80.0   SÃO CRISTÓVÃO            0             1         1           0         0             1        0             6          4
...        ...   ...             ...          ...           ...       ...         ...       ...           ...      ...           ...    

In [13]:
age_ranges = {
    "Infant": 1,
    "Toddler": 4,
    "Child": 12,
    "Teen": 19,
    "Adult": 39,
    "Middle": 59,
    "Senior": 120
}
labels, bins = zip(*age_ranges.items())
df["age_group"] = pd.cut(df["age"], bins=(0,)+bins, labels=labels, include_lowest=True)
print(df)

        gender   age   neighbourhood  scholarship  hypertension  diabetes  alcoholism  handicap  sms_received  no_show lag_days_bins  dayofweek age_group
0            1  51.0     RESISTÊNCIA            0             0         0           0         0             1        0             6          2    Middle
1            0  34.0      VILA RUBIM            0             1         0           0         0             1        1             6          0     Adult
2            1  27.0   SÃO CRISTÓVÃO            1             0         0           0         0             1        1             6          4     Adult
3            1  48.0         MARUÍPE            0             1         1           0         0             1        0             6          4    Middle
4            1  80.0   SÃO CRISTÓVÃO            0             1         1           0         0             1        0             6          4    Senior
...        ...   ...             ...          ...           ...       ...   

In [14]:
age_keys = list(age_ranges.keys())
df["age_group"] = df["age_group"].apply(lambda x: age_keys.index(x))
print(df)

        gender   age   neighbourhood  scholarship  hypertension  diabetes  alcoholism  handicap  sms_received  no_show lag_days_bins  dayofweek age_group
0            1  51.0     RESISTÊNCIA            0             0         0           0         0             1        0             6          2         5
1            0  34.0      VILA RUBIM            0             1         0           0         0             1        1             6          0         4
2            1  27.0   SÃO CRISTÓVÃO            1             0         0           0         0             1        1             6          4         4
3            1  48.0         MARUÍPE            0             1         1           0         0             1        0             6          4         5
4            1  80.0   SÃO CRISTÓVÃO            0             1         1           0         0             1        0             6          4         6
...        ...   ...             ...          ...           ...       ...   

In [15]:
df.drop(columns=["age"], inplace=True)
print(df)

        gender   neighbourhood  scholarship  hypertension  diabetes  alcoholism  handicap  sms_received  no_show lag_days_bins  dayofweek age_group
0            1     RESISTÊNCIA            0             0         0           0         0             1        0             6          2         5
1            0      VILA RUBIM            0             1         0           0         0             1        1             6          0         4
2            1   SÃO CRISTÓVÃO            1             0         0           0         0             1        1             6          4         4
3            1         MARUÍPE            0             1         1           0         0             1        0             6          4         5
4            1   SÃO CRISTÓVÃO            0             1         1           0         0             1        0             6          4         6
...        ...             ...          ...           ...       ...         ...       ...           ...      ...

In [16]:
cities = sorted(df["neighbourhood"].unique())
df["neighbourhood"] = df["neighbourhood"].apply(lambda x: cities.index(x))
print(df)

        gender  neighbourhood  scholarship  hypertension  diabetes  alcoholism  handicap  sms_received  no_show lag_days_bins  dayofweek age_group
0            1             59            0             0         0           0         0             1        0             6          2         5
1            0             80            0             1         0           0         0             1        1             6          0         4
2            1             75            1             0         0           0         0             1        1             6          4         4
3            1             44            0             1         1           0         0             1        0             6          4         5
4            1             75            0             1         1           0         0             1        0             6          4         6
...        ...            ...          ...           ...       ...         ...       ...           ...      ...       

In [17]:
df = pd.get_dummies(df, columns=["neighbourhood", "lag_days_bins", "dayofweek", "age_group"], dtype=int)
print(df)

        gender  scholarship  hypertension  diabetes  alcoholism  handicap  sms_received  no_show  neighbourhood_0  neighbourhood_1  ...  dayofweek_3  dayofweek_4  dayofweek_5  age_group_0  age_group_1  age_group_2  age_group_3  age_group_4  age_group_5  age_group_6
0            1            0             0         0           0         0             1        0                0                0  ...            0            0            0            0            0            0            0            0            1            0
1            0            0             1         0           0         0             1        1                0                0  ...            0            0            0            0            0            0            0            1            0            0
2            1            1             0         0           0         0             1        1                0                0  ...            0            1            0            0            0  

In [18]:
df = df[[c for c in df if c not in ["no_show"]] + ["no_show"]]
print(df)

        gender  scholarship  hypertension  diabetes  alcoholism  handicap  sms_received  neighbourhood_0  neighbourhood_1  neighbourhood_2  ...  dayofweek_4  dayofweek_5  age_group_0  age_group_1  age_group_2  age_group_3  age_group_4  age_group_5  age_group_6  no_show
0            1            0             0         0           0         0             1                0                0                0  ...            0            0            0            0            0            0            0            1            0        0
1            0            0             1         0           0         0             1                0                0                0  ...            0            0            0            0            0            0            1            0            0        1
2            1            1             0         0           0         0             1                0                0                0  ...            1            0            0        

## Training a model

In [19]:
class CustomDataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        features = torch.tensor(row[:-1].values, dtype=torch.float32)
        label = torch.tensor(row[-1], dtype=torch.float32)
        return features, label

In [20]:
full_dataset = CustomDataset(df)
train_size = int(0.8 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [49]:
class TransformerClassifier(nn.Module):
    """A transformer-based classifier for sequence data.
    
    This model uses a transformer encoder architecture followed by a classification layer
    to perform sequence classification tasks.
    
    Args:
        input_dim (int): Dimension of input features
        num_classes (int): Number of output classes
        d_model (int, optional): Dimension of transformer model. Defaults to 512.
        nhead (int, optional): Number of attention heads. Defaults to 8.
        num_encoder_layers (int, optional): Number of transformer encoder layers. Defaults to 3.
        dim_feedforward (int, optional): Dimension of feedforward network. Defaults to 2048.
        dropout (float, optional): Dropout rate. Defaults to 0.1.
    """
    
    def __init__(
            self, 
            input_dim: int, 
            num_classes: int, 
            d_model: int = 512, 
            nhead: int = 8, 
            num_encoder_layers: int = 3, 
            dim_feedforward: int = 2048, 
            dropout: float = 0.1
    ) -> None:
        super().__init__()
        
        # Input projection layer
        self.input_projection = nn.Linear(input_dim, d_model)
        nn.init.xavier_normal_(self.input_projection.weight)
        nn.init.constant_(self.input_projection.bias, 0.1)
        
        # Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=num_encoder_layers
        )
        for name, param in self.transformer_encoder.named_parameters():
            if 'weight' in name:
                nn.init.xavier_normal_(param.unsqueeze(0))
        
        # Output classifier
        self.classifier = nn.Linear(d_model, num_classes, bias=False)
        nn.init.xavier_normal_(self.classifier.weight)
        
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Forward pass of the model.
        
        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, seq_length, input_dim)
            
        Returns:
            torch.Tensor: Output tensor of shape (batch_size, num_classes)
        """
        # Project input to d_model dimensions
        x = self.input_projection(x)
        x = nn.ReLU()(x)
        
        # Apply transformer encoder
        x = self.transformer_encoder(x)
        
        # Classification layer
        output = self.classifier(x)
        return output

In [50]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_features = len(df.columns) - 1
n_classes = 1
model = TransformerClassifier(
    input_dim=n_features,
    num_classes=n_classes,
    num_encoder_layers=6
).to(device)
criterion = nn.BCEWithLogitsLoss()

In [51]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30, 60, 90], gamma=0.1)
num_epochs = 100
running_loss = 0.0
for epoch in range(num_epochs):
    model.train()
    for features, labels in train_loader:
        features, labels = features.to(device), labels.to(device)
        
        optimizer.zero_grad()
        outputs = model(features.unsqueeze(1))  # Add sequence dimension
        loss = criterion(outputs.squeeze(), labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    scheduler.step()
    
    model.eval()
    with torch.no_grad():
        val_loss = 0.0
        for features, labels in val_loader:
            features, labels = features.to(device), labels.to(device)
            outputs = model(features.unsqueeze(1))
            val_loss += criterion(outputs.squeeze(), labels).item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}")
    running_loss = 0.0

  label = torch.tensor(row[-1], dtype=torch.float32)


Epoch [1/100], Loss: 0.5051, Val Loss: 0.4861
Epoch [2/100], Loss: 0.4974, Val Loss: 0.4990
Epoch [3/100], Loss: 0.5046, Val Loss: 0.4980
Epoch [4/100], Loss: 0.4758, Val Loss: 0.4529
Epoch [5/100], Loss: 0.4533, Val Loss: 0.4439
Epoch [6/100], Loss: 0.4493, Val Loss: 0.4451
Epoch [7/100], Loss: 0.4487, Val Loss: 0.4455
Epoch [8/100], Loss: 0.4472, Val Loss: 0.4434
Epoch [9/100], Loss: 0.4455, Val Loss: 0.4417
Epoch [10/100], Loss: 0.4441, Val Loss: 0.4417
Epoch [11/100], Loss: 0.4446, Val Loss: 0.4464
Epoch [12/100], Loss: 0.4441, Val Loss: 0.4402
Epoch [13/100], Loss: 0.4412, Val Loss: 0.4405
Epoch [14/100], Loss: 0.4409, Val Loss: 0.4445
Epoch [15/100], Loss: 0.4394, Val Loss: 0.4401
Epoch [16/100], Loss: 0.4403, Val Loss: 0.4439
Epoch [17/100], Loss: 0.4384, Val Loss: 0.4404
Epoch [18/100], Loss: 0.4360, Val Loss: 0.4400
Epoch [19/100], Loss: 0.4347, Val Loss: 0.4402
Epoch [20/100], Loss: 0.4329, Val Loss: 0.4459
Epoch [21/100], Loss: 0.4318, Val Loss: 0.4410
Epoch [22/100], Loss: 

In [None]:
accuracies = []
model.eval()
with torch.no_grad():
    for features, labels in val_loader:
        features = features.to(device)
        outputs = model(features.unsqueeze(1))
        predictions = torch.sigmoid(outputs.squeeze()).cpu().numpy()
        accuracies.append((predictions > 0.5) == labels.numpy())

print(f"Validation Accuracy: {np.concat(accuracies, axis=0).mean():.4f}")

  label = torch.tensor(row[-1], dtype=torch.float32)


Validation Accuracy: 0.7720


In [None]:
torch.save(model.state_dict(), "transformer_classifier.pth")