These are the standard models used in the publication "Generalizability Assessment of AI Models Across Hospitals: A Comparative Study in Low-Middle Income and High Income Countries" (https://www.medrxiv.org/content/10.1101/2023.11.05.23298109v1).

Data from OUH studied here are available from the Infections in Oxfordshire Research Database (https://oxfordbrc.nihr.ac.uk/research-themes/modernising-medical-microbiology-and-big-infection-diagnostics/infections-in-oxfordshire-research-database-iord/), subject to an application meeting the ethical and governance requirements of the Database. Data from UHB, PUH and BH are available on reasonable request to the respective trusts, subject to HRA requirements.

Data from HTD and NHTD are available from the CCAA Vietnam Data Access Committee, subject to an application meeting the ethical and governance requirements.

## Load dataset

In [1]:
#This script automatically loads the Adult (Census) dataset (replace with your own data)

import pandas as pd

# Define column names
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation',
                'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

# Load dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
df = pd.read_csv(url, header=None, names=column_names)

# Display the first few rows of the dataset
print(df.head())

   age          workclass  fnlwgt   education  education-num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital-status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital-gain  capital-loss  hours-per-week  native-country  income  
0          2174             0              40   United-States   <=50

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Encode categorical variables
label_encoders = {}
for column in df.select_dtypes(include=['object']).columns:
    label_encoders[column] = LabelEncoder()
    df[column] = label_encoders[column].fit_transform(df[column])

# Define features (X) and target variable (y)
X = df.drop('income', axis=1)  # Features
y = df['income']  # Target variable

# Normalize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Print the shapes of the training and testing sets
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (26048, 14) (26048,)
Testing set shape: (6513, 14) (6513,)


In [3]:
from sklearn.metrics import roc_auc_score

## Logistic Regression

In [4]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict_proba(X_test)[:, 1]

# Calculate accuracy
auc = roc_auc_score(y_test, y_pred)
print("AUC:", auc)

AUC: 0.85676430940089


## XGBoost

In [5]:
from xgboost import XGBClassifier

model = XGBClassifier()

model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict_proba(X_test)[:, 1]

# Calculate accuracy
auc = roc_auc_score(y_test, y_pred)
print("AUC:", auc)

AUC: 0.9272358853470467


## Neural Network

In [6]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

X = torch.tensor(np.array(X_train), dtype=torch.float32)

y_train = np.array(y_train).reshape(-1, 1)

# define the model
num_hidden_nodes = 10
class Classifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden1 = nn.Linear(X_train.shape[1], num_hidden_nodes)
        self.act1 = nn.ReLU()
        self.output = nn.Linear(num_hidden_nodes, 1)
        self.act_output = nn.Sigmoid()
 
    def forward(self, x):
        x = self.act1(self.hidden1(x))
        x = self.act_output(self.output(x))
        return x

model = Classifier()
print(model)


Classifier(
  (hidden1): Linear(in_features=14, out_features=10, bias=True)
  (act1): ReLU()
  (output): Linear(in_features=10, out_features=1, bias=True)
  (act_output): Sigmoid()
)


In [7]:
n_epochs = 10
batch_size = 16

In [8]:
# train the model
loss_fn   = nn.BCELoss()  # binary cross entropy

optimizer = optim.Adam(model.parameters(), lr=0.01)

for epoch in range(n_epochs):
    #print(ncr_weight)
    for i in range(0, len(X_train), batch_size):
        Xbatch = X[i:i+batch_size]
        ybatch = y_train[i:i+batch_size]
        Xbatch=torch.tensor(Xbatch)
        ybatch=torch.tensor(ybatch)
        y_pred = model(Xbatch)
        y_pred = y_pred[:,0]
        loss = loss_fn(y_pred.to(torch.float), ybatch[:,0].to(torch.float))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f'Finished epoch {epoch}, latest loss {loss}')

torch.save({'epoch': epoch,
            'model_state_dict': model.state_dict(),'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,}, 'model_checkpoint.pt')

  Xbatch=torch.tensor(Xbatch)
  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Finished epoch 0, latest loss 0.2388536036014557
Finished epoch 1, latest loss 0.22713789343833923
Finished epoch 2, latest loss 0.22442805767059326
Finished epoch 3, latest loss 0.2101062834262848
Finished epoch 4, latest loss 0.1991681009531021
Finished epoch 5, latest loss 0.20513883233070374
Finished epoch 6, latest loss 0.2097243368625641
Finished epoch 7, latest loss 0.2076314091682434
Finished epoch 8, latest loss 0.20745497941970825
Finished epoch 9, latest loss 0.20936359465122223


In [9]:
X_test = torch.tensor(X_test, dtype=torch.float32)
#y_test = torch.tensor(y_test, dtype=torch.float32).reshape(-1, 1)
y_pred = model(X_test).detach().numpy()[:,0]

# Calculate accuracy
auc = roc_auc_score(y_test, y_pred)
print("AUC:", auc)

AUC: 0.9045867775939923
