In [33]:
# Take in a raw CSV file path and a batch size; return the train, validation,
# and test DataLoader objects.

import csv
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
# from sklearn.model_selection import KFold

def load_data(file_path, batch_size):
    """Load a labelled CSV file and build train/validation/test DataLoaders.

    The first row of the file is treated as a header and skipped. In every
    remaining row the first column is parsed as the label and all other
    columns as features; every value must be parseable by ``float``. Blank
    rows are ignored.

    Features are z-score normalized per column. A column whose standard
    deviation is zero or undefined (constant feature, or a single data row)
    is only mean-centered, so it never produces NaN/inf values.

    Args:
        file_path: Path to the CSV file.
        batch_size: Batch size used for all three DataLoaders.

    Returns:
        A ``(train_loader, val_loader, test_loader)`` tuple built from a
        seeded 80/10/10 random split. Only the training loader shuffles.

    Raises:
        ValueError: If the file contains no data rows (or a value is not
            parseable as a float).
    """
    inputs = []
    labels = []

    # newline="" is how the csv module documentation says files should be opened.
    with open(file_path, "r", newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip header row; tolerate a completely empty file
        for row in reader:
            if not row:
                continue  # csv.reader yields [] for blank lines
            labels.append(float(row[0]))
            inputs.append([float(val) for val in row[1:]])

    if not inputs:
        raise ValueError(f"No data rows found in {file_path!r}")

    # Convert inputs and labels to tensors
    inputs_tensor = torch.tensor(inputs, dtype=torch.float32)
    labels_tensor = torch.tensor(labels, dtype=torch.float32)

    # Z-score normalization, per feature column.
    # NOTE: the statistics are computed over all rows, i.e. before the
    # train/val/test split below.
    means = inputs_tensor.mean(dim=0)
    stds = inputs_tensor.std(dim=0)
    # Replace zero/NaN deviations with 1 so those columns are only centered
    # (NaN > 0 is False, so NaN is replaced as well).
    safe_stds = torch.where(stds > 0, stds, torch.ones_like(stds))
    inputs_normalized = (inputs_tensor - means) / safe_stds

    # Combine inputs and labels into a TensorDataset
    dataset = TensorDataset(inputs_normalized, labels_tensor)

    total_size = len(dataset)
    train_size = int(0.8 * total_size)
    val_size = int(0.1 * total_size)
    test_size = total_size - train_size - val_size  # absorbs rounding remainder

    # Split the dataset with a fixed seed so the partition is reproducible
    train_dataset, val_dataset, test_dataset = random_split(
        dataset, [train_size, val_size, test_size],
        generator=torch.Generator().manual_seed(42)
    )

    # Create DataLoaders; only the training data is shuffled
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, val_loader, test_loader

In [34]:
import torch.nn as nn

# Define the neural network model
class DiabetesNN(nn.Module):
    """Feed-forward network with two hidden layers and a single output unit.

    ``forward`` returns the raw output of the final linear layer (no sigmoid
    is applied), so the result is a logit suitable for ``BCEWithLogitsLoss``.

    Args:
        input_dim: Number of input features.
        hidden_size1: Width of the first hidden layer.
        hidden_size2: Width of the second hidden layer.
        activation_fn: Name of the hidden-layer activation; one of
            ``"sigmoid"``, ``"tanh"`` or ``"relu"``.

    Raises:
        ValueError: If ``activation_fn`` is not one of the supported names.
    """

    # Supported activation names mapped to the function applied after each hidden layer.
    _ACTIVATIONS = {
        "sigmoid": torch.sigmoid,
        "tanh": torch.tanh,
        "relu": torch.relu,
    }

    def __init__(self, input_dim=21, hidden_size1=64, hidden_size2=32, activation_fn="relu"):
        super().__init__()
        # Fail at construction time instead of leaving `self.activation` unset,
        # which would only surface later as an AttributeError inside forward().
        if activation_fn not in self._ACTIVATIONS:
            supported = ", ".join(sorted(self._ACTIVATIONS))
            raise ValueError(
                f"Unsupported activation_fn {activation_fn!r}; expected one of: {supported}"
            )
        self.layer1 = nn.Linear(input_dim, hidden_size1)
        self.layer2 = nn.Linear(hidden_size1, hidden_size2)
        self.output = nn.Linear(hidden_size2, 1)
        self.activation = self._ACTIVATIONS[activation_fn]
        # Not used by forward(); kept so code that references `model.sigmoid` keeps working.
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the output-layer logits for the input batch ``x``."""
        x = self.activation(self.layer1(x))
        x = self.activation(self.layer2(x))
        x = self.output(x)
        return x

In [35]:
import torch.optim as optim

def train_model(model, train_loader, num_epochs, learning_rate):
    """Train ``model`` in place with SGD and ``BCEWithLogitsLoss``.

    The model is switched to training mode first, so the function behaves the
    same even if the model was previously put into eval mode.

    Args:
        model: Module whose output for a batch can be squeezed along dim 1 to
            match the shape of the targets.
        train_loader: Iterable yielding ``(data, target)`` batches.
        num_epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate passed to ``optim.SGD``.

    Returns:
        A list with the mean batch loss of each epoch, in order.

    Raises:
        ValueError: If ``train_loader`` yields no batches during an epoch.
    """
    # Use BCEWithLogitsLoss for binary classification
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    model.train()  # undo any earlier model.eval() before updating weights

    epoch_losses = []
    for epoch in range(num_epochs):
        running_loss = 0.0
        num_batches = 0
        for data, target in train_loader:
            target = target.float()  # BCEWithLogitsLoss needs float targets
            optimizer.zero_grad()  # Reset gradients
            output = model(data).squeeze(1)  # Forward pass, squeeze for shape match
            loss = criterion(output, target)

            loss.backward()  # Backpropagation
            optimizer.step()  # Update parameters based on gradients

            running_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            # Avoid a ZeroDivisionError with no hint about the cause.
            raise ValueError("train_loader yielded no batches; nothing to train on")

        mean_loss = running_loss / num_batches
        epoch_losses.append(mean_loss)

        # Print epoch loss
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {mean_loss:.4f}")

    return epoch_losses


In [39]:
# Train the model on the training split.

file_path = "diabetes_binary_health_indicators_BRFSS2015.csv"

# Hyperparams
hidden_size1 = 128 # Try 128, 256
hidden_size2 = 64 # Try 64, 128
batch_size = 32 # Try 64, 128
activation_fn = "relu"
num_epochs = 100
learning_rate = 0.01 # Try lower

# Seed the global RNG so weight initialization and the training-batch shuffle
# order are the same on every Restart & Run All.
torch.manual_seed(42)

train_loader, val_loader, test_loader = load_data(file_path, batch_size)
model = DiabetesNN(hidden_size1=hidden_size1, hidden_size2=hidden_size2, activation_fn=activation_fn)

train_model(model, train_loader, num_epochs, learning_rate)

(tensor([[ 1.1537,  1.1653,  0.1969, -0.5118,  1.1209, -0.2056, -0.3225,  0.5673,
          0.7594,  0.4821, -0.2440,  0.2269, -0.3032, -0.4786, -0.4296, -0.4866,
         -0.4497, -0.8870,  1.6266, -1.0656, -0.5088],
        [ 1.1537,  1.1653,  0.1969,  0.6987, -0.8921, -0.2056, -0.3225,  0.5673,
          0.7594,  0.4821, -0.2440,  0.2269, -0.3032, -0.4786, -0.4296,  2.1516,
         -0.4497,  1.1274,  0.6443,  0.9633,  0.4568],
        [ 1.1537, -0.8582,  0.1969,  0.2448, -0.8921, -0.2056, -0.3225,  0.5673,
         -1.3169,  0.4821, -0.2440,  0.2269, -0.3032, -0.4786, -0.4296, -0.4866,
         -0.4497, -0.8870, -1.6476, -1.0656,  0.4568],
        [ 1.1537,  1.1653,  0.1969, -0.3605,  1.1209, -0.2056, -0.3225,  0.5673,
          0.7594,  0.4821, -0.2440,  0.2269, -0.3032, -1.4145, -0.4296, -0.4866,
         -0.4497, -0.8870,  0.9717,  0.9633, -0.9917],
        [-0.8668, -0.8582,  0.1969, -0.3605, -0.8921, -0.2056, -0.3225,  0.5673,
          0.7594,  0.4821, -0.2440,  0.2269, -0.30

KeyboardInterrupt: 

In [37]:
from sklearn.metrics import precision_score, recall_score, f1_score


def evaluate_model(model, test_loader, threshold=0.5):
    """Evaluate a binary classifier and print/return its metrics.

    The model is put into eval mode and run without gradient tracking. Its
    outputs are passed through a sigmoid and compared against ``threshold``
    to obtain 0/1 predictions.

    Args:
        model: Module whose output for a batch can be squeezed along dim 1.
        test_loader: Iterable yielding ``(data, target)`` batches.
        threshold: Probability at or above which a sample is predicted as 1.

    Returns:
        A ``(accuracy, precision, recall, f1)`` tuple.

    Raises:
        ValueError: If ``test_loader`` yields no batches.
    """
    model.eval()
    pred_batches = []
    target_batches = []
    with torch.no_grad():
        for data, target in test_loader:
            output = model(data)  # Raw logits
            probs = torch.sigmoid(output).squeeze(1)  # Convert logits to probabilities
            preds = (probs >= threshold).long()  # Apply threshold to get binary predictions
            # Keep whole batches (moved to CPU) instead of extending a Python
            # list one scalar at a time.
            pred_batches.append(preds.cpu())
            target_batches.append(target.cpu())

    if not pred_batches:
        # Avoid a ZeroDivisionError with no hint about the cause.
        raise ValueError("test_loader yielded no batches; nothing to evaluate")

    all_preds = torch.cat(pred_batches)
    all_targets = torch.cat(target_batches)

    # Calculate accuracy
    correct = (all_preds == all_targets).sum().item()
    total = len(all_targets)
    accuracy = correct / total

    y_true = all_targets.numpy()
    y_pred = all_preds.numpy()
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

    return accuracy, precision, recall, f1


In [38]:
# Score the trained model on the validation loader with a 0.3 decision threshold.
threshold = 0.3
evaluate_model(model, val_loader, threshold=threshold)

Accuracy: 0.8631
Precision: 0.5288
Recall: 0.2091
F1-Score: 0.2997


(0.8631346578366446,
 np.float64(0.5288256227758007),
 np.float64(0.2091190543202927),
 np.float64(0.299717628075837))