In [14]:
# Full improved training code using PyTorch with best practices for binary classification

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

# Load and preprocess the dataset
df = pd.read_csv("dataset.csv")
df.drop(columns=["Id"], inplace=True)

# Replace zeros with NaN in key columns
fields_with_invalid_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for col in fields_with_invalid_zeros:
    df[col] = df[col].apply(lambda x: np.nan if x == 0 else x)

# Impute missing values using median
imputer = SimpleImputer(strategy='median')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

# Split features and labels
X = df_imputed.drop(columns=['Outcome'])
y = df_imputed['Outcome']

# Normalize features
scaler = StandardScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Train-test split BEFORE SMOTE to avoid data leakage
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Apply SMOTE only to training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Combine resampled data into DataFrame
train_df = pd.DataFrame(X_train_resampled, columns=X.columns)
train_df['Outcome'] = y_train_resampled.values

test_df = pd.DataFrame(X_test, columns=X.columns)
test_df['Outcome'] = y_test.values

# Define custom dataset class
class CustomDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        features = torch.tensor(row[:-1].values, dtype=torch.float32)
        label = torch.tensor(row[-1], dtype=torch.float32).unsqueeze(0)
        return features, label

# Define the neural network (no sigmoid at the end)
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(8, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 1)  # No Sigmoid here
        )

    def forward(self, x):
        return self.model(x)

# Training and testing functions
def train(data_loader, model, loss_fn, optimizer, device):
    model.train()
    for X, y in data_loader:
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

def test(data_loader, model, loss_fn, device):
    model.eval()
    total_loss = 0
    correct = 0
    size = len(data_loader.dataset)

    with torch.no_grad():
        for X, y in data_loader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            pred_prob = torch.sigmoid(pred)  # For accuracy calculation
            predicted = (pred_prob >= 0.5).float()
            total_loss += loss_fn(pred, y).item()
            correct += (predicted == y).sum().item()

    accuracy = correct / size
    print(f"Test Accuracy: {accuracy:.4f}, Avg Loss: {total_loss / len(data_loader):.6f}")

# Prepare DataLoaders
train_dataset = CustomDataset(train_df)
test_dataset = CustomDataset(test_df)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Initialize model, optimizer, and loss function
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NeuralNetwork().to(device)
loss_fn = nn.BCEWithLogitsLoss()  # Use BCEWithLogitsLoss for logits output
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 200
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}")
    train(train_loader, model, loss_fn, optimizer, device)
    test(test_loader, model, loss_fn, device)
print("Training complete.")



Epoch 1
Test Accuracy: 0.7635, Avg Loss: 0.621034
Epoch 2
Test Accuracy: 0.7491, Avg Loss: 0.542599
Epoch 3


  label = torch.tensor(row[-1], dtype=torch.float32).unsqueeze(0)


Test Accuracy: 0.7581, Avg Loss: 0.480487
Epoch 4
Test Accuracy: 0.7834, Avg Loss: 0.477858
Epoch 5
Test Accuracy: 0.7816, Avg Loss: 0.464499
Epoch 6
Test Accuracy: 0.7888, Avg Loss: 0.461093
Epoch 7
Test Accuracy: 0.7888, Avg Loss: 0.447925
Epoch 8
Test Accuracy: 0.7870, Avg Loss: 0.450978
Epoch 9
Test Accuracy: 0.7834, Avg Loss: 0.456832
Epoch 10
Test Accuracy: 0.7888, Avg Loss: 0.444314
Epoch 11
Test Accuracy: 0.7852, Avg Loss: 0.439645
Epoch 12
Test Accuracy: 0.7870, Avg Loss: 0.444747
Epoch 13
Test Accuracy: 0.7834, Avg Loss: 0.437236
Epoch 14
Test Accuracy: 0.7924, Avg Loss: 0.431001
Epoch 15
Test Accuracy: 0.7960, Avg Loss: 0.436724
Epoch 16
Test Accuracy: 0.8051, Avg Loss: 0.424957
Epoch 17
Test Accuracy: 0.8051, Avg Loss: 0.427555
Epoch 18
Test Accuracy: 0.8069, Avg Loss: 0.423481
Epoch 19
Test Accuracy: 0.8014, Avg Loss: 0.427697
Epoch 20
Test Accuracy: 0.8105, Avg Loss: 0.420507
Epoch 21
Test Accuracy: 0.8177, Avg Loss: 0.408407
Epoch 22
Test Accuracy: 0.8195, Avg Loss: 0.40