In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

def robust_normalize(tensor, clip_factor=3.0, eps=1e-8):
    """Clip outliers with an IQR fence, then min-max scale to [-1, 1].

    All statistics are computed along ``dim=0``, i.e. independently for
    each column of a 2-D ``(samples, features)`` tensor.

    Args:
        tensor: Floating-point tensor to normalize.
        clip_factor: Width of the fence in IQR units. Values outside
            ``[q1 - clip_factor * iqr, q3 + clip_factor * iqr]`` are clamped
            to the fence. Defaults to 3, the original hard-coded value.
        eps: Small constant added to the range so a constant column does not
            divide by zero.

    Returns:
        A tensor of the same shape with values in ``[-1, 1]``.

    Note:
        A column whose IQR is zero is clamped to a single value and is
        therefore mapped to -1 everywhere.
    """
    # One quantile call (one sort) yields both quartiles; the result has a
    # leading dimension of size 2 that unpacks into q1 and q3.
    q1, q3 = torch.quantile(tensor, tensor.new_tensor([0.25, 0.75]), dim=0)
    iqr = q3 - q1

    lower_bound = q1 - clip_factor * iqr
    upper_bound = q3 + clip_factor * iqr
    clipped = torch.clamp(tensor, lower_bound, upper_bound)

    min_vals = clipped.amin(dim=0)
    max_vals = clipped.amax(dim=0)
    # Map [min, max] -> [0, 1] -> [-1, 1].
    return 2 * (clipped - min_vals) / (max_vals - min_vals + eps) - 1

# Boolean checker for NaN or Inf values
def validate_data(features, labels):
    """Print summary statistics and report whether the features are finite.

    Prints mean/std/min/max and NaN/Inf flags for ``features``, then the
    unique values and per-class counts of ``labels``.

    Returns:
        ``True`` when ``features`` contains neither NaN nor Inf, else ``False``.
    """
    print("\nData Validation:")
    print("Feature statistics:")
    summary_reducers = (
        ("Mean", torch.mean),
        ("Std", torch.std),
        ("Min", torch.min),
        ("Max", torch.max),
    )
    for title, reducer in summary_reducers:
        print(f"{title}: {reducer(features):.4f}")

    # Compute each flag once; it is both printed and used for the result.
    has_nan = torch.isnan(features).any()
    print(f"NaN values: {has_nan}")
    has_inf = torch.isinf(features).any()
    print(f"Inf values: {has_inf}")

    print("\nLabel statistics:")
    print(f"Unique values: {torch.unique(labels)}")
    print(f"Class distribution: {torch.bincount(labels)}")

    return not (has_nan or has_inf)

class CustomDataset(Dataset):
    """Map-style dataset that pairs each feature entry with its label."""

    def __init__(self, features, labels):
        # Both containers are stored as given: no copy and no length check.
        self.features, self.labels = features, labels

    def __len__(self):
        """Return the number of samples, taken from the feature container."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        sample = self.features[idx]
        target = self.labels[idx]
        return sample, target

class BinaryClassificationModel(nn.Module):
    """Small MLP producing a sigmoid probability for binary classification.

    Architecture: ``input_dim -> 32 -> 16 -> 1``. Each hidden stage is
    Linear, LayerNorm, LeakyReLU(0.1), Dropout; the output stage is
    Linear followed by Sigmoid. Linear weights use Xavier-uniform
    initialization and Linear biases start at zero.
    """

    def __init__(self, input_dim):
        super().__init__()

        # (hidden width, dropout probability) for each hidden stage.
        hidden_stages = ((32, 0.2), (16, 0.1))

        stack = []
        in_width = input_dim
        for out_width, drop_prob in hidden_stages:
            stack.append(nn.Linear(in_width, out_width))
            stack.append(nn.LayerNorm(out_width))
            stack.append(nn.LeakyReLU(0.1))
            stack.append(nn.Dropout(drop_prob))
            in_width = out_width
        stack.append(nn.Linear(in_width, 1))
        stack.append(nn.Sigmoid())
        self.fc = nn.Sequential(*stack)

        # Re-initialize every Linear layer, in network order.
        for layer in self.fc:
            if not isinstance(layer, nn.Linear):
                continue
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Run ``x`` through the network and return the sigmoid outputs."""
        return self.fc(x)

def train_model(model, dataloader, num_epochs=100):
    """Train ``model`` in place with BCE loss, AdamW and early stopping.

    Uses ``ReduceLROnPlateau`` on the mean epoch loss, clips gradients to a
    norm of 0.5, and stops after 10 consecutive epochs without improvement
    in the mean training loss. Progress is printed every 5 epochs.

    Args:
        model: Module whose output is a probability in ``[0, 1]`` with a
            trailing dimension of size 1 (as required by ``nn.BCELoss``).
        dataloader: Iterable yielding ``(features, labels)`` batches; labels
            are unsqueezed to a column and cast to float.
        num_epochs: Maximum number of epochs to run.

    Returns:
        None. The model's parameters are updated in place.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    criterion = nn.BCELoss()
    optimizer = optim.AdamW(model.parameters(), lr=0.0003, weight_decay=0.01)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

    best_loss = float('inf')
    patience_counter = 0
    max_patience = 10

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0.0
        num_batches = 0
        max_grad_norm = 0.0

        for batch_features, batch_labels in dataloader:
            predictions = model(batch_features)
            loss = criterion(predictions, batch_labels.unsqueeze(1).float())

            # Check before backward()/step(): once a NaN loss has been
            # back-propagated and applied, the weights are already corrupted.
            if torch.isnan(loss):
                print("NaN loss detected! Stopping training...")
                return

            optimizer.zero_grad()
            loss.backward()

            # clip_grad_norm_ returns the total norm measured *before*
            # clipping, so the reported maximum can exceed max_norm.
            current_grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
            max_grad_norm = max(max_grad_norm, current_grad_norm.item())

            optimizer.step()

            epoch_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            # Without this guard the average below fails with an opaque
            # ZeroDivisionError.
            raise ValueError("dataloader yielded no batches; nothing to train on")

        avg_epoch_loss = epoch_loss / num_batches
        scheduler.step(avg_epoch_loss)

        if (epoch + 1) % 5 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}")
            print(f"Loss: {avg_epoch_loss:.4f}")
            print(f"Max gradient norm: {max_grad_norm:.4f}")

        if avg_epoch_loss < best_loss:
            best_loss = avg_epoch_loss
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= max_patience:
            print("Early stop")
            break

        # NOTE(review): nn.BCELoss is documented to clamp its log terms at
        # -100, so a mean loss above 100 looks unreachable with this
        # criterion; kept as a defensive guard -- confirm before relying on it.
        if epoch == 0 and avg_epoch_loss > 100:
            print("Initial loss too high, reinitialize model")
            break

# Load and preprocess data
data = pd.read_csv("/Users/sumed/Downloads/data.csv")
data = data.dropna()

# Selected feature columns and the label column, addressed by position.
features = data.iloc[:, [196, 261, 280, 76, 290, 246, 257, 545]].to_numpy()
labels = data.iloc[:, 555].to_numpy()

# Convert to tensors
features = torch.tensor(features, dtype=torch.float32)
labels = torch.tensor(labels, dtype=torch.long)

# Normalize features and convert labels to binary
features = robust_normalize(features)
labels = (labels > 0).long()

# Create dataset and dataloader
dataset = CustomDataset(features, labels)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Validate data and train model
if validate_data(features, labels):
    # Derive the input width from the data so it tracks the column selection.
    model = BinaryClassificationModel(input_dim=features.shape[1])
    train_model(model, dataloader)

    # Evaluate only when a model was actually built; running this after the
    # "invalid data" branch would fail with NameError on `model`.
    model.eval()
    with torch.no_grad():
        sample_input = features[:5]
        sample_output = model(sample_input)
        print("\nSample predictions:")
        print(sample_output)
        print("Actual labels:")
        print(labels[:5])
else:
    print("Data is invalid.")


Data Validation:
Feature statistics:
Mean: -0.1536
Std: 0.4169
Min: -1.0000
Max: 1.0000
NaN values: False
Inf values: False

Label statistics:
Unique values: tensor([0, 1])
Class distribution: tensor([476674, 423736])




Epoch 5/100
Loss: 0.5018
Max gradient norm: 9.9543
