# PyTorch Training and FastAPI Deployment Notebook

This notebook demonstrates how to:

- Load and preprocess a CSV dataset
- Split the dataset into training, validation, and test sets
- Define a custom PyTorch Dataset and a neural network model
- Train the model and save it (reporting metrics including true positive rate and true negative rate)
- Write a FastAPI inference endpoint that loads the saved model and provides predictions

After running the notebook, you can run the FastAPI server by executing:
```bash
uvicorn app:app --reload
```
in your terminal.

In [114]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

print('Libraries imported successfully.')

Libraries imported successfully.


## Step 1: Load and Preprocess Data

Assume that the dataset is available at `HDFS_v1/preprocessed/event_occurrence_matrix.csv`, relative to the current directory. We will:

- Convert the `Type` column to numeric (filling missing values with 0)
- Map the `Label` column so that `Success` becomes 1 and `Fail` becomes 0
- Use all columns except `BlockId` and `Label` as features.

In [115]:
# Load the dataset
data_path = 'HDFS_v1/preprocessed/event_occurrence_matrix.csv'
df = pd.read_csv(data_path)

# Preprocess the data
# `Type`: anything that cannot be parsed as a number becomes NaN and is then
# replaced by 0, so the column is fully numeric afterwards.
# NOTE(review): in the sample rows printed by df.head(), `Type` is non-zero only
# for the row whose Label is 0. If `Type` is populated only for failing blocks it
# leaks the label into the features -- confirm before trusting the metrics.
df['Type'] = pd.to_numeric(df['Type'], errors='coerce').fillna(0)

# `Label`: 'Success' (case-insensitive, surrounding whitespace ignored) -> 1,
# every other value -> 0. Casting to str first means a missing or non-string
# label falls into the "other" bucket instead of raising AttributeError on
# .strip(); the vectorised string ops also avoid a Python call per row.
df['Label'] = (
    df['Label']
    .astype(str)
    .str.strip()
    .str.lower()
    .eq('success')
    .astype('int64')
)

# Define features and target: every column except the block identifier and
# the label itself is used as a model input, in CSV column order.
feature_cols = [col for col in df.columns if col not in ['BlockId', 'Label']]
target_col = 'Label'

print('Data shape:', df.shape)
df.head()

Data shape: (575061, 32)


Unnamed: 0,BlockId,Label,Type,E1,E2,E3,E4,E5,E6,E7,...,E20,E21,E22,E23,E24,E25,E26,E27,E28,E29
0,blk_-1608999687919862906,1,0.0,0,0,203,0,10,7,0,...,0,10,1,10,0,4,10,0,0,0
1,blk_7503483334202473044,1,0.0,0,2,1,0,3,0,0,...,0,3,1,3,0,0,3,0,0,0
2,blk_-3544583377289625738,0,21.0,0,0,203,0,3,0,0,...,1,3,1,3,0,0,3,0,0,0
3,blk_-9073992586687739851,1,0.0,0,3,0,0,3,0,0,...,0,3,1,3,0,0,3,0,0,0
4,blk_7854771516489510256,1,0.0,0,3,1,15,3,0,0,...,0,3,1,3,0,0,3,0,0,0


## Step 2: Split the Data and Save as CSVs

We split the dataset into training (70%), validation (15%), and test (15%) sets and save them as CSV files.

In [116]:
# Two-stage stratified split: hold out 30% of the rows first, then cut that
# hold-out in half. The result is 70% train / 15% validation / 15% test, with
# the label distribution preserved in every part (stratify) and a fixed seed
# so the split is reproducible.
df_train, df_temp = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
    stratify=df[target_col],
)
df_val, df_test = train_test_split(
    df_temp,
    test_size=0.5,
    random_state=42,
    stratify=df_temp[target_col],
)

# Write each split to its own CSV in the working directory, omitting the
# pandas index so the files contain only the data columns.
for split_path, split_df in (
    ('train.csv', df_train),
    ('val.csv', df_val),
    ('test.csv', df_test),
):
    split_df.to_csv(split_path, index=False)

print(f"Train samples: {len(df_train)}, Validation samples: {len(df_val)}, Test samples: {len(df_test)}")

Train samples: 402542, Validation samples: 86259, Test samples: 86260


## Step 3: Define a Custom Dataset and Neural Network Model

In [117]:
class CSVDataset(Dataset):
    """Map-style dataset backed by a CSV file that is read fully into memory.

    The file is parsed once with pandas. The selected feature columns and the
    target column are cached as float32 NumPy arrays; tensors are created one
    sample at a time on access.

    Args:
        csv_file: Path of the CSV file (anything ``pd.read_csv`` accepts).
        feature_cols: Names of the columns used as model inputs, in input order.
        target_col: Name of the column holding the label.
    """

    def __init__(self, csv_file, feature_cols, target_col):
        frame = pd.read_csv(csv_file)
        self.data = frame
        self.features = frame[feature_cols].to_numpy().astype(np.float32)
        self.labels = frame[target_col].to_numpy().astype(np.float32)

    def __len__(self):
        """Return the number of rows read from the CSV."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        sample = torch.tensor(self.features[idx])
        target = torch.tensor(self.labels[idx])
        return sample, target

class Net(nn.Module):
    """Fully connected binary classifier: input_dim -> 64 -> 32 -> 1.

    The forward pass returns one raw logit per sample (no sigmoid applied),
    so it is meant to be paired with a logit-based loss and an explicit
    sigmoid at prediction time.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # The attribute names fc1/fc2/fc3 become the state_dict keys, so they
        # must stay stable for saved checkpoints to load.
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)  # Single logit output
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the two ReLU-activated hidden layers, then the linear output layer."""
        for hidden_layer in (self.fc1, self.fc2):
            x = self.relu(hidden_layer(x))
        return self.fc3(x)

# Progress message so the notebook output shows that this cell was executed.
print('Custom Dataset and model classes defined.')

Custom Dataset and model classes defined.


## Step 4: Train the Model and Save It

This cell trains the model for a few epochs and prints the training loss, validation loss, validation accuracy, true positive rate (TPR), and true negative rate (TNR). Finally, the trained model is saved to `model.pth`.

In [118]:
# Hyperparameters
batch_size = 2048
num_epochs = 5
learning_rate = 0.001
input_dim = len(feature_cols)  # One network input per feature column

# Training and validation data are read back from the CSV splits on disk
train_dataset = CSVDataset('train.csv', feature_cols, target_col)
val_dataset = CSVDataset('val.csv', feature_cols, target_col)

# Only the training batches are reshuffled every epoch
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Model, loss and optimizer. BCEWithLogitsLoss takes the raw logits produced
# by the network, so no sigmoid is applied before the loss.
model = Net(input_dim)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    train_loss = 0.0
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        logits = model(batch_x).squeeze(1)  # (batch, 1) -> (batch,) to match the labels
        batch_loss = criterion(logits, batch_y)
        batch_loss.backward()
        optimizer.step()
        # The criterion returns a batch mean; weight it by the batch size so
        # the epoch figure is a true per-sample average.
        train_loss += batch_loss.item() * batch_x.size(0)
    train_loss = train_loss / len(train_loader.dataset)

    # ---- Validation pass (no gradients, no parameter updates) ----
    model.eval()
    val_loss = 0.0
    correct = total = 0
    TP = TN = FP = FN = 0
    with torch.no_grad():
        for batch_x, batch_y in val_loader:
            logits = model(batch_x).squeeze(1)
            val_loss += criterion(logits, batch_y).item() * batch_x.size(0)

            # Threshold the predicted probability at 0.5 to get hard 0/1 predictions
            preds = (torch.sigmoid(logits) > 0.5).float()
            correct += (preds == batch_y).sum().item()
            total += batch_y.size(0)

            # Confusion-matrix counts, with label 1 as the positive class
            pred_pos, pred_neg = preds == 1, preds == 0
            actual_pos, actual_neg = batch_y == 1, batch_y == 0
            TP += (pred_pos & actual_pos).sum().item()
            TN += (pred_neg & actual_neg).sum().item()
            FP += (pred_pos & actual_neg).sum().item()
            FN += (pred_neg & actual_pos).sum().item()

    val_loss = val_loss / len(val_loader.dataset)
    val_acc = correct / total * 100.0

    # Rates are reported as percentages; a class that never occurs yields 0.0
    # instead of a division by zero.
    n_actual_pos = TP + FN
    n_actual_neg = TN + FP
    if n_actual_pos > 0:
        TPR = TP / n_actual_pos * 100.0
    else:
        TPR = 0.0
    if n_actual_neg > 0:
        TNR = TN / n_actual_neg * 100.0
    else:
        TNR = 0.0
    print(f"Epoch [{epoch+1}/{num_epochs}] - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%, TPR: {TPR:.2f}%, TNR: {TNR:.2f}%")

# Save the trained model (parameters only, as a state_dict)
torch.save(model.state_dict(), 'model.pth')
print("Model saved as 'model.pth'.")

Epoch [1/5] - Train Loss: 0.1197, Val Loss: 0.0096, Val Acc: 99.68%, TPR: 99.92%, TNR: 91.49%
Epoch [2/5] - Train Loss: 0.0040, Val Loss: 0.0018, Val Acc: 99.98%, TPR: 99.99%, TNR: 99.68%
Epoch [3/5] - Train Loss: 0.0013, Val Loss: 0.0011, Val Acc: 99.98%, TPR: 99.98%, TNR: 99.80%
Epoch [4/5] - Train Loss: 0.0010, Val Loss: 0.0010, Val Acc: 99.98%, TPR: 99.98%, TNR: 99.80%
Epoch [5/5] - Train Loss: 0.0010, Val Loss: 0.0011, Val Acc: 99.99%, TPR: 99.99%, TNR: 99.80%
Model saved as 'model.pth'.


## Step 5: Create a FastAPI Inference Endpoint

The cell below writes a FastAPI application to a file named `app.py`. This app loads the saved model and exposes a `/predict` endpoint.

To run the FastAPI server, execute the following in your terminal:
```bash
uvicorn app:app --reload
```

In [119]:
%%writefile app.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import torch
import torch.nn as nn


# Define the same neural network architecture that was used for training;
# the layer names must match the keys stored in the checkpoint.
class Net(nn.Module):
    """Fully connected binary classifier (input_dim -> 64 -> 32 -> 1 logit)."""

    def __init__(self, input_dim):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)  # Single logit
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return one raw logit per input row."""
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x


# Load the saved weights onto the CPU.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust
# (on PyTorch versions that support it, consider passing weights_only=True).
state_dict = torch.load('model.pth', map_location=torch.device('cpu'))

# Read the input dimension from the checkpoint instead of hard-coding it:
# nn.Linear stores its weight as (out_features, in_features), so the second
# axis of fc1.weight is the number of features the model was trained on.
input_dim = state_dict['fc1.weight'].shape[1]

# Rebuild the network and switch it to inference mode
model = Net(input_dim)
model.load_state_dict(state_dict)
model.eval()


class PredictionInput(BaseModel):
    """Request body for /predict."""

    # Feature values, in the same order as the training feature columns
    features: list[float]


app = FastAPI()


@app.post("/predict")
async def predict(input_data: PredictionInput):
    """Score a single sample.

    Returns a JSON object with the predicted ``probability`` of class 1 and
    the ``predicted_class`` (1 if the probability exceeds 0.5, otherwise 0).

    Raises:
        HTTPException: 422 if the number of feature values does not match the
            model's input dimension.
    """
    # Reject wrong-sized inputs up front; otherwise the shape mismatch would
    # surface as an opaque 500 error from inside the network.
    n_features = len(input_data.features)
    if n_features != input_dim:
        raise HTTPException(
            status_code=422,
            detail=f"Expected {input_dim} feature values, got {n_features}.",
        )

    # Convert the input list into a tensor and add a batch dimension
    features_tensor = torch.tensor(input_data.features, dtype=torch.float32).unsqueeze(0)
    with torch.no_grad():
        output = model(features_tensor).squeeze(1)
        probability = torch.sigmoid(output).item()
        predicted_class = 1 if probability > 0.5 else 0
    return {"probability": probability, "predicted_class": predicted_class}


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)


Writing app.py


The FastAPI app has now been written to `app.py`. To run the server, open a terminal in the notebook's directory and run:
```bash
uvicorn app:app --reload
```

You can then send a POST request to `http://127.0.0.1:8000/predict` with a JSON body containing a list of feature values. For example:

```bash
curl -X POST "http://127.0.0.1:8000/predict" \
     -H "Content-Type: application/json" \
     -d '{"features": [0, 3, 1, 15, 3, 0, 0, 0, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 0, 0, 3, 0, 0, 0, 0]}'
```

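Make sure that the length of the `features` list exactly matches the `input_dim` (in this example, 30), and that the values appear in the same order as the `feature_cols` used for training (every column except `BlockId` and `Label`, in CSV column order).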