In [3]:
# Install required packages
!pip install torch torch-geometric geoopt tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import kneighbors_graph
from scipy.sparse import coo_matrix
from tqdm import tqdm
import geoopt
from geoopt import PoincareBall
from torch.utils.data import DataLoader
from torch_geometric.data import Data
import os

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Check for CUDA availability and set the device accordingly. One of the two options below must be commented out
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = 'cpu'
print(f'Using device: {device}')

# Load the CSV file
file_path = '/content/drive/MyDrive/CIC_data.csv'  # Update this with the actual path to your CSV file in Google Drive
data = pd.read_csv(file_path, low_memory=False)

# Data preprocessing steps
data.columns = data.columns.str.strip()
data['Label'] = data['Label'].str.strip()

# Verify unique labels and their distribution
unique_labels = data['Label'].unique()
print(f"Unique labels in the dataset: {unique_labels}")
label_counts = data['Label'].value_counts()
print("Label distribution in the dataset:")
print(label_counts)

# Sample a smaller fraction of the rows (e.g., 20%)
data_sampled = data.sample(frac=0.05, random_state=42)

# Convert all columns to numeric, coerce errors to NaN
data_numeric = data_sampled.apply(pd.to_numeric, errors='coerce')

# Fill NaN values with the mean of each column
data_filled = data_numeric.fillna(data_numeric.mean())

# Handle infinite and very large values
data_filled = data_filled.replace([np.inf, -np.inf], np.nan)
data_filled = data_filled.fillna(data_filled.max())

# Check again for any remaining NaNs and fill them
if data_filled.isnull().values.any():
    data_filled = data_filled.fillna(0)

# Extract labels
labels = data_sampled['Label']
data_filled = data_filled.drop(columns=['Label'])

# Normalize the data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_filled)

# Convert to PyTorch tensors
node_features = torch.tensor(data_scaled, dtype=torch.float32).to(device)
print(f"Input tensor shape: {node_features.shape}")

# UHG Operations
def uhg_quadrance(a, b, eps=1e-9):
    """Compute UHG quadrance between two points."""
    dot_product = torch.sum(a * b, dim=-1)
    return 1 - (dot_product ** 2) / ((torch.sum(a ** 2, dim=-1) - a[:, -1] ** 2 + eps) * (torch.sum(b ** 2, dim=-1) - b[:, -1] ** 2 + eps))

def uhg_spread(L, M, eps=1e-9):
    """Compute UHG spread between two lines."""
    dot_product = torch.sum(L * M, dim=-1)
    return 1 - (dot_product ** 2) / ((torch.sum(L ** 2, dim=-1) - L[:, -1] ** 2 + eps) * (torch.sum(M ** 2, dim=-1) - M[:, -1] ** 2 + eps))

# Transform the node features to UHG space
def to_uhg_space(x):
    """Transform Euclidean coordinates to UHG space."""
    return torch.cat([x, torch.ones(x.shape[0], 1, device=x.device)], dim=-1)


node_features_uhg = to_uhg_space(node_features)
print(f"Node features transformed to UHG space: {node_features_uhg.shape}")

# Create a k-nearest neighbors graph
k = 2  # Set k value to 2
knn_graph = kneighbors_graph(data_scaled, k, mode='connectivity', include_self=False)

# Convert knn_graph to COO format
knn_graph_coo = coo_matrix(knn_graph)

# Create edge index
edge_index_np = np.array([knn_graph_coo.row, knn_graph_coo.col])
edge_index = torch.from_numpy(edge_index_np).long().to(device)
print(f"Edge index shape: {edge_index.shape}")

# Convert labels to numeric
label_mapping = {label: idx for idx, label in enumerate(unique_labels)}
labels_numeric = labels.map(label_mapping).values
labels_tensor = torch.tensor(labels_numeric, dtype=torch.long).to(device)

# Create 70/15/15 train/val/test split
total_samples = node_features_uhg.size(0)
train_size = int(0.7 * total_samples)
val_size = int(0.15 * total_samples)

indices = torch.randperm(total_samples)
train_indices = indices[:train_size]
val_indices = indices[train_size:train_size+val_size]
test_indices = indices[train_size+val_size:]

train_mask = torch.zeros(total_samples, dtype=torch.bool)
val_mask = torch.zeros(total_samples, dtype=torch.bool)
test_mask = torch.zeros(total_samples, dtype=torch.bool)

train_mask[train_indices] = True
val_mask[val_indices] = True
test_mask[test_indices] = True

# Create the PyTorch Geometric data object
graph_data = Data(x=node_features_uhg, edge_index=edge_index, y=labels_tensor,
                  train_mask=train_mask, val_mask=val_mask, test_mask=test_mask).to(device)

print(f"Train size: {graph_data.train_mask.sum()}, Val size: {graph_data.val_mask.sum()}, Test size: {graph_data.test_mask.sum()}")

# Define the UHG Quadrance for prediction
def uhg_quadrance(a, b):
    """Compute UHG quadrance between two points."""
    dot_product = torch.sum(a * b, dim=-1)
    return 1 - (dot_product ** 2) / ((torch.sum(a ** 2, dim=-1) - a[:, -1] ** 2) * (torch.sum(b ** 2, dim=-1) - b[:, -1] ** 2))

# Define the UHG GraphSAGE Layer
class UHGGraphSAGELayer(nn.Module):
    def __init__(self, in_features, out_features):
        super(UHGGraphSAGELayer, self).__init__()
        self.weight_neigh = nn.Parameter(torch.Tensor(out_features, in_features))
        self.weight_self = nn.Parameter(torch.Tensor(out_features, in_features))
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.xavier_uniform_(self.weight_neigh)
        nn.init.xavier_uniform_(self.weight_self)

    def forward(self, x, edge_index):
        row, col = edge_index

        # Neighbor aggregation
        neigh_sum = torch.zeros_like(x)
        neigh_sum.index_add_(0, row, x[col])
        neigh_count = torch.zeros(x.size(0), device=x.device)
        neigh_count.index_add_(0, row, torch.ones_like(row, dtype=torch.float))
        neigh_count = torch.clamp(neigh_count.unsqueeze(1), min=1)
        neigh_features = neigh_sum / neigh_count

        # Apply linear transformations
        neigh_transformed = torch.matmul(neigh_features, self.weight_neigh.t())
        self_transformed = torch.matmul(x, self.weight_self.t())

        # Combine using UHG-inspired operation (simplified addition)
        combined = neigh_transformed + self_transformed

        return F.relu(combined)

# UHG GraphSAGE Model
class UHGGraphSAGE(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers, dropout=0.2):
        super(UHGGraphSAGE, self).__init__()
        self.layers = nn.ModuleList()
        self.dropout = nn.Dropout(dropout)

        self.layers.append(UHGGraphSAGELayer(in_channels, hidden_channels))
        for _ in range(num_layers - 2):
            self.layers.append(UHGGraphSAGELayer(hidden_channels, hidden_channels))
        self.layers.append(UHGGraphSAGELayer(hidden_channels, out_channels))

    def forward(self, x, edge_index):
        for layer in self.layers[:-1]:
            x = self.dropout(F.relu(layer(x, edge_index)))
        x = self.layers[-1](x, edge_index)
        return x

# Initialize the model
in_channels = node_features_uhg.size(1)
hidden_channels = 128
out_channels = len(label_mapping)
num_layers = 2

# Define the loss
criterion = nn.CrossEntropyLoss()

# Create a simple DataLoader
batch_size = 16
accumulation_steps = 4

#Create Dataloader with smaller batch size
train_loader = DataLoader(range(graph_data.train_mask.sum()), batch_size=batch_size, shuffle=True)

# Training process with gradient accumulation
def train_with_accumulation(model, optimizer):
    model.train()
    total_loss = 0
    optimizer.zero_grad()  # Reset gradients
    for batch_idx, batch in enumerate(tqdm(train_loader, desc="Training")):
        batch = batch.to(device)

        # Get the features for the sampled nodes
        x = graph_data.x[graph_data.train_mask][batch]
        y = graph_data.y[graph_data.train_mask][batch]

        # Create a subgraph for the batch
        batch_node_ids = graph_data.train_mask.nonzero(as_tuple=True)[0][batch]
        edge_mask = torch.isin(graph_data.edge_index[0], batch_node_ids) & torch.isin(graph_data.edge_index[1], batch_node_ids)
        batch_edge_index = graph_data.edge_index[:, edge_mask]

        # Relabel nodes to have consecutive indices
        node_idx = torch.unique(batch_edge_index)
        idx_map = {int(idx): i for i, idx in enumerate(node_idx)}
        mapped_edge_index = torch.tensor([[idx_map[int(i)] for i in batch_edge_index[0]],
                                          [idx_map[int(i)] for i in batch_edge_index[1]]],
                                         dtype=torch.long,
                                         device=device)

        # Forward pass
        out = model(x, mapped_edge_index)
        loss = criterion(out, y) / accumulation_steps  # Scale loss by accumulation steps

        loss.backward()  # Backpropagate the loss

        # Accumulate gradients and update model weights every accumulation_steps batches
        if (batch_idx + 1) % accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()  # Update weights
            optimizer.zero_grad()  # Reset gradients for next accumulation
            total_loss += loss.item() * accumulation_steps  # Accumulate loss

    return total_loss / len(train_loader)


# Evaluation function
@torch.no_grad()
def evaluate(model, mask):
    model.eval()
    node_indices = mask.nonzero(as_tuple=True)[0]
    sub_x = graph_data.x[node_indices]
    sub_y = graph_data.y[node_indices]
    edge_mask = torch.isin(graph_data.edge_index[0], node_indices) & torch.isin(graph_data.edge_index[1], node_indices)
    sub_edge_index = graph_data.edge_index[:, edge_mask]
    node_idx = torch.unique(sub_edge_index)
    idx_map = {int(idx): i for i, idx in enumerate(node_idx)}
    mapped_edge_index = torch.tensor([[idx_map[int(i)] for i in sub_edge_index[0]],
                                      [idx_map[int(i)] for i in sub_edge_index[1]]],
                                     dtype=torch.long,
                                     device=device)
    out = model(sub_x, mapped_edge_index)

    # For simplicity, using the model's output directly for classification
    pred = out.argmax(dim=1)  # Choose the class with the highest logit
    correct = (pred == sub_y).sum().item()
    accuracy = correct / len(node_indices)  # Calculate accuracy

    return accuracy  # Return the calculated accuracy


# Set the learning rate
best_lr = 0.01
print(f"Using learning rate: {best_lr}")

# Initialize the model
model = UHGGraphSAGE(in_channels=in_channels, hidden_channels=hidden_channels,
                     out_channels=out_channels, num_layers=num_layers).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=best_lr, weight_decay=1e-5)

# Learning rate scheduler
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=10)

# Training loop
num_epochs = 400
best_val_acc = 0
patience = 20
counter = 0
best_model_path = '/content/drive/MyDrive/best_uhg_graphsage_model.pth'

for epoch in range(1, num_epochs + 1):
    try:
        # Train the model for one epoch
        loss = train_with_accumulation(model, optimizer)


        # Evaluate the model on validation and test sets
        val_acc = evaluate(model, graph_data.val_mask)
        test_acc = evaluate(model, graph_data.test_mask)

        # Get current learning rate
        current_lr = optimizer.param_groups[0]['lr']

        # Adjust learning rate based on validation accuracy
        scheduler.step(val_acc)

        # Check if current model is the best so far
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            counter = 0
            torch.save(model.state_dict(), best_model_path)
        else:
            counter += 1

        # Print progress every 10 epochs
        if epoch % 10 == 0:
            print(f'Epoch: {epoch}, Loss: {loss:.4f}, Val Accuracy: {val_acc:.4f}, Test Accuracy: {test_acc:.4f}, Learning Rate: {current_lr:.6f}')

        if counter >= patience:
            print("Early stopping")
            break
    except RuntimeError as e:
        print(f"Error occurred in epoch {epoch}:")
        print(str(e))
        break

# After training, print the final learning rate
final_lr = optimizer.param_groups[0]['lr']
print(f"Final Learning Rate: {final_lr:.6f}")

# Load the best model and evaluate on the test set
if os.path.exists(best_model_path):
    model.load_state_dict(torch.load(best_model_path))
    final_test_acc = evaluate(model, graph_data.test_mask)
    print(f"Final Test Accuracy: {final_test_acc:.4f}")
else:
    print("No best model found. Training might not have completed successfully.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cpu
Unique labels in the dataset: ['BENIGN' 'DDoS' 'PortScan' 'Bot' 'Infiltration'
 'Web Attack � Brute Force' 'Web Attack � XSS'
 'Web Attack � Sql Injection' 'FTP-Patator' 'SSH-Patator' 'DoS slowloris'
 'DoS Slowhttptest' 'DoS Hulk' 'DoS GoldenEye' 'Heartbleed']
Label distribution in the dataset:
Label
BENIGN                        2273097
DoS Hulk                       231073
PortScan                       158930
DDoS                           128027
DoS GoldenEye                   10293
FTP-Patator                      7938
SSH-Patator                      5897
DoS slowloris                    5796
DoS Slowhttptest                 5499
Bot                              1966
Web Attack � Brute Force         1507
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed         

Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.81it/s]
Training: 100%|██████████| 6193/6193 [04:46<00:00, 21.59it/s]
Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.84it/s]
Training: 100%|██████████| 6193/6193 [04:49<00:00, 21.37it/s]
Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.91it/s]
Training: 100%|██████████| 6193/6193 [04:47<00:00, 21.55it/s]
Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.86it/s]
Training: 100%|██████████| 6193/6193 [04:44<00:00, 21.78it/s]
Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.89it/s]
Training: 100%|██████████| 6193/6193 [04:45<00:00, 21.71it/s]


Epoch: 10, Loss: 0.0291, Val Accuracy: 0.9458, Test Accuracy: 0.9402, Learning Rate: 0.010000


Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.89it/s]
Training: 100%|██████████| 6193/6193 [04:45<00:00, 21.72it/s]
Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.89it/s]
Training: 100%|██████████| 6193/6193 [04:45<00:00, 21.72it/s]
Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.90it/s]
Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.81it/s]
Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.87it/s]
Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.95it/s]
Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.84it/s]
Training: 100%|██████████| 6193/6193 [04:41<00:00, 21.98it/s]


Epoch: 20, Loss: 0.0291, Val Accuracy: 0.9301, Test Accuracy: 0.9267, Learning Rate: 0.010000


Training: 100%|██████████| 6193/6193 [04:45<00:00, 21.68it/s]
Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.95it/s]
Training: 100%|██████████| 6193/6193 [04:44<00:00, 21.79it/s]
Training: 100%|██████████| 6193/6193 [04:40<00:00, 22.08it/s]
Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.92it/s]
Training: 100%|██████████| 6193/6193 [04:47<00:00, 21.52it/s]
Training: 100%|██████████| 6193/6193 [04:44<00:00, 21.79it/s]
Training: 100%|██████████| 6193/6193 [04:47<00:00, 21.56it/s]
Training: 100%|██████████| 6193/6193 [04:41<00:00, 22.04it/s]
Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.88it/s]


Epoch: 30, Loss: 0.0225, Val Accuracy: 0.9542, Test Accuracy: 0.9539, Learning Rate: 0.005000


Training: 100%|██████████| 6193/6193 [04:45<00:00, 21.67it/s]
Training: 100%|██████████| 6193/6193 [04:41<00:00, 21.99it/s]
Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.82it/s]
Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.85it/s]
Training: 100%|██████████| 6193/6193 [04:44<00:00, 21.78it/s]
Training: 100%|██████████| 6193/6193 [04:45<00:00, 21.69it/s]
Training: 100%|██████████| 6193/6193 [04:45<00:00, 21.68it/s]
Training: 100%|██████████| 6193/6193 [04:45<00:00, 21.67it/s]
Training: 100%|██████████| 6193/6193 [04:45<00:00, 21.69it/s]
Training: 100%|██████████| 6193/6193 [04:45<00:00, 21.71it/s]


Epoch: 40, Loss: 0.0225, Val Accuracy: 0.9591, Test Accuracy: 0.9575, Learning Rate: 0.005000


Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.83it/s]
Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.86it/s]
Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.89it/s]
Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.96it/s]
Training: 100%|██████████| 6193/6193 [04:41<00:00, 21.97it/s]
Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.89it/s]
Training: 100%|██████████| 6193/6193 [04:41<00:00, 22.03it/s]
Training: 100%|██████████| 6193/6193 [04:40<00:00, 22.07it/s]
Training: 100%|██████████| 6193/6193 [04:39<00:00, 22.12it/s]
Training: 100%|██████████| 6193/6193 [04:44<00:00, 21.77it/s]


Epoch: 50, Loss: 0.0183, Val Accuracy: 0.9640, Test Accuracy: 0.9614, Learning Rate: 0.002500


Training: 100%|██████████| 6193/6193 [04:44<00:00, 21.78it/s]
Training: 100%|██████████| 6193/6193 [04:41<00:00, 22.02it/s]
Training: 100%|██████████| 6193/6193 [04:40<00:00, 22.07it/s]
Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.90it/s]
Training: 100%|██████████| 6193/6193 [04:44<00:00, 21.76it/s]
Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.90it/s]
Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.88it/s]
Training: 100%|██████████| 6193/6193 [04:44<00:00, 21.77it/s]
Training: 100%|██████████| 6193/6193 [04:41<00:00, 21.96it/s]
Training: 100%|██████████| 6193/6193 [04:46<00:00, 21.65it/s]


Epoch: 60, Loss: 0.0180, Val Accuracy: 0.9672, Test Accuracy: 0.9653, Learning Rate: 0.002500


Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.92it/s]
Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.96it/s]
Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.83it/s]
Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.96it/s]
Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.96it/s]
Training: 100%|██████████| 6193/6193 [04:45<00:00, 21.67it/s]
Training: 100%|██████████| 6193/6193 [04:41<00:00, 21.97it/s]
Training: 100%|██████████| 6193/6193 [04:40<00:00, 22.05it/s]
Training: 100%|██████████| 6193/6193 [04:41<00:00, 22.03it/s]
Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.82it/s]


Epoch: 70, Loss: 0.0163, Val Accuracy: 0.9733, Test Accuracy: 0.9713, Learning Rate: 0.001250


Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.86it/s]
Training: 100%|██████████| 6193/6193 [04:40<00:00, 22.11it/s]
Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.86it/s]
Training: 100%|██████████| 6193/6193 [04:41<00:00, 21.97it/s]
Training: 100%|██████████| 6193/6193 [04:41<00:00, 22.02it/s]
Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.86it/s]
Training: 100%|██████████| 6193/6193 [04:44<00:00, 21.74it/s]
Training: 100%|██████████| 6193/6193 [04:46<00:00, 21.64it/s]
Training: 100%|██████████| 6193/6193 [04:45<00:00, 21.67it/s]
Training: 100%|██████████| 6193/6193 [04:51<00:00, 21.26it/s]


Epoch: 80, Loss: 0.0158, Val Accuracy: 0.9724, Test Accuracy: 0.9707, Learning Rate: 0.001250


Training: 100%|██████████| 6193/6193 [04:49<00:00, 21.38it/s]
Training: 100%|██████████| 6193/6193 [04:45<00:00, 21.68it/s]
Training: 100%|██████████| 6193/6193 [04:48<00:00, 21.44it/s]
Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.84it/s]
Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.94it/s]
Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.95it/s]
Training: 100%|██████████| 6193/6193 [04:41<00:00, 22.00it/s]
Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.89it/s]
Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.87it/s]
Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.92it/s]


Epoch: 90, Loss: 0.0154, Val Accuracy: 0.9726, Test Accuracy: 0.9685, Learning Rate: 0.001250


Training: 100%|██████████| 6193/6193 [04:41<00:00, 22.01it/s]
Training: 100%|██████████| 6193/6193 [04:44<00:00, 21.74it/s]
Training: 100%|██████████| 6193/6193 [04:40<00:00, 22.05it/s]
Training: 100%|██████████| 6193/6193 [04:41<00:00, 21.98it/s]
Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.85it/s]
Training: 100%|██████████| 6193/6193 [04:41<00:00, 22.03it/s]
Training: 100%|██████████| 6193/6193 [04:41<00:00, 21.99it/s]
Training: 100%|██████████| 6193/6193 [04:39<00:00, 22.16it/s]
Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.90it/s]
Training: 100%|██████████| 6193/6193 [04:40<00:00, 22.05it/s]


Epoch: 100, Loss: 0.0139, Val Accuracy: 0.9700, Test Accuracy: 0.9661, Learning Rate: 0.000625


Training: 100%|██████████| 6193/6193 [04:40<00:00, 22.06it/s]
Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.88it/s]
Training: 100%|██████████| 6193/6193 [04:40<00:00, 22.11it/s]
Training: 100%|██████████| 6193/6193 [04:41<00:00, 22.03it/s]
Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.84it/s]
Training: 100%|██████████| 6193/6193 [04:41<00:00, 21.98it/s]
Training: 100%|██████████| 6193/6193 [04:41<00:00, 22.01it/s]
Training: 100%|██████████| 6193/6193 [04:41<00:00, 22.01it/s]
Training: 100%|██████████| 6193/6193 [04:40<00:00, 22.07it/s]
Training: 100%|██████████| 6193/6193 [04:41<00:00, 22.01it/s]


Epoch: 110, Loss: 0.0142, Val Accuracy: 0.9751, Test Accuracy: 0.9715, Learning Rate: 0.000625


Training: 100%|██████████| 6193/6193 [04:44<00:00, 21.75it/s]
Training: 100%|██████████| 6193/6193 [05:08<00:00, 20.09it/s]
Training: 100%|██████████| 6193/6193 [05:09<00:00, 20.00it/s]
Training: 100%|██████████| 6193/6193 [05:18<00:00, 19.44it/s]
Training: 100%|██████████| 6193/6193 [05:07<00:00, 20.12it/s]
Training: 100%|██████████| 6193/6193 [05:09<00:00, 20.04it/s]
Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.87it/s]
Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.92it/s]
Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.87it/s]
Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.94it/s]


Epoch: 120, Loss: 0.0132, Val Accuracy: 0.9755, Test Accuracy: 0.9735, Learning Rate: 0.000625


Training: 100%|██████████| 6193/6193 [04:39<00:00, 22.16it/s]
Training: 100%|██████████| 6193/6193 [04:35<00:00, 22.44it/s]
Training: 100%|██████████| 6193/6193 [04:40<00:00, 22.06it/s]
Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.87it/s]
Training: 100%|██████████| 6193/6193 [04:41<00:00, 22.01it/s]
Training: 100%|██████████| 6193/6193 [04:42<00:00, 21.93it/s]
Training: 100%|██████████| 6193/6193 [04:45<00:00, 21.71it/s]
Training: 100%|██████████| 6193/6193 [04:45<00:00, 21.72it/s]
Training: 100%|██████████| 6193/6193 [04:41<00:00, 21.99it/s]
Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.81it/s]


Epoch: 130, Loss: 0.0135, Val Accuracy: 0.9764, Test Accuracy: 0.9765, Learning Rate: 0.000313


Training: 100%|██████████| 6193/6193 [04:43<00:00, 21.87it/s]
Training: 100%|██████████| 6193/6193 [04:41<00:00, 21.96it/s]


Early stopping
Final Learning Rate: 0.000313


  model.load_state_dict(torch.load(best_model_path))


Final Test Accuracy: 0.9754
