In [None]:
import os
import csv
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
import numpy as np
from rich.progress import Progress
from tqdm import tqdm

In [2]:
# Define Autoencoder Model
class Autoencoder(nn.Module):
    def __init__(self, input_dim, latent_dim, dropout=0.0, use_batch_norm=False):
        super(Autoencoder, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128) if use_batch_norm else nn.Identity(),
            nn.Dropout(dropout),
            nn.Linear(128, latent_dim),
            nn.ReLU(),
            nn.BatchNorm1d(latent_dim) if use_batch_norm else nn.Identity(),
            nn.Sigmoid(),  # Output values are normalized to [0, 1]
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128) if use_batch_norm else nn.Identity(),
            nn.Dropout(dropout),
            nn.Linear(128, input_dim),
            nn.Sigmoid(),  # Assuming input values are normalized to [0, 1]
        )

    def forward(self, x):
        latent = self.encoder(x)
        reconstructed = self.decoder(latent)
        return latent, reconstructed

In [3]:
# Custom dataset class to handle CSV input and padding
class TreeDataset(Dataset):
    def __init__(self, csv_file, padding_length=161):
        # Load data from CSV
        self.padding_length = padding_length
        self.data = self._load_csv(csv_file)

    def _min_max_normalize(self, array, min_val=0, max_val=100):
        """
        Normalize array to the range [0, 1] based on given min_val and max_val.
        """
        array = np.array(array, dtype=np.float32)
        return (array - min_val) / (max_val - min_val)

    def _load_csv(self, csv_file):
        """Read CSV file using Python's built-in csv module."""
        data = []
        with open(csv_file, "r") as csvfile:
            reader = csv.reader(csvfile)
            with Progress() as progress:
                task = progress.add_task(
                    "[cyan]Processing CSV...", total=sum(1 for _ in csvfile)
                )  # Total rows in file
                csvfile.seek(0)  # Reset file pointer

                for row in reader:
                    row = list(map(int, row))
                    normalized_row = self._min_max_normalize(row)
                    data.append(normalized_row)
                    # Update progress bar
                    progress.update(task, advance=1)
        return data

    def __len__(self):
        """Return the length of the dataset (number of rows)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Retrieve a single data point from the dataset."""
        return self.data[idx]

In [None]:
def set_seed_for_everything(seed=42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.use_deterministic_algorithms(True, warn_only=True)
    os.environ["OMP_NUM_THREADS"] = "1"
    os.environ["MKL_NUM_THREADS"] = "1"
    os.environ["PYTHONHASHSEED"] = str(seed)

In [5]:
def train_model(model, criterion, optimizer, epochs, train_loader):
    print(f"Training the Autoencoder, Total epochs: {epochs}")
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        epoch_rmse = 0
        total_samples = 0
        for batch in tqdm(
            train_loader, desc=f"Epoch [{epoch+1}/{epochs}]", unit="batch"
        ):
            optimizer.zero_grad()
            _, reconstructed = model(batch)
            loss = criterion(reconstructed, batch)  # Reconstruction loss
            loss.backward()
            optimizer.step()

            # Accumulate loss
            batch_size = batch.size(0)
            total_samples += batch_size
            epoch_loss += loss.item() * batch_size  # Weighted by batch size

            # Calculate and accumulate RMSE
            rmse = torch.sqrt(loss)  # RMSE = sqrt(MSE)
            epoch_rmse += rmse.item() * batch_size  # Weighted by batch size

        # Compute average loss and RMSE over all samples
        epoch_loss /= total_samples
        epoch_rmse /= total_samples

        print(
            f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.6f}, RMSE: {epoch_rmse:.6f}"
        )

In [6]:
def validate_model(model, criterion, val_loader):
    ###
    # Validate the model
    ###
    model.eval()
    val_loss = 0
    val_rmse = 0
    total_samples = 0
    latent_representations = []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating...", unit="batch"):
            latent, reconstructed = model(batch)
            loss = criterion(reconstructed, batch)
            batch_size = batch.size(0)
            total_samples += batch_size
            val_loss += loss.item() * batch_size

            latent_representations.append(latent)

            rmse = torch.sqrt(loss)
            val_rmse += rmse.item() * batch_size

    val_loss /= total_samples
    val_rmse /= total_samples

    latent_representations = torch.cat(latent_representations).numpy()
    print(f"Latent representations shape: {latent_representations.shape}")
    print(f"Validation Loss: {val_loss:.6f}, Validation RMSE: {val_rmse:.6f}")
    return val_rmse

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

seed = 1234
set_seed_for_everything(seed=seed)

g = torch.Generator()
g.manual_seed(seed)

# Load dataset
csv_file = "data/5hosts_and_10hosts_train_data.csv"  # Replace with your CSV file path
dataset = TreeDataset(csv_file)
print(f"Dataset size: {len(dataset)}")
train_size = int(0.8 * len(dataset))
print(f"Train size: {train_size}")
val_size = len(dataset) - train_size
print(f"Validation size: {val_size}")
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(
    train_dataset, batch_size=32, shuffle=True, worker_init_fn=seed, generator=g
)
val_loader = DataLoader(
    val_dataset, batch_size=32, shuffle=False, worker_init_fn=seed, generator=g
)

Output()

Using device: cuda


Dataset size: 2551965
Train size: 2041572
Validation size: 510393


In [9]:
input_dim = 161  # Number of features
criterion = nn.MSELoss()

# Track best configuration
best_val_rmse = float("inf")
best_params = {}
best_model = None

# Define hyperparameters
lr = 0.001
weight_decay = 0
dropout = 0
latent_dim = 64
use_batch_norm = True
epochs = 10

model = Autoencoder(
    input_dim=input_dim,
    latent_dim=latent_dim,
    dropout=dropout,
    use_batch_norm=use_batch_norm,
)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

# Train the model
train_model(model, criterion, optimizer, epochs, train_loader)

# Validate the model
rmse = validate_model(model, criterion, val_loader)

Training the Autoencoder, Total epochs: 10


Epoch [1/10]: 100%|██████████| 63800/63800 [01:54<00:00, 556.51batch/s] 


Epoch [1/10], Loss: 0.000291, RMSE: 0.004376


Epoch [2/10]: 100%|██████████| 63800/63800 [01:24<00:00, 752.02batch/s] 


Epoch [2/10], Loss: 0.000005, RMSE: 0.002238


Epoch [3/10]: 100%|██████████| 63800/63800 [02:13<00:00, 476.29batch/s] 


Epoch [3/10], Loss: 0.000004, RMSE: 0.001975


Epoch [4/10]: 100%|██████████| 63800/63800 [01:24<00:00, 756.86batch/s] 


Epoch [4/10], Loss: 0.000004, RMSE: 0.001809


Epoch [5/10]: 100%|██████████| 63800/63800 [02:19<00:00, 457.26batch/s] 


Epoch [5/10], Loss: 0.000003, RMSE: 0.001716


Epoch [6/10]: 100%|██████████| 63800/63800 [00:58<00:00, 1098.92batch/s]


Epoch [6/10], Loss: 0.000003, RMSE: 0.001654


Epoch [7/10]: 100%|██████████| 63800/63800 [01:06<00:00, 959.94batch/s] 


Epoch [7/10], Loss: 0.000003, RMSE: 0.001603


Epoch [8/10]: 100%|██████████| 63800/63800 [02:32<00:00, 419.40batch/s] 


Epoch [8/10], Loss: 0.000003, RMSE: 0.001562


Epoch [9/10]: 100%|██████████| 63800/63800 [02:01<00:00, 524.89batch/s] 


Epoch [9/10], Loss: 0.000003, RMSE: 0.001531


Epoch [10/10]: 100%|██████████| 63800/63800 [02:27<00:00, 431.62batch/s] 


Epoch [10/10], Loss: 0.000002, RMSE: 0.001512


Validating...: 100%|██████████| 15950/15950 [00:02<00:00, 5908.24batch/s]

Latent representations shape: (510393, 64)
Validation Loss: 0.000009, Validation RMSE: 0.002931





In [10]:
# Save the best model
torch.save(model.state_dict(), "AE_5hosts_and_10hosts_64_BN.pth")

In [None]:
# ###
# # Testing the best model
# ###

# # Load the best model for testing
# best_model = Autoencoder(
#     input_dim=input_dim,
#     latent_dim=best_params["latent_dim"],
#     dropout=best_params["dropout"],
#     use_batch_norm=best_params["use_batch_norm"],
# )

# best_model.load_state_dict(torch.load("best_autoencoder.pth"))
# best_model.eval()

# test_loss = 0
# test_rmse = 0
# total_samples = 0

# test_loader = DataLoader(
#     test_dataset, batch_size=best_params["batch_size"], shuffle=False
# )

# with torch.no_grad():
#     for batch in tqdm(test_loader, desc="Testing...", unit="batch"):
#         _, reconstructed = best_model(batch)
#         loss = criterion(reconstructed, batch)
#         batch_size = batch.size(0)
#         total_samples += batch_size

#         test_loss += loss.item() * batch_size

#         rmse = torch.sqrt(loss)
#         test_rmse += rmse.item() * batch_size

# test_loss /= total_samples
# test_rmse /= total_samples

# print(f"Test Loss: {test_loss:.6f}, Test RMSE: {test_rmse:.6f}")


In [None]:
# Load the model
model.load_state_dict(torch.load("model_path.pth"))

# Step 4: Set the model to evaluation mode
model.eval()