In [3]:
import csv
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
import numpy as np
from rich.progress import Progress
from tqdm import tqdm

In [4]:
class TransformerEncoderLatentSpace(nn.Module):
    def __init__(self, input_dim, latent_dim, num_heads, num_layers, dropout=0.1):
        """
        Transformer-based model to learn a latent space.

        Parameters:
        - input_dim (int): Dimension of the input features.
        - latent_dim (int): Dimension of the latent space.
        - num_heads (int): Number of attention heads in each Transformer layer.
        - num_layers (int): Number of Transformer layers in the encoder.
        - dropout (float): Dropout probability.
        """
        super(TransformerEncoderLatentSpace, self).__init__()

        self.input_dim = input_dim
        self.latent_dim = latent_dim

        # Positional Encoding
        self.positional_encoding = nn.Parameter(
            torch.zeros(1, 1000, input_dim)
        )  # Assume max 1000 sequence length

        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=input_dim, nhead=num_heads, dropout=dropout
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=num_layers
        )

        # Projection to Latent Space
        self.to_latent = nn.Linear(input_dim, latent_dim)

    def forward(self, x):
        """
        Forward pass through the Transformer encoder.

        Parameters:
        - x (torch.Tensor): Input tensor of shape (batch_size, sequence_length, input_dim).

        Returns:
        - torch.Tensor: Latent space representation of shape (batch_size, latent_dim).
        """
        # Add positional encoding
        seq_len = x.size(1)
        x = x + self.positional_encoding[:, :seq_len, :]

        # Permute to match Transformer expected input shape (seq_len, batch_size, input_dim)
        x = x.permute(1, 0, 2)

        # Pass through Transformer Encoder
        encoded = self.transformer_encoder(x)

        # Take the mean of encoded representations across the sequence dimension
        encoded_mean = encoded.mean(dim=0)

        # Project to latent space
        latent_representation = self.to_latent(encoded_mean)

        return latent_representation


In [5]:
# Custom dataset class to handle CSV input and padding
class TreeDataset(Dataset):
    def __init__(self, csv_file, padding_length=161):
        # Load data from CSV
        self.padding_length = padding_length
        self.data = self._load_csv(csv_file)

    def _min_max_normalize(self, array, min_val=0, max_val=100):
        """
        Normalize array to the range [0, 1] based on given min_val and max_val.
        """
        array = np.array(array, dtype=np.float32)
        return (array - min_val) / (max_val - min_val)

    def _load_csv(self, csv_file):
        """Read CSV file using Python's built-in csv module."""
        data = []
        with open(csv_file, "r") as csvfile:
            reader = csv.reader(csvfile)
            with Progress() as progress:
                task = progress.add_task(
                    "[cyan]Processing CSV...", total=sum(1 for _ in csvfile)
                )  # Total rows in file
                csvfile.seek(0)  # Reset file pointer

                for row in reader:
                    row = list(map(int, row))
                    normalized_row = self._min_max_normalize(row)
                    data.append(normalized_row)
                    # Update progress bar
                    progress.update(task, advance=1)
        return data

    def __len__(self):
        """Return the length of the dataset (number of rows)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Retrieve a single data point from the dataset."""
        return self.data[idx]

In [6]:
def set_seed_for_everything(seed=42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [7]:
def train_model(model, criterion, optimizer, epochs, train_loader):
    print(f"Training the Autoencoder, Total epochs: {epochs}")
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        epoch_rmse = 0
        total_samples = 0
        for batch in tqdm(
            train_loader, desc=f"Epoch [{epoch+1}/{epochs}]", unit="batch"
        ):
            optimizer.zero_grad()
            _, reconstructed = model(batch)
            loss = criterion(reconstructed, batch)  # Reconstruction loss
            loss.backward()
            optimizer.step()

            # Accumulate loss
            batch_size = batch.size(0)
            total_samples += batch_size
            epoch_loss += loss.item() * batch_size  # Weighted by batch size

            # Calculate and accumulate RMSE
            rmse = torch.sqrt(loss)  # RMSE = sqrt(MSE)
            epoch_rmse += rmse.item() * batch_size  # Weighted by batch size

        # Compute average loss and RMSE over all samples
        epoch_loss /= total_samples
        epoch_rmse /= total_samples

        print(
            f"Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.6f}, RMSE: {epoch_rmse:.6f}"
        )

In [8]:
def validate_model(model, criterion, val_loader):
    ###
    # Validate the model
    ###
    model.eval()
    val_loss = 0
    val_rmse = 0
    total_samples = 0
    latent_representations = []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validating...", unit="batch"):
            latent, reconstructed = model(batch)
            loss = criterion(reconstructed, batch)
            batch_size = batch.size(0)
            total_samples += batch_size
            val_loss += loss.item() * batch_size

            latent_representations.append(latent)

            rmse = torch.sqrt(loss)
            val_rmse += rmse.item() * batch_size

    val_loss /= total_samples
    val_rmse /= total_samples

    latent_representations = torch.cat(latent_representations).numpy()
    print(f"Latent representations shape: {latent_representations.shape}")
    print(f"Validation Loss: {val_loss:.6f}, Validation RMSE: {val_rmse:.6f}")
    return val_rmse

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

set_seed_for_everything(seed=1234)

# Load dataset
csv_file = "data/train_data.csv"  # Replace with your CSV file path
dataset = TreeDataset(csv_file)
print(f"Dataset size: {len(dataset)}")
train_size = int(0.8 * len(dataset))
print(f"Train size: {train_size}")
val_size = len(dataset) - train_size
print(f"Validation size: {val_size}")
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

Output()

Using device: cuda


Dataset size: 851229
Train size: 680983
Validation size: 170246


In [16]:
def pad_to_divisible(tensor, divisor):
    """
    Pads the last dimension of the tensor to make it divisible by the divisor.

    Args:
        tensor (torch.Tensor): Input tensor of shape (..., input_dim).
        divisor (int): The divisor to make the last dimension divisible by.

    Returns:
        torch.Tensor: Padded tensor.
    """
    input_dim = tensor.shape[-1]
    target_dim = (
        torch.ceil(input_dim / divisor) * divisor
    )  # Find the nearest divisible value
    padding = target_dim - input_dim  # Calculate required padding

    if padding > 0:
        # Apply padding to the last dimension
        padded_tensor = torch.nn.functional.pad(tensor, (0, padding))
        return padded_tensor, target_dim
    else:
        return tensor, input_dim


In [None]:
input_dim = 161  # Number of features
criterion = nn.MSELoss()

# Track best configuration
best_val_rmse = float("inf")
best_params = {}
best_model = None

# Define hyperparameters
lr = 0.001
weight_decay = 0
dropout = 0
latent_dim = 16
num_heads = 8  # Number of attention heads
num_layers = 4  # Number of Transformer layers
use_batch_norm = True
epochs = 10

model = TransformerEncoderLatentSpace(
    input_dim=input_dim,
    latent_dim=latent_dim,
    num_heads=num_heads,
    num_layers=num_layers,
    dropout=dropout,
)
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

# Train the model
train_model(model, criterion, optimizer, epochs, train_loader)

# Validate the model
rmse = validate_model(model, criterion, val_loader)



Training the Autoencoder, Total epochs: 10


Epoch [1/10]:   0%|          | 0/21281 [00:00<?, ?batch/s]


RuntimeError: The size of tensor a (32) must match the size of tensor b (161) at non-singleton dimension 1