In [33]:
import csv
import torch
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader  # Corrected import for DataLoader
from torch.utils.data import random_split
import numpy as np
from rich.progress import Progress  # Importing Progress from rich
from tqdm import tqdm  # Importing tqdm for progress bar

In [None]:
# OLD
# class TreeGNN(torch.nn.Module):
#     def __init__(self, input_dim, hidden_dim, output_dim):
#         super(TreeGNN, self).__init__()
#         # GNN layers
#         self.conv1 = GCNConv(input_dim, hidden_dim)
#         self.conv2 = GCNConv(hidden_dim, hidden_dim)
#         # Fully connected layer for the global embedding
#         self.fc = Linear(hidden_dim, output_dim)

#     def forward(self, x, edge_index, batch):
#         # GNN layers
#         out = self.conv1(x, edge_index)
#         out = F.relu(out)
#         out = self.conv2(out, edge_index)
#         # Global pooling to create a single embedding
#         embedding = global_mean_pool(out, batch)
#         # Final embedding
#         reconstruction = self.fc(out)
#         return embedding, reconstruction

In [None]:
class TreeGNN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(TreeGNN, self).__init__()
        # GNN layers for encoding
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)

        # Linear layer for embedding generation
        self.fc_embedding = Linear(hidden_dim, output_dim)

        # Decoding layers for reconstruction
        self.fc_decode1 = Linear(hidden_dim, hidden_dim)
        self.fc_decode2 = Linear(hidden_dim, input_dim)  # Match input_dim here

    def forward(self, x, edge_index, batch):
        # Encoding
        out = self.conv1(x, edge_index)
        out = F.relu(out)
        out = self.conv2(out, edge_index)

        # Global pooling for graph-level embedding
        embedding = global_mean_pool(out, batch)
        embedding = self.fc_embedding(embedding)  # Final embedding

        # Decoding for reconstruction
        decoded = F.relu(self.fc_decode1(out))
        reconstruction = self.fc_decode2(decoded)  # Output matches input dimension

        return embedding, reconstruction

In [35]:
def min_max_normalize(array, min_val=0, max_val=100):
    """
    Normalize array to the range [0, 1] based on given min_val and max_val.
    """
    array = np.array(array, dtype=np.float32)
    return (array - min_val) / (max_val - min_val)


def read_and_process_csv(file_path, max_nodes=161, device="cpu"):
    """
    Read the CSV file without using pandas, pad the arrays to max_nodes, and convert to graph format.
    """
    graphs = []
    with open(file_path, "r") as csvfile:
        csvreader = csv.reader(csvfile)

        # Initialize rich progress bar
        with Progress() as progress:
            task = progress.add_task(
                "[cyan]Processing CSV...", total=sum(1 for _ in csvfile)
            )  # Total is calculated by counting rows
            csvfile.seek(0)  # Reset file pointer to start of the file

            # Read the file line by line
            for row in csvreader:
                row = list(map(int, row))
                # Normalize the padded row
                normalized_row = min_max_normalize(row, min_val=0, max_val=100)

                # Create node features (each node has a single feature, its value)
                node_features = torch.tensor(
                    normalized_row, dtype=torch.float, device=device
                ).view(-1, 1)

                # Create edge index for a binary tree structure
                edge_index = []
                for i in range(1, max_nodes):
                    parent = (i - 1) // 2
                    edge_index.append([parent, i])  # Parent to child
                    edge_index.append(
                        [i, parent]
                    )  # Child to parent (for undirected graph)

                edge_index = (
                    torch.tensor(edge_index, dtype=torch.long, device=device)
                    .t()
                    .contiguous()
                )

                # Single graph, so batch index is all zeros
                batch = torch.zeros(max_nodes, dtype=torch.long, device=device)

                # Create a PyTorch Geometric Data object
                graph = Data(x=node_features, edge_index=edge_index, batch=batch)
                graphs.append(graph)

                # Update progress bar
                progress.update(task, advance=1)

    return graphs

In [36]:
def set_seed_for_everything(seed=42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [41]:
set_seed_for_everything(1234)

# Define the model
input_dim = 1  # Dimension of node features
hidden_dim = 64  # Hidden dimension in the GNN layers
output_dim = 16  # Desired embedding size
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = TreeGNN(input_dim, hidden_dim, output_dim).to(device)

# File path to the CSV file
file_path = "train_data.csv"  # Replace with your CSV file path

# Process the CSV and create a dataset of graphs
graphs = read_and_process_csv(file_path, device=device)

# Split the dataset into 80% train and 20% validation
train_size = int(0.8 * len(graphs))
val_size = len(graphs) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    graphs, [train_size, val_size]
)

# Use DataLoaders for batching
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

Output()

Using device: cuda


In [42]:
# Training loop with progress bar
epochs = 10  # Number of epochs
lr = 0.01
total_samples = 0

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

print("Starting training...")
for epoch in range(epochs):  # Loop over epochs
    model.train()  # Set the model to training mode
    epoch_loss = 0  # To accumulate the loss for the epoch

    # Create the tqdm progress bar for batches
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", unit="batch"):
        optimizer.zero_grad()  # Clear the gradients
        embedding, reconstruction = model(
            batch.x, batch.edge_index, batch.batch
        )  # Forward pass
        loss = F.mse_loss(reconstruction, batch.x)
        loss.backward()  # Backpropagation
        optimizer.step()  # Update the model parameters

        # Update progress bar with the loss value
        epoch_loss += loss.item()
        batch_size = batch.x.size(0)
        total_samples += batch_size

        epoch_loss /= total_samples
        epoch_rmse = torch.sqrt(torch.tensor(epoch_loss))

Starting training...


Epoch 1/10: 100%|██████████| 21281/21281 [01:00<00:00, 350.07batch/s]
Epoch 2/10: 100%|██████████| 21281/21281 [01:00<00:00, 351.96batch/s]
Epoch 3/10: 100%|██████████| 21281/21281 [01:01<00:00, 345.52batch/s]
Epoch 4/10: 100%|██████████| 21281/21281 [01:02<00:00, 338.44batch/s]
Epoch 5/10: 100%|██████████| 21281/21281 [01:01<00:00, 348.52batch/s]
Epoch 6/10: 100%|██████████| 21281/21281 [01:01<00:00, 343.55batch/s]
Epoch 7/10: 100%|██████████| 21281/21281 [01:02<00:00, 341.31batch/s]
Epoch 8/10: 100%|██████████| 21281/21281 [01:03<00:00, 337.00batch/s]
Epoch 9/10: 100%|██████████| 21281/21281 [01:00<00:00, 353.51batch/s]
Epoch 10/10: 100%|██████████| 21281/21281 [01:00<00:00, 350.26batch/s]


In [44]:
print(epoch_rmse.item())

7.16176828063908e-07


In [40]:
# Free memory
torch.cuda.empty_cache()
del model
del graphs
del train_dataset
del val_dataset
del train_loader
del val_loader
del optimizer
del embedding