# Table of Contents
- [Imports](#imports)
- [Load Data](#load-data)
- [Split Data](#split-data)
- [Normalizing Data](#normalizing-data)
- [Data for Cost and Embodied Carbon Calculation](#data-for-cost-and-embodied-carbon-calculation)
- [Conditional Variational Autoencoder Definition](#conditional-variational-autoencoder-definition)
- [Training CVAE](#training-cvae)


# Imports

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import csv
import os


# Load data

In [9]:
# Read the raw dataset (update the path to point at your actual CSV file).
data_filepath = r"Wiley.csv"
wiley_data = pd.read_csv(data_filepath)

# Discard the columns that are not model features.
# NOTE(review): 'Unnamed: 16' is presumably an empty trailing CSV column -- confirm.
non_feature_columns = ['Index', 'Unnamed: 16']
df = wiley_data.drop(columns=non_feature_columns)

# Split the table into inputs X (every remaining column except the target)
# and the target Y, which is kept 2-D as a single-column array.
target_column = 'fc (MPa)'
X = df.drop(columns=[target_column]).values
Y = df[[target_column]].values

# Split data
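Split the data into train and test sets such that, for every feature, the test minimum and maximum lie within the training range. `MinMaxScaler` is fit on the training set only, so any test value outside that range would be scaled below 0 or above 1.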

In [10]:
# The min and max must be checked to ensure that scaling is performed properly.
# The min and max of each feature in the test must be within the range of min and max of the 
# corresponding feature in the train.
# Otherwise, the scaling of the test will generate greater than 1 or less than 0 values.
# Thus, we will find the min and max and make sure they follow the above conditions.

# Define function to check min-max conditions for both X and Y
def min_max_check(train, test):
    """Return whether every column range of ``test`` lies inside that of ``train``.

    Column-wise minima and maxima are taken along axis 0.  The result is truthy
    only when, for every column, ``train``'s minimum is <= ``test``'s minimum
    and ``train``'s maximum is >= ``test``'s maximum.
    """
    lower_bounds_hold = np.all(train.min(axis=0) <= test.min(axis=0))
    upper_bounds_hold = np.all(train.max(axis=0) >= test.max(axis=0))
    return lower_bounds_hold and upper_bounds_hold

# Re-draw the 80/20 split with seeds 42, 43, ... (at most 10 draws) until the
# test ranges of both X and Y fall inside the corresponding training ranges.
for attempt in range(10):
    seed = 42 + attempt
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=seed
    )

    features_in_range = min_max_check(X_train, X_test)
    if features_in_range and min_max_check(Y_train, Y_test):
        print(f"Valid split found on attempt {attempt + 1}")
        break
else:
    # No draw satisfied the range condition; the last split is kept as-is.
    print("Failed to find a valid split after 10 attempts.")


def _min_max_frame(values):
    """Tabulate the per-column minimum and maximum of a 2-D array."""
    return pd.DataFrame({'Min': values.min(axis=0), 'Max': values.max(axis=0)})


# Report min and max values of the split that is actually used
train_min_max_X = _min_max_frame(X_train)
test_min_max_X = _min_max_frame(X_test)
train_min_max_Y = _min_max_frame(Y_train)
test_min_max_Y = _min_max_frame(Y_test)

print("\nTrain set min-max (X):\n", train_min_max_X)
print("\nTest set min-max (X):\n", test_min_max_X)

print("\nTrain set min-max (Y):\n", train_min_max_Y)
print("\nTest set min-max (Y):\n", test_min_max_Y)


Valid split found on attempt 3

Train set min-max (X):
       Min     Max
0   270.0  1251.2
1     0.0   433.7
2     0.0   375.0
3     0.0   356.0
4     0.0   397.0
5     0.0   772.2
6     0.0    38.0
7     0.0   234.0
8     0.0  1502.8
9     0.0  1195.0
10   90.0   234.0
11    1.1    57.0
12   20.0   200.0
13   50.0   100.0
14    1.0   365.0

Test set min-max (X):
        Min     Max
0   401.00  1251.2
1     0.00   433.7
2     0.00   360.0
3     0.00   270.0
4     0.00   397.0
5     0.00   772.2
6     0.00    34.5
7     0.00   234.0
8     0.00  1502.8
9     0.00  1138.0
10  126.00   234.0
11    3.85    57.0
12   20.00    90.0
13   60.00   100.0
14    1.00   365.0

Train set min-max (Y):
     Min    Max
0  32.5  220.5

Test set min-max (Y):
     Min    Max
0  39.1  206.6


# Normalizing data

In [11]:
# Fit one scaler per array on the training split only, so that no statistics
# from the test split leak into the scaling.
scaler_X = MinMaxScaler().fit(X_train)
scaler_Y = MinMaxScaler().fit(Y_train)

# Apply the fitted scalers to both splits (transform only; never re-fit on test)
X_train_normalized = scaler_X.transform(X_train)
X_test_normalized = scaler_X.transform(X_test)
Y_train_normalized = scaler_Y.transform(Y_train)
Y_test_normalized = scaler_Y.transform(Y_test)

# Wrap the scaled arrays as float32 PyTorch tensors
X_train_tensor, Y_train_tensor, X_test_tensor, Y_test_tensor = (
    torch.tensor(scaled_array, dtype=torch.float32)
    for scaled_array in (
        X_train_normalized,
        Y_train_normalized,
        X_test_normalized,
        Y_test_normalized,
    )
)

# Batched loaders: training batches are reshuffled, test batches keep their order
# batch_size = 64 original
batch_size = 32
train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, Y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


# Data for cost and embodied carbon calculation

## Material Costs and Embodied Carbon  

### Based on the average prices per kg as of 4.3.2025 from [IndiaMART](https://dir.indiamart.com)  

| Material             | Price (₹/kg) |
|----------------------|-------------|
| Cement              | 6.25        |
| Silica fume         | 25          |
| Blast furnace slag  | 10          |
| Fly ash             | 1           |
| Quarry powder       | 21          |
| Limestone powder    | 3           |
| Nano Silica         | 965         |
| Fiber               | 75          |
| Sand                | 2.3         |
| Gravel              | 2           |
| Superplasticizer    | 61          |

### Embodied Carbon (ICE Advanced Database)  

*Source: [ICE Advanced Database](https://circularecology.com/ice-database-faqs-2.html)*  
**(Make sure to follow citation rules for ICE Database)**  

| Material             | Embodied Carbon (kg CO₂/kg) |
|----------------------|---------------------------|
| Cement              | 0.84                        |
| Blast furnace slag  | 0.08                        |
| Limestone powder    | 0.02                        |


In [12]:
# Per-feature cost and embodied-carbon coefficients (one entry per column of X).
# NOTE(review): entries are positional; the material labels below follow the
# order of the tables above and presumably match the column order of X --
# verify against the dataset header.
material_costs = np.array([
    6.25,  # Cement
    25,    # Silica fume
    10,    # Blast furnace slag
    1,     # Fly ash
    21,    # Quarry powder
    3,     # Limestone powder
    965,   # Nano Silica
    75,    # Fiber
    2.3,   # Sand
    2,     # Gravel
    0,     # no price assigned
    61,    # Superplasticizer
    0,     # no price assigned
    0,     # no price assigned
    0,     # no price assigned
])  # Cost per kg

# Embodied carbon: only three entries are non-zero (cement, blast furnace slag
# and limestone powder according to the table above); all others are 0.
embedded_carbon = np.zeros(15)
embedded_carbon[[0, 2, 5]] = [0.84, 0.08, 0.02]

# Conditional Variational Autoencoder definition

In [13]:
# Source for the formulae is official PyTorch tutorial
# Define CVAE model
class CVAE(nn.Module):
    """Conditional variational autoencoder built from fully connected layers.

    The encoder maps the concatenation of the features ``x`` and the condition
    ``y`` to the mean and log-variance of the latent code ``z``.  The decoder
    maps the concatenation of ``z`` and ``y`` back to feature space through a
    final sigmoid, so reconstructions lie in the unit interval.

    Args:
        input_dim: Number of features in ``x``.
        cond_dim: Number of features in the condition ``y``.
        latent_dim: Size of the latent code ``z``.
        encoder_layers: Widths of the encoder's hidden layers, in order.  May
            be empty, in which case the mean/log-variance heads read the
            concatenated input directly.
        decoder_layers: Widths of the decoder's hidden layers, in order.  May
            be empty, in which case the output layer reads ``(z, y)`` directly.
    """

    def __init__(self, input_dim, cond_dim, latent_dim, encoder_layers, decoder_layers):
        super().__init__()

        # Encoder: learns (z | X, Y)
        encoder_modules, encoder_out_dim = self._hidden_stack(
            input_dim + cond_dim, encoder_layers
        )
        self.encoder = nn.Sequential(*encoder_modules)

        # The heads take the width of the last hidden layer, or of the raw
        # concatenated input when there are no hidden layers.
        self.mu = nn.Linear(encoder_out_dim, latent_dim)
        self.logvar = nn.Linear(encoder_out_dim, latent_dim)

        # Decoder: learns (X' | z, Y)
        decoder_modules, decoder_out_dim = self._hidden_stack(
            latent_dim + cond_dim, decoder_layers
        )
        # Final layer maps back to the original input dimension; the sigmoid
        # matches inputs that were scaled to [0, 1].
        decoder_modules.append(nn.Linear(decoder_out_dim, input_dim))
        decoder_modules.append(nn.Sigmoid())
        self.decoder = nn.Sequential(*decoder_modules)

    @staticmethod
    def _hidden_stack(in_dim, layer_widths):
        """Build Linear+ReLU pairs for ``layer_widths``.

        Returns:
            A ``(modules, out_dim)`` tuple, where ``out_dim`` is the width of
            the last layer, or ``in_dim`` unchanged if ``layer_widths`` is empty.
        """
        modules = []
        for out_dim in layer_widths:
            modules.append(nn.Linear(in_dim, out_dim))
            modules.append(nn.ReLU())
            in_dim = out_dim
        return modules, in_dim

    def encode(self, x, y):
        """Return ``(mu, logvar)`` of the latent distribution for ``(x, y)``."""
        inputs = torch.cat((x, y), dim=1)  # Concatenate input features with condition
        h = self.encoder(inputs)
        return self.mu(h), self.logvar(h)

    def reparameterize(self, mu, logvar):
        """Sample ``z = mu + eps * std`` with ``eps ~ N(0, I)``."""
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std  # Reparametrization trick

    def decode(self, z, y):
        """Reconstruct features from the latent code ``z`` and condition ``y``."""
        inputs = torch.cat((z, y), dim=1)  # Concatenate latent space with condition
        return self.decoder(inputs)

    def forward(self, x, y):
        """Encode, sample and decode; return ``(reconstruction, mu, logvar)``."""
        mu, logvar = self.encode(x, y)
        z = self.reparameterize(mu, logvar)
        return self.decode(z, y), mu, logvar

# Define loss function: Reconstruction Loss (MSE) + KL Divergence
# + weighted material-cost and embodied-carbon penalties
def loss_function(recon_x, x, mu, logvar, c1=0.000001, c2=0.001, kld_weight=0.001):
    """Return the CVAE loss with cost and embodied-carbon penalties.

    The loss is::

        MSE(recon_x, x) + kld_weight * KLD
            + c1 * (cost(recon_x) - cost(x))
            + c2 * (carbon(recon_x) - carbon(x))

    where the cost and carbon of a batch are the sums of the denormalized
    feature values weighted by the module-level ``material_costs`` and
    ``embedded_carbon`` coefficients.

    Args:
        recon_x: Reconstructed (normalized) features from the decoder.
        x: Original (normalized) features.
        mu: Latent means from the encoder.
        logvar: Latent log-variances from the encoder.
        c1: Weight of the material-cost penalty.
        c2: Weight of the embodied-carbon penalty.
        kld_weight: Weight of the KL-divergence term.

    Returns:
        A scalar tensor, summed (not averaged) over the batch.
    """
    # MSE Loss (Reconstruction Loss)
    mse = nn.functional.mse_loss(recon_x, x, reduction='sum')

    # KLD (KL Divergence)
    kld = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

    # Denormalize with torch ops rather than scaler_X.inverse_transform on a
    # detached NumPy copy: detaching removes the penalties from the autograd
    # graph, so they would contribute no gradient to training.  MinMaxScaler
    # computes X_scaled = X * scale_ + min_, hence X = (X_scaled - min_) / scale_.
    tensor_kwargs = {"dtype": recon_x.dtype, "device": recon_x.device}
    scale = torch.as_tensor(scaler_X.scale_, **tensor_kwargs)
    offset = torch.as_tensor(scaler_X.min_, **tensor_kwargs)
    x_denormalized = (x - offset) / scale
    recon_x_denormalized = (recon_x - offset) / scale

    # Per-feature coefficients on the same dtype/device as the batch
    material_costs_tensor = torch.as_tensor(material_costs, **tensor_kwargs)
    embedded_carbon_tensor = torch.as_tensor(embedded_carbon, **tensor_kwargs)

    # Penalties: how much more the reconstruction costs / emits than the input
    material_costs_loss = (
        torch.sum(material_costs_tensor * recon_x_denormalized)
        - torch.sum(material_costs_tensor * x_denormalized)
    )
    embedded_carbon_loss = (
        torch.sum(embedded_carbon_tensor * recon_x_denormalized)
        - torch.sum(embedded_carbon_tensor * x_denormalized)
    )

    # Standard VAE objective with a down-weighted KL term
    vae_loss = mse + kld_weight * kld

    # Total loss: VAE objective plus the weighted penalties
    return vae_loss + c1 * material_costs_loss + c2 * embedded_carbon_loss

# Model dimensions shared by every architecture trained below
input_dim = X.shape[-1]  # one input per feature column of X
# cond_dim: 'fc (MPa)' is the single condition; latent_dim: size of latent space
cond_dim, latent_dim = 1, 2



# Training CVAE

In [14]:
# Hidden-layer widths of each encoder architecture to train
encoder_layers_list = [
    # single hidden layer
    [16], [32], [64], [128], [256],
    # two hidden layers
    [32, 16], [128, 64], [256, 128],
    # three hidden layers
    [32, 16, 8], [256, 128, 64],
    # five hidden layers
    [32, 16, 8, 4, 3],
]

# Each decoder mirrors its encoder: the same widths in reverse order
decoder_layers_list = [widths[::-1] for widths in encoder_layers_list]


In [15]:
# Train one CVAE per (encoder, decoder) architecture pair.  Each run logs its
# per-epoch train/test loss to a CSV file and saves the trained model and the
# optimizer state when it finishes.
count_architectures = len(encoder_layers_list)
if len(decoder_layers_list) != count_architectures:
    raise ValueError(
        "encoder_layers_list and decoder_layers_list must have the same length"
    )

# Create the output directory up front; open(..., "w") does not create it.
output_dir = "output_vae_full_weight"
os.makedirs(output_dir, exist_ok=True)

epochs = 5000

for encoder_layers, decoder_layers in zip(encoder_layers_list, decoder_layers_list):
    model = CVAE(input_dim, cond_dim, latent_dim, encoder_layers, decoder_layers)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    # Generate encoder name string, e.g. "Encoder_32_16"
    encoder_name = "Encoder_" + "_".join(str(width) for width in encoder_layers)

    # Create file paths; the tag uses the batch size the loaders were built with
    run_tag = f"{encoder_name}_batch_{batch_size}"
    loss_log_file = f"{output_dir}/output_file_vae_{run_tag}.csv"
    model_path = f"{output_dir}/cvae_model_{run_tag}.pth"
    optimizer_path = f"{output_dir}/cvae_optimizer_{run_tag}.pth"

    # Losses are summed over samples, so they are averaged per sample below
    train_sample_count = len(train_loader.dataset)
    test_sample_count = len(test_loader.dataset)

    # Keep the log open for the whole run instead of reopening it every epoch
    with open(loss_log_file, mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["Epoch", "Train Loss", "Test Loss"])

        # Training loop
        for epoch in range(epochs):
            model.train()
            train_loss = 0
            for x_batch, y_batch in train_loader:
                optimizer.zero_grad()
                recon_x, mu, logvar = model(x_batch, y_batch)
                # Calculate losses
                loss = loss_function(recon_x, x_batch, mu, logvar)
                loss.backward()
                optimizer.step()
                train_loss += loss.item()

            # Evaluation on test set
            model.eval()
            test_loss = 0
            with torch.no_grad():
                for x_batch, y_batch in test_loader:
                    recon_x, mu, logvar = model(x_batch, y_batch)
                    loss = loss_function(recon_x, x_batch, mu, logvar)
                    test_loss += loss.item()

            writer.writerow([
                epoch,
                train_loss / train_sample_count,
                test_loss / test_sample_count,
            ])
            # Flush each epoch so the log survives an interrupted run
            file.flush()

    # Save model and optimizer.
    # NOTE: torch.save(model, ...) pickles the whole module, so loading it
    # later requires the CVAE class to be importable under the same name.
    torch.save(model, model_path)
    torch.save(optimizer.state_dict(), optimizer_path)

