# Function Definitions

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
import json

def create_fixed_train_test_splits(csv_file, n_splits=5, test_size=0.2, random_seed=42, output_dir="splits"):
    """
    Creates and saves fixed train-test splits for a dataset.

    Every split is drawn independently from the full dataset with its own
    seed (``random_seed + split_idx``), so the test sets of different splits
    may overlap. Split ``i`` is written to ``split_{i}_train.csv`` and
    ``split_{i}_test.csv`` inside ``output_dir``, and a ``split_info.json``
    manifest describing all splits is written next to them.

    Args:
        csv_file: Path to the CSV file containing the dataset
        n_splits: Number of different train-test splits to create
        test_size: Proportion of the dataset to include in the test split (0.2 = 20%)
        random_seed: Base random seed (will be incremented for each split)
        output_dir: Directory to save the splits (created if missing)

    Returns:
        Dictionary with information about the saved splits:
        "n_splits", "test_size" (the requested proportion) and "splits",
        a list with one dict per split holding "split_idx", "seed",
        "train_file", "test_file", "train_size" and "test_size"
        (the last two are row counts).
    """
    # exist_ok=True replaces the separate os.path.exists() check, which left
    # a window between the check and the creation of the directory.
    os.makedirs(output_dir, exist_ok=True)

    # Load the dataset
    print(f"Loading dataset from {csv_file}")
    df = pd.read_csv(csv_file)
    print(f"Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")

    # Information about saved splits
    split_info = {"n_splits": n_splits, "test_size": test_size, "splits": []}

    # Create and save splits
    for split_idx in range(n_splits):
        # A distinct, deterministic seed per split keeps every split reproducible
        current_seed = random_seed + split_idx
        print(f"\nCreating split {split_idx + 1}/{n_splits} with seed {current_seed}")

        # Create train-test split
        train_df, test_df = train_test_split(
            df,
            test_size=test_size,
            random_state=current_seed,
            shuffle=True
        )

        # Save files (index=False: the shuffled row index is not written out)
        train_file = os.path.join(output_dir, f"split_{split_idx}_train.csv")
        test_file = os.path.join(output_dir, f"split_{split_idx}_test.csv")

        train_df.to_csv(train_file, index=False)
        test_df.to_csv(test_file, index=False)

        # Store information about this split
        split_info["splits"].append({
            "split_idx": split_idx,
            "seed": current_seed,
            "train_file": train_file,
            "test_file": test_file,
            "train_size": len(train_df),
            "test_size": len(test_df)
        })

        print(f"  Train set saved to {train_file}: {len(train_df)} rows")
        print(f"  Test set saved to {test_file}: {len(test_df)} rows")

    # Save split information to a JSON file
    info_file = os.path.join(output_dir, "split_info.json")
    with open(info_file, 'w') as f:
        json.dump(split_info, f, indent=2)

    print(f"\nAll splits saved to {output_dir}")
    print(f"Split information saved to {info_file}")

    return split_info

def load_train_test_split(split_idx, output_dir="splits"):
    """
    Reads one previously saved train-test split back from disk.

    Args:
        split_idx: Index of the split to load (0-indexed)
        output_dir: Directory where the splits are saved

    Returns:
        Tuple of (train_df, test_df) - pandas DataFrames for the train and test sets

    Raises:
        FileNotFoundError: If the train or the test CSV of this split is missing
    """
    train_path = os.path.join(output_dir, f"split_{split_idx}_train.csv")
    test_path = os.path.join(output_dir, f"split_{split_idx}_test.csv")

    # A split is only usable when both of its files are present
    both_present = os.path.exists(train_path) and os.path.exists(test_path)
    if not both_present:
        raise FileNotFoundError(f"Split {split_idx} not found in {output_dir}")

    print(f"Loading split {split_idx}...")
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Report the sizes of what was read
    print(f"Train set loaded: {len(train_df)} rows")
    print(f"Test set loaded: {len(test_df)} rows")

    return train_df, test_df

def load_all_train_test_splits(output_dir="splits"):
    """
    Loads every train-test split listed in the split information file.

    Args:
        output_dir: Directory where the splits are saved

    Returns:
        Tuple (splits_train, splits_test) of two lists of equal length.
        Position i of each list holds the train / test DataFrame of the
        i-th entry recorded in split_info.json.

    Raises:
        FileNotFoundError: If split_info.json is missing from output_dir, or
            if load_train_test_split cannot find the files of a listed split
    """
    info_file = os.path.join(output_dir, "split_info.json")
    if not os.path.exists(info_file):
        raise FileNotFoundError(f"Split information file not found: {info_file}")

    with open(info_file, 'r') as f:
        manifest = json.load(f)

    # Load the splits one by one, in the order they were recorded
    loaded_pairs = [
        load_train_test_split(entry["split_idx"], output_dir)
        for entry in manifest["splits"]
    ]

    # Separate the (train, test) pairs into two parallel lists
    splits_train = [train_part for train_part, _ in loaded_pairs]
    splits_test = [test_part for _, test_part in loaded_pairs]

    return splits_train, splits_test

# Saving Splits

In [2]:
# Write five seeded train/test splits of the master table into alloy_splits/.
# test_size and random_seed are not passed, so the function defaults
# (0.2 and 42) apply.
split_info = create_fixed_train_test_splits(
    csv_file="HTMDEC_MasterTable_Iterations_v5.csv",
    n_splits=5,
    output_dir="alloy_splits"
)

Loading dataset from HTMDEC_MasterTable_Iterations_v5.csv
Dataset loaded: 147 rows, 52 columns

Creating split 1/5 with seed 42
  Train set saved to alloy_splits/split_0_train.csv: 117 rows
  Test set saved to alloy_splits/split_0_test.csv: 30 rows

Creating split 2/5 with seed 43
  Train set saved to alloy_splits/split_1_train.csv: 117 rows
  Test set saved to alloy_splits/split_1_test.csv: 30 rows

Creating split 3/5 with seed 44
  Train set saved to alloy_splits/split_2_train.csv: 117 rows
  Test set saved to alloy_splits/split_2_test.csv: 30 rows

Creating split 4/5 with seed 45
  Train set saved to alloy_splits/split_3_train.csv: 117 rows
  Test set saved to alloy_splits/split_3_test.csv: 30 rows

Creating split 5/5 with seed 46
  Train set saved to alloy_splits/split_4_train.csv: 117 rows
  Test set saved to alloy_splits/split_4_test.csv: 30 rows

All splits saved to alloy_splits
Split information saved to alloy_splits/split_info.json


# Reading Splits

In [3]:
# Later, to load a single split by its 0-based index:
train_df, test_df = load_train_test_split(split_idx=0, output_dir="alloy_splits")

# Or to load every split listed in split_info.json. The result is two
# parallel lists: the training frames and the matching test frames.
splits_train,splits_test = load_all_train_test_splits(output_dir="alloy_splits")

Loading split 0...
Train set loaded: 117 rows
Test set loaded: 30 rows
Loading split 0...
Train set loaded: 117 rows
Test set loaded: 30 rows
Loading split 1...
Train set loaded: 117 rows
Test set loaded: 30 rows
Loading split 2...
Train set loaded: 117 rows
Test set loaded: 30 rows
Loading split 3...
Train set loaded: 117 rows
Test set loaded: 30 rows
Loading split 4...
Train set loaded: 117 rows
Test set loaded: 30 rows


In [4]:
# Sanity check: number of training frames loaded (one per saved split)
len(splits_train)

5

In [5]:
# Sanity check: number of test frames loaded (one per saved split)
len(splits_test)

5

In [6]:
# Inspect the test set of the first split (index 0)
splits_test[0]

Unnamed: 0,Year,Iteration,Alloy Name,Al,Co,Cr,Cu,Fe,Mn,Ni,...,Indentation Depth (µm),Depth of Penetration (mm) FE_Sim,Grain Size (µm),Cracked,Processing,Recrystallization,XRD Phase,Tension Test Designation,Grain Size(um),Notes
0,2,BBB,BBB11,0,24,4,0,4,32,20,...,,,,0.0,Cold Rolled,950C for 30min/ water quench,FCC+σ,,,
1,1,AAD,AAD04,0,20,5,0,25,0,35,...,,,,0.0,Forged,No Treatment,FCC,Ductile,,
2,2,BBC,BBC08,0,8,0,4,4,28,36,...,,2.56,,,Cold Rolled,950C for 30min/ water quench,FCC+σ,,,
3,1,AAB,AAB04,0,25,10,0,25,0,35,...,,,,0.0,Forged,No Treatment,FCC,Ductile,,
4,2,BBA,BBA09,4,16,8,0,12,0,48,...,,2.67,,0.0,Cold Rolled,950C for 30min/ water quench,FCC,,30.79,"Sample Remade due to chemistry issues, failed ..."
5,1,AAA,AAA13,5,10,5,0,25,0,50,...,,,,0.0,Forged,No Treatment,FCC,Brittle,,
6,1,AAE,AAE13,0,10,5,0,10,0,55,...,,,,0.0,Forged,No Treatment,FCC,Ductile,,
7,1,AAB,AAB16,15,5,0,0,15,0,65,...,,,,0.0,Forged,No Treatment,L12,Ductile,,
8,2,BAA,BAA02,4,16,0,4,12,8,52,...,,2.96,,0.0,Forged,925C for 30min/ water quench,FCC,,19.0,
9,1,AAA,AAA10,0,5,15,0,5,0,70,...,,,,0.0,Forged,No Treatment,FCC,Ductile,,


# Input data for prior prediction from the 5 trained encoder-decoder models

In [7]:
import pandas as pd

def extract_element_composition(csv_file, element_columns=None):
    """
    Reads the HTMDEC CSV file and returns a DataFrame containing only
    the element composition columns.

    Args:
        csv_file: Path to the CSV file
        element_columns: Optional iterable of column names to extract, in the
            order they should appear in the result. Defaults to
            Al, Co, Cr, Cu, Fe, Mn, Ni, V.

    Returns:
        DataFrame with only the requested columns (by default
        Al, Co, Cr, Cu, Fe, Mn, Ni, V), one row per row of the CSV file

    Raises:
        ValueError: If any requested column is missing from the CSV file
    """
    # Define the columns to extract (a fresh list on every call, so the
    # default is never shared between calls)
    if element_columns is None:
        element_columns = ['Al', 'Co', 'Cr', 'Cu', 'Fe', 'Mn', 'Ni', 'V']
    else:
        element_columns = list(element_columns)

    # Read the CSV file
    print(f"Loading dataset from {csv_file}")
    df = pd.read_csv(csv_file)
    print(f"Full dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")

    # Check if all required columns exist
    missing_columns = [col for col in element_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"The following columns are missing from the dataset: {missing_columns}")

    # Extract only the element columns; the explicit copy gives the caller a
    # frame that is independent of the full table read above
    elements_df = df[element_columns].copy()
    print(f"Extracted element composition: {elements_df.shape[0]} rows, {elements_df.shape[1]} columns")

    return elements_df


# Pull the element composition columns out of the full master table
elements_df = extract_element_composition("HTMDEC_MasterTable_Iterations_v5.csv")

# Display the first few rows
print("\nFirst 5 rows of element composition:")
print(elements_df.head(5))

# Save the extracted composition table to element_composition.csv
# (index=False: the row index is not written as a column)
elements_df.to_csv("element_composition.csv", index=False)

Loading dataset from HTMDEC_MasterTable_Iterations_v5.csv
Full dataset loaded: 147 rows, 52 columns
Extracted element composition: 147 rows, 8 columns

First 5 rows of element composition:
   Al  Co  Cr  Cu  Fe  Mn  Ni   V
0   0  45  10   0  20   0  15  10
1   0  30  10   0   5   0  45  10
2   0  30   5   0  30   0  20  15
3   0  25  10   0  20   0  40   5
4   0  10  10   0  55   0  25   0


In [8]:
# (rows, columns) of the extracted composition table
elements_df.shape

(147, 8)