In [3]:
import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
import numpy as np

In [2]:
# Load dataset: try tab-separated first, then fall back to ';'.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run on another machine after this is pointed at a local copy of the data.
file_path = '/Users/m.mughees/Desktop/2025-IEEE_SciVis-Contest-Mughees/data/Dataset_VisContest_Rapid_Alloy_development_v3.txt'
try:
    df = pd.read_csv(file_path, sep='\t', encoding='ISO-8859-1')
    # A wrong delimiter usually does NOT raise: every line is parsed as one
    # field and the result is a single wide column. Treat that as a miss.
    wrong_delimiter = df.shape[1] == 1
except pd.errors.ParserError:
    # Ragged rows under the tab assumption also indicate the wrong delimiter.
    # Other errors (e.g. a missing file) are left to propagate unchanged.
    wrong_delimiter = True

if wrong_delimiter:
    df = pd.read_csv(file_path, sep=';', encoding='ISO-8859-1')

In [3]:

# Quantities the models will predict
target_cols = ['YS(MPa)', 'CSC', 'Therm.conductivity(W/(mK))']

# Predictors: element composition only, for now
composition_cols = ['Al', 'Cu', 'Mg', 'Mn', 'Fe', 'Si', 'Zn', 'Ni', 'Cr', 'Ti']

# Possible later extension: microstructure columns, e.g.
# micro_cols = [col for col in df.columns if col.startswith(('Vf_', 'T_', 'delta_T'))]

# Keep only the modelling columns and discard any row with a missing value
# in either a predictor or a target.
model_columns = [*composition_cols, *target_cols]
df_model = df.loc[:, model_columns].dropna()

# Report the resulting shapes and preview the first rows
features_view = df_model[composition_cols]
targets_view = df_model[target_cols]
print("✅ Data prepared!")
print("Input shape:", features_view.shape)
print("Target shape:", targets_view.shape)
df_model.head()


✅ Data prepared!
Input shape: (324632, 10)
Target shape: (324632, 3)


Unnamed: 0,Al,Cu,Mg,Mn,Fe,Si,Zn,Ni,Cr,Ti,YS(MPa),CSC,Therm.conductivity(W/(mK))
0,83.675,0.9,1.05,0.025,0.55,12.25,0.125,1.3,0.05,0.025,384.67201,0.478438,159.046
1,84.11885,0.874425,1.017,0.065425,0.55495,11.86555,0.122525,1.2571,0.04835,0.024175,383.73928,0.479781,160.429
2,84.5627,0.84885,0.984,0.10585,0.5599,11.4811,0.12005,1.2142,0.0467,0.02335,381.73599,0.480653,161.346
3,85.00655,0.823275,0.951,0.146275,0.56485,11.09665,0.117575,1.1713,0.04505,0.022525,379.77859,0.480645,162.105
4,85.4504,0.7977,0.918,0.1867,0.5698,10.7122,0.1151,1.1284,0.0434,0.0217,377.97213,0.479638,163.127


In [4]:
# Feature matrix: the ten element-composition columns
X = df_model.loc[:, ['Al', 'Cu', 'Mg', 'Mn', 'Fe', 'Si', 'Zn', 'Ni', 'Cr', 'Ti']]

# One Series per target property
y_ys, y_csc, y_cond = (
    df_model[column]
    for column in ('YS(MPa)', 'CSC', 'Therm.conductivity(W/(mK))')
)

# --- Hold out 20% of the rows for testing (yield-strength target only) ---
X_train, X_test, y_train, y_test = train_test_split(
    X, y_ys, random_state=42, test_size=0.2
)

# --- Standardise features; statistics are learned from the training rows only ---
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [5]:
# Sanity check (possible leakage): print summary statistics for each target,
# then list the feature columns to confirm that X holds only composition columns.
for target_series in (y_ys, y_csc, y_cond):
    print(target_series.describe())

print(X.columns)



count    324632.000000
mean        277.797555
std          41.714142
min         194.422830
25%         245.984650
50%         264.841275
75%         306.806645
max         423.404970
Name: YS(MPa), dtype: float64
count    324632.000000
mean          0.456215
std           0.063712
min           0.313578
25%           0.424489
50%           0.448343
75%           0.468313
max           1.171836
Name: CSC, dtype: float64
count    324632.000000
mean        176.167594
std           8.009061
min         148.924000
25%         170.532000
50%         176.232000
75%         181.953000
max         198.787000
Name: Therm.conductivity(W/(mK)), dtype: float64
Index(['Al', 'Cu', 'Mg', 'Mn', 'Fe', 'Si', 'Zn', 'Ni', 'Cr', 'Ti'], dtype='object')


In [8]:
# Train a linear-regression baseline and a random forest for each target
# property, then report held-out RMSE / R² and the forest's feature importances.

# --- Input features ---
X = df_model[['Al', 'Cu', 'Mg', 'Mn', 'Fe', 'Si', 'Zn', 'Ni', 'Cr', 'Ti']]

# --- Target variables ---
# Keys are the display labels used in the printed report.
targets = {
    "YS (MPa)": df_model["YS(MPa)"],
    "CSC": df_model["CSC"],
    "Thermal Conductivity (W/mK)": df_model["Therm.conductivity(W/(mK))"]
}

# --- Store models and metrics ---
models = {}   # format: models["YS (MPa)"]["rf"] = model  (also "lr")
metrics = {}  # format: metrics["YS (MPa)"]["Random Forest"] = {"RMSE": ..., "R²": ...}
importances = {}  # format: importances["YS (MPa)"] = [(feature, importance), ...], descending

# --- Split once ---
# The shuffled row split depends only on the number of rows and random_state,
# so it is the same for every target. Splitting row positions once and reusing
# them gives the same train/test rows as calling train_test_split(X, y, ...)
# separately for each target.
train_idx, test_idx = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42
)
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]

# --- Scale once ---
# Statistics are fit on the training rows only. Both models below are trained
# on the scaled features, so `scaler` must also be applied to any new inputs
# before calling models[...][...].predict.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

for name, y in targets.items():
    # Target values for the shared train/test rows
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # Initialize models
    lr = LinearRegression()
    # n_jobs=-1 builds the trees on all available cores; the forest is still
    # seeded by random_state.
    rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

    # Fit models
    lr.fit(X_train_scaled, y_train)
    rf.fit(X_train_scaled, y_train)

    # Predict
    y_pred_lr = lr.predict(X_test_scaled)
    y_pred_rf = rf.predict(X_test_scaled)

    mse_lr = mean_squared_error(y_test, y_pred_lr)
    mse_rf = mean_squared_error(y_test, y_pred_rf)

    # Evaluate (RMSE is in the units of the target)
    metrics[name] = {
        "Linear Regression": {
            "RMSE": np.sqrt(mse_lr),
            "R²": r2_score(y_test, y_pred_lr)
        },
        "Random Forest": {
            "RMSE": np.sqrt(mse_rf),
            "R²": r2_score(y_test, y_pred_rf)
        }
    }

    # Store trained models
    models[name] = {"lr": lr, "rf": rf}

    # Store feature importances for Random Forest, most important first
    importances[name] = sorted(
        zip(X.columns, rf.feature_importances_),
        key=lambda x: x[1], reverse=True
    )

# --- Display metrics ---
for target, model_results in metrics.items():
    print(f"\n📈 Results for: {target}")
    for model_name, result in model_results.items():
        print(f"  🔹 {model_name}")
        print(f"     RMSE: {result['RMSE']:.4f}")
        print(f"     R²: {result['R²']:.4f}")


# --- Display feature importances ---
for target, feature_list in importances.items():
    print(f"\n🔍 Top Features for: {target}")
    for feat, score in feature_list:
        print(f"   {feat}: {score:.4f}")


📈 Results for: YS (MPa)
  🔹 Linear Regression
     RMSE: 11.0944
     R²: 0.9292
  🔹 Random Forest
     RMSE: 0.3548
     R²: 0.9999

📈 Results for: CSC
  🔹 Linear Regression
     RMSE: 0.0520
     R²: 0.3290
  🔹 Random Forest
     RMSE: 0.0012
     R²: 0.9997

📈 Results for: Thermal Conductivity (W/mK)
  🔹 Linear Regression
     RMSE: 1.0614
     R²: 0.9823
  🔹 Random Forest
     RMSE: 0.0926
     R²: 0.9999

🔍 Top Features for: YS (MPa)
   Si: 0.9371
   Al: 0.0406
   Mn: 0.0071
   Cu: 0.0051
   Zn: 0.0039
   Fe: 0.0031
   Ti: 0.0019
   Mg: 0.0010
   Ni: 0.0001
   Cr: 0.0001

🔍 Top Features for: CSC
   Ni: 0.5064
   Cu: 0.1839
   Al: 0.1426
   Si: 0.1334
   Ti: 0.0198
   Zn: 0.0073
   Mg: 0.0035
   Mn: 0.0025
   Cr: 0.0003
   Fe: 0.0002

🔍 Top Features for: Thermal Conductivity (W/mK)
   Al: 0.9302
   Cu: 0.0515
   Mn: 0.0104
   Zn: 0.0044
   Si: 0.0024
   Ni: 0.0005
   Cr: 0.0003
   Fe: 0.0002
   Ti: 0.0001
   Mg: 0.0001


In [4]:
# Define element columns
elements = ['Al', 'Cu', 'Mg', 'Mn', 'Fe', 'Si', 'Zn', 'Ni', 'Cr', 'Ti']

# Function to generate N new alloys with random but constrained percentages
def generate_new_alloys(n_samples=1000, total_percentage=100, seed=42):
    """Generate random alloy compositions whose element shares sum to a fixed total.

    Each row is drawn as independent uniform(0, 1) weights, one per entry in
    ``elements``, and then rescaled so the row sums to ``total_percentage``.

    Parameters
    ----------
    n_samples : int, default 1000
        Number of compositions (rows) to generate. Must be >= 0.
    total_percentage : float, default 100
        Value each row sums to.
    seed : int or None, default 42
        Seed for a private random generator. The default reproduces the same
        rows on every call; pass ``None`` for a different draw each time.

    Returns
    -------
    pandas.DataFrame
        Shape ``(n_samples, len(elements))`` with one column per element.

    Raises
    ------
    ValueError
        If ``n_samples`` is negative.

    Notes
    -----
    NOTE(review): the training rows shown earlier in this notebook have
    Al around 84-85 %, whereas this sampler spreads mass roughly evenly over
    all ten elements. Predictions for such rows would be far outside the
    sampled training rows; confirm that this is intended.
    """
    if n_samples < 0:
        raise ValueError(f"n_samples must be non-negative, got {n_samples}")

    # A private RandomState avoids reseeding NumPy's global RNG as a side
    # effect. RandomState(42) yields the same stream np.random.seed(42) did.
    rng = np.random.RandomState(seed)

    # Draw all rows at once; values fill row by row, so row i matches the
    # i-th sequential draw of len(elements) values.
    weights = rng.rand(n_samples, len(elements))
    weights /= weights.sum(axis=1, keepdims=True)  # normalize each row to sum to 1

    df_new_alloys = pd.DataFrame(weights * total_percentage, columns=elements)
    return df_new_alloys

In [5]:
# Draw a batch of 1000 candidate compositions and preview the first rows
new_alloys_df = generate_new_alloys(1000)
preview = new_alloys_df.head()
print("✅ Generated New Alloy Compositions:")
print(preview)

✅ Generated New Alloy Compositions:
          Al         Cu         Mg         Mn         Fe         Si        Zn  \
0   7.200801  18.278161  14.073106  11.509637   2.999570   2.999106  1.116699   
1   0.520773  24.538041  21.060217   5.372031   4.600045   4.640006  7.697116   
2  15.281528   3.483974   7.296552   9.150188  11.390722  19.610414  4.987017   
3  11.929704   3.348399   1.277348  18.632243  18.961076  15.873628  5.981372   
4   3.016589  12.239978   0.850029  22.476941   6.396626  16.376487  7.704997   

          Ni         Cr         Ti  
0  16.652855  11.556865  13.613201  
1  13.275971  10.927907   7.367894  
2  12.843427  14.796040   1.160137  
3   1.917882  13.435547   8.642800  
4  12.855247  13.513800   4.569305  
