In [12]:
import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
import numpy as np

In [4]:
# Load the dataset. The file is read as tab-separated first, with ';' as the
# alternative delimiter.
# NOTE(review): this is an absolute, machine-specific path; consider a
# project-relative data directory so the notebook runs on other machines.
file_path = '/Users/m.mughees/Desktop/2025-IEEE_SciVis-Contest-Mughees/data/Dataset_VisContest_Rapid_Alloy_development_v3.txt'
try:
    df = pd.read_csv(file_path, sep='\t', encoding='ISO-8859-1')
except pd.errors.ParserError:
    # Rows with inconsistent field counts under a tab split: retry with ';'.
    # Only parser errors trigger the retry, so a missing file or permission
    # problem surfaces immediately instead of being retried on the same path.
    df = pd.read_csv(file_path, sep=';', encoding='ISO-8859-1')
else:
    # A wrong delimiter usually does not raise at all: every line is read as a
    # single field, producing a one-column frame. Detect that and re-read.
    if df.shape[1] == 1:
        df = pd.read_csv(file_path, sep=';', encoding='ISO-8859-1')

In [5]:

# Prediction targets (model outputs)
target_cols = ['YS(MPa)', 'CSC', 'Therm.conductivity(W/(mK))']

# Model inputs: restricted to the element-composition columns for now
composition_cols = ['Al', 'Cu', 'Mg', 'Mn', 'Fe', 'Si', 'Zn', 'Ni', 'Cr', 'Ti']

# Possible later extension: microstructure features
# micro_cols = [col for col in df.columns if col.startswith(('Vf_', 'T_', 'delta_T'))]

# Keep only the modelling columns (inputs first, then targets) and discard
# every row that has a missing value in any of them
df_model = df.loc[:, [*composition_cols, *target_cols]].dropna()

# Report the resulting shapes and preview the first rows
print("✅ Data prepared!")
print("Input shape:", df_model.loc[:, composition_cols].shape)
print("Target shape:", df_model.loc[:, target_cols].shape)
df_model.head()


✅ Data prepared!
Input shape: (324632, 10)
Target shape: (324632, 3)


Unnamed: 0,Al,Cu,Mg,Mn,Fe,Si,Zn,Ni,Cr,Ti,YS(MPa),CSC,Therm.conductivity(W/(mK))
0,83.675,0.9,1.05,0.025,0.55,12.25,0.125,1.3,0.05,0.025,384.67201,0.478438,159.046
1,84.11885,0.874425,1.017,0.065425,0.55495,11.86555,0.122525,1.2571,0.04835,0.024175,383.73928,0.479781,160.429
2,84.5627,0.84885,0.984,0.10585,0.5599,11.4811,0.12005,1.2142,0.0467,0.02335,381.73599,0.480653,161.346
3,85.00655,0.823275,0.951,0.146275,0.56485,11.09665,0.117575,1.1713,0.04505,0.022525,379.77859,0.480645,162.105
4,85.4504,0.7977,0.918,0.1867,0.5698,10.7122,0.1151,1.1284,0.0434,0.0217,377.97213,0.479638,163.127


In [8]:
# Feature matrix: reuse the shared composition_cols list instead of repeating
# the ten column names, so the model inputs cannot drift from the columns
# selected when df_model was built.
X = df_model[composition_cols]
y_ys = df_model['YS(MPa)']
y_csc = df_model['CSC']
y_cond = df_model['Therm.conductivity(W/(mK))']

# --- Split into train/test ---
# Only the YS(MPa) target is split here; y_csc and y_cond are extracted above
# but not used in this cell.
# NOTE(review): the rows previewed by df_model.head() change by constant steps
# from one row to the next, which suggests the table may contain interpolated
# series. If so, a random row split can place near-identical neighbours in both
# train and test and inflate test scores. TODO confirm and consider a grouped split.
X_train, X_test, y_train, y_test = train_test_split(X, y_ys, test_size=0.2, random_state=42)

# --- Scale features ---
# Fit the scaler on the training rows only, then apply that same fitted
# transform to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [10]:
# --- Train models ---
# Baseline: linear regression on the scaled training features
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

# Ensemble: 100-tree random forest with a fixed seed, same training data
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf.fit(X_train_scaled, y_train)

In [13]:
# --- Predict ---
# Run each fitted model (linear first, then forest) on the scaled test features
y_pred_lr, y_pred_rf = (model.predict(X_test_scaled) for model in (lr, rf))

# --- Evaluate ---
def print_metrics(y_true, y_pred, name):
    """Print RMSE and R² for one set of predictions under a labelled header.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values to compare against ``y_true``.
        name: Label shown in the header line.

    Returns:
        None. The report is written to stdout and ends with a blank line.
    """
    print(f"📊 {name}")
    # RMSE is the square root of the mean squared error
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print("RMSE:", rmse)
    r_squared = r2_score(y_true, y_pred)
    print("R²:", r_squared)
    print()

# Test-set metrics for each model, linear baseline first
print_metrics(y_true=y_test, y_pred=y_pred_lr, name="Linear Regression")
print_metrics(y_true=y_test, y_pred=y_pred_rf, name="Random Forest")

📊 Linear Regression
RMSE: 11.094402725331205
R²: 0.9292097162106319

📊 Random Forest
RMSE: 0.3547846037358918
R²: 0.9999276072074553

