### XGBoost Model

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold
)
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score
)
import xgboost as xgb


# Location of the input table; the file name looks like a placeholder, so
# point it at the real CSV before running.
csv_path = "input_file_path.csv"
df = pd.read_csv(csv_path)


# Model inputs, in the exact column order fed to the regressor:
# first the element-symbol columns, then the aggregate descriptors.
_ELEMENT_COLUMNS = (
    "Ti Er Rb Ho Pu Cs Zr S Tb N Ge Pd Mg Re La K "
    "Hf P Br Ag Os F Sc Cm Mo In Cl Hg Se Tm Ir W "
    "Th H Te Np Zn Li Gd Ni Co Bi I Pr Cd Nb Pa Pt "
    "Si U V Sb Mn Na Ce Yb Ta Nd Rh O Au Sr Eu C "
    "Pb Ca Cr Cu Ga Fe Y As Sn B Ba Dy Be Sm Lu Al "
    "Tl Ru"
).split()

_DESCRIPTOR_COLUMNS = [
    'Avg_Atomic_Number',
    'Average_Weight',
    'Average_Electronegativity',
    'Magnetic_proportion',
    'Entropy',
    'average_period',
    'avg_magnetic_moment',
    'average_group',
    'Rare_Earth_proportion',
]

FEATURES = _ELEMENT_COLUMNS + _DESCRIPTOR_COLUMNS

# Name of the regression target column.
TARGET = 'Mean_TC_K'

# Design matrix and regression target.
X, y = df[FEATURES], df[TARGET]

# Discretise the continuous target into 7 intervals (integer codes, because
# labels=False) so a regression target can be used for stratification.
TC_BIN_EDGES = [-np.inf,  60, 165, 270, 325, 500, 665, np.inf]
bins = pd.cut(y, bins=TC_BIN_EDGES, labels=False)

# 80/20 hold-out split stratified on the target bins.  The bin codes are
# split together with X and y so every training row keeps its bin label.
split_parts = train_test_split(
    X, y, bins,
    test_size=0.20,
    random_state=42,
    stratify=bins,
)
X_train, X_test, y_train, y_test, bins_train, bins_test = split_parts


# Number of training rows per target bin, ordered by bin code; the smallest
# bin sets the per-bin sample size used for undersampling further down.
train_bin_counts = pd.Series(bins_train).value_counts().sort_index()
minority_bin_size = train_bin_counts.min()

print("Train bin counts:\n", train_bin_counts)
print(f"\nSmallest bin size = {minority_bin_size} samples\n")


# Fixed XGBoost hyper-parameters; unpacked with ** into every XGBRegressor
# constructed in this notebook.
best_params = dict(
    n_estimators=1200,
    max_depth=12,
    learning_rate=0.08,
    subsample=0.8,
    colsample_bytree=0.6,
)


def _balanced_positions(bin_labels, bin_order, per_bin, seed):
    """Pick positional row indices with at most ``per_bin`` rows from every bin.

    Args:
        bin_labels: Bin code of each candidate row (array-like).
        bin_order: Bin codes to visit; the result is concatenated in this
            order, which fixes the row order of the training subset.
        per_bin: Maximum number of rows kept from a single bin.
        seed: Seed for the random draw.  A fresh ``RandomState(seed)`` is
            created for every bin, so each bin's sample depends only on the
            seed and on that bin's own rows.

    Returns:
        1-D integer array of positions into ``bin_labels`` (use with ``.iloc``).
    """
    labels = np.asarray(bin_labels)
    picked = []
    for b in bin_order:
        positions = np.where(labels == b)[0]
        if len(positions) > per_bin:
            # Bins at or below the cap are kept whole; larger bins are
            # subsampled without replacement.
            positions = np.random.RandomState(seed).choice(
                positions, size=per_bin, replace=False
            )
        picked.append(positions)
    return np.concatenate(picked)


def _fit_member(features, target, seed):
    """Fit one XGBRegressor ensemble member with ``best_params`` and ``seed``."""
    member = xgb.XGBRegressor(
        objective='reg:squarederror',
        random_state=seed,
        n_jobs=1,
        **best_params
    )
    member.fit(features, target)
    return member


# ---- 5-fold cross-validation, stratified on the target bins ----------------
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
r2_scores, mae_scores = [], []

M = 5  # ensemble members trained per fold

for fold, (tr_idx, val_idx) in enumerate(cv.split(X_train, bins_train), 1):
    Xt, Xv = X_train.iloc[tr_idx], X_train.iloc[val_idx]
    yt, yv = y_train.iloc[tr_idx], y_train.iloc[val_idx]
    bins_tr = bins_train.iloc[tr_idx]

    # Every bin is capped at the size of this fold's smallest bin.
    fold_counts = pd.Series(bins_tr).value_counts()
    fold_min = fold_counts.min()

    # One column of validation predictions per ensemble member.
    val_preds = np.zeros((len(Xv), M))
    for i in range(M):
        idxs = _balanced_positions(bins_tr, fold_counts.index, fold_min, seed=100+i)
        mb = _fit_member(Xt.iloc[idxs], yt.iloc[idxs], seed=100+i)
        val_preds[:, i] = mb.predict(Xv)

    # The fold score is computed on the member-averaged prediction.
    yv_pred = val_preds.mean(axis=1)
    r2_scores.append(r2_score(yv, yv_pred))
    mae_scores.append(mean_absolute_error(yv, yv_pred))
    print(f" Fold {fold:>2}:   R²={r2_scores[-1]:.3f}, MAE={mae_scores[-1]:.1f}")

print("\nValidation performance (mean ± std):")
print(f" R²  : {np.mean(r2_scores):.3f} ± {np.std(r2_scores):.3f}")
print(f" MAE : {np.mean(mae_scores):.1f} ± {np.std(mae_scores):.1f}\n")


# ---- Final ensemble trained on the full training split ---------------------
M_final = 20
models = []
for i in range(M_final):
    idxs = _balanced_positions(
        bins_train, train_bin_counts.index, minority_bin_size, seed=200+i
    )
    models.append(_fit_member(X_train.iloc[idxs], y_train.iloc[idxs], seed=200+i))


# One column of test-set predictions per ensemble member; the ensemble
# prediction is the row-wise mean and its spread the row-wise std.
all_preds = np.column_stack([member.predict(X_test) for member in models])
y_pred, y_std = all_preds.mean(axis=1), all_preds.std(axis=1)

print(" Test performance:")
for metric_label, metric_value in (
    (" R²  :", r2_score(y_test, y_pred)),
    (" MAE :", mean_absolute_error(y_test, y_pred)),
    (" RMSE:", np.sqrt(mean_squared_error(y_test, y_pred))),
):
    print(metric_label, metric_value)



# Quick parity plot: actual vs. ensemble-mean prediction on the test split,
# with the ensemble standard deviation drawn as vertical error bars.
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(6,6))

ax.errorbar(
    y_test, y_pred,
    yerr=y_std,
    fmt='o',
    ecolor='lightgray',
    capsize=2,
    alpha=0.7
)

# Dashed y = x reference spanning the range of the actual values.
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'r--', lw=1)

ax.set_xlabel('Actual Mean Curie Temperature (K)')
ax.set_ylabel('Predicted Mean Curie Temperature (K)')
ax.set_title('Ensemble Predictions ±1σ on TEST')
fig.tight_layout()
plt.show()

### Plot of predicted vs. actual temperature

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
import matplotlib as mpl
import numpy as np
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


# Test-set metrics shown in the annotation box of the figure below.
r2, mae = r2_score(y_test, y_pred), mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))


# Global matplotlib styling for the figure below: font sizes and family,
# axis/tick line widths, and a faint dashed grid.
PLOT_RC = {
    "axes.titlesize": 20,
    "axes.labelsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.family": "sans-serif",
    "font.sans-serif": "Helvetica",
    "legend.fontsize": 13,
    "axes.linewidth": 1.2,
    "xtick.major.width": 1.1,
    "ytick.major.width": 1.1,
    "grid.alpha": 0.3,
    "grid.linestyle": "--",
}
mpl.rcParams.update(PLOT_RC)


# Parity plot of the test split, with points coloured by the ensemble spread.
fig, ax = plt.subplots(figsize=(7.1, 6))

# Colour comes straight from y_std through the 'turbo' colormap, so no
# separate normalisation or colour array is needed.
scatter = ax.scatter(
    y_test, y_pred,
    c=y_std, cmap='turbo',
    edgecolor='black', linewidth=0.25,
    s=65, alpha=0.9
)

# Faint ±1 std error bars drawn underneath the markers (zorder=0).
ax.errorbar(
    y_test, y_pred,
    yerr=y_std,
    fmt='none',
    ecolor='gray',
    alpha=0.2,
    capsize=2,
    linewidth=0.6,
    zorder=0
)

# Range of the actual values; reused for the reference line and axis limits.
x_min, x_max = y_test.min(), y_test.max()

# Dashed y = x reference line.
ax.plot(
    [x_min, x_max],
    [x_min, x_max],
    'r--', linewidth=1.5
)

# Identical limits on both axes (actual range padded by 40).
ax.set_xlim([x_min - 40, x_max + 40])
ax.set_ylim([x_min - 40, x_max + 40])
ax.set_xlabel('Actual Curie Temperature (K)', labelpad=10)
ax.set_ylabel('Predicted Curie Temperature (K)', labelpad=10)
ax.set_title('XGBoost Ensemble with Stratified Undersampling', pad=20)

# Metrics box anchored to the top-left corner in axes coordinates.
stats_text = f"R² = {r2:.2f}\nMAE = {mae:.0f} K\nRMSE = {rmse:.0f} K"
ax.text(
    0.05, 0.95, stats_text,
    transform=ax.transAxes,
    fontsize=14,
    verticalalignment='top',
    bbox=dict(facecolor='white', edgecolor='gray', boxstyle='round,pad=0.5')
)

cbar = plt.colorbar(scatter, ax=ax, pad=0.03)
cbar.set_label('Prediction Std Deviation (K)', size=14)

# Lay out and display; uncomment the savefig line to write the figure to disk.
plt.tight_layout(pad=2.5)
#plt.savefig("filepath_to_save_plot", dpi=400, bbox_inches='tight')
plt.show()

## Histogram of absolute prediction errors

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import expon


# Errors above this value are left out of the histogram and the x-axis.
cutoff = 500

# Absolute test-set errors and their mean (rounded copy for the legend).
abs_errors = np.abs(y_test - y_pred)
mae = np.mean(abs_errors)
mae_rounded = round(mae)

# Exponential fit uses ALL errors, while the histogram sample below keeps
# only those at or under the cutoff.
fit_params = expon.fit(abs_errors)
loc, scale = fit_params
filtered_errors = abs_errors[abs_errors <= cutoff]


# Density histogram of the absolute errors (up to the cutoff) with the fitted
# exponential PDF and the MAE overlaid.
plt.figure(figsize=(7, 5))

# The edges are bound to `bin_edges`, not `bins`: `bins` already holds the
# stratification labels of the target and must not be overwritten here.
counts, bin_edges, _ = plt.hist(
    filtered_errors, bins=50, density=True,
    alpha=0.85, color='royalblue', label='Histogram'
)

# Fitted exponential PDF evaluated on a grid spanning the plotted range.
x_vals = np.linspace(0, cutoff, 500)
plt.plot(
    x_vals, expon.pdf(x_vals, loc, scale),
    'k-', lw=2, label='Fitted Exponential Distribution'
)

# Vertical marker at the mean absolute error.
plt.axvline(
    mae, color='red', linestyle='--', lw=2,
    label=f'Mean Absolute Error: {mae_rounded} K'
)

plt.xlabel('Absolute Error (K)')
plt.ylabel('Density')
plt.title('Distribution of Absolute Prediction Errors')
plt.xlim(0, cutoff)
plt.legend(loc='upper right')
plt.grid(True)
plt.tight_layout()
#plt.savefig("filepath_to_save_plot", dpi=400, bbox_inches='tight')
plt.show()

## Feature importance plot

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import xgboost as xgb


# Re-apply the figure styling for this cell, one rcParams entry at a time:
# font sizes and family, axis/tick line widths, faint dashed grid.
for rc_key, rc_value in {
    "axes.titlesize": 20,
    "axes.labelsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.family": "sans-serif",
    "font.sans-serif": "Helvetica",
    "legend.fontsize": 13,
    "axes.linewidth": 1.2,
    "xtick.major.width": 1.1,
    "ytick.major.width": 1.1,
    "grid.alpha": 0.3,
    "grid.linestyle": "--",
}.items():
    mpl.rcParams[rc_key] = rc_value


# Gain-based feature importances from the XGBoost ensemble: one row per
# ensemble member, one column per entry of FEATURES.  A feature missing from
# a booster's score dict is recorded as 0.0.
gain_tables = [
    member.get_booster().get_score(importance_type='gain')
    for member in models
]
all_imps = np.vstack([
    np.array([gains.get(name, 0.0) for name in FEATURES])
    for gains in gain_tables
])

# Mean and standard deviation of each feature's gain across the members.
mean_imp, std_imp = all_imps.mean(axis=0), all_imps.std(axis=0)


# Tabulate the importances, rank by mean gain (largest first), keep the top N.
TOP_N = 20

importance_table = pd.DataFrame({
    'feature': FEATURES,
    'mean_importance': mean_imp,
    'std_importance': std_imp
})
imp_df = importance_table.sort_values('mean_importance', ascending=False)
imp_df = imp_df.reset_index(drop=True)

# Independent copy so a column can be added later without touching imp_df.
top20 = imp_df.iloc[:TOP_N].copy()


# Display names used only for the plot's tick labels; any feature without an
# entry here keeps its raw column name.
pretty_names = dict([
    ('avg_magnetic_moment',       'Avg Magnetic Moment'),
    ('Average_Weight',            'Avg Atomic Weight'),
    ('Magnetic_proportion',       'Prop. of High Curie Elements'),
    ('Average_Electronegativity', 'Avg Electronegativity'),
    ('Avg_Atomic_Number',         'Avg Atomic Number'),
    ('Entropy',                   'Avg Entropy'),
    ('Rare_Earth_proportion',     'Proportion of RE Elements'),
    ('average_group',             'Avg Group'),
    ('average_period',            'Avg Period'),
])
top20['label'] = top20['feature'].map(lambda name: pretty_names.get(name, name))


#  Plot the top-N feature importances (uniform with RF plot)

# Reverse the ranking so the most important feature ends up as the top bar
# (barh places the first entry at the bottom).
bars = top20.iloc[::-1]

plt.figure(figsize=(9, 6))
plt.barh(
    bars['label'],
    bars['mean_importance'],
    xerr=bars['std_importance'],   # ensemble std shown as horizontal error bars
    color='royalblue',
    edgecolor='black',
    capsize=3,
    zorder=3                       # bars are drawn above the grid lines
)

plt.grid(axis='x', linestyle='--', alpha=0.4, zorder=0)

plt.xlabel('Average gain importance\n(Ensemble Mean ± 1 SD)', fontsize=13)
# The count in the title is taken from the plotted rows so it cannot drift
# from the number of bars when the top-N setting changes.
plt.title(f'Top-{len(top20)} Feature Importances — XGBoost Ensemble', fontsize=15)
plt.tight_layout()

#plt.savefig("filepath", dpi=400, bbox_inches='tight')
plt.show()