In [None]:
## XGB Classifier

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.base import clone
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix
)
from sklearn.model_selection import (
    StratifiedShuffleSplit,
    StratifiedKFold,
    GridSearchCV
)
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# --- Data loading ------------------------------------------------------------
# NOTE(review): `filepath` is not defined anywhere in this cell; it must be
# bound to the CSV location before this cell is run.
CSV = filepath
df = pd.read_csv(CSV)

# Columns named after element symbols (presumably per-element composition
# values -- confirm against the dataset).
ELEMENT_COLUMNS = [
    'Lu','Co','Pm','Pt','Sb','Tl','N','F','Ni','Er','Mn','K','Be','Np','Mg','Ag',
    'Mo','Os','Gd','Pu','Cr','Ba','Tm','Cl','Br','C','Te','Hf','Ca','Y','Hg','O',
    'Ac','Ta','Th','Ce','Si','Am','Sc','Ru','Eu','Sr','Li','La','Ti','Tb','Bi',
    'Pb','Se','Ga','Zr','Au','As','P','Pd','I','Ge','Ra','Cm','W','Sm','Sn','U',
    'Bk','Pr','In','Tc','Al','H','Rh','Ho','Cs','Dy','Zn','Na','Fe','Cd','S','Nd',
    'V','Cu','Pa','Ir','Rb','B','Re','Yb','Nb',
]

# Aggregate descriptor columns.
DESCRIPTOR_COLUMNS = [
    'Avg_Atomic_Number','Average_Weight','Average_Electronegativity',
    'average_period','avg_magnetic_moment','average_group',
    'Rare_Earth_proportion','Magnetic_proportion','Entropy',
]

# Full model input, element columns first, then descriptors (order matters:
# later cells map booster feature positions back onto this list).
FEATURES = ELEMENT_COLUMNS + DESCRIPTOR_COLUMNS

# Class label column; encoding: 0=FM, 1=AFM, 2=NM.
TARGET = 'Type'

X = df.loc[:, FEATURES]
y = df.loc[:, TARGET]


# Two chained stratified hold-outs with a fixed seed.
# Step 1 reserves 20% of all rows as the test set.
sss1 = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
[(train_val_idx, test_idx)] = sss1.split(X, y)
X_train_val, y_train_val = X.iloc[train_val_idx], y.iloc[train_val_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

# Step 2 reserves 25% of the remaining 80% as the validation set, which gives
# roughly 60/20/20 train/val/test overall. The indices produced here are
# positions inside the train+val subset, not inside the full frame.
sss2 = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
[(train_idx, val_idx)] = sss2.split(X_train_val, y_train_val)
X_train, y_train = X_train_val.iloc[train_idx], y_train_val.iloc[train_idx]
X_val, y_val = X_train_val.iloc[val_idx], y_train_val.iloc[val_idx]

# Sanity check: per-class row counts of each split, in train/val/test order.
print("Class counts → Train / Val / Test:")
count_args = []
for split_labels in (y_train, y_val, y_test):
    count_args.extend([split_labels.value_counts(), "\n"])
print(*count_args)


# Model pipeline: mean-impute missing values, then a multi-class XGBoost
# classifier that outputs class probabilities.
# `use_label_encoder` is intentionally not passed: it is deprecated/removed in
# current XGBoost releases, where it only produces an "unused parameter"
# warning on every fit (and this search performs thousands of fits).
pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('xgb', XGBClassifier(
        objective='multi:softprob',
        eval_metric='mlogloss',
        random_state=42,
        # Single-threaded boosters: parallelism comes from the grid search
        # (n_jobs=-1 below) rather than from each individual model.
        n_jobs=1
    ))
])

# Exhaustive grid: 6 * 4 * 3 * 3 * 3 = 648 candidates. With 5-fold CV that is
# 3240 fits plus one final refit, so expect a long run time.
param_grid = {
    'xgb__n_estimators':     [200,400,600,800,1000,1200],
    'xgb__max_depth':        [5,10,15,20],
    'xgb__learning_rate':    [0.1, 0.05, 0.01],
    'xgb__subsample':        [0.7, 0.8, 0.9],
    'xgb__colsample_bytree': [0.6, 0.7, 0.8],
}

# Hyperparameters are selected by stratified 5-fold CV on the training split
# only; the validation and test splits are never seen by the search.
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=inner_cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
search.fit(X_train, y_train)

print("Best hyperparameters (on Train):")
print(search.best_params_)
# Mean cross-validated accuracy of the winning candidate, for comparison with
# the hold-out accuracies reported later.
print(f"Best mean CV accuracy: {search.best_score_:.3f}")

# With the default refit=True this is the winning pipeline refit on all of
# X_train.
best_model = search.best_estimator_


# Class ids and display names, in matching order (0=FM, 1=AFM, 2=NM).
CLASS_IDS = [0, 1, 2]
CLASS_NAMES = ['FM', 'AFM', 'NM']


def _report_split(split_name, y_true, y_pred):
    """Print overall accuracy and the per-class report for one data split.

    `labels` is passed explicitly so the report always has one row per class,
    matching the confusion matrices built with labels=[0, 1, 2]. Without it,
    `target_names` raises when a class is absent from a split.
    """
    print(f"{split_name} accuracy: {accuracy_score(y_true, y_pred):.3f}")
    print(classification_report(
        y_true, y_pred, labels=CLASS_IDS, target_names=CLASS_NAMES, digits=3
    ))


# Train and validation metrics come from the model fitted on X_train only.
y_train_pred = best_model.predict(X_train)
_report_split("Training", y_train, y_train_pred)

y_val_pred = best_model.predict(X_val)
_report_split("Validation", y_val, y_val_pred)

# Final model: same hyperparameters, refit on train+val, scored once on test.
# A clone is fitted so that `search.best_estimator_` is not overwritten in
# place; `best_model` is rebound so later cells still see the final model.
best_model = clone(best_model).fit(X_train_val, y_train_val)
y_test_pred = best_model.predict(X_test)
_report_split("Test", y_test, y_test_pred)


# Confusion matrices with a fixed class order so rows/columns line up with
# the tick labels below.
labels = ['FM','AFM','NM']
cm_val  = confusion_matrix(y_val,  y_val_pred,  labels=[0,1,2])
cm_test = confusion_matrix(y_test, y_test_pred, labels=[0,1,2])

# sns.set() resets font sizes, font family, axes line width and grid line
# style to seaborn's own defaults. Updating mpl.rcParams *before* calling it
# therefore had almost no effect. Passing the overrides through `rc` makes
# seaborn apply them last, on top of the "whitegrid" style.
sns.set(
    style="whitegrid",
    rc={
        "axes.titlesize":   20,
        "axes.labelsize":   18,
        "xtick.labelsize":  16,
        "ytick.labelsize":  16,
        "font.family":      "sans-serif",
        # Fallbacks avoid missing-font warnings where Helvetica is not installed.
        "font.sans-serif":  ["Helvetica", "Arial", "DejaVu Sans"],
        "legend.fontsize":  13,
        "axes.linewidth":   1.2,
        "grid.alpha":       0.3,
        "grid.linestyle":   "--"
    }
)

# Side-by-side confusion matrices (validation | test) sharing the y axis.
fig, axes = plt.subplots(1, 2, figsize=(14, 6), sharey=True)

# One entry per panel: (title, confusion matrix, accuracy shown in the title).
panels = [
    ("Validation", cm_val,  accuracy_score(y_val,  y_val_pred)),
    ("Test",       cm_test, accuracy_score(y_test, y_test_pred)),
]

# Heatmap styling common to both panels.
heatmap_style = dict(
    annot=True, fmt='d',
    cmap='Blues', cbar=False,
    xticklabels=labels, yticklabels=labels,
    linewidths=0.5, linecolor='gray',
    annot_kws={"size":14,"weight":"bold"},
)

for ax, (split_name, matrix, split_acc) in zip(axes, panels):
    sns.heatmap(matrix, ax=ax, **heatmap_style)
    ax.set_xlabel("Predicted", fontsize=16)
    ax.set_ylabel("Actual",    fontsize=16)
    ax.set_title(f"{split_name}\nAccuracy = {split_acc:.3f}", pad=12)
    ax.set_facecolor("#F7F7F7")

plt.tight_layout(pad=2)
plt.show()

In [None]:
## Confusion Matrix 

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrices with a fixed class order so rows/columns line up with
# the tick labels.
cm_val  = confusion_matrix(y_val,  y_val_pred,  labels=[0,1,2])
cm_test = confusion_matrix(y_test, y_test_pred, labels=[0,1,2])
labels  = ['FM', 'AFM', 'NM']

# sns.set() resets font sizes, font family, axes line width and grid line
# style to seaborn's own defaults. Updating mpl.rcParams *before* calling it
# therefore had almost no effect. Passing the overrides through `rc` makes
# seaborn apply them last, on top of the "whitegrid" style.
sns.set(
    style="whitegrid",
    rc={
        "axes.titlesize":   20,
        "axes.labelsize":   18,
        "xtick.labelsize":  16,
        "ytick.labelsize":  16,
        "font.family":      "sans-serif",
        # Fallbacks avoid missing-font warnings where Helvetica is not installed.
        "font.sans-serif":  ["Helvetica", "Arial", "DejaVu Sans"],
        "legend.fontsize":  13,
        "axes.linewidth":   1.2,
        "grid.alpha":       0.3,
        "grid.linestyle":   "--"
    }
)

def _show_confusion_heatmap(matrix, split_title, accuracy, cmap):
    """Draw and show one annotated confusion-matrix heatmap.

    Parameters
    ----------
    matrix : confusion matrix to plot (rows = actual, columns = predicted).
    split_title : first line of the figure title, e.g. 'Validation Set'.
    accuracy : accuracy value printed (two decimals) in the title.
    cmap : matplotlib colormap name used for the cells.

    Returns
    -------
    (fig, ax) of the figure that was shown.
    """
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(
        matrix,
        annot=True, fmt='d',
        cmap=cmap,
        cbar=True,
        linewidths=0.5,
        linecolor='gray',
        xticklabels=labels,
        yticklabels=labels,
        annot_kws={"size":14, "weight":"bold"},
        ax=ax
    )
    ax.set_xlabel('Predicted', labelpad=10)
    ax.set_ylabel('Actual', labelpad=10)
    ax.set_title(f'{split_title}\nAccuracy = {accuracy:.2f}', pad=12)
    # Hide the axes frame so only the heatmap cells are drawn.
    for spine in ax.spines.values():
        spine.set_visible(False)
    plt.tight_layout(pad=2)
    plt.show()
    return fig, ax


# The two figures differ only in data, title and colormap.
fig, ax = _show_confusion_heatmap(
    cm_val, 'Validation Set', accuracy_score(y_val, y_val_pred), cmap='Blues'
)
fig, ax = _show_confusion_heatmap(
    cm_test, 'Test Set', accuracy_score(y_test, y_test_pred), cmap='Purples'
)

In [None]:
## Feature Importance Plot

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

# Matplotlib defaults for the feature-importance figure, grouped by concern
# and applied in a single update.
_text_rc = {
    "axes.titlesize":    20,
    "axes.labelsize":    18,
    "xtick.labelsize":   14,
    "ytick.labelsize":   14,
    "font.family":       "sans-serif",
    "font.sans-serif":   "Helvetica",
    "legend.fontsize":   13,
}
_frame_rc = {
    "axes.linewidth":    1.2,
    "xtick.major.width": 1.1,
    "ytick.major.width": 1.1,
}
_grid_rc = {
    "grid.linestyle":    "--",
    "grid.alpha":        0.3,
}
mpl.rcParams.update({**_text_rc, **_frame_rc, **_grid_rc})

# Pull the fitted XGBoost step out of the pipeline and read per-feature gain.
xgb_model = best_model.named_steps['xgb']
booster = xgb_model.get_booster()

# The scores are mapped back onto FEATURES by position. That is only valid if
# the booster saw exactly one column per entry of FEATURES; if an upstream
# step dropped a column, every later feature would be silently mislabeled.
n_model_features = booster.num_features()
if n_model_features != len(FEATURES):
    raise ValueError(
        f"Booster was trained on {n_model_features} features but FEATURES has "
        f"{len(FEATURES)} entries; importances cannot be aligned by position."
    )

# get_score() only returns features that were used in at least one split.
# Its keys are the booster's own feature names when it has them, otherwise
# the positional placeholders 'f0', 'f1', ...
score_dict = booster.get_score(importance_type='gain')
booster_names = booster.feature_names or [f"f{idx}" for idx in range(len(FEATURES))]
importances = np.array(
    [score_dict.get(name, 0.0) for name in booster_names], dtype=float
)

# One row per feature, highest gain first.
imp_df = pd.DataFrame({
    'feature': FEATURES,
    'importance': importances
}).sort_values('importance', ascending=False).reset_index(drop=True)

TOP_N = 20
top20 = imp_df.head(TOP_N).copy()

# Display names for the aggregate descriptor columns; any feature not listed
# here keeps its raw column name on the axis.
pretty_names = {
    'avg_magnetic_moment': 'Avg Magnetic Moment',
    'Average_Weight': 'Avg Atomic Weight',
    'Magnetic_proportion': 'Prop. of High Curie Elements',
    'Average_Electronegativity': 'Avg Electronegativity',
    'Avg_Atomic_Number': 'Avg Atomic Number',
    'Entropy': 'Avg Entropy',
    'Rare_Earth_proportion': 'Prop. of Rare‐Earth Elements',
    'average_group': 'Avg Group',
    'average_period': 'Avg Period',
}
top20['label'] = [pretty_names.get(name, name) for name in top20['feature']]

# barh draws the first row at the bottom, so reverse the ranking to put the
# most important feature at the top of the chart.
bars = top20.iloc[::-1]

imp_fig, imp_ax = plt.subplots(figsize=(6, 5))
imp_ax.grid(axis='x', linestyle='--', alpha=0.4, zorder=0)

# zorder=3 keeps the bars above the grid lines (zorder=0).
imp_ax.barh(
    bars['label'],
    bars['importance'],
    color='royalblue',
    edgecolor='black',
    zorder=3
)

imp_ax.set_xlabel('Average Gain Importance', fontsize=13)
imp_ax.set_title('Top-20 Feature Importances — XGBClassifier', fontsize=15)
plt.tight_layout()
plt.show()