In [1]:
import joblib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import pickle

In [2]:
# Every path used later in this notebook is built by concatenating onto `pwd`,
# so results depend on the directory the kernel was started in.
pwd = os.getcwd()
print(pwd)

e:\research\HEA_phase_classification\weighted_features\classification\new_alloy_modified_ranges\model_training


In [3]:
# Load the saved scaler and encoder (paths are relative to `pwd`)
loaded_scaler = joblib.load(pwd + '/scaler/MinMaxScaler.pkl')
loaded_encoder = joblib.load(pwd + '/scaler/LabelEncoder.pkl')

# Load the saved model
# SECURITY: unpickling can execute arbitrary code -- only load artifact files
# that come from a trusted source.
with open(pwd + '/saved_models/RandomForestClassifier_optimized.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [4]:
# Read the new-alloy weighted-feature table and report its (rows, columns) shape.
data = pd.read_excel(pwd + "/new_alloy_data/new_alloy_weighted_features.xlsx")
print(data.shape)

(5850, 32)


In [5]:
# Feature matrix: every column after the first two of `data` (positional slice).
# A later cell adds `predicted_phases` to `data` in place, so drop it here if it is
# present; otherwise re-running this cell would pass an extra, non-numeric column
# to the scaler. On a fresh kernel the result is unchanged.
features = data.iloc[:, 2:].drop(columns=["predicted_phases"], errors="ignore").copy()
feature_names = features.columns
feature_names

Index(['weighted_volume_1', 'weighted_density_1', 'weighted_form_energy_pa_1',
       'weighted_volume_2', 'weighted_density_2', 'weighted_form_energy_pa_2',
       'weighted_volume_3', 'weighted_density_3', 'weighted_form_energy_pa_3',
       'weighted_volume_4', 'weighted_density_4', 'weighted_form_energy_pa_4',
       'weighted_volume_5', 'weighted_density_5', 'weighted_form_energy_pa_5',
       'weighted_volume_6', 'weighted_density_6', 'weighted_form_energy_pa_6',
       'weighted_volume_7', 'weighted_density_7', 'weighted_form_energy_pa_7',
       'weighted_volume_8', 'weighted_density_8', 'weighted_form_energy_pa_8',
       'weighted_volume_9', 'weighted_density_9', 'weighted_form_energy_pa_9',
       'weighted_volume_10', 'weighted_density_10',
       'weighted_form_energy_pa_10'],
      dtype='object')

In [6]:
# Apply the already-fitted scaler (transform only, no refit) and re-wrap the result
# in a DataFrame so the columns keep their feature names.
X = pd.DataFrame(loaded_scaler.transform(features), columns=feature_names)

In [7]:
# Predict with the loaded model; the raw predictions are decoded to labels with
# loaded_encoder.inverse_transform in the following cells.
predicted_phases = loaded_model.predict(X)

In [8]:
# Preview the predictions mapped back through the label encoder (display only).
loaded_encoder.inverse_transform(predicted_phases)

array(['M', 'M', 'S', ..., 'M', 'M', 'M'], dtype=object)

In [9]:
# Attach the decoded predictions to `data` as a new column.
# NOTE: this mutates `data` in place, so cells that slice `data` by position
# will see one extra column if they are re-run after this one.
data["predicted_phases"] = loaded_encoder.inverse_transform(predicted_phases)
data.head()

Unnamed: 0,HEA_formula,Elements,weighted_volume_1,weighted_density_1,weighted_form_energy_pa_1,weighted_volume_2,weighted_density_2,weighted_form_energy_pa_2,weighted_volume_3,weighted_density_3,...,weighted_volume_8,weighted_density_8,weighted_form_energy_pa_8,weighted_volume_9,weighted_density_9,weighted_form_energy_pa_9,weighted_volume_10,weighted_density_10,weighted_form_energy_pa_10,predicted_phases
0,V0.75Zr0.25Cr0.25W0.25Hf,"['V', 'Zr', 'Cr', 'W', 'Hf']",4.370456,0.664049,-0.006915,1.114218,0.136783,-0.001459,3.502768,0.428528,...,11.61946,1.153963,0.004174,2.959555,0.195026,0.001466,0.282639,0.138557,0.001212,M
1,V0.86Zr0.29Cr0.29W0.43Hf,"['V', 'Zr', 'Cr', 'W', 'Hf']",5.703893,0.866652,-0.009024,1.686836,0.207078,-0.002208,3.083085,0.377184,...,10.109725,1.004027,0.003632,2.987018,0.196836,0.001479,0.427892,0.209764,0.001835,M
2,VZr0.33Cr0.33W0.67Hf,"['V', 'Zr', 'Cr', 'W', 'Hf']",6.601659,1.003059,-0.010445,2.221623,0.272729,-0.002908,2.606014,0.318819,...,8.73205,0.867206,0.003137,2.935828,0.193463,0.001454,0.563549,0.276266,0.002416,S
3,VZr0.33Cr0.33W0.83Hf0.83,"['V', 'Zr', 'Cr', 'W', 'Hf']",6.828837,1.037577,-0.010804,2.768764,0.339896,-0.003625,2.176042,0.266216,...,7.291328,0.724123,0.002619,2.95354,0.19463,0.001463,0.70234,0.344305,0.003012,S
4,VZr0.33Cr0.33WHf0.67,"['V', 'Zr', 'Cr', 'W', 'Hf']",6.601659,1.003059,-0.010445,3.315855,0.407058,-0.004341,1.74603,0.213609,...,5.850474,0.581028,0.002102,2.935828,0.193463,0.001454,0.841118,0.412338,0.003607,S


In [10]:
# Distinct predicted labels and how many rows received each one.
np.unique(data["predicted_phases"], return_counts=True)

(array(['M', 'S'], dtype=object), array([3832, 2018], dtype=int64))

In [11]:
# Keep only the alloy formula and its predicted phase
subset_df = data[['HEA_formula', 'predicted_phases']]

# Preview the first rows of the subset
subset_df.head()

Unnamed: 0,HEA_formula,predicted_phases
0,V0.75Zr0.25Cr0.25W0.25Hf,M
1,V0.86Zr0.29Cr0.29W0.43Hf,M
2,VZr0.33Cr0.33W0.67Hf,S
3,VZr0.33Cr0.33W0.83Hf0.83,S
4,VZr0.33Cr0.33WHf0.67,S


In [12]:
# Work on an independent copy so the column added below does not touch subset_df.
df = subset_df.copy()

In [13]:
import re

def get_base_element(hea_string):
    """Return the element with the largest fraction in an alloy formula string.

    The formula is read as a sequence of element symbols (one capital letter
    followed by optional lowercase letters), each optionally followed by a
    number. A symbol with no number counts as fraction 1.0, e.g. ``Hf`` in
    ``V0.75Zr0.25Hf``.

    Parameters
    ----------
    hea_string : str
        Formula such as ``"VZr0.33Cr0.33W0.67Hf"``.

    Returns
    -------
    str
        Symbol of the highest-fraction element. When several elements share
        the highest fraction, the one appearing first in the formula wins.

    Raises
    ------
    ValueError
        If no element symbol can be found in ``hea_string`` (e.g. empty string).
    """
    # Each match is (symbol, number-text); number-text is '' when omitted.
    tokens = re.findall(r'([A-Z][a-z]*)(\d*\.\d+|\d*)', hea_string)
    if not tokens:
        # Fail with a message that names the offending input instead of the
        # opaque "max() arg is an empty sequence".
        raise ValueError(f"No element symbols found in HEA formula: {hea_string!r}")

    # If no fraction is given, assume it to be 1 (e.g., V in VZr...)
    fractions = [(symbol, float(amount) if amount else 1.0) for symbol, amount in tokens]

    # max() returns the first maximal item, so ties resolve to the element
    # that appears earliest in the formula.
    return max(fractions, key=lambda pair: pair[1])[0]

# Tag each alloy with its base element: the highest-fraction element of its formula
# (first one listed when fractions tie).
df['Base_element'] = df['HEA_formula'].apply(get_base_element)

df.head()

Unnamed: 0,HEA_formula,predicted_phases,Base_element
0,V0.75Zr0.25Cr0.25W0.25Hf,M,Hf
1,V0.86Zr0.29Cr0.29W0.43Hf,M,Hf
2,VZr0.33Cr0.33W0.67Hf,S,V
3,VZr0.33Cr0.33W0.83Hf0.83,S,V
4,VZr0.33Cr0.33WHf0.67,S,V


In [14]:
# Distinct base elements and the number of alloys assigned to each.
np.unique(df["Base_element"], return_counts=True)

(array(['Al', 'Co', 'Cr', 'Hf', 'Mo', 'Nb', 'Ta', 'Ti', 'V', 'W', 'Zr'],
       dtype=object),
 array([755, 236, 236, 236, 755, 755, 755, 755, 895, 236, 236], dtype=int64))

In [15]:
# Keep only alloys whose base element is one of the listed elements.
base_elements = ['V', 'Mo', 'Ti', 'Nb', 'Ta', 'Al']

keep_mask = df["Base_element"].isin(base_elements)
df = df.loc[keep_mask]

In [16]:
# Re-count base elements after filtering to confirm only the selected ones remain.
np.unique(df["Base_element"], return_counts=True)

(array(['Al', 'Mo', 'Nb', 'Ta', 'Ti', 'V'], dtype=object),
 array([755, 755, 755, 755, 755, 895], dtype=int64))

In [17]:
# Write the filtered table (formula, predicted phase, base element) to Excel
# under `pwd`, omitting the row index.
df.to_excel(pwd + "/new_alloys_and_predicted_phases.xlsx", index=False)