In [2]:
import pandas as pd
from pymatgen.core import Composition, Element
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

# Load the hydrogen database CSV into `data`; the path is relative to the
# kernel's current working directory.
data = pd.read_csv("data/hydrogen_database.csv")

# Function to parse composition formula and create a weighted feature vector for each property
def featurize_composition(formula):
    """Build a feature dict for one composition formula.

    Each element is weighted by its share of the composition's mass
    (atomic fraction * atomic mass, normalised over all elements). The
    weighted sum of every elemental property is stored under
    ``'<property>_weighted'``; the atomic fraction of each element is stored
    under the element symbol.

    Parameters
    ----------
    formula : str
        Formula string passed to ``Composition``.

    Returns
    -------
    dict
        ``atomic_mass_weighted``, ``atomic_radius_weighted`` and
        ``melting_point_weighted`` followed by one ``symbol -> atomic
        fraction`` entry per element. A weighted property is NaN when any
        element lacks a value for it.
    """
    comp = Composition(formula)
    property_names = ('atomic_mass', 'atomic_radius', 'melting_point')

    # Look up each atomic fraction once; it is needed for both the
    # normalisation constant and the per-element weight.
    fractions = [(element, comp.get_atomic_fraction(element)) for element in comp.elements]
    total_weight = sum(fraction * float(element.atomic_mass) for element, fraction in fractions)

    weighted_sums = {prop: 0.0 for prop in property_names}
    element_stoichiometry = {}

    for element, fraction in fractions:
        element_stoichiometry[element.symbol] = fraction
        weight = fraction * float(element.atomic_mass) / total_weight
        for prop in property_names:
            value = getattr(element, prop, None)
            # A property can exist on the element but hold None (no tabulated
            # value); None * weight would raise TypeError, so treat it as a
            # missing value and let NaN propagate into the weighted sum.
            value = np.nan if value is None else float(value)
            weighted_sums[prop] += value * weight

    feature_vector = {f'{prop}_weighted': weighted_sums[prop] for prop in property_names}
    feature_vector.update(element_stoichiometry)

    return feature_vector

# Featurize every formula. The feature frame is built on data's own index so
# the column-wise concat below lines rows up one-to-one.
features_df = pd.DataFrame(
    data['Composition_Formula'].apply(featurize_composition).tolist(),
    index=data.index,
)
data = pd.concat([data, features_df], axis=1)

keep_columns = [
    'Material_Class', 'Composition_Formula', 'Hydrogen_Weight_Percent',
    'Heat_of_Formation_kJperMolH2', 'Temperature_oC', 'Pressure_Atmospheres_Absolute',
    'Entropy_of_Formation_JperMolH2perK', 'Equilibrium_Pressure_25C', 'LnEquilibrium_Pressure_25C',
    'HtoM', 'Reference', 'atomic_mass_weighted', 'atomic_radius_weighted', 'melting_point_weighted'
]

# Every column outside keep_columns (and not already a principal component)
# is treated as a per-element atomic-fraction column.
# NOTE(review): this also sweeps up any extra column present in the CSV that
# is not listed in keep_columns — confirm the CSV has none.
elemental_columns = [col for col in data.columns if col not in keep_columns and not col.startswith('PC')]

# A formula that does not contain an element has no entry for it, which
# surfaces as NaN after the frame is built; fill those with 0.
data[elemental_columns] = data[elemental_columns].fillna(0)

# Standardize the elemental columns for PCA
scaler = StandardScaler()
elemental_features_scaled = scaler.fit_transform(data[elemental_columns])

# Apply PCA. The component count is defined once so the PCA model and the
# generated column names cannot drift apart.
N_PRINCIPAL_COMPONENTS = 3  # TODO: justify the number of principal components
PCA_RANDOM_STATE = 0  # fixed so results are reproducible if a randomized SVD solver is selected
pca = PCA(n_components=N_PRINCIPAL_COMPONENTS, random_state=PCA_RANDOM_STATE)
principal_components = pca.fit_transform(elemental_features_scaled)
principal_df = pd.DataFrame(
    data=principal_components,
    columns=[f'PC{i+1}' for i in range(N_PRINCIPAL_COMPONENTS)],
    index=data.index,
)

data = pd.concat([data, principal_df], axis=1)

# The elemental columns are now summarised by the PCs; 'Reference' is dropped
# from the output as well.
data = data.drop(columns=elemental_columns + ['Reference'])

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# Impute remaining missing values using KNN
N_NEIGHBORS = 5  # TODO: justify the choice of k

numeric_columns = data.select_dtypes(include=[np.number]).columns
# KNNImputer omits columns that are entirely NaN from its output by default,
# which would make the returned array narrower than the column selection and
# break the assignment below. Impute only columns with at least one observed
# value; all-NaN columns are left untouched.
imputable_columns = [col for col in numeric_columns if data[col].notna().any()]

knn_imputer = KNNImputer(n_neighbors=N_NEIGHBORS)
data_imputed = knn_imputer.fit_transform(data[imputable_columns])
data[imputable_columns] = data_imputed

data.to_csv("PCA_KNN_imputed_hea.csv", index=False)