# Feature Generation from Chemical Composition and Atomic Properties

In [None]:
# ------------------------------------------------------------------
# Feature-engineering pipeline for magnetic-materials ML – notebook version
# ------------------------------------------------------------------

import re, numpy as np, pandas as pd
from pathlib import Path
from pymatgen.core import Composition


# Element symbols in order of increasing atomic number (H … Lr),
# laid out one period per line.
ELEMENTS = [
    # period 1
    'H', 'He',
    # period 2
    'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne',
    # period 3
    'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar',
    # period 4
    'K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr',
    # period 5
    'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe',
    # period 6 (lanthanides included)
    'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu',
    'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Po', 'At', 'Rn',
    # period 7 (up to Lr)
    'Fr', 'Ra', 'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr',
]

# symbol -> atomic number, i.e. the 1-based position of the symbol in ELEMENTS.
ATOMIC_NUM = {symbol: position + 1 for position, symbol in enumerate(ELEMENTS)}


# symbol -> period (row of the periodic table).
# Periods 1-6 are spelled out explicitly; the tuple position (starting at 1)
# is the period number.
PERIOD = {
    symbol: period
    for period, row in enumerate(
        (
            ['H', 'He'],
            ['Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne'],
            ['Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar'],
            ['K', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr'],
            ['Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe'],
            ['Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu',
             'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Po', 'At', 'Rn'],
        ),
        start=1,
    )
    for symbol in row
}
# Every symbol of ELEMENTS that has not been assigned above falls into period 7.
PERIOD.update({symbol: 7 for symbol in ELEMENTS if symbol not in PERIOD})


from pymatgen.core.periodic_table import Element as _El

# symbol -> periodic-table group as reported by pymatgen's Element.group;
# a falsy group (None or 0) is stored as 0.
GROUP = dict((symbol, _El(symbol).group or 0) for symbol in ELEMENTS)

# symbol -> per-element value averaged into the "avg_magnetic_moment" feature.
# NOTE(review): the numbers look like unpaired-electron counts of the free
# atom; the source of the table is not visible here — confirm.
# Every symbol of ELEMENTS defaults to 0.0; the entries below override it.
MAG_MOMENT = {
    **dict.fromkeys(ELEMENTS, 0.0),
    # value 1
    'H': 1, 'Li': 1, 'B': 1, 'N': 1, 'F': 1, 'Na': 1, 'Al': 1, 'P': 1, 'Cl': 1, 'K': 1,
    'Sc': 1, 'Cu': 1, 'Ga': 1, 'As': 1, 'Br': 1, 'Rb': 1, 'Y': 1, 'Ag': 1, 'In': 1, 'Sb': 1,
    'I': 1, 'Cs': 1, 'La': 1, 'Lu': 1, 'Tl': 1, 'Bi': 1, 'Fr': 1, 'Ac': 1,
    # value 2
    'O': 2, 'S': 2, 'Se': 2, 'Te': 2, 'Er': 2, 'Ni': 2, 'Pd': 2,
    # value 3
    'V': 3, 'Co': 3, 'Rh': 3, 'Ir': 3, 'Pa': 3, 'Eu': 3, 'Bk': 3, 'Ho': 3,
    # value 4
    'Cr': 4, 'Fe': 4, 'Mo': 4, 'Ru': 4, 'W': 4, 'Os': 4, 'U': 4, 'Gd': 4, 'Dy': 4, 'Pu': 4,
    # value 5
    'Mn': 5, 'Tc': 5, 'Re': 5, 'Pm': 5, 'Tb': 5, 'Np': 5,
    # explicitly zero
    'Yb': 0,
}

# Element columns summed into the "Magnetic_proportion" feature.
MAG_ELEMENTS = [
    'Fe', 'Co', 'Ni',
    'Gd', 'Dy',
]

# Element columns summed into the "Rare_Earth_proportion" feature.
# NOTE(review): Pm is not in this list — confirm the omission is intended.
RARE_EARTHS = [
    "Sc", "Y",
    "La", "Ce", "Pr", "Nd", "Sm", "Eu", "Gd", "Tb",
    "Dy", "Ho", "Er", "Tm", "Yb", "Lu",
]


# One token = a symbol (capital letter plus any lowercase letters) followed by
# an optional amount written as "2", "2.", "0.25" or ".25".  The amount is
# spelled as two alternatives so that a lone "." is never captured as a number
# (float(".") would raise).
FORMULA_RE = re.compile(r'([A-Z][a-z]*)(\d+\.?\d*|\.\d+)?')


def parse_formula(formula: str):
    """Split a flat chemical formula into ``{symbol: amount}``.

    ``formula`` is converted with ``str()`` first, so non-string values are
    accepted.  A symbol without an explicit amount counts as 1.0, and a
    symbol that occurs several times has its amounts added up
    (``"CH3CH2OH"`` -> C=2, H=6, O=1).  Symbols are not checked against any
    element table, and parentheses, charges or hydrates are not interpreted.

    Returns a dict of floats keyed by symbol, in order of first appearance;
    the dict is empty when no symbol is found.
    """
    counts = {}
    for el, num in FORMULA_RE.findall(str(formula)):
        # findall yields '' for a missing amount -> implicit amount of one.
        counts[el] = counts.get(el, 0.0) + (float(num) if num else 1.0)
    return counts

def entropy(vec: pd.Series):
    """Return ``-sum(p * ln(p))`` over the strictly positive entries of ``vec``.

    Entries that are zero or negative are left out of the sum.  When no
    positive entry remains the result is 0.0.  The value is a plain float.
    """
    positive = vec[vec > 0.0].values
    if not positive.size:
        return 0.0
    return float(-(positive * np.log(positive)).sum())


def engineer_features(df: pd.DataFrame, formula_col: str = "Normalized_Composition") -> pd.DataFrame:
    """Add composition-derived feature columns to ``df`` and return it.

    ``df`` is modified in place and the same object is returned; pass a copy
    if the original frame must stay untouched.

    For every row, the formula found in ``formula_col`` is parsed with
    ``parse_formula`` and the following columns are written:

    * one column per symbol in ``ELEMENTS`` holding the atomic fraction
      (amount divided by the row total, rounded to 5 decimals; 0 when the
      row total is zero).  Element columns that already exist are reused:
      only the symbols present in the formula are overwritten.
    * ``Average_Weight``, ``Average_Electronegativity``, ``Total_Electrons``
      and ``Num_Atoms`` read from pymatgen's ``Composition`` (NaN for a
      formula pymatgen cannot interpret).
    * ``Avg_Atomic_Number``, ``average_period``, ``avg_magnetic_moment`` and
      ``average_group``: fraction-weighted sums of the module-level lookup
      tables.
    * ``Entropy`` (see ``entropy``), ``Magnetic_proportion``,
      ``Rare_Earth_proportion`` and ``L2_norm`` of the fraction vector.

    Symbols that are not listed in ``ELEMENTS`` are ignored; they neither
    create columns nor enter the normalisation.  Rows are addressed by
    position, so a non-unique index is handled correctly.

    Parameters
    ----------
    df : DataFrame containing ``formula_col``.
    formula_col : name of the column holding the formula strings.

    Returns
    -------
    The same ``df`` object with the feature columns added.
    """
    # --- initialise element columns (counts) ---
    for el in ELEMENTS:
        if el not in df.columns:
            df[el] = 0.0

    # --- parse formulas into counts ---
    # Work on a positional array instead of label-based scalar writes:
    # it does not depend on the index being unique and cannot create
    # new columns for unknown symbols.
    column_of = {el: pos for pos, el in enumerate(ELEMENTS)}
    counts = df[ELEMENTS].to_numpy(dtype=float, copy=True)
    for row, formula in enumerate(df[formula_col].astype(str)):
        for el, n in parse_formula(formula).items():
            pos = column_of.get(el)
            if pos is not None:
                counts[row, pos] = n
    df[ELEMENTS] = counts

    # --- normalise counts to atomic fractions ---
    # A zero total is turned into NaN so the division yields NaN, which is
    # then mapped back to 0 instead of producing inf.
    totals = df[ELEMENTS].sum(axis=1).replace(0, np.nan)
    df[ELEMENTS] = df[ELEMENTS].div(totals, axis=0).fillna(0).round(5)
    frac = df[ELEMENTS]

    # --- pymatgen composition properties ---
    def p_props(f):
        # Deliberately best-effort: any formula pymatgen rejects yields NaNs
        # for this row rather than aborting the whole run.
        try:
            c = Composition(f)
            return (c.weight, c.average_electroneg, c.total_electrons, c.num_atoms)
        except Exception:
            return (np.nan,)*4

    # NOTE(review): ``Composition.weight`` presumably is the weight of the whole
    # formula, so "Average_Weight" is a per-atom average only if the formulas
    # are already normalised to one atom — confirm.
    prop_cols = ['Average_Weight','Average_Electronegativity','Total_Electrons','Num_Atoms']
    props = df[formula_col].apply(p_props)
    # Naming the columns keeps the frame four columns wide even when there
    # are no rows; otherwise the assignment below fails on an empty input.
    df[prop_cols] = pd.DataFrame(props.tolist(), index=df.index, columns=prop_cols)

    # --- averaged periodic/atomic descriptors ---
    # Multiplying by a Series aligns on the element column labels.
    df["Avg_Atomic_Number"]   = (frac * pd.Series(ATOMIC_NUM)).sum(axis=1).round(5)
    df["average_period"]      = (frac * pd.Series(PERIOD)).sum(axis=1).round(5)
    df["avg_magnetic_moment"] = (frac * pd.Series(MAG_MOMENT)).sum(axis=1).round(5)
    df["average_group"]       = (frac * pd.Series(GROUP)).sum(axis=1).round(5)

    # --- entropy of mixing ---
    df["Entropy"] = frac.apply(entropy, axis=1).round(5)

    # --- aggregated proportions ---
    df["Magnetic_proportion"]   = df[MAG_ELEMENTS].sum(axis=1).round(5)
    df["Rare_Earth_proportion"] = df[RARE_EARTHS].sum(axis=1).round(5)

    # --- L2 norm of composition vector ---
    df["L2_norm"] = np.sqrt((frac**2).sum(axis=1)).round(5)

    return df


# --- run the pipeline: CSV in -> engineered features -> CSV out ---
# Both paths are empty placeholders and must be filled in before running.
input_path  = ""
output_path = ""
formula_column_name = "Normalized_Composition"   # column of the input CSV that holds the formula strings

df_raw  = pd.read_csv(input_path)
# engineer_features writes into the frame it is given, so it gets a copy
# and df_raw keeps the data exactly as read from disk.
df_feat = engineer_features(df_raw.copy(), formula_col=formula_column_name)
# Make sure the destination folder exists before writing the result.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
df_feat.to_csv(output_path, index=False)

print(f"Done!   Final shape: {df_feat.shape}  →  {output_path}")
# Trailing expression: presumably shown as the cell output in the notebook.
df_feat.head()