# Script for data preprocessing of Materials Project datasets 

In [2]:
import pandas as pd 
import numpy as np

In [3]:
df = pd.read_csv('materials_data_featurized.csv')

In [6]:
df.head()


Unnamed: 0,nsites,composition,formula_pretty,volume,density,symmetry,material_id,structure,formation_energy_per_atom,energy_above_hull,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,10,O2,O2,159.269552,1.668092,"{'crystal_system': 'Triclinic', 'symbol': 'P1'...",mp-1180064,"{'@module': 'pymatgen.core.structure', '@class...",0.387014,0.387014,...,0.0,0.0,0.0,0.0,12.0,12.0,0.0,12.0,0.0,12.0
1,100,C1,C,1190.353192,1.675489,"{'crystal_system': 'Triclinic', 'symbol': 'P1'...",mp-1244913,"{'@module': 'pymatgen.core.structure', '@class...",0.994253,0.994253,...,0.0,0.0,0.0,0.0,194.0,194.0,0.0,194.0,0.0,194.0
2,100,Ti1,Ti,1744.318224,4.556796,"{'crystal_system': 'Triclinic', 'symbol': 'P1'...",mp-1245006,"{'@module': 'pymatgen.core.structure', '@class...",0.141384,0.141384,...,0.0,2.3e-05,0.0,2.3e-05,194.0,194.0,0.0,194.0,0.0,194.0
3,100,Si1,Si,1793.764938,2.599954,"{'crystal_system': 'Triclinic', 'symbol': 'P1'...",mp-1244933,"{'@module': 'pymatgen.core.structure', '@class...",0.349291,0.349291,...,0.0,0.0,0.0,0.0,227.0,227.0,0.0,227.0,0.0,227.0
4,12,Nb1,Nb,226.839615,8.161256,"{'crystal_system': 'Triclinic', 'symbol': 'P1'...",mp-1094120,"{'@module': 'pymatgen.core.structure', '@class...",0.189748,0.189748,...,0.0,0.0,0.0,0.0,229.0,229.0,0.0,229.0,0.0,229.0


In [7]:
print(len(df))

99997


### get rid of bandgaps smaller than 0.2

In [8]:
df = df[(df['band_gap'] > 0.2)]
y = df['band_gap']


In [9]:
print(len(y))

35966


### Drop unwanted columns

In [10]:
df.drop(columns=["shear_modulus", "bulk_modulus"], inplace=True, errors='ignore')

### Drop duplicated of materials 

In [12]:
#If there are two same elements in the composition, drop one with higher energy above hull

def drop_duplicate_elements(df):
    # Group by 'composition' and keep the one with the lowest 'energy_above_hull'
    df = df.sort_values(by="energy_above_hull")
    df = df.drop_duplicates(subset=["composition"], keep='first')
    return df

df = drop_duplicate_elements(df)
df.head()

Unnamed: 0,nsites,composition,formula_pretty,volume,density,symmetry,material_id,structure,formation_energy_per_atom,energy_above_hull,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
55190,26,Ca2 Se3 O8,Ca2Se3O8,391.285151,3.777254,"{'crystal_system': 'Triclinic', 'symbol': 'P-1...",mp-28535,"{'@module': 'pymatgen.core.structure', '@class...",-2.072683,0.0,...,0.0,0.0,0.0,0.0,12.0,225.0,213.0,45.230769,55.313609,12.0
71264,44,Nd2 Ti2 O7,Nd2Ti2O7,545.646768,6.040409,"{'crystal_system': 'Monoclinic', 'symbol': 'P2...",mp-12193,"{'@module': 'pymatgen.core.structure', '@class...",-3.730972,0.0,...,2.3e-05,4e-06,7e-06,0.0,12.0,194.0,182.0,78.181818,84.231405,12.0
79350,20,Cs1 Eu1 Br3,CsEuBr3,808.699987,4.308584,"{'crystal_system': 'Orthorhombic', 'symbol': '...",mp-638685,"{'@module': 'pymatgen.core.structure', '@class...",-2.291878,0.0,...,0.0,0.0,0.0,0.0,64.0,229.0,165.0,130.0,79.2,64.0
79349,16,Ga1 H1 O2,GaHO2,131.266086,5.198197,"{'crystal_system': 'Orthorhombic', 'symbol': '...",mp-634326,"{'@module': 'pymatgen.core.structure', '@class...",-1.898591,0.0,...,0.0,0.0,0.0,0.0,12.0,194.0,182.0,70.5,61.75,12.0
39835,120,K1 P1 Se3,KPSe3,3731.40401,3.278375,"{'crystal_system': 'Trigonal', 'symbol': 'P3_1...",mp-569702,"{'@module': 'pymatgen.core.structure', '@class...",-0.799751,0.0,...,0.0,0.0,0.0,0.0,2.0,229.0,227.0,54.6,69.76,14.0


In [18]:
df.columns

Index(['nsites', 'composition', 'formula_pretty', 'volume', 'density',
       'symmetry', 'material_id', 'structure', 'formation_energy_per_atom',
       'energy_above_hull',
       ...
       'MagpieData range GSmagmom', 'MagpieData mean GSmagmom',
       'MagpieData avg_dev GSmagmom', 'MagpieData mode GSmagmom',
       'MagpieData minimum SpaceGroupNumber',
       'MagpieData maximum SpaceGroupNumber',
       'MagpieData range SpaceGroupNumber', 'MagpieData mean SpaceGroupNumber',
       'MagpieData avg_dev SpaceGroupNumber',
       'MagpieData mode SpaceGroupNumber'],
      dtype='object', length=143)

### Drop single element materials

In [13]:
def get_rid_of_singular_elements(df):
    # Filter out rows where 'composition' contains only one element

    for element in df["composition"]:
        if isinstance(element, str) and " " in element:
            # If the composition is a string with multiple elements, keep it
            continue
        else:
            # If the composition is a single element, drop the row
            df = df[df["composition"] != element]   
    return df

df = get_rid_of_singular_elements(df)
    

In [14]:
len(df)

21185

In [81]:
#save cleaned and processed DataFrame to CSV
df.to_csv("materials_data_10k_cleaned.csv", index=False)

KeyboardInterrupt: 

In [82]:
df_mp = pd.read_csv('materials_data_10k_cleaned.csv')

In [83]:
len(df_mp)

12673

In [76]:
df_mp.columns





Index(['nsites', 'composition', 'formula_pretty', 'volume', 'density',
       'symmetry', 'material_id', 'structure', 'formation_energy_per_atom',
       'energy_above_hull',
       ...
       'MagpieData range GSmagmom', 'MagpieData mean GSmagmom',
       'MagpieData avg_dev GSmagmom', 'MagpieData mode GSmagmom',
       'MagpieData minimum SpaceGroupNumber',
       'MagpieData maximum SpaceGroupNumber',
       'MagpieData range SpaceGroupNumber', 'MagpieData mean SpaceGroupNumber',
       'MagpieData avg_dev SpaceGroupNumber',
       'MagpieData mode SpaceGroupNumber'],
      dtype='object', length=143)

In [66]:
unwanted_columns = ["builder_meta", "formula_anonymous", "property_name", "deprecated", 
                    "deprecated_reason", "last_updated", "last_updated_by", "origins", "warnings", "task_ids", "uncorrected_energy_per_atom", "is_stable", "is_metal", 
                    "equilibrum_reaction_energy_per_atom", "decomposes_to", "xas", "es_source_calc_id", "dos", "bandstructure", "dos_energy_up", "dos_energy_down",
                    "is_magnetic", "ordering", "total_magnetization", "total_magnetization_normalized_vol", "total_magnetization_normalized_formula_units", "num_magnetic_sites", 
                    "num_unique_magnetic_sites", "types_of_magnetic_species", "universal_anisotropy", "homogenous_poisson", "e_total", "e_ionic", "e_elctronic", "n", "e_ij_max", 
                    "weightded_sace_energy_EV_PER_ANG2", "weighted_surface_energy", "weighted_work_function","surface_anisotropy", "shape_factor", "has_reconstructed",
                     "possible_species", "has_props", "theoretical", "database_IDs", "fields_not_requested"]
# Remove unwanted columns
df_mp = df_mp.drop(columns=unwanted_columns, errors='ignore')

In [77]:
df_mp = df_mp.drop(columns=["deprecation_reasons","equilibrum_reaction_energy_per_atom", "deprecation_reasons", "equilibrum_reaction_energy_per_atom", "grain_boundaries", 
                            "cbm", "vbm", "efermi", "is_gap_direct", "homogenous_poisson", "e_electronic", "weighted_surface_energy_EV_PER_ANG2", "shear_modulus", "bulk_modulus"], errors='ignore')

In [78]:
df_mp.columns

Index(['nsites', 'composition', 'formula_pretty', 'volume', 'density',
       'symmetry', 'material_id', 'structure', 'formation_energy_per_atom',
       'energy_above_hull',
       ...
       'MagpieData range GSmagmom', 'MagpieData mean GSmagmom',
       'MagpieData avg_dev GSmagmom', 'MagpieData mode GSmagmom',
       'MagpieData minimum SpaceGroupNumber',
       'MagpieData maximum SpaceGroupNumber',
       'MagpieData range SpaceGroupNumber', 'MagpieData mean SpaceGroupNumber',
       'MagpieData avg_dev SpaceGroupNumber',
       'MagpieData mode SpaceGroupNumber'],
      dtype='object', length=143)

In [79]:
df_mp["structure"][0]

"{'@module': 'pymatgen.core.structure', '@class': 'Structure', 'charge': 0, 'lattice': {'matrix': [[3.93764022, 0.0, -0.0], [-1.96882011, 3.41009558, 0.0], [0.0, -0.0, 5.5125645500000005]], 'pbc': [True, True, True], 'a': 3.93764022, 'b': 3.9376394566130544, 'c': 5.5125645500000005, 'alpha': 90.0, 'beta': 90.0, 'gamma': 120.00000641314116, 'volume': 74.02122568300027}, 'properties': {}, 'sites': [{'species': [{'element': 'Te', 'occu': 1}], 'abc': [0.333333, 0.666667, 0.24867616], 'properties': {'magmom': 0.0}, 'label': 'Te', 'xyz': [-1.9688201100550593e-06, 2.2733981900318603, 1.370843384046128]}, {'species': [{'element': 'Te', 'occu': 1}], 'abc': [0.666667, 0.333333, 0.75132384], 'properties': {'magmom': 0.0}, 'label': 'Te', 'xyz': [1.96882207882011, 1.13669738996814, 4.141721165953872]}, {'species': [{'element': 'Rh', 'occu': 1}], 'abc': [0.0, -0.0, -0.0], 'properties': {'magmom': -0.0}, 'label': 'Rh', 'xyz': [0.0, 0.0, 0.0]}]}"

In [80]:
len(df_mp)

5804

In [None]:
from matminer.featurizers.structure import (
    DensityFeatures,
    GlobalSymmetryFeatures,
    StructuralHeterogeneity,
    BondFractions,
)
from pymatgen.core import Structure


import ast

def safe_structure_from_dict(x):
    # If already a dict, use as is
    if isinstance(x, dict):
        return Structure.from_dict(x)
    # If it's a string, try to parse as dict
    if isinstance(x, str):
        try:
            d = ast.literal_eval(x)
            if isinstance(d, dict):
                return Structure.from_dict(d)
        except Exception:
            pass
    return None  # or np.nan if you want to drop these rows later

df_mp["structure"] = df_mp["structure"].apply(safe_structure_from_dict)
# Optionally, drop rows where structure could not be parsed
df_mp = df_mp[df_mp["structure"].notnull()]

for col in ["density", "vpa", "packing fraction"]:
    if col in df_mp.columns:
        df_mp = df_mp.drop(columns=[col])


# Initialize featurizers
density_feat = DensityFeatures()
symmetry_feat = GlobalSymmetryFeatures()
heterogeneity_feat = StructuralHeterogeneity()
#bond_frac_feat = BondFractions()

# Drop columns that will be created by GlobalSymmetryFeatures if they exist
#for col in ["spacegroup_num", "crystal_system", "point_group", "crystal_system_int"]:
#    if col in df_mp.columns:
#        df_mp = df_mp.drop(columns=[col])

#df_mp = symmetry_feat.featurize_dataframe(df_mp, col_id="structure")

# Drop columns that will be created by StructuralHeterogeneity if they exist
#for col in ["structural_heterogeneity"]:
#    if col in df_mp.columns:
#        df_mp = df_mp.drop(columns=[col])

#df_mp = heterogeneity_feat.featurize_dataframe(df_mp, col_id="structure", ignore_errors=True)

# Drop columns that will be created by BondFractions if they exist
#for col in bond_frac_feat.feature_labels():
 #   if col in df_mp.columns:
  #      df_mp = df_mp.drop(columns=[col])

#df_mp = bond_frac_feat.featurize_dataframe(df_mp, col_id="structure")

# Fit BondFractions to your structures
bond_frac_feat.fit(df_mp["structure"])

# Apply featurizers to the dataframe
df_mp = density_feat.featurize_dataframe(df_mp, col_id="structure", ignore_errors=True)
df_mp = symmetry_feat.featurize_dataframe(df_mp, col_id="structure", ignore_errors=True)
df_mp = heterogeneity_feat.featurize_dataframe(df_mp, col_id="structure", ignore_errors=True)
df_mp = bond_frac_feat.featurize_dataframe(df_mp, col_id="structure", ignore_errors=True)




DensityFeatures:   0%|          | 0/5804 [00:00<?, ?it/s]

GlobalSymmetryFeatures:   0%|          | 0/5804 [00:00<?, ?it/s]

StructuralHeterogeneity:   0%|          | 0/5804 [00:00<?, ?it/s]

BondFractions:   0%|          | 0/5804 [00:00<?, ?it/s]

  r1 = _get_radius(structure[n])
  r1 = _get_radius(structure[n])
  r1 = _get_radius(structure[n])
  r1 = _get_radius(structure[n])
  r2 = _get_radius(entry["site"])
  r2 = _get_radius(entry["site"])
  r2 = _get_radius(entry["site"])
  r2 = _get_radius(entry["site"])
  r1 = _get_radius(structure[n])
  nn_data = self.get_nn_data(structure, n)
  nn_data = self.get_nn_data(structure, n)
  nn_data = self.get_nn_data(structure, n)
  nn_data = self.get_nn_data(structure, n)
  r2 = _get_radius(entry["site"])
  r1 = _get_radius(structure[n])
  r1 = _get_radius(structure[n])
  nn_data = self.get_nn_data(structure, n)
  r1 = _get_radius(structure[n])
  r2 = _get_radius(entry["site"])
  nn_data = self.get_nn_data(structure, n)
  r2 = _get_radius(entry["site"])
  r2 = _get_radius(entry["site"])
  nn_data = self.get_nn_data(structure, n)
  nn_data = self.get_nn_data(structure, n)
  r1 = _get_radius(structure[n])
  r2 = _get_radius(entry["site"])
  nn_data = self.get_nn_data(structure, n)
  r1 = _ge

In [71]:
df_mp.to_csv("materials_data_10k_structured.csv", index=False)

In [85]:
df_mp = pd.read_csv("materials_data_10k_structured.csv")

In [86]:
df_mp.head(20)

Unnamed: 0,nsites,composition,formula_pretty,volume,symmetry,material_id,structure,formation_energy_per_atom,energy_above_hull,band_gap,...,W - Zr bond frac.,Xe - Xe bond frac.,Y - Y bond frac.,Y - Yb bond frac.,Y - Zn bond frac.,Y - Zr bond frac.,Yb - Yb bond frac.,Zn - Zn bond frac.,Zn - Zr bond frac.,Zr - Zr bond frac.
0,3,Te2 Rh1,Te2Rh,74.021226,"{'crystal_system': 'Trigonal', 'symbol': 'P-3m...",mp-228,Full Formula (Te2 Rh1)\nReduced Formula: Te2Rh...,-0.43984,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,24,Zn1 As2,ZnAs2,558.344218,"{'crystal_system': 'Monoclinic', 'symbol': 'P2...",mp-7262,Full Formula (Zn8 As16)\nReduced Formula: ZnAs...,-0.148991,0.0,0.2241,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,16,Li1 As1,LiAs,287.389201,"{'crystal_system': 'Monoclinic', 'symbol': 'P2...",mp-7943,Full Formula (Li8 As8)\nReduced Formula: LiAs\...,-0.588209,0.0,0.3285,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10,Cd1 P4,CdP4,206.572949,"{'crystal_system': 'Monoclinic', 'symbol': 'P2...",mp-7904,Full Formula (Cd2 P8)\nReduced Formula: CdP4\n...,-0.120743,0.0,0.6551,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,16,Na1 Sb1,NaSb,477.144588,"{'crystal_system': 'Monoclinic', 'symbol': 'P2...",mp-7944,Full Formula (Na8 Sb8)\nReduced Formula: NaSb\...,-0.448723,0.0,0.6041,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,10,Ru1 F4,RuF4,126.86488,"{'crystal_system': 'Monoclinic', 'symbol': 'P2...",mp-974434,Full Formula (Ru2 F8)\nReduced Formula: RuF4\n...,-1.884284,0.0,0.1813,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,16,Li1 P1,LiP,248.527196,"{'crystal_system': 'Monoclinic', 'symbol': 'P2...",mp-9588,Full Formula (Li8 P8)\nReduced Formula: LiP\na...,-0.616667,0.0,0.8095,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,20,As2 Se3,As2Se3,561.681863,"{'crystal_system': 'Monoclinic', 'symbol': 'P2...",mp-909,Full Formula (As8 Se12)\nReduced Formula: As2S...,-0.139483,0.0,1.4466,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,12,Cu1 P2,CuP2,190.936932,"{'crystal_system': 'Monoclinic', 'symbol': 'P2...",mp-927,Full Formula (Cu4 P8)\nReduced Formula: CuP2\n...,-0.15631,0.0,0.8679,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,22,Sn5 O6,Sn5O6,392.793008,"{'crystal_system': 'Monoclinic', 'symbol': 'P2...",mp-978114,Full Formula (Sn10 O12)\nReduced Formula: Sn5O...,-1.762199,0.0,1.7702,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [87]:
len(df_mp)

5804

In [89]:
for column in df_mp.columns: 
    print(column)

nsites
composition
formula_pretty
volume
symmetry
material_id
structure
formation_energy_per_atom
energy_above_hull
band_gap
MagpieData minimum Number
MagpieData maximum Number
MagpieData range Number
MagpieData mean Number
MagpieData avg_dev Number
MagpieData mode Number
MagpieData minimum MendeleevNumber
MagpieData maximum MendeleevNumber
MagpieData range MendeleevNumber
MagpieData mean MendeleevNumber
MagpieData avg_dev MendeleevNumber
MagpieData mode MendeleevNumber
MagpieData minimum AtomicWeight
MagpieData maximum AtomicWeight
MagpieData range AtomicWeight
MagpieData mean AtomicWeight
MagpieData avg_dev AtomicWeight
MagpieData mode AtomicWeight
MagpieData minimum MeltingT
MagpieData maximum MeltingT
MagpieData range MeltingT
MagpieData mean MeltingT
MagpieData avg_dev MeltingT
MagpieData mode MeltingT
MagpieData minimum Column
MagpieData maximum Column
MagpieData range Column
MagpieData mean Column
MagpieData avg_dev Column
MagpieData mode Column
MagpieData minimum Row
MagpieData

In [92]:
df_mp["B - Ti bond frac."]

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
5799    0.0
5800    0.0
5801    0.0
5802    0.0
5803    0.0
Name: B - Ti bond frac., Length: 5804, dtype: float64

In [94]:
zero_counts = (df_mp == 0).sum()
for i in zero_counts: 
    print(i)

0
0
0
0
0
0
0
0
1900
4712
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
373
0
373
0
0
0
1122
0
1122
0
0
0
57
0
57
0
0
0
30
0
27
0
213
0
4002
0
4002
143
5267
2192
2244
2192
2244
3535
3914
970
1432
970
1432
2859
5388
3050
3150
3050
3150
4663
0
0
262
0
262
0
5690
4070
4184
4070
4184
5078
5273
2192
2244
2192
2244
3535
5347
2679
2728
2679
2728
4537
5682
4277
4280
4277
4280
5225
1405
136
524
136
524
897
0
0
0
0
0
0
5602
3531
3531
3531
3531
4371
5766
4727
4727
4727
4727
5321
0
0
990
0
990
0
0
0
0
0
0
0
1336
0
158
0
0
2
0
22
0
22
135
5783
5803
5803
5803
5803
5802
5803
5803
5802
5803
5803
5803
5801
5803
5803
5801
5803
5803
5803
5803
5803
5803
5803
5803
5803
5803
5803
5803
5803
5803
5803
5803
5803
5803
5803
5802
5803
5803
5803
5721
5801
5803
5801
5803
5801
5803
5803
5802
5803
5802
5801
5802
5802
5803
5803
5801
5802
5801
5799
5803
5800
5803
5802
5803
5802
5800
5802
5802
5800
5803
5803
5803
5800
5803
5797
5801
5802
5802
5802
5803
5799
5802
5802
5802
5803
5801
5802
5803
5803
5803
5803
5803
58

In [96]:
for coulmn, i in df_mp.columns, zero_counts: 
    print(coulmn, i)

ValueError: too many values to unpack (expected 2)