Select which materials from the MP-20 dataset to run calculations on.

Selection criteria:

All of these must be true:
- Binary compound or unary compound
- Unit cell contains at most 10 atoms (diamond: 8 per cubic cell)
- Contains at least one element no heavier than chlorine (atomic number Z = 17)
- If there is a second element, it is no heavier than bromine (atomic number Z = 35)


In [17]:
import pandas as pd
import re

# Element groups used by the composition filter.

# "Light" elements: atomic numbers 1-17 with the noble gases (He, Ne, Ar) left out.
light_elements = {
    "H",                                       # period 1
    "Li", "Be", "B", "C", "N", "O", "F",       # period 2
    "Na", "Mg", "Al", "Si", "P", "S", "Cl",    # period 3
}

# "Heavy" elements: period 4 from K (Z = 19) through Br (Z = 35).
heavy_elements = {
    "K", "Ca",
    "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
    "Ga", "Ge", "As", "Se", "Br",
}

# Elements treated as metallic; a compound made only of these is rejected.
metallic_elements = {
    "Li", "Be", "Na", "Mg", "Al",
    "K", "Ca",
    "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
    "Ga", "Ge",
}

# Reduced formulas that are dropped outright, whatever their element make-up.
illegal_formulas = {
    "SF6", "O2", "H2", "HCl", "SiH", "HF", "HC", "HBr",
    "BrCl", "Br2Cl3", "H4C", "F2", "H2O", "CO2", "SO2", "N2",
}

# Load the train / val / test splits and stack them into a single frame
# with a fresh 0..N-1 index.
df = pd.concat(
    [
        pd.read_csv(split_path)
        for split_path in (
            "../MP20_dataset/train.csv",
            "../MP20_dataset/val.csv",
            "../MP20_dataset/test.csv",
        )
    ],
    ignore_index=True,
)
print(f"Total entries before filtering: {len(df)}")

# Function to count unit cell atoms
def get_atom_count(cif):
    """Estimate the number of atoms in the unit cell described by a CIF string.

    The result is ``Z * n``, where ``Z`` is read from ``_cell_formula_units_Z``
    and ``n`` is the sum of the element counts in ``_chemical_formula_sum``
    (an element without an explicit count contributes 1).

    Parameters
    ----------
    cif : str
        CIF text of a single structure.

    Returns
    -------
    float or None
        The atom count, or ``None`` when either CIF tag is missing.
    """
    z_match = re.search(r"_cell_formula_units_Z\s+([\d\.]+)", cif)
    if not z_match:
        return None
    Z = float(z_match.group(1))

    # The formula value is either quoted (it contains spaces, e.g. 'Zn1 O1') or
    # a bare token (e.g. C2). Each alternative stops at the end of the value,
    # so the capture can never run on into the following CIF lines; otherwise
    # later tags such as "_cell_formula_units_Z" would be parsed as elements.
    formula_match = re.search(
        r"_chemical_formula_sum\s+(?:'([^'\n]+)'|\"([^\"\n]+)\"|(\S+))", cif
    )
    if not formula_match:
        return None

    # Exactly one of the three alternatives captured the formula text.
    formula = next(g for g in formula_match.groups() if g is not None)

    atom_counts = sum(
        int(count) if count else 1
        for _, count in re.findall(r"([A-Z][a-z]*)(\d*)", formula)
    )
    # NOTE(review): this assumes _chemical_formula_sum is given per formula
    # unit. If the CIFs were written by a tool that stores the full cell
    # composition in that tag, multiplying by Z over-counts -- TODO confirm.
    return Z * atom_counts

# Function to check element criteria (excluding pure metallic compounds and illegal formulas)
def check_elements(row):
    """Decide whether a dataset row passes the composition filter.

    Reads ``row["elements"]`` (parsed here as a bracketed, comma-separated
    list of symbols, with any quotes stripped) and ``row["pretty_formula"]``.

    A row is accepted only if all of the following hold:

    * it has at most two elements,
    * its formula is not listed in ``illegal_formulas``,
    * its elements are not all members of ``metallic_elements``,
    * a single element is in ``light_elements``; a pair is either two light
      elements or one light plus one from ``heavy_elements``.

    Returns
    -------
    bool
        ``True`` when the row should be kept.
    """
    cleaned = row["elements"].strip("[]").replace("'", "").replace('"', "")
    species = [token.strip() for token in cleaned.split(",")]
    formula = row["pretty_formula"].strip()

    # Ternary and higher compounds are out of scope.
    if len(species) > 2:
        return False

    # Explicitly black-listed formulas.
    if formula in illegal_formulas:
        return False

    # Purely metallic systems (this also covers a single metallic element).
    if all(sp in metallic_elements for sp in species):
        return False

    # Unary: the one element has to be light.
    if len(species) == 1:
        return species[0] in light_elements

    # Binary: light + light, or light + heavy.
    n_light = sum(1 for sp in species if sp in light_elements)
    n_heavy = sum(1 for sp in species if sp in heavy_elements)
    if n_light == 2:
        return True
    return n_light == 1 and n_heavy == 1

# Annotate every entry with its unit-cell atom count (None where the CIF
# tags could not be parsed; such rows fail the size comparison below).
df["unitcell_atoms"] = df["cif"].apply(get_atom_count)

# NOTE(review): the criteria text at the top of the notebook states a
# 10-atom limit, but this filter uses 12 -- confirm which one is intended.
size_ok = df["unitcell_atoms"] <= 12
composition_ok = df.apply(check_elements, axis=1)
filtered_df = df[size_ok & composition_ok]

# Persist the full filtered table plus a formula-only listing.
filtered_df.to_csv("filtered.csv", index=False)
filtered_df["pretty_formula"].to_csv("column_data.csv", index=False, header=True)

print("Filtered data saved to 'filtered.csv'.")
print(f"Total entries after filtering: {len(filtered_df)}")
print(filtered_df["pretty_formula"])


Total entries before filtering: 45229
Filtered data saved to 'filtered.csv'.
Total entries after filtering: 368
173        ZnO
358        MnS
371        FeS
880      SiSe2
933       CoO2
         ...  
44578     LiN3
44687     Mn2N
44747        C
44816      GaN
44855     LiCl
Name: pretty_formula, Length: 368, dtype: object


Then export each filtered structure to a POSCAR file.

In [None]:
import pandas as pd
import os
from tqdm import tqdm
from pymatgen.core import Structure
from pymatgen.io.vasp import Poscar

# Read the filtered CSV produced by the filtering cell
df = pd.read_csv("filtered.csv")

# Create output directory (no error if it already exists)
output_dir = "structures_for_kappa"
os.makedirs(output_dir, exist_ok=True)

# Material IDs whose CIF could not be converted or written
failed_ids = []

# Convert every row's CIF string to a POSCAR file named after its material ID.
# `total` is passed because iterrows() is a generator and tqdm cannot size it.
for _, row in tqdm(df.iterrows(), total=len(df)):
    mp_id = row["material_id"]  # Get material ID
    cif_str = row["cif"]  # Get CIF string

    try:
        # Convert CIF to pymatgen Structure
        structure = Structure.from_str(cif_str, fmt="cif")

        # Convert Structure to POSCAR
        poscar = Poscar(structure)

        # Save POSCAR file. The write had been disabled inside a string
        # literal, so the loop produced no files while still reporting success.
        poscar_filename = os.path.join(output_dir, f"POSCAR_{mp_id}")
        with open(poscar_filename, "w") as f:
            f.write(str(poscar))

    except Exception as e:
        # Best effort: keep going, but remember what failed so the final
        # summary is accurate.
        failed_ids.append(mp_id)
        print(f"Error processing {mp_id}: {e}")

if failed_ids:
    n_written = len(df) - len(failed_ids)
    print(f"Wrote {n_written} POSCAR files; {len(failed_ids)} failed: {failed_ids}")
else:
    print("All POSCAR files generated successfully!")


  struct = parser.parse_structures(primitive=primitive)[0]
  struct = parser.parse_structures(primitive=primitive)[0]
  struct = parser.parse_structures(primitive=primitive)[0]
  struct = parser.parse_structures(primitive=primitive)[0]
  struct = parser.parse_structures(primitive=primitive)[0]
368it [00:00, 455.62it/s]

All POSCAR files generated successfully!





In [20]:
# Bundle the exported POSCAR directory into a zip archive (-r recurses into it),
# then delete the directory itself so only structures_for_kappa.zip remains.
!zip -r structures_for_kappa.zip structures_for_kappa/
!rm -rf structures_for_kappa/

  adding: structures_for_kappa/ (stored 0%)
  adding: structures_for_kappa/POSCAR_mp-7790 (deflated 74%)
  adding: structures_for_kappa/POSCAR_mp-27705 (deflated 74%)
  adding: structures_for_kappa/POSCAR_mp-7583 (deflated 73%)
  adding: structures_for_kappa/POSCAR_mvc-11241 (deflated 68%)
  adding: structures_for_kappa/POSCAR_mp-1229 (deflated 74%)
  adding: structures_for_kappa/POSCAR_mp-604884 (deflated 74%)
  adding: structures_for_kappa/POSCAR_mp-849086 (deflated 76%)
  adding: structures_for_kappa/POSCAR_mp-30034 (deflated 68%)
  adding: structures_for_kappa/POSCAR_mp-1226378 (deflated 69%)
  adding: structures_for_kappa/POSCAR_mp-1180046 (deflated 75%)
  adding: structures_for_kappa/POSCAR_mp-1080771 (deflated 83%)
  adding: structures_for_kappa/POSCAR_mp-917 (deflated 70%)
  adding: structures_for_kappa/POSCAR_mp-1205322 (deflated 75%)
  adding: structures_for_kappa/POSCAR_mp-24084 (deflated 67%)
  adding: structures_for_kappa/POSCAR_mp-22877 (deflated 69%)
  adding: structures