Batch_generate defect types, later pass on to high-throughput MACE simulation.

In [8]:
from pymatgen.analysis.local_env import CrystalNN
from pymatgen.analysis.structure_prediction.dopant_predictor import (
    get_dopants_from_shannon_radii,
    get_dopants_from_substitution_probabilities
)
from pymatgen.ext.matproj import MPRester
from pymatgen.io.vasp import Poscar

num_dopants = 5 # n, p doping each 5

In [9]:
import os
import json
from tqdm import tqdm
import pandas as pd

directory = 'SM_dataset'  # Directory containing JSON files

def give_doping_recommendation(structure):
    structure.add_oxidation_state_by_guess()

    try: # try substitution probability first
        threshold = 1e-3  # probability threshold for substitution/structure predictions
        dopants = get_dopants_from_substitution_probabilities(
            structure, num_dopants=num_dopants, threshold=threshold
        )
    except ValueError: # if doesn't work here, e.g. alloys, try shannon radii matching
        try:
            cnn = CrystalNN()
            bonded_structure = cnn.get_bonded_structure(structure)
            dopants = get_dopants_from_shannon_radii(bonded_structure, num_dopants=num_dopants)
        except Exception as e:
            # if still doesn't work, for example single element compound have no oxidation number
            # then return nothing
            dopants = None
    return dopants

data_list = []

cnt = 0
for filename in tqdm(os.listdir(directory)):
    cnt += 1
    if cnt > 25:
        break
    # Construct the full path to the file
    file_path = os.path.join(directory, filename)
    
    # Open and read the JSON file
    with open(file_path, 'r') as file:
        data = json.load(file)
        structure_string = data["Structure_rlx"]
        ICSD = data["ICSD_number"]

    structure_string = '\n'+structure_string.split('\n', 1)[1]

    # Use pymatgen to parse the structure
    poscar = Poscar.from_str(structure_string, read_velocities=False)
    # poscar.write_file('./POSCAR')
    structure = poscar.structure
    if sum(poscar.natoms) >= 25:
        dopants = None
    else: # return a dictionary from pymatgen format
        dopants = give_doping_recommendation(structure)
    
    # convert to unified dataframe
    if dopants is not None:
        # Extract the first 5 dopants for n_doping and p_doping, if available
        n_dopants = [dopants['n_type'][i]['dopant_species'] if i < len(dopants['n_type']) else None for i in range(5)]
        n_originals = [dopants['n_type'][i]['original_species'] if i < len(dopants['n_type']) else None for i in range(5)]
        p_dopants = [dopants['p_type'][i]['dopant_species'] if i < len(dopants['p_type']) else None for i in range(5)]
        p_originals = [dopants['p_type'][i]['original_species'] if i < len(dopants['p_type']) else None for i in range(5)]
    else:
        n_dopants, n_originals = [None] * 5, [None] * 5
        p_dopants, p_originals = [None] * 5, [None] * 5
    
    data_list.append({
        "ICSD": ICSD,
        "Crystal": filename,
        "n_dopant": n_dopants,
        "n_original": n_originals,
        "p_dopant": p_dopants,
        "p_original": p_originals,
    })

df = pd.DataFrame(data_list)

  0%|          | 0/10476 [00:00<?, ?it/s]

Skipping...
Skipping...
Skipping...
Skipping...
Skipping...
Skipping...
Skipping...
Skipping...
Skipping...
Skipping...
Skipping...
Skipping...
Skipping...
  0%|          | 25/10476 [00:01<11:15, 15.47it/s]


In [10]:
df.to_csv('Defect_combinations.csv')
df.to_pickle('Defect_combinations.pkl')
df.head(20)

Unnamed: 0,ICSD,Crystal,n_dopant,n_original,p_dopant,p_original
0,36626,Nb1O5P1_36626.json,"[None, None, None, None, None]","[None, None, None, None, None]","[None, None, None, None, None]","[None, None, None, None, None]"
1,195115,Bi1Cl1O1_195115.json,"[F-, Cl-, K+, Rb+, Na+]","[O2-, O2-, Cl-, Cl-, Cl-]","[O2-, S2-, Na+, K+, Rb+]","[Cl-, Cl-, Bi3+, Bi3+, Bi3+]"
2,65183,I3O1W1_65183.json,"[F-, Cl-, K+, Cs+, Rb+]","[O2-, O2-, I-, I-, I-]","[O2-, Cr3+, S2-, Pb2+, Mg2+]","[I-, W5+, I-, W5+, W5+]"
3,260890,Al2Ca2O9Sn2_260890.json,"[None, None, None, None, None]","[None, None, None, None, None]","[None, None, None, None, None]","[None, None, None, None, None]"
4,180558,Na2O2_180558.json,"[None, None, None, None, None]","[None, None, None, None, None]","[None, None, None, None, None]","[None, None, None, None, None]"
5,170946,Cl1Cu1H3N1_170946.json,"[O2-, S2-, Se2-, F-, Zn2+]","[N3-, N3-, N3-, N3-, Cu+]","[O2-, S2-, O2-, O2-, Se2-]","[Cl-, Cl-, Cu+, H+, Cl-]"
6,262708,Cd1O3Ti1_262708.json,"[F-, Ta5+, Cl-, Nb5+, Ge4+]","[O2-, Ti4+, O2-, Ti4+, Cd2+]","[Na+, Na+, Zn2+, Mn2+, Mg2+]","[Cd2+, Ti4+, Ti4+, Ti4+, Ti4+]"
7,8097,Ca1O4Te1_8097.json,"[F-, La3+, Cl-, Nd3+, Pr3+]","[O2-, Ca2+, O2-, Ca2+, Ca2+]","[Na+, Re5+, K+, Ir5+, Co2+]","[Ca2+, Te6+, Ca2+, Te6+, Te6+]"
8,14260,Cl2Cs1I1_14260.json,"[None, None, None, None, None]","[None, None, None, None, None]","[None, None, None, None, None]","[None, None, None, None, None]"
9,609323,Al1Se2Tl1_609323.json,"[None, None, None, None, None]","[None, None, None, None, None]","[None, None, None, None, None]","[None, None, None, None, None]"
