In [8]:
import json
import os
import re
from csv import DictReader, DictWriter
from glob import glob
from pathlib import Path

import numpy as np
import pandas as pd
from jarvis.ai.descriptors.cfid import CFID
from jarvis.ai.descriptors.cfid import feat_names
from jarvis.core.atoms import Atoms
from pymatgen.core import Structure
from pymatgen.io.cif import CifWriter
from pymatgen.io.jarvis import JarvisAtomsAdaptor
from tqdm.notebook import tqdm

In [7]:
def get_pymatgen_and_jarvis(paths: str) -> tuple[dict, dict]:
    """Load structure JSON files as pymatgen structures and JARVIS atoms.

    Args:
        paths: Glob pattern matching the JSON files to load.

    Returns:
        A ``(structures, jarvis)`` pair of dicts, both keyed by the file name
        without its ``.json`` extension. The first maps each key to the object
        built by ``Structure.from_dict``; the second maps it to the result of
        ``JarvisAtomsAdaptor.get_atoms`` for that structure.
    """
    matched_paths = glob(paths)
    print('Getting pymatgen')
    structures = {}
    for path in tqdm(matched_paths):
        # Close each file explicitly: one handle per structure would otherwise
        # stay open until garbage collection.
        with open(path, 'r') as file:
            # Path.stem gives the id for any file name and path separator; the
            # previous regex only handled '/<word characters>.json' and fell
            # back to the whole path otherwise.
            structures[Path(path).stem] = Structure.from_dict(json.load(file))
    print('Converting to jarvis')
    jarvis = {
        key: JarvisAtomsAdaptor.get_atoms(value)
        for key, value in tqdm(structures.items())
    }
    return structures, jarvis

In [36]:
# Root folders of the public and private dichalcogenides datasets.
root_public_path = Path('../data/dichalcogenides_public')
root_private_path = Path('../data/dichalcogenides_private')
# The targets file lives in the public dataset folder.
target_path = root_public_path / 'targets.csv'

# Later cells rely on the '_id' column of this frame to match targets to structures.
target = pd.read_csv(target_path)

In [3]:
# Load every JSON file in the public dataset's structures/ folder in both representations.
train_pymatgen, train_jarvis = get_pymatgen_and_jarvis('../data/dichalcogenides_public/structures/*.json')

Getting pymatgen


  0%|          | 0/2966 [00:00<?, ?it/s]

Converting to jarvis


  0%|          | 0/2966 [00:00<?, ?it/s]

In [4]:
# Load every JSON file in the private dataset's structures/ folder in both representations.
eval_pymatgen, eval_jarvis = get_pymatgen_and_jarvis('../data/dichalcogenides_private/structures/*.json')

Getting pymatgen


  0%|          | 0/2967 [00:00<?, ?it/s]

Converting to jarvis


  0%|          | 0/2967 [00:00<?, ?it/s]

In [5]:
# Load every JSON file in the public dataset's defects/ folder in both representations.
train_defects_pymatgen, train_defects_jarvis = get_pymatgen_and_jarvis('../data/dichalcogenides_public/defects/*.json')

Getting pymatgen


  0%|          | 0/2966 [00:00<?, ?it/s]

Converting to jarvis


  0%|          | 0/2966 [00:00<?, ?it/s]

In [6]:
# Load every JSON file in the private dataset's defects/ folder in both representations.
eval_defects_pymatgen, eval_defects_jarvis = get_pymatgen_and_jarvis('../data/dichalcogenides_private/defects/*.json')

Getting pymatgen


  0%|          | 0/2967 [00:00<?, ?it/s]

Converting to jarvis


  0%|          | 0/2967 [00:00<?, ?it/s]

In [58]:
from collections import OrderedDict
import os

def save_to_jarvis(path: str, structures: dict, targets: pd.DataFrame = None) -> None:
    """Write each structure to ``<path>/<id>.vasp`` and optionally an ``id_prop.csv``.

    Args:
        path: Output directory; created if missing. A trailing separator is
            optional because file names are joined with ``os.path.join``.
        structures: Mapping of id to structure object. The text written for
            each entry is ``str(structure)``.
            NOTE(review): presumably JARVIS ``Atoms`` whose string form is a
            POSCAR-style listing — confirm.
        targets: Optional frame with an ``_id`` column. When given, a copy with
            ``'.vasp'`` appended to every ``_id`` is saved as ``id_prop.csv``
            without index or header. The caller's frame is not modified.
    """
    os.makedirs(path, exist_ok=True)
    for jid, structure in tqdm(structures.items()):
        # os.path.join keeps files inside `path` even without a trailing slash;
        # plain concatenation would create '<path><id>.vasp' beside the folder.
        with open(os.path.join(path, f'{jid}.vasp'), 'w') as file:
            file.write(str(structure))
    if targets is not None:
        targets_new = targets.copy()
        # Ids in the CSV must match the file names written above.
        targets_new['_id'] = targets_new['_id'] + '.vasp'
        targets_new.to_csv(os.path.join(path, 'id_prop.csv'), index=False, header=False)

In [59]:
# Write the private-set defect structures as .vasp files; no targets are passed, so no id_prop.csv is written.
save_to_jarvis('../data/eval/defects/jarvis/', eval_defects_jarvis)

  0%|          | 0/2967 [00:00<?, ?it/s]

In [60]:
# Write the public-set defect structures as .vasp files together with id_prop.csv built from `target`.
save_to_jarvis('../data/train/defects/jarvis/', train_defects_jarvis, target)

  0%|          | 0/2966 [00:00<?, ?it/s]

In [61]:
# Write the private-set structures as .vasp files; no targets are passed, so no id_prop.csv is written.
save_to_jarvis('../data/eval/no_defects/jarvis/', eval_jarvis)

  0%|          | 0/2967 [00:00<?, ?it/s]

In [62]:
# Write the public-set structures as .vasp files together with id_prop.csv built from `target`.
save_to_jarvis('../data/train/no_defects/jarvis/', train_jarvis, target)

  0%|          | 0/2966 [00:00<?, ?it/s]

In [165]:
def write_cifs(crystals: dict[str, "Structure"], destination_root: Path):
    """Export structures as CIF files and save a per-structure element vector.

    For every ``(id, structure)`` pair a ``<id>.cif`` file is written with
    ``CifWriter``. A 92-element 0/1 vector marking the atomic numbers present
    in the structure is collected per id and dumped to ``atom_init.json`` in
    the same folder.

    Args:
        crystals: Mapping of id to structure; each structure must be accepted
            by ``CifWriter`` and expose ``atomic_numbers``.
        destination_root: Output directory; created if it does not exist.
    """
    destination_root = Path(destination_root)
    # CifWriter.write_file and open() both fail on a missing folder.
    destination_root.mkdir(parents=True, exist_ok=True)

    atom_init = {}
    # Convert each structure to CIF and build its element-presence vector.
    for crystal_id, structure in tqdm(crystals.items()):
        # Append the extension instead of Path.with_suffix, which would
        # replace everything after the last dot in an id.
        CifWriter(structure).write_file(destination_root / f'{crystal_id}.cif')
        # Position Z is set to 1 when an atom with atomic number Z occurs.
        # NOTE(review): position 0 is never set and Z >= 92 would raise
        # IndexError — confirm this indexing is what the consumer expects.
        vec = np.zeros(92, dtype=int)
        vec[list(set(structure.atomic_numbers))] = 1
        atom_init[crystal_id] = vec.tolist()

    with open(destination_root / 'atom_init.json', 'w') as f:
        json.dump(atom_init, f)

In [92]:
def jarvis_to_cfid(structures: dict) -> dict:
    """Compute a CFID descriptor for every entry of ``structures``.

    Args:
        structures: Mapping of id to the atoms object handed to ``CFID``.

    Returns:
        Dict with the same keys, each mapped to ``CFID(atoms).get_comp_descp()``.
    """
    return {
        key: CFID(atoms).get_comp_descp()
        for key, atoms in tqdm(structures.items())
    }

In [94]:
# CFID descriptors for the private-set defect structures.
eval_defects_cfid = jarvis_to_cfid(eval_defects_jarvis)

  0%|          | 0/2967 [00:00<?, ?it/s]

You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_

In [95]:
# CFID descriptors for the private-set structures.
eval_cfid = jarvis_to_cfid(eval_jarvis)

  0%|          | 0/2967 [00:00<?, ?it/s]

In [96]:
# CFID descriptors for the public-set defect structures.
train_defects_cfid = jarvis_to_cfid(train_defects_jarvis)

  0%|          | 0/2966 [00:00<?, ?it/s]

You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_

In [97]:
# CFID descriptors for the public-set structures.
train_cfid = jarvis_to_cfid(train_jarvis)

  0%|          | 0/2966 [00:00<?, ?it/s]

In [168]:
# Export the private-set defect structures as CIF files plus atom_init.json.
write_cifs(eval_defects_pymatgen, Path('../data/eval/defects/cifs'))

  0%|          | 0/2967 [00:00<?, ?it/s]

In [169]:
# Export the public-set defect structures as CIF files plus atom_init.json.
write_cifs(train_defects_pymatgen, Path('../data/train/defects/cifs'))

  0%|          | 0/2966 [00:00<?, ?it/s]

In [170]:
# Export the private-set structures as CIF files plus atom_init.json.
write_cifs(eval_pymatgen, Path('../data/eval/no_defects/cifs'))

  0%|          | 0/2967 [00:00<?, ?it/s]

In [171]:
# Export the public-set structures as CIF files plus atom_init.json.
write_cifs(train_pymatgen, Path('../data/train/no_defects/cifs'))

  0%|          | 0/2966 [00:00<?, ?it/s]

In [186]:
# Build the CFID feature table for the public-set structures (one row per id),
# attach the targets by matching the row index against '_id', and save it as CSV.
# to_csv does not create missing folders, so make sure the output folder exists.
Path('../data/train/no_defects/cfid').mkdir(parents=True, exist_ok=True)
train_boosting = pd.DataFrame(train_cfid.values(), index=train_cfid.keys(), columns=feat_names())
train_boosting = train_boosting.merge(target, left_on=train_boosting.index, right_on='_id')
train_boosting.to_csv('../data/train/no_defects/cfid/train.csv')

In [189]:
# Build the CFID feature table for the public-set defect structures (one row per id),
# attach the targets by matching the row index against '_id', and save it as CSV.
# to_csv does not create missing folders, so make sure the output folder exists.
Path('../data/train/defects/cfid').mkdir(parents=True, exist_ok=True)
train_defects_boosting = pd.DataFrame(train_defects_cfid.values(), index=train_defects_cfid.keys(), columns=feat_names())
train_defects_boosting = train_defects_boosting.merge(target, left_on=train_defects_boosting.index, right_on='_id')
train_defects_boosting.to_csv('../data/train/defects/cfid/train.csv')

In [190]:
# Build the CFID feature table for the private-set defect structures (one row per id)
# and save it as CSV; no targets are merged for this set.
# to_csv does not create missing folders, so make sure the output folder exists.
Path('../data/eval/defects/cfid').mkdir(parents=True, exist_ok=True)
eval_defects_boosting = pd.DataFrame(eval_defects_cfid.values(), index=eval_defects_cfid.keys(), columns=feat_names())
eval_defects_boosting.to_csv('../data/eval/defects/cfid/eval.csv')

In [192]:
# Build the CFID feature table for the private-set structures (one row per id)
# and save it as CSV; no targets are merged for this set.
# to_csv does not create missing folders, so make sure the output folder exists.
Path('../data/eval/no_defects/cfid').mkdir(parents=True, exist_ok=True)
eval_boosting = pd.DataFrame(eval_cfid.values(), index=eval_cfid.keys(), columns=feat_names())
eval_boosting.to_csv('../data/eval/no_defects/cfid/eval.csv')


In [203]:
def save_pymatgen(structures: dict[str, Structure], path: str) -> None:
    """Serialise every structure to ``<path>/<id>.json``.

    Args:
        structures: Mapping of id to structure; each value's ``to_json()``
            result is written verbatim.
        path: Output directory; created if missing. A trailing separator is
            optional because file names are joined with ``os.path.join``.
    """
    os.makedirs(path, exist_ok=True)
    for struct_id, struct in tqdm(structures.items()):
        with open(os.path.join(path, struct_id + '.json'), 'w') as f:
            # to_json() returns a single string; write() emits it in one call,
            # whereas writelines() would iterate it character by character.
            f.write(struct.to_json())

In [204]:
# Save the public-set structures as one JSON file per id.
save_pymatgen(train_pymatgen, '../data/train/no_defects/pymatgen/')

  0%|          | 0/2966 [00:00<?, ?it/s]

In [205]:
# Save the private-set structures as one JSON file per id.
save_pymatgen(eval_pymatgen, '../data/eval/no_defects/pymatgen/')

  0%|          | 0/2967 [00:00<?, ?it/s]

In [206]:
# Save the public-set defect structures as one JSON file per id.
save_pymatgen(train_defects_pymatgen, '../data/train/defects/pymatgen/')

  0%|          | 0/2966 [00:00<?, ?it/s]

In [207]:
# Save the private-set defect structures as one JSON file per id.
save_pymatgen(eval_defects_pymatgen, '../data/eval/defects/pymatgen/')

  0%|          | 0/2967 [00:00<?, ?it/s]