# Datasets converter IDAO-22 notebook
Team: NESCafe Gold 3in1

## Imports and defect structure calculation

In [1]:
import json
import os
import re
from glob import glob
from pathlib import Path

import nglview
import numpy as np
import pandas as pd
from jarvis.ai.descriptors.cfid import CFID
from jarvis.ai.descriptors.cfid import feat_names
from pymatgen.core import Structure
from pymatgen.io.cif import CifWriter
from pymatgen.io.jarvis import JarvisAtomsAdaptor
from tqdm import tqdm
from tqdm.notebook import tqdm


# Hook tqdm into pandas so `progress_apply` and friends are available.
tqdm.pandas()


def show(x):
    """Return an nglview widget displaying the pymatgen object ``x``."""
    return nglview.show_pymatgen(x)





### Complement structures calculation
...using multiprocessing

In [3]:
!python -m scripts/atoms_to_defects.py

## Utility functions

In [7]:
def get_pymatgen_and_jarvis(paths: str) -> tuple[dict, dict]:
    """Load pymatgen structures matching a glob pattern and convert them to jarvis atoms.

    Args:
        paths: Glob pattern selecting the JSON files to read. Every file must
            contain a dict accepted by ``Structure.from_dict``.

    Returns:
        A ``(structures, jarvis)`` pair of dicts sharing the same keys: the
        file name without its ``.json`` extension. ``structures`` holds the
        objects built by ``Structure.from_dict`` and ``jarvis`` holds the
        result of ``JarvisAtomsAdaptor.get_atoms`` for each of them.
    """
    matched_files = glob(paths)
    print('Getting pymatgen')
    structures = {}
    for file_path in tqdm(matched_files):
        # Use a context manager so the handle is closed right after parsing
        # instead of being left open until garbage collection.
        with open(file_path, 'r') as file:
            # Path.stem yields the bare id for any file name, including names
            # with non-word characters and paths with Windows separators,
            # where the previous regex fell back to the whole path.
            structures[Path(file_path).stem] = Structure.from_dict(json.load(file))
    print('Converting to jarvis')
    jarvis = {
        key: JarvisAtomsAdaptor.get_atoms(value)
        for key, value in tqdm(structures.items())
    }
    return structures, jarvis


def save_to_jarvis(path: str, structures: dict, targets: pd.DataFrame = None) -> None:
    """Write every structure to ``<path>/<id>.vasp`` and optionally an ``id_prop.csv``.

    Args:
        path: Output directory; it is created when missing. A trailing path
            separator is accepted but not required.
        structures: Mapping from structure id to an object whose ``str()`` is
            written verbatim as the file content.
        targets: Optional table with an ``_id`` column of string ids. When
            given, a copy with ``.vasp`` appended to each ``_id`` is saved as
            ``id_prop.csv`` in ``path`` without header or index. The frame
            passed in is not modified.
    """
    os.makedirs(path, exist_ok=True)
    for jid, structure in tqdm(structures.items()):
        # os.path.join keeps the file inside `path` whether or not the caller
        # supplied a trailing slash.
        with open(os.path.join(path, f'{jid}.vasp'), 'w') as file:
            file.write(str(structure))
    if targets is not None:
        targets_new = targets.copy()
        targets_new['_id'] = targets_new['_id'] + '.vasp'
        targets_new.to_csv(os.path.join(path, 'id_prop.csv'), index=False, header=False)


def write_cifs(crystals: dict[str, 'Structure'], destination_root: Path):
    """Export structures as CIF files plus an ``atom_init.json`` of element vectors.

    For every entry a ``<id>.cif`` file is written through ``CifWriter`` and a
    92-element 0/1 vector is built with a 1 at each index equal to an atomic
    number present in the structure. All vectors are stored, keyed by
    structure id, in ``atom_init.json`` inside ``destination_root``.

    Args:
        crystals: Mapping from structure id to an object accepted by
            ``CifWriter`` that exposes ``atomic_numbers``.
        destination_root: Output directory; it is created when missing.
    """
    destination_root = Path(destination_root)
    # Create the folder up front (as save_to_jarvis does), otherwise the first
    # write fails when the directory does not exist yet.
    destination_root.mkdir(parents=True, exist_ok=True)

    atom_init = {}
    # Convert the structures to CIF files and vectorize their composition.
    for struct_id, structure in tqdm(crystals.items()):
        CifWriter(structure).write_file(destination_root / Path(struct_id).with_suffix('.cif'))
        # NOTE(review): the atomic number is used directly as the index, so
        # index 0 is never set and Z >= 92 raises IndexError; confirm this
        # encoding is what the downstream consumer expects.
        vec = np.zeros(92, dtype=int)
        vec[list(set(structure.atomic_numbers))] = 1
        atom_init[struct_id] = vec.tolist()

    with open(destination_root / 'atom_init.json', 'w') as f:
        json.dump(atom_init, f)


def jarvis_to_cfid(structures: dict) -> dict:
    """Map every structure id to its CFID composition descriptor.

    Args:
        structures: Mapping from structure id to an atoms object accepted by
            ``CFID``.

    Returns:
        A dict with the same keys whose values are the results of
        ``CFID(atoms).get_comp_descp()``.
    """
    return {
        key: CFID(atoms).get_comp_descp()
        for key, atoms in tqdm(structures.items())
    }


def save_pymatgen(structures: dict[str, 'Structure'], path: str) -> None:
    """Serialize every structure to ``<path>/<id>.json``.

    Args:
        structures: Mapping from structure id to an object whose ``to_json()``
            returns the JSON text to store.
        path: Output directory; it is created when missing. A trailing path
            separator is accepted but not required.
    """
    if path:
        os.makedirs(path, exist_ok=True)
    for struct_id, struct in tqdm(structures.items()):
        with open(os.path.join(path, f'{struct_id}.json'), 'w') as f:
            # to_json() returns one string; write() stores it in a single
            # call, whereas writelines() would iterate it character by character.
            f.write(struct.to_json())

## Datasets calculation

In [36]:
# Target table read from the public dataset folder; later cells merge it on its `_id` column.
target_path = Path('../data/dichalcogenides_public/targets.csv')
target = pd.read_csv(target_path)

In [None]:
# Glob patterns for the serialized pymatgen structures of each split
# (train / eval, from the `no_defects` and `defects` folders).
structures_paths = {
    'train':         '../data/train/no_defects/pymatgen/*.json',
    'eval':          '../data/eval/no_defects/pymatgen/*.json',
    'train_defects': '../data/train/defects/pymatgen/*.json',
    'eval_defects':  '../data/eval/defects/pymatgen/*.json',
}

In [None]:
# Load every split in both representations (pymatgen and jarvis).
# The target table is also copied next to the two training sets; `to_csv` is called
# with pandas defaults, so these copies include the header and the index column.
train_pymatgen, train_jarvis = get_pymatgen_and_jarvis(structures_paths['train'])
target.to_csv('../data/train/no_defects/pymatgen/targets.csv')
eval_pymatgen, eval_jarvis = get_pymatgen_and_jarvis(structures_paths['eval'])
train_defects_pymatgen, train_defects_jarvis = get_pymatgen_and_jarvis(structures_paths['train_defects'])
target.to_csv('../data/train/defects/pymatgen/targets.csv')
eval_defects_pymatgen, eval_defects_jarvis = get_pymatgen_and_jarvis(structures_paths['eval_defects'])

In [59]:
# (atoms, output directory) pairs for every split. A list of pairs is used because
# the atoms collections are dicts, which are unhashable and cannot be dict keys.
jarvis_paths = [
    (train_jarvis,         '../data/train/no_defects/jarvis/'),
    (eval_jarvis,          '../data/eval/no_defects/jarvis/'),
    (train_defects_jarvis, '../data/train/defects/jarvis/'),
    (eval_defects_jarvis,  '../data/eval/defects/jarvis/'),
]

for atoms, path in jarvis_paths:
    save_to_jarvis(path, atoms)

# NOTE(review): save_to_jarvis can write id_prop.csv itself (no header/index, ids
# suffixed with `.vasp`) when given `targets=`; confirm which layout the consumer expects.
target.to_csv('../data/train/no_defects/jarvis/id_prop.csv')
target.to_csv('../data/train/defects/jarvis/id_prop.csv')

  0%|          | 0/2967 [00:00<?, ?it/s]

In [94]:
# CFID composition descriptors for every split, keyed by structure id.
train_cfid = jarvis_to_cfid(train_jarvis)
eval_cfid = jarvis_to_cfid(eval_jarvis)
train_defects_cfid = jarvis_to_cfid(train_defects_jarvis)
eval_defects_cfid = jarvis_to_cfid(eval_defects_jarvis)

  0%|          | 0/2967 [00:00<?, ?it/s]

You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_n parameter
You might consider increasing max_

In [168]:
# Export each split as CIF files together with its atom_init.json.
write_cifs(eval_defects_pymatgen, Path('../data/eval/defects/cifs'))
write_cifs(train_defects_pymatgen, Path('../data/train/defects/cifs'))
write_cifs(eval_pymatgen, Path('../data/eval/no_defects/cifs'))
write_cifs(train_pymatgen, Path('../data/train/no_defects/cifs'))

# Copy the target table next to the training CIFs (pandas defaults: header and index included).
target.to_csv('../data/train/no_defects/cifs/id_prop.csv')
target.to_csv('../data/train/defects/cifs/id_prop.csv')

  0%|          | 0/2967 [00:00<?, ?it/s]

In [186]:
# The descriptor column names come from the same no-argument call for every split,
# so fetch them once.
cfid_columns = feat_names()

# Nothing earlier in the notebook creates the cfid folders, and `to_csv` does not
# create missing directories, so make sure they exist.
for cfid_dir in (
    '../data/train/no_defects/cfid',
    '../data/eval/no_defects/cfid',
    '../data/train/defects/cfid',
    '../data/eval/defects/cfid',
):
    os.makedirs(cfid_dir, exist_ok=True)

# Training tables: descriptors joined with the targets on the structure id.
train_boosting = pd.DataFrame(train_cfid.values(), index=train_cfid.keys(), columns=cfid_columns)
train_boosting = train_boosting.merge(target, left_on=train_boosting.index, right_on='_id')
train_boosting.to_csv('../data/train/no_defects/cfid/train.csv')

# Evaluation tables: descriptors only, indexed by structure id.
eval_boosting = pd.DataFrame(eval_cfid.values(), index=eval_cfid.keys(), columns=cfid_columns)
eval_boosting.to_csv('../data/eval/no_defects/cfid/eval.csv')

train_defects_boosting = pd.DataFrame(train_defects_cfid.values(), index=train_defects_cfid.keys(), columns=cfid_columns)
train_defects_boosting = train_defects_boosting.merge(target, left_on=train_defects_boosting.index, right_on='_id')
train_defects_boosting.to_csv('../data/train/defects/cfid/train.csv')

eval_defects_boosting = pd.DataFrame(eval_defects_cfid.values(), index=eval_defects_cfid.keys(), columns=cfid_columns)
eval_defects_boosting.to_csv('../data/eval/defects/cfid/eval.csv')

In [204]:
# Serialize every split back to JSON. These are the same folders the structures
# were globbed from, so files with matching ids are overwritten.
save_pymatgen(train_pymatgen, '../data/train/no_defects/pymatgen/')
save_pymatgen(eval_pymatgen, '../data/eval/no_defects/pymatgen/')
save_pymatgen(train_defects_pymatgen, '../data/train/defects/pymatgen/')
save_pymatgen(eval_defects_pymatgen, '../data/eval/defects/pymatgen/')

  0%|          | 0/2966 [00:00<?, ?it/s]

### Calculate graph features
...using multiprocessing

In [None]:
!python -m scripts/graph_features.py