A customized dataset stored in a folder root_dir will have the following files:

    1. id_prop.csv: a CSV file with multiple columns. The first column recodes a unique ID for each crystal. From second column onwards the value of respective target property is stored. For eg., if you wish to perform multi-task learning for Formation energy and Band gap, then the second column should have the target value for Formation energy of the crystal and thrid column should have the target value for Band gap.

    2. atom_init.json: a JSON file that stores the initialization vector for each element. An example of atom_init.json is data/sample/atom_init.json, which should be good for most applications.

    3. ID.cif: a CIF file that recodes the crystal structure, where ID is the unique ID for the crystal.



In [12]:
import csv
import json
import numpy as np
from tqdm import tqdm
from pathlib import Path
from pymatgen.core import Structure
from pymatgen.io.cif import CifWriter
from typing import List, Dict


def read_json_structures(root: Path):
    return [
        {'_id': path.name.strip('.json'), 'structure': Structure.from_dict(json.load(open(path)))}
        for path in tqdm(root.glob('*.json'))
    ]


def read_csv(fname: Path, fieldnames=['_id', 'band_gap']):
    with open(fname, 'r') as f:
        return sorted(list(csv.DictReader(f, fieldnames))[1:], key=lambda x: x['_id'])


def write_csv(data, fieldnames, destination: Path):
    with open(destination, 'w') as f:
        csv.DictWriter(f, fieldnames=fieldnames, delimiter=',').writerows(data)


def write_cifs(crystals: List[Dict[str, str]], destination_root: Path):
    atom_init = {}

    # converting public data to cifs and vectorizing moleculas
    for d in tqdm(crystals):
        CifWriter(d['structure']).write_file(destination_root / Path(d['_id']).with_suffix('.cif'))
        vec = np.zeros(92, dtype=int)
        vec[list(set(d['structure'].atomic_numbers))] = 1
        atom_init[d['_id']] = vec.tolist()

    with open(destination_root / 'atom_init.json', 'w') as f:
        f.write(json.dumps(atom_init))


In [13]:
root_public_path = Path('../data/dichalcogenides_public')
root_private_path = Path('../data/dichalcogenides_private')
target_path = root_public_path / 'targets.csv'

private = read_json_structures(root_private_path / 'structures')
public = read_json_structures(root_public_path / 'structures')
target = read_csv(target_path)


2967it [00:17, 174.51it/s]
2966it [00:18, 160.74it/s]


In [21]:
num_to_id = {num: data['_id'] for num, data in enumerate(public + private)}
id_to_num = {value: key for key, value in num_to_id.items()}

for data in private:
    data['_id'] = id_to_num[data['_id']]

for data in public:
    data['_id'] = id_to_num[data['_id']]


In [15]:
cif_destination_public = Path('../models/mt-cgcnn/cifs/public')
cif_destination_private = Path('../models/mt-cgcnn/cifs/private')

!mkdir -p $cif_destination_public
!mkdir -p $cif_destination_private

# writing targets without header row
write_csv(target, ['_id', 'band_gap'], cif_destination_public / 'id_prop.csv')
# converting pymatgen to cifs for both public and private materials
write_cifs(public, cif_destination_public)
write_cifs(private, cif_destination_private)

mkdir: -: File exists
mkdir: p: File exists
mkdir: -: File exists
mkdir: p: File exists


100%|██████████| 2966/2966 [00:26<00:00, 112.56it/s]
100%|██████████| 2967/2967 [00:25<00:00, 117.40it/s]
