In [4]:
from pathlib import Path
import json
import numpy as np
from pymatgen.core.structure import Structure

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# The similarity datasets live two directory levels above the working directory.
curr_path = Path.cwd()
dataset_parent = curr_path.parent.parent / 'datasets' / 'similarity-datasets'

# Read the JSON file holding all structures for the chosen element.
element = 'Zr'
file_path = dataset_parent / f'{element}_O_structures_all.json'
with file_path.open('r') as json_file:
    collection = json.load(json_file)

# Report the number of structures in this collection.
print(len(collection))

1675


In [34]:
# Extract the set of elements present in one sample structure.
# `Structure` is already imported in the notebook's first (import) cell,
# so it is not re-imported here.
test_key = next(iter(collection))  # first key, without copying every key into a list
structure = Structure.from_dict(collection[test_key])
# One string per distinct species found on the structure's sites.
element_set = {str(site.specie) for site in structure}
element_set

{'O', 'Zr'}

In [35]:
import pandas as pd

# Per-structure summary columns, filled in the same order as `collection`.
compositions = []
element_sets = []

# Loop over the collection; only the serialized structures are needed here,
# the keys are added to the table afterwards.
for item in collection.values():
    structure = Structure.from_dict(item)
    # Build the element set once and reuse it (it used to be computed twice
    # per structure, with the first result thrown away).
    element_set = {str(site.specie) for site in structure}
    compositions.append(str(structure.composition))
    element_sets.append(element_set)

# Column-oriented data: one entry per structure in every list.
data_dict = {
    'mp_id': list(collection.keys()),
    'composition': compositions,
    'elements': element_sets
}

df = pd.DataFrame.from_dict(data_dict)
df

Unnamed: 0,mp_id,composition,elements
0,mp-1245318,Zr30 O60,"{O, Zr}"
1,mp-755769,Zr4 O8,"{O, Zr}"
2,mp-775935,Zr3 O6,"{O, Zr}"
3,mp-775909,Zr2 O4,"{O, Zr}"
4,mp-2574,Zr2 O4,"{O, Zr}"
...,...,...,...
1670,mp-530787,Y6 Zr22 O53,"{O, Zr, Y}"
1671,mp-531103,Y2 Zr24 O51,"{O, Zr, Y}"
1672,mp-686417,Zr50 Sc12 O118,"{O, Zr, Sc}"
1673,mp-684977,Zr27 O49,"{O, Zr}"


In [36]:
# Flag rows whose element set is exactly oxygen plus the selected element.
binary_oxide = {'O', element}
df['O_only'] = df['elements'].map(lambda elems: elems == binary_oxide)
df['O_only'].value_counts()

O_only
False    1644
True       31
Name: count, dtype: int64

In [37]:
# Keep only the rows flagged as O_only and look their entries up in the collection.
df_filtered = df.loc[df['O_only']].copy()
filter_keys = list(df_filtered['mp_id'])
collection_filtered = {mp_id: collection[mp_id] for mp_id in filter_keys}
print(len(collection_filtered))

# Write the subset to a JSON file; indent=4 keeps it human-readable.
output_path = dataset_parent / f'{element}_O_only.json'
with output_path.open('w') as out_file:
    json.dump(collection_filtered, out_file, indent=4)

31


In [50]:
# Reload the O-only collection from disk. This cell only reads the file; the
# selection of experimentally observed structures (author's note: lowest
# energy above hull on MP) happens in the cell that follows.
element = 'Zr'
file_path = dataset_parent / f'{element}_O_only.json'
with file_path.open('r') as json_file:
    collection = json.load(json_file)
print(len(collection))

31


In [51]:
# Hand-picked ids (ten per element) of the oxide structures treated as
# experimentally observed.
Zr_list = ['mp-14024', 'mp-2858', 'mp-561418', 'mp-1017', 'mp-1190186',
           'mp-556605', 'mp-2574', 'mp-1565', 'mp-963', 'mp-10735']

Ti_list = ['mp-1215', 'mp-8057', 'mp-390', 'mp-554098', 'mp-2591',
           'mp-458', 'mp-1071163', 'mp-556754', 'mp-1188323', 'mp-1147']

Si_list = ['mp-7000', 'mp-6930', 'mp-12787', 'mp-6945', 'mp-546794',
           'mp-733790', 'mp-640556', 'mp-555235', 'mp-669426', 'mp-542814']

# Pick the id list that belongs to `element`, so the selected keys and the
# output file name can never refer to different elements.
curated_ids = {'Zr': Zr_list, 'Ti': Ti_list, 'Si': Si_list}
common_keys = curated_ids[element]

# Fail with one message naming every absent id instead of stopping at the first.
missing_keys = [key for key in common_keys if key not in collection]
if missing_keys:
    raise KeyError(f'ids not found in the {element} collection: {missing_keys}')

collection_common = {key: collection[key] for key in common_keys}
print(len(collection_common))
# Save to a JSON file
with open(dataset_parent / f'{element}_O_only_10_common.json', 'w') as f:
    json.dump(collection_common, f, indent=4)  # `indent=4` makes it human-readable

10


# Export the structures as CIF files

In [5]:
from pymatgen.io.cif import CifWriter

# Elements to export, and which dataset variant to read for each of them.
elements = ['Ti', 'Si', 'Zr']
extension = 'O_only'  # alternative variant: 'O_only_10_common'

for element in elements:
    # Read this element's structure collection from its JSON file.
    file_path = dataset_parent / f'{element}_{extension}.json'
    with file_path.open('r') as json_file:
        collection = json.load(json_file)

    # One output folder per element/variant, created if it does not exist yet.
    folder_write_path = dataset_parent / 'CIFs' / f'{element}_{extension}'
    folder_write_path.mkdir(parents=True, exist_ok=True)

    # Rebuild each structure with pymatgen and write it to <key>.cif.
    for mp_id, struct_dict in collection.items():
        structure = Structure.from_dict(struct_dict)
        CifWriter(structure).write_file(folder_write_path / f'{mp_id}.cif')

    print(f"Done making CIFs for {element}: {len(collection)} files exported")

Done making CIFs for Ti: 124 files exported
Done making CIFs for Si: 343 files exported
Done making CIFs for Zr: 31 files exported
