## Download entries to build general and transfer datasets
**!!! First run the Jupyter backend setup** 

https://github.com/xiaohang007/SLICES?tab=readme-ov-file#jupyter-backend-setup

**Set num_sites=(1,5) to speed up the test run**

**Set num_sites=(1,10) to reproduce the results reported in the manuscript**

In [None]:
import json
from invcryrep.utils import temporaryWorkingDirectory,search_materials,exclude_elements_json,splitRun,show_progress,collect_json,collect_csv
# Build the two Materials Project datasets used by the rest of the workflow.
# Both queries below use num_sites=(1,5) for a quick test run; change both to
# (1,10) to reproduce the results reported in the manuscript.
with temporaryWorkingDirectory("./HTS/0_get_json_mp_api"):
    # Elements 87-118 (Fr through Og), handed to exclude_elements_json below.
    exclude_elements = [
        'Fr', 'Ra', 'Ac', 'Th', 'Pa', 'U', 'Np',
        'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr', 'Rf',
        'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', 'Rg', 'Cn', 'Nh', 'Fl', 'Mc',
        'Lv', 'Ts', 'Og',
    ]

    # General dataset (written to prior_model_dataset.json).
    dict_json = search_materials(
        apikeyPath='/crystal/APIKEY.ini',
        formation_energy=(-10000, 0),
        num_sites=(1, 5),
        fields=["material_id"],
    )
    filtered_json = exclude_elements_json(dict_json, exclude_elements)
    with open('prior_model_dataset.json', 'w') as f:
        json.dump(filtered_json, f)
    print("prior_model_dataset.json generated")

    # Transfer-learning dataset: same query plus a band-gap window of
    # (0.10, 0.55) and is_gap_direct=True.
    dict_json2 = search_materials(
        apikeyPath='/crystal/APIKEY.ini',
        band_gap=(0.10, 0.55),
        num_sites=(1, 5),
        formation_energy=(-10000, 0),
        is_gap_direct=True,
        fields=["material_id"],
    )
    filtered_json2 = exclude_elements_json(dict_json2, exclude_elements)
    with open('transfer_learning_dataset.json', 'w') as f:
        json.dump(filtered_json2, f)
    print("transfer_learning_dataset.json generated")

# Rule out crystals with low-dimensional units (e.g. molecular crystals or
# layered crystals). Each entry is (working directory, dataset to split,
# merged output file); the same three calls are run for both datasets.
filter_3d_jobs = (
    ("./HTS/0_get_json_mp_api/2_filter_prior_3d",
     '../prior_model_dataset.json',
     "../prior_model_dataset_filtered.json"),
    ("./HTS/0_get_json_mp_api/3_filter_transfer_3d",
     '../transfer_learning_dataset.json',
     "../transfer_model_dataset_filtered.json"),
)
for workdir, dataset_json, filtered_output in filter_3d_jobs:
    with temporaryWorkingDirectory(workdir):
        splitRun(filename=dataset_json, threads=8, skip_header=False)
        show_progress()
        # Merge every output.json found under the working directory.
        collect_json(
            output=filtered_output,
            glob_target="./**/output.json",
            cleanup=True,
        )

# Convert crystal structures in datasets to SLICES strings and conduct data augmentation

In [None]:
from invcryrep.utils import temporaryWorkingDirectory,splitRun,show_progress,collect_json,collect_csv
# Run the augmentation jobs for both filtered datasets. Each entry is
# (working directory, filtered dataset to split, merged .sli output).
augmentation_jobs = (
    ("./HTS/1_augmentation/prior",
     '../../0_get_json_mp_api/prior_model_dataset_filtered.json',
     "../prior_aug.sli"),
    ("./HTS/1_augmentation/transfer",
     '../../0_get_json_mp_api/transfer_model_dataset_filtered.json',
     "../transfer_aug.sli"),
)
for workdir, dataset_json, merged_sli in augmentation_jobs:
    with temporaryWorkingDirectory(workdir):
        splitRun(filename=dataset_json, threads=16, skip_header=False)
        show_progress()
        # Merge every result.sli found under the working directory; no header
        # line is written (header="").
        collect_csv(
            output=merged_sli,
            glob_target="./**/result.sli",
            header="",
            cleanup=True,
        )

# Train_sample
## Train general RNN and specialized RNN
**Set --epochs 3 to speed up the test run.**

**To accelerate the training process, consider adjusting the batch_size appropriately.**

**e.g., --batch_size 200 for 4GB GPU memory, --batch_size 1200 for 24GB GPU memory**

In [None]:
from invcryrep.utils import temporaryWorkingDirectory
import os
# Train the general RNN and then the specialized (transfer-learning) RNN.
# The three scripts are run in order from ./HTS/2_train_sample. Each later
# script consumes files produced by an earlier one (e.g. Prior_local.ckpt is
# passed to transfer_userinpt.py), so the sequence stops at the first command
# that exits with a non-zero status instead of continuing on stale or missing
# artefacts.
training_commands = (
    "python data_structs.py ../1_augmentation/prior_aug.sli",
    "python train_prior.py --batch_size 1200 --epochs 3",
    "python transfer_userinpt.py --task train_model --voc Voc_prior"
    " --smi ../1_augmentation/transfer_aug.sli --save_process_smi process.csv"
    " --prior_model Prior_local.ckpt --tf_model transfer_test.ckpt"
    " --batch_size 1200 --epochs 3",
)
failed_command = None
with temporaryWorkingDirectory("./HTS/2_train_sample"):
    for command in training_commands:
        exit_status = os.system(command)
        if exit_status != 0:
            failed_command = (command, exit_status)
            break
# Raise only after the working-directory context has been left, so the
# notebook is never stranded inside ./HTS/2_train_sample.
if failed_command is not None:
    raise RuntimeError(
        f"training step failed with exit status {failed_command[1]}: {failed_command[0]}"
    )

## Generate SLICES
**Generate 16,000 SLICES to speed up the test run**

**Generate 10,000,000 SLICES to reproduce the results reported in the manuscript**

In [None]:
from invcryrep.utils import temporaryWorkingDirectory,splitRun_sample,show_progress,collect_csv
# Sample SLICES strings in ./HTS/2_train_sample and merge them into sampled.sli.
with temporaryWorkingDirectory("./HTS/2_train_sample"):
    # 16000 SLICES in total, generated with 8 CPU threads.
    splitRun_sample(
        threads=8,
        sample_size=16000,
    )
    show_progress()
    # Concatenate the per-job 100x.sli files; no header line (header="").
    collect_csv(
        output="sampled.sli",
        glob_target="job_*/100x.sli",
        header="",
        cleanup=True,
    )

# Decode SLICES into Crystal Structures

In [None]:
from invcryrep.utils import temporaryWorkingDirectory,splitRun_csv,show_progress,collect_csv
# Decode the sampled SLICES strings: split sampled.sli across 16 threads and
# merge the per-job result.csv files under a fixed header.
with temporaryWorkingDirectory("./HTS/3_inverse"):
    splitRun_csv(
        filename='../2_train_sample/sampled.sli',
        threads=16,
        skip_header=False,  # the .sli input was collected with an empty header
    )
    show_progress()
    collect_csv(
        output="results_3_inverse.csv",
        glob_target="job_*/result.csv",
        header='SLICES,formula,energy_per_atom,POSCAR\n',
        cleanup=True,
    )

# Filter out crystals with compositions that exist in the Materials Project database

In [None]:
from invcryrep.utils import temporaryWorkingDirectory,splitRun_csv,show_progress,collect_csv
# Composition filter: run the step-4 jobs on the decoded structures and merge
# the per-job result.csv files. The output keeps the same four columns as the
# input.
with temporaryWorkingDirectory("./HTS/4_composition_filter"):
    splitRun_csv(
        filename='../3_inverse/results_3_inverse.csv',
        threads=16,
        skip_header=True,  # the input CSV starts with a header line
    )
    show_progress()
    collect_csv(
        output="results_4_composition_filter.csv",
        glob_target="job_*/result.csv",
        header='SLICES,formula,energy_per_atom,POSCAR\n',
        cleanup=True,
    )

# Find high-symmetry structures in candidates with duplicate compositions

In [None]:
from invcryrep.utils import temporaryWorkingDirectory,splitRun_csv,show_progress,collect_csv
import pandas as pd
# Run the step-5 jobs on the composition-filtered candidates, then keep, for
# every (formula, space_group_number) pair, only the row(s) with the lowest
# energy_per_atom_sym.
with temporaryWorkingDirectory("./HTS/5_symmetry_filter_refine"):
    splitRun_csv(
        filename='../4_composition_filter/results_4_composition_filter.csv',
        threads=16,
        skip_header=True,
    )
    show_progress()
    collect_csv(
        output="results_5_symmetry_filter_refine.csv",
        glob_target="job_*/result.csv",
        header='index,SLICES,POSCAR,formula,energy_per_atom,energy_per_atom_sym,space_group_number\n',
        index=True,
        cleanup=True,
    )

    df = pd.read_csv("results_5_symmetry_filter_refine.csv")
    # Drop every row whose space group number is 1.
    not_sg1 = df.loc[df['space_group_number'] != 1]
    by_formula_and_sg = not_sg1.groupby(['formula', 'space_group_number'], group_keys=False)
    # Within each group keep the minimum-energy row(s); ties are all retained.
    result = by_formula_and_sg.apply(
        lambda grp: grp[grp.energy_per_atom_sym == grp.energy_per_atom_sym.min()]
    )
    result.to_csv("results_5_symmetry_filter_refine_filtered.csv", index=False)

# Rule out crystals displaying minimum structural dissimilarity value < 0.75 (a dissimilarity threshold used in the Materials Project) with respect to the structures in the training dataset

In [None]:
from invcryrep.utils import temporaryWorkingDirectory,splitRun,splitRun_csv,show_progress,collect_csv_filter
import os,sys,glob
from monty.serialization import loadfn, dumpfn
import shutil  # used below to delete the per-job directories

# Save the structural fingerprints of the training dataset to JSON.
with temporaryWorkingDirectory("./HTS/6_structure_dissimilarity_filter/0_save_structure_fingerprint"):
    splitRun(filename='../../0_get_json_mp_api/prior_model_dataset_filtered.json', threads=16, skip_header=False)
    show_progress()
    # Concatenate the per-job fingerprint lists into one list and store it one
    # directory up, next to the filtering step that follows.
    training_fingerprint_list = []
    for fingerprint_file in glob.glob("job_*/training_fingerprint_list.json.gz"):
        training_fingerprint_list.extend(loadfn(fingerprint_file))
    dumpfn(training_fingerprint_list, "../training_fingerprint_list.json.gz")
    print("../training_fingerprint_list.json.gz has been generated")
    # Best-effort cleanup of the job_* entries (expected to be directories).
    # shutil.rmtree replaces the former shell "rm -r": no shell is spawned,
    # no quoting is needed, and it also works on non-POSIX systems.
    for job_dir in glob.glob("job_*"):
        shutil.rmtree(job_dir, ignore_errors=True)

# Rule out crystals displaying minimum structural dissimilarity value < 0.75
with temporaryWorkingDirectory("./HTS/6_structure_dissimilarity_filter"):
    splitRun_csv(filename='../5_symmetry_filter_refine/results_5_symmetry_filter_refine_filtered.csv', threads=16, skip_header=True)
    show_progress()
    # Collect results. The lambda is the screening criterion: it is True when
    # the last comma-separated field of a row (the dissimilarity column) is
    # >= dissimilarity_limit; presumably only those rows pass the filter.
    dissimilarity_limit = 0.75
    collect_csv_filter(
        output="results_6_structure_dissimilarity_filter.csv",
        glob_target="job_*/result.csv",
        header='index,SLICES,POSCAR,formula,energy_per_atom,energy_per_atom_sym,space_group_number,dissimilarity\n',
        condition=lambda i: float(i.split(',')[-1]) >= dissimilarity_limit,
        cleanup=True,
    )

# Rule out candidates with IAP-predicted energy above hull >= 50 meV/atom

**Set ehull_limit=0.5 to facilitate the test run.** To initiate a preliminary screening, set the ehull_limit to 0.5 eV/atom, which relaxes the selection criteria and yields some results. If we were to set it to a more stringent condition like 0.05 eV/atom, it's likely we wouldn't find any matches due to the limited number of SLICES (16,000) we're working with.

**Set ehull_limit=0.05 to reproduce the results reported in the manuscript**

In [None]:
from invcryrep.utils import temporaryWorkingDirectory,splitRun_csv,show_progress,collect_csv_filter
import os
# Screening threshold in eV/atom: 0.5 for the test run, 0.05 to reproduce the
# results reported in the manuscript.
ehull_limit = 0.5

ehull_status = None
with temporaryWorkingDirectory("./HTS/7_EAH_prescreen"):
    # Remove any result.csv left over from a previous run (errors ignored).
    os.system("rm result.csv > /dev/null 2>&1")
    # Download relevant entries for the high-throughput energy above hull
    # calculation. The exit status is checked so that success is only reported
    # when ehull.py actually succeeded.
    ehull_status = os.system("python ehull.py -i '../6_structure_dissimilarity_filter/results_6_structure_dissimilarity_filter_filtered.csv' -o result.csv")
    if ehull_status == 0:
        print("competitive_compositions.json.gz has been generated")
        # Run the energy-above-hull prescreen jobs on the candidates.
        splitRun_csv(
            filename='../6_structure_dissimilarity_filter/results_6_structure_dissimilarity_filter_filtered.csv',
            threads=16,
            skip_header=True,
        )
        show_progress()
        # Collect results. The condition is True when the last comma-separated
        # field of a row (energy_above_hull_prescreen) is <= ehull_limit.
        collect_csv_filter(
            output="results_7_EAH_prescreen.csv",
            glob_target="job_*/result.csv",
            header='index,SLICES,POSCAR,formula,energy_per_atom,energy_per_atom_sym,space_group_number,dissimilarity,energy_above_hull_prescreen\n',
            condition=lambda i: float(i.split(',')[-1]) <= ehull_limit,
            cleanup=True,
        )
# Raise only after the working-directory context has been left, so the
# notebook is never stranded inside ./HTS/7_EAH_prescreen.
if ehull_status != 0:
    raise RuntimeError(
        f"ehull.py exited with status {ehull_status}; energy above hull prescreen was not run"
    )

# Rule out candidates with ALIGNN predicted band gap E_g < 0.1 eV (less likely to be a semiconductor) 

In [None]:
from invcryrep.utils import temporaryWorkingDirectory,splitRun_csv,show_progress,collect_csv_filter
# Band-gap prescreen: run the step-8 jobs on the candidates that passed the
# energy-above-hull prescreen, then collect the rows through a band-gap test.
with temporaryWorkingDirectory("./HTS/8_band_gap_prescreen"):
    splitRun_csv(
        filename='../7_EAH_prescreen/results_7_EAH_prescreen_filtered.csv',
        threads=4,
        skip_header=True,
    )
    show_progress()
    band_gap_lower_limit = 0.1  # eV
    # The condition is True when the last comma-separated field of a row
    # (band_gap_prescreen) is strictly greater than band_gap_lower_limit.
    collect_csv_filter(
        output="results_8_band_gap_prescreen.csv",
        glob_target="job_*/result.csv",
        header='index,SLICES,POSCAR,formula,energy_per_atom,energy_per_atom_sym,space_group_number,dissimilarity,energy_above_hull_prescreen,band_gap_prescreen\n',
        condition=lambda row: float(row.rsplit(',', 1)[-1]) > band_gap_lower_limit,
        cleanup=True,
    )