In [None]:
import copy
import json

from tqdm import tqdm

In [None]:
import sys

sys.path.append("../")

### Structure filtering

In this step, we take the cleaned raw data from notebook 1.1 and filter it based on local geometry. We mainly filter structures by their local <sup>27</sup>Al environments, in the following two steps:
1. Keep Al sites with coordination number 4, 5, or 6.
2. Keep Al sites whose environments are tetrahedral, pyramidal, or octahedral.  

Reload the Alnmr_clean.json file from data/interim

In [None]:
# Root of the project data directory, relative to this notebook.
data_path = "../data/"

# Read the cleaned dataset from data/interim.
clean_json = data_path + "interim/Alnmr_clean.json"
with open(clean_json, "r") as handle:
    data = json.load(handle)

print("length of file is {}".format(len(data)))

Parse the raw data into a custom-defined object called structure_tensor. A structure_tensor holds two sets of information: the crystal structure as a pymatgen structure, and the NMR spectrum parameters computed from the raw tensors.

    structure_tensor = {
        'structure': pymatgen.structure,
        'tensors': {
            'diso',
            'csa',
            'csa_reduced',
            'etacs',
            'etaQ',
            'CQ',
            'site_index',
            'structure_index',
            'site_coord',
        }
    }

In [None]:
# Build one structure_tensor per compound from its "structure", "efg" and "cs" fields.
from src.data.structure_tensors_gen import get_structure_tensor

structure_tensors = [
    get_structure_tensor(entry["structure"], entry["efg"], entry["cs"])
    for entry in tqdm(data)
]

print("length of structure_tensors:", len(structure_tensors))

pymatgen is a powerful tool for modifying crystal structures. Here we use it to add an oxidation state to each site.

In [None]:
# NOTE(review): the wildcard import hides where later names come from. The cells below call
# add_oxi_state_by_guess, get_n_coord_tensors, append_coord_num, append_ce and filter_ce
# without importing them anywhere else, so they presumably come from this module —
# TODO confirm and switch to explicit imports.
from src.data.structure_tensors_modifier import *

# Add oxidation states to every structure in structure_tensors. This might take a long
# time, depending on the structures. The name is rebound to the call's result, so
# re-running this cell passes the already-processed list back in.
structure_tensors = add_oxi_state_by_guess(structure_tensors)

Now we can filter the data based on local Al coordinations

In [None]:
# Select entries with get_n_coord_tensors for coord=[4, 5, 6], then pass the selection
# through append_coord_num.
coord_selected = get_n_coord_tensors(structure_tensors, coord=[4, 5, 6])
structure_tensors_filtered = append_coord_num(coord_selected)
len(structure_tensors_filtered)

Add chemical environment info (such as T:4, tetrahedron) to the "Tensor" key in structure_tensor. This might take a long time, depending on the structures.

In [None]:
# Rebinds structure_tensors_filtered to the result of append_ce, so re-running this cell
# feeds its own output back in. NOTE(review): confirm append_ce tolerates that, or
# restart the kernel before re-running.
structure_tensors_filtered = append_ce(structure_tensors_filtered)

Filter structures based on their local chemical environment (chemenv). Here we select the T:4, T:5, and O:6 sites.

In [None]:
chemenv_filter = filter_ce(structure_tensors_filtered)

# Report how many entries ended up under the "outliers" key.
outlier_count = len(chemenv_filter["outliers"])
print("number of outliers:", outlier_count)

Save the processed data in data/interim for feature generation in the next step. 

In [None]:
# Work on a deep copy so the objects held in chemenv_filter["filtered"] are not modified.
filtered_data = copy.deepcopy(chemenv_filter["filtered"])

# Replace each structure object with its dict form so the list can be dumped as JSON.
# The loop variable is deliberately not called `data`: that name holds the raw dataset
# loaded earlier in the notebook, and overwriting it would break re-running the cell
# that builds structure_tensors.
for entry in filtered_data:
    entry["structure"] = entry["structure"].as_dict()

# Write the filtered entries to data/interim for the feature-generation step.
dir_ = data_path + "interim/"
filename = "filtered_data.json"
with open(dir_ + filename, "w") as outfile:
    json.dump(filtered_data, outfile)