# Clean ARC Input
This notebook helps clean the species section of an ARC input file before the ARC job is submitted. You can provide thermo libraries so that species already present in them are removed from the ARC input and are not calculated again. You can also provide a filter species dictionary; species found in it are removed from the input as well.

You can also find a script version under `../scripts/`

In [None]:
import os
import copy

from easy_rmg_model.common import (read_yaml_file,
                                   regularize_path,
                                   save_yaml_file)
from easy_rmg_model.rmg2arc.arc_input import (combine_arc_species_inputs,
                                              combine_spc_info,
                                              find_species_from_spc_dict,)
from easy_rmg_model.rmg2arc.species_dict import (load_spc_dict,
                                                 species_from_spc_info)
from easy_rmg_model.rmg2arc.thermo_db import (load_thermo_database,
                                              load_thermo_lib_by_path)

## INPUT
- `input_file`: The ARC input file to be cleaned.
- `libraries_path`: Path to a YAML file containing the list of thermo libraries to check. Any species found in one of these libraries is removed from the input file.
- `filter_spc_dict`: Path to a file containing the species to be filtered out.
- `output_path`: Path where the cleaned ARC input is saved.

In [None]:
# ARC input file whose species section is to be cleaned
input_file = "/Users/xiaorui/Apps/ARC_lite/script/input_sens.yml"
# YAML file listing the thermo libraries to check against;
# a falsy value skips the thermo-library filter
libraries_path = "./library.yml"
# File with the species to filter out; None skips the dictionary filter
filter_spc_dict = None
# Where the cleaned ARC input is written
output_path = "./input_cleaned.yml"

## Step 1. Read species information from uncleaned ARC input

In [None]:
# Read the 'species' section of the uncleaned ARC input and index its
# entries by their 'label' (a later entry with the same label replaces an
# earlier one).
arc_input_species = read_yaml_file(input_file)['species']
spc_info = dict((entry['label'], entry) for entry in arc_input_species)

## Step 2. Filter by the provided filtering species dictionary

In [None]:
if filter_spc_dict:
    # Load the filter dictionary under its own name. `filter_spc_dict` keeps
    # holding the user-supplied path, so this cell can be re-run without
    # handing an already-loaded object back to `load_spc_dict`.
    filter_species = load_spc_dict(filter_spc_dict)
    # Collect the labels of the species that are NOT found in the filter
    # dictionary; only those are kept.
    clean = []
    for label, spc in spc_info.items():
        dict_label, _ = find_species_from_spc_dict(spc, filter_species)
        if not dict_label:  # cannot find species
            clean.append(label)
        else:
            print(f'Warning: species {label} is cleaned out due to belonging '
                  f'to filtered species dictionary')
    # A set gives constant-time membership tests when rebuilding `spc_info`
    kept_labels = set(clean)
    spc_info = {label: spc for label, spc in spc_info.items()
                if label in kept_labels}

## Step 3. Filter by the provided thermo libraries

In [None]:
if libraries_path:
    # Load the thermo libraries listed in the YAML file: the built-in ones
    # first, then each external library by its path.
    libraries = read_yaml_file(libraries_path)
    thermo_db = load_thermo_database(libraries=libraries['built-in_libs'])
    # 'external_libs' may be missing or left empty in the YAML file; both
    # cases mean there is no external library to load.
    for t_lib in libraries.get('external_libs') or []:
        load_thermo_lib_by_path(t_lib, thermo_db)

    # Collect the labels of the species to keep, i.e. those for which the
    # database returns at most one thermo entry.
    clean = []
    for label, spc in spc_info.items():
        try:
            thermo_data = thermo_db.get_all_thermo_data(
                species_from_spc_info(spc))
        except Exception:
            # Not a bare `except`, so KeyboardInterrupt / SystemExit still
            # propagate. The species is skipped here, which means it is
            # also left out of the cleaned input.
            print(f'Warning: Cannot generate thermo for {label}.')
            continue
        if len(thermo_data) <= 1:  # Only GAV available
            clean.append(label)
        else:
            print(f'Warning: species {label} is cleaned out due to existing '
                  f'in thermo libraries')
    # A set gives constant-time membership tests when rebuilding `spc_info`
    kept_labels = set(clean)
    spc_info = {label: spc for label, spc in spc_info.items()
                if label in kept_labels}

## Step 4. Remove all duplicated entries
Duplicates are determined by their geometry.

In [None]:
# Fold the remaining species into `cleaned_info` one entry at a time.
# `combine_spc_info` receives the accumulated result, a single-species
# mapping, and `cleaned_spc_dict`, which is shared across all calls
# (presumably so duplicates can be recognised -- confirm in
# `combine_spc_info`).
cleaned_spc_dict = {}
cleaned_info = {}
for label, spc in spc_info.items():
    single_entry = {label: spc}
    cleaned_info = combine_spc_info(
        spc_info1=cleaned_info,
        spc_info2=single_entry,
        spc_dict=cleaned_spc_dict,
    )
    

## Step 5. Change the species label
If a label contains any of the characters `(`, `)` or `#`, it can cause problems in the ARC job, so each of these characters is replaced with `_`.

In [None]:
# Characters that are substituted with "_" in the species labels
replace_list = list("()#")

In [None]:
# Rewrite the 'label' field of every cleaned entry, replacing each symbol
# in `replace_list` with an underscore. Only the 'label' stored inside the
# entry changes; the keys of `cleaned_info` are left as they are.
for entry in cleaned_info.values():
    for symbol in replace_list:
        if symbol not in entry['label']:
            continue
        entry['label'] = entry['label'].replace(symbol, "_")

## Step 6. Save the cleaned results

In [None]:
# Fall back to 'input_cleaned.yml' in the current directory when no output
# path was provided.
output_path = output_path or os.path.join('.', 'input_cleaned.yml')
# The saved file contains only a 'species' list built from the cleaned entries
arc_input = {'species': list(cleaned_info.values())}
save_yaml_file(output_path, arc_input, overwrite=False)