# Clean jobs from ARC calculations

This notebook cleans up the results of ARC calculations. It will:
1. migrate all calculations to `species_path` 
2. create arkane `species.py` file for thermo jobs (with UseBAC) and kinetics jobs (without BAC)

In [None]:
import os
import shutil
import sys

sys.path.insert(0, os.path.dirname(os.path.abspath('')))

from easy_rmg_model.species.info import (classify_jobs,
                                   find_latest_terminated_job,
                                   check_converge_and_geom_consist,
                                   generate_geom_info,
                                   find_rotors_from_xyz,
                                   filter_scans,
                                   check_scan_quality,
                                   generate_summary,)


from easy_rmg_model.template_writer.input import ArkaneSpecies, ArkaneThermo

import subprocess

%load_ext autoreload
%autoreload 2

In [None]:
def transfer_data(spc, save_dir='', output_file_name='output.out'):
    """
    Copy the output files referenced by ``spc`` into ``save_dir`` and point
    ``spc`` at the copies.

    Every 'composite', 'sp' and 'freq' entry present in ``spc`` is copied to
    ``<save_dir>/<job_type>/<output_file_name>``. Every rotor in
    ``spc['rotors_dict']`` gets a ``<save_dir>/scan_<i>_<j>_...`` folder (the
    indices come from the rotor's 'scan' entry); if the rotor has a non-empty
    'scan_path', that file is copied into the folder as well.

    Args:
        spc (dict): Species record. It is modified in place: the copied path
                    entries are replaced by the paths of the copies.
        save_dir (str): Destination folder; created if missing. An empty
                        string means the current working directory.
        output_file_name (str): File name given to every copied output file.

    Raises:
        FileNotFoundError: If a referenced output file does not exist
                           (raised by ``shutil.copy``).
    """
    if save_dir:
        # os.makedirs('') raises, so only create a named destination
        os.makedirs(save_dir, exist_ok=True)

    for job_type in ('composite', 'sp', 'freq'):
        if job_type not in spc:
            # This species has no job of this type
            continue
        save_at = os.path.join(save_dir, job_type)
        os.makedirs(save_at, exist_ok=True)
        target = os.path.join(save_at, output_file_name)
        shutil.copy(spc[job_type], target)
        spc[job_type] = target

    # 'rotors_dict' is optional; treat a missing (or empty) entry as "no rotors"
    rotors_dict = spc.get('rotors_dict') or {}
    for rotor in rotors_dict.values():
        job_type = 'scan_' + '_'.join(str(i) for i in rotor['scan'])
        save_at = os.path.join(save_dir, job_type)
        os.makedirs(save_at, exist_ok=True)
        if rotor['scan_path']:
            target = os.path.join(save_at, output_file_name)
            shutil.copy(rotor['scan_path'], target)
            rotor['scan_path'] = target


def change_to_relative_path(spc, curdir='.'):
    """
    Change the working directory to ``curdir`` and rewrite the paths stored in
    ``spc`` so that they are relative to it.

    The 'composite', 'sp' and 'freq' entries present in ``spc`` and every
    non-empty rotor 'scan_path' in ``spc['rotors_dict']`` are converted.
    Paths that are already relative are interpreted against the working
    directory in effect when the function is called, so the function can be
    applied repeatedly with different ``curdir`` values.

    Note:
        The process working directory is changed to ``curdir`` and is NOT
        restored. The notebook cells calling this function rely on that when
        they call ``os.path.relpath`` afterwards.

    Args:
        spc (dict): Species record, modified in place.
        curdir (str): Directory the stored paths should become relative to.

    Raises:
        FileNotFoundError: If ``curdir`` does not exist (raised by ``os.chdir``).
    """
    # Remember where already-relative paths in `spc` are anchored before moving
    origin = os.getcwd()
    os.chdir(curdir)

    def _relative(path):
        # os.path.join keeps absolute paths as they are and anchors relative
        # ones at the previous working directory
        return os.path.relpath(os.path.join(origin, path))

    for job_type in ('composite', 'sp', 'freq'):
        if job_type in spc:
            spc[job_type] = _relative(spc[job_type])

    # 'rotors_dict' is optional; treat a missing (or empty) entry as "no rotors"
    for rotor in (spc.get('rotors_dict') or {}).values():
        if rotor['scan_path']:
            rotor['scan_path'] = _relative(rotor['scan_path'])

## 0. Settings

- all species jobs will be moved to `species_path`
- arkane without BAC will be saved to `arkane_paths[0]`
- arkane with BAC will be saved to `arkane_paths[1]`
- jobs that do not satisfy the criteria will be saved to `bad_path`

In [None]:
# NOTE(review): OUTPUT_FILE_NAME, LEVEL, SCAN_FILTER and FREQ_SCALE_FACTOR are
# not referenced by any other cell visible in this notebook; they may be
# leftovers or intended for manual use.
OUTPUT_FILE_NAME = 'output.out'  # output file name of (gaussian) jobs

# Presumably the level of theory expected for each job type -- TODO confirm
LEVEL = {'composite': 'cbs-qb3',
         'freq': 'b3lyp/cbsb7',
         'scan': 'b3lyp/cbsb7'}

SCAN_FILTER = 'non-frozen'
# This filter makes sure that the scan with the fewest constraints is used,
# but it gives no guarantee on the shape of the curve (less important).
# NOTE(review): the workflow cells call filter_scans(spc, scan_filter='')
# rather than passing SCAN_FILTER -- confirm which one is intended.
FREQ_SCALE_FACTOR = None

# Extra entries merged into the settings handed to ArkaneSpecies / ArkaneThermo
ARKANE_SPEC = {
    'model_chemistry': 'cbs-qb3',
}

# Destination of the migrated species jobs
species_path = '/Volumes/Extreme SSD/relax-rotor/Species'
# Arkane folders: index 0 is used without BAC, index 1 with BAC
# (the workflow passes use_bond_corrections=bool(index))
arkane_paths = ['/Volumes/Extreme SSD/relax-rotor/Arkane_Species_wo_bac',
                '/Volumes/Extreme SSD/relax-rotor/Arkane_Species',
               ]
# Folder for jobs that do not satisfy the criteria
bad_path = '/Volumes/Extreme SSD/relax-rotor/NeedImprove/'

## 1.Migrate jobs and create arkane input/species files

`work_path` is usually the path to `ARC_PROJECT/calcs/Species/`. It also works if your jobs are organized by 
```
- many species folders
  |
  |- job folders(named by optxxx, compositexxx, freqxxx, scanxxx)
      |
      |- output files(e.g., output.out)
```


In [None]:
# Folder whose sub-folders (one per species) are scanned by the integrated workflow
work_path = '/Volumes/Extreme SSD/Calcs/todo/Species'

### 1.1 integrated workflow

An iterative and interactive process to add calculation results to a database.

In [None]:
def _ask_yes_no(question):
    """Ask ``question`` until a yes/no answer is given; return True for yes."""
    answer = None
    while answer not in ('y', 'n', 'yes', 'no'):
        # Accept any capitalization and surrounding whitespace (e.g. 'Yes', ' N ')
        answer = input(question).strip().lower()
    return answer in ('y', 'yes')


# Species migrated by this cell; the cells of sections 2 and 3 iterate over it
smiles_list = []
calculated_spc = [smi for smi in os.listdir(species_path) if os.path.isdir(os.path.join(species_path, smi))]
for dir_name in os.listdir(work_path):

    # Step 1. Read the path
    job_path = os.path.join(work_path, dir_name)
    if not os.path.isdir(job_path):
        continue
    print('\n', job_path)

    # Step 2. Initiate a `spc` dict to store species information
    spc = {'label': 'species',
           'directory': job_path,
           'ts': False}

    # Step 3. Go through each step to check the calculation quality
    try:
        classify_jobs(spc)
        find_latest_terminated_job(spc)
        check_converge_and_geom_consist(spc)
        generate_geom_info(spc)
        find_rotors_from_xyz(spc)
        filter_scans(spc, scan_filter='')
        check_scan_quality(spc)
    except (KeyError, FileNotFoundError) as e:
        # The job folder is left untouched so that it can be inspected manually
        print(dir_name)
        print(e)
        continue

    print(generate_summary(spc))
    if spc['smiles'] in calculated_spc:
        print(f"{spc['smiles']} is in the folder, skip")
    else:
        print(f"{spc['smiles']} is new")

    # Step 4. Let the user decide what happens to the job
    if not _ask_yes_no('Whether to keep this job (Y/N)?: '):
        # Rejected jobs are deleted permanently
        shutil.rmtree(job_path)
        continue
    if not _ask_yes_no('Whether to migrate this job (Y/N)?: '):
        # Kept but not migrated: the job stays in `work_path`
        continue

    # Step 5. Migrate good jobs to `species_path`
    try:
        transfer_data(spc, save_dir=os.path.join(species_path,
                                                 spc['smiles']))
        with open(os.path.join(species_path, spc['smiles'], 'summary.txt'), 'w') as f:
            f.write(spc['summary'])
    except FileNotFoundError as e:
        # Report the failure instead of skipping silently; the job stays in `work_path`
        print(f'Failed to migrate {job_path}: {e}')
        continue

    # Step 6. Create files for arkane jobs
    for ind, arkane_path in enumerate(arkane_paths):
        save_dir = os.path.join(arkane_path, spc['smiles'])
        os.makedirs(save_dir, exist_ok=True)
        # Also changes the working directory to `save_dir`, which the
        # os.path.relpath call below relies on
        change_to_relative_path(spc, save_dir)

        arkane_species_path = os.path.join(save_dir, 'species.py')
        arkane_thermo_input_path = os.path.join(save_dir, 'input.py')
        settings = {'use_bond_corrections': bool(ind),
                    'save_path': arkane_species_path}
        ArkaneSpecies({**spc, **ARKANE_SPEC, **settings}).save()
        settings = {
            'use_bond_corrections': bool(ind),
            'use_hindered_rotors': True,
            'species_file': os.path.relpath(arkane_species_path),
            'species_smiles': spc['smiles'],
            'save_path': arkane_thermo_input_path,
        }
        ArkaneThermo({**spc, **ARKANE_SPEC, **settings}).save()

    smiles_list.append(spc['smiles'])
    # The outputs now live in `species_path`; remove the original job folder
    shutil.rmtree(job_path)



Species in `smiles_list` will be further processed. You can manually remove some species if needed.

In [None]:
smiles_list

Option 1: Assign a new `smiles_list`

In [None]:
# smiles_list = [
# '[CH2]C(=C)C(C)=O'
# ]

Option 2: Remove misjudged species from `smiles_list`

In [None]:
# for i in ["[CH2]C_=C_C_OO_OC_C_=O_54177_",]:
#     try:
#         smiles_list.remove(i)
#         for j in [species_path] + arkane_paths:
#             shutil.rmtree(os.path.join(j, i))
#     except ValueError:
#         pass

### 1.2 Stepwise workflow
This is useful for getting familiar with the workflow, in case you do not trust the integrated workflow. It can also be used to tackle calculations that need special care.

#### Step 1. Initiate a `spc` dict to store species information
- `label` can be anything. It won't influence the result
- `directory` where your jobs are located
- `ts` whether it is a TS

In [None]:
# Species record handed to the quality-check functions in Step 2
spc = {'label': 'A',  # arbitrary label
       'directory': '/Volumes/Extreme SSD/Calcs/todo/Species/S_2708_',  # folder holding the jobs of this species
       'ts': False}  # whether the species is a TS

#### Step 2. Go through each step to check the calculation quality

In [None]:
# Species that already have a folder in `species_path`
calculated_spc = [smi for smi in os.listdir(species_path) if os.path.isdir(os.path.join(species_path, smi))]

# Each function receives the same `spc` dict; the order of the calls is kept
# exactly as in the integrated workflow
classify_jobs(spc)
find_latest_terminated_job(spc)
check_converge_and_geom_consist(spc)
generate_geom_info(spc)
find_rotors_from_xyz(spc)
filter_scans(spc, scan_filter='')
check_scan_quality(spc)

if spc['smiles'] in calculated_spc:
    print(f"{spc['smiles']} is in the folder, skip")
else:
    print(f"{spc['smiles']} is new")

# Always generate the summary, as the integrated workflow does: Step 4 writes
# spc['summary'] to disk regardless of whether the species is new.
# NOTE(review): presumably generate_summary is what stores spc['summary'] -- confirm.
print(generate_summary(spc))

In some cases, the generated `rotors_dict` is not desirable. You may have to modify it manually.

In [None]:
# # Check the entry of a `rotors_dict` (a list)
# i = 1
# spc['rotors_dict'][i]

In [None]:
# spc['rotors_dict'][i]['scan'] = []  # INPUT!! A index-1 list
# spc['rotors_dict'][i]['top'] = []  # INPUT!!
# spc['rotors_dict'][i]['success'] = True  # INPUT!!
# spc['rotors_dict'][i]['symmetry'] = 1  # INPUT!!


# spc['rotors_dict'][i]['pivots'] = spc['rotors_dict'][i]['scan'][1:3]
# spc['rotors_dict'][i]['torsion'] = [t-1 for t in spc['rotors_dict'][i]['scan']]
# generate_summary(spc)

####  Step 3. If any step does not meet the criteria, then move it to `bad_path`

In [None]:
# transfer_data(spc, save_dir=os.path.join(bad_path,
#                                          spc['smiles']))
# with open(os.path.join(bad_path, spc['smiles'], 'summary.txt'), 'w') as f:
#     f.write(spc['summary'])

#### Step 4. Migrate good jobs to `species_path`

In [None]:
# Copy the job outputs into the folder of this species, then store the
# summary text next to them
species_dir = os.path.join(species_path, spc['smiles'])
transfer_data(spc, save_dir=species_dir)

summary_file = os.path.join(species_dir, 'summary.txt')
with open(summary_file, 'w') as f:
    f.write(spc['summary'])

#### Step 5. Create files for arkane jobs

In [None]:
# One pass per Arkane folder: index 0 is written without bond corrections,
# index 1 with them
for ind, arkane_path in enumerate(arkane_paths):
    use_bac = bool(ind)
    save_dir = os.path.join(arkane_path, spc['smiles'])
    os.makedirs(save_dir, exist_ok=True)
    # Rewrites the paths in `spc` relative to `save_dir` and changes the
    # working directory to it
    change_to_relative_path(spc, save_dir)

    arkane_species_path = os.path.join(save_dir, 'species.py')
    arkane_thermo_input_path = os.path.join(save_dir, 'input.py')

    # Species file
    species_settings = dict(spc)
    species_settings.update(ARKANE_SPEC)
    species_settings.update({
        'use_bond_corrections': use_bac,
        'save_path': arkane_species_path,
    })
    ArkaneSpecies(species_settings).save()

    # Thermo input file; the species file is referenced relative to the
    # working directory set above
    thermo_settings = dict(spc)
    thermo_settings.update(ARKANE_SPEC)
    thermo_settings.update({
        'use_bond_corrections': use_bac,
        'use_hindered_rotors': True,
        'species_file': os.path.relpath(arkane_species_path),
        'species_smiles': spc['smiles'],
        'save_path': arkane_thermo_input_path,
    })
    ArkaneThermo(thermo_settings).save()

## 2. Check the Arkane files

In [None]:
import os
import shutil
from rdkit import Chem
from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers, StereoEnumerationOptions, GetStereoisomerCount

In [None]:
# Same values as in the settings cell of section 0:
# index 0 is the folder without BAC, index 1 the folder with BAC
arkane_paths = ['/Volumes/Extreme SSD/relax-rotor/Arkane_Species_wo_bac',
                '/Volumes/Extreme SSD/relax-rotor/Arkane_Species',
               ]

### 2.1 Correct the stereoIsomers number
- ARC may underestimate the number if multiple stereocenters exist
- ARC may overestimate the number if the molecule has no chiral center but the conformer has a distinguishable mirror image

In [None]:
for subpath in smiles_list:
    mol = Chem.MolFromSmiles(subpath)
    if mol is None:
        # The folder name cannot be read as SMILES; nothing to compare against
        continue

    # Find chiral centers (chiral carbons, does not include Z/E for C=C)
    chiral = Chem.FindMolChiralCenters(mol, force=True, includeUnassigned=True)

    if chiral:
        # Try to get number of stereoisomers (Currently include Z/E for C=C)
        try:
            opts = StereoEnumerationOptions(tryEmbedding=True, unique=True)
            # The enumeration is lazy, so it has to be consumed inside the
            # `try` for failures to be caught here
            num_chiral = len(list(EnumerateStereoisomers(mol, opts)))
        except Exception as e:
            ## Todo: embed may not work for TSs
            print(f'Warning: {subpath} needs manual check')
            print(e)
            continue
        if num_chiral < 1:
            # No isomer could be generated; do not write 0 into the species file
            print(f'Warning: {subpath} needs manual check')
            continue
    else:
        num_chiral = 1

    # Open the Arkane species file
    species_file = os.path.join(arkane_paths[1], subpath, 'species.py')
    backup_file = os.path.join(arkane_paths[1], subpath, 'speceis.py.backup')
    try:
        with open(species_file) as f:
            lines = f.readlines()
    except FileNotFoundError:
        print(f'Cannot find: {subpath}')
        continue

    # Write a corrected copy, replacing opticalIsomers wherever it is not
    # consistent with the number from RDKit
    changed = False
    with open(backup_file, 'w') as f:
        for line in lines:
            if 'opticalIsomers' in line:
                # Expects the form `opticalIsomers = N`
                rmg_num = int(line.strip().split()[2])
                if rmg_num != num_chiral:
                    print(subpath)
                    print('chiral center:', chiral)
                    print('rdkit:',num_chiral,'  species_file:',rmg_num)

                    line = ' '.join(line.strip().split()[0:2])
                    line += f' {num_chiral}\n'
                    changed = True
            f.write(line)

    # Correct the opticalIsomers number if necessary
    if changed:
        shutil.copy(backup_file, species_file)

### 2.2 Correct the symmetry number of rotors
- ARC cannot predict the symmetry reliably if the curves look bad (the predicted number can be anything from 0 to 6). Simply assign 1 to those.

#### 2.2.1 Naive way
Set every `symmetry` that is ≤ 0 or > 3 to 1.

In [None]:
for subpath in smiles_list:
    species_file = os.path.join(arkane_paths[1], subpath, 'species.py')
    backup_file = os.path.join(arkane_paths[1], subpath, 'speceis.py.backup')
    try:
        with open(species_file) as f:
            lines = f.readlines()
    except (FileNotFoundError, NotADirectoryError):
        print(subpath)
        continue

    # Write a corrected copy, then put it in place of the species file
    with open(backup_file, 'w') as f:
        for line in lines:
            if 'symmetry=' in line:
                # Split the line as <head>symmetry=<value>,<rest>
                head, _, tail = line.partition('symmetry=')
                value_text, comma, rest = tail.partition(',')
                rmg_num = int(value_text)
                if rmg_num > 3 or rmg_num <= 0:
                    # Replace only the symmetry value; a plain str.replace on
                    # the whole line would also rewrite any other occurrence
                    # of the same digits
                    new_value = value_text.replace(value_text.strip(), '1', 1)
                    line = f'{head}symmetry={new_value}{comma}{rest}'
                    print(subpath)
                    print(rmg_num)
            f.write(line)
    shutil.copy(backup_file, species_file)

#### 2.2.2 Check the curves

In [None]:
from arc.parser import parse_1d_scan_energies
from arc.species.species import cyclic_index_i_plus_1, cyclic_index_i_minus_1, determine_rotor_symmetry
from arc.plotter import plot_1d_rotor_scan

def determine_rotor_sym(rotor_path):
    """
    Determine the symmetry number of a rotor from its 1D scan output.

    The scan energies are treated as cyclic and their local peaks and valleys
    are counted. If the two counts differ, 1 is returned; otherwise the result
    of ARC's ``determine_rotor_symmetry`` for these energies is returned.

    Args:
        rotor_path (str): Path to the scan output file.

    Returns:
        The symmetry number: 1, or the first element of what
        ``determine_rotor_symmetry`` returns.

    Raises:
        ValueError: If no energies could be parsed from ``rotor_path``.
    """
    energies = parse_1d_scan_energies(path=rotor_path)[0]
    if energies is None or len(energies) == 0:
        raise ValueError(f'No scan energies could be parsed from {rotor_path}')

    num_points = len(energies)
    num_peaks, num_valleys = 0, 0
    for i, e in enumerate(energies):
        ip1 = cyclic_index_i_plus_1(i, num_points)  # i Plus 1
        im1 = cyclic_index_i_minus_1(i)             # i Minus 1
        if i == 0 and energies[im1] == e:
            # If the first and last scan points have same energy, change im1
            im1 -= 1
        if e > energies[im1] and e > energies[ip1]:
            # this is a local peak
            num_peaks += 1
        elif e < energies[im1] and e < energies[ip1]:
            # this is a local valley
            num_valleys += 1

    if num_peaks != num_valleys:
        return 1
    return determine_rotor_symmetry('species', [0, 0], energies=energies, log=False)[0]

In [None]:
# for subpath in os.listdir(arkane_paths[1]):
for subpath in smiles_list:
    species_dir = os.path.join(arkane_paths[1], subpath)
    species_file = os.path.join(species_dir, 'species.py')
    backup_file = os.path.join(species_dir, 'speceis.py.backup')
    try:
        with open(species_file) as f:
            lines = f.readlines()
    except (FileNotFoundError, NotADirectoryError):
        print(subpath)
        continue

    # Relative scanLog paths are resolved against the current directory
    os.chdir(species_dir)

    # Start every file without a pending scan, so that a value left over from
    # a previous file is never applied
    rotor_path, sym = None, None
    with open(backup_file, 'w') as f:
        for line in lines:
            if 'scanLog' in line:
                rotor_path = os.path.realpath(line.strip().split("'")[1])
                sym = determine_rotor_sym(rotor_path)
            if 'symmetry=' in line and sym is not None:
                # Split the line as <head>symmetry=<value>,<rest>
                head, _, tail = line.partition('symmetry=')
                value_text, comma, rest = tail.partition(',')
                rmg_num = int(value_text)
                if rmg_num != sym:
                    print(subpath)
                    print(rotor_path)
                    print(f'Original: {rmg_num}, New: {sym}')
                    if sym != 1:
                        # Let the user decide after looking at the curve
                        energies, angles = parse_1d_scan_energies(rotor_path)
                        plot_1d_rotor_scan(angles, energies)
                        answer = None
                        while answer is None:
                            try:
                                answer = int(input('What is the symmetry?'))
                            except ValueError:
                                print('Please enter a positive integer')
                                continue
                            if answer < 1:
                                print('Please enter a positive integer')
                                answer = None
                        sym = answer
                    # Replace only the symmetry value; a plain str.replace on
                    # the whole line would also rewrite any other occurrence
                    # of the same digits
                    new_value = value_text.replace(value_text.strip(), str(sym), 1)
                    line = f'{head}symmetry={new_value}{comma}{rest}'
                # This scan has been consumed; wait for the next scanLog
                sym = None

            f.write(line)
    shutil.copy(backup_file, species_file)

### 2.3 Update the arkane files without BAC
Previous changes have not been applied to Arkane files without BAC

In [None]:
# for subpath in os.listdir(arkane_paths[1]):
for subpath in smiles_list:
    bac_dir = os.path.join(arkane_paths[1], subpath)
    no_bac_dir = os.path.join(arkane_paths[0], subpath)

    species_file = os.path.join(bac_dir, 'species.py')
    if not os.path.isfile(species_file):
        print(f'Path {species_file} does not exist')
        continue
    os.makedirs(no_bac_dir, exist_ok=True)

    # The input file is optional; the species file is always transferred
    file_names = ['species.py']
    if os.path.isfile(os.path.join(bac_dir, 'input.py')):
        file_names.append('input.py')

    for file_name in file_names:
        with open(os.path.join(bac_dir, file_name)) as f:
            lines = f.readlines()

        # Switch bond corrections off in every transferred file. The flag is
        # flipped in input.py as well, because a verbatim copy would carry
        # `True` into the folder without BAC.
        # NOTE(review): assumes the flag appears as a line containing
        # 'useBondCorrections'; files without such a line are copied unchanged.
        target = os.path.join(no_bac_dir, file_name)
        backup = target + '.backup'
        with open(backup, 'w') as f:
            for line in lines:
                if 'useBondCorrections' in line:
                    line = line.replace('True', 'False')
                f.write(line)

        shutil.copy(backup, target)


## 3. Rerun Arkane jobs

In [None]:
# Species whose Arkane jobs are rerun by the next cell.
# Alternative: every entry found in the folder with BAC
# Target_species = list(os.listdir(arkane_paths[1]))
Target_species = smiles_list

In [None]:
# The command below relies on the shell expanding $ARKANE. If it is unset the
# shell would silently run `python input.py` instead, so fail early.
if not os.environ.get('ARKANE'):
    raise RuntimeError('The ARKANE environment variable is not set')

for subpath in Target_species:
    job_dir = os.path.join(arkane_paths[1], subpath)
    if os.path.isfile(os.path.join(job_dir, 'input.py')):
        result = subprocess.run('python $ARKANE input.py', cwd=job_dir, shell=True)
        if result.returncode != 0:
            # Keep going with the other species, but make the failure visible
            print(f'Arkane failed for {subpath} (exit code {result.returncode})')

## 4. Remove the finished species from NeedImprove

In [None]:
# Entries of `bad_path` that also exist in `species_path` are finished and
# can be removed from `bad_path`
finished = set(os.listdir(species_path))
for item in os.listdir(bad_path):
    if item not in finished:
        continue
    try:
        print('deleting', item)
        shutil.rmtree(os.path.join(bad_path, item))
    except NotADirectoryError:
        # Not a folder (a plain file present in both places); just report it
        print(item)