In [1]:
import os
from pathlib import Path
from typing import Any, Dict, List, Tuple, Union
#from tqdm.notebook import tqdm
from tqdm import tqdm
from ase import Atoms

In [2]:
# Set working directory to Molli top level, i.e. the parent of the working
# directory this cell was first executed in.
# The starting directory is remembered on first execution so that re-running
# this cell does not climb one more directory level each time.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = Path.cwd()
os.chdir(NOTEBOOK_START_DIR.parent)

import ase_utils as au
import gaussian_utils as GU
import metrics as ms
import molecule_validator as mv
import utils as ut

### Merge CREST output conformer XYZ files into a single file

In [4]:
# Directory with the CREST output; the helper is asked for the ".xyz" files in it.
input_dir = Path(
    "C:/tmp/gaussian/workflow/2_candidate_geometries/2_0_crest_output"
)
input_paths = ut.get_file_paths_in_dir(
    search_dir=input_dir,
    file_extension=".xyz",
)
input_paths


[WindowsPath('C:/tmp/gaussian/workflow/2_candidate_geometries/2_0_crest_output/ex0a_gfn2_crestconfs.xyz'),
 WindowsPath('C:/tmp/gaussian/workflow/2_candidate_geometries/2_0_crest_output/ex0b_gfn2_crestconfs.xyz'),
 WindowsPath('C:/tmp/gaussian/workflow/2_candidate_geometries/2_0_crest_output/ex0ff_crestconfs.xyz'),
 WindowsPath('C:/tmp/gaussian/workflow/2_candidate_geometries/2_0_crest_output/ex15_gfnff_crestconfs.xyz'),
 WindowsPath('C:/tmp/gaussian/workflow/2_candidate_geometries/2_0_crest_output/ex16_gfn2_crestconfs.xyz'),
 WindowsPath('C:/tmp/gaussian/workflow/2_candidate_geometries/2_0_crest_output/ex19_gfn2_crestconfs_from_ex16_c5.xyz'),
 WindowsPath('C:/tmp/gaussian/workflow/2_candidate_geometries/2_0_crest_output/ex20_gfn2_crestconfs.xyz'),
 WindowsPath('C:/tmp/gaussian/workflow/2_candidate_geometries/2_0_crest_output/ex21_gfnff_crestconfs_from_ex16_c5.xyz'),
 WindowsPath('C:/tmp/gaussian/workflow/2_candidate_geometries/2_0_crest_output/ex23_gfn2_crestconfs_from_ex16_c3.xyz')]

In [5]:
# Load the structures from the listed xyz files and report how many were read.
mols = au.create_ase_atoms_list_from_xyz_files(
    input_paths=input_paths,
)
len(mols)


390

In [6]:
# Location of the merged xyz file that collects all CREST conformers.
all_crest_confs_xyz = Path(
    "C:/tmp/gaussian/workflow/2_candidate_geometries/all_crest_confs.xyz"
)

In [5]:
# Write the loaded structures into the merged xyz file.
au.write_ase_atoms_to_xyz_file(
    atoms_list=mols,
    output_path=all_crest_confs_xyz,
)

'C:\\tmp\\gaussian\\workflow\\2_candidate_geometries\\all_crest_confs.xyz'

### Find duplicates and near-similars in CREST conformers

In [7]:
# Evaluate the position-RMSD metric on the merged conformer file with alignment
# enabled. The result is keyed by "less_than_max_value" (see the output below);
# max_value=0.5 is the cut-off passed to the helper.
input_path = all_crest_confs_xyz
metric_func = ms.rmsd_of_positions

near_similars = au.calculate_metric_xyz_file(
    xyz_path=input_path,
    metric_function=metric_func,
    align=True,
    max_value=0.5,
)
near_similars


{'less_than_max_value': [('target_source_value',
   231,
   286,
   0.03827408003119554),
  ('target_source_value', 318, 341, 0.049973742261938965),
  ('target_source_value', 318, 322, 0.07143240115440426),
  ('target_source_value', 214, 218, 0.08633062651585686),
  ('target_source_value', 322, 341, 0.09044290920417757),
  ('target_source_value', 120, 137, 0.10367811527826763),
  ('target_source_value', 168, 174, 0.11324264049581367),
  ('target_source_value', 289, 290, 0.15167862981016278),
  ('target_source_value', 93, 126, 0.1727109672197329),
  ('target_source_value', 78, 80, 0.3360270393837941),
  ('target_source_value', 234, 243, 0.3394852672929166),
  ('target_source_value', 91, 109, 0.34214550653389136),
  ('target_source_value', 192, 194, 0.3438940990853645),
  ('target_source_value', 139, 167, 0.3749680902670928),
  ('target_source_value', 106, 150, 0.3887555825146699),
  ('target_source_value', 46, 47, 0.4044855237553122),
  ('target_source_value', 102, 116, 0.40469545208935

In [27]:
# Show the first ten reported pairs by structure name instead of list index,
# with the metric value rounded to two decimals.
[
    (
        au.get_name_from_atoms(mols[target_idx]),
        au.get_name_from_atoms(mols[source_idx]),
        round(metric_value, 2),
    )
    for _label, target_idx, source_idx, metric_value
    in near_similars["less_than_max_value"][:10]
]

[('ex16_gfn2_crestconfs_5', 'ex19_gfn2_crestconfs_from_ex16_c5_53', 0.04),
 ('ex21_gfnff_crestconfs_from_ex16_c5_21',
  'ex21_gfnff_crestconfs_from_ex16_c5_44',
  0.05),
 ('ex21_gfnff_crestconfs_from_ex16_c5_21',
  'ex21_gfnff_crestconfs_from_ex16_c5_25',
  0.07),
 ('ex15_gfnff_crestconfs_138', 'ex15_gfnff_crestconfs_142', 0.09),
 ('ex21_gfnff_crestconfs_from_ex16_c5_25',
  'ex21_gfnff_crestconfs_from_ex16_c5_44',
  0.09),
 ('ex15_gfnff_crestconfs_44', 'ex15_gfnff_crestconfs_61', 0.1),
 ('ex15_gfnff_crestconfs_92', 'ex15_gfnff_crestconfs_98', 0.11),
 ('ex19_gfn2_crestconfs_from_ex16_c5_56',
  'ex19_gfn2_crestconfs_from_ex16_c5_57',
  0.15),
 ('ex15_gfnff_crestconfs_17', 'ex15_gfnff_crestconfs_50', 0.17),
 ('ex15_gfnff_crestconfs_2', 'ex15_gfnff_crestconfs_4', 0.34)]

In [17]:
# Output file for the near-similar pairs found among the CREST conformers.
duplicates_crest_path = Path(
    "C:/tmp/gaussian/workflow/2_candidate_geometries/2_1_duplicates_analysis/duplicates_crest_confs.txt"
)

In [18]:
# Persist the near-similar pairs via the JSON text-file helper.
ut.write_text_file_json(
    data=near_similars,
    file_name=duplicates_crest_path,
)

'C:\\tmp\\gaussian\\workflow\\2_candidate_geometries\\2_1_duplicates_analysis\\duplicates_crest_confs.txt'

#### Check duplicates: align pairwise and save to xyz

In [24]:
# Output xyz file for the pairwise-aligned near-similar structures.
aligned_duplicates_output_path = Path(
    "C:/tmp/gaussian/workflow/2_candidate_geometries/2_1_duplicates_analysis/"
    "aligned_pairwise_duplicates_crest_confs.xyz"
)

In [25]:
# For the first 40 reported pairs, align the source structure onto its target
# and store target and aligned source back to back, so each pair sits next to
# each other in the written xyz file.
aligned_duplicates = []

for _, target, source, val in near_similars["less_than_max_value"][:40]:
    aligned_mol2 = au.align_2_molecules_min_rmsd(
        target=mols[target],
        atoms_to_align=mols[source],
    )
    aligned_duplicates.extend((mols[target], aligned_mol2))

au.write_ase_atoms_to_xyz_file(
    atoms_list=aligned_duplicates,
    output_path=aligned_duplicates_output_path,
)


'C:\\tmp\\gaussian\\workflow\\2_candidate_geometries\\2_1_duplicates_analysis\\aligned_pairwise_duplicates_crest_confs.xyz'

In [26]:
# Total number of pairs stored under "less_than_max_value" (the cells above only
# looked at the first 10 and the first 40 of them).
len(near_similars["less_than_max_value"])

927

### Validate CREST conformers

In [6]:
# Reload the conformers from the merged xyz file under the name
# "all_crest_confs" and report how many structures were read.
crest_confs = au.create_ase_atoms_list_from_xyz_file(
    input_path=all_crest_confs_xyz,
    name="all_crest_confs",
)
len(crest_confs)

390

In [7]:
# Run the bond validation helper over all reloaded conformers.
crest_confs_validation_results = (
    mv.validate_bonds_of_many_mols_with_target_molecule(
        mols_to_validate=crest_confs,
    )
)

all_crest_confs_390: 100%|███████████████████| 390/390 [00:53<00:00,  7.28it/s]


In [None]:
# Number of entries stored under "invalid_mols" in the validation results.
len(crest_confs_validation_results["invalid_mols"])

In [9]:
# Output file for the structural validation results of the CREST conformers.
validated_crest_path = Path(
    "C:/tmp/gaussian/workflow/2_candidate_geometries/2_2_structural_validation/validation_results_crest_confs.txt"
)

In [None]:
# Persist the validation results via the JSON text-file helper.
ut.write_text_file_json(
    data=crest_confs_validation_results,
    file_name=validated_crest_path,
)