# Core Imports

In [1]:
# Generic Imports
import re
from functools import partial, cached_property
from collections import defaultdict
from itertools import combinations, chain
from ast import literal_eval

# Numeric imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# File I/O
from pathlib import Path
import csv, json, openpyxl

# Typing and Subclassing
from typing import Any, Callable, ClassVar, Generator, Iterable, Optional, Union
from dataclasses import dataclass, field
from abc import ABC, abstractmethod, abstractproperty
from openmm.unit import Unit, Quantity

# Cheminformatics
from rdkit import Chem
from rdkit.Chem import rdChemReactions
from rdkit.Chem.Draw import IPythonConsole

# Size of molecule images drawn inline by RDKit's notebook renderer:
# DIM is one side length and ASPECT the ratio applied to obtain the other.
DIM    = 300
ASPECT = 3/2
IPythonConsole.molSize = (int(ASPECT*DIM), DIM)   # evaluates to (450, 300); changes the size of rendered mol images

# Static Paths (all relative to the directory the notebook is run from)
RAW_DATA_DIR  = Path('monomer_data_raw')
FMT_DATA_DIR  = Path('monomer_data_formatted')
PROC_DATA_DIR = Path('monomer_data_processed')
RXN_FILES_DIR = Path('poly_rxns')
# RXN_FILES_DIR = Path('rxn_smarts')  # NOTE(review): alternative reaction directory left disabled - confirm whether it is still needed

# Filtering expanded dataset

In [2]:
from polymerist.monomers import specification

# Locate the formatted dataset and read it into the working DataFrame
input_data_path = FMT_DATA_DIR.joinpath('20231114_polyid_data_density_DP2-6 - 1,2 monomers.csv')
df = pd.read_csv(filepath_or_buffer=input_data_path)

  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)
  setattr(self, word, getattr(machar, word).flat[0])
  return self._float_to_str(self.smallest_subnormal)




## Filter by number of monomer fragments in monomer SMILES

### Verify that the num_monomers field is accurate

In [3]:
def num_mols_in_smiles(smiles : str) -> int:
    '''Counts the disconnected molecular fragments described by a SMILES string
    Raises ValueError if RDKit is unable to parse the SMILES string'''
    mol = Chem.MolFromSmiles(smiles)
    if mol is None: # RDKit signals a parsing failure by returning None rather than raising
        raise ValueError(f'Could not parse SMILES string: {smiles!r}')

    return len(Chem.GetMolFrags(mol))

reported_num_monomers = df['num_monomers']
actual_num_monomers   = df['smiles_monomer'].map(num_mols_in_smiles) # only the SMILES column is needed, so no row-wise apply is required

# check that given monomer counts are correct
assert (reported_num_monomers == actual_num_monomers).all(), \
    'reported num_monomers disagrees with the number of fragments in smiles_monomer'

In [4]:
# retain only the entries reported as consisting of exactly two monomers
df = df[df['num_monomers'].eq(2)]
display(df)

Unnamed: 0,smiles_monomer,smiles_polymer_DP2,smiles_polymer_DP3,smiles_polymer_DP6,smiles_polymer_DP18,num_monomers,mechanism,Glass_Transition,Melt_Temp,Cp_solid_slope,...,log10_ElongBreak,YoungMod,Tensile_Strength,Density,log10_Permeability_CH4,log10_Permeability_CO2,log10_Permeability_N2,log10_Permeability_O2,log10_Permeability_H2,log10_Permeability_H2O
0,O=C(Cl)Cl.Oc1ccc(C(c2ccc(O)cc2)(C(F)(F)F)C(F)(...,O=C(Cl)Oc1ccc(C(c2ccc(O)cc2)(C(F)(F)F)C(F)(F)F...,O=C(Oc1ccc(C(c2ccc(O)cc2)(C(F)(F)F)C(F)(F)F)cc...,O=C(Cl)Oc1ccc(C(c2ccc(OC(=O)Oc3ccc(C(c4ccc(OC(...,O=C(Cl)Oc1ccc(C(c2ccc(OC(=O)Oc3ccc(C(c4ccc(OC(...,2,carbonate,176.00,,,...,,,,1.479,0.017759,1.380211,0.225687,0.838849,,
5,O=C(O)c1cccc(C(=O)O)c1.Nc1ccc(-c2ccc(Oc3ccc(Oc...,Nc1ccc(-c2cc(Oc3ccc(Oc4ccc(-c5ccc(NC(=O)c6cccc...,Nc1ccc(-c2ccc(Oc3ccc(Oc4cc(-c5ccc(NC(=O)c6cccc...,Nc1ccc(-c2cc(Oc3ccc(Oc4ccc(-c5ccc(NC(=O)c6cccc...,Nc1ccc(-c2cc(Oc3ccc(Oc4ccc(-c5ccc(NC(=O)c6cccc...,2,amide,243.00,,,...,1.316180,,,1.360,-0.397940,1.114277,-0.275724,0.514548,,
6,O=C(Cl)Cl.Cc1cc(C(C)(C)c2cc(C)c(O)c(C)c2)cc(C)c1O,Cc1cc(C(C)(C)c2cc(C)c(OC(=O)Cl)c(C)c2)cc(C)c1O,Cc1cc(C(C)(C)c2cc(C)c(OC(=O)Cl)c(C)c2)cc(C)c1O...,Cc1cc(C(C)(C)c2cc(C)c(OC(=O)Oc3c(C)cc(C(C)(C)c...,Cc1cc(C(C)(C)c2cc(C)c(OC(=O)Oc3c(C)cc(C(C)(C)c...,2,carbonate,193.00,,,...,,,,1.083,-0.096910,1.245019,0.037426,0.747412,,
7,Nc1ccc(C2(c3ccc(N)cc3)c3ccccc3-c3ccccc32)cc1.O...,Nc1ccc(C2(c3ccc(N4C(=O)c5ccc(C(c6ccc7c(c6)C(=O...,Nc1ccc(C2(c3ccc(N4C(=O)c5ccc(C(c6ccc7c(c6)C(=O...,Nc1ccc(C2(c3ccc(N4C(=O)c5ccc(C(c6ccc7c(c6)C(=O...,Nc1ccc(C2(c3ccc(N4C(=O)c5ccc(C(c6ccc7c(c6)C(=O...,2,imide,408.00,,,...,,,,1.318,,1.991226,0.518514,,,
9,O=C1OC(=O)c2cc(Oc3ccc4c(c3)C(=O)OC4=O)ccc21.CC...,CC(C)(C)c1cc(C(C)(C)C)c(Oc2ccc(N(c3ccc(N)cc3)c...,CC(C)(C)c1cc(C(C)(C)C)c(Oc2ccc(N(c3ccc(N)cc3)c...,CC(C)(C)c1cc(C(C)(C)C)c(Oc2ccc(N(c3ccc(N)cc3)c...,CC(C)(C)c1cc(C(C)(C)C)c(Oc2ccc(N(c3ccc(N)cc3)c...,2,imide,260.00,,,...,0.698970,2020.0,67.0,1.140,0.133539,1.812245,0.447158,1.311754,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
460,O=C(c1ccc(C(=O)O)c(C(=O)O)c1)c1ccc(C(=O)O)c(C(...,CC1(C)CC(C)(c2ccc(Oc3ccc(-c4ccc(N)cc4)cc3C(F)(...,CC1(C)CC(C)(c2ccc(Oc3ccc(-c4ccc(NC(=O)c5cc(C(=...,CC1(C)CC(C)(c2ccc(Oc3ccc(-c4ccc(NC(=O)c5cc(C(=...,CC1(C)CC(C)(c2ccc(Oc3ccc(-c4ccc(NC(=O)c5ccc(C(...,2,amide,252.00,,,...,0.954243,1291.5,,1.150,-0.055517,1.534026,0.120574,0.902003,,
461,NCCCCCCCN.O=C(O)CCCCCCCCCCSCCCCSCCCCCCCCCCC(=O)O,NCCCCCCCNC(=O)CCCCCCCCCCSCCCCSCCCCCCCCCCC(=O)O,O=C(O)CCCCCCCCCCSCCCCSCCCCCCCCCCC(=O)NCCCCCCCN...,NCCCCCCCNC(=O)CCCCCCCCCCSCCCCSCCCCCCCCCCC(=O)N...,NCCCCCCCNC(=O)CCCCCCCCCCSCCCCSCCCCCCCCCCC(=O)N...,2,amide,31.00,,,...,,,,1.040,,,,,,
462,O=C(Cl)Cl.Oc1ccc(C2(c3ccc(O)cc3)CCCCCC2)cc1,O=C(Cl)Oc1ccc(C2(c3ccc(O)cc3)CCCCCC2)cc1,O=C(Cl)Oc1ccc(C2(c3ccc(OC(=O)Cl)cc3)CCCCCC2)cc1,O=C(Cl)Oc1ccc(C2(c3ccc(OC(=O)Oc4ccc(C5(c6ccc(O...,O=C(Cl)Oc1ccc(C2(c3ccc(OC(=O)Oc4ccc(C5(c6ccc(O...,2,carbonate,155.00,,,...,,,,1.203,,,,-0.119186,,
463,OCCCCO.O=C(O)CCCCC(=O)O,O=C(O)CCCCC(=O)OCCCCO,O=C(O)CCCCC(=O)OCCCCOC(=O)CCCCC(=O)O,O=C(O)CCCCC(=O)OCCCCOC(=O)CCCCC(=O)OCCCCOC(=O)...,O=C(O)CCCCC(=O)OCCCCOC(=O)CCCCC(=O)OCCCCOC(=O)...,2,ester,-43.05,57.4,,...,,,16.2,1.060,,,,,,


## Filter by chemistry

### Insert column of expanded monomer SMILES for combined mols

In [5]:
col_labels = df.columns.to_list()
# place the expanded form of each combined-monomer SMILES directly after the first column
df.insert(
    loc=1,
    column='smiles_expanded',
    value=df['smiles_monomer'].apply(partial(specification.expanded_SMILES, assign_map_nums=False)),
)

### Remove entries which contain any undesired atoms

In [6]:
from rdkit.Chem import rdqueries
from polymerist.rdutils.smileslib import queries

# Substructure queries for atom types which should not appear in any retained monomer.
# Atomic-number primitives ([#14], [#16]) are used instead of element symbols because a
# bracketed uppercase symbol in SMARTS (e.g. [S]) only matches ALIPHATIC atoms and would
# therefore miss any atom of that element which is flagged as aromatic
atom_queries = {
    'silicon' : Chem.MolFromSmarts('[#14]'),
    'sulfur'  : Chem.MolFromSmarts('[#16]'),
    'metal'   : queries.SPECIAL_QUERY_MOLS['metal'],
    # 'halogen' : queries.SPECIAL_QUERY_MOLS['halogen'],
}

In [7]:
# For every row, parse the expanded SMILES (without sanitization) and test it against each query in atom_queries.
# The values of the per-row dict returned by matching_dict_from_substruct_dict are spread over separate columns
# by result_type='expand' -- presumably one boolean per query, in the key order of atom_queries (TODO confirm)
undes_atoms_present = df.apply(lambda row : 
    queries.matching_dict_from_substruct_dict(Chem.MolFromSmiles(row['smiles_expanded'], sanitize=False), atom_queries).values(),
    axis=1,
    result_type='expand'
)

# store the flags in the main table as one '<query name>_present' column per entry of atom_queries
df[[f'{atom_chem}_present' for atom_chem in atom_queries.keys()]] = undes_atoms_present

In [8]:
# discard every row which was flagged for at least one undesired atom type
df = df[~undes_atoms_present.any(axis='columns')]
df.shape[0]

291

## Filter by reactive functional groups, placing fragments in correct order

### Load pre-defined functional group SMARTS queries

In [9]:
# read the functional group SMARTS table from the shared reaction directory, so that this cell
# follows RXN_FILES_DIR like every other reaction file loaded in this notebook
with (RXN_FILES_DIR / 'fn_group_smarts.json').open('r') as file:
    fn_group_smarts = json.load(file)

# compile each SMARTS string into an RDKit query mol, keyed by functional group name
fn_groups = {
    group_name : Chem.MolFromSmarts(smarts)
        for group_name, smarts in fn_group_smarts.items()
}

# MolFromSmarts returns None (rather than raising) for SMARTS it cannot parse; fail here instead of at first use
assert all(query is not None for query in fn_groups.values()), \
    'one or more functional group SMARTS could not be parsed'

### Backmap reactions and their functional groups

In [10]:
from polymerist.rdutils.reactions import reactions

# map NREL dataset mechanism names to pre-made rxn template names
rxn_backmap = {
    'amide'     : 'polyamide',
    'carbonate' : 'polycarbonate_phosgene',
    'ester'     : 'polyester',
    'imide'     : 'polyimide',
    'urethane'  : 'polyurethane_isocyanate',
    'NIPU'      : 'polyurethane_nonisocyanate',
    'vinyl'     : 'polyvinyl_head_tail'
}

# verify that we've mapped all reactions, i.e. every mechanism in the table is a key of the backmap
assert set(df['mechanism'].unique()) <= set(rxn_backmap)

# write the backmap to disk alongside the reaction files
backmap_path = RXN_FILES_DIR.joinpath('rxn_backmap.json')
with backmap_path.open('w') as backmap_file:
    backmap_file.write(json.dumps(rxn_backmap, indent=4))

In [11]:
# build one AnnotatedReaction per template name referenced by the backmap, read from its .rxn file
rxns = dict(
    (rxnname, reactions.AnnotatedReaction.from_rxnfile(RXN_FILES_DIR.joinpath(f'{rxnname}.rxn')))
        for rxnname in rxn_backmap.values()
)

# load table of functional group for each reaction
with RXN_FILES_DIR.joinpath('rxn_groups.json').open('r') as file:
    rxn_groups = json.loads(file.read())

In [12]:
# apply the backmap to the listed mechanism names and place the result right after the existing mechanism column
df.insert(
    loc=list(df.columns).index('mechanism') + 1,
    column='rxn_name',
    value=df['mechanism'].map(rxn_backmap),
)

### Identify if fragments match a defined rxn, and if so, in what order they should appear

In [13]:
from polymerist.maths.combinatorics.sequences import bin_ids_forming_sequence

def get_ordered_monomer_fragments(row : pd.Series) -> list[Optional[str]]:
    '''Returns the expanded SMILES of each individual monomer fragment, in the order required by the row's reaction,
    or a list of NoneType (one per fragment) if no ordering which uses every fragment can be found

    Reads the "mechanism" and "smiles_expanded" entries of the row, along with the
    rxn_backmap, rxn_groups, and fn_groups tables defined elsewhere in the notebook'''
    targ_groups = rxn_groups[rxn_backmap[row.mechanism]] # functional groups listed for the reaction this mechanism maps to
    frag_smiles = row['smiles_expanded'].split('.') # easier in practice than generating fragments of combined mol, turning fragments to SMILES, and expanding those SMILES
    choice_bins = [ # labels of the functional group queries matched by each fragment, in fragment order
        queries.matching_labels_from_substruct_dict(Chem.MolFromSmiles(smiles, sanitize=False), fn_groups)
            for smiles in frag_smiles
    ]

    nfrags = len(frag_smiles) # avoids relying on prior data sanitization
    all_indices = set(range(nfrags))

    # presumably each "ids" holds, for every target group in turn, the index of a fragment supplying that group (TODO confirm)
    for ids in bin_ids_forming_sequence(sequence=targ_groups, choice_bins=choice_bins):
        if set(ids) == all_indices: # only accept an assignment which makes use of every fragment
            return [frag_smiles[i] for i in ids] # return fragments in the order defined by the sequence

    return [None] * nfrags # return all None if no covering order is found

### Insert individual monomers into table, remove rows with no monomers

In [14]:
# one column per ordered monomer fragment (labelled 0, 1, ...), all-None where no ordering was found
reacting_monomers = df.apply(get_ordered_monomer_fragments, axis='columns', result_type='expand')
for (i, col) in reacting_monomers.items():
    df.insert(loc=i+3, column=f'smiles_expanded_monomer_{i+1}', value=col)

# keep only rows for which every fragment was assigned a place in the reaction
df = df[reacting_monomers.notna().all(axis='columns')]
df.shape[0]

284

## Filtering by IUPAC name query to chemical databases

In [15]:
import cirpy
import pubchempy as pcp
import chemspipy as csp

def get_IUPAC_name(smiles : str) -> Optional[str]:
    '''Attempts to fetch the IUPAC name for the molecule described by a SMILES string by querying PubChem (via pubchempy)
    Returns the "iupac_name" of the last compound returned by the query,
    or NoneType if the query raises an error or returns no compounds'''
    try:
        pcquery = pcp.get_compounds(smiles, namespace='smiles')
        if not pcquery: # handle an empty result explicitly, rather than via an IndexError from popping an empty list
            print(f'No PubChem compounds found for SMILES {smiles!r}')
            return None

        return pcquery[-1].iupac_name
    except Exception as e: # deliberately broad: a single failed lookup is reported and skipped, not allowed to abort a long batch of queries
        print(type(e), e)
        return None

In [16]:
# look up a name for every ordered monomer SMILES; this takes a while due to the HTTP query
IUPAC_names = df.loc[:, [f'smiles_expanded_monomer_{k}' for k in (1, 2)]].map(get_IUPAC_name)
IUPAC_names

Unnamed: 0,smiles_expanded_monomer_1,smiles_expanded_monomer_2
0,"4-[1,1,1,3,3,3-hexafluoro-2-(4-hydroxyphenyl)p...",carbonyl dichloride
5,,"benzene-1,3-dicarboxylic acid"
6,"4-[2-(4-hydroxy-3,5-dimethylphenyl)propan-2-yl...",carbonyl dichloride
7,4-[9-(4-aminophenyl)fluoren-9-yl]aniline,"5-[2-(1,3-dioxo-2-benzofuran-5-yl)-1,1,1,3,3,3..."
9,"4-N-(4-aminophenyl)-4-N-[4-(2,4,6-tritert-buty...","5-[(1,3-dioxo-2-benzofuran-5-yl)oxy]-2-benzofu..."
...,...,...
458,"2,4,6-trimethylbenzene-1,3-diamine","5-(1,3-dioxo-2-benzofuran-5-carbonyl)-2-benzof..."
460,4-[4-[4-[6-[4-(4-aminophenyl)-2-(trifluorometh...,"4-(3,4-dicarboxybenzoyl)phthalic acid"
462,4-[1-(4-hydroxyphenyl)cycloheptyl]phenol,carbonyl dichloride
463,"butane-1,4-diol",hexanedioic acid


In [17]:
IUPAC_names.columns = (0, 1) # relabel by position so that the labels can double as insertion offsets
for (i, col) in IUPAC_names.items():
    df.insert(loc=5+i, column=f'IUPAC_monomer_{i+1}', value=col)

# keep only rows for which a name was obtained for both monomers
df = df[IUPAC_names.notna().all(axis='columns')]
df.shape[0]

236

## Save filtered DataFrame for next steps to avoid reprocessing

In [19]:
# name the output after the input file, with a "_FILTERED" tag placed before the extension
out_data_path = PROC_DATA_DIR / f'{input_data_path.stem}_FILTERED{input_data_path.suffix}'
out_data_path.parent.mkdir(parents=True, exist_ok=True) # to_csv does not create missing directories, so make sure the target exists on a fresh checkout
df.to_csv(out_data_path)