# Core Imports

In [3]:
# Generic Imports
import re
from functools import partial, cached_property
from collections import defaultdict
from itertools import combinations, chain
from ast import literal_eval

# Numeric imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# File I/O
from pathlib import Path
import csv, json, openpyxl

# Typing and Subclassing
from typing import Any, Callable, ClassVar, Generator, Iterable, Optional, Union
from dataclasses import dataclass, field
from abc import ABC, abstractmethod, abstractproperty
from openmm.unit import Unit, Quantity

# Cheminformatics
from rdkit import Chem
from rdkit.Chem import rdChemReactions
from rdkit.Chem.Draw import IPythonConsole

DIM    = 300   # second component of the molecule image size set below
ASPECT = 3/2   # ratio of the first image-size component to the second
IPythonConsole.molSize = (int(ASPECT*DIM), DIM)   # Change image size; presumably (width, height) in pixels - confirm against RDKit docs

# Static Paths (relative paths, so they resolve against the notebook's working directory)
RAW_DATA_DIR  = Path('monomer_data_raw')
FMT_DATA_DIR  = Path('monomer_data_formatted') # the input dataset is read from here
PROC_DATA_DIR = Path('monomer_data_processed') # the filtered dataset is written here
RXN_FILES_DIR = Path('poly_rxns')              # reaction templates and their JSON lookup tables
# RXN_FILES_DIR = Path('rxn_smarts')

# Filtering expanded dataset

In [4]:
from polymerist.monomers import specification

# Choose the formatted dataset to filter; the commented-out assignments are alternative inputs
# input_data_path = FMT_DATA_DIR / 'nipu_urethanes_FMT.csv'
input_data_path = FMT_DATA_DIR / '221010_trainingdata_DP-18_expanded_FMT.csv'
# input_data_path = FMT_DATA_DIR / '20231114_polyid_data_density_DP2-6 - 1,2 monomers.csv'
df = pd.read_csv(input_data_path) # unfiltered table; the cells below repeatedly filter it and rebind `df`
display(df)

Unnamed: 0,hash-monomers-distribution,monomers,distribution,mechanism,Glass_Transition,Melt_Temp,Cp_solid_slope,Cp_solid_intercept,Cp_liquid_slope,Cp_liquid_intercept,...,log10_Permeability_CH4,log10_Permeability_CO2,log10_Permeability_N2,log10_Permeability_O2,log10_Permeability_H2,log10_Permeability_H2O,smiles_polymer,hash-smiles_polymer,smiles_monomer,replicate_structure
0,28NNnPqUqeUNrtmudNTwYz,"('CC(C)(C)c1cc(C(=O)O)cc(C(=O)O)c1', 'Nc1ccc(-...",,amide,275.00,,,,,,...,0.359835,1.743588,0.376577,1.148911,,,CC(C)(C)c1cc(C(=O)O)cc(C(=O)Nc2ccc(-c3ccc(Oc4c...,Zzhk9gU8ApTH4xtAfQi5rW,CC(C)(C)c1cc(C(=O)O)cc(C(=O)O)c1.Nc1ccc(-c2ccc...,0
1,28oL8vgvwP9acaZDehEMGU,"('O=C(O)c1cccc(C(=O)O)c1', 'Nc1ccc(Oc2ccc(Oc3c...",,amide,210.00,340.0,,,,,...,,,,,,,Nc1ccc(Oc2ccc(Oc3ccc(NC(=O)c4cccc(C(=O)Nc5ccc(...,MmKrS7QZesi5QHFkPtKreE,O=C(O)c1cccc(C(=O)O)c1.Nc1ccc(Oc2ccc(Oc3ccc(N)...,0
2,2DuicAyoesWbNJNz3MgSWn,"('NCCCCCCCCCN', 'O=C(O)CCCCCCCC(=O)O')",,amide,,177.0,,,,,...,,,,,,,NCCCCCCCCCNC(=O)CCCCCCCC(=O)NCCCCCCCCCNC(=O)CC...,DRQdStFNSgFv3wedj6sZz7,NCCCCCCCCCN.O=C(O)CCCCCCCC(=O)O,0
3,2EvjUqRRk9goyUTA69A7P7,"('O=C(O)c1cc(C(=O)O)cc([N+](=O)[O-])c1', 'Nc1c...",,amide,174.00,,,,,,...,,,,,,,Nc1ccc(O[Si](Oc2ccc(NC(=O)c3cc(C(=O)Nc4ccc(O[S...,X5mKgKyx8EJHc7TzcxMexM,O=C(O)c1cc(C(=O)O)cc([N+](=O)[O-])c1.Nc1ccc(O[...,0
4,2HpYXrCFgaJVszCPDtpWsv,"('Nc1ccc(N)cc1', 'O=C(O)c1cccc(N2C(=O)c3ccc(C(...",,amide,315.00,,,,,,...,,,,,,,Nc1ccc(NC(=O)c2cccc(N3C(=O)c4ccc(C(c5ccc6c(c5)...,KUoF3Vr4pohJzC9MzUW59q,Nc1ccc(N)cc1.O=C(O)c1cccc(N2C(=O)c3ccc(C(c4ccc...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2613,o3GGTNZkgNEuQSKNWYeYFs,"('C=CC',)",,vinyl,-9.15,,0.25,-0.85,0.15,42.95,...,,1.231736,0.079965,0.755046,,2.053483,CC(CCC(C)CC(C)CC(C)C(C)CCC(C)C(C)CC(C)CCC(C)C(...,6232Wkx4QGKYgD6kxuodTj,C=CC,0
2614,o4LroNwi3uKoY2oXtjDvau,"('C=CC(=O)OCCCCCCCCCCCCCC',)",,vinyl,,28.0,,,,,...,1.813581,2.472756,1.344392,1.756636,1.983175,,CCCCCCCCCCCCCCOC(=O)CCCC(C(=O)OCCCCCCCCCCCCCC)...,jeytmhnbvwNCi9TFJzb3Sh,C=CC(=O)OCCCCCCCCCCCCCC,0
2615,o8CKTqDchhSvCTx6NLc6ih,"('C=C(C#N)C(=O)OCCCCCCCC',)",,vinyl,48.85,,,,,,...,,,,,,,CCCCCCCCOC(=O)C(C#N)CC(C#N)(CCC(C#N)(CC(C#N)(C...,HMUpFTATpJVVYZG2zsywgy,C=C(C#N)C(=O)OCCCCCCCC,0
2616,oNVVbZgL7r4BMQkoTzruM3,"('C=Cc1ccc(C)cc1C',)",,vinyl,112.00,,,,,,...,,,,,,,Cc1ccc(CCCC(c2ccc(C)cc2C)C(CC(CCC(c2ccc(C)cc2C...,T4Fmiy6gWwfj7bntRHUzGX,C=Cc1ccc(C)cc1C,0


## Filter by number of monomer fragments in monomer SMILES

### Verify that the num_monomers field is accurate

In [5]:
def num_mols_in_smiles(smiles : str) -> int:
    '''Count the disconnected molecular fragments described by a SMILES string

    Parameters
    ----------
    smiles : str
        SMILES string to parse with RDKit

    Returns
    -------
    int
        Number of fragments RDKit reports for the parsed molecule

    Raises
    ------
    ValueError
        If RDKit is unable to parse the SMILES string
    '''
    mol = Chem.MolFromSmiles(smiles)
    if mol is None: # RDKit reports a parse failure by returning None instead of raising
        raise ValueError(f'Could not parse SMILES string: {smiles!r}')

    return len(Chem.GetMolFrags(mol))

# Recount the fragments in every row's combined monomer SMILES
actual_num_monomers = df['smiles_monomer'].map(num_mols_in_smiles)

# A count column is not guaranteed to exist in the table; cross-check it whenever it does
if 'num_monomers' in df.columns:
    reported_num_monomers = df['num_monomers']
    assert (reported_num_monomers == actual_num_monomers).all() # each reported count must equal the recomputed one

In [6]:
# Spot check: recomputed fragment count of the row at positional index 71
actual_num_monomers.iloc[71]

2

In [7]:
# Keep only the rows whose monomer SMILES contains exactly two fragments.
# The explicit .copy() gives `df` its own data, so that assigning new columns to it
# further down the notebook does not trigger pandas' SettingWithCopyWarning
df = df[actual_num_monomers == 2].copy()
display(df)

Unnamed: 0,hash-monomers-distribution,monomers,distribution,mechanism,Glass_Transition,Melt_Temp,Cp_solid_slope,Cp_solid_intercept,Cp_liquid_slope,Cp_liquid_intercept,...,log10_Permeability_CH4,log10_Permeability_CO2,log10_Permeability_N2,log10_Permeability_O2,log10_Permeability_H2,log10_Permeability_H2O,smiles_polymer,hash-smiles_polymer,smiles_monomer,replicate_structure
0,28NNnPqUqeUNrtmudNTwYz,"('CC(C)(C)c1cc(C(=O)O)cc(C(=O)O)c1', 'Nc1ccc(-...",,amide,275.0,,,,,,...,0.359835,1.743588,0.376577,1.148911,,,CC(C)(C)c1cc(C(=O)O)cc(C(=O)Nc2ccc(-c3ccc(Oc4c...,Zzhk9gU8ApTH4xtAfQi5rW,CC(C)(C)c1cc(C(=O)O)cc(C(=O)O)c1.Nc1ccc(-c2ccc...,0
1,28oL8vgvwP9acaZDehEMGU,"('O=C(O)c1cccc(C(=O)O)c1', 'Nc1ccc(Oc2ccc(Oc3c...",,amide,210.0,340.0,,,,,...,,,,,,,Nc1ccc(Oc2ccc(Oc3ccc(NC(=O)c4cccc(C(=O)Nc5ccc(...,MmKrS7QZesi5QHFkPtKreE,O=C(O)c1cccc(C(=O)O)c1.Nc1ccc(Oc2ccc(Oc3ccc(N)...,0
2,2DuicAyoesWbNJNz3MgSWn,"('NCCCCCCCCCN', 'O=C(O)CCCCCCCC(=O)O')",,amide,,177.0,,,,,...,,,,,,,NCCCCCCCCCNC(=O)CCCCCCCC(=O)NCCCCCCCCCNC(=O)CC...,DRQdStFNSgFv3wedj6sZz7,NCCCCCCCCCN.O=C(O)CCCCCCCC(=O)O,0
3,2EvjUqRRk9goyUTA69A7P7,"('O=C(O)c1cc(C(=O)O)cc([N+](=O)[O-])c1', 'Nc1c...",,amide,174.0,,,,,,...,,,,,,,Nc1ccc(O[Si](Oc2ccc(NC(=O)c3cc(C(=O)Nc4ccc(O[S...,X5mKgKyx8EJHc7TzcxMexM,O=C(O)c1cc(C(=O)O)cc([N+](=O)[O-])c1.Nc1ccc(O[...,0
4,2HpYXrCFgaJVszCPDtpWsv,"('Nc1ccc(N)cc1', 'O=C(O)c1cccc(N2C(=O)c3ccc(C(...",,amide,315.0,,,,,,...,,,,,,,Nc1ccc(NC(=O)c2cccc(N3C(=O)c4ccc(C(c5ccc6c(c5)...,KUoF3Vr4pohJzC9MzUW59q,Nc1ccc(N)cc1.O=C(O)c1cccc(N2C(=O)c3ccc(C(c4ccc...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2508,fqWRjvSWxGZkjGQuFNRjig,"('C=C(C)C', 'C=CC(=C)C')",,vinyl,-63.0,,,,,,...,,,-0.078040,,,,C=CC(C)(CCC(C)(C)CC(C)(C=C)C(C)(C)CC(C)(C)CC(C...,QNyEfVNnXPhroCt7y9WqFp,C=C(C)C.C=CC(=C)C,0
2540,hvbirwkKrgW3wSMDc6g5v8,"('FC(F)=C(F)F', 'FC(F)=C(F)C(F)(F)F')",,vinyl,,,,,,,...,,1.508829,0.605284,1.095584,,1.636947,FC(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(C(F)(F)F...,8HhmevugQDamxx9DHeF8YY,FC(F)=C(F)F.FC(F)=C(F)C(F)(F)F,0
2562,jaEe5qENAgpNtEiSs5DC6T,"('C=COC(C)=O', 'CCCCCCCCOC(=O)C=CC(=O)OCCCCCCCC')",,vinyl,-11.8,,,,,,...,,,,,,,CCCCCCCCOC(=O)C(CCOC(C)=O)C(C(=O)OCCCCCCCC)C(C...,3UMxKMV2agQDBfTutBJr4w,C=COC(C)=O.CCCCCCCCOC(=O)C=CC(=O)OCCCCCCCC,0
2590,mKKtbmP4DCych6DYUt8nS7,"('C=C(C)C(=O)OC', 'C=C(C)C(=O)OC1CCC2COC1O2')","[57, 43]",vinyl,140.0,,,,,,...,,,,,,,COC(=O)C(C)CC(C)(CCC(C)(C(=O)OC1CCC2COC1O2)C(C...,jA5XA4ZsSDZWF2Q7wjcsx9,C=C(C)C(=O)OC.C=C(C)C(=O)OC1CCC2COC1O2,0


## Filter by chemistry

### Insert column of expanded monomer SMILES for combined mols

In [8]:
col_labels = df.columns.tolist()

# Expand every combined monomer SMILES (atom map numbers are not assigned)
# and place the result at column position 1
df.insert(
    1,
    'smiles_expanded',
    df['smiles_monomer'].map(partial(specification.expanded_SMILES, assign_map_nums=False)),
)

### Remove entries which contain any undesired atoms

In [9]:
from rdkit.Chem import rdqueries # NOTE(review): not referenced anywhere in the visible cells
from polymerist.rdutils.smileslib import queries

# Substructure queries for atoms that disqualify an entry, keyed by a short label;
# the labels are reused below to name the '<label>_present' columns
atom_queries = {
    'silicon' : Chem.MolFromSmarts('[Si]'),
    'sulfur'  : Chem.MolFromSmarts('[S]'),
    'metal'   : queries.SPECIAL_QUERY_MOLS['metal'], # pre-built query supplied by polymerist
    # 'halogen' : queries.SPECIAL_QUERY_MOLS['halogen'],
}

In [10]:
# Test every row's expanded SMILES (parsed without sanitization) against all of the atom queries;
# result_type='expand' spreads the values returned for a row across separate columns
# NOTE(review): presumably the helper returns {query label : bool} in the order of atom_queries - confirm
undes_atoms_present = df.apply(lambda row : 
    queries.matching_dict_from_substruct_dict(Chem.MolFromSmiles(row['smiles_expanded'], sanitize=False), atom_queries).values(),
    axis=1,
    result_type='expand'
)

# Attach the per-query results to the table as '<label>_present' columns
df[[f'{atom_chem}_present' for atom_chem in atom_queries.keys()]] = undes_atoms_present

In [11]:
# Discard each row flagged for at least one undesired atom type, then show how many rows remain
df = df.loc[~undes_atoms_present.any(axis='columns')]
len(df)

1664

## Filter by reactive functional groups, placing fragments in correct order

### Load pre-defined functional group SMARTS queries

In [12]:
# Read the {group name : SMARTS} table through RXN_FILES_DIR, like every other reaction file in this notebook
with (RXN_FILES_DIR / 'fn_group_smarts.json').open('r') as file:
    fn_group_smarts = json.load(file)

# Compile each SMARTS string into an RDKit query molecule, keyed by functional group name
fn_groups = {
    group_name : Chem.MolFromSmarts(smarts)
        for group_name, smarts in fn_group_smarts.items()
}

# RDKit returns None for SMARTS it cannot parse; fail here rather than in a later substructure search
unparsed_groups = [group_name for group_name, query in fn_groups.items() if query is None]
assert not unparsed_groups, f'Invalid SMARTS for functional group(s): {unparsed_groups}'

### Backmap reactions and their functional groups

In [13]:
from polymerist.rdutils.reactions import reactions

# NREL dataset mechanism name -> name of the corresponding pre-made rxn template
rxn_backmap = {
    'amide' : 'polyamide',
    'carbonate' : 'polycarbonate_phosgene',
    'ester' : 'polyester',
    'imide' : 'polyimide',
    'urethane' : 'polyurethane_isocyanate',
    'NIPU' : 'polyurethane_nonisocyanate',
    'vinyl' : 'polyvinyl_head_tail'
}

# every mechanism that occurs in the table must have a template assigned to it
assert set(df['mechanism'].unique()) <= set(rxn_backmap)

# store the mapping alongside the reaction files
backmap_path = RXN_FILES_DIR / 'rxn_backmap.json'
with backmap_path.open('w') as backmap_file:
    backmap_file.write(json.dumps(rxn_backmap, indent=4))

In [14]:
# Load one annotated reaction template for each template name in the backmap
rxns = {}
for rxnname in rxn_backmap.values():
    rxns[rxnname] = reactions.AnnotatedReaction.from_rxnfile(RXN_FILES_DIR / f'{rxnname}.rxn')

# Table of functional group names for each reaction
with (RXN_FILES_DIR / 'rxn_groups.json').open('r') as file:
    rxn_group_names = json.loads(file.read())

In [15]:
# Translate each mechanism into its rxn template name and slot the new column in right after 'mechanism'
df.insert(
    list(df.columns).index('mechanism') + 1,
    'rxn_name',
    df['mechanism'].map(rxn_backmap),
)

### Identify if fragments match a defined rxn, and if so, in what order they should appear

In [16]:
from polymerist.maths.combinatorics.sequences import bin_ids_forming_sequence

def get_ordered_monomer_fragments(row : pd.Series) -> list[Optional[str]]:
    '''Order the monomer fragments of a table row to match the functional group sequence of the row's reaction

    Parameters
    ----------
    row : pd.Series
        Table row providing a "mechanism" entry (a key of rxn_backmap)
        and a "smiles_expanded" entry ("."-separated expanded SMILES of all fragments)

    Returns
    -------
    list[Optional[str]]
        One entry per fragment: the expanded fragment SMILES, arranged in the order of the first
        candidate ordering which uses every fragment, or all None if no such ordering is found
    '''
    targ_group_names = rxn_group_names[rxn_backmap[row.mechanism]]
    frag_smiles = row['smiles_expanded'].split('.') # easier in practice than generating fragments of combined mol, turning fragments to SMILES, and expanding those SMILES
    choice_bins = [ # for each fragment, the labels of the functional group queries reported for it
        queries.matching_labels_from_substruct_dict(Chem.MolFromSmiles(smiles, sanitize=False), fn_groups)
            for smiles in frag_smiles
    ]

    nfrags = len(frag_smiles) # counted directly, which avoids relying on prior data sanitization
    all_indices = set(range(nfrags))

    # NOTE(review): presumably yields tuples of bin indices whose labels spell out the target sequence - confirm
    for ids in bin_ids_forming_sequence(sequence=targ_group_names, choice_bins=choice_bins):
        if set(ids) == all_indices: # only accept orderings in which every fragment takes part
            return [frag_smiles[i] for i in ids] # fragments in the order defined by the sequence

    return [None] * nfrags # no covering order was found

### Insert individual monomers into table, remove rows with no monomers

In [17]:
# One column per ordered fragment; rows without a valid ordering hold only null entries
reacting_monomers = df.apply(get_ordered_monomer_fragments, axis='columns', result_type='expand')

# The expanded columns carry integer labels, so each label doubles as an insertion offset
for (i, col) in reacting_monomers.items():
    df.insert(i + 3, f'smiles_expanded_monomer_{i+1}', col)

# Keep only the rows in which every fragment was assigned, then show how many rows remain
df = df.loc[reacting_monomers.notna().all(axis='columns')]
len(df)

1559

## Filtering by querying chemical databases for IUPAC names

In [18]:
import cirpy
import pubchempy as pcp
import chemspipy as csp

def get_IUPAC_name(smiles : str) -> Optional[str]:
    '''Attempts to fetch the IUPAC name for the molecule described by a SMILES string from PubChem

    The lookup is best-effort: any failure is reported on stdout and results in None,
    so that a single bad entry cannot abort a lookup over a whole table

    Parameters
    ----------
    smiles : str
        SMILES string of the molecule to look up

    Returns
    -------
    Optional[str]
        The "iupac_name" of the last compound returned by the query,
        or None if the query fails or returns no compounds
    '''
    try:
        compounds = pcp.get_compounds(smiles, namespace='smiles')
        if not compounds: # say so explicitly, rather than via an IndexError from an empty list
            print(f'No PubChem compound found for SMILES {smiles!r}')
            return None
        return compounds[-1].iupac_name
    except Exception as e: # deliberately broad, as network and parsing errors are all treated as "no name"
        print(type(e), e)
        return None

In [19]:
# The same monomer can occur in many rows and every lookup is a slow HTTP query,
# so resolve each distinct SMILES once and then map the names back onto the table
monomer_smiles = df[['smiles_expanded_monomer_1', 'smiles_expanded_monomer_2']]
IUPAC_by_smiles = {
    smiles : get_IUPAC_name(smiles)
        for smiles in pd.unique(monomer_smiles.to_numpy().ravel())
}
IUPAC_names = monomer_smiles.map(IUPAC_by_smiles.get) # same index and column labels as monomer_smiles
IUPAC_names

Unnamed: 0,smiles_expanded_monomer_1,smiles_expanded_monomer_2
0,,"5-tert-butylbenzene-1,3-dicarboxylic acid"
1,4-[4-(4-aminophenoxy)phenoxy]aniline,"benzene-1,3-dicarboxylic acid"
2,"nonane-1,9-diamine",nonanedioic acid
4,"benzene-1,4-diamine","3-[5-[2-[2-[4-[4-[5-[2-[2-(3-carboxyphenyl)-1,..."
5,4-[4-[4-[4-(4-aminophenyl)-2-(trifluoromethyl)...,
...,...,...
2448,methyl 2-methylprop-2-enoate,styrene
2481,"buta-1,3-diene",styrene
2486,prop-2-enenitrile,styrene
2503,prop-1-ene,ethene


In [20]:
# Relabel the columns with integers, so that each label doubles as an insertion offset
IUPAC_names.columns = [0, 1]
for (i, col) in IUPAC_names.items():
    df.insert(i + 5, f'IUPAC_monomer_{i+1}', col)

# Keep only the rows in which both monomers were named, then show how many rows remain
df = df.loc[IUPAC_names.notna().all(axis='columns')]
len(df)

1277

## Save filtered DataFrame for next steps to avoid reprocessing

In [21]:
# Name the output after the input file, marking it as filtered
out_data_path = PROC_DATA_DIR / f'{input_data_path.stem}_FILTERED{input_data_path.suffix}'
out_data_path.parent.mkdir(parents=True, exist_ok=True) # to_csv does not create missing directories
df.to_csv(out_data_path)