# Core Imports

In [1]:
# Generic Imports
import re
from functools import partial, cached_property
from collections import defaultdict
from itertools import combinations, chain
from ast import literal_eval

# Numeric imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# File I/O
from pathlib import Path
import csv, json, openpyxl

# Typing and Subclassing
from typing import Any, Callable, ClassVar, Generator, Iterable, Optional, Union
from dataclasses import dataclass, field
from abc import ABC, abstractmethod, abstractproperty
from openmm.unit import Unit, Quantity

# Cheminformatics
from rdkit import Chem
from rdkit.Chem import rdChemReactions
from rdkit.Chem.Draw import IPythonConsole

import pubchempy as pcp

# Size of the inline RDKit molecule drawings.
# molSize is set to (int(ASPECT*DIM), DIM), which evaluates to (450, 300) with the values below.
DIM    = 300
ASPECT = 3/2
IPythonConsole.molSize = (int(ASPECT*DIM), DIM)   # applied globally to the IPythonConsole drawing settings

# Static Paths
# All of these are relative paths, so they resolve against the kernel's current working directory.
RAW_DATA_DIR  = Path('monomer_data_raw')       # raw input datasets (.csv / .xlsx) are read from here
FMT_DATA_DIR  = Path('monomer_data_formatted') # reformatted "*_FMT.csv" copies are written here
PROC_DATA_DIR = Path('monomer_data_processed') # cleaned SMARTS tables are written here
RXN_FILES_DIR = Path('poly_rxns')              # holds 'fn_group_smarts.json', the functional group SMARTS definitions
# Alternative location for the reaction files (currently disabled):
# RXN_FILES_DIR = Path('rxn_smarts')

# Reformatting data to be compliant with monomer pipeline

In [4]:
# Converters applied on load: either spelling of the monomer column holds a
# stringified tuple, which literal_eval turns back into a real tuple
decoder_dict = dict.fromkeys(('monomers', 'Monomers'), literal_eval)

# Raw dataset to reformat (alternative input: RAW_DATA_DIR / 'nipu_urethanes.csv')
input_data_path = RAW_DATA_DIR.joinpath('221010_trainingdata_DP-18_expanded.csv')

raw_df = pd.read_csv(
    input_data_path,
    converters=decoder_dict,
)
raw_df

Unnamed: 0,hash-monomers-distribution,monomers,distribution,mechanism,Glass_Transition,Melt_Temp,Cp_solid_slope,Cp_solid_intercept,Cp_liquid_slope,Cp_liquid_intercept,...,log10_Permeability_CH4,log10_Permeability_CO2,log10_Permeability_N2,log10_Permeability_O2,log10_Permeability_H2,log10_Permeability_H2O,smiles_polymer,hash-smiles_polymer,smiles_monomer,replicate_structure
0,28NNnPqUqeUNrtmudNTwYz,"(CC(C)(C)c1cc(C(=O)O)cc(C(=O)O)c1, Nc1ccc(-c2c...",,amide,275.00,,,,,,...,0.359835,1.743588,0.376577,1.148911,,,CC(C)(C)c1cc(C(=O)O)cc(C(=O)Nc2ccc(-c3ccc(Oc4c...,Zzhk9gU8ApTH4xtAfQi5rW,CC(C)(C)c1cc(C(=O)O)cc(C(=O)O)c1.Nc1ccc(-c2ccc...,0
1,28oL8vgvwP9acaZDehEMGU,"(O=C(O)c1cccc(C(=O)O)c1, Nc1ccc(Oc2ccc(Oc3ccc(...",,amide,210.00,340.0,,,,,...,,,,,,,Nc1ccc(Oc2ccc(Oc3ccc(NC(=O)c4cccc(C(=O)Nc5ccc(...,MmKrS7QZesi5QHFkPtKreE,O=C(O)c1cccc(C(=O)O)c1.Nc1ccc(Oc2ccc(Oc3ccc(N)...,0
2,2DuicAyoesWbNJNz3MgSWn,"(NCCCCCCCCCN, O=C(O)CCCCCCCC(=O)O)",,amide,,177.0,,,,,...,,,,,,,NCCCCCCCCCNC(=O)CCCCCCCC(=O)NCCCCCCCCCNC(=O)CC...,DRQdStFNSgFv3wedj6sZz7,NCCCCCCCCCN.O=C(O)CCCCCCCC(=O)O,0
3,2EvjUqRRk9goyUTA69A7P7,"(O=C(O)c1cc(C(=O)O)cc([N+](=O)[O-])c1, Nc1ccc(...",,amide,174.00,,,,,,...,,,,,,,Nc1ccc(O[Si](Oc2ccc(NC(=O)c3cc(C(=O)Nc4ccc(O[S...,X5mKgKyx8EJHc7TzcxMexM,O=C(O)c1cc(C(=O)O)cc([N+](=O)[O-])c1.Nc1ccc(O[...,0
4,2HpYXrCFgaJVszCPDtpWsv,"(Nc1ccc(N)cc1, O=C(O)c1cccc(N2C(=O)c3ccc(C(c4c...",,amide,315.00,,,,,,...,,,,,,,Nc1ccc(NC(=O)c2cccc(N3C(=O)c4ccc(C(c5ccc6c(c5)...,KUoF3Vr4pohJzC9MzUW59q,Nc1ccc(N)cc1.O=C(O)c1cccc(N2C(=O)c3ccc(C(c4ccc...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2613,o3GGTNZkgNEuQSKNWYeYFs,"(C=CC,)",,vinyl,-9.15,,0.25,-0.85,0.15,42.95,...,,1.231736,0.079965,0.755046,,2.053483,CC(CCC(C)CC(C)CC(C)C(C)CCC(C)C(C)CC(C)CCC(C)C(...,6232Wkx4QGKYgD6kxuodTj,C=CC,0
2614,o4LroNwi3uKoY2oXtjDvau,"(C=CC(=O)OCCCCCCCCCCCCCC,)",,vinyl,,28.0,,,,,...,1.813581,2.472756,1.344392,1.756636,1.983175,,CCCCCCCCCCCCCCOC(=O)CCCC(C(=O)OCCCCCCCCCCCCCC)...,jeytmhnbvwNCi9TFJzb3Sh,C=CC(=O)OCCCCCCCCCCCCCC,0
2615,o8CKTqDchhSvCTx6NLc6ih,"(C=C(C#N)C(=O)OCCCCCCCC,)",,vinyl,48.85,,,,,,...,,,,,,,CCCCCCCCOC(=O)C(C#N)CC(C#N)(CCC(C#N)(CC(C#N)(C...,HMUpFTATpJVVYZG2zsywgy,C=C(C#N)C(=O)OCCCCCCCC,0
2616,oNVVbZgL7r4BMQkoTzruM3,"(C=Cc1ccc(C)cc1C,)",,vinyl,112.00,,,,,,...,,,,,,,Cc1ccc(CCCC(c2ccc(C)cc2C)C(CC(CCC(c2ccc(C)cc2C...,T4Fmiy6gWwfj7bntRHUzGX,C=Cc1ccc(C)cc1C,0


## Save reformatted copy for further processing

In [5]:
# Column names used by older raw files, mapped onto the names used downstream
new_col_names = dict(
    Chemistry='mechanism',
    Monomers='smiles_monomer',
)

# Build the formatted frame in one chain: rename the columns, then collapse any
# tuple of monomer SMILES into a single '.'-separated SMILES string
# (values which are not tuples are passed through untouched)
fmt_df = (
    raw_df
    .rename(columns=new_col_names)
    .assign(
        smiles_monomer=lambda frame: frame['smiles_monomer'].map(
            lambda smi: smi if not isinstance(smi, tuple) else '.'.join(smi)
        )
    )
)
fmt_df

Unnamed: 0,hash-monomers-distribution,monomers,distribution,mechanism,Glass_Transition,Melt_Temp,Cp_solid_slope,Cp_solid_intercept,Cp_liquid_slope,Cp_liquid_intercept,...,log10_Permeability_CH4,log10_Permeability_CO2,log10_Permeability_N2,log10_Permeability_O2,log10_Permeability_H2,log10_Permeability_H2O,smiles_polymer,hash-smiles_polymer,smiles_monomer,replicate_structure
0,28NNnPqUqeUNrtmudNTwYz,"(CC(C)(C)c1cc(C(=O)O)cc(C(=O)O)c1, Nc1ccc(-c2c...",,amide,275.00,,,,,,...,0.359835,1.743588,0.376577,1.148911,,,CC(C)(C)c1cc(C(=O)O)cc(C(=O)Nc2ccc(-c3ccc(Oc4c...,Zzhk9gU8ApTH4xtAfQi5rW,CC(C)(C)c1cc(C(=O)O)cc(C(=O)O)c1.Nc1ccc(-c2ccc...,0
1,28oL8vgvwP9acaZDehEMGU,"(O=C(O)c1cccc(C(=O)O)c1, Nc1ccc(Oc2ccc(Oc3ccc(...",,amide,210.00,340.0,,,,,...,,,,,,,Nc1ccc(Oc2ccc(Oc3ccc(NC(=O)c4cccc(C(=O)Nc5ccc(...,MmKrS7QZesi5QHFkPtKreE,O=C(O)c1cccc(C(=O)O)c1.Nc1ccc(Oc2ccc(Oc3ccc(N)...,0
2,2DuicAyoesWbNJNz3MgSWn,"(NCCCCCCCCCN, O=C(O)CCCCCCCC(=O)O)",,amide,,177.0,,,,,...,,,,,,,NCCCCCCCCCNC(=O)CCCCCCCC(=O)NCCCCCCCCCNC(=O)CC...,DRQdStFNSgFv3wedj6sZz7,NCCCCCCCCCN.O=C(O)CCCCCCCC(=O)O,0
3,2EvjUqRRk9goyUTA69A7P7,"(O=C(O)c1cc(C(=O)O)cc([N+](=O)[O-])c1, Nc1ccc(...",,amide,174.00,,,,,,...,,,,,,,Nc1ccc(O[Si](Oc2ccc(NC(=O)c3cc(C(=O)Nc4ccc(O[S...,X5mKgKyx8EJHc7TzcxMexM,O=C(O)c1cc(C(=O)O)cc([N+](=O)[O-])c1.Nc1ccc(O[...,0
4,2HpYXrCFgaJVszCPDtpWsv,"(Nc1ccc(N)cc1, O=C(O)c1cccc(N2C(=O)c3ccc(C(c4c...",,amide,315.00,,,,,,...,,,,,,,Nc1ccc(NC(=O)c2cccc(N3C(=O)c4ccc(C(c5ccc6c(c5)...,KUoF3Vr4pohJzC9MzUW59q,Nc1ccc(N)cc1.O=C(O)c1cccc(N2C(=O)c3ccc(C(c4ccc...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2613,o3GGTNZkgNEuQSKNWYeYFs,"(C=CC,)",,vinyl,-9.15,,0.25,-0.85,0.15,42.95,...,,1.231736,0.079965,0.755046,,2.053483,CC(CCC(C)CC(C)CC(C)C(C)CCC(C)C(C)CC(C)CCC(C)C(...,6232Wkx4QGKYgD6kxuodTj,C=CC,0
2614,o4LroNwi3uKoY2oXtjDvau,"(C=CC(=O)OCCCCCCCCCCCCCC,)",,vinyl,,28.0,,,,,...,1.813581,2.472756,1.344392,1.756636,1.983175,,CCCCCCCCCCCCCCOC(=O)CCCC(C(=O)OCCCCCCCCCCCCCC)...,jeytmhnbvwNCi9TFJzb3Sh,C=CC(=O)OCCCCCCCCCCCCCC,0
2615,o8CKTqDchhSvCTx6NLc6ih,"(C=C(C#N)C(=O)OCCCCCCCC,)",,vinyl,48.85,,,,,,...,,,,,,,CCCCCCCCOC(=O)C(C#N)CC(C#N)(CCC(C#N)(CC(C#N)(C...,HMUpFTATpJVVYZG2zsywgy,C=C(C#N)C(=O)OCCCCCCCC,0
2616,oNVVbZgL7r4BMQkoTzruM3,"(C=Cc1ccc(C)cc1C,)",,vinyl,112.00,,,,,,...,,,,,,,Cc1ccc(CCCC(c2ccc(C)cc2C)C(CC(CCC(c2ccc(C)cc2C...,T4Fmiy6gWwfj7bntRHUzGX,C=Cc1ccc(C)cc1C,0


In [6]:
# The reformatted table is saved as "<input file stem>_FMT.csv" inside FMT_DATA_DIR
fmt_data_path = FMT_DATA_DIR / f'{input_data_path.stem}_FMT.csv'

# DataFrame.to_csv does not create missing directories, so make sure the output
# folder exists first (otherwise a fresh checkout fails on this cell)
fmt_data_path.parent.mkdir(parents=True, exist_ok=True)

fmt_df.to_csv(fmt_data_path, index=False) # index=False : do not write the row index as an extra column

# [DEPRECATED] Preprocessing and cleaning up NREL Urethane data

## Loading and inspecting raw data

In [None]:
# Read the raw NIPU / urethane spreadsheet
p = RAW_DATA_DIR.joinpath('nipu_urethanes.xlsx')
raw_table = pd.read_excel(p)

# The 'Monomers' cells are stored as text; parse each one back into a Python tuple
raw_table['Monomers'] = raw_table['Monomers'].map(literal_eval)

# Split the table by polymerization chemistry
nipus     = raw_table.loc[raw_table['Chemistry'].eq('NIPU')]
urethanes = raw_table.loc[raw_table['Chemistry'].eq('urethane')]

# Keep just the monomer tuples of each subset, renumbered from 0 in their existing order
nipus_mono     = nipus['Monomers'].reset_index(drop=True)
urethanes_mono = urethanes['Monomers'].reset_index(drop=True)

In [None]:
# Entries to preview, keyed by row label. Left empty, so the loop below does nothing;
# assign a monomer subset instead (e.g. targ_mono = nipus_mono) to inspect it.
targ_mono = dict()

for i, pair in targ_mono.items():
    try:
        # an entry with more or fewer than two items raises ValueError here
        first_smiles, _second_smiles = pair
        first_mol = Chem.MolFromSmiles(first_smiles)
        display(first_mol) # only the first monomer of each pair is drawn

        print(50 * '=')
    except ValueError:
        print(f'Row {i} failed')

## Classify each monomer pair by its reactive functional groups

In [None]:
# Read the named functional group SMARTS definitions; the file only needs to stay open for the read itself
with (RXN_FILES_DIR / 'fn_group_smarts.json').open('r') as fn_group_file:
    fn_group_SMARTS = json.load(fn_group_file)

# Compile each SMARTS string into an RDKit query molecule, keyed by functional group name
fn_groups = {
    group_name : Chem.MolFromSmarts(SMARTS)
        for group_name, SMARTS in fn_group_SMARTS.items()
}

# Names of the functional groups which are recorded for each chemistry
reaction_pairs = {
    'NIPU' : ('cyclocarbonate', 'amine'),
    'urethane' : ('isocyanate', 'hydroxyl')
}

all_results = []          # one record per row of raw_table
digroup_only_results = [] # only the records in which no functional group matched more than twice

# Read the two columns by name rather than unpacking whole rows, so that any
# additional column in raw_table cannot break the (chemistry, monomers) unpacking
for i, chemistry, monomer_pair in zip(raw_table.index, raw_table['Chemistry'], raw_table['Monomers']):
    monomer_entry = {
        'Chemistry' : chemistry
    }

    too_many_sites = False
    for SMILES in monomer_pair:
        rdmol = Chem.MolFromSmiles(SMILES)
        if rdmol is None: # RDKit returns None for a SMILES it cannot parse; fail with a readable message
            raise ValueError(f'Row {i}: could not parse monomer SMILES {SMILES!r}')
        rdmol = Chem.AddHs(rdmol)
        Chem.Kekulize(rdmol, clearAromaticFlags=True) # ensure aromatic rings are treated as simply single-double (simplifies structure matching)

        for group_name, fn_group in fn_groups.items():
            matches = rdmol.GetSubstructMatches(fn_group)
            # Record the whole monomer (as SMARTS) under the name of each matching group relevant to this chemistry.
            # A later monomer matching the same group replaces the earlier entry.
            # NOTE: a chemistry missing from reaction_pairs raises KeyError here as soon as any group matches
            if matches and (group_name in reaction_pairs[chemistry]):
                monomer_entry[group_name] = Chem.MolToSmarts(rdmol)
            # Checked for every group in fn_groups, not only the ones relevant to this chemistry
            too_many_sites |= (len(matches) > 2) # OR with previous result; single true will make result true

    all_results.append(monomer_entry)
    if not too_many_sites:
        digroup_only_results.append(monomer_entry)

## Save to csv files for future reference

In [None]:
# Tabulate the classification records (one row per record, one column per key)
all_table = pd.DataFrame.from_records(all_results)
digroup_only_table = pd.DataFrame.from_records(digroup_only_results)

# DataFrame.to_csv does not create missing directories, so make sure the output
# folder exists first (otherwise a fresh checkout fails on this cell)
PROC_DATA_DIR.mkdir(parents=True, exist_ok=True)

# NOTE: the row index is written as the first column here (to_csv default)
all_table.to_csv(PROC_DATA_DIR / 'clean_smarts_all.csv')
digroup_only_table.to_csv(PROC_DATA_DIR / 'clean_smarts_digroup.csv')