# Core Imports

In [28]:
# Generic Imports
import re
from functools import partial, cached_property
from collections import defaultdict
from itertools import combinations, chain
from ast import literal_eval

# Numeric imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# File I/O
from pathlib import Path
import csv, json, openpyxl

# Typing and Subclassing
from typing import Any, Callable, ClassVar, Generator, Iterable, Optional, Union
from dataclasses import dataclass, field
from abc import ABC, abstractmethod, abstractproperty
from openmm.unit import Unit, Quantity

# Cheminformatics
from rdkit import Chem
from rdkit.Chem import rdChemReactions
from rdkit.Chem.Draw import IPythonConsole

import pubchempy as pcp

# Size of molecule images drawn inline by RDKit's IPython integration
DIM    = 300  # second entry of the molSize tuple
ASPECT = 3/2  # ratio of the first molSize entry to the second
_mol_img_size = (int(ASPECT*DIM), DIM)
IPythonConsole.molSize = _mol_img_size  # Change image size

# Static Paths (all relative to the notebook's working directory)
RAW_DATA_DIR  = Path('monomer_data_raw')        # raw input datasets are read from here
FMT_DATA_DIR  = Path('monomer_data_formatted')  # reformatted copies of the inputs are written here
PROC_DATA_DIR = Path('monomer_data_processed')  # cleaned/classified tables are written here
RXN_FILES_DIR = Path('poly_rxns')               # holds the functional-group SMARTS definitions
# RXN_FILES_DIR = Path('rxn_smarts')

# Reformatting data to be compliant with the monomer pipeline

In [31]:
# Per-column converters handed to the CSV parser: the monomer tuples are stored as strings,
# so literal_eval turns them back into tuples. Both header capitalizations are registered.
decoder_dict = dict.fromkeys(('monomers', 'Monomers'), literal_eval)

input_data_path = RAW_DATA_DIR / 'nipu_urethanes.csv'
# input_data_path = RAW_DATA_DIR / '221010_trainingdata_DP-18_expanded.csv'

# Parse the raw dataset, de-stringifying the monomer column on the way in
raw_df = pd.read_csv(
    input_data_path,
    converters=decoder_dict,
)
raw_df

Unnamed: 0,Chemistry,Monomers
0,NIPU,"(NCCOCCOCCN, CCC1OC(=O)OC1CC1OC(=O)OC1CC1OC(=O..."
1,NIPU,(NCCCCCCNC(=O)CC(O)(CC(=O)NCCCCCCN)C(=O)NCCCCC...
2,NIPU,"(NCCCCN, CCC1OC(=O)OC1CC1OC(=O)OC1CC1OC(=O)OC1..."
3,NIPU,"(NCc1cccc(CN)c1, CCC1OC(=O)OC1CC1OC(=O)OC1CC1O..."
4,NIPU,"(NCCCCN, CCCCCCCC1C(CCCCC)CCC(CCCCCCCCC(=O)OCC..."
...,...,...
110,urethane,"(OCCCCCCO, O=C=NCCCCCCN=C=O)"
111,urethane,"(OCCO, O=C=Nc1ccc(Cc2ccc(N=C=O)cc2)cc1)"
112,urethane,"(OCCCCO, O=C=NCCCCCCN=C=O)"
113,urethane,"(OCCO, O=C=NCCCCCCN=C=O)"


## Save reformatted copy for further processing

In [32]:
# Raw column header -> column name used in the reformatted table
new_col_names = {'Chemistry': 'mechanism', 'Monomers': 'smiles_monomer'}

fmt_df = raw_df.rename(columns=new_col_names)

# Collapse each tuple of monomer SMILES into one dot-separated SMILES string
joined_smiles = fmt_df['smiles_monomer'].map('.'.join)
fmt_df['smiles_monomer'] = joined_smiles
fmt_df

Unnamed: 0,mechanism,smiles_monomer
0,NIPU,NCCOCCOCCN.CCC1OC(=O)OC1CC1OC(=O)OC1CC1OC(=O)O...
1,NIPU,NCCCCCCNC(=O)CC(O)(CC(=O)NCCCCCCN)C(=O)NCCCCCC...
2,NIPU,NCCCCN.CCC1OC(=O)OC1CC1OC(=O)OC1CC1OC(=O)OC1CC...
3,NIPU,NCc1cccc(CN)c1.CCC1OC(=O)OC1CC1OC(=O)OC1CC1OC(...
4,NIPU,NCCCCN.CCCCCCCC1C(CCCCC)CCC(CCCCCCCCC(=O)OCC2C...
...,...,...
110,urethane,OCCCCCCO.O=C=NCCCCCCN=C=O
111,urethane,OCCO.O=C=Nc1ccc(Cc2ccc(N=C=O)cc2)cc1
112,urethane,OCCCCO.O=C=NCCCCCCN=C=O
113,urethane,OCCO.O=C=NCCCCCCN=C=O


In [33]:
# Write the reformatted table into the formatted-data directory
fmt_data_path = FMT_DATA_DIR / 'polyurethanes.csv'
# fmt_data_path = FMT_DATA_DIR / f'{input_data_path.stem}_FMT.csv'

# to_csv will not create missing parent directories, so make sure the target folder exists first
fmt_data_path.parent.mkdir(parents=True, exist_ok=True)
fmt_df.to_csv(fmt_data_path)

# [DEPRECATED] Preprocessing and cleaning up NREL Urethane data

## Loading and inspecting raw data

In [3]:
# Read the raw spreadsheet and turn the stringified monomer tuples back into real tuples
p = RAW_DATA_DIR / 'nipu_urethanes.xlsx'
raw_table = pd.read_excel(p)
raw_table['Monomers'] = raw_table['Monomers'].apply(literal_eval)

# Split the table into one subset per chemistry label
is_nipu     = raw_table['Chemistry'] == 'NIPU'
is_urethane = raw_table['Chemistry'] == 'urethane'
nipus     = raw_table[is_nipu]
urethanes = raw_table[is_urethane]

# Pull out just the monomer tuples of each subset, renumbered from 0 in their existing order
nipus_mono, urethanes_mono = (
    subset['Monomers'].reset_index(drop=True)
        for subset in (nipus, urethanes)
)

In [4]:
# Mapping of row label -> monomer SMILES entries to inspect visually.
# Left empty, so the loop below does nothing unless a target is swapped in.
# targ_mono = nipus_mono
targ_mono = {}

divider = '='*50
for row_idx, smiles_entry in targ_mono.items():
    try:
        # an entry of the wrong length raises ValueError on unpacking and is reported below
        first_smiles, second_smiles = smiles_entry
        display(Chem.MolFromSmiles(first_smiles))

        print(divider)
    except ValueError:
        print(f'Row {row_idx} failed')

## Classify each monomer pair by its respective reactive functional groups

In [5]:
# Load the named functional-group SMARTS strings and compile each into an RDKit query molecule
with (RXN_FILES_DIR / 'fn_group_smarts.json').open('r') as fn_group_file:
    fn_group_SMARTS = json.load(fn_group_file)
    fn_groups = {
        group_name : Chem.MolFromSmarts(SMARTS)
            for group_name, SMARTS in fn_group_SMARTS.items()
    }

# Names of the functional groups recorded for each chemistry label
reaction_pairs = {
    'NIPU' : ('cyclocarbonate', 'amine'),
    'urethane' : ('isocyanate', 'hydroxyl')
}

all_results = []          # one entry per table row
digroup_only_results = [] # only rows where no functional group matched more than twice on any monomer

# Pair up the two needed columns by name rather than unpacking whole rows positionally,
# so any additional columns in the table cannot break the unpacking
for chemistry, monomer_pair in zip(raw_table['Chemistry'], raw_table['Monomers']):
    relevant_groups = reaction_pairs[chemistry] # fixed for the whole row, so looked up once
    monomer_entry = {
        'Chemistry' : chemistry
    }

    too_many_sites = False
    for SMILES in monomer_pair:
        rdmol = Chem.MolFromSmiles(SMILES)
        if rdmol is None: # RDKit reports an unparseable SMILES by returning None rather than raising
            raise ValueError(f'Could not parse monomer SMILES "{SMILES}" (chemistry "{chemistry}")')
        rdmol = Chem.AddHs(rdmol)
        Chem.Kekulize(rdmol, clearAromaticFlags=True) # ensure aromatic rings are treated as simply single-double (simplifies structure matching)

        for group_name, fn_group in fn_groups.items():
            matches = rdmol.GetSubstructMatches(fn_group)
            if matches and (group_name in relevant_groups):
                # store the whole (explicit-H, kekulized) monomer as SMARTS under the matched group's name
                monomer_entry[group_name] = Chem.MolToSmarts(rdmol)
            too_many_sites |= (len(matches) > 2) # OR with previous result; single true will make result true

    all_results.append(monomer_entry)
    if not too_many_sites:
        digroup_only_results.append(monomer_entry)

## Save to csv files for future reference

In [6]:
# Tabulate the per-row classification records (one column per key present in the records)
all_table = pd.DataFrame.from_records(all_results)
digroup_only_table = pd.DataFrame.from_records(digroup_only_results)

# to_csv will not create missing parent directories, so make sure the output folder exists first
PROC_DATA_DIR.mkdir(parents=True, exist_ok=True)
all_table.to_csv(PROC_DATA_DIR / 'clean_smarts_all.csv')
digroup_only_table.to_csv(PROC_DATA_DIR / 'clean_smarts_digroup.csv')