# Core Imports

In [None]:
# Generic Imports
import re
from functools import partial, cached_property
from collections import defaultdict
from itertools import combinations, chain
from ast import literal_eval

# Numeric imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# File I/O
from pathlib import Path
import csv, json, openpyxl

# Typing and Subclassing
from typing import Any, Callable, ClassVar, Generator, Iterable, Optional, Union
from dataclasses import dataclass, field
from abc import ABC, abstractmethod, abstractproperty
from openmm.unit import Unit, Quantity

# Cheminformatics
from rdkit import Chem
from rdkit.Chem import rdChemReactions
from rdkit.Chem.Draw import IPythonConsole

import pubchempy as pcp

# Size of inline molecule drawings: molSize is set to (int(ASPECT*DIM), DIM)
DIM = 300
ASPECT = 3 / 2
IPythonConsole.molSize = (int(DIM * ASPECT), DIM)

# Static Paths
RAW_DATA_PATH = Path('raw_monomer_data')
PROC_DATA_PATH = Path('processed_monomer_data')
# RXN_FILES_PATH = Path('rxn_smarts')
RXN_FILES_PATH = Path('poly_rxns')

# Inspecting expanded dataset

In [None]:
# Read the expanded monomer dataset out of the raw-data directory and show it as the cell output
data_path = RAW_DATA_PATH.joinpath('20231114_polyid_data_density_DP2-6 - 1,2 monomers.csv')
df = pd.read_csv(data_path)
df

## Sorting by polymerization mechanism

In [None]:
# Column whose distinct values partition the dataset
key = 'mechanism'

df_grouper = df.groupby(key)

# Number of rows in each group, as a plain {group label : count} dict
counts = df_grouper.size().to_dict()

# Sub-DataFrame of each group, keyed by group label
frames = dict(
    (mech_name, df_grouper.get_group(mech_name))
        for mech_name in df_grouper.groups
)

In [None]:
# Bar chart of the per-group row counts, titled with the grouping column and total row count
group_labels = list(counts.keys())
group_sizes  = list(counts.values())
plt.bar(group_labels, group_sizes)
plt.title(f'Number of SMILES by {key} ({len(df)} total)')

In [None]:
# For each mechanism, list the distinct values found in its 'num_monomers' column
for mech in frames:
    frame = frames[mech]
    distinct_counts = frame['num_monomers'].unique()
    print(mech, distinct_counts)

## Adding chemical database-queried IUPAC names (if possible)

In [None]:
from polymerist.chemdbqueries import cactus

n = 2 # number of leading rows to inspect per mechanism
for chemistry, frame in frames.items():
    print(chemistry.upper())
    for idx, row in frame.head(n).iterrows():
        monomer_smiles = row['smiles_monomer']
        monomer = Chem.MolFromSmiles(monomer_smiles)
        id_name = f'{row["mechanism"]}-{idx}'

        print('\t', id_name)
        display(monomer)
        print(monomer_smiles.split('.'))

        # look up a name for each disconnected fragment of the monomer entry
        iupac_names = []
        for frag in Chem.GetMolFrags(monomer, asMols=True):
            smiles = Chem.MolToSmiles(frag)

            try: # first choice: NIH CACTUS
                iupac_name = cactus.query_NIH_CACTUS(smiles, prop='iupac_name')
            except cactus.NoCACTUSDataFound:
                # second choice: PubChem. This runs inside an except-handler, so it needs its own
                # protection - a sibling except clause does not catch errors raised in here
                try:
                    pcquery = pcp.get_compounds(smiles, namespace='smiles')
                    pccomp = pcquery.pop() if pcquery else None # an empty result means no record was found
                    iupac_name = getattr(pccomp, 'iupac_name', None)
                except Exception:
                    iupac_name = None
            except Exception: # best-effort lookup: any other failure gives no name, but Ctrl-C still interrupts
                iupac_name = None

            iupac_names.append(iupac_name)
            print(iupac_name, smiles, sep=' : ')
        # TODO: iupac_names is collected but not yet written back to the frame
    print('='*50)

## Filtering by substructure query

In [None]:
# Read the labelled functional-group SMARTS strings from file
with open(Path('poly_rxns/fn_group_smarts.json'), 'r') as file:
    fn_group_smarts = json.load(file)

# Compile each SMARTS string into an RDKit query mol stored under the same label
fn_groups = dict(
    (group_name, Chem.MolFromSmarts(smarts))
        for group_name, smarts in fn_group_smarts.items()
)

In [30]:
from rdkit.Chem import rdqueries
from polymerist.rdutils.smileslib import queries


# Substructure queries marking chemistries to be excluded from the dataset.
# Elements are matched by atomic number ([#14], [#16]) rather than by symbol: in SMARTS an
# uppercase symbol such as [S] only matches ALIPHATIC atoms, so aromatic sulfur
# (e.g. thiophene written with lowercase "s") would otherwise slip past the blacklist.
chemical_blacklist = {
    'silicon' : Chem.MolFromSmarts('[#14]'),
    'sulfur'  : Chem.MolFromSmarts('[#16]'),
    'metal'   : queries.SPECIAL_QUERY_MOLS['metal'],
    # 'halogen' : queries.SPECIAL_QUERY_MOLS['halogen'],
}

In [33]:
from typing import Generator
from polymerist.rdutils.rdtypes import RDMol
from polymerist.monomers import specification


def matches_substruct_dict(target_mol : RDMol, substruct_queries : dict[str, RDMol]) -> dict[str, bool]:
    '''Test an RDMol against every labelled substructure query
    Returns a dict with the same labels as the queries, each mapped to whether that query matches the target'''
    match_flags = {}
    for label, query_mol in substruct_queries.items():
        match_flags[label] = target_mol.HasSubstructMatch(query_mol)

    return match_flags

def matching_substruct_labels(target_mol : RDMol, substruct_queries : dict[str, RDMol]) -> Generator[str, None, None]:
    '''Lazily yield the label of each substructure query which matches the target RDMol, in the iteration order of the queries dict'''
    yield from (
        label
            for label, query_mol in substruct_queries.items()
                if target_mol.HasSubstructMatch(query_mol)
    )

In [48]:
def _hits_blacklist(row) -> bool:
    '''Whether the (unsanitized) monomer in a dataset row matches at least one blacklisted substructure'''
    unsanitized_mol = Chem.MolFromSmiles(row['smiles_monomer'], sanitize=False)
    return any(matching_substruct_labels(unsanitized_mol, chemical_blacklist))

# Row-wise mask of blacklisted entries, then the two complementary subsets of the dataset
to_exclude_by_chemistry = df.apply(_hits_blacklist, axis=1).to_numpy()
exclude_by_chemistry = df.loc[to_exclude_by_chemistry, :]
include_by_chemistry = df.loc[~to_exclude_by_chemistry, :]

In [None]:
all_elems = set() # accumulates every element symbol encountered across the dataset

# Substructure query whose matches are printed and drawn below; swap in any other RDKit query mol as needed
target_query = chemical_blacklist['silicon']

for i, row in df.iterrows():
    smiles = row['smiles_monomer']
    mol = Chem.MolFromSmiles(smiles)
    if mol is None: # RDKit returns None (rather than raising) for SMILES it cannot parse
        print(f'Row {i}: could not parse SMILES "{smiles}"')
        continue

    elems = {atom.GetSymbol() for atom in mol.GetAtoms()}

    if mol.HasSubstructMatch(target_query):
        print(row['mechanism'], i)
        display(mol)

    all_elems |= elems

## Visually inspecting monomer and oligomer units

In [None]:
from polymerist.monomers import specification

# Mechanism whose entries are inspected; uncomment one of the alternatives to switch
mech = 'ester'
# mech = 'carbonate'
# mech = 'urethane'
# mech = 'vinyl'
# mech = 'imide'

# ids = (23, 223)
ids = () # row indices to restrict the inspection to; empty means every one of the first n rows
n = 10

spacer = '=' * 25
frame = frames[mech]

test = defaultdict(list) # inspected monomers, keyed by their number of fragments
smaller = set()
for idx, row in frame.head(n).iterrows():
    if ids and (idx not in ids):
        continue

    # extract and visualize structures
    print(f'\n{spacer} {row["mechanism"]}-{idx} {spacer}\n')
    monomer_smiles = row['smiles_monomer']
    monomer_smiles = specification.expanded_SMILES(monomer_smiles, assign_map_nums=False)
    monomer = Chem.MolFromSmiles(monomer_smiles, sanitize=False)
    display(monomer)

    # dimer_smiles  = row['smiles_polymer_DP2']
    # dimer = Chem.MolFromSmiles(dimer_smiles)
    # display(dimer)

    # trimer_smiles = row['smiles_polymer_DP3']
    # trimer = Chem.MolFromSmiles(trimer_smiles)
    # display(trimer)

    # other info
    num_monos = row['num_monomers']
    frags = Chem.GetMolFrags(monomer, asMols=True)
    count_monos = len(frags)
    # sanity check: the fragment count must agree with the dataset's own monomer count
    assert num_monos == count_monos, f'Row {idx} lists {num_monos} monomers but its SMILES has {count_monos} fragments'

    for frag in frags:
        frag_smiles = Chem.MolToSmiles(frag)
        pcquery = pcp.get_compounds(frag_smiles, namespace='smiles')
        if not pcquery: # PubChem has no record for this fragment; popping an empty list would raise IndexError
            print(f'No PubChem record found for {frag_smiles}')
            continue
        pccomp = pcquery.pop()
        print(pccomp.iupac_name)

    test[count_monos].append(monomer)

In [None]:
monomer

# Testing Reactions

In [None]:
# Translation from NREL dataset mechanism names to the names of the pre-made rxn templates
rxn_lookup = dict(
    amide='polyamide',
    carbonate='polycarbonate_phosgene',
    ester='polyester',
    imide='polyimide',
    urethane='polyurethane_isocyanate',
    vinyl='polyvinyl_head_tail',
)

# the mapped mechanisms must be exactly the mechanisms present in the dataset
assert set(rxn_lookup) == set(df_grouper.groups)

In [None]:
# Print the name (without extension) of every .rxn file directly inside the reactions directory
rxn_dir = Path('poly_rxns')
for p in rxn_dir.iterdir():
    if p.suffix != '.rxn':
        continue
    print(p.stem)

In [None]:
RXN_FILES_PATH # the constant is spelled RXN_FILES_PATH; the former RXN_FILE_PATH raised a NameError

In [None]:
from polymerist.rdutils.reactions import reactions, reactors

# Load the reaction template mapped to the currently selected mechanism.
# NOTE(review): assumes each template is stored as "<template name>.rxn" inside RXN_FILES_PATH,
# and that `mech` still holds the mechanism chosen in the inspection cell - confirm both
rxn_path = RXN_FILES_PATH / f'{rxn_lookup[mech]}.rxn'
rxn = reactions.AnnotatedReaction.from_rxnfile(rxn_path)
rxn

In [None]:
from polymerist.monomers import specification

# Build an unsanitized RDMol from the expanded form of `monomer_smiles` and draw it
# NOTE(review): `monomer_smiles` is not defined in this cell - it is whatever value an earlier cell's loop left behind.
# If it comes from the inspection loop it has already been passed through expanded_SMILES once; confirm a second pass is harmless
exp_smiles = specification.expanded_SMILES(monomer_smiles, assign_map_nums=False)
reactant = Chem.MolFromSmiles(exp_smiles, sanitize=False)
display(reactant)

In [None]:
# Wrap the loaded reaction in a polymerization reactor and feed it two copies of the same reactant
reactor = reactors.PolymerizationReactor(rxn)
for dimer, frags in reactor.propagate([reactant, reactant]): # each step yields a (dimer, frags) pair
    display(dimer)
    for frag in frags: # draw every accompanying fragment after its dimer
        display(frag)

# Preprocessing and cleaning up NREL Urethane data

## Loading and inspecting raw data

In [None]:
# Read the NIPU / urethane spreadsheet from the raw-data directory
p = RAW_DATA_PATH.joinpath('nipu_urethanes.xlsx')
raw_table = pd.read_excel(p)
# the 'Monomers' column is stored as text; evaluate each entry back into a Python literal
raw_table['Monomers'] = raw_table['Monomers'].apply(literal_eval)

# Split the rows by their 'Chemistry' label
chemistry_column = raw_table['Chemistry']
nipus     = raw_table[chemistry_column == 'NIPU']
urethanes = raw_table[chemistry_column == 'urethane']

# 'Monomers' column of each subset, renumbered from 0 in existing order
nipus_mono, urethanes_mono = (
    subset['Monomers'].reset_index(drop=True)
        for subset in (nipus, urethanes)
)

In [None]:
raw_table

In [None]:
# Preview the monomers of a chosen subset; targ_mono is currently an empty dict, so the loop body never runs
# targ_mono = nipus_mono
targ_mono = {}

for i, monos in targ_mono.items():
    try:
        mono1, mono2 = monos # unpacking raises ValueError when the entry does not hold exactly two items
        display(Chem.MolFromSmiles(mono1)) # NOTE(review): only mono1 is drawn; mono2 is unpacked but never used - confirm intended
        
        print('='*50)
    except ValueError:
        print(f'Row {i} failed')

## Classify each monomer pair by respective reactive functional group

In [None]:
# Load the labelled functional-group SMARTS and compile each into an RDKit query mol
with open(RXN_FILES_PATH / 'fn_group_smarts.json', 'r') as fn_group_file:
    fn_group_SMARTS = json.load(fn_group_file)
fn_groups = dict(
    (group_name, Chem.MolFromSmarts(SMARTS))
        for group_name, SMARTS in fn_group_SMARTS.items()
)

# Functional groups of interest for each chemistry label
reaction_pairs = dict(
    NIPU=('cyclocarbonate', 'amine'),
    urethane=('isocyanate', 'hydroxyl'),
)

all_results = []
digroup_only_results = []
for i, (chemistry, monomer_pair) in raw_table.iterrows():
    monomer_entry = dict(Chemistry=chemistry)
    too_many_sites = False

    for SMILES in monomer_pair:
        rdmol = Chem.AddHs(Chem.MolFromSmiles(SMILES))
        # drop aromatic flags in favour of explicit alternating single/double bonds, to simplify structure matching
        Chem.Kekulize(rdmol, clearAromaticFlags=True)

        for group_name, fn_group in fn_groups.items():
            matches = rdmol.GetSubstructMatches(fn_group)
            if matches and (group_name in reaction_pairs[chemistry]):
                monomer_entry[group_name] = Chem.MolToSmarts(rdmol)
            if len(matches) > 2: # one over-substituted group anywhere in the pair flags the whole entry
                too_many_sites = True

    # every entry is kept in the full list; flagged entries are left out of the digroup-only list
    all_results.append(monomer_entry)
    if too_many_sites:
        continue
    digroup_only_results.append(monomer_entry)

## Save to csv files for future reference

In [None]:
# Tabulate the classified monomer entries
all_table = pd.DataFrame.from_records(all_results)
digroup_only_table = pd.DataFrame.from_records(digroup_only_results)

# to_csv does not create missing parent directories, so make sure the output directory exists first
PROC_DATA_PATH.mkdir(parents=True, exist_ok=True)
all_table.to_csv(PROC_DATA_PATH / 'clean_smarts_all.csv')
digroup_only_table.to_csv(PROC_DATA_PATH / 'clean_smarts_digroup.csv')