# Core Imports

In [1]:
# Generic Imports
import re
from functools import partial, cached_property
from collections import defaultdict
from itertools import combinations, chain
from ast import literal_eval

# Numeric imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# File I/O
from pathlib import Path
import csv, json, openpyxl

# Typing and Subclassing
from typing import Any, Callable, ClassVar, Generator, Iterable, Optional, Union
from dataclasses import dataclass, field
from abc import ABC, abstractmethod, abstractproperty
from openmm.unit import Unit, Quantity

# Cheminformatics
from rdkit import Chem
from rdkit.Chem import rdChemReactions
from rdkit.Chem.Draw import IPythonConsole

import pubchempy as pcp

# Molecule depiction size for RDKit's notebook renderer
DIM    = 300   # base image dimension
ASPECT = 3/2   # ratio of the first molSize entry to the second
IPythonConsole.molSize = (int(ASPECT*DIM), DIM)   # evaluates to (450, 300); presumably (width, height) in pixels

# Static Paths (relative, so they resolve against the current working directory)
RAW_DATA_DIR  = Path('monomer_data_raw')
FMT_DATA_DIR  = Path('monomer_data_formatted')
PROC_DATA_DIR = Path('monomer_data_processed')
RXN_FILES_DIR = Path('poly_rxns')

# Inspecting monomer dataset

In [None]:
# Candidate input datasets: leave exactly one `input_data_path` assignment uncommented
# input_data_path = FMT_DATA_DIR / '20231114_polyid_data_density_DP2-6 - 1,2 monomers.csv'
# input_data_path = FMT_DATA_DIR / 'polyurethanes.csv'
# input_data_path = FMT_DATA_DIR / '221010_trainingdata_DP-18_expanded_FMT.csv'
input_data_path = PROC_DATA_DIR / '20231114_polyid_data_density_DP2-6 - 1,2 monomers_FILTERED.csv'
# input_data_path = PROC_DATA_DIR / 'monomer_data_MASTER.csv'
# input_data_path = FMT_DATA_DIR / 'nipu_urethanes_FMT.csv'
df = pd.read_csv(input_data_path, index_col=0) # the first CSV column becomes the row index
len(df) # trailing expression, so the notebook echoes the number of rows loaded

## Group by polymerization mechanism and number of monomers, obtain respective counts and colors

In [None]:
from polymerist.graphics import plotutils

# Derive the per-entry monomer count only when the dataset does not already supply it.
# Individual monomers within a combined SMILES string are delimited by '.',
# so the number of monomers is one more than the number of delimiters
if 'num_monomers' not in df.columns:
    df['num_monomers'] = df['smiles_monomer'].map(lambda smi : smi.count('.') + 1)

# Grouping order determines the order of the entries in each group key
# keys = ['num_monomers', 'mechanism']
keys = ['mechanism', 'num_monomers']

# Partition the dataset, then collect each sub-frame under its group key
# (a tuple with one entry per element of `keys`)
df_grouper = df.groupby(keys)
frames = {}
for group_key in df_grouper.groups:
    frames[group_key] = df_grouper.get_group(group_key)

In [None]:
# Number of dataset rows in each group, keyed by group key
group_sizes = df_grouper.size()
counts = group_sizes.to_dict()

# Request one hue of the 'tab10' palette per distinct mechanism.
# NOTE(review): `cdict` is later indexed by mechanism name and `carr` is later passed
# to plt.imshow; their exact structure is defined by plotutils.label_discrete_cmap - confirm there
mechanism_names = df['mechanism'].unique()
cdict, carr = plotutils.label_discrete_cmap(
    cmap=plt.get_cmap('tab10'),
    color_names=mechanism_names,
    hues_per_color=1,
)

In [None]:
# Echo the group sizes (keyed by (mechanism, num_monomers) tuples) as the cell output
counts

In [None]:
# One bar per (mechanism, num_monomers) group: "<mechanism>-<num_monomers>" label -> group size
plot_counts = {
    f'{mech}-{num_mono}' : count_val
        for (mech, num_mono), count_val in counts.items()
}

# Bar colors, in the same order as the labels above; every bar takes the color of its mechanism
colors = [cdict[mech] for (mech, _) in counts]

### Plot bar chart of entry counts by number of monomers and mechanism

In [None]:
# Show the color array returned alongside the mechanism -> color lookup
plt.imshow(carr)

# NOTE(review): presumably sets up a fresh, pre-sized 1x1 figure for the bar chart - confirm in plotutils
plotutils.presize_subplots(1, 1, scale=10, elongation=1/2)

# One bar per group, colored by mechanism
bar_labels  = list(plot_counts.keys())
bar_heights = list(plot_counts.values())
plt.bar(bar_labels, bar_heights, color=colors)

# Categorical bars sit at integer positions 0..N-1; keep a tick on each and slant the labels
plt.xticks(np.arange(len(counts)), rotation=-45)

key_desc = ' & '.join(keys)
plt.title(f'Number of SMILES by {key_desc} ({len(df)} total)')

# Visually inspecting monomer and oligomer units

In [None]:
from polymerist.monomers import specification

# Select which (mechanism, num_monomers) group to inspect; leave one `mech` uncommented
# mech = 'ester'
mech = 'carbonate'
# mech = 'urethane'
# mech = 'vinyl'
# mech = 'imide'
num_mono = 2 # 1

frame = frames[(mech, num_mono)]

# Optional filter on row index labels; an empty tuple means "show every inspected row"
# NOTE(review): the filter is applied only to the first `n` rows of the frame, so ids
# located further down are never shown - raise `n` if that is not what is wanted
# ids = (23, 223)
ids = ()
n = 10
spacer = '=' * 25

# NOTE(review): `test` and `smaller` are not used within this cell;
# presumably scratch containers for other cells, so they are kept as-is
test = defaultdict(list)
smaller = set()
for idx, row in frame.head(n).iterrows():
    # skip rows excluded by a non-empty id filter
    if ids and (idx not in ids):
        continue

    # extract and visualize structures
    print(f'\n{spacer} {row["mechanism"]}-{idx} {spacer}\n')
    combined_smiles = row['smiles_monomer']
    combined_monomer = Chem.MolFromSmiles(combined_smiles, sanitize=False)
    display(combined_monomer)

    # dimer_smiles  = row['smiles_polymer_DP2']
    # dimer = Chem.MolFromSmiles(dimer_smiles)
    # display(dimer)

    # trimer_smiles = row['smiles_polymer_DP3']
    # trimer = Chem.MolFromSmiles(trimer_smiles)
    # display(trimer)

    # Visit only as many per-monomer columns as the selected group has monomers
    # (previously hard-coded to monomers 1 and 2, regardless of `num_mono`).
    # The loop variable is deliberately not called `id`: at notebook scope that
    # would overwrite the builtin id() for every cell executed afterwards
    for mono_id in range(1, num_mono + 1):
        exp_mono_smi = row[f'smiles_expanded_monomer_{mono_id}']
        monomer = Chem.MolFromSmiles(exp_mono_smi, sanitize=False)
        iupac_name = row[f'IUPAC_monomer_{mono_id}']

        display(monomer)
        print(iupac_name)

# Concatenating PU and Density data sets into "master" data set

In [9]:
# Filtered source datasets which make up the master dataset
p1 = PROC_DATA_DIR / '20231114_polyid_data_density_DP2-6 - 1,2 monomers_FILTERED.csv'
p2 = PROC_DATA_DIR / 'nipu_urethanes_FILTERED.csv'

# Both files are loaded with their first two columns as a two-level row index
# NOTE(review): the data-inspection cell reads the first of these files with index_col=0
# instead - confirm which index layout is intended
read_indexed = partial(pd.read_csv, index_col=[0, 1])
df1 = read_indexed(p1)
df2 = read_indexed(p2)

# Stack the two tables row-wise and write the result out as the master dataset
master_df = pd.concat([df1, df2], axis=0)
master_df.to_csv(PROC_DATA_DIR / 'monomer_data_MASTER.csv')