# Core Imports

In [1]:
# Generic Imports
from ast import literal_eval

# Numeric imports
import matplotlib.pyplot as plt
import pandas as pd

# File I/O
from pathlib import Path

# Cheminformatics
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole

# Size of inline molecule depictions: the first entry of molSize is DIM scaled by ASPECT,
# the second is DIM itself (presumably width x height in pixels -- confirm against RDKit docs)
DIM = 300
ASPECT = 3 / 2
IPythonConsole.molSize = (int(DIM * ASPECT), DIM)

# Static Paths (all relative to the current working directory)
RAW_DATA_DIR = Path('monomer_data_raw')          # datasets as originally received
FMT_DATA_DIR = Path('monomer_data_formatted')    # datasets with pipeline-compliant columns
PROC_DATA_DIR = Path('monomer_data_processed')   # datasets after processing/filtering
RXN_FILES_DIR = Path('poly_rxns')
# RXN_FILES_DIR = Path('rxn_smarts')

# Reformatting data to be compliant with monomer pipeline

In [None]:
# Monomer columns are stored in the CSV as stringified tuples; parse them back into
# Python objects on load. Both capitalizations of the column header are registered.
decoder_dict = dict.fromkeys(('monomers', 'Monomers'), literal_eval)

# Select the raw dataset to reformat (swap the commented line to switch datasets)
# input_data_path = RAW_DATA_DIR / 'nipu_urethanes.csv'
input_data_path = RAW_DATA_DIR / '221010_trainingdata_DP-18_expanded.csv'

raw_df = pd.read_csv(
    input_data_path,
    converters=decoder_dict,
)
raw_df

## Save reformatted copy for further processing

In [None]:
# Map raw column headers onto the names used by the rest of this notebook.
# Both capitalizations of the monomer column are listed, mirroring the headers accepted
# by `decoder_dict` when the raw file is loaded; `rename` silently skips any key that is
# not present in the frame, so listing both is harmless.
new_col_names = {
    'Chemistry' : 'mechanism',
    'Monomers'  : 'smiles_monomer',
    'monomers'  : 'smiles_monomer',
}

fmt_df = raw_df.rename(new_col_names, axis='columns')

# Collapse each tuple of monomer SMILES into a single '.'-delimited string;
# entries which are not tuples are passed through unchanged
fmt_df['smiles_monomer'] = fmt_df['smiles_monomer'].map(
    lambda smi : '.'.join(smi) if isinstance(smi, tuple) else smi
)
fmt_df

In [None]:
# Write the reformatted table next to the other formatted datasets, tagging the
# original file stem with an "_FMT" suffix.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output directory exists first (no-op when it is already there).
FMT_DATA_DIR.mkdir(parents=True, exist_ok=True)
fmt_data_path = FMT_DATA_DIR / f'{input_data_path.stem}_FMT.csv'
fmt_df.to_csv(fmt_data_path, index=False)  # row index is not written to the file

# Producing combined master dataset from individual processed datasets 

In [2]:
# File names (inside PROC_DATA_DIR) of the processed datasets to merge
dsets_to_combine_paths = [
    'nipu_urethanes_FILTERED.csv',
    '20231114_polyid_data_density_DP2-6 - 1,2 monomers_FILTERED.csv',
]

# Load each dataset, using its first column as the row index
processed_frames = [
    pd.read_csv(PROC_DATA_DIR / path_name, index_col=0)
    for path_name in dsets_to_combine_paths
]

# Stack the datasets row-wise (each frame keeps its own index labels) and save the result
master = pd.concat(processed_frames, axis=0)
master.to_csv(PROC_DATA_DIR / 'monomer_data_MASTER.csv')