# Core Imports

In [1]:
# Generic Imports
from ast import literal_eval

# Numeric imports
import matplotlib.pyplot as plt
import pandas as pd

# File I/O
from pathlib import Path

# Cheminformatics
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole

# Size of inline RDKit molecule depictions: DIM is the image height in pixels,
# ASPECT the width-to-height ratio
DIM = 300
ASPECT = 3 / 2
IPythonConsole.molSize = int(ASPECT * DIM), DIM  # (width, height)

# Static Paths (relative, so they resolve against the current working directory)
RAW_DATA_DIR, FMT_DATA_DIR, PROC_DATA_DIR = (
    Path(dir_name)
    for dir_name in (
        'monomer_data_raw',
        'monomer_data_formatted',
        'monomer_data_processed',
    )
)
RXN_FILES_DIR = Path('poly_rxns')
# Alternative reaction directory, kept for reference:
# RXN_FILES_DIR = Path('rxn_smarts')

# Reformatting data to be compliant with the monomer pipeline

In [3]:
# Per-column converters applied while reading the CSV: tuples that were stored
# as strings are parsed back into Python objects. Both capitalizations of the
# monomer column name are registered.
decoder_dict = dict.fromkeys(('monomers', 'Monomers'), literal_eval)

# Alternative inputs, kept for reference:
# input_data_path = RAW_DATA_DIR / 'nipu_urethanes.csv'
# input_data_path = RAW_DATA_DIR / '221010_trainingdata_DP-18_expanded.csv'
input_data_path = FMT_DATA_DIR.joinpath(
    '20231114_polyid_data_density_DP2-6 - 1,2 monomers.csv'
)

raw_df = pd.read_csv(input_data_path, converters=decoder_dict)
raw_df

Unnamed: 0,smiles_monomer,smiles_polymer_DP2,smiles_polymer_DP3,smiles_polymer_DP6,smiles_polymer_DP18,num_monomers,mechanism,Glass_Transition,Melt_Temp,Cp_solid_slope,...,log10_ElongBreak,YoungMod,Tensile_Strength,Density,log10_Permeability_CH4,log10_Permeability_CO2,log10_Permeability_N2,log10_Permeability_O2,log10_Permeability_H2,log10_Permeability_H2O
0,O=C(Cl)Cl.Oc1ccc(C(c2ccc(O)cc2)(C(F)(F)F)C(F)(...,O=C(Cl)Oc1ccc(C(c2ccc(O)cc2)(C(F)(F)F)C(F)(F)F...,O=C(Oc1ccc(C(c2ccc(O)cc2)(C(F)(F)F)C(F)(F)F)cc...,O=C(Cl)Oc1ccc(C(c2ccc(OC(=O)Oc3ccc(C(c4ccc(OC(...,O=C(Cl)Oc1ccc(C(c2ccc(OC(=O)Oc3ccc(C(c4ccc(OC(...,2,carbonate,176.000,,,...,,,,1.479,0.017759,1.380211,0.225687,0.838849,,
1,C=CSCCC,CCCSCCC(C)SCCC,CCCSC(C)CC(SCCC)C(C)SCCC,CCCSCCC(CCC(SCCC)C(CCC(CCSCCC)SCCC)SCCC)SCCC,CCCSCCCC(CC(CC(SCCC)C(CC(CCC(SCCC)C(CC(CC(CCC(...,1,vinyl,,,,...,,,,1.021,,,,,,
2,C=C(C#N)C(=O)OCC,CCOC(=O)C(C#N)CC(C)(C#N)C(=O)OCC,CCOC(=O)C(C#N)CC(C#N)(CC(C)(C#N)C(=O)OCC)C(=O)OCC,CCOC(=O)C(C#N)CCC(C#N)(C(=O)OCC)C(C#N)(CC(C#N)...,CCOC(=O)C(C#N)CCC(C#N)(CC(C#N)(C(=O)OCC)C(C#N)...,1,vinyl,131.850,,,...,,2200.0,,1.224,,,,,,
3,C=C(C)c1ccccc1,CC(CC(C)(C)c1ccccc1)c1ccccc1,CC(CCC(C)(CC(C)c1ccccc1)c1ccccc1)c1ccccc1,CC(CCC(C)(CC(C)(CC(C)(c1ccccc1)C(C)(CCC(C)c1cc...,CC(CCC(C)(c1ccccc1)C(C)(CCC(C)(CC(C)(CC(C)(c1c...,1,vinyl,171.850,,0.51,...,,,,1.070,,,,,,
4,C=C(C)C(=O)OC1CCCCC1,CC(CCC(C)C(=O)OC1CCCCC1)C(=O)OC1CCCCC1,CC(CCC(C)(CC(C)C(=O)OC1CCCCC1)C(=O)OC1CCCCC1)C...,CC(CCC(C)(CC(C)(CC(C)(C(=O)OC1CCCCC1)C(C)(CCC(...,CC(CCC(C)(CC(C)(CC(C)(CC(C)(CC(C)(CC(C)(C(=O)O...,1,vinyl,104.350,,,...,,,,1.100,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461,NCCCCCCCN.O=C(O)CCCCCCCCCCSCCCCSCCCCCCCCCCC(=O)O,NCCCCCCCNC(=O)CCCCCCCCCCSCCCCSCCCCCCCCCCC(=O)O,O=C(O)CCCCCCCCCCSCCCCSCCCCCCCCCCC(=O)NCCCCCCCN...,NCCCCCCCNC(=O)CCCCCCCCCCSCCCCSCCCCCCCCCCC(=O)N...,NCCCCCCCNC(=O)CCCCCCCCCCSCCCCSCCCCCCCCCCC(=O)N...,2,amide,31.000,,,...,,,,1.040,,,,,,
462,O=C(Cl)Cl.Oc1ccc(C2(c3ccc(O)cc3)CCCCCC2)cc1,O=C(Cl)Oc1ccc(C2(c3ccc(O)cc3)CCCCCC2)cc1,O=C(Cl)Oc1ccc(C2(c3ccc(OC(=O)Cl)cc3)CCCCCC2)cc1,O=C(Cl)Oc1ccc(C2(c3ccc(OC(=O)Oc4ccc(C5(c6ccc(O...,O=C(Cl)Oc1ccc(C2(c3ccc(OC(=O)Oc4ccc(C5(c6ccc(O...,2,carbonate,155.000,,,...,,,,1.203,,,,-0.119186,,
463,OCCCCO.O=C(O)CCCCC(=O)O,O=C(O)CCCCC(=O)OCCCCO,O=C(O)CCCCC(=O)OCCCCOC(=O)CCCCC(=O)O,O=C(O)CCCCC(=O)OCCCCOC(=O)CCCCC(=O)OCCCCOC(=O)...,O=C(O)CCCCC(=O)OCCCCOC(=O)CCCCC(=O)OCCCCOC(=O)...,2,ester,-43.050,57.4,,...,,,16.2,1.060,,,,,,
464,O=C1OC(=O)c2cc(-c3ccc4c(c3)C(=O)OC4=O)ccc21.Nc...,Nc1ccc(Oc2ccc(C(c3ccc(Oc4cccc(Oc5ccc(C(c6ccc(O...,Nc1ccc(Oc2ccc(C(c3ccc(Oc4cccc(Oc5ccc(C(c6ccc(O...,Nc1ccc(Oc2ccc(C(c3ccc(Oc4cccc(Oc5ccc(C(c6ccc(O...,Nc1ccc(Oc2ccc(C(c3ccc(Oc4cccc(Oc5ccc(C(c6ccc(O...,2,imide,285.000,,,...,,,,1.316,,,,,,


## Save reformatted copy for further processing

In [4]:
def _join_monomers(smi):
    """Collapse a tuple of SMILES into one '.'-delimited string; return any other value unchanged."""
    if isinstance(smi, tuple):
        return '.'.join(smi)
    return smi


# Old header -> header used in the rest of this notebook
new_col_names = dict(
    Chemistry='mechanism',
    Monomers='smiles_monomer',
)

# rename() returns a new frame, so raw_df itself is left untouched
fmt_df = raw_df.rename(columns=new_col_names)
fmt_df['smiles_monomer'] = fmt_df['smiles_monomer'].map(_join_monomers)
fmt_df

Unnamed: 0,smiles_monomer,smiles_polymer_DP2,smiles_polymer_DP3,smiles_polymer_DP6,smiles_polymer_DP18,num_monomers,mechanism,Glass_Transition,Melt_Temp,Cp_solid_slope,...,log10_ElongBreak,YoungMod,Tensile_Strength,Density,log10_Permeability_CH4,log10_Permeability_CO2,log10_Permeability_N2,log10_Permeability_O2,log10_Permeability_H2,log10_Permeability_H2O
0,O=C(Cl)Cl.Oc1ccc(C(c2ccc(O)cc2)(C(F)(F)F)C(F)(...,O=C(Cl)Oc1ccc(C(c2ccc(O)cc2)(C(F)(F)F)C(F)(F)F...,O=C(Oc1ccc(C(c2ccc(O)cc2)(C(F)(F)F)C(F)(F)F)cc...,O=C(Cl)Oc1ccc(C(c2ccc(OC(=O)Oc3ccc(C(c4ccc(OC(...,O=C(Cl)Oc1ccc(C(c2ccc(OC(=O)Oc3ccc(C(c4ccc(OC(...,2,carbonate,176.000,,,...,,,,1.479,0.017759,1.380211,0.225687,0.838849,,
1,C=CSCCC,CCCSCCC(C)SCCC,CCCSC(C)CC(SCCC)C(C)SCCC,CCCSCCC(CCC(SCCC)C(CCC(CCSCCC)SCCC)SCCC)SCCC,CCCSCCCC(CC(CC(SCCC)C(CC(CCC(SCCC)C(CC(CC(CCC(...,1,vinyl,,,,...,,,,1.021,,,,,,
2,C=C(C#N)C(=O)OCC,CCOC(=O)C(C#N)CC(C)(C#N)C(=O)OCC,CCOC(=O)C(C#N)CC(C#N)(CC(C)(C#N)C(=O)OCC)C(=O)OCC,CCOC(=O)C(C#N)CCC(C#N)(C(=O)OCC)C(C#N)(CC(C#N)...,CCOC(=O)C(C#N)CCC(C#N)(CC(C#N)(C(=O)OCC)C(C#N)...,1,vinyl,131.850,,,...,,2200.0,,1.224,,,,,,
3,C=C(C)c1ccccc1,CC(CC(C)(C)c1ccccc1)c1ccccc1,CC(CCC(C)(CC(C)c1ccccc1)c1ccccc1)c1ccccc1,CC(CCC(C)(CC(C)(CC(C)(c1ccccc1)C(C)(CCC(C)c1cc...,CC(CCC(C)(c1ccccc1)C(C)(CCC(C)(CC(C)(CC(C)(c1c...,1,vinyl,171.850,,0.51,...,,,,1.070,,,,,,
4,C=C(C)C(=O)OC1CCCCC1,CC(CCC(C)C(=O)OC1CCCCC1)C(=O)OC1CCCCC1,CC(CCC(C)(CC(C)C(=O)OC1CCCCC1)C(=O)OC1CCCCC1)C...,CC(CCC(C)(CC(C)(CC(C)(C(=O)OC1CCCCC1)C(C)(CCC(...,CC(CCC(C)(CC(C)(CC(C)(CC(C)(CC(C)(CC(C)(C(=O)O...,1,vinyl,104.350,,,...,,,,1.100,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461,NCCCCCCCN.O=C(O)CCCCCCCCCCSCCCCSCCCCCCCCCCC(=O)O,NCCCCCCCNC(=O)CCCCCCCCCCSCCCCSCCCCCCCCCCC(=O)O,O=C(O)CCCCCCCCCCSCCCCSCCCCCCCCCCC(=O)NCCCCCCCN...,NCCCCCCCNC(=O)CCCCCCCCCCSCCCCSCCCCCCCCCCC(=O)N...,NCCCCCCCNC(=O)CCCCCCCCCCSCCCCSCCCCCCCCCCC(=O)N...,2,amide,31.000,,,...,,,,1.040,,,,,,
462,O=C(Cl)Cl.Oc1ccc(C2(c3ccc(O)cc3)CCCCCC2)cc1,O=C(Cl)Oc1ccc(C2(c3ccc(O)cc3)CCCCCC2)cc1,O=C(Cl)Oc1ccc(C2(c3ccc(OC(=O)Cl)cc3)CCCCCC2)cc1,O=C(Cl)Oc1ccc(C2(c3ccc(OC(=O)Oc4ccc(C5(c6ccc(O...,O=C(Cl)Oc1ccc(C2(c3ccc(OC(=O)Oc4ccc(C5(c6ccc(O...,2,carbonate,155.000,,,...,,,,1.203,,,,-0.119186,,
463,OCCCCO.O=C(O)CCCCC(=O)O,O=C(O)CCCCC(=O)OCCCCO,O=C(O)CCCCC(=O)OCCCCOC(=O)CCCCC(=O)O,O=C(O)CCCCC(=O)OCCCCOC(=O)CCCCC(=O)OCCCCOC(=O)...,O=C(O)CCCCC(=O)OCCCCOC(=O)CCCCC(=O)OCCCCOC(=O)...,2,ester,-43.050,57.4,,...,,,16.2,1.060,,,,,,
464,O=C1OC(=O)c2cc(-c3ccc4c(c3)C(=O)OC4=O)ccc21.Nc...,Nc1ccc(Oc2ccc(C(c3ccc(Oc4cccc(Oc5ccc(C(c6ccc(O...,Nc1ccc(Oc2ccc(C(c3ccc(Oc4cccc(Oc5ccc(C(c6ccc(O...,Nc1ccc(Oc2ccc(C(c3ccc(Oc4cccc(Oc5ccc(C(c6ccc(O...,Nc1ccc(Oc2ccc(C(c3ccc(Oc4cccc(Oc5ccc(C(c6ccc(O...,2,imide,285.000,,,...,,,,1.316,,,,,,


In [None]:
# The output name is the input file's stem plus an "_FMT" suffix, always placed
# in FMT_DATA_DIR regardless of which directory the input was read from
fmt_data_path = FMT_DATA_DIR / f'{input_data_path.stem}_FMT.csv'

# The output directory is only guaranteed to exist when the input was itself
# read from FMT_DATA_DIR; create it so inputs from RAW_DATA_DIR also work
fmt_data_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: the frame's own row index is not written to the file
fmt_df.to_csv(fmt_data_path, index=False)

# Producing a combined master dataset from the individual processed datasets

In [2]:
# File names (inside PROC_DATA_DIR) of the datasets to stack into the master
dsets_to_combine_paths = [
    'nipu_urethanes_FILTERED.csv',
    '20231114_polyid_data_density_DP2-6 - 1,2 monomers_FILTERED.csv',
]

# Load each dataset, using its first CSV column as the row index
processed_frames = [
    pd.read_csv(PROC_DATA_DIR / path_name, index_col=0)
    for path_name in dsets_to_combine_paths
]

# Stack row-wise. Each frame keeps its own index labels.
# NOTE(review): labels may therefore repeat across sources in the master file - confirm this is intended.
master = pd.concat(processed_frames, axis=0)

master.to_csv(PROC_DATA_DIR / 'monomer_data_MASTER.csv')