# **EDA**


Importing and inspecting data

In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem

In [11]:
# Load the raw USPTO-50K reaction table from the working directory.
df = pd.read_csv(filepath_or_buffer="USPTO_50K.csv")

In [12]:
# Each reaction string is split on '>' into three parts, stored as reactants / agents / products.
df[['reactants', 'agents', 'products']] = df['reactions'].str.split(pat='>', expand=True)
# Only the reactant and product strings are kept; the other columns are dropped.
df = df.drop(['id', 'class', 'reactions', 'agents'], axis=1)
df

Unnamed: 0,reactants,products
0,COC(=O)[C@H](CCCCNC(=O)OCc1ccccc1)NC(=O)Nc1cc(...,COC(=O)[C@H](CCCCN)NC(=O)Nc1cc(OC)cc(C(C)(C)C)c1O
1,Nc1cccc2cnccc12.O=C(O)c1cc([N+](=O)[O-])c(Sc2c...,O=C(Nc1cccc2cnccc12)c1cc([N+](=O)[O-])c(Sc2c(C...
2,CCNCC.Cc1nc(-c2ccc(C=O)cc2)sc1COc1ccc([C@H](CC...,CCN(CC)Cc1ccc(-c2nc(C)c(COc3ccc([C@H](CC(=O)N4...
3,CC1(C)CCC(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(NC...,CC1(C)CCC(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(NC...
4,CCOc1ccc(Oc2ncnc3c2cnn3C2CCNCC2)c(F)c1.O=C(Cl)...,CCOc1ccc(Oc2ncnc3c2cnn3C2CCN(C(=O)OC3CCCC3)CC2...
...,...,...
50011,CCOC(=O)N1CCc2ccc3c(c2CC1)C(O)(C1CC1)CC3,CCOC(=O)N1CCc2ccc3c(c2CC1)C(C1CC1)CC3
50012,Brc1cccc(C=C2c3ccccc3CCc3ccccc32)c1.N#C[Cu],N#Cc1cccc(C=C2c3ccccc3CCc3ccccc32)c1
50013,Cc1noc(C)c1-c1c(-c2ccc(O)cc2)c2ccccc2n1C=O.NO,Cc1noc(C)c1-c1c(-c2ccc(O)cc2)c2ccccc2n1C=NO
50014,O=C(NC1CC1)c1ccc(-c2cnc3c(NCCCO)nc(Br)cn23)cc1,O=C(NC1CC1)c1ccc(-c2cnc3c(NCCCO)nccn23)cc1


Each entry in the `reactants` and `products` columns is a SMILES string that can contain several molecules separated by dots (`.`).

1. Split these into individual molecules (fragments).

2. Store each molecule in its own column (e.g., `reactant_1`, `reactant_2`, ...).


In [13]:
# Check dataset shape
print(f"Dataset shape: {df.shape} (rows, columns)")

# Preview first few rows
print("\nFirst 5 rows:")
print(df.head())

# Column names
print("\nColumn names:")
print(df.columns.tolist())

# Missing values summary
print("\nMissing values per column:")
print(df.isnull().sum())

# Data types of each column
print("\nData types:")
print(df.dtypes)

# Summary statistics for every column (include='all' also covers the string columns)
print("\nSummary statistics:")
print(df.describe(include='all'))

# Check for duplicate rows
duplicates = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")

# Check for inconsistencies: empty or whitespace-only strings.
# A plain `df == ''` comparison misses cells such as ' ', so each string is stripped
# before the comparison; non-string values (e.g. NaN) are never counted as blank.
print("\nNumber of rows with blank (empty string) entries:")
blank_counts = df.apply(
    lambda column: column.map(lambda value: isinstance(value, str) and value.strip() == '')
).sum()
print(blank_counts)


Dataset shape: (50016, 2) (rows, columns)

First 5 rows:
                                           reactants  \
0  COC(=O)[C@H](CCCCNC(=O)OCc1ccccc1)NC(=O)Nc1cc(...   
1  Nc1cccc2cnccc12.O=C(O)c1cc([N+](=O)[O-])c(Sc2c...   
2  CCNCC.Cc1nc(-c2ccc(C=O)cc2)sc1COc1ccc([C@H](CC...   
3  CC1(C)CCC(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(NC...   
4  CCOc1ccc(Oc2ncnc3c2cnn3C2CCNCC2)c(F)c1.O=C(Cl)...   

                                            products  
0  COC(=O)[C@H](CCCCN)NC(=O)Nc1cc(OC)cc(C(C)(C)C)c1O  
1  O=C(Nc1cccc2cnccc12)c1cc([N+](=O)[O-])c(Sc2c(C...  
2  CCN(CC)Cc1ccc(-c2nc(C)c(COc3ccc([C@H](CC(=O)N4...  
3  CC1(C)CCC(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(NC...  
4  CCOc1ccc(Oc2ncnc3c2cnn3C2CCN(C(=O)OC3CCCC3)CC2...  

Column names:
['reactants', 'products']

Missing values per column:
reactants    0
products     0
dtype: int64

Data types:
reactants    object
products     object
dtype: object

Summary statistics:
                             reactants                    products
count    

In [None]:
# Remove fully identical rows, keeping the first occurrence (original index labels are kept).
df = df.drop_duplicates(subset=None, keep='first')

In [6]:

def split_smiles_into_fragments(smiles):
    """Split a (possibly multi-molecule) SMILES string into per-molecule SMILES.

    Args:
        smiles: SMILES string; disconnected molecules are separated by '.'.

    Returns:
        list[str]: One canonical SMILES per disconnected fragment, in the order
        produced by ``Chem.GetMolFrags``. An empty list is returned when the
        input is not a string, cannot be parsed, or fragment extraction fails.
    """
    # Missing values (None/NaN) and other non-string cells have nothing to split.
    if not isinstance(smiles, str):
        return []
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:  # the parser could not build a molecule from this string
            return []
        frags = Chem.GetMolFrags(mol, asMols=True)
        return [Chem.MolToSmiles(frag, canonical=True) for frag in frags]
    except Exception:
        # Best-effort: one bad row must not abort a whole-column .apply().
        # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit propagate,
        # so a long-running cell can still be interrupted.
        return []


In [None]:
# Break every reactant string into its individual molecules (one list per reaction).
reactant_lists = df['reactants'].apply(split_smiles_into_fragments)
df.loc[:, 'reactant_fragments'] = reactant_lists


In [20]:
# Display the DataFrame (pandas shows only the first and last rows of a long frame).
# NOTE(review): the output saved under this cell also lists reactant_1..3 and
# product_fragments, which are only created further down — it looks like the cell was
# last run out of order; re-run the notebook top to bottom to confirm.
df

Unnamed: 0,reactants,products,reactant_fragments,reactant_1,reactant_2,reactant_3,product_fragments
0,COC(=O)[C@H](CCCCNC(=O)OCc1ccccc1)NC(=O)Nc1cc(...,COC(=O)[C@H](CCCCN)NC(=O)Nc1cc(OC)cc(C(C)(C)C)c1O,[COC(=O)[C@H](CCCCNC(=O)OCc1ccccc1)NC(=O)Nc1cc...,COC(=O)[C@H](CCCCNC(=O)OCc1ccccc1)NC(=O)Nc1cc(...,,,[COC(=O)[C@H](CCCCN)NC(=O)Nc1cc(OC)cc(C(C)(C)C...
1,Nc1cccc2cnccc12.O=C(O)c1cc([N+](=O)[O-])c(Sc2c...,O=C(Nc1cccc2cnccc12)c1cc([N+](=O)[O-])c(Sc2c(C...,"[Nc1cccc2cnccc12, O=C(O)c1cc([N+](=O)[O-])c(Sc...",Nc1cccc2cnccc12,O=C(O)c1cc([N+](=O)[O-])c(Sc2c(Cl)cncc2Cl)s1,,[O=C(Nc1cccc2cnccc12)c1cc([N+](=O)[O-])c(Sc2c(...
2,CCNCC.Cc1nc(-c2ccc(C=O)cc2)sc1COc1ccc([C@H](CC...,CCN(CC)Cc1ccc(-c2nc(C)c(COc3ccc([C@H](CC(=O)N4...,"[CCNCC, Cc1nc(-c2ccc(C=O)cc2)sc1COc1ccc([C@H](...",CCNCC,Cc1nc(-c2ccc(C=O)cc2)sc1COc1ccc([C@H](CC(=O)N2...,,[CCN(CC)Cc1ccc(-c2nc(C)c(COc3ccc([C@H](CC(=O)N...
3,CC1(C)CCC(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(NC...,CC1(C)CCC(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(NC...,[CC1(C)CCC(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(N...,CC1(C)CCC(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(NC...,O=C(CF)CF,,[CC1(C)CCC(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(N...
4,CCOc1ccc(Oc2ncnc3c2cnn3C2CCNCC2)c(F)c1.O=C(Cl)...,CCOc1ccc(Oc2ncnc3c2cnn3C2CCN(C(=O)OC3CCCC3)CC2...,"[CCOc1ccc(Oc2ncnc3c2cnn3C2CCNCC2)c(F)c1, O=C(C...",CCOc1ccc(Oc2ncnc3c2cnn3C2CCNCC2)c(F)c1,O=C(Cl)OC1CCCC1,,[CCOc1ccc(Oc2ncnc3c2cnn3C2CCN(C(=O)OC3CCCC3)CC...
...,...,...,...,...,...,...,...
50011,CCOC(=O)N1CCc2ccc3c(c2CC1)C(O)(C1CC1)CC3,CCOC(=O)N1CCc2ccc3c(c2CC1)C(C1CC1)CC3,[CCOC(=O)N1CCc2ccc3c(c2CC1)C(O)(C1CC1)CC3],CCOC(=O)N1CCc2ccc3c(c2CC1)C(O)(C1CC1)CC3,,,[CCOC(=O)N1CCc2ccc3c(c2CC1)C(C1CC1)CC3]
50012,Brc1cccc(C=C2c3ccccc3CCc3ccccc32)c1.N#C[Cu],N#Cc1cccc(C=C2c3ccccc3CCc3ccccc32)c1,"[Brc1cccc(C=C2c3ccccc3CCc3ccccc32)c1, N#[C][Cu]]",Brc1cccc(C=C2c3ccccc3CCc3ccccc32)c1,N#[C][Cu],,[N#Cc1cccc(C=C2c3ccccc3CCc3ccccc32)c1]
50013,Cc1noc(C)c1-c1c(-c2ccc(O)cc2)c2ccccc2n1C=O.NO,Cc1noc(C)c1-c1c(-c2ccc(O)cc2)c2ccccc2n1C=NO,"[Cc1noc(C)c1-c1c(-c2ccc(O)cc2)c2ccccc2n1C=O, NO]",Cc1noc(C)c1-c1c(-c2ccc(O)cc2)c2ccccc2n1C=O,NO,,[Cc1noc(C)c1-c1c(-c2ccc(O)cc2)c2ccccc2n1C=NO]
50014,O=C(NC1CC1)c1ccc(-c2cnc3c(NCCCO)nc(Br)cn23)cc1,O=C(NC1CC1)c1ccc(-c2cnc3c(NCCCO)nccn23)cc1,[O=C(NC1CC1)c1ccc(-c2cnc3c(NCCCO)nc(Br)cn23)cc1],O=C(NC1CC1)c1ccc(-c2cnc3c(NCCCO)nc(Br)cn23)cc1,,,[O=C(NC1CC1)c1ccc(-c2cnc3c(NCCCO)nccn23)cc1]


In [None]:
# Find how many molecules the longest reactant list contains
fragment_counts = df['reactant_fragments'].apply(len)
max_frags = fragment_counts.max()
print(f"Max number of reactants: {max_frags}")

# Spread the lists over reactant_1 ... reactant_N; reactions with fewer molecules get None
for slot in range(max_frags):
    column_name = f'reactant_{slot + 1}'
    df[column_name] = df['reactant_fragments'].apply(
        lambda frags: None if slot >= len(frags) else frags[slot]
    )

Max number of reactants: 3


In [18]:
# Display the DataFrame again to check the newly added reactant_1 ... reactant_N columns.
df

Unnamed: 0,reactants,products,reactant_fragments,reactant_1,reactant_2,reactant_3
0,COC(=O)[C@H](CCCCNC(=O)OCc1ccccc1)NC(=O)Nc1cc(...,COC(=O)[C@H](CCCCN)NC(=O)Nc1cc(OC)cc(C(C)(C)C)c1O,[COC(=O)[C@H](CCCCNC(=O)OCc1ccccc1)NC(=O)Nc1cc...,COC(=O)[C@H](CCCCNC(=O)OCc1ccccc1)NC(=O)Nc1cc(...,,
1,Nc1cccc2cnccc12.O=C(O)c1cc([N+](=O)[O-])c(Sc2c...,O=C(Nc1cccc2cnccc12)c1cc([N+](=O)[O-])c(Sc2c(C...,"[Nc1cccc2cnccc12, O=C(O)c1cc([N+](=O)[O-])c(Sc...",Nc1cccc2cnccc12,O=C(O)c1cc([N+](=O)[O-])c(Sc2c(Cl)cncc2Cl)s1,
2,CCNCC.Cc1nc(-c2ccc(C=O)cc2)sc1COc1ccc([C@H](CC...,CCN(CC)Cc1ccc(-c2nc(C)c(COc3ccc([C@H](CC(=O)N4...,"[CCNCC, Cc1nc(-c2ccc(C=O)cc2)sc1COc1ccc([C@H](...",CCNCC,Cc1nc(-c2ccc(C=O)cc2)sc1COc1ccc([C@H](CC(=O)N2...,
3,CC1(C)CCC(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(NC...,CC1(C)CCC(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(NC...,[CC1(C)CCC(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(N...,CC1(C)CCC(CN2CCN(c3ccc(C(=O)NS(=O)(=O)c4ccc(NC...,O=C(CF)CF,
4,CCOc1ccc(Oc2ncnc3c2cnn3C2CCNCC2)c(F)c1.O=C(Cl)...,CCOc1ccc(Oc2ncnc3c2cnn3C2CCN(C(=O)OC3CCCC3)CC2...,"[CCOc1ccc(Oc2ncnc3c2cnn3C2CCNCC2)c(F)c1, O=C(C...",CCOc1ccc(Oc2ncnc3c2cnn3C2CCNCC2)c(F)c1,O=C(Cl)OC1CCCC1,
...,...,...,...,...,...,...
50011,CCOC(=O)N1CCc2ccc3c(c2CC1)C(O)(C1CC1)CC3,CCOC(=O)N1CCc2ccc3c(c2CC1)C(C1CC1)CC3,[CCOC(=O)N1CCc2ccc3c(c2CC1)C(O)(C1CC1)CC3],CCOC(=O)N1CCc2ccc3c(c2CC1)C(O)(C1CC1)CC3,,
50012,Brc1cccc(C=C2c3ccccc3CCc3ccccc32)c1.N#C[Cu],N#Cc1cccc(C=C2c3ccccc3CCc3ccccc32)c1,"[Brc1cccc(C=C2c3ccccc3CCc3ccccc32)c1, N#[C][Cu]]",Brc1cccc(C=C2c3ccccc3CCc3ccccc32)c1,N#[C][Cu],
50013,Cc1noc(C)c1-c1c(-c2ccc(O)cc2)c2ccccc2n1C=O.NO,Cc1noc(C)c1-c1c(-c2ccc(O)cc2)c2ccccc2n1C=NO,"[Cc1noc(C)c1-c1c(-c2ccc(O)cc2)c2ccccc2n1C=O, NO]",Cc1noc(C)c1-c1c(-c2ccc(O)cc2)c2ccccc2n1C=O,NO,
50014,O=C(NC1CC1)c1ccc(-c2cnc3c(NCCCO)nc(Br)cn23)cc1,O=C(NC1CC1)c1ccc(-c2cnc3c(NCCCO)nccn23)cc1,[O=C(NC1CC1)c1ccc(-c2cnc3c(NCCCO)nc(Br)cn23)cc1],O=C(NC1CC1)c1ccc(-c2cnc3c(NCCCO)nc(Br)cn23)cc1,,
