# FART Dataset Enrichment

In [3]:
import pandas as pd
import requests
import time
from tqdm import tqdm

In [None]:
# Load the curated dataset; later cells read its 'Canonicalized SMILES' column.
curated_csv_path = "FART_curated.csv"
df = pd.read_csv(curated_csv_path)

# Define a function to query PubChem API using SMILES
def get_pubchem_data(smiles):
    '''
    Look up a molecule in PubChem (PUG REST) by its SMILES string.

    The lookup is best-effort: any failure (missing/blank SMILES, network
    error, timeout, non-200 status, unexpected response body) yields a dict
    with the same keys whose values are all empty strings, so a long batch
    run is never aborted by a single bad record.

    Args:
      smiles (string) : SMILES representation of molecule

    Returns:
      Dict : PubChem Information for molecule, with the keys 'PubChemID',
             'IUPAC Name', 'Molecular Formula', 'Molecular Weight', 'InChI'
             and 'InChIKey'
    '''
    # Output column -> key in the PubChem property record.
    # NOTE(review): 'CID' is not in the requested property list below;
    # PubChem is expected to include it in every record -- it falls back
    # to '' if absent.
    field_map = {
        'PubChemID': 'CID',
        'IUPAC Name': 'IUPACName',
        'Molecular Formula': 'MolecularFormula',
        'Molecular Weight': 'MolecularWeight',
        'InChI': 'InChI',
        'InChIKey': 'InChIKey',
    }
    empty = dict.fromkeys(field_map, '')

    # pandas hands over NaN (a float) for missing cells; do not send a
    # request for the literal text "nan" or for a blank string.
    if not isinstance(smiles, str) or not smiles.strip():
        return empty

    url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/smiles/property/IUPACName,MolecularFormula,MolecularWeight,InChI,InChIKey/JSON"
    # NOTE(review): hard-coded credential committed to source -- consider
    # loading it from an environment variable instead.
    query = {'api_key': '6d38451f7d0f37eca09d6666298d0f242d08'}

    try:
        # SMILES routinely contain '/', '\\', '#', '+', '%' and '=', which
        # corrupt the request when pasted into the URL path. Sending the
        # SMILES as a form-encoded POST field avoids that. The timeout
        # keeps one stalled connection from hanging the whole run.
        response = requests.post(url, params=query, data={'smiles': smiles}, timeout=30)
    except requests.RequestException:
        return empty

    if response.status_code != 200:
        return empty

    try:
        properties = response.json()['PropertyTable']['Properties'][0]
    except (ValueError, KeyError, IndexError, TypeError):
        # Body was not JSON or did not have the expected layout.
        return empty
    if not isinstance(properties, dict):
        return empty

    return {column: properties.get(pubchem_key, '') for column, pubchem_key in field_map.items()}

In [None]:
# Create the output columns up front (blank by default), then fill them in
# one molecule at a time from PubChem.
data_columns = ['PubChemID', 'IUPAC Name', 'Molecular Formula', 'Molecular Weight', 'InChI', 'InChIKey']
for column_name in data_columns:
    df[column_name] = ''

# One request per row, with a short pause after each so the PubChem API
# rate limit is respected.
smiles_by_row = df['Canonicalized SMILES'].items()
for row_label, smiles in tqdm(smiles_by_row, total=len(df), desc="Processing SMILES"):
    record = get_pubchem_data(smiles)
    for column_name in data_columns:
        df.at[row_label, column_name] = record[column_name]
    time.sleep(0.2)  # Pause between consecutive requests

In [None]:
# Write the enriched table to disk without the DataFrame index column.
output_csv_path = 'FART_full_db.csv'
df.to_csv(output_csv_path, index=False)