# Week 14 Data Preprocessing

Data was aggregated from two sources, one with SMILES ([Kaggle](https://www.kaggle.com/datasets/ravisinghiitbhu/fda-approved-drugs-list-and-smiles/)) and the other with function ([FDA](https://drugcentral.org/download)).

SMILES strings were canonicalized using RDKit, and then embedded using ChemBERTa.

The first step of preprocessing is to match the molecules in the database of FDA approved structures with the data about drug-target interactions, so we have information about the molecular targets of the drugs.

In [1]:
import pandas as pd
import numpy as np

from rapidfuzz import fuzz
from rapidfuzz import process

# --- CONFIGURATION ---
INPUT_FILE_SMILES = 'data/FDA_Approved_structures.csv'
INPUT_FILE_FUNCTION = 'data/drug.target.interaction.tsv'
OUTPUT_FILE = 'data/FDA_Approved_structures_with_function_fuzzy.csv'
SCORE_THRESHOLD = 70 # Minimum similarity score (out of 100) for a fuzzy match

# --- 1. Load DataFrames ---
try:
    smiles = pd.read_csv(INPUT_FILE_SMILES)
    # Using tab separation for the target file as specified earlier
    function = pd.read_csv(INPUT_FILE_FUNCTION, sep='\t')
except FileNotFoundError as e:
    print(f"Error: {e}. Please ensure both files are in the current directory.")
    # Exit gracefully if files aren't found
    raise

# --- 2. FUZZY MATCHING & MAPPING CREATION ---
print("Starting fuzzy matching to create name map...")

smiles_names = smiles['Name'].dropna().unique()
function_names = function['DRUG_NAME'].dropna().unique()

# List to hold the final name mapping: (Original SMILES Name, Matched Function Name)
name_map_data = []

# Iterate through every unique name in the SMILES list
for smiles_name in smiles_names:
    
    # Use process.extractOne to find the single best match in the function_names list
    # Use token_sort_ratio for robustness against word order/salts
    match_result = process.extractOne(
        smiles_name, 
        function_names, 
        scorer=fuzz.token_sort_ratio,
        score_cutoff=SCORE_THRESHOLD
    )
    
    # ✅ FIX: Check if the result is NOT None (i.e., a match >= 90 was found)
    if match_result is not None:
        best_match, score, _ = match_result
        
        # Store the mapping
        name_map_data.append({
            'Name_Original': smiles_name,
            'DRUG_NAME': best_match,
            'Match_Score': score
        })

# Convert the mapping list into a DataFrame
name_map_df = pd.DataFrame(name_map_data)
print(f"Fuzzy matching complete. Found {len(name_map_df)} high-confidence matches.")

# --- 3. TWO-STEP MERGE AND CLEANUP ---

# Step 3a: Join SMILES to the MAP on 'Name' (exact match on the original name)
smiles_mapped = smiles.merge(
    name_map_df[['Name_Original', 'DRUG_NAME']],
    left_on='Name',
    right_on='Name_Original',
    how='left'
).drop(columns=['Name_Original']) 

# Step 3b: Join the MAPPED result to the FUNCTION data on the matched DRUG_NAME (exact match)
smiles_merged = smiles_mapped.merge(
    function[['DRUG_NAME', 'TARGET_NAME']],
    on='DRUG_NAME',
    how='left'
).drop(columns=['DRUG_NAME']) 

# Rename the target column
smiles_merged = smiles_merged.rename(columns={'TARGET_NAME': 'Target'})

# --- 4. Aggregate Multiple Targets and Save ---
# Group by the unique drug identifiers (Name and SMILES)
smiles_final = smiles_merged.groupby(['Name', 'SMILES'], as_index=False).agg(
    {
        # Aggregate the Target column: drop NaNs, convert to string, find unique, join by ';'
        'Target': lambda x: ';'.join(x.dropna().astype(str).unique())
    }
)

# Replace any resulting empty strings (from drugs that had no target match) with NaN
smiles_final['Target'] = smiles_final['Target'].replace('', np.nan)


# Save the final, enriched file
smiles_final.to_csv(OUTPUT_FILE, index=False)
# print count of non empty targets found
non_empty_targets = smiles_final['Target'].notna().sum()
print(f"Number of drugs with assigned targets: {non_empty_targets}")
# ratio of non empty targets
ratio_non_empty = non_empty_targets / len(smiles_final)
print(f"Ratio of drugs with assigned targets: {ratio_non_empty:.2%}")

print(f"Results saved to {OUTPUT_FILE}")
# print head of first 5 drugs with targets
print(smiles_final[smiles_final['Target'].notna()].head(5))


Starting fuzzy matching to create name map...
Fuzzy matching complete. Found 1940 high-confidence matches.
Number of drugs with assigned targets: 1940
Ratio of drugs with assigned targets: 75.08%
Results saved to data/FDA_Approved_structures_with_function_fuzzy.csv
           Name                                             SMILES  \
8      Abacavir  NC1=NC2=C(N=CN2[C@@H]2C[C@H](CO)C=C2)C(NC2CC2)=N1   
9    Abametapir                       CC1=CC=C(N=C1)C1=CC=C(C)C=N1   
10     Abarelix  CC(C)C[C@H](NC(=O)[C@@H](CC(N)=O)NC(=O)[C@H](C...   
11  Abemaciclib  CCN1CCN(CC2=CC=C(NC3=NC=C(F)C(=N3)C3=CC(F)=C4N...   
12  Abiraterone  [H][C@@]12CC=C(C3=CC=CN=C3)[C@@]1(C)CC[C@@]1([...   

                                               Target  
8   Gag-Pol polyprotein;Reverse transcriptase/RNas...  
9   C-C chemokine receptor type 5;C-C chemokine re...  
10            Gonadotropin-releasing hormone receptor  
11  Potassium voltage-gated channel subfamily H me...  
12  Corticosteroid-binding globul

Next we calculate canonical SMILES for each molecule using RDKit.

In [2]:
# convert smiles to canonical using rdkit
from rdkit import Chem
from rdkit.Chem import MolToSmiles
from tqdm import tqdm
import pandas as pd

df = pd.read_csv("data/FDA_Approved_structures_with_function_fuzzy.csv")

def to_canonical(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            return MolToSmiles(mol, canonical=True)
        else:
            return None
    except:
        return None

df['SMILES_Canonical'] = df['SMILES'].apply(to_canonical)

# save updated file with canonical smiles
df.to_csv('data/Canonical_SMILES.csv', index=False)


[16:31:48] Explicit valence for atom # 84 N, 4, is greater than permitted
[16:31:48] SMILES Parse Error: syntax error while parsing: OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1
[16:31:48] SMILES Parse Error: check for mistakes around position 76:
[16:31:48] C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C
[16:31:48] ~~~~~~~~~~~~~~~~~~~~^
[16:31:48] SMILES Parse Error: extra open parentheses while parsing: OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1
[16:31:48] SMILES Parse Error: check for mistakes around position 32:
[16:31:48] C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2
[16:31:48] ~~~~~~~~~~~~~~~~~~~~^
[16:31:48] SMILES Parse Error: extra open parentheses while parsing: OC1=CC=CC(=C1)C-1=C2\CCC(=N2)\C(=C2/N\C(\C=C2)=C(/C2=N/C(/C=C2)=C(\C2=CC=C\-1N2)C1=CC(O)=CC=C1)C1=CC(O)=CC=C1)\C1=CC(O)=CC=C1
[16:31:48] SMILES Parse Error: check fo

Finally, we calculate embeddings for the canonical smiles and save.

In [3]:
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd
import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")
model = AutoModel.from_pretrained("seyonec/ChemBERTa-zinc-base-v1")

device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
model.to(device)

df = pd.read_csv("tmp/Canonical_SMILES.csv")
# remove rows that are NaN in SMILES_Canonical
df = df.dropna(subset=['SMILES_Canonical'])

# batch run to avoid memory issues
batch_size = 64
embeddings = []
for i in tqdm(range(0, len(df), batch_size)):
    batch_smiles = df['SMILES_Canonical'].iloc[i:i+batch_size].tolist()
    inputs = tokenizer(batch_smiles, return_tensors="pt", padding=True, truncation=True)
    # inputs to device
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    batch_embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    embeddings.append(batch_embeddings)

embeddings = np.vstack(embeddings)
# save embeddings to pyarrow parquet file

table = pa.Table.from_pandas(pd.DataFrame(embeddings))
pq.write_table(table, 'data/Canonical_SMILES_embeddings.parquet')

Skipping import of cpp extensions due to incompatible torch version 2.9.1 for torchao version 0.14.1             Please see https://github.com/pytorch/ao/issues/2919 for more info
W1202 16:31:51.746000 48116 torch/distributed/elastic/multiprocessing/redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
100%|██████████| 41/41 [00:30<00:00,  1.35it/s]
