In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import numpy as np

# Load the data from CSV into a DataFrame. The path is relative, so it is
# resolved against the current working directory; the 'SMILES' column of this
# frame is what gets fingerprinted further below.
df = pd.read_csv('../../../../docs/data/cycpeptdb_clean_onlyPC.csv')

# Function to compute ECFP fingerprints
def compute_Morgan_fingerprint(smiles, radius=2, nBits=2048):
    """Return the Morgan bit-vector fingerprint of a SMILES string.

    Args:
        smiles: SMILES string to fingerprint.
        radius: Morgan radius forwarded to RDKit.
        nBits: Length of the resulting bit vector.

    Returns:
        The RDKit bit vector for the molecule, or ``np.nan`` when ``smiles``
        is not a string (e.g. a missing CSV cell) or RDKit cannot parse it.
    """
    # Missing cells arrive from pandas as float NaN / None. MolFromSmiles only
    # accepts strings, so short-circuit here instead of letting it raise and
    # abort a whole DataFrame.apply().
    if not isinstance(smiles, str):
        return np.nan

    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None.
    if mol is None:
        return np.nan

    # NOTE(review): newer RDKit releases flag this call as deprecated in favour
    # of rdFingerprintGenerator.GetMorganGenerator - migrate once the minimum
    # supported RDKit version is confirmed.
    return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)
    
def compute_DRFP_fingerprint(smiles, radius=2, nBits=2048):
    """Return a fingerprint for ``smiles`` (currently a Morgan bit vector).

    Despite its name, this function does not compute a DRFP fingerprint: its
    implementation has always been identical to ``compute_Morgan_fingerprint``.
    It now delegates to that function so the two cannot drift apart.

    Args:
        smiles: SMILES string to fingerprint.
        radius: Morgan radius forwarded to RDKit.
        nBits: Length of the resulting bit vector.

    Returns:
        Whatever ``compute_Morgan_fingerprint`` returns for the same
        arguments: an RDKit bit vector, or ``np.nan`` on failure.
    """
    # TODO(review): replace with a real DRFP implementation, or rename/remove
    # this function, once the intended fingerprint is decided.
    return compute_Morgan_fingerprint(smiles, radius=radius, nBits=nBits)


# Apply the function to the DataFrame
df['fingerprint'] = df['SMILES'].apply(compute_Morgan_fingerprint)

# compute_Morgan_fingerprint returns np.nan for SMILES it cannot turn into a
# molecule. Iterating over that float would raise TypeError, so only rows that
# actually hold a fingerprint are converted.
valid_mask = df['fingerprint'].notna()
n_invalid = int((~valid_mask).sum())
if n_invalid:
    print(f"Skipping {n_invalid} row(s) without a valid fingerprint")

# Convert fingerprints to a list of lists (one 0/1 list per valid row)
fingerprint_list = [list(fp) for fp in df.loc[valid_mask, 'fingerprint']]

# Optionally convert to a NumPy array for regression models.
# Row i of the array corresponds to the i-th row of df[valid_mask]; when every
# SMILES is valid this is simply row i of df.
fingerprint_array = np.array(fingerprint_list)

print(fingerprint_array)

[[0 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 0 1 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 1 ... 0 0 0]]
