In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem
import pandas as pd
import numpy as np

# Load the data
df = pd.read_csv('../../../../docs/data/cycpeptdb_clean_onlyPC.csv')

# Function to compute ECFP fingerprints
def compute_Morgan_fingerprint(smiles, radius=2, nBits=2048):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits)
    else:
        return np.nan


# Apply the function to the DataFrame
df['fingerprint'] = df['SMILES'].apply(compute_Morgan_fingerprint)

# Convert fingerprints to a list of lists
fingerprint_list = [list(f) for f in df['fingerprint']]

# Optionally convert to a NumPy array for regression models
fingerprint_array = np.array(fingerprint_list)

print(fingerprint_array)

[[0 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 0 1 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 [0 0 1 ... 0 0 0]]


In [1]:
from drfp import DrfpEncoder
import pandas as pd

# Load the data
df = pd.read_csv('../../../../docs/data/cycpeptdb_clean_onlyPC.csv')

df['fingerprint'] = DrfpEncoder.encode(df['SMILES'])

# Convert fingerprints to a list of lists
fingerprint_list = [list(f) for f in df['fingerprint']]

# Optionally convert to a NumPy array for regression models
fingerprint_array = np.array(fingerprint_list)

print(fingerprint_array)

NoReactionError: The following is not a valid reaction SMILES: 'C/C=C/C[C@@H](C)[C@@H](O)[C@H]1C(=O)N[C@@H](CC)C(=O)N(C)CC(=O)N(C)[C@@H](CC(C)C)C(=O)N[C@@H](C(C)C)C(=O)N(C)[C@@H](CC(C)C)C(=O)N[C@@H](C)C(=O)N[C@H](C)C(=O)N(C)[C@@H](CC(C)C)C(=O)N(C)[C@@H](CC(C)C)C(=O)N(C)[C@@H](C(C)C)C(=O)N1C'

In [10]:
from rdkit import Chem
import tmap as tm
from map4 import MAP4Calculator

dim = 1024

MAP4 = MAP4Calculator(dimensions=dim)
ENC = tm.Minhash(dim)

df = pd.read_csv('../../../../docs/data/cycpeptdb_clean.csv')
# df['fingerprint'] = Chem.MolFromSmiles(df['SMILES']).apply(MAP4.calculate_many)
df['MolFromSmiles'] = df['SMILES'].apply(Chem.MolFromSmiles)
df['fingerprint'] = MAP4.calculate_many(df['MolFromSmiles'])


fps = df['fingerprint'].values

print(ENC.get_distance(fps[0], fps[1]))

0.91796875


In [11]:
print(df['fingerprint'])

0       [4783318, 851071, 99402, 3285824, 2115449, 113...
1       [9109819, 8511241, 2475943, 5328534, 2924166, ...
2       [9109819, 8511241, 2475943, 5328534, 2924166, ...
3       [9109819, 8511241, 2475943, 5328534, 2924166, ...
4       [9109819, 8511241, 2475943, 5328534, 2924166, ...
                              ...                        
7329    [1523474, 3827191, 1052992, 44620, 1038221, 23...
7330    [170694, 3827191, 1052992, 44620, 1038221, 231...
7331    [170694, 1246412, 1052992, 44620, 1038221, 231...
7332    [170694, 3827191, 1052992, 44620, 1038221, 231...
7333    [170694, 3827191, 1052992, 44620, 1038221, 243...
Name: fingerprint, Length: 7334, dtype: object
