In [2]:
import duckdb
import pandas as pd

parquet_file = 'train.parquet'

# Start a DuckDB session
con = duckdb.connect(database='my_duckdb.db', read_only=False)

# Create a virtual table that points to the parquet file
con.execute(f"CREATE VIEW IF NOT EXISTS train AS SELECT * FROM parquet_scan('{parquet_file}')")

# Define your query
query = 'SELECT * FROM train WHERE binds = 1 LIMIT 10'

# Execute the query and load the result into a pandas DataFrame
df = pd.read_sql_query(query, con)

  df = pd.read_sql_query(query, con)


In [3]:
print(df.head())


     id                            buildingblock1_smiles  \
0   466  C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21   
1   467  C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21   
2   683  C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21   
3  1321  C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21   
4  2141  C#CC[C@@H](CC(=O)O)NC(=O)OCC1c2ccccc2-c2ccccc21   

  buildingblock2_smiles      buildingblock3_smiles  \
0  C#CCOc1ccc(CN)cc1.Cl      Cc1cc2cc(CN)ccc2[nH]1   
1  C#CCOc1ccc(CN)cc1.Cl      Cc1cc2cc(CN)ccc2[nH]1   
2  C#CCOc1ccc(CN)cc1.Cl  Cl.NCC12CC3CC(CC(C3)C1)C2   
3  C#CCOc1ccc(CN)cc1.Cl          Nc1n[nH]c2ncccc12   
4  C#CCOc1cccc(CN)c1.Cl      Cc1cc2cc(CN)ccc2[nH]1   

                                     molecule_smiles protein_name  binds  
0  C#CCOc1ccc(CNc2nc(NCc3ccc4[nH]c(C)cc4c3)nc(N[C...          HSA      1  
1  C#CCOc1ccc(CNc2nc(NCc3ccc4[nH]c(C)cc4c3)nc(N[C...          sEH      1  
2  C#CCOc1ccc(CNc2nc(NCC34CC5CC(CC(C5)C3)C4)nc(N[...          sEH      1  
3  C#CCOc1ccc(

In [4]:
# execute a query against the duckdb that pulls count of rows that have a value of 1 in the target column
query = "SELECT COUNT(*) FROM train WHERE binds = 1"
df = pd.read_sql_query(query, con)
print(df)
# do the same query but group by protein
query = "SELECT protein_name, COUNT(*) FROM train WHERE binds = 1 GROUP BY protein_name"
df = pd.read_sql_query(query, con)
print(df)



  df = pd.read_sql_query(query, con)
  df = pd.read_sql_query(query, con)


   count_star()
0       1589906
  protein_name  count_star()
0          HSA        408410
1          sEH        724532
2         BRD4        456964


In [5]:
df = pd.read_sql_query("SELECT COUNT(*) FROM train", con)
print(df.head())

   count_star()
0     295246830


  df = pd.read_sql_query("SELECT COUNT(*) FROM train", con)


In [11]:
from rdkit import Chem
from rdkit.Chem import AllChem

query = """
SELECT molecule_smiles, buildingblock1_smiles, buildingblock2_smiles, buildingblock3_smiles, binds, protein_name
FROM train
"""
df = con.execute(query).fetchdf()

def smiles_to_fp(smiles, n_bits=2048):
    mol = Chem.MolFromSmiles(smiles)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=n_bits)
    return list(fp)

for col in ['molecule_smiles', 'buildingblock1_smiles', 'buildingblock2_smiles', 'buildingblock3_smiles']:
    df[col + '_fp'] = df[col].apply(lambda x: smiles_to_fp(x))

In [5]:
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
from scipy.sparse import csr_matrix

query = """
SELECT molecule_smiles, buildingblock1_smiles, buildingblock2_smiles, buildingblock3_smiles, binds, protein_name
FROM train
"""

def smiles_to_fp(smiles, n_bits=2048):
    mol = Chem.MolFromSmiles(smiles)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=n_bits)
    return csr_matrix(list(fp))  # convert to sparse representation

batch_size = 100000  # adjust this based on your available memory
offset = 0

while True:
    batch_query = query + f" LIMIT {batch_size} OFFSET {offset}"
    df = con.execute(batch_query).fetchdf()

    if df.empty:
        break

    for col in ['molecule_smiles', 'buildingblock1_smiles', 'buildingblock2_smiles', 'buildingblock3_smiles']:
        df[col] = df[col].apply(lambda x: smiles_to_fp(x))

    # process the batch (e.g., save to disk, send to a model for training, etc.)

    offset += batch_size

KeyboardInterrupt: 

In [4]:
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
from scipy.sparse import csr_matrix

df['molecule'] = df['molecule_smiles'].apply(Chem.MolFromSmiles)

# Generate ECFPs
def generate_ecfp(molecule, radius=2, bits=1024):
    if molecule is None:
        return None
    return list(AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=bits))

df['ecfp'] = df['molecule'].apply(generate_ecfp)

In [5]:
print(df['ecfp'].head())

0    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: ecfp, dtype: object
