In [2]:
%pip install pyarrow duckdb polars
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd





[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
# Initialize duckdb
import duckdb
import pandas as pd
import polars as pl
import pyarrow as pa

parquet_file = 'train.parquet'

# Start a DuckDB session
con = duckdb.connect(database=':memory:', read_only=False)

# Create a virtual table that points to the parquet file
con.execute(f"CREATE VIEW IF NOT EXISTS train AS SELECT * FROM parquet_scan('{parquet_file}')")

df = con.execute("""SELECT molecule_smiles as smiles, protein_name, binds FROM train""").fetch_arrow_table()

df = pl.from_arrow(df)


In [6]:
print(df.head())

print(df.shape) 

binded_records = df.filter(df['binds'] == 1)

print(binded_records.count())

shape: (5, 3)
┌─────────────────────────────────┬──────────────┬───────┐
│ smiles                          ┆ protein_name ┆ binds │
│ ---                             ┆ ---          ┆ ---   │
│ str                             ┆ str          ┆ i64   │
╞═════════════════════════════════╪══════════════╪═══════╡
│ C#CCOc1ccc(CNc2nc(NCC3CCCN3c3c… ┆ BRD4         ┆ 0     │
│ C#CCOc1ccc(CNc2nc(NCC3CCCN3c3c… ┆ HSA          ┆ 0     │
│ C#CCOc1ccc(CNc2nc(NCC3CCCN3c3c… ┆ sEH          ┆ 0     │
│ C#CCOc1ccc(CNc2nc(NCc3cccc(Br)… ┆ BRD4         ┆ 0     │
│ C#CCOc1ccc(CNc2nc(NCc3cccc(Br)… ┆ HSA          ┆ 0     │
└─────────────────────────────────┴──────────────┴───────┘
(295246830, 3)
shape: (1, 3)
┌─────────┬──────────────┬─────────┐
│ smiles  ┆ protein_name ┆ binds   │
│ ---     ┆ ---          ┆ ---     │
│ u32     ┆ u32          ┆ u32     │
╞═════════╪══════════════╪═════════╡
│ 1589906 ┆ 1589906      ┆ 1589906 │
└─────────┴──────────────┴─────────┘


In [8]:
BRD4_records = df.filter(df['protein_name'] == 'BRD4')
HSA_records = df.filter(df['protein_name'] == 'HSA')
sEH_records = df.filter(df['protein_name'] == 'sEH')

print(BRD4_records.shape)
print(HSA_records.shape)
print(sEH_records.shape)


(98415610, 3)
(98415610, 3)
(98415610, 3)


In [9]:
BRD4_binded_records_count = BRD4_records.filter(BRD4_records['binds'] == 1).shape[0]
HSA_binded_records_count = HSA_records.filter(HSA_records['binds'] == 1).shape[0]
sEH_binded_records_count = sEH_records.filter(sEH_records['binds'] == 1).shape[0]

print(f"Number of BRD4 records that bind: {BRD4_binded_records_count}")
print(f"Number of HSA records that bind: {HSA_binded_records_count}")
print(f"Number of sEH records that bind: {sEH_binded_records_count}")

Number of BRD4 records that bind: 456964
Number of HSA records that bind: 408410
Number of sEH records that bind: 724532


In [13]:
from rdkit import Chem
from rdkit.Chem import AllChem

def smiles_to_bitvec(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol:  # Check if molecule is successfully created
        bit_vec = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024)
        return bit_vec.ToBitString()  # Convert bit vector to a string to store in Polars
    return None  # Return None or appropriate value if SMILES is invalid

# Apply the function to the 'smiles' column
df = BRD4_records.with_columns(
    pl.col("smiles").apply(smiles_to_bitvec).alias("bit_vector")
)

  pl.col("smiles").apply(smiles_to_bitvec).alias("bit_vector")
  df = BRD4_records.with_columns(


AttributeError: 'ExplicitBitVect' object has no attribute 'GetNumAtoms'

In [5]:
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np

def smiles_to_ecfp(smiles_string, radius=2, n_bits=2048):
    mol = Chem.MolFromSmiles(smiles_string)
    if mol:
        ecfp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        # Convert the bit vector to a numpy array directly
        
        arr = np.zeros((n_bits,), dtype=np.int8)
        AllChem.DataStructs.ConvertToNumpyArray(ecfp, arr)
        return arr
    return np.zeros((n_bits,), dtype=np.int8)

def process_batch(df_polars):
    ecfps = [smiles_to_ecfp(sm) for sm in df_polars['smiles'].to_list()]
    df_polars = df_polars.with_column(pl.Series("ecfp", ecfps))
    return df_polars

In [6]:
import xgboost as xgb
import numpy as np

params = {
    'device':'cuda',
    'tree_method': 'gpu_hist',
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'learning_rate': 0.1,
    'max_depth': 5,
    'alpha': 10,
    'n_estimators': 100
}

# Placeholder for the model
model = None

# Process each batch
df_polars = process_batch(df)
X = np.stack(df_polars['ecfp'].to_numpy())
y = df_polars['binds'].to_numpy()

dtrain = xgb.DMatrix(X, label=y)
model = xgb.train(params, dtrain, num_boost_round=10, xgb_model=model)

model.save_model("model.bin")
