In [1]:

from rdkit import DataStructs
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdFingerprintGenerator
import numpy as np
import hdbscan
import pyarrow as pa
from rdkit.SimDivFilters import rdSimDivPickers
import multiprocessing


In [2]:
# Disabled alternative that reads pubchem_sorted.arrow through the streaming reader:
#in_memory_stream = pa.input_stream('/home/lyg/data/pubchem/arrow/pubchem_sorted.arrow')
#opened_stream = pa.ipc.open_stream(in_memory_stream)
#table = opened_stream.read_all()

# Memory-map the Arrow IPC file and read every record batch into a single table.
mapped_source = pa.memory_map('/home/lyg/data/pubchem/arrow/pubchem_best.arrow')
table = pa.ipc.RecordBatchFileReader(mapped_source).read_all()

# Materialise the 'smiles' column as a plain Python list, one entry per table row.
smiles = table.column('smiles').to_pylist()

# Display the first entry as a quick sanity check.
smiles[0]

'[C-]#[O+]'

In [9]:
# Morgan fingerprint settings; `nBits` and `mfpgen` are read by smiles_to_fp.
radius = 2
nBits = 2048

mfpgen = rdFingerprintGenerator.GetMorganGenerator(
    radius=radius,
    fpSize=nBits,
)

def smiles_to_fp(smiles):
    """
    Worker function to create an RDKit fingerprint from a SMILES string.

    Args:
        smiles: a single SMILES string, or None for a missing value.

    Returns:
        The fingerprint produced by the module-level ``mfpgen`` generator, or
        an all-zero ``ExplicitBitVect`` of ``nBits`` bits when the entry is
        missing or cannot be parsed. A value is always returned so the result
        list stays aligned one-to-one with the input list.

    NOTE(review): the all-zero placeholders are still handed to the picker
    downstream - confirm that rows with unusable SMILES should take part in
    the selection.
    """
    # Missing values are treated like unparseable SMILES instead of being
    # passed to the parser, so one null entry cannot abort the whole map.
    if smiles is None:
        return DataStructs.cDataStructs.ExplicitBitVect(nBits)

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit could not parse the string: return an empty fingerprint.
        return DataStructs.cDataStructs.ExplicitBitVect(nBits)

    return mfpgen.GetFingerprint(mol)

# Fingerprint every SMILES in a default-sized process pool. Pool.map returns
# results in input order, so arr[i] is the fingerprint of smiles[i].
with multiprocessing.Pool() as workers:
    arr = workers.map(smiles_to_fp, smiles)

[11:37:19] Explicit valence for atom # 2 O, 2, is greater than permitted
[11:37:19] Explicit valence for atom # 0 B, 4, is greater than permitted
[11:37:19] Explicit valence for atom # 0 B, 4, is greater than permitted
[11:37:21] Conflicting single bond directions around double bond at index 6.
[11:37:21]   BondStereo set to STEREONONE and single bond directions set to NONE.
[11:37:21] Explicit valence for atom # 10 O, 2, is greater than permitted
[11:37:21] Conflicting single bond directions around double bond at index 3.
[11:37:21]   BondStereo set to STEREONONE and single bond directions set to NONE.
[11:37:22] Conflicting single bond directions around double bond at index 8.
[11:37:22]   BondStereo set to STEREONONE and single bond directions set to NONE.
[11:37:22] Conflicting single bond directions around double bond at index 8.
[11:37:22]   BondStereo set to STEREONONE and single bond directions set to NONE.
[11:37:22] Conflicting single bond directions around double bond at ind

In [12]:
# One fingerprint per input SMILES: positions in `arr` must line up with table rows.
assert len(smiles) == len(arr)

In [None]:

# Minimum distance between cluster centroids; 0.65 is the random threshold
# for Morgan fingerprints of radius 2.
thresh = 0.65

print(f"Finished computing fingerprints, clustering {len(arr)} molecules...")

# Run the leader picker over the whole fingerprint list. The resulting
# `picks` are used afterwards as row indices into the table.
pool_size = len(arr)
lp = rdSimDivPickers.LeaderPicker()
picks = lp.LazyBitVectorPick(arr, pool_size, thresh)


0 C1=CC(=C(C=C1Cl)Cl)Cl
Finished computing fingerprints, clustering 10 molecules...


In [6]:
# with open('cluster_labels.txt', 'w') as f:
#     for i in picks:
#         smiles = table['smiles'][i].as_py()
#         iupac = table['iupac'][i].as_py()
#         cid = table['cid'][i].as_py()
#         formula = table['formula'][i].as_py()
#         num_atoms = table['num_atoms'][i].as_py()
#         # f.write(f"{smiles}\t{iupac}\t{labels[i]}\n")
#         f.write(f"{cid}\t{smiles}\t{iupac}\t{formula}\t{num_atoms}\n")

def convert_string_view_to_string(table):
    """Return ``table`` with every ``string_view`` column rebuilt as ``pa.string``.

    Columns of any other type are left as they are, and column order and
    names are unchanged.

    Args:
        table: a ``pyarrow.Table``.

    Returns:
        A ``pyarrow.Table`` in which each former ``string_view`` column has
        type ``pa.string``. The input table object itself is not modified;
        ``set_column`` returns a new table each time.
    """
    # Iterating a schema yields its fields. Columns are addressed by position
    # rather than by name so that duplicate column names are handled too.
    for position, field in enumerate(table.schema):
        if not pa.types.is_string_view(field.type):
            continue
        # Rebuild the values as pa.string by round-tripping through Python objects.
        rebuilt = pa.array(table.column(position).to_pylist(), type=pa.string())
        # with_type keeps the field's name, nullability and metadata.
        table = table.set_column(position, field.with_type(pa.string()), rebuilt)
    return table

# Keep only the picked rows. The subset gets its own name so that `table`
# still holds the full dataset and this cell can be re-run safely; rebinding
# `table` here would make a second run apply `picks` to the reduced table.
table_picked = table.take(pa.array(picks))

# huggingface only supports string, not string_view
table_picked = convert_string_view_to_string(table_picked)

# Write the subset in the Arrow IPC file format.
with pa.OSFile('/home/lyg/data/pubchem/arrow/pubchem_best_cluster.arrow', 'wb') as sink:
    with pa.RecordBatchFileWriter(sink, table_picked.schema) as writer:
        writer.write_table(table_picked)
print(f"Finished writing {len(picks)} picks to pubchem_best_cluster.arrow")

Finished writing 8 picks to pubchem_best_cluster.arrow
