In [3]:

from rdkit import DataStructs
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdFingerprintGenerator
import numpy as np
import hdbscan
import pyarrow as pa
from rdkit.SimDivFilters import rdSimDivPickers


In [4]:
# Memory-map the Arrow IPC *file* and read every record batch into one table.
# (An earlier variant read the stream-format file
#  '/home/lyg/data/pubchem/arrow/pubchem_sorted.arrow' through
#  pa.input_stream(...) / pa.ipc.open_stream(...).read_all() instead.)
source = pa.memory_map('/home/lyg/data/pubchem/arrow/pubchem_best.arrow')
table = pa.ipc.RecordBatchFileReader(source).read_all()

# Order rows by atom count first, so the easier (smaller) examples come first,
# then by cid so that better annotated cids are picked first.
sort_keys = [('num_atoms', 'ascending'), ('cid', 'ascending')]
table = table.sort_by(sort_keys)


In [5]:
# Morgan fingerprint settings passed to the generator below.
radius = 2
nBits = 2048

# Upper bound on how many table rows get fingerprinted.
max_records = 6_000_000

# Minimum distance between cluster centroids; 0.65 is the random threshold
# for Morgan radius 2.
thresh = 0.65

# Shared fingerprint generator, reused for every molecule.
mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=nBits)

def itertuples(table, chunk_size=1, max_records=None):
    """Yield consecutive slices of ``table`` as column-name -> list dicts.

    Parameters
    ----------
    table
        Object exposing ``num_rows``, slice indexing and ``to_pydict()`` on
        the slice (e.g. a pyarrow Table).
    chunk_size : int
        Number of rows per yielded dict.
    max_records : int or None
        Stop once the starting row of a chunk reaches this value. ``None``
        (the default) means iterate over the whole table. The limit is
        checked at chunk starts only, so with ``chunk_size > 1`` the last
        chunk may extend past ``max_records``.

    Yields
    ------
    dict
        ``table[i:i + chunk_size].to_pydict()`` for each chunk start ``i``.
    """
    # Comparing an int against None raises TypeError, so resolve the
    # effective upper bound once instead of testing it inside the loop.
    if max_records is None:
        limit = table.num_rows
    else:
        limit = min(max_records, table.num_rows)

    for start in range(0, limit, chunk_size):
        yield table[start:start + chunk_size].to_pydict()


# Cluster with HDBSCAN using Jaccard (1 - Tanimoto)
def cluster_fingerprints(fps_numpy, min_cluster_size=2, core_dist_n_jobs=16):
    """Cluster fingerprints with HDBSCAN under the Jaccard metric.

    Parameters
    ----------
    fps_numpy
        Fingerprint data handed straight to ``HDBSCAN.fit_predict``.
        NOTE(review): presumably a 2-D boolean array with one row per
        molecule -- confirm against the caller.
    min_cluster_size : int
        Forwarded to ``hdbscan.HDBSCAN``.
    core_dist_n_jobs : int
        Forwarded to ``hdbscan.HDBSCAN``. Defaults to 16, the value that was
        previously hard-coded, so existing callers are unaffected; lower it
        on machines with fewer cores.

    Returns
    -------
    tuple
        ``(labels, clusterer)``: the result of ``fit_predict`` and the fitted
        ``HDBSCAN`` instance.
    """
    clusterer = hdbscan.HDBSCAN(
        metric='jaccard',
        min_cluster_size=min_cluster_size,
        core_dist_n_jobs=core_dist_n_jobs,
    )
    labels = clusterer.fit_predict(fps_numpy)
    return labels, clusterer

# Fingerprints of the molecules that parsed successfully, in table order.
# (A dense alternative would be a pre-allocated boolean matrix:
#  arr = np.zeros((len(table), nBits), dtype=bool) filled through
#  mfpgen.GetFingerprintAsNumPy(mol).)
arr = []
# arr_index[k] is the table row that produced arr[k]. Rows whose SMILES cannot
# be parsed are skipped, so positions in arr are NOT table row numbers.
arr_index = []

# Only the SMILES column is read below; selecting it first avoids converting
# every column of every row to Python objects.
smiles_table = table.select(['smiles'])

for i, row in enumerate(itertuples(smiles_table, max_records=max_records)):
    smiles = row['smiles'][0]
    # A null SMILES cannot be handed to RDKit; treat it like a parse failure.
    mol = Chem.MolFromSmiles(smiles) if smiles is not None else None
    if mol is not None:
        arr.append(mfpgen.GetFingerprint(mol))
        arr_index.append(i)
    if i % 10000 == 0:
        print(i, smiles)

print(f"Finished computing fingerprints, clustering {len(arr)} molecules...")

lp = rdSimDivPickers.LeaderPicker()

# The picker returns positions within arr. Translate them back to table row
# numbers so that picks can be used directly to index the table.
fp_picks = lp.LazyBitVectorPick(arr, len(arr), thresh)
picks = [arr_index[p] for p in fp_picks]


# labels, clusterer = cluster_fingerprints(arr)


0 C1=CC(=C(C=C1Cl)Cl)Cl
Finished computing fingerprints, clustering 10 molecules...


In [6]:
# with open('cluster_labels.txt', 'w') as f:
#     for i in picks:
#         smiles = table['smiles'][i].as_py()
#         iupac = table['iupac'][i].as_py()
#         cid = table['cid'][i].as_py()
#         formula = table['formula'][i].as_py()
#         num_atoms = table['num_atoms'][i].as_py()
#         # f.write(f"{smiles}\t{iupac}\t{labels[i]}\n")
#         f.write(f"{cid}\t{smiles}\t{iupac}\t{formula}\t{num_atoms}\n")

def convert_string_view_to_string(table):
    """Return ``table`` with every ``string_view`` column rebuilt as ``string``.

    Columns of any other type are left as they are. Each affected column is
    round-tripped through a Python list and re-created with type
    ``pa.string()`` at the same position and under the same name.
    """
    # Decide which columns need converting from the schema of the input table.
    view_names = [
        field.name
        for field in table.schema
        if pa.types.is_string_view(field.type)
    ]

    for name in view_names:
        position = table.schema.get_field_index(name)
        # Recreate the column as pa.string
        replacement = pa.array(table[name].to_pylist(), type=pa.string())
        table = table.set_column(position, name, replacement)

    return table

# Keep only the rows selected by the picker (picks is used as row indices).
table = table.take(pa.array(picks))

# huggingface only supports string, not string_view
table = convert_string_view_to_string(table)

# Write the reduced table out in the Arrow IPC file format.
cluster_path = '/home/lyg/data/pubchem/arrow/pubchem_best_cluster.arrow'
with pa.OSFile(cluster_path, 'wb') as sink, \
        pa.RecordBatchFileWriter(sink, table.schema) as writer:
    writer.write_table(table)
print(f"Finished writing {len(picks)} picks to pubchem_best_cluster.arrow")

Finished writing 8 picks to pubchem_best_cluster.arrow
