In [1]:

from rdkit import DataStructs
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
import pyarrow as pa
from rdkit.SimDivFilters import rdSimDivPickers
import multiprocessing


In [2]:
#in_memory_stream = pa.input_stream('/home/lyg/data/pubchem/arrow/pubchem_sorted.arrow')
#opened_stream = pa.ipc.open_stream(in_memory_stream)
#table = opened_stream.read_all()

# Memory-map the Arrow IPC file and read all of its record batches into one table.
arrow_source = pa.memory_map('/home/lyg/data/pubchem/arrow/pubchem_best.arrow')
table = pa.ipc.RecordBatchFileReader(arrow_source).read_all()

# Materialise the 'smiles' column as a plain Python list (nothing is written to disk).
smiles = table.column('smiles').to_pylist()

# Show the first entry as a quick look at the data.
smiles[0]

'N'

In [3]:
# Morgan fingerprint settings: atom-environment radius and fingerprint length in bits.
radius = 2
nBits = 2048

# Single generator instance; smiles_to_fp reads it (and nBits) as module-level globals.
mfpgen = rdFingerprintGenerator.GetMorganGenerator(
    radius=radius,
    fpSize=nBits,
)

def smiles_to_fp(smiles):
  """
  Worker function to create an RDKit fingerprint from a SMILES string.

  Args:
    smiles: SMILES string of a single molecule. Any non-string value
      (for example ``None`` produced by a null row) is handled like an
      unparsable SMILES instead of being handed to RDKit.

  Returns:
    The fingerprint returned by the module-level ``mfpgen`` generator, or an
    ``ExplicitBitVect`` of ``nBits`` bits with no bit set when no molecule
    could be built. A value is returned for every input so that the output
    of a ``map`` over this function stays aligned with its input.
  """
  # Only real strings are passed to the parser: a None would make the RDKit
  # binding raise, and one such row would abort the whole pool.map call.
  mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
  if mol is None:
    # Placeholder for invalid input.
    # NOTE(review): these all-zero vectors still take part in the clustering
    # step that consumes the fingerprints - confirm that is intended.
    return DataStructs.cDataStructs.ExplicitBitVect(nBits)
  return mfpgen.GetFingerprint(mol)

# Compute one fingerprint per SMILES string across a process pool with the
# default number of workers; the results are collected into a list.
pool = multiprocessing.Pool()
with pool:
    arr = pool.map(smiles_to_fp, smiles)

[16:29:56] Explicit valence for atom # 0 B, 4, is greater than permitted
[16:29:57] Explicit valence for atom # 2 O, 2, is greater than permitted
[16:29:58] Explicit valence for atom # 0 B, 4, is greater than permitted
[16:29:58] Explicit valence for atom # 0 B, 4, is greater than permitted
[16:29:59] Conflicting single bond directions around double bond at index 6.
[16:29:59]   BondStereo set to STEREONONE and single bond directions set to NONE.
[16:30:00] Explicit valence for atom # 10 O, 2, is greater than permitted
[16:30:00] Conflicting single bond directions around double bond at index 3.
[16:30:00]   BondStereo set to STEREONONE and single bond directions set to NONE.
[16:30:00] Conflicting single bond directions around double bond at index 8.
[16:30:00]   BondStereo set to STEREONONE and single bond directions set to NONE.
[16:30:00] Conflicting single bond directions around double bond at index 8.
[16:30:00]   BondStereo set to STEREONONE and single bond directions set to NONE

In [4]:
# Sanity check: one fingerprint per SMILES string and per table row.
n_rows = table.num_rows
assert len(arr) == len(smiles) and len(smiles) == n_rows

In [5]:

# Minimum distance allowed between two cluster centroids. For reference, the
# "random" threshold for Morgan radius 2 is 0.65; see
# https://greglandrum.github.io/rdkit-blog/posts/2020-11-18-sphere-exclusion-clustering.html#decreasing-the-sphere-radius
thresh = 0.6

print(f"Finished computing fingerprints, clustering {len(arr)} molecules...")

# The picker is given the fingerprints, how many there are, and the threshold.
pool_size = len(arr)
lp = rdSimDivPickers.LeaderPicker()
picks = lp.LazyBitVectorPick(arr, pool_size, thresh)


Finished computing fingerprints, clustering 16588727 molecules...


In [6]:
# with open('cluster_labels.txt', 'w') as f:
#     for i in picks:
#         smiles = table['smiles'][i].as_py()
#         iupac = table['iupac'][i].as_py()
#         cid = table['cid'][i].as_py()
#         formula = table['formula'][i].as_py()
#         num_atoms = table['num_atoms'][i].as_py()
#         # f.write(f"{smiles}\t{iupac}\t{labels[i]}\n")
#         f.write(f"{cid}\t{smiles}\t{iupac}\t{formula}\t{num_atoms}\n")

def convert_string_view_to_string(table):
    """
    Return ``table`` with every ``string_view`` column re-encoded as ``string``.

    Columns are addressed by position rather than by name, so tables that
    contain duplicate column names are handled. Each replaced column reuses
    its original field (via ``Field.with_type``), so only the type changes.
    Columns of any other type are left as they are.

    Args:
        table: pyarrow Table to convert.

    Returns:
        A table with the same columns in the same order, in which no column
        has the ``string_view`` type. If nothing needs converting, the input
        table object itself is returned.
    """
    target_type = pa.string()
    # The schema iterated here is the one captured before the loop starts;
    # set_column replaces a column in place, so positions remain valid even
    # though `table` is rebound inside the loop.
    for position, field in enumerate(table.schema):
        if not pa.types.is_string_view(field.type):
            continue
        # Recreate the column as pa.string by round-tripping through Python objects.
        values = table.column(position).to_pylist()
        table = table.set_column(
            position,
            field.with_type(target_type),
            pa.array(values, type=target_type),
        )
    return table

# Keep only the rows whose indices were picked.
pick_indices = pa.array(picks)
table2 = table.take(pick_indices)

# huggingface only supports string, not string_view
table2 = convert_string_view_to_string(table2)

# Write the reduced table out; the writer is closed before the underlying file.
with pa.OSFile('/home/lyg/data/pubchem/arrow/pubchem_best_cluster.arrow', 'wb') as sink, \
        pa.RecordBatchFileWriter(sink, table2.schema) as writer:
    writer.write_table(table2)
print(f"Finished writing {len(picks)} picks to pubchem_best_cluster.arrow")

Finished writing 839175 picks to pubchem_best_cluster.arrow


In [7]:
# Copy the picked indices into a plain Python list.
ipicks = [*picks]

In [8]:
# Save the picked indices as text, one index per line.
with open('/home/lyg/data/pubchem/arrow/pubchem_best_cluster_picks.txt', 'w') as picks_file:
    picks_file.writelines(f"{i}\n" for i in ipicks)

In [10]:
# Number of rows in the table of picked molecules.
table2.num_rows

839175