In [1]:

from rdkit import DataStructs
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdFingerprintGenerator
import numpy as np
import hdbscan
import pyarrow as pa
from rdkit.SimDivFilters import rdSimDivPickers


In [None]:
# Alternative loader for an Arrow *stream*-format file (kept for reference):
#in_memory_stream = pa.input_stream('/home/lyg/data/pubchem/arrow/pubchem_sorted.arrow')
#opened_stream = pa.ipc.open_stream(in_memory_stream)
#table = opened_stream.read_all()

# Read the whole Arrow IPC *file*-format table through a memory map.
table = pa.ipc.RecordBatchFileReader(pa.memory_map('/home/lyg/data/pubchem/arrow/pubchem_best.arrow')).read_all()

# sort the table by the number of atoms as that will give easier examples first
# also sort by cid so better annotated cids are picked first
# Table.sort_by takes either a single column name or a list of (name, order)
# tuples, so multi-column sorts must spell out the order for every key.
table = table.sort_by([('num_atoms', 'ascending'), ('cid', 'ascending')])


In [3]:
# Morgan fingerprint settings.
radius = 2
nBits = 2048

# Upper bound on the number of table rows that get fingerprinted.
max_records = 6_000_000

# <- minimum distance between cluster centroids. random threshold for morgan radius 2 is 0.65
thresh = 0.65

# Shared fingerprint generator, built once and reused for every molecule.
mfpgen = rdFingerprintGenerator.GetMorganGenerator(fpSize=nBits, radius=radius)

def itertuples(table, chunk_size=1, max_records=None):
    """Yield consecutive slices of ``table`` as column-name -> list dicts.

    Args:
        table: Object exposing ``num_rows`` and slice indexing whose slices
            provide ``to_pydict()`` (e.g. a pyarrow Table).
        chunk_size: Number of rows per yielded dict (the last one may be shorter).
        max_records: Maximum number of rows to yield in total; ``None`` means
            iterate over the whole table.

    Yields:
        The result of ``to_pydict()`` for each slice, in row order.
    """
    # None previously crashed on `i >= None`; treat it as "no limit".
    if max_records is None:
        limit = table.num_rows
    else:
        limit = min(table.num_rows, max_records)

    for start in range(0, limit, chunk_size):
        # Clip the final chunk so no more than `limit` rows are ever yielded.
        stop = min(start + chunk_size, limit)
        yield table[start:stop].to_pydict()


# HDBSCAN clustering on the Jaccard metric (1 - Tanimoto)
def cluster_fingerprints(fps_numpy, min_cluster_size=2):
    """Fit HDBSCAN to ``fps_numpy`` and return ``(labels, clusterer)``.

    ``labels`` is whatever ``fit_predict`` returns for the input; the fitted
    clusterer is handed back as well so callers can inspect it.
    """
    model = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        metric='jaccard',
        core_dist_n_jobs=16,
    )
    return model.fit_predict(fps_numpy), model

# Earlier dense-array variant, kept for reference:
# arr = np.zeros((len(table), nBits), dtype=bool)
# `arr` collects one fingerprint per molecule that parsed; `arr_index` records,
# in parallel, the table row each of those fingerprints came from.
arr = []
arr_index = []

# itertuples is used with its default chunk_size=1, so `i` is the table row
# index and every value in `row` is a one-element list.
for i, row in enumerate(itertuples(table, max_records=max_records)):
    smiles = row['smiles'][0]
    mol = Chem.MolFromSmiles(smiles)
    # Rows for which MolFromSmiles gives a falsy result are skipped, so `arr`
    # can be shorter than the number of rows visited.
    if mol:
        # arr[i] = mfpgen.GetFingerprintAsNumPy(mol)
        arr.append(mfpgen.GetFingerprint(mol))
        arr_index.append(i)
    # Progress report every 10000 rows (printed for skipped rows too).
    if i % 10000 == 0:
        print(i, smiles)

print(f"Finished computing fingerprints, clustering {len(arr)} molecules...")

lp = rdSimDivPickers.LeaderPicker()

# NOTE(review): the picks appear to be positions within `arr`, not table rows;
# translate them through `arr_index` before indexing `table` - confirm against
# the RDKit LeaderPicker documentation.
picks = lp.LazyBitVectorPick(arr,len(arr),thresh)


# HDBSCAN alternative, currently disabled:
# labels, clusterer = cluster_fingerprints(arr)


0 CC(=O)OC(CC(=O)[O-])C[N+](C)(C)C




10000 CC(CCC(=O)NCCS(=O)(=O)O)[C@H]1CC[C@@H]2[C@@]1([C@H](C[C@H]3[C@H]2CC[C@H]4[C@@]3(CC[C@H](C4)O)C)O)C
20000 CC1=C(C(=NO1)C2=C(C=CC=C2Cl)F)C(=O)N[C@H]3[C@@H]4N(C3=O)[C@H](C(S4)(C)C)C(=O)O




30000 CN(C)N=CC(=O)C1=CC=C(C=C1)OC2=CC=CC=C2
40000 C1=CC=C(C=C1)C2=NC3=C(C=CC=C3O2)CC(=O)O
50000 CCSC1=NC2=C(S1)C=C(C=C2)NCN3C(=NN=N3)C4=CC=C(C=C4)[N+](=O)[O-]




60000 C1=CC(=CN=C1)C(=CCCCC(=O)O)C2=CC=C(C=C2)CCNS(=O)(=O)C3=CC=C(C=C3)Cl




70000 CC1=CC(=CC=C1)C(=O)C2CCCCC2




80000 CCCCCC[Pb]CCCCCC




90000 CC1CNCCN1C2=CC=C(C=C2)C




100000 CCCCCCOC(=O)C1C(C2(C(=C(C1(C2(Cl)Cl)Cl)Cl)Cl)Cl)C(=O)O




110000 CC1=CC(=C(C=C1)C(C)C)OCCOCCOCCOCCN2CCCCC2




120000 C1CC1N2C=C(C(=O)C3=CC(=C(C=C32)C4=CC=C(C=C4)O)F)C(=O)O




130000 C1=CC=C2C(=C1)C=[N+](C=C2O)C3=CC=C(C=C3)[N+](=O)[O-]




140000 CC(C)(C)N(CC(COC1=CC=CC2=C1CC(C(C2)O)O)O)N=O




150000 [O-2].[O-2].[O-2].[O-2].[O-2].[O-2].[O-2].[O-2].[O-2].[Fe+3].[Fe+3].[Mo].[Mo]




160000 C1=C(C(=CO1)Cl)Cl




170000 C([C@@H](CO)O)[C@@H]([C@H](C=O)O)O




180000 CCC(C)N.CC(=O)O




190000 COC1=CC(=C(C=C1)OC)N=C(C2=CC=CC=C2)N




200000 C(C(=C(F)F)Cl)Cl
210000 COC1=CC=C(C=C1)C(C#N)(C2CCCCC2)C3CCCCC3
220000 C1=CC=C(C(=C1)CSC2=NC(=NC3=C2N=CN3C4C(C(C(O4)CO)O)O)N)F
230000 CCC(C)N=CC1=C(C=CC(=C1)[N+](=O)[O-])O
240000 CC1(COP(=O)(OC1)NC2=CC=C(C=C2)OC)[N+](=O)[O-]
250000 C1COC2(O1)C3C4C5C3C6(C5C4C26Br)C(=O)O
260000 C1=CC=C(C=C1)P(=CC(=O)CCl)(C2=CC=CC=C2)C3=CC=CC=C3
270000 C1CCC2C(C1)C(C(=C2C3=CC=CC=C3)C4=CC=CC=C4)Cl
280000 C1=CC(=CC(=C1)S(=O)(=O)F)C=CC2=CC=C(C=C2)C(=O)O
290000 CC(=O)N(C1=CC=CC=C1)C2=CC=CC=C2[N+](=O)[O-]




300000 C[Sn](CC1=CC=CC=C1)(CC2=CC=CC=C2)C3=CC=CC=C3
310000 CC1(C2=CC=CC=C2N=C(O1)C3=CC=CC=C3)C
320000 C1=CC(=CC=C1NC(=O)C2=CC(=C(C(=C2)Br)O)Br)Br
330000 CC(C)NC(=O)O[C@H]1CC[C@@]2(C3CC[C@]4(C([C@@H]3CC=C2C1)CCC4=NOC(=O)NC(C)C)C)C
340000 CC1=CC=C(C=C1)N2C(=O)CSC2=C(C#N)C3=NC4=CC=CC=C4N3
350000 CCOC(=O)C=C[C@@H](C(C)(C)C)O[Si](C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=CC=C3
360000 C1C(=O)NC2=CC=CC=C2C(=O)O1
370000 C1=CC=C(C=C1)CCN(CCC2=CC=CC=C2)CC#N
380000 COC(=O)C(CC1=CC=C(C=C1)OC(C2=CC=CC=C2)(C3=CC=CC=C3)C4=CC=CC=C4)N
390000 CC1CC(C(C(C=C(C(C(C=CC=C(C(=O)NC2=CC(=O)C(=C(C1)C2=O)NCCCl)C)OC)OC(=O)N)C)C)O)OC




400000 C[C@H]1[C@H]([C@H](C[C@@H](O1)O[C@H]2C[C@@](CC3=C(C4=C(C5=CC=CC=C5C(=C4C(=C23)O)O)O)O)(C(=O)C)O)N)O




410000 CCC1=CC(=CC(=C1)NC(=O)C2=CC=CC=C2SSC3=CC=CC=C3C(=O)NC4=CC(=CC(=C4)CC)CC)CC


[14:25:49] Explicit valence for atom # 66 O, 2, is greater than permitted


420000 C1=CC=C(C=C1)C[C@@H]2[C@@H]([C@H]([C@H](N(C(=O)N2CC3=CC=CC=C3)CC4=CC5=C(C=C4)NN=C5N)CC6=CC=CC=C6)O)O
430000 CCCC1(OC(=C(C(=O)O1)C(CC)C2=CC(=CC=C2)NS(=O)(=O)C3=NC=C(C=C3)[N+](=O)[O-])O)CCC4=CC=CC=C4
440000 C1=CC=C(C(=C1)CCO)C[N+]2=C3C(=C(N=C2)N)NC=N3
450000 C[C@@H]1CN(CCN1C(=O)C(=O)C2=CNC3=C(C=CC(=C23)F)C(=O)NCCN4CCOCC4)C(=O)C5=CC=CC=C5




460000 CC1(CCCCCCCCC(CC1)(C)C)C
470000 CC(=CCOC(=O)CSC1=CC=CC=C1)C
480000 CSC(=C(C(F)(F)F)C(F)(F)F)F
490000 C[SiH](C)O[Si](C)(C)O[Si](C)(C)O[Si](C)(C)O[Si](C)(C)O[SiH](C)C
500000 CCOC(=O)SSC(=O)NC1=CC=CC=C1
510000 C1CC1CC2=CC=C(C=C2)C#N
520000 CC1CCC2(C(C3C(O2)CC4C3(CCC5C4CCC6C5(CCC(C6)OS(=O)(=O)C7=CC=C(C=C7)C)C)C)C)OC1
530000 CC1=C(C(C(=C)C1)C2=CC=CC=C2)C
540000 C1C(=NNC1=O)C2=CC=C(C=C2)[N+](=O)[O-]
550000 CC1=CC(=CC=C1)C=NC2=C(C=CC(=C2)[N+](=O)[O-])F
560000 CCCCOC(=O)C1=C2C=C(C=CC2=NC(=C1)C)I
570000 CCC=CC[C@@H]1[C@H](CCC1=O)CC(=O)N[C@@H]([C@@H](C)CC)C(=O)O




580000 CC1=C2C(=CC=C1)C=C(C(=O)N2)CN(CC3=CC=CO3)S(=O)(=O)C4=C(C=CC5=NSN=C54)C




590000 C1=CC=C(C=C1)N2C(C3C(O2)C(=O)N(C3=O)C4=CC=C(C=C4)F)C5=CC=CC=C5OCC(=O)O




600000 CC1=CC=CC=C1C(=O)NC2=C(C3=C(S2)CCCC3)C(=O)OC
610000 COC1=C(C=CC(=C1)[C@@H]2N[C@@H](CS2)C(=O)O)OCC(=O)N
620000 CC[C@H](C)OC1=C(C=C(C=C1)/C=C/2\C(=O)NC(=O)N2)Cl
630000 COC1=CC=C(C=C1)/C=C/C(=O)NC(C2=CC=CC=C2)C3=CC=CC=C3
640000 C1=CC=C(C(=C1)C(=O)N)NC(=O)C=CC2=CC=C(C=C2)Cl
650000 C1COCCN1C(=O)CSC2=NNC(=O)NC2=O
660000 C1=CC(=CC(=C1)[N+](=O)[O-])C(=O)NC2=NC=CS2
670000 C1=CC=C(C=C1)CCN[C@@H](CC(=O)NC2=CC(=CC=C2)Cl)C(=O)O
680000 COC(=O)C1=CC=C(C=C1)NC(=O)CC2CCC2
690000 CC(C)C(=O)NC1=C(C=C(C=C1)Cl)C(=O)C2=CC=CC=C2
700000 C1CC[C@@H]2C(=NN([C@@]2(C(F)(F)F)O)C(=O)C3=CC=C(C=C3)O)CC1
710000 CC1=CC=C(C=C1)S(=O)(=O)NC2=CC=CC(=C2)C(=O)NN
720000 CC(C)OC1=CC=CC(=C1)C(=O)NC(=S)NC2=CN=CC=C2
730000 CC(C)(C)C(=O)N1CCC(CC1)C2=NC(=NO2)C3=CC=C(C=C3)F
740000 COC1=CC=C(C=C1)C=NNC2=NN=C(C3=CC=CC=C32)Cl
750000 CC1=C(C=C(C=C1)Cl)NC(=O)CC2=CC=CC3=CC=CC=C32
760000 CCC1=CC=C(C=C1)[C@@H]2C3=C(C=C(C=C3)N)OC(=C2C#N)N
770000 CC1=CC(=CC=C1)C2=NC(=NN2)SCC(=O)N
780000 CCC1=CC=C(C=C1)[C@H]2NC3=CC=CC=C3C(=O)N2C4=CC=CC=C



1180000 C1=CC=C(C=C1)/C=C/C2C3C(C(=O)N(C3=O)C4=CC=C(C=C4)F)C5(O2)C(=O)C6=CC=CC=C6C5=O
1190000 CCOC(=O)C1=CC(=C(C=C1)C)CNC(=O)CSC2=NC(=CS2)C3=CC=C(C=C3)F




1200000 CC(=O)C1=C(C(=CO1)OC)OC




1210000 C1=CC(=CC=C1CN2C3=CC(=C(C=C3C4=C(C2=O)C(=C(C=C4)O)O)O)O)[N+](=O)[O-]
1220000 COC1=C(C(=C(C=C1)/C=N/NC(=O)C2=CC(=C(C(=C2)OC)OC)OC)OC)OC
1230000 C(CNC1=C(C(=C(C(=C1Cl)Cl)Cl)Cl)Cl)N.C(CNC(=S)S)NC1=C(C(=C(C(=C1Cl)Cl)Cl)Cl)Cl




1240000 C1=CC=C(C=C1)/C=C/COC(=O)C(F)(F)F


[14:32:33] Conflicting single bond directions around double bond at index 10.
[14:32:33]   BondStereo set to STEREONONE and single bond directions set to NONE.


1250000 CCCCCC/C=C\CCCCCCCCC(=O)C1=C(C(=CC=C1)OC)O




1260000 CCN\1C2=CC=CC=C2S/C1=C\C(=C(/C#N)\C(=O)OCC)\C
1270000 CCOC1=CC=CC(=C1)C2=NNC(=S)N2/N=C\C3=C(C=CS3)C
1280000 CCN(CC)CCN1[C@@H](/C(=C(/C2=CC(=C(C=C2)OC)C)\O)/C(=O)C1=O)C3=CC(=C(C=C3)OC)OC


[14:32:59] Explicit valence for atom # 0 B, 4, is greater than permitted
[14:32:59] Explicit valence for atom # 0 B, 3, is greater than permitted


1290000 CC\1=NN(C(=O)/C1=C\C2=CC=C(C=C2)[N+](=O)[O-])C(=O)CC3=CC=CC=C3




1300000 C1=CC=C(C=C1)NC(=O)O/N=C/2\[C@@H]([C@H]([C@@H]([C@H](O2)CO)O)O)O




1310000 CC1=CC(=C(C=C1)C)NC(=O)COC2=CC=CC=C2C3=NC(=NO3)C
1320000 CCOC1=CC=C(C=C1)CN2C(=O)C3=CC4=C(N3CC2(C)C(=O)NC5CCC(CC5)C)C=CO4
1330000 CC1CCN(CC1)S(=O)(=O)C2=CC3=C(C=C2)N(C(=O)C3(C)C)CC(=O)NC4=CC=C(C=C4)C(=O)C
1340000 C1=CC=C(C=C1)C2=NN3C(=O)C=C(N=C3S2)CNC(=O)C4=CC=CS4
1350000 CCC1=NC(=NO1)C2=CC(=C(S2)C)S(=O)(=O)NCC(=O)NC3=C(C=C(C=C3)OC)OC
1360000 CC1=CC(=NC2=C1C(=NN2C3=CC=CC=C3)C)OCC(=O)N4CCN(CC4)C5=CC=CC=C5
1370000 CCC1=CC=C(C=C1)C2=C(C=NN2)CN3CCCC3C4=C(ON=C4C)C
1380000 C1CC1(C2=CC=C(C=C2)NC(=O)C3=CC=CS3)C(=O)NCCC4=CC=CC=C4
1390000 CC1=CC(=CC=C1)C2=NOC(=N2)C3=CN(C4=C(C3=O)C=CC(=N4)C)CC(=O)NC5=C(C=CC(=C5)C)C
1400000 CC1=C(C=CC=C1NC(=O)C)C(=O)NC2=CC=C(C=C2)OCC3=NN=C(O3)C4=CC=C(C=C4)OC
1410000 C1=CC=C(C=C1)N2C=CC(=C2)/C=N\NC(=O)C3=CC=CC=C3Br
1420000 CCC1=CC=CC=C1N2C(=O)/C(=C\C3=CC4=C(C=C3F)N(C(C=C4C)(C)C)C)/C(=O)NC2=S
1430000 CN(C)C1=CC=C(C=C1)C(=O)OC2CCC3=CC=CC=C3C2=O




1440000 CC(C)COC(=O)C1=CC=CC(=C1)C2=CC=C(O2)C=NC3=CC=C(C=C3)NC(=O)C
1450000 C#[C-].[Zr]
1460000 CC1(N([C@H](CO1)/C=C\C2=C(C=C(C=C2)OC)[N+]#[C-])C(=O)OC(C)(C)C)C




1470000 CN1C2CCC1[C@H]([C@H](C2)C3=CC=C(C=C3)Cl)C(=O)NC4=NC(=CC=C4)F




1480000 CCC1=CC=C(C=C1)C(=O)NC2CCN(CC2)C(=O)C3CC(=O)N(C3)C4=CC=C(C=C4)F




1490000 C1COCCN1C2=C(C(=O)C3=CC=CC=C3C2=O)C(=O)O
1500000 CC1=C(C(=CC=C1)NC(=O)CNC(C)CCC2=CC=CO2)C
1510000 COC1=CC=CC=C1NC(=O)N2CCCN3C(=CC(=N3)C(=O)NC4CC4)C2
1520000 CC(C)OCCCNC(=O)C1=CC(=CC=C1)NCC2=CC=C(C=C2)F
1530000 CCOC1=CC=CC=C1C2=NC=CN2CC3=CC=CC=C3C




1540000 CCN1C2=C(C=C(C=C2)CN(CCC3=CN=CC=C3)C(=O)CC4=CN(C5=CC=CC=C54)C)N(C(=O)C(C1=O)(C)C)C
1550000 CC1=CC=C(C=C1)N2C(=S)N(C(=O)C2(C)C)C3=C(C(=C(C=C3)C#N)Cl)F




1560000 C1CC2CC3=NC4=C(N3CCN2C1)C=CC(=C4)C5=CC6=NC=CN6C=C5
1570000 CCOC1=CC=C(C=C1)NC(=O)NC2=C(C=CC(=C2)Cl)C




1580000 CC1=C(C(C2=C(N1)CCCC2=O)C3=C(C(=CC=C3)OC)OC)C(=O)OC.O




1590000 CC1=NN(C(=C1CN(CC2CCCO2)C(=O)C3=CC=C(C=C3)C(F)(F)F)OC4=CC=C(C=C4)Cl)C5=CC=CC=C5
1600000 CCOCCCNC(=O)NC1=CC(=NN1C2=CC=CC=C2)C
1610000 CC1CC1C(=O)NC2=CC(=C(C=C2)F)NC(=O)OC(C)(C)C
1620000 CCOC(C)C(=O)N1CCC(CC1)NC(=O)C2=C(N=CS2)C
1630000 C1CC2CC1CC2C(=O)NC3=NC=C(S3)CC4=CC=CC=C4
1640000 CC1=C(C=CC(=C1)NC(=O)C(C)SC2=NN=CS2)N3CCN(CC3)C
1650000 CC1=NN(C(=C1CNC(C)C2=CC3=C(C=C2)NC(=O)CC3)OC)C
1660000 COC1=C(C=C(C=C1)CNC(=O)CCC(=O)C2=CC3=C(C=C2)OCCO3)OC
1670000 CCN(CC(=O)NC1=CC2=C(C=C1)OCCO2)C(=O)C3=CC(=C(C=C3)C)C
1680000 CC(C)N(C1=CC=CC=C1)C(=O)CSC2=NNC(=O)N2C
1690000 CCC1=C(C=C(S1)C(=O)NC2=CC=NC=C2)C
1700000 CCCNC(=O)CSC1=NN=C(S1)SCC2=NC=C(N2C)Cl
1710000 C1CC1N(CC2=CC=CC=N2)C(=O)C3=CC=C(O3)C4=CC=C(C=C4)F
1720000 CN1C=CN=C1C(C2=CC=CC=C2)NC(=O)CCCOC3=CC=CC=C3
1730000 C1CC2=CC=CC=C2C1NC(=O)CNC(=O)CC3=CC=CC4=CC=CC=C43
1740000 C1CCC2=NN=C(N2CC1)CCNC(=O)C3=CC(=CC=C3)OC4=CC=CC=C4
1750000 CC1=C(C(=CC=C1)C)NC(=O)CN(C)C(=O)C(C)OC2=CC=C(C=C2)OC
1760000 CC1=CC=C(C=C1)S(=O)(=O)N(CC(=O)NC2=CC=C(C=C2)



1770000 CC(C)(C)C[C@@H]1CN([C@@H]([C@@]1(C#N)C2=C(C=C(C=C2)Cl)F)C3=C(C(=CC=C3)Cl)F)C(=O)NCC4=CC=C(C=C4)F




1780000 CCCN1C(=O)C2=CC=CC=C2N=C1SCC3=CC(=CC(=C3)F)F
1790000 CCN1C=C(C=N1)C2=NC3=CC=CC=C3C(=C2)C(=O)N4CCC(=O)[C@H]5[C@@H]4CCCC5
1800000 CC1(CC2=C([C@@H](C3=C(N2)C4=CC=CC=C4C3=O)C5=CC=C(C=C5)OC(=O)C6=CC=C(C=C6)OC)C(=O)C1)C
1810000 CC1=CC=C(C=C1)CN2C(=O)[C@@]3(CCOC4=CC=CC=C43)NC2=O
1820000 CCOC(=O)C1=NN([C@@H]2[C@H]1C(=O)N(C2=O)C3=CC=C(C=C3)OC)C4=CC=C(C=C4)Cl
1830000 CCOC(=O)C1=C(OC2=C([C@H]1C3=CN=CC=C3)C(=O)N(C(=C2)C)C)N
1840000 COC1=CC=CC(=C1)N2C[C@@H](CC2=O)NC(=O)COC3=CC=CC=N3
1850000 C1[C@H](C(=O)NC2=CC=CC=C21)CCC(=O)NCCC3=CC(=CC=C3)F
1860000 CCOC(=O)[C@H]1[C@@H](NC(=O)N[C@@]1(C(F)(F)F)O)C2=CC=C(O2)C3=CC=C(C=C3)Br
1870000 COC1=CC=C(C=C1)[C@@H](CNC2=C(N=C(O2)C3=CN=CC=C3)C#N)N4CCOCC4
1880000 C[C@@H]1CCC[C@H](C12C(=O)N(C(=O)N2)CN(CC(C)C)CC(F)(F)F)C
1890000 CCC1=CC=C(C=C1)NC(=O)N2CCC[C@@H]2C3=CC=C(C=C3)C
1900000 C1CCC2=C(C1)C(=NC(=O)N2C[C@@H]3CCCO3)SCC4=CN5C=CC=CC5=N4
1910000 CC[C@@H](C)C1=CC=C(C=C1)S(=O)(=O)N(C)C2=CC=CC=N2
1920000 C[C@@H]1CN(C[C@H](O1)C)C(=O)C2CCN(CC2)C(=O)CC3=CC=CC=C3C



2200000 CC1CC2=CC=CC=C2N1C(=O)C3=COC(=N3)NC(=O)C4=CC=CO4
2210000 CC1=C(C=C(C=C1)C2=NN3C(=NN=C3C=C2)C)S(=O)(=O)N4CCOCC4
2220000 C1CCC(C1)NC2=CC(=NC=N2)N3C=CC=N3
2230000 CC1=CC2=C(C=C1)N3C=CC=C3C4(N2)CCN(CC4)S(=O)(=O)C5=CC=CC(=C5)F
2240000 CC1CCN(CC1)C(=O)C2=NC3=CC=CC=C3C(=C2)NC4=CC(=CC=C4)SC
2250000 CC1=C(C=C(C=C1)NC(=O)CCC2=NC(=NO2)C3=CN=CC=C3)Cl
2260000 C1CN(CCC12C3=CC=CN3C4=C(N2)C=CC(=C4)Cl)S(=O)(=O)C5=CC(=C(C=C5)F)F
2270000 CC1C(=O)NC2=C(O1)C=C(C(=C2)C)S(=O)(=O)N3CCCC3C(=O)N(C)C4CCCCC4
2280000 CC1=CC=C(C=C1)CNS(=O)(=O)C2=C(SC(=C2)C3=CC(=NO3)C)C
2290000 CC(C)CC1=CC(=NO1)C(=O)N2CC(C2)OCC3=CC(=CC=C3)F
2300000 COC1=CC=CC=C1C(=O)NCC2=NN=C(S2)C(=O)NC3=CC(=CC=C3)F
2310000 COC1=CC=C(C=C1)C(=O)NC2=CC=C(C=C2)OCC3=CC(=NO3)C(=O)NCC4=CC=CC=C4
2320000 CC1(CC2=CC=CC=C2C(=O)N1CC3=CC(=CC=C3)Cl)C(=O)NCC4=CC=CC=C4OC
2330000 CC(C)OC1=CC=C(C=C1)C2=NOC(=N2)CN3C4=C(C(=O)N(C3=O)C5=CC(=CC=C5)F)SC=C4
2340000 C1=CC=C(C=C1)NC(=O)CN2C(=O)C=CC=C2C3=NC(=NO3)C4=CC=CC=C4F
2350000 C1CN(CCC1C2=NC(=NO2)C3=CC(=CC=C3)F)



2430000 C1COC(C2=CSC=C21)CN




2440000 CC(C)(C(=O)[O-])SC1=CC(=NC(=N1)N)Cl
2450000 C1=CC=C(C=C1)NC2=NC=NC(=C2)NCCO




2460000 C1C(NC(S1)C2=C(OC3=C(C2=O)C=C(C=C3)O)N)C(=O)O
2470000 CCOC1=CC=CC=C1N(CC(=O)NCC2=CC(=CC=C2)OC)S(=O)(=O)C3=CC=C(C=C3)SC
2480000 CN1CCN(CC1)C(=O)C2=C(ON=[N+]2C3=CC=C(C=C3)OC)[O-]


[14:51:25] Explicit valence for atom # 15 O, 2, is greater than permitted


2490000 CCCCN1C(=NN=N1)C2(CCC(=O)N2CC3CCCCC3)C




2500000 CCC1=CC(=CN=C1)C2=CC[C@@H]3[C@@]2(CC[C@H]4[C@H]3CC[C@@H]5[C@@]4(CCOC(=O)C5)C)C




2510000 C[N+](C)(C)NP(=N[N+](C)(C)C)(O)OC1=CC=CC=C1.[I-].[I-]
2520000 CCOC1=CC=CC(=C1)CNC(=O)C2=C(N(N=N2)C3=CC(=CC=C3)OC)C4=CC=NC=C4
2530000 CN1C[C@@H]2C[C@@H](C1)[C@@H]3CCCCN3C2




2540000 C[C@](CO)(C1=CC=C(C=C1)C(=O)NC2=NC=C3C(=C2)N(C=C3Br)C4CC4)O




2550000 C1CN(C(=O)N1)C2=CC=C(C=C2)S(=O)(=O)OC3=CC=CC=C3I
2560000 CCOC(=O)C1CCN(CC1)C2=NC=NC3=C2CN(CC3)C(=O)C4=CC=C(C=C4)N5C=CC=N5
2570000 CC1=CC=C(C=C1)S(=O)(=O)N2C[C@H](CC[C@H]2C3=CC=CC=C3)C(=O)C




2580000 CC(C(=O)NC1=NC(=CS1)C(=NOC)C(=O)NC2C3N(C2=O)C(=CCS3)C(=O)OCOC(=O)C(C)(C)C)N




2590000 C1=CC=C(C=C1)C2=NC=CC(=C2)N.Cl
2600000 COC(=O)C1=CC(=C(C=C1)N2CCCC2=O)F
2610000 CC1=C(C=C(C=C1)Cl)N(C2=C(C=CC(=C2)Cl)C)C(=O)C3=CC(=C(C=C3)O)C(=O)N




2620000 C1=CC=C(C=C1)S(=O)(=O)N2C=CC3=C2C=CC(=C3)/C=C/C(=O)O
2630000 CC(C)C/C(=C/N=C(C1=CC=CC=C1)C2=CC=CC=C2NC(=O)C3=CC=CC=N3)/C(=O)O




2640000 CC1=CC=CC=C1[PH+](C2=CC=CC=C2C)C3=CC=CC=C3[CH2-].CC1=CC=CC=C1[PH+](C2=CC=CC=C2C)C3=CC=CC=C3[CH2-].CC(=O)O.CC(=O)O.[Pd].[Pd]




2650000 CC1=CC(=NN=C1NCC(C)(C)C2=CC=C(C=C2)F)C3=CC(=CC=C3)C(=O)N
2660000 CC(C)C1=NN(C(=C1CNCC2=C(N=CC=C2)N(C)C)Cl)C.Cl
2670000 CC1=NN(CC1)C2=CC=C(C=C2)C(=O)OCC3=CC4=C(C=C3)OCO4
2680000 CC(CNC(=O)NC1=CC=CC(=C1)CN2C=CC=NC2=O)N3CCC4=CC=CC=C43
2690000 CC1=C(OC2=CC=CC=C12)C(=O)N/N=C/C=C/C3=CC=CC=C3[N+](=O)[O-]
2700000 COC1=CC=CC(=C1)N2CCC(C2)CNC3=NC=NC4=C3NC=N4
2710000 CCN(CCNC(=O)C1=NN(C=C1)C2=CC=CC=C2)C3CC3
2720000 CCC(=O)NC(C1=CC=CC=C1)C(=O)NC2=CC=C(C=C2)C3=CC=NN3
2730000 CCN(CC)C(=O)C1CCCN1C(=O)C2=CC=C(C=C2)F
2740000 CCN1CCCC1C2CCCN2C(=O)C3CCC(CC3)N4C=CC=C4
2750000 COCCN1CCCC(C1)NC(=O)C(=O)NC2=C(C(=CC=C2)F)F
2760000 C1CC(CN(C1)C(=O)CNC2=CC=CC=C2[N+](=O)[O-])C(=O)N
2770000 CC(C(=O)NCC1=CC2=C(C=C1)OCO2)N3CCC4=C(C3)N=CN4C




2780000 [CH2]CC#CC=C




2790000 CC(=CC(=O)OC)OP(=O)(O)O
2800000 CC(C)(C)[Si](C)(C)C1=CC=CC=C1C(=O)O
2810000 C(CCOCCCCO)CN
2820000 C1CN(CCN1CCCCC(=O)C2=CC=CC=C2)N=C(C#N)NO
2830000 CCCCCCC(C1=CN=CC=C1)N=O
2840000 C1=C(C2=NON=C2C(=C1)[N+](=O)[O-])CCCCCC(CCN)(N)N
2850000 CCCCCCC1(CCC(CC1)C2CCC(CC2)C3=CC=C(C=C3)C#N)C
2860000 CN1C=C(C2=CC=CC=C21)C3C(=O)C(=O)C(C(=O)C3=O)C4=CC=CC=C4
2870000 C[C@H]1[C@H]2[C@H](C=C3CCCC[C@@H]3[C@H]2C#C)C(=O)O1
2880000 C1=CC=C(C=C1)C=CC2=NC=CC(=N2)C#N
2890000 CC(C1C(N(C1=O)C(C2=CC=C(C=C2)OC)C3=CC=C(C=C3)OC)CC(=O)C(=[N+]=[N-])C(=O)OCC4=CC=C(C=C4)[N+](=O)[O-])OC(=O)OCC5=CC=C(C=C5)[N+](=O)[O-]
2900000 C1=CC=C(C=C1)C(=O)NC2=C3C(=NC=N2)N(C=N3)[C@H]4[C@@]([C@@H]([C@H](O4)CO)O)(O)F
2910000 CN(C)CCCSC(=S)N
2920000 CS(=O)(=O)C1=NC=C(C(=N1)CN)C2=CC=CC=C2Cl
2930000 CC1=CC(=C2C(=N1)C=CC=N2)C(=O)O
2940000 CCCCCCC[C@H]1CCC([C@@H](C1)C)C2CCC(CC2)C#N
2950000 CN1C(=O)N(C(=O)N(C1=O)C2=C(C=CC(=C2)OC3=C(C=C(C=C3)C(F)(F)F)Cl)Cl)CC(=O)OC
2960000 C1C[C@H]([C@H](C1)O)NC2=C3C(=NC=N2)N(C=N3)[C@H]4[C@@H]([C@@H]([



3510000 C1C(N(N=C1C2=CN=CC=C2)C(=O)C3=CC=C(S3)C4=CC=C(C=C4)CO)C5=CC=CC=C5O
3520000 C1=CC(=CC=C1CN=CC2=CC=NC=C2)C(=O)O




3530000 CCOC(=O)C1=CC=CC=C1C(=O)S.[Na+]




3540000 C[C@@H]1CN(S(=O)(=O)C2=C(C=C(C=C2)C#CCC3CCCC3)O[C@H]1CN(C)S(=O)(=O)C4=CC=CC=C4OC)[C@@H](C)CO
3550000 C[C@@H]1CN(S(=O)(=O)C2=C(C=C(C=C2)C#C[C@H](C)O)O[C@H]1CN(C)C(=O)C3CCCCC3)[C@@H](C)CO
3560000 CN1CCN(CC1)C(=O)C[C@@H]2CC[C@@H]([C@H](O2)CO)NC(=O)C3=CC4=C(C=C3)OCO4
3570000 C[C@H](CN([C@@H](C)CO)C(=O)NC1=CC=C(C=C1)F)[C@@H](CN(C)CC2=CC3=C(C=C2)OCO3)OC
3580000 C1CCC(CC1)CNC(=O)C[C@H]2CC[C@H]3[C@H](O2)COC[C@@H](CN3C(=O)NC4=CC=C(C=C4)C(F)(F)F)O
3590000 CN1C=C(N=C1)S(=O)(=O)N2CCCCN3[C@@H](C2)[C@@H]([C@H]3CO)C4=CC=C(C=C4)C5=CC=C(C=C5)F
3600000 CC(C)CCN1C2=CC=CC=C2C(=C(C1=O)C(=O)N/N=C/C3=CC(=CC=C3)F)O




3610000 CN1C2=CC=CC=C2C(=C(C1=O)CC3=CC=C(C=C3)C4=CC5=CC=CC=C5C=C4)O
3620000 CCN1C2=CC=CC=C2C(=C(C1=O)C(=O)C=CC3=C(C=C(C=C3)O)O)O




3630000 CN1C2=CC=CC=C2C(=C(C1=O)C(=O)N(C)C3=CC=C(C=C3)OCC(=O)OC)O




3640000 CN(C)[C@H]1[C@@H]2C[C@@H]3CC4=C(C(=CN=C4N(C)C)O)C(=C3C(=O)[C@@]2(C(=C(C1=O)C(=O)N)O)O)O




3650000 CCOC(=O)C1=C([C@@H]([C@@H](N[C@@H]1C2=CC=C(C=C2)Cl)C3=CC=C(C=C3)Cl)SC4=CC=C(C=C4)Cl)O




3660000 CC(=C)[C@@H]1CC[C@]2([C@H]1[C@H]3CC[C@H]4[C@]([C@@]3(CC2)C)(CC[C@@H]5[C@@]4(CC=C(C5(C)C)C6=CC=C(C=C6)C7=NNN=N7)C)C)CO
3670000 CC1=CC(=C(C=C1)/C=C(\C2=CC=CC=C2[N+](=O)[O-])/C(=O)OC)Br
3680000 CCCCN1C(=O)C2=CC=CC=C2NC1(C)C3=CC=CC=C3OCCOC4=CC=CC=C4
3690000 C1=CC(=CC(=C1)Br)C(=O)NC2=CC=C(C=C2)C(=O)NCC3=CC=CO3
3700000 CCC(CNC1=CC(=CC=C1)OCC)OC2=CC=CC=C2C
3710000 CCCCCCCOC1=CC=C(C=C1)NC(=O)CNC2=CC=CC(=C2)C
3720000 CCCCCCOC1=CC=C(C=C1)NCC(=O)NC2=CC=CC(=C2)C(=O)NCCCOC
3730000 CC1CCN(CC1)C(=O)C2=CC=C(C=C2)NCC(=O)N3CCCCCC3
3740000 CC(=O)NC1=CC=C(C=C1)NC(=O)CNC2=CC=CC(=C2)C(=O)N3CCCC3
3750000 CCC1=NC(=NC(=C1C(=O)N)C)N
3760000 CCC(C)CN(C)C1CC2=CC=CC=C2NC1
3770000 CCN(CC)CCN(CC(C)C)C(=O)CC(C)N
3780000 CC1=NC2=C(O1)C=CC(=C2)NC(C)C3=CC=NN3
3790000 C1CC1CNC2(CCC3=CC=CC=C3C2)C#N
3800000 CCN(CC(=O)OCC)C(=O)C1=C(C=CC=C1Cl)Cl
3810000 CC1CCN(CC1)C(=O)CSC2=CC=C(C=C2)O
3820000 CC(C1=CC2=C(C=C1)OCCO2)NC3=NN(C=C3)C
3830000 C1=CC(=CC=C1CCN)OCC2=CC=C(C=C2)C#N
3840000 CN(CC1=CC=C(C=C1)Cl)C(=O)C2CC2C(=O)O




3960000 C1=CC=C(C=C1)C(=NC2=CC=C(C=C2)[N+](=O)[O-])NNC3=CC=C(C=C3)[N+](=O)[O-]
3970000 C1=CC=C2C(=C1)N=C(S2)N/N=C\C3=CC=C(C=C3)F
3980000 CC1CCC2=C(C1)SC(=C2C(=O)N)NC(=O)/C=C/C3=CC=C(O3)C




3990000 CC(C)C(C(=O)O)NC(=O)/C(=C\C1=CC=C(C=C1)Br)/NC(=O)C2=CC=CC=C2
4000000 C1=CC=C(C=C1)C(=O)/C=C(/C(=O)O)\NC2=CC=C(C=C2)[N+](=O)[O-]




4010000 CC1=CC=C(C=C1)S(=O)(=O)OC2=CC=CC(=C2)/C=N\NC(=O)C3=CC=CC4=CC=CC=C43
4020000 CCCOC1=C(C=C(C=C1)/C=N\NC(=O)C(C(C)C)NC(=O)C2=C(C=C(C=C2)Cl)Cl)OCC
4030000 CC1CC2=C(O1)C=CC(=C2)/C(=C\3/C(N(C(=O)C3=O)CCC4=CC=CC=C4)C5=CC(=C(C=C5)O)OC)/O
4040000 CN1C(=O)/C(=C\C2=C(C(=CC(=C2)OC)Br)O)/SC1=S
4050000 COC1=CC=C(C=C1)NS(=O)(=O)C2=CC(=C(C=C2)N/N=C\C3=CC=CC=C3F)[N+](=O)[O-]
4060000 CCC1=CC=CC=C1N=C2N(C(=O)/C(=C\C3=CC=C(C=C3)O)/S2)CC
4070000 CCC(C(=O)O)N1C(=O)/C(=C\C2=CC(=C(C=C2)O)OCC)/SC1=S
4080000 CC1=CC=C(C=C1)NC(=O)C2=C(N(C3=NC4=CC=CC=C4N=C23)/N=C\C5=CC=CC(=C5)C)N
4090000 CC1=CC=CC=C1CN2C=C(C3=CC=CC=C32)/C=C\4/C(=N)N5C(=NC4=O)SC(=N5)C
4100000 COC(=O)C1=CC=C(O1)CN2C(=O)/C(=C\C3=CC=CN3C4=CC=CC=C4F)/SC2=O
4110000 C1=CC=C2C(=C1)C=CC=C2NCC(=O)N/N=C\C=C\C3=CC=CO3
4120000 CC(C)CN(C1CC1)C2=CC=C(C=C2)C(=S)N
4130000 CCN(CC(C1=C(C=CC(=C1)C)C)N)CC(F)(F)F
4140000 CCCCCCCCCCNCC1CCCO1
4150000 C1COCCC1S(=O)(=O)NC2=C(C=CC(=C2)N)Cl
4160000 CC(C)C(C)N(C)C(=O)C1=C(C=CC(=C1)N)[N+](=O)[O-]
4170000 CCCNCC(C)(C)CN



4330000 CN(CC1=CC=C(S1)C(=O)NN)CC2=CSC=N2
4340000 C1CC2CC1CC2C(CC3=CC=C(C=C3)Br)N
4350000 CC(C)(C)NCC1=NN=NN1CC2CC3CCC2C3
4360000 CNS(=O)(=O)C1=CC=C(C=C1)NCCOCCN
4370000 CC1=C(C(=C(C=C1)F)[C@@H]2CCCN2)F
4380000 C1CCN(CC1)C2=CC=C(O2)[C@H]3CN3
4390000 CCC1=C(C(=CC=C1)Cl)C(CC)N
4400000 CCC(C1=CC(=C(C=C1)C)O)N
4410000 CC1=C(OC(=C1)C[C@H](C(=O)O)N)C
4420000 C=CCNCCCCl
4430000 COC1=CC=CC(=C1)C2=NOC(=N2)COC(=O)C3=C(C4=C(C=C3)C(=O)C5=CC=CC=C5C4=O)Cl
4440000 CN1C(=C(C(=O)N(C1=O)C)C(=O)CSC2=NC3=CC=CC=C3C(=O)N2C4CCCCC4)N
4450000 CC1=CC(=C(C(=C1C)S(=O)(=O)N2CCC(CC2)C(=O)NC3=C(C4=C(S3)CCCC4)C#N)C)C
4460000 CCN(CC(=O)NC(C)(C)C)C(=O)C1CCN(CC1)S(=O)(=O)C2=CC=C(C=C2)OCC
4470000 C1=CC=C(C=C1)CN(CC2=CC=CO2)CC(=O)NC3=C(C=C(C=C3Cl)Cl)Cl
4480000 CC1=C(SC=C1)CN(C)C(=O)C2=CC(=C(C=C2)Cl)S(=O)(=O)NCC3CCCO3
4490000 CCC1=NC2=CC=CC=C2N1CC(=O)OCC(=O)NC3=C(C=C(C=C3)Cl)C(F)(F)F
4500000 COC1=C(C=C(C=C1)Cl)C2=NC(=NN2)SCC(=O)NC(=O)NCCC3=CC=CS3
4510000 CC(C(=O)C1=CC=C(C=C1)NC(=O)C(C)(C)C)OC(=O)C2=CSC(=N2)C3=CC=CC=C3
4520



5710000 C1=CC(=C(C=C1N)C#N)N2C(=O)C3=CC4=C(C=C3C2=O)C(=O)N(C4=O)C5=C(C=C(C=C5)N)C#N
5720000 CC(=O)NC1=CC2=C(C=C1)N3C(=NN=C3N(C2=O)CC=CC4=CC=CC=C4)N5CCCC5
5730000 CC(C)C[C@@H](C(=O)C1=C(C=C(C2=CC=CC=C21)OC)C(=O)N)N
5740000 CN1C(=O)C2=NC3=CC=CC=C3N=C2N(C1=O)CC(=O)NC[C@@H]4[C@@H]5[C@@H]([C@H]([C@H](O4)O[C@@H]6[C@H](O[C@@H]([C@@H]([C@H]6O)O)O[C@@H]7[C@H](O[C@@H]([C@@H]([C@H]7O)O)O[C@@H]8[C@H](O[C@@H]([C@@H]([C@H]8O)O)O[C@@H]9[C@H](O[C@@H]([C@@H]([C@H]9O)O)O[C@@H]1[C@H](O[C@H](O5)[C@@H]([C@H]1O)O)CO)CO)CO)CO)CO)O)O
5750000 COC1=C(C=C(C=C1)C2=C(COC2=O)OCCN3CCCCC3)OC
5760000 C[C@H]1CN([C@@H](CN1C(=O)C2=CC(=CC=C2)C(F)(F)F)C)C[C@]3(CC[C@]4([C@H](C3)CC[C@@H]5[C@@H]4CC[C@]6([C@H]5CCC6=O)C)C)O
5770000 CC[N+]1=CC=C(C=C1)/C=C\2/C=CC3=CC=CC=C3N2C
5780000 CC1=C(SC=C1)CN(C)CC2=C(OC(=N2)C3=C(C=C(C=C3)OC)OC)C
5790000 CCN1C2=CC=CC=C2N=C1CCNC3CCC4(CC3)CCNCC4
5800000 C1CN(C[C@H]([C@@H]1C2=CC3=CC=CC=C3C=C2)O)C(=O)C4=CC=C(C=C4)CN
5810000 CC1=NC(=C(C=C1)C(=O)O)N2CCC(CC2)(CCOC3=CC=CC=C3)CO
5820000 CC1=C(C2=C(N1



5840000 CC1=CC=C(C=C1)C2=C(N=NN2CC3=CC=C(C=C3)Br)C(=O)O




5850000 CC1=CC(=NN1CC2CCCN2C(=O)C3=CC=CC(=C3)C(=O)N)C
5860000 C1CC(CN(C1)C2=NC=C(C=C2)C(F)(F)F)C3OCCO3
5870000 COC1=CC(=C(C=C1)NS(=O)(=O)C2=CC(=C(C=C2)OC)C(=O)O)Cl
5880000 CC(C1=CC=CC=C1OC)NC(=O)C2=CC3=CC=CC=C3C(=O)N2
5890000 CC1=CC(=C(S1)C)CS(=O)(=O)C(C)C2=NC(=NO2)C(C)C
5900000 C1CNCCC1CCOC2=CC(=CC=C2)Br
5910000 CCOCC(=O)N[C@H]1CCCN(C1)CC2=CC(=C(C=C2)F)N(C=O)NC3=CN=C(C=C3)C




5920000 C1=CC=C(C=C1)C2=C(N=C(N=C2Cl)C3=CC=CC=C3)NCCCCN4C=CN=C4[N+](=O)[O-]
5930000 C1CN(CCC1(CC2=CC=CC=C2C(F)(F)F)C(=O)O)CC3=CC=CO3
5940000 C1CN(CC(N1)C(=O)O)C(=O)C2=C(C=CC(=C2)N3C=CC=N3)Cl
5950000 C1CCN(C1)C(=O)CCN2CCCC(C2)CCC3=CC=C(C=C3)F
5960000 C1CN2C=C(N=C2CN1)C(=O)NCCC3=CSC(=N3)C4=NC=CN=C4
5970000 CN1C=CC2=C(C=CN=C21)C3=CC(=CC=C3)O
5980000 COCC1CCN(CC1)S(=O)(=O)N2CCCCC2CCC3=CC=CC=C3
5990000 CN1CCC(CC1)NC(=O)COC2=CC=C(C=C2)N(C)S(=O)(=O)C




Finished computing fingerprints, clustering 5999996 molecules...


In [None]:
# with open('cluster_labels.txt', 'w') as f:
#     for i in picks:
#         smiles = table['smiles'][i].as_py()
#         iupac = table['iupac'][i].as_py()
#         cid = table['cid'][i].as_py()
#         formula = table['formula'][i].as_py()
#         num_atoms = table['num_atoms'][i].as_py()
#         # f.write(f"{smiles}\t{iupac}\t{labels[i]}\n")
#         f.write(f"{cid}\t{smiles}\t{iupac}\t{formula}\t{num_atoms}\n")

def convert_string_view_to_string(table):
    """Return ``table`` with every ``string_view`` column rebuilt as ``pa.string``.

    Columns of any other type are left untouched; column names and positions
    are preserved.
    """
    for field in table.schema:
        if not pa.types.is_string_view(field.type):
            continue
        # Round-trip through Python objects to materialise a plain string array.
        position = table.schema.get_field_index(field.name)
        as_strings = pa.array(table[field.name].to_pylist(), type=pa.string())
        table = table.set_column(position, field.name, as_strings)
    return table

# `picks` holds positions in `arr` (the list of successfully parsed molecules),
# not table rows. Rows whose SMILES failed to parse were skipped when building
# `arr`, so every pick must be translated back through `arr_index`; otherwise
# all picks after the first skipped row select the wrong molecule.
picked_rows = [arr_index[p] for p in picks]

# Explicit int64 type keeps take() valid even when nothing was picked.
table = table.take(pa.array(picked_rows, type=pa.int64()))

# huggingface only supports string, not string_view
table = convert_string_view_to_string(table)

with pa.OSFile('pubchem_best_cluster.arrow', 'wb') as sink:
    with pa.RecordBatchFileWriter(sink, table.schema) as writer:
        writer.write_table(table)
print(f"Finished writing {len(picks)} picks to pubchem_best_cluster.arrow")