In [8]:
from pathlib import Path
from collections import defaultdict
from itertools import combinations
import dask.dataframe as dd
import dask
import pandas as pd

In [9]:
# Locate every parquet file of the HAC_9 export and group the paths by the
# database id embedded in the file name.
out_path = Path("../../Molecular_database/HAC_9")
pa = list(out_path.glob("*.parquet"))

clas = defaultdict(list)
for p in pa:
    # The id is the second "_"-separated field of the stem with any leading or
    # trailing "d"/"b" characters removed (str.strip takes a character set).
    fields = p.stem.split("_")
    db = fields[1].strip("db")
    clas[db].append(p)

# Every unordered pair of database ids, in the order the ids were first seen.
comb = list(combinations(clas, 2))

In [20]:
# Show the database ids that were parsed from the parquet file names.
clas.keys()

dict_keys(['003', '012', '004', '008', '007', '011', '005', '009'])

In [10]:
# Start a Dask distributed client with default settings; with no arguments
# this spins up a local cluster (see the cluster summary rendered below).
# NOTE(review): the import lives in this cell rather than with the other
# imports at the top of the notebook.
from dask.distributed import Client
client = Client()
# Bare expression so the notebook renders the client/cluster summary.
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 33011 instead


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:33011/status,

0,1
Dashboard: http://127.0.0.1:33011/status,Workers: 4
Total threads: 8,Total memory: 31.08 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:39623,Workers: 0
Dashboard: http://127.0.0.1:33011/status,Total threads: 0
Started: Just now,Total memory: 0 B

0,1
Comm: tcp://127.0.0.1:36721,Total threads: 2
Dashboard: http://127.0.0.1:45507/status,Memory: 7.77 GiB
Nanny: tcp://127.0.0.1:36129,
Local directory: /tmp/dask-scratch-space/worker-j4z0x0sl,Local directory: /tmp/dask-scratch-space/worker-j4z0x0sl

0,1
Comm: tcp://127.0.0.1:32929,Total threads: 2
Dashboard: http://127.0.0.1:42077/status,Memory: 7.77 GiB
Nanny: tcp://127.0.0.1:34077,
Local directory: /tmp/dask-scratch-space/worker-7toodvd9,Local directory: /tmp/dask-scratch-space/worker-7toodvd9

0,1
Comm: tcp://127.0.0.1:34279,Total threads: 2
Dashboard: http://127.0.0.1:42485/status,Memory: 7.77 GiB
Nanny: tcp://127.0.0.1:45301,
Local directory: /tmp/dask-scratch-space/worker-xv7ev2w3,Local directory: /tmp/dask-scratch-space/worker-xv7ev2w3

0,1
Comm: tcp://127.0.0.1:36325,Total threads: 2
Dashboard: http://127.0.0.1:35011/status,Memory: 7.77 GiB
Nanny: tcp://127.0.0.1:44955,
Local directory: /tmp/dask-scratch-space/worker-fgoehb8y,Local directory: /tmp/dask-scratch-space/worker-fgoehb8y




In [11]:

# Column holding the molecule identifiers in every parquet file.
smiles_col = "SMILES"

# NOTE(review): `previous` is not read by any cell visible here; kept in case
# another cell relies on it.
previous = []

dedup_dfs = {}      # db id -> lazy frame of distinct SMILES
lazy_results = []   # one lazy row count per database, in clas order

# Only build the task graphs here; nothing is computed inside the loop.
for db_id, path in clas.items():
    print(db_id)
    df_dedup = dd.read_parquet(path, columns=[smiles_col]).drop_duplicates(
        subset=[smiles_col]
    )
    dedup_dfs[db_id] = df_dedup
    # Row count of the deduplicated frame: per-partition lengths, summed.
    lazy_results.append(df_dedup.map_partitions(len).sum())

# Evaluate every unique-count in one batch so Dask can schedule them together.
computed_values = dask.compute(*lazy_results)

# dask.compute returns results in argument order, which matches clas order.
counts = dict(zip(clas.keys(), computed_values))

003
012
004
008
007
011
005
009


In [None]:
def modify_counts(counts):
    """Fold ``"<prefix>_<suffix>"`` entries of ``counts`` into ``"<prefix>"``.

    Every key containing an underscore is removed and its value is added to
    the entry named by the text before the first underscore. If that prefix
    entry already exists, the folded values are added to it instead of
    replacing it (the previous implementation overwrote the existing value,
    losing it). Keys without an underscore are otherwise left untouched.

    Parameters
    ----------
    counts : dict
        Mapping of string keys to numeric counts. Modified in place.

    Returns
    -------
    dict
        The same ``counts`` object, after merging.
    """
    folded = {}      # prefix -> sum of the values of its suffixed keys
    suffixed = []    # keys to drop once iteration is finished

    for key, value in counts.items():
        prefix, sep, _ = key.partition("_")
        if not sep:
            # No underscore: nothing to fold for this key.
            continue
        suffixed.append(key)
        folded[prefix] = folded.get(prefix, 0) + value

    # Mutate only after the loop; a dict must not change size while iterated.
    for key in suffixed:
        del counts[key]
    for prefix, total in folded.items():
        counts[prefix] = counts.get(prefix, 0) + total

    return counts

In [17]:
# Display the per-database number of distinct SMILES as a dict
# (round-tripped through a pandas Series before converting back).
pd.Series(counts).to_dict()

{'003': 9654,
 '012': 152207,
 '004': 270643,
 '008': 85,
 '007': 2164,
 '011': 42,
 '005': 2435,
 '009': 1}

In [13]:
# `pairs` is an alias of `comb`: every unordered pair of database ids.
pairs = comb
# Bare expression so the notebook shows the list of pairs.
pairs

[('003', '012'),
 ('003', '004'),
 ('003', '008'),
 ('003', '007'),
 ('003', '011'),
 ('003', '005'),
 ('003', '009'),
 ('012', '004'),
 ('012', '008'),
 ('012', '007'),
 ('012', '011'),
 ('012', '005'),
 ('012', '009'),
 ('004', '008'),
 ('004', '007'),
 ('004', '011'),
 ('004', '005'),
 ('004', '009'),
 ('008', '007'),
 ('008', '011'),
 ('008', '005'),
 ('008', '009'),
 ('007', '011'),
 ('007', '005'),
 ('007', '009'),
 ('011', '005'),
 ('011', '009'),
 ('005', '009')]

In [5]:
def get_overlap(db1, db2, dedup_dfs, counts, smiles_col="SMILES", small_threshold=10_000):
    """Return the rows shared by two databases, picking a strategy by size.

    Parameters
    ----------
    db1, db2 : hashable
        Keys identifying the two databases in ``dedup_dfs`` and ``counts``.
    dedup_dfs : mapping
        Maps a database key to its frame (presumably the deduplicated Dask
        DataFrames built earlier in the notebook).
    counts : mapping
        Maps a database key to its row count; used only to choose a strategy.
    smiles_col : str
        Name of the column to compare on.
    small_threshold : int
        If either count is below this value, the membership-filter path is
        used; otherwise an inner merge is used.

    Returns
    -------
    The overlap frame: either an inner merge of both frames on
    ``smiles_col``, or the larger frame filtered to the values of the
    smaller one.
    """
    frame_a, frame_b = dedup_dfs[db1], dedup_dfs[db2]
    size_a, size_b = counts[db1], counts[db2]

    # Both sides are large: let Dask do a proper inner join.
    if min(size_a, size_b) >= small_threshold:
        return dd.merge(frame_a, frame_b, on=smiles_col, how="inner")

    # At least one side is small: materialise the smaller side's values and
    # filter the other frame by membership. On a tie, db2 is the lookup side.
    lookup, target = (frame_a, frame_b) if size_a < size_b else (frame_b, frame_a)
    wanted = lookup[smiles_col].compute()
    return target[target[smiles_col].isin(wanted)]

In [6]:
from itertools import islice

def batched(iterable, n):
    """Yield successive lists of at most ``n`` items from ``iterable``.

    The final list may be shorter than ``n``. Nothing is yielded for an
    empty iterable.

    Parameters
    ----------
    iterable : iterable
        Source of items; consumed lazily.
    n : int
        Maximum batch size; must be at least 1.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1. Previously ``n == 0`` silently yielded
        nothing, discarding every item. The error surfaces when iteration
        starts, because this is a generator.
    """
    if n < 1:
        raise ValueError("n must be at least one")
    it = iter(iterable)
    # islice returns an empty list once the iterator is exhausted, which ends
    # the loop.
    while batch := list(islice(it, n)):
        yield batch
        
# Pairwise overlaps keyed by "<db1>_<db2>"; values are the computed results of
# an inner merge of the two deduplicated frames on the SMILES column.
overlaps = {}

# Process the pairs three at a time so each dask.compute call only has a few
# merges in flight.
for batch in batched(pairs, 3):
    # Lazy merge graphs for this batch (not yet computed). get_overlap offers
    # a membership-filter alternative for small inputs; the plain merge is
    # used for every pair here.
    lazy_overlaps = [
        dd.merge(dedup_dfs[left], dedup_dfs[right], on=smiles_col, how="inner")
        for left, right in batch
    ]

    # Results come back in the same order as the arguments.
    computed = dask.compute(*lazy_overlaps)

    for (db1, db2), res in zip(batch, computed):
        overlaps[f"{db1}_{db2}"] = res

NameError: name 'pairs' is not defined

In [4]:
# Number of shared SMILES for each "<db1>_<db2>" pair.
redun_counts = {}
# For each shared SMILES, the set of database ids it was found in.
smiles_to_dbs = defaultdict(set)

for pair, frame in overlaps.items():
    redun_counts[pair] = len(frame.index)
    # Keys were built as "<db1>_<db2>", so splitting must give exactly two ids.
    db1, db2 = pair.split("_")
    members = {db1, db2}
    for smi in frame["SMILES"]:
        smiles_to_dbs[smi] |= members

NameError: name 'overlaps' is not defined

In [22]:
# One row per shared SMILES, with the sorted, comma-joined ids of every
# database that contains it.
shared_smiles = list(smiles_to_dbs)
database_labels = [",".join(sorted(dbs)) for dbs in smiles_to_dbs.values()]

smiles_overlap_df = pd.DataFrame(
    {"SMILES": shared_smiles, "Databases": database_labels}
)
smiles_overlap_df

Unnamed: 0,SMILES,Databases
0,C=CCNc1ccco1,003004012
1,CC(C)OC(=O)CON,003004012
2,CCOCCOCCO,003004005007012
3,N#CCCOCC(N)=O,003004012
4,CCCN(C)S(C)(=O)=O,003004012
...,...,...
110508,Oc1ncc(O)c(O)n1,005007
110509,Nc1cc(=O)nc(N)[nH]1,005007
110510,C=C1C(=O)O[C@@H](C)[C@@H]1O,005007
110511,N[C@H]1CCN[C@H]1C(=O)O,005007


In [33]:
# Display the number of overlapping rows found for each "<db1>_<db2>" pair.
redun_counts

{'003_012': 2993,
 '003_004': 4593,
 '003_008': 3,
 '003_007': 177,
 '003_011': 4,
 '003_005': 198,
 '003_009': 1,
 '012_004': 107289,
 '012_008': 55,
 '012_007': 1534,
 '012_011': 9,
 '012_005': 1895,
 '012_009': 1,
 '004_008': 52,
 '004_007': 1352,
 '004_011': 13,
 '004_005': 1841,
 '004_009': 0,
 '008_007': 70,
 '008_011': 0,
 '008_005': 30,
 '008_009': 0,
 '007_011': 0,
 '007_005': 481,
 '007_009': 1,
 '011_005': 0,
 '011_009': 0,
 '005_009': 1}