In [21]:
from pathlib import Path
from collections import defaultdict
from itertools import combinations
import dask.dataframe as dd
import dask
import pandas as pd

In [4]:
out_path = Path("../../Molecular_database/HAC_9")
pa = list(out_path.glob("*.parquet"))
clas = defaultdict(list)
for p in pa:
    db = p.stem.split("_")[1].split("_")[0].strip("db")
    clas[db].append(p)
comb = list(combinations(clas, 2))

In [None]:
from dask.distributed import Client
client = Client()



In [None]:

dedup_dfs = {}

lazy_results = []
previous = []
smiles_col="SMILES"
# Build all lazy computations first
for db_id, path in clas.items():
    print(db_id)
    df = dd.read_parquet(path, columns=[smiles_col])
    
    df_dedup = df.drop_duplicates(subset=[smiles_col])
    unique = df_dedup.map_partitions(len).sum()
    dedup_dfs[db_id] = df_dedup
    # Collect both lazy results for single batch compute
    lazy_results.append(unique)
    
# Compute all totals and uniques at once
computed_values = dask.compute(*lazy_results)

# Assign results back in the same order
counts = dict(zip(clas.keys(), computed_values))

003
012
004
008
007
011
005
009


In [23]:
counts

{'003': 9654,
 '012': 152207,
 '004': 270643,
 '008': 85,
 '007': 2164,
 '011': 42,
 '005': 2435,
 '009': 1}

In [7]:
pairs = comb

In [None]:
from itertools import islice

def batched(iterable, n):
    it = iter(iterable)
    while batch := list(islice(it, n)):
        yield batch
        
overlaps={}
for batch in batched(pairs, 3):  # run 3 at a time
    futures = []
    for db1, db2 in batch:
        overlap = dd.merge(dedup_dfs[db1], dedup_dfs[db2], on=smiles_col, how="inner")
        futures.append(overlap)
    
    results = dask.compute(*futures)
    for (db1, db2), res in zip(batch, results):
        overlaps[f"{db1}_{db2}"] = res

In [18]:
smiles_to_dbs = defaultdict(set)

for pair, df in overlaps.items():
    db1, db2 = pair.split("_")
    for smi in df["SMILES"]:
        smiles_to_dbs[smi].update([db1, db2])

In [22]:
smiles_overlap_df = pd.DataFrame({
    "SMILES": list(smiles_to_dbs.keys()),
    "Databases": [",".join(sorted(list(v))) for v in smiles_to_dbs.values()]
})
smiles_overlap_df

Unnamed: 0,SMILES,Databases
0,C=CCNc1ccco1,003004012
1,CC(C)OC(=O)CON,003004012
2,CCOCCOCCO,003004005007012
3,N#CCCOCC(N)=O,003004012
4,CCCN(C)S(C)(=O)=O,003004012
...,...,...
110508,Oc1ncc(O)c(O)n1,005007
110509,Nc1cc(=O)nc(N)[nH]1,005007
110510,C=C1C(=O)O[C@@H](C)[C@@H]1O,005007
110511,N[C@H]1CCN[C@H]1C(=O)O,005007
