In [1]:
# The purpose of this file is to take colin's data and build a ranking set. 07/05/22
import numpy as np, pickle, os, torch, tqdm, glob
from collections import defaultdict
import fingerprint_utils

In [2]:
def fp_to_nonzero(fp):
    """Return the indices along the first axis at which `fp` is nonzero.

    This is the sparse counterpart of a dense fingerprint vector: only the
    positions of the set bits are kept.
    """
    # np.nonzero yields one index array per dimension; keep the first one.
    index_arrays = np.nonzero(fp)
    return index_arrays[0]
def nonzero_to_fp(nonzero, length=6144):
    """Expand a sequence of set-bit indices into a dense 0/1 fingerprint.

    Args:
        nonzero: sequence (tuple, list, or array) of integer positions to set
            to 1. May be empty, in which case an all-zero vector is returned.
        length: size of the dense fingerprint vector (defaults to 6144).

    Returns:
        A 1-D numpy array of dtype np.byte with ones at the given positions.
    """
    new_fp = np.zeros(length, dtype=np.byte)
    # Force an integer index dtype: np.array(()) defaults to float64, which
    # NumPy rejects as an index, so an empty sequence would raise IndexError.
    indices = np.asarray(nonzero, dtype=np.intp)
    new_fp[indices] = 1
    return new_fp

In [3]:
# using colin's pickle files
pairs_path = "../../tempdata/data/hsqc_ms_pairs"
# sets: split name (taken from the file name, e.g. test/train/val) ->
#       {tuple of nonzero fingerprint indices -> set of canonical SMILES}
sets = {}
for fname in os.listdir(pairs_path):
    key = fname[:fname.index(".")] # test, train, or val
    print(f"Key: {key}")
    # NOTE(review): pickle.load can execute arbitrary code; only run this on trusted files.
    # The handle gets its own name so it no longer shadows the file-name loop variable.
    with open(os.path.join(pairs_path, fname), "rb") as pkl_file:
        val = pickle.load(pkl_file)
    # fps = [tuple(v["FP"]) for v in val.values()] # for colin fingerprint
    # Two plain lists instead of zip(*...) unpacking, so an empty split does not
    # raise "not enough values to unpack".
    smiles = [v["SMILES"] for v in val.values()]
    # for hyunwoo fp; the second argument (2) is presumably the fingerprint radius — TODO confirm
    nonzero_fp = [tuple(fp_to_nonzero(fingerprint_utils.FP_generator(sm, 2))) for sm in smiles]
    print(f"Original size: {len(nonzero_fp)}")
    # Group SMILES by fingerprint; molecules sharing a fingerprint land in the same set.
    fp_to_smiles = defaultdict(set)
    for my_sm, my_fp in tqdm.tqdm(zip(smiles, nonzero_fp)):
        my_sm = fingerprint_utils.canonical(my_sm)
        fp_to_smiles[my_fp].add(my_sm)
    print(f"Size after converting to set: {len(fp_to_smiles.keys())}")
    sets[key] = fp_to_smiles

Key: train




Original size: 9794


9794it [00:02, 3473.19it/s]


Size after converting to set: 8704
Key: val
Original size: 1224


1224it [00:00, 3475.48it/s]


Size after converting to set: 1203
Key: test
Original size: 1224


1224it [00:00, 3696.08it/s]

Size after converting to set: 1205





In [4]:
# Gather the fingerprint keys of every split and merge them into one library.
# The sets are built and unioned in the same sequence as before (train, val,
# test), since super_set is iterated later to lay out tensor rows.
train_fps = set(sets["train"].keys())
val_fps = set(sets["val"].keys())
test_fps = set(sets["test"].keys())
super_set = train_fps.union(val_fps).union(test_fps)
# Naive total: per-split unique counts added up, so fingerprints that occur in
# more than one split are counted once per split.
per_split_total = sum(len(split_fps) for split_fps in sets.values())
print(f"Number of unique fp's in ranking library: {len(super_set)}, "
      f"    compared to if you added uniques(tr) + uniques(va) + uniques(test): {per_split_total}")


Number of unique fp's in ranking library: 10663,     compared to if you added uniques(tr) + uniques(va) + uniques(test): 11112


In [5]:
# identity_map: fingerprint (tuple of nonzero indices) -> every SMILES string,
# across all splits, that produced that fingerprint.
identity_map = defaultdict(set)
for split_lookup in sets.values():
    for fp_key, smiles_for_fp in split_lookup.items():
        # Build a fresh merged set so the per-split sets are never aliased.
        identity_map[fp_key] = identity_map[fp_key] | smiles_for_fp
print(f"Num keys: {len(identity_map.keys())}")
# Largest number of distinct SMILES that share a single fingerprint.
largest_group = max(map(len, identity_map.values()))
print(f"Max number of collisions: {largest_group}")

Num keys: 10663
Max number of collisions: 14


In [6]:
# Directory that receives the ranking-set tensors and the fingerprint lookup.
out_dir = "../../tempdata/hyun_pair_ranking_set_07_22"
# Create it (and any missing parents); a directory that already exists is fine.
os.makedirs(name=out_dir, exist_ok=True)

In [7]:
def _stack_fingerprints(nonzero_keys):
    """Stack one dense fingerprint row per entry of `nonzero_keys` into a tensor."""
    rows = []
    for indices in nonzero_keys:
        rows.append(torch.tensor(nonzero_to_fp(indices)))
    return torch.stack(rows)

# Iterating a dict yields its keys, i.e. the tuples of nonzero indices.
just_test = _stack_fingerprints(sets["test"])
just_val = _stack_fingerprints(sets["val"])
# NOTE(review): `all` shadows the Python builtin; the name is kept because the
# save cell refers to it.
all = _stack_fingerprints(super_set)
print(just_test.size())
print(all.size())

torch.Size([1205, 6144])
torch.Size([10663, 6144])


In [None]:
# Write each ranking-set tensor to its own file under out_dir, in the same
# order as before: test, val, then the combined library.
for ranking_tensor, file_name in (
    (just_test, "test_pair.pt"),
    (just_val, "val_pair.pt"),
    (all, "all_pair.pt"),
):
    torch.save(ranking_tensor, os.path.join(out_dir, file_name))

In [11]:
# Persist the fingerprint -> SMILES lookup as a plain dict, so that loading it
# does not bring back the defaultdict behaviour of creating missing keys.
lookup_path = os.path.join(out_dir, "fp_lookup.pkl")
with open(lookup_path, "wb") as lookup_file:
    pickle.dump(dict(identity_map), lookup_file)