# Find K-Nearest Neighbors of the Known Molecules

In [None]:
import pandas as pd
import numpy as np
import pickle
import os

In [None]:
# Directory that holds the pickled embedding dictionaries; the CSV outputs
# produced further down are written here as well.
data_dir = './'
known_file, relevant_file, random_file = (
    os.path.join(data_dir, pkl_name)
    for pkl_name in ("known.pkl", "relevant.pkl", "random.pkl")
)

# Which pickle supplies the query molecules and which supplies the pool that
# is searched for neighbours. Later cells compare these against the paths
# above to pick column headers and the output filename.
target_file = known_file
candidate_file = random_file

# NOTE: pickle.load executes arbitrary code embedded in the file; only load
# pickles that come from a trusted source.
with open(target_file, 'rb') as target_handle:
    target_emb_dict = pickle.load(target_handle)
with open(candidate_file, 'rb') as candidate_handle:
    candidate_emb_dict = pickle.load(candidate_handle)

In [None]:
# Metric used to score every target/candidate pair. The value must be one of
# the names distance() recognises; it is also reused as the score column
# header and to pick the output filename suffix.
# Alternative setting (rank by Euclidean distance instead):
# dist_measurement = "Euclidean Distance"
dist_measurement = "Cosine Similarity"

def distance(u, v, f):
    """Score the pair of vectors ``u`` and ``v`` with the metric named ``f``.

    Args:
        u: First vector.
        v: Second vector.
        f: Metric name, either "Euclidean Distance" or "Cosine Similarity".

    Returns:
        Whatever the selected metric function returns for ``(u, v)``.

    Raises:
        ValueError: If ``f`` is not one of the supported metric names.
    """
    if f == "Cosine Similarity":
        return cosine_similarity(u, v)
    if f == "Euclidean Distance":
        return euclidean_distance(u, v)
    raise ValueError("Unknown distance metric: " + f)

def euclidean_distance(u, v):
    """Return the Euclidean (L2) distance between ``u`` and ``v``.

    Args:
        u: First vector; any array-like (ndarray, list, tuple).
        v: Second vector; must be broadcast-compatible with ``u``.

    Returns:
        The norm of ``u - v`` as computed by ``np.linalg.norm``.
    """
    # Coerce to arrays first: the ``-`` operator is undefined for plain lists
    # and tuples, so without this the function only worked for ndarray inputs.
    return np.linalg.norm(np.asarray(u) - np.asarray(v))

def cosine_similarity(u, v):
    """Return the cosine similarity between vectors ``u`` and ``v``.

    Args:
        u: First vector (array-like).
        v: Second vector (array-like), same length as ``u``.

    Returns:
        ``dot(u, v) / (|u| * |v|)``. When either vector has zero norm the
        similarity is mathematically undefined; ``0.0`` is returned in that
        case instead of NaN.
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    # Guard against 0/0: it would yield NaN (plus a RuntimeWarning), and NaN
    # does not order meaningfully when similarity scores are ranked.
    if norm_product == 0:
        return 0.0
    return np.dot(u, v) / norm_product

In [None]:
# Score every (target, candidate) pair. Each row is
# [target molecule, score, candidate molecule].
res = [
    [tgt_name, distance(tgt_vec, cand_vec, dist_measurement), cand_name]
    for tgt_name, tgt_vec in target_emb_dict.items()
    for cand_name, cand_vec in candidate_emb_dict.items()
]
# Rows compare element by element, so they are ordered by target first and by
# score second. Cosine similarity is "higher is better", hence the descending
# sort for that metric; Euclidean distance is sorted ascending.
res.sort(reverse=(dist_measurement == "Cosine Similarity"))


In [None]:
# Column headers for the result table, in the same order as the entries of
# each row in `res`: target molecule, score, candidate molecule.
columns = []
if target_file == known_file:
    columns.append("Known Molecule")
else:
    # Separator added so the offending path is readable in the message.
    raise ValueError("Unknown target file: " + target_file)
# The score column is named after the metric that produced it.
columns.append(dist_measurement)
if candidate_file == random_file:
    columns.append("Random Valid Molecule")
elif candidate_file == relevant_file:
    columns.append("Relevant ENA Molecule")
else:
    raise ValueError("Unknown candidate file: " + candidate_file)

df = pd.DataFrame(res, columns=columns)
df

In [None]:
# Build the output file stem as
# "<target>-<candidate>-molecules-abs-<metric>" from the current settings.
outfile = ""
if target_file == known_file:
    outfile += "Known-"
else:
    # Separator added so the offending value is readable in the message.
    raise ValueError("Unknown target file: " + target_file)
if candidate_file == random_file:
    outfile += "Random_Valid-"
elif candidate_file == relevant_file:
    outfile += "Relevant_ENA-"
else:
    raise ValueError("Unknown candidate file: " + candidate_file)
outfile += "molecules-abs-"
if dist_measurement == 'Cosine Similarity':
    outfile += 'cosine'
elif dist_measurement == 'Euclidean Distance':
    outfile += 'euclidean'
else:
    raise ValueError("Unknown distance measurement: " + dist_measurement)

# Persist the full pairwise table next to the input pickles.
df.to_csv(os.path.join(data_dir, f"{outfile}-all.csv"), index=False)

In [None]:
# Number of rows to keep for each target molecule.
k = 20
# head(k) takes the first k rows of every group in their existing order, so
# this relies on df already being ordered best-first within each target.
per_target = df.groupby(columns[0])
df_topk = per_target.head(k)
df_topk = df_topk.reset_index(drop=True)
df_topk

In [None]:
# Write the per-target top-k table alongside the full table, using the same
# file stem with a "-top<k>" suffix.
topk_path = os.path.join(data_dir, f"{outfile}-top{k}.csv")
df_topk.to_csv(topk_path, index=False)