In [None]:
import os
import rdkit
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem

from measures import *
from utils import fingerprint, similarities

In [None]:
# Last step (inclusive) that is loaded from a molecule log; parsing stops at
# the first section whose step number is larger.
MAX_STEP = 2000

def parse_mols_txt(file_name):
    """Parse a sectioned, tab-separated molecule log into a DataFrame.

    The file is consumed line by line:
      * a line starting with 'molecules' opens a new section; its last
        space-separated token is parsed as the integer step number,
      * a line starting with '#' is a header; its tab-separated fields
        (including the leading '#' field) become the column names, the most
        recent header winning,
      * every other line is a record; its tab-separated fields are stored
        with the current step prepended.

    Args:
        file_name: path of the text file to parse.

    Returns:
        pandas.DataFrame whose first column is 'step', followed by the header
        columns. Record fields are kept as strings. A file that contains
        neither header nor records yields an empty DataFrame.

    Raises:
        ValueError: if records are present but no '#' header line was seen,
            or if a 'molecules' line does not end in an integer.
    """
    data = []
    step = None
    # Initialised up front so a file without a header line does not leave
    # the name unbound (the original assigned to a misspelled `colums`).
    columns = []
    # Iterate over the handle directly instead of materialising all lines.
    with open(file_name, 'r') as f:
        for line in f:
            line = line.strip('\n')
            if line.startswith('molecules'):
                step = int(line.split(' ')[-1])
                if step > MAX_STEP:
                    break
                continue
            if line.startswith('#'):
                columns = ['step'] + line.split('\t')
                continue
            data.append([step] + line.split('\t'))
    if data and not columns:
        raise ValueError(
            "no '#' header line found in %r; cannot name the columns" % (file_name,))
    df = pd.DataFrame(data, columns=columns)
    return df

In [None]:
import pandas as pd

# Display name of each model, paired positionally with its log in file_names.
model_names = ['Baseline', 'AD', 'NN']

file_names = [
    'data/mols/mols_baseline.txt',
    'data/mols/mols_ad.txt',
    'data/mols/mols_nn.txt',
]

# One cleaned frame per model is collected and concatenated once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies the accumulated rows on every iteration.
frames = []

for model, file_name in zip(model_names, file_names):
    t = parse_mols_txt(file_name)
    # Every column except the SMILES string is converted to a number;
    # cells that cannot be parsed become NaN and are dropped below.
    for col in t.columns:
        if col == 'smiles': continue
        t[col] = pd.to_numeric(t[col], errors='coerce')
    t = t.dropna()
    # Keep only the first occurrence of each molecule within a model.
    t = t.drop_duplicates(subset='smiles')
    # assign() returns a new frame instead of writing into a filtered one.
    t = t.assign(model=model)
    frames.append(t)

# Row labels are kept as-is (no ignore_index), matching the previous
# append-based behaviour; labels therefore repeat across models.
df = pd.concat(frames)

In [None]:
# A molecule is a success when all three property thresholds are met.
# The mask is stored as a plain list so that it is assigned by position
# rather than aligned on the row labels of df.
df['succ'] = (
    df['jnk3'].ge(0.5)
    & df['qed'].ge(0.6)
    & df['sa'].ge(.67)
).tolist()

# Unweighted sum of the three property scores.
df['score'] = df['jnk3'] + df['qed'] + df['sa']

In [None]:
# Rows of df flagged as successful.
df_succ = df.loc[df['succ'].eq(True)]

In [None]:
# Successful molecules produced by the 'AD' model.
df_succ.loc[df_succ['model'].eq('AD')]

In [None]:
def define_measures():
    """Build a fresh mapping from measure name to measure object.

    A new mapping is created on every call. Only the '#FG' measure, an
    NFragment constructed with frag='FG', is currently enabled.

    Returns:
        dict mapping the measure name (str) to its measure instance.
    """
    # Disabled alternatives, kept for reference:
    #   'GS'            : GoldenStandard()
    #   'Diversity'     : AvgAvgDis()
    #   'SumDiversity'  : SumAvgDis()
    #   'Bottleneck'    : MinMinDis()
    #   'SumBottleneck' : SumMinDis()
    #   'Diameter'      : MaxMaxDis()
    #   'SumDiameter'   : SumMaxDis()
    #   'DPP'           : DPP()
    #   '#RS'           : NFragment(frag='RS')
    #   '#Circles'      : NCirc(threshold=0.35)
    #   'N_Circ (c=%.2f)' % th : NCirc(threshold=th), for th in a list of thresholds
    return {'#FG': NFragment(frag='FG')}

In [None]:
import random

# Fraction of each step's molecules that is passed to dissimilarity-based
# measures; all other measures receive every molecule.
SUBSAMPLE_RATE = 0.05
# Dedicated seeded generator: the subsample, and therefore the curves, are
# reproducible across runs without touching the global `random` state.
subsample_rng = random.Random(0)

measures = define_measures()
print(measures.keys())
columns = ['model', 'step', 'Richness'] + list(measures.keys())
# One dict of updated measure objects per model, in model_names order.
measures_evaluated = []
data = []

for model in model_names:
    # Fresh measure objects per model; they are updated step by step below.
    measures = define_measures()
    # Loop-invariant selection hoisted out of the step loop: the model's rows
    # are filtered once instead of once per step.
    df_model = df_succ[df_succ['model'] == model]
    # Running count of successful molecules seen so far for this model.
    rich = 0
    for step in tqdm(range(MAX_STEP+1)):
        smiles = df_model[df_model['step'] == step]['smiles']
        mols = [Chem.MolFromSmiles(smi) for smi in smiles]
        rich += len(smiles)
        entry = [model, step, rich]
        for name, measure in measures.items():
            if isinstance(measure, DissimilarityBasedMeasure):
                mols_ = [mol for mol in mols if subsample_rng.random() < SUBSAMPLE_RATE]
            else:
                mols_ = mols
            measure.update(mols_)
            entry.append(measure.report())
        data.append(entry)
    measures_evaluated.append(measures)

# One row per (model, step) holding the richness and every measure's report.
df_curve = pd.DataFrame(data=data, columns=columns)

In [None]:
# Curve rows that belong to the 'Baseline' model.
df_curve.loc[df_curve['model'].eq('Baseline')]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as tkr

# Column of df_curve to plot. It must be 'Richness' or one of the keys
# returned by define_measures(); the previous value 'Diversity' is disabled
# there, so it is not a column of df_curve and the plot could not be drawn.
measure_name = '#FG'

plt.figure(figsize=(10, 7))
sns.set(font_scale=3)
ax = sns.lineplot(data=df_curve, x='step', y=measure_name, hue='model', style='model', linewidth=5, legend=True,
             hue_order=['Baseline', 'NN', 'AD'],
             palette=['steelblue', 'forestgreen', 'darkorange'])
# Override the line style of the first three lines of the axes.
ax.lines[0].set_linestyle("dashdot")
ax.lines[1].set_linestyle("solid")
ax.lines[2].set_linestyle("dotted")
# ax.set_ylabel(ax.get_ylabel(), rotation=90)
# ax.yaxis.set_major_formatter(tkr.FuncFormatter(lambda y, p: f'{int(y/1000)}K'))

In [None]:
def fingerprint(mol):
    """Return the 1024-bit Morgan fingerprint (radius 2) of a molecule.

    Note: this definition shadows the `fingerprint` imported from `utils`
    at the top of the notebook for every cell executed after it.

    Args:
        mol: molecule object passed to
            AllChem.GetMorganFingerprintAsBitVect.

    Returns:
        The bit vector returned by RDKit, or a NumPy array of 1024 zeros
        when RDKit raises for the given input.
    """
    try:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
    except Exception:
        # Best-effort fallback for inputs RDKit rejects. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt and
        # SystemExit propagate, so a long run can still be interrupted.
        fp = np.zeros(1024)
    return fp

# Fingerprint every fragment recorded by each model's '#FG' measure and
# remember which model each fingerprint came from.
labels = []
fps = []
for model_name, model_measures in zip(model_names, measures_evaluated):
    frag_smiles = model_measures['#FG'].df['frags'].tolist()
    frag_mols = [Chem.MolFromSmiles(smi) for smi in frag_smiles]
    fps.extend(fingerprint(frag_mol) for frag_mol in frag_mols)
    labels.extend([model_name] * len(frag_smiles))

# Stack the fingerprints into a single array, one row per fragment.
nfps = np.array(fps)
nfps

In [None]:
import umap
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD, PCA, KernelPCA
from sklearn.manifold import TSNE, MDS

COLORS = ['steelblue', 'darkorange', 'forestgreen']

# Project the fingerprint matrix onto two components for plotting.
# random_state is fixed so that the projection is identical between runs
# when scikit-learn selects a randomized SVD solver.
# mapper = umap.UMAP(metric='jaccard', n_neighbors=30, min_dist=0.99)
mapper = PCA(n_components=2, random_state=0)
# mapper = TruncatedSVD(n_components=2)
# mapper = TSNE(n_components=2, perplexity=50)
X = mapper.fit_transform(nfps)

In [None]:
plt.figure(figsize=(10, 7))
sns.set(font_scale=2)

# Points and labels are handed to seaborn in reverse row order.
xs = X[::-1, 0]
ys = X[::-1, 1]
point_labels = labels[::-1]

ax = sns.scatterplot(
    x=xs,
    y=ys,
    linewidth=0,
    hue=point_labels,
    style=point_labels,
    size=point_labels,
    hue_order=['Baseline', 'NN', 'AD'],
    palette=['royalblue', 'lightgreen', 'darkorange'],
    markers={'Baseline' : 'X', 'AD' : '*', 'NN' : 'o'},
    sizes={'Baseline' : 80, 'AD' : 120, 'NN' : 100},
)
# Hide the tick labels on both axes.
ax.set(xticklabels=[])
ax.set(yticklabels=[])

In [None]:
# Display the frame of successful molecules (all models).
df_succ

In [None]:
# Upper bound on the number of 'AD' molecules drawn for the similarity matrix.
SAMPLE_SIZE = 1000
# Dedicated seeded generator so that the same molecules are drawn on every run.
sample_rng = random.Random(0)

smiles = df_succ[df_succ['model'] == 'AD']['smiles'].tolist()
# Sample WITHOUT replacement (the previous random.choices call sampled with
# replacement): a molecule drawn twice would add identical rows and columns
# to the similarity matrix. The size is capped at the population size because
# sample() raises when asked for more items than are available.
smiles = sample_rng.sample(smiles, k=min(SAMPLE_SIZE, len(smiles)))
mols = [Chem.MolFromSmiles(smi) for smi in smiles]
fps = [fingerprint(mol) for mol in mols]

In [None]:
import numpy as np

# Row i holds similarities(fps[i], fps): each sampled fingerprint compared
# against the full list of sampled fingerprints.
sim = np.array([similarities(query_fp, fps) for query_fp in fps])
sim

In [None]:
# Clustered heatmap of the similarity matrix.
g = sns.clustermap(sim)
# Work on the heatmap axes of the returned grid and hide both sets of tick labels.
ax = g.ax_heatmap
ax.set(xticklabels=[])
ax.set(yticklabels=[])