In [1]:
import sys
sys.path.append(r"..\src")

from os.path import join
import numpy as np

import pandas as pd
import networkx as nx
import datetime
import matplotlib.pylab as plt
from tqdm import tqdm

import utils
import loader

%load_ext autoreload
%autoreload 2

from keras.layers import Input, Dense, LSTM, RepeatVector
from keras.models import Model, Sequential

from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
# Load the molecule table and build a lookup from lower-cased molecule name
# to the vector utils.smile2vec derives from that molecule's 'smile' string.
# If two rows share the same lower-cased name, the later row wins.
molecules_df = loader.get_molecules_df()
name2vec = {
    mol_name.lower(): utils.smile2vec(mol_smile)
    for mol_name, mol_smile in zip(molecules_df['name'], molecules_df['smile'])
}

In [3]:
# Read the first 10000 papers, keeping only the abstract text and the
# serialized molecule list. The path is built with os.path.join instead of a
# backslash literal: '..\data\papers.csv' relied on the invalid escape
# sequences '\d' and '\p' and only resolved on Windows.
papers_df = pd.read_csv(join('..', 'data', 'papers.csv'), usecols=['abstract','mul_list'], nrows=10000)
num_papers = len(papers_df)
# Hold out 20% of the row positions for evaluation. The fixed random_state
# makes the split (and therefore every downstream number) reproducible
# across kernel restarts.
train_indices, test_indices = train_test_split(np.arange(num_papers), test_size=0.2, random_state=0)

In [4]:
# Fit the text embedding model on every loaded abstract (train and test rows
# alike). Later cells look tokens up through text_model.wv, so this is
# presumably a word2vec-style model with 100-dimensional vectors -- confirm
# in utils.coupus2model.
abstract_corpus = papers_df['abstract'].values
text_model = utils.coupus2model(abstract_corpus, size=100, window=5, min_count=3)

In [5]:
def _pad_batch(X, y, names):
    """Left-pad every sequence in X with zero rows to the longest length and return arrays."""
    max_len = max(len(seq) for seq in X)
    vec_dim = len(X[0][0])
    padded = np.array([
        np.concatenate([np.zeros((max_len - len(seq), vec_dim)), seq])
        for seq in X
    ])
    return padded, np.array(y), np.array(names)


def generate_dataset(indices, batch_size=32):
    """Yield training/evaluation batches built from rows of ``papers_df``.

    Every (paper, molecule) pair whose molecule name is a key of ``name2vec``
    becomes one sample: the input is the paper's sequence of ``text_model``
    vectors and the target is the molecule's vector.

    Parameters
    ----------
    indices : sequence of int
        Positional row indices into ``papers_df`` (used with ``.iloc``).
    batch_size : int, default 32
        Minimum number of samples collected before a batch is emitted. The
        threshold is only checked between papers, so a batch can hold more
        than ``batch_size`` samples when a paper mentions several molecules.
        The final batch may hold fewer.

    Yields
    ------
    (X, y, names) : tuple of numpy arrays
        ``X`` holds the input sequences, left-padded with zero vectors to the
        longest sequence in the batch; ``y`` holds the target vectors;
        ``names`` holds the molecule name of each sample.

    Notes
    -----
    Papers whose abstract is not a string (e.g. missing values read as NaN) or
    contains no token known to ``text_model`` are skipped; previously they
    raised while iterating or padding.
    """
    X, y, names = [], [], []

    for _, row in papers_df.iloc[indices].iterrows():
        # Flush once enough samples have accumulated from the previous papers.
        if len(X) >= batch_size:
            yield _pad_batch(X, y, names)
            X, y, names = [], [], []

        # NOTE(review): eval executes whatever text is stored in the CSV column.
        # If 'mul_list' only ever holds plain Python literals, ast.literal_eval
        # is the safe replacement -- confirm the column format before switching.
        mul_list = eval(row['mul_list'])
        if len(mul_list) == 0:
            continue

        abstract = row['abstract']
        if not isinstance(abstract, str):
            # Missing abstract: nothing to embed.
            continue

        # NOTE(review): iterating a str yields single characters, not words.
        # Whether that matches the tokens text_model was trained on cannot be
        # determined from this cell -- confirm against utils.coupus2model.
        cur_X = [text_model.wv[word] for word in abstract if word in text_model.wv]
        if not cur_X:
            # An empty sequence cannot be padded (there is no vector to size the zeros from).
            continue

        # Each entry of mul_list is unpacked as a pair; only the name is used.
        for name, _ in mul_list:
            if name in name2vec:
                y.append(name2vec[name])
                X.append(cur_X)
                names.append(name)

    # Emit whatever is left over after the last paper.
    if len(X) > 0:
        yield _pad_batch(X, y, names)

In [6]:
# Sequence-to-vector model: an LSTM reads a variable-length sequence of
# 100-dimensional token vectors (input_shape=(None, 100)) and a Dense layer
# with no activation specified maps its final state to a 100-dimensional output.
# NOTE(review): the output width presumably equals the length of the vectors
# produced by utils.smile2vec -- confirm.
paper2smile_model = Sequential([
    LSTM(100, input_shape=(None, 100)),
    Dense(100),
])

# Train with the 'cosine_proximity' loss between predicted and target vectors.
paper2smile_model.compile(loss='cosine_proximity', optimizer='adam')

In [7]:
# Five passes over the training rows. The generator is re-created for every
# epoch because a generator can only be consumed once. The molecule names in
# each batch are not needed for training and are discarded.
for epoch in range(5):
    print("[PARENT EPOCH] epoch {}...".format(epoch + 1))
    train_batches = generate_dataset(train_indices, batch_size=32)
    for X, y, _ in train_batches:
        paper2smile_model.train_on_batch(X, y)

[PARENT EPOCH] epoch 1...
[PARENT EPOCH] epoch 2...
[PARENT EPOCH] epoch 3...
[PARENT EPOCH] epoch 4...
[PARENT EPOCH] epoch 5...


In [13]:
# Evaluate top-1 retrieval on the held-out papers: predict a vector for each
# test sample, look up the closest molecule vector by cosine distance, and
# count how often that molecule is the one named in the paper.
num_samples, right = 0, 0
pred_indices, true_indices = [], []
all_names = list(name2vec.keys())
all_vecs = [name2vec[name] for name in all_names]
# Position of every molecule name in all_names. A dict replaces the previous
# all_names.index(name) call, which rescanned the whole list for every sample.
name_to_index = {name: position for position, name in enumerate(all_names)}

nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto', metric='cosine').fit(all_vecs)
for X_test, y_test, names in generate_dataset(test_indices):
    predicted_vecs = paper2smile_model.predict_on_batch(X_test)
    # kneighbors returns one neighbour index per sample (n_neighbors=1); flatten to a 1-D list.
    cur_pred_indices = list(nbrs.kneighbors(predicted_vecs, return_distance=False).flatten())
    cur_true_indices = [name_to_index[name] for name in names]
    num_samples += len(cur_pred_indices)
    # A prediction is right only when the neighbour's index equals the true index.
    right += int(np.sum(np.array(cur_pred_indices) == np.array(cur_true_indices)))
    print(f'{right}/{num_samples} = {100*right/num_samples}%')
    pred_indices += cur_pred_indices
    true_indices += cur_true_indices

0/32 = 0.0%
0/64 = 0.0%
0/96 = 0.0%
0/128 = 0.0%
0/160 = 0.0%
0/192 = 0.0%
0/227 = 0.0%
0/259 = 0.0%
0/291 = 0.0%
0/323 = 0.0%
0/355 = 0.0%
0/387 = 0.0%
0/421 = 0.0%
0/455 = 0.0%
0/487 = 0.0%
0/519 = 0.0%
0/551 = 0.0%
0/584 = 0.0%
0/598 = 0.0%
