# Tools

In [1]:
%matplotlib inline
import warnings
# Hide the "numpy.dtype size changed" / "numpy.ufunc size changed" warnings so
# they do not clutter cell output.
# NOTE(review): presumably these come from compiled extensions built against a
# different numpy version -- confirm the filters are still needed.
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

# basic uses
import pprint
import re
import numpy as np
import pandas as pd
import pickle as pk
from itertools import zip_longest
from pathlib import Path
from math import ceil
from random import uniform

# plotting figures
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import seaborn as sb

# RDKit molecule conversion and drawing
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import rdMolDraw2D

# modeling
import torch
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib

# user-friendly print: display the value of every top-level expression in a
# cell instead of only the last one (IPython's default is "last_expr").
# Side effect: a bare call to a function that returns a value is echoed too.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [1]:
# Prediction vs. observation plots (single test trial)
def draw(y_true, y_pred, y_true_fit=None, y_pred_fit=None, *, prop_name, log_scale=False, file_dir=None, file_name=None):
    """Plot predictions against observations for a single train/test trial.

    Training pairs (when supplied) and test pairs are drawn with
    ``seaborn.lmplot`` using ``dataset`` as the hue, together with a dotted
    ``y = x`` reference line.  The scores returned by ``metrics`` for the test
    pairs are written in the lower-right corner of the axes.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values of the test set.
    y_true_fit, y_pred_fit : array-like, optional
        Observed and predicted values of the training set.  If either one is
        ``None`` only the test set is plotted.
    prop_name : str
        Title of the figure.
    log_scale : bool, optional
        If true, the natural logarithm of every value is plotted and scored
        and ``' (log scale)'`` is appended to the title.
    file_dir : str or path-like, optional
        Directory the figure is written to.  It is not created here.
    file_name : str, optional
        File name without extension; ``.png`` (or ``_log_scale.png`` when
        ``log_scale`` is true) is appended.

    Notes
    -----
    Pairs whose *prediction* is NaN are dropped before plotting and scoring.
    If ``file_dir`` or ``file_name`` is missing the figure is only shown and a
    message is printed.  ``sb.set(font_scale=2.5)`` changes the global seaborn
    style as a side effect.  The function returns ``None``.
    """
    def _paired_frame(observed, predicted, label):
        # Flatten to 1-D float arrays so lists and (n, 1) model outputs work,
        # then drop every pair whose prediction is NaN.
        observed = np.asarray(observed, dtype=float).ravel()
        predicted = np.asarray(predicted, dtype=float).ravel()
        keep = ~np.isnan(predicted)
        observed, predicted = observed[keep], predicted[keep]
        if log_scale:
            # Transform the numeric values directly; the string 'dataset'
            # column never has to be inspected or skipped by dtype.
            observed, predicted = np.log(observed), np.log(predicted)
        return pd.DataFrame(dict(Observation=observed, Prediction=predicted,
                                 dataset=[label] * len(observed)))

    test_ = _paired_frame(y_true, y_pred, 'test')
    frames = [test_]
    if y_true_fit is not None and y_pred_fit is not None:
        # Training data is optional (both parameters default to None).
        frames.insert(0, _paired_frame(y_true_fit, y_pred_fit, 'train'))
    data = pd.concat(frames, ignore_index=True)

    # Scores describe the test set only (log-transformed when log_scale is set).
    scores = metrics(test_['Observation'], test_['Prediction'])

    sb.set(font_scale=2.5)
    sb.lmplot(x="Prediction", y="Observation", hue="dataset", ci=None,
              data=data, palette="Set1", height=10, legend=False, markers=[".", "o"],
              scatter_kws={'s': 25, 'alpha': 0.7}, hue_order=['train', 'test'])

    ax = plt.gca()
    # Use one common range for both axes so the y = x line is the diagonal.
    # The range must cover the minimum AND maximum of both columns.
    min_ = min(data["Prediction"].min(), data["Observation"].min())
    max_ = max(data["Prediction"].max(), data["Observation"].max())
    margin = (max_ - min_) / 15
    min_ = min_ - margin
    max_ = max_ + margin
    ax.set_xlim(min_, max_)
    ax.set_ylim(min_, max_)
    ax.plot((min_, max_), (min_, max_), ':', color='gray')
    ax.set_title(prop_name + ' (log scale)' if log_scale else prop_name)
    ax.text(0.98, 0.03,
            'MAE: %.5f\nRMSE: %.5f\nPearsonR: %.5f\nSpearmanR: %.5f' % (scores['mae'], scores['rmse'], scores['pearsonr'], scores['spearmanr']),
            transform=ax.transAxes, horizontalalignment='right', fontsize='small')

    ax.legend(loc='upper left', markerscale=2, fancybox=True, shadow=True, frameon=True, facecolor='w')

    plt.tight_layout()
    if file_dir and file_name:
        suffix = '_log_scale.png' if log_scale else '.png'
        # Path() accepts both str and path-like directories.
        target = Path(file_dir) / (file_name + suffix)
        plt.savefig(str(target), dpi=300, bbox_inches='tight')
    else:
        print('Missing directory and/or file name information!')

In [1]:
# Prediction vs. observation plots (cross-validation: list of test trials)
def draw_cv(y_trues, y_preds, y_trues_fit, y_preds_fit, *, prop_name, log_scale=False, file_dir=None, file_name=None):
    """Plot predictions against observations pooled over cross-validation folds.

    The per-fold arrays are concatenated, training and test pairs are drawn
    with ``seaborn.lmplot`` using ``dataset`` as the hue, and a dotted
    ``y = x`` reference line is added.  The number of folds and the scores
    returned by ``metrics`` for the pooled test pairs are written in the
    lower-right corner of the axes.

    Parameters
    ----------
    y_trues, y_preds : sequence of array-like
        Observed and predicted test values, one entry per fold.  The number of
        folds reported in the figure is ``len(y_trues)``.
    y_trues_fit, y_preds_fit : sequence of array-like
        Observed and predicted training values, one entry per fold.
    prop_name : str
        Title of the figure.
    log_scale : bool, optional
        If true, the natural logarithm of every value is plotted and scored
        and ``' (log scale)'`` is appended to the title.
    file_dir : str or path-like, optional
        Directory the figure is written to.  It is not created here.
    file_name : str, optional
        File name without extension; ``.png`` (or ``_log_scale.png`` when
        ``log_scale`` is true) is appended.

    Notes
    -----
    Pairs whose *prediction* is NaN are dropped before plotting and scoring.
    If ``file_dir`` or ``file_name`` is missing the figure is only shown and a
    message is printed.  ``sb.set(font_scale=2.5)`` changes the global seaborn
    style as a side effect.  The function returns ``None``.
    """
    cv = len(y_trues)

    def _pooled_frame(observed_folds, predicted_folds, label):
        # Pool the folds, flatten to 1-D float arrays and drop every pair
        # whose prediction is NaN.
        observed = np.asarray(np.concatenate(observed_folds), dtype=float).ravel()
        predicted = np.asarray(np.concatenate(predicted_folds), dtype=float).ravel()
        keep = ~np.isnan(predicted)
        observed, predicted = observed[keep], predicted[keep]
        if log_scale:
            # Transform the numeric values directly; the string 'dataset'
            # column never has to be inspected or skipped by dtype.
            observed, predicted = np.log(observed), np.log(predicted)
        return pd.DataFrame(dict(Observation=observed, Prediction=predicted,
                                 dataset=[label] * len(observed)))

    train_ = _pooled_frame(y_trues_fit, y_preds_fit, 'train')
    test_ = _pooled_frame(y_trues, y_preds, 'test')
    data = pd.concat([train_, test_], ignore_index=True)

    # Scores describe the pooled test set only.
    scores = metrics(test_['Observation'], test_['Prediction'])

    sb.set(font_scale=2.5)
    sb.lmplot(x="Prediction", y="Observation", hue="dataset", ci=None,
              data=data, palette="Set1", height=10, legend=False, markers=[".", "o"],
              scatter_kws={'s': 25, 'alpha': 0.7}, hue_order=['train', 'test'])

    ax = plt.gca()
    # Use one common range for both axes so the y = x line is the diagonal.
    # The range must cover the minimum AND maximum of both columns.
    min_ = min(data["Prediction"].min(), data["Observation"].min())
    max_ = max(data["Prediction"].max(), data["Observation"].max())
    margin = (max_ - min_) / 15
    min_ = min_ - margin
    max_ = max_ + margin
    ax.set_xlim(min_, max_)
    ax.set_ylim(min_, max_)
    ax.plot((min_, max_), (min_, max_), ':', color='gray')
    ax.set_title(prop_name + ' (log scale)' if log_scale else prop_name)
    ax.text(0.98, 0.03,
            '$%d-fold$ CV\nmae: %.5f\nrmse: %.5f\npearsonr: %.5f\nspearmanr: %.5f' % (cv, scores['mae'], scores['rmse'], scores['pearsonr'], scores['spearmanr']),
            transform=ax.transAxes, horizontalalignment='right', fontsize='small')
    ax.legend(loc='upper left', markerscale=2, fancybox=True, shadow=True, frameon=True, facecolor='w')
    plt.tight_layout()
    if file_dir and file_name:
        suffix = '_log_scale.png' if log_scale else '.png'
        # Path() accepts both str and path-like directories.
        target = Path(file_dir) / (file_name + suffix)
        plt.savefig(str(target), dpi=300, bbox_inches='tight')
    else:
        print('Missing directory and/or file name information!')


In [2]:
# calculating basic statistics for predictions
def metrics(y_true, y_pred, ignore_nan=True):
    """Compute basic regression statistics for a set of predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values.  Lists, tuples, pandas objects and
        numpy arrays are accepted; both are flattened to 1-D float arrays and
        must contain the same number of values.
    ignore_nan : bool, optional
        If true (default), pairs whose *prediction* is NaN are removed before
        scoring.  NaNs in ``y_true`` are not filtered.

    Returns
    -------
    dict
        Keys ``mae``, ``rmse``, ``r2``, ``pearsonr``, ``spearmanr`` and
        ``p_value`` (the p-value reported by ``scipy.stats.pearsonr``).

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not contain the same number of values.
    """
    from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
    from scipy.stats import pearsonr, spearmanr

    # Work on flat float arrays so that boolean masking behaves the same for
    # lists, pandas objects (whatever their index) and (n, 1) model outputs.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError('y_true and y_pred must have the same number of values, got %d and %d'
                         % (y_true.size, y_pred.size))

    if ignore_nan:
        keep = ~np.isnan(y_pred)
        y_true = y_true[keep]
        y_pred = y_pred[keep]

    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    pr, p_val = pearsonr(y_true, y_pred)
    sr, _ = spearmanr(y_true, y_pred)
    return dict(
        mae=mae,
        rmse=rmse,
        r2=r2,
        pearsonr=pr,
        spearmanr=sr,
        p_value=p_val
    )

In [3]:
import os

def quicksort(x):
    """Return the paths in ``x`` ordered from oldest to newest modification time.

    Parameters
    ----------
    x : iterable of str or path-like
        Existing filesystem entries.  Each one is converted with ``str()`` and
        its modification time is read with ``os.path.getmtime``.

    Returns
    -------
    list
        A new list with the same entries, sorted by ascending modification
        time.  Entries with identical timestamps keep their input order.
        An empty input gives an empty list.

    Raises
    ------
    OSError
        Propagated from ``os.path.getmtime`` when an entry cannot be stat'ed.
    """
    # The key function is evaluated once per entry, so every path is stat'ed
    # exactly once and the sort needs no recursion, however many entries there are.
    return sorted(x, key=lambda entry: os.path.getmtime(str(entry)))

def retrieve_nn_models_from_with_cv(*props, score='pearsonr', cv=10):
    """Load the cross-validation models stored under a property directory.

    Every sub-directory of the property directory (except
    ``.ipynb_checkpoints``) is treated as one saved model.  The
    sub-directories are ordered by modification time with ``quicksort`` and
    the first ``cv`` of them are loaded with ``Checker.load(name, parent)``.

    Parameters
    ----------
    *props : str or path-like
        Property directories.  Only the FIRST one is used; any further
        entries are ignored, so callers always receive a single
        ``(models, path)`` pair.
    score : str, optional
        Currently unused.
    cv : int, optional
        Number of model directories to load (default 10).

    Returns
    -------
    tuple or None
        ``(models, path)`` where ``models`` is the list of objects returned by
        ``Checker.load`` and ``path`` is the ``Path`` of the property
        directory.  ``None`` when no directory is given.

    Notes
    -----
    ``Checker`` is not defined in this cell.
    NOTE(review): it is presumably imported elsewhere in the notebook -- confirm.
    """
    if not props:
        # Without a directory there is nothing to load; callers get None
        # rather than a (models, path) pair.
        return None

    p = Path(props[0])
    models = [x for x in p.iterdir() if x.is_dir() and x.name != '.ipynb_checkpoints']
    models = quicksort(models)
    # Honour the requested number of folds instead of a fixed 10.
    return [Checker.load(x.name, x.parent) for x in models[:cv]], p

In [None]:
def simple_predict(model, X):
    """Run ``model`` on ``X`` on the CPU and return the output as a numpy array.

    Parameters
    ----------
    model : torch.nn.Module
        Network used for the prediction.  It is moved to the CPU in place.
    X : pandas.DataFrame, pandas.Series, numpy.ndarray, torch.Tensor or nested sequence
        Input features; converted to a ``float32`` CPU tensor.

    Returns
    -------
    numpy.ndarray
        ``model(X)`` converted to numpy.

    Raises
    ------
    TypeError
        If ``model`` is not a ``torch.nn.Module``.

    Notes
    -----
    The forward pass runs in evaluation mode (dropout disabled, batch-norm
    using its running statistics) and without gradient tracking.  The
    ``training`` flag of every sub-module is restored afterwards.
    """
    if not isinstance(model, torch.nn.Module):
        raise TypeError('<model> must be a instance of <torch.nn.Module>')
    if isinstance(X, (pd.DataFrame, pd.Series)):
        X = X.values
    features = torch.as_tensor(X, dtype=torch.float32, device='cpu')
    model.cpu()

    # Remember each sub-module's mode so that predicting does not silently
    # flip a model that is in the middle of training.
    previous_modes = [(module, module.training) for module in model.modules()]
    model.eval()
    try:
        with torch.no_grad():
            prediction = model(features)
    finally:
        for module, was_training in previous_modes:
            module.training = was_training

    return prediction.detach().numpy()