# Probabilistic model building genetic algorithm

In [1]:
# Change the kernel's working directory; the relative paths used later (e.g. "outputs/notebook")
# and, presumably, the `src.*` imports are resolved from here.
# NOTE(review): hard-coded, user-specific absolute path — adjust for your environment.
%cd /mnt/ceph/users/zzhang/CRISPR_pred/crispr_kinn

/mnt/ceph/users/zzhang/CRISPR_pred/crispr_kinn


In [2]:
from silence_tensorflow import silence_tensorflow
silence_tensorflow()
from src.kinetic_model import KineticModel, modelSpace_to_modelParams, modelParams_to_modelSpace
from src.neural_network_builder import KineticEigenModelBuilder
from src.model_spaces import get_informed_ms as get_model_space
#from src.model_spaces import get_cas9_finkelstein_ms as get_model_space
from src.neural_search import search_env, get_reward_pipeline
from src.data import get_sim_ness_data as get_data
#from src.data import load_finkelstein_data as get_data
# reload and re-train to full convergence
from src.reload import reload_from_dir

Using TensorFlow backend.
Matplotlib created a temporary config/cache directory at /tmp/matplotlib-g52v_n92 because the default path (/home/zzhang/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


detected tf2 - using compatibility mode


In [3]:
import warnings
warnings.filterwarnings('ignore')
import time
from datetime import datetime

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import yaml

import scipy.stats as ss
import pandas as pd
import numpy as np
from tqdm import tqdm
import tensorflow as tf
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import shutil
import os
import pickle
import gc
from sklearn.model_selection import train_test_split

## Load data

In [4]:
# `get_data` is the alias for src.data.get_sim_ness_data set in the import cell.
# Its result is unpacked as a (train, test) pair of (features, labels) tuples.
(x_train, y_train), (x_test, y_test) = get_data()

## Setup AMBER

In [5]:
import amber
# Print the installed AMBER version so it is recorded alongside the notebook outputs.
print(amber.__version__)
from amber.architect import pmbga
from amber.architect import ModelSpace, Operation

0.1.2


In [6]:
# `get_model_space` is the alias for src.model_spaces.get_informed_ms set in the import cell.
kinn_model_space = get_model_space()
# Printing the model space shows a one-line summary (layer count and total combinations).
print(kinn_model_space)

StateSpace with 7 layers and 1 total combinations


In [7]:
# Instantiate AMBER's ProbaModelBuildGeneticAlgo controller over the KINN model space.
controller = pmbga.ProbaModelBuildGeneticAlgo(
            model_space=kinn_model_space,
            buffer_type='population',
            buffer_size=50, # buffer size controls the max history going back
            batch_size=1,   # batch size does not matter in this case; all arcs will be retrieved
            ewa_beta=0.8,  # ewa_beta approximates the moving average over 1/(1-ewa_beta) prev points
        )

## A For-Loop that does the work for `amber.architect.trainEnv`

In [8]:
# Output directory shared by the cells below (training history TSV, best-model weights).
wd = "outputs/notebook"
# trainEnv parameters
evo_params = dict(
    model_fn = KineticEigenModelBuilder,
    samps_per_gen = 10,   # how many arcs to sample in each generation; important
    max_gen = 100,
    patience = 100,
    n_warmup_gen = 1,
    train_data = (x_train, y_train),
    test_data = (x_test, y_test)
)

# manager configs
# this learning rate is trickier than usual, for eigendecomp to work
initial_learning_rate = 0.01
# Staircase exponential decay: the rate is multiplied by 0.9 every `decay_steps` optimizer steps.
# NOTE(review): 7000/128 looks like (training-set size / batch size), i.e. steps per epoch;
# presumably 7000 is the number of training samples — TODO confirm against x_train.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate,
    decay_steps=10*int(7000/128), # decrease every 10 epochs
    decay_rate=0.9,
    staircase=True)
manager_kwargs={
    # Factory (not an instance): each call builds a fresh Adam optimizer with gradient-norm clipping.
    'optimizer': lambda: tf.keras.optimizers.Adam(learning_rate=lr_schedule, clipnorm=1.0),
    # Factory for the output layer: takes column 1 of its input, negates it, reshapes it to a
    # (-1, 1) column, floors it at 10**-10 and returns the natural log of the result.
    'output_op': lambda: tf.keras.layers.Lambda(
        lambda x: tf.math.log(tf.math.maximum(tf.reshape(- x[:,1], (-1,1)), 10**-10)),  # change the clip as well
        name="output_slice"),
    'n_feats': 50,  # remember to change this!!
    'n_channels': 9,
    'batch_size': 128,
    'epochs': 100,
    'earlystop': 10,
    'verbose': 0
}

In [None]:
# Run the architecture search. `controller` is rebound to the returned controller;
# `hist` and `stat_df` are consumed by the history and reward-plot cells below.
# NOTE: long-running — in the log recorded below each generation takes roughly 200-300 s.
controller, hist, stat_df = search_env(
    controller=controller, 
    wd = wd,
    evo_params=evo_params, 
    manager_kwargs=manager_kwargs
)

Gen 0 < 1 warmup.. skipped - Time 266.48
datapoints:  12 / total:  20
[20:38:36] Gen 1 - Mean fitness 0.775 - Best 0.9478 - PostVar 5.499 - Time 283.65
0.774542857916648 * 0.8 + (1. - 0.8) * 0.8141389618507194
datapoints:  19 / total:  30
[20:43:29] Gen 2 - Mean fitness 0.782 - Best 0.9478 - PostVar 5.454 - Time 292.24
0.7824620787034623 * 0.8 + (1. - 0.8) * 0.8567375879937215
datapoints:  24 / total:  40
[20:48:28] Gen 3 - Mean fitness 0.797 - Best 0.9478 - PostVar 5.162 - Time 299.25
0.7973171805615141 * 0.8 + (1. - 0.8) * 0.811450481688823
datapoints:  31 / total:  50
[20:51:59] Gen 4 - Mean fitness 0.800 - Best 0.9478 - PostVar 5.172 - Time 210.71
0.800143840786976 * 0.8 + (1. - 0.8) * 0.7775228181574266
datapoints:  39 / total:  60
[20:55:47] Gen 5 - Mean fitness 0.796 - Best 0.9479 - PostVar 5.293 - Time 228.62
0.7956196362610661 * 0.8 + (1. - 0.8) * 0.8464422521044073
datapoints:  46 / total:  70
[20:59:22] Gen 6 - Mean fitness 0.806 - Best 0.9479 - PostVar 5.221 - Time 215.16
0

In [None]:
# Show the five history entries with the highest test reward.
pd.DataFrame(hist).sort_values('test_reward', ascending=False).head()

In [None]:
# Print the architecture with the highest test reward, one entry per line.
ranked_hist = pd.DataFrame(hist).sort_values('test_reward', ascending=False)
best_arc = ranked_hist.head(1)['arc'].values[0]
print(*(str(layer) for layer in best_arc), sep="\n")

In [None]:
def _arc_to_range_string(arc):
    """Encode an architecture as '|'-joined 'start-end' spans, one span per entry."""
    return '|'.join(
        f"{layer.Layer_attributes['RANGE_ST']}-"
        f"{layer.Layer_attributes['RANGE_ST'] + layer.Layer_attributes['RANGE_D']}"
        for layer in arc
    )


# Flatten the search history for export: the 'arc' column becomes a compact string
# and the 'rate_df' column is dropped before writing a tab-separated file under `wd`.
a = pd.DataFrame(hist)
a = a.assign(arc=[_arc_to_range_string(arc) for arc in a['arc']]).drop(columns=['rate_df'])
a.to_csv(os.path.join(wd, "train_history.tsv"), sep="\t", index=False)

In [None]:
%matplotlib inline

# Plot the 'GenAvg' and 'Best' columns of the search statistics against 'Generation'.
ax = stat_df.plot.line(x='Generation', y=['GenAvg', 'Best'])
ax.set_ylabel("Reward (Pearson correlation)")
ax.set_xlabel("Generation")
# Uncomment to also save the figure to disk.
#plt.savefig("reward_vs_time.png")

In [None]:
# Ground-truth kinetic parameters, drawn as reference lines in the posterior plots below.
plot_gt = True
# NOTE(review): hard-coded absolute path into another user's directory — adjust as needed.
gt_params_path = "/mnt/home/alamson/ceph/DATA/CRISPR/KineticSims/22-05-12_cas9_kinn_deplete/cas9_kinn_deplete_params.yaml"
# NOTE(review): yaml.Loader can construct arbitrary Python objects; only load trusted files.
with open(gt_params_path, "r") as params_file:
    raw_gt_params = yaml.load(params_file, Loader=yaml.Loader)
# Convert the raw parameters to model-space form, then collect its 'Rates' entries in a list.
gt_model_params = modelParams_to_modelSpace(raw_gt_params)
gt_rates = list(gt_model_params['Rates'])

In [None]:
# START SITE
# For every 'RANGE_ST' entry of the controller's model-space distributions, overlay
# 1000 posterior samples on the prior, one subplot per rate (indexed by k[0]).
fig, axs_ = plt.subplots(3,3, figsize=(15,15))
axs = list(axs_.flat)
for k in controller.model_space_probs:
    if k[-1] != 'RANGE_ST':
        continue
    try:
        d = controller.model_space_probs[k].sample(size=1000)
    except Exception as err:
        # Best-effort: skip entries that cannot be sampled, but report them instead of
        # silently swallowing every error (a bare `except:` also traps KeyboardInterrupt).
        print(f"Skipping {k}: could not sample posterior ({err!r})")
        continue
    ax = axs[k[0]]
    gt_rate = gt_rates[k[0]]
    sns.distplot(d, label="Post", ax=ax)
    sns.distplot(controller.model_space_probs[k].prior_dist, label="Prior", ax=ax)
    if plot_gt:
        # Dashed vertical line marks the ground-truth value.
        ax.axvline(gt_rate['RANGE_ST'], ls='--', color='black')
        ax.set_title(
            f'Rate ID{str(k[0])} = {gt_rate["name"]}\n'
            f'Ground truth={gt_rate["RANGE_ST"]}\nPosterior mean {str(np.mean(d))}')
    else:
        ax.set_title(
            f'Rate ID{str(k[0])} = {gt_rate["name"]}\n'
            f'Posterior mean {str(np.mean(d))}')
    # The "Post"/"Prior" labels are only visible once a legend is drawn.
    ax.legend()

    #_ = ax.set_xlim(0,50)

fig.suptitle('range start')
fig.tight_layout()
#fig.savefig("range_st.png")

In [None]:
# CONV RANGE
# For every 'RANGE_D' entry of the controller's model-space distributions, overlay
# 1000 posterior samples on the prior, one subplot per rate (indexed by k[0]).
fig, axs_ = plt.subplots(3,3, figsize=(15,15))
axs = list(axs_.flat)
for k in controller.model_space_probs:
    if k[-1] != 'RANGE_D':
        continue
    d = controller.model_space_probs[k].sample(size=1000)
    ax = axs[k[0]]
    gt_rate = gt_rates[k[0]]
    # Label the posterior as well so both curves can be told apart in the legend.
    sns.distplot(d, label="Post", ax=ax)
    sns.distplot(controller.model_space_probs[k].prior_dist, label="Prior", ax=ax)
    if plot_gt:
        # Dashed vertical line marks the ground-truth value.
        ax.axvline(gt_rate['RANGE_D'], ls='--', color='black')
        ax.set_title(
            f'Rate ID{str(k[0])} = {gt_rate["name"]}\n'
            f'Ground truth={gt_rate["RANGE_D"]}\nPosterior mean {str(np.mean(d))}')
    else:
        ax.set_title(
            f'Rate ID{str(k[0])} = {gt_rate["name"]}\n'
            f'Posterior mean {str(np.mean(d))}')
    # Labels are only visible once a legend is drawn.
    ax.legend()

fig.suptitle('range length')
fig.tight_layout()
#fig.savefig("range_d.png")

# KERNEL SIZE 
# For every 'kernel_size' entry of the controller's model-space distributions, overlay
# 1000 posterior samples on the prior, one subplot per rate (indexed by k[0]).
fig, axs_ = plt.subplots(3,3, figsize=(15,15))
axs = list(axs_.flat)
for k in controller.model_space_probs:
    if k[-1] != 'kernel_size':
        continue
    d = controller.model_space_probs[k].sample(size=1000)
    ax = axs[k[0]]
    # Label both curves (as the range-start plot does) and draw the legend so they can be told apart.
    sns.distplot(d, label="Post", ax=ax)
    sns.distplot(controller.model_space_probs[k].prior_dist, label="Prior", ax=ax)
    ax.set_title(
        ' '.join(['Rate ID', str(k[0]), '\nPosterior mean', str(np.mean(d))]))
    ax.legend()
    #_ = ax.set_xlim(0,20) 
fig.suptitle('kernel size')
fig.tight_layout()

# Inspect the pickled configuration of the best model found by the search.
# NOTE: unpickling can execute arbitrary code — only load files produced by your own runs.
# The context manager closes the file handle, which the bare open() call left dangling.
with open("outputs/notebook/AmberSearchBestModel_config.pkl", "rb") as config_file:
    best_model_config = pickle.load(config_file)
best_model_config

In [None]:
# Rebuild the best model from the search output directory in a fresh TF1-compat session.
# KineticEigenModelBuilder and reload_from_dir come from the import cell at the top of the
# notebook; `wd` and `manager_kwargs` come from the configuration cell.
tf.compat.v1.reset_default_graph()
tf.compat.v1.experimental.output_all_intermediates(True)
sess = tf.compat.v1.Session()
mb = reload_from_dir(
    wd=wd,  # same directory the search wrote to, instead of repeating the literal path
    manager_kwargs=manager_kwargs,
    sess=sess,
    model_fn=KineticEigenModelBuilder)
model = mb.model

In [None]:
# Convert the train/test inputs with the model builder's blockify_seq_ohe before feeding the model.
x_train_b = mb.blockify_seq_ohe(x_train)
x_test_b = mb.blockify_seq_ohe(x_test)

# Callbacks for the optional fine-tuning run below; they are only used by the
# commented-out model.fit call, so they are currently defined but unused.
checkpointer = ModelCheckpoint(
    filepath=os.path.join(wd,"bestmodel.h5"), mode='min', verbose=0, save_best_only=True,
    save_weights_only=True)
earlystopper = EarlyStopping(
    monitor="val_loss",
    mode='min',
    patience=15,
    verbose=0)

# Optional: re-train the reloaded model and restore the best checkpoint (disabled).
#model.fit(x_train_b, y_train,
#          batch_size=32,
#          validation_split=0.2,
#          callbacks=[checkpointer, earlystopper],
#          epochs=225, 
#          verbose=2)
#model.load_weights(os.path.join(wd,"bestmodel.h5"))
# Pearson correlation between flattened test predictions and the test labels.
y_hat = model.predict(x_test_b).flatten()
test_pcc = ss.pearsonr(y_hat, y_test)[0]

In [None]:
# Show the attribute dictionary of every rate object held by the reloaded builder's `kinn`.
[str(x.__dict__) for x in mb.kinn.rates]

In [None]:
# Index the model's layers by name for direct lookup in the cells below.
layer_dict = {l.name:l for l in model.layers}

In [None]:
# First weight array of layer 'conv_k0', rounded to 3 decimals for display.
np.around(layer_dict['conv_k0'].get_weights()[0],3)

In [None]:
# First weight array of layer 'conv_k1', rounded to 3 decimals for display.
np.around(layer_dict['conv_k1'].get_weights()[0],3)

In [None]:
# Evaluate the reloaded model on the test set and plot observed vs. predicted values.
x_test_b = mb.blockify_seq_ohe(x_test)
y_hat = model.predict(x_test_b).flatten()
# exp followed by log10 re-expresses the values on a log10 scale for plotting
# (presumably both are natural logs, as the model's output op applies tf.math.log).
obs_log10 = np.log10(np.exp(y_test))
pred_log10 = np.log10(np.exp(y_hat))
# Pass the vectors by keyword: seaborn deprecated positional x/y in 0.11 and rejects
# them from 0.12 on, where the only positional argument is `data`.
h = sns.jointplot(x=obs_log10, y=pred_log10)
h.set_axis_labels("obs", "pred", fontsize=16)
# Correlations are computed on the untransformed predictions and labels.
print("spearman", ss.spearmanr(y_hat, y_test))
p = ss.pearsonr(y_hat, y_test)
print("pearson", p)
h.fig.suptitle("Testing prediction, pcc=%.3f"%p[0], fontsize=16)