# Probabilistic model building genetic algorithm

In [1]:
# Switch to the project root so that the `src.*` imports and the relative
# `./data` / `../test` paths used below resolve.
# NOTE(review): hard-coded absolute path; adjust for your own checkout.
%cd /mnt/ceph/users/zzhang/CRISPR_pred/crispr_kinn

/mnt/ceph/users/zzhang/CRISPR_pred/crispr_kinn


In [2]:
from src.kinetic_model import KineticModel, modelSpace_to_modelParams
from src.neural_network_builder import KineticNeuralNetworkBuilder

Using TensorFlow backend.


In [15]:
import warnings
warnings.filterwarnings('ignore')
import time
from datetime import datetime

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import scipy.stats as ss
import pandas as pd
import numpy as np
from tqdm import tqdm
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

import gc
from sklearn.model_selection import train_test_split

## Load data

In [4]:
# Load the pre-compiled feature and target arrays from ./data.
x = np.load('./data/compiled_X.npy')
y = np.load('./data/compiled_Y.npy')
# Hold out 20% of the samples as the test set; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=777)

## Setup AMBER

In [5]:
import amber
print(amber.__version__)
from amber.architect import pmbga
from amber.architect import ModelSpace, Operation

0.1.1-ga


In [6]:
# Search space for the kinetic model: one entry per rate (edge) between states 0..3.
# For every rate:
#   - SOURCE / TARGET name the two states the rate connects.
#   - EDGE is either fixed to 1 or drawn from a Binomial(n=1) prior (reverse rates only).
#   - RANGE_ST is a categorical prior over the start position of the input window.
#   - RANGE_D is a zero-truncated negative-binomial prior over the window width
#     (the window spans RANGE_ST .. RANGE_ST + RANGE_D).
kinn_model_space = ModelSpace.from_dict([
    # k_01, sol -> open R-loop
    [dict(Layer_type='conv1d', filters=1, SOURCE='0', TARGET='1', 
          #kernel_size=pmbga.Categorical(choices=[1,2,3], prior_cnt=[1]*3),
          kernel_size=1,
          EDGE=1,
          RANGE_ST=pmbga.Categorical(choices=[0,1,2,3,4], prior_cnt=[1]*5),
          RANGE_D=pmbga.ZeroTruncatedNegativeBinomial(alpha=5, beta=1), 
     )],
    # k_10, open R-loop -> sol
    [dict(Layer_type='conv1d', filters=1, SOURCE='1', TARGET='0', 
          #kernel_size=pmbga.Categorical(choices=[1,2,3], prior_cnt=[1]*3),
          kernel_size=1,
          EDGE=pmbga.Binomial(alpha=1, beta=1, n=1),
          RANGE_ST=pmbga.Categorical(choices=[0,1,2,3,4], prior_cnt=[1]*5),
          RANGE_D=pmbga.ZeroTruncatedNegativeBinomial(alpha=5, beta=1),          
     )],
    # k_12, open R-loop -> intermediate R-loop
    [dict(Layer_type='conv1d', filters=1, SOURCE='1', TARGET='2', 
          kernel_size=1, 
          EDGE=1,
          RANGE_ST=pmbga.Categorical(choices=[5,6,7,8,9,10], prior_cnt=[1]*6),
          RANGE_D=pmbga.ZeroTruncatedNegativeBinomial(alpha=5, beta=1), 
     )],
    # k_21, intermediate R-loop -> open R-loop
    [dict(Layer_type='conv1d', filters=1, SOURCE='2', TARGET='1', 
          kernel_size=1, 
          EDGE=pmbga.Binomial(alpha=1, beta=1, n=1),
          RANGE_ST=pmbga.Categorical(choices=[5,6,7,8,9,10], prior_cnt=[1]*6),
          RANGE_D=pmbga.ZeroTruncatedNegativeBinomial(alpha=5, beta=1),             
     )],
    # k_23, intermediate R-loop -> closed R-loop
    [dict(Layer_type='conv1d', filters=1, SOURCE='2', TARGET='3', 
          kernel_size=1, 
          EDGE=1,
          RANGE_ST=pmbga.Categorical(choices=[13,14,15,16,17,18], prior_cnt=[1]*6),
          RANGE_D=pmbga.ZeroTruncatedNegativeBinomial(alpha=5, beta=1),     
     )],
    # k_32, closed R-loop -> intermediate R-loop
    [dict(Layer_type='conv1d', kernel_size=1, filters=1, SOURCE='3', TARGET='2', 
          EDGE=pmbga.Binomial(alpha=1, beta=1, n=1),        
          RANGE_ST=pmbga.Categorical(choices=[13,14,15,16,17,18], prior_cnt=[1]*6),
          RANGE_D=pmbga.ZeroTruncatedNegativeBinomial(alpha=5, beta=1),          
     )],
    # k_30, closed R-loop -> sol
    # NOTE(review): this is the only rate with CONTRIB=1 and the only one whose
    # start site may fall anywhere in 0..18; presumably it marks the rate that
    # contributes to the model output -- confirm against KineticModel.
    [dict(Layer_type='conv1d', kernel_size=1, filters=1, SOURCE='3', TARGET='0', 
          EDGE=1,
          RANGE_ST=pmbga.Categorical(choices=np.arange(0,19), prior_cnt=[1]*19),
          RANGE_D=pmbga.ZeroTruncatedNegativeBinomial(alpha=5, beta=1), 
          CONTRIB=1
     )],
])
print(kinn_model_space)

StateSpace with 7 layers and 1 total combinations


In [7]:
# Probabilistic-model-building genetic algorithm controller that samples
# architectures from `kinn_model_space` and is updated from stored rewards.
controller = pmbga.ProbaModelBuildGeneticAlgo(
            model_space=kinn_model_space,
            buffer_type='population',
            buffer_size=50,  # buffer size controls the max history going back
            batch_size=1,   # batch size does not matter in this case; all arcs will be retrieved
        )

## Components before they are implemented in AMBER

In [13]:
## NEEDS RE-WORK
# poorman's manager get reward
def get_reward_pipeline(model_arcs):
    """Build, train and score one sampled architecture.

    The architecture is converted to kinetic-model parameters, compiled into a
    Keras model inside a private TensorFlow graph/session, trained on the
    module-level ``x_train`` / ``y_train[:, 1]`` and evaluated on
    ``x_test`` / ``y_test[:, 1]``.

    Parameters
    ----------
    model_arcs
        Architecture sampled from the controller; passed unchanged to
        ``modelSpace_to_modelParams``.

    Returns
    -------
    float
        Pearson correlation between the test predictions and ``y_test[:, 1]``.
        A NaN correlation (e.g. constant predictions) is reported as ``0.0`` so
        the reward can always be compared and stored.

    Raises
    ------
    Exception
        Whatever the model construction or training raises is propagated; the
        session is closed first.
    """
    from warnings import simplefilter
    simplefilter(action='ignore', category=DeprecationWarning)

    # Convert first: if the architecture is rejected here, no TF resources
    # have been allocated yet.
    model_params = modelSpace_to_modelParams(model_arcs)

    train_graph = tf.Graph()
    train_sess = tf.Session(graph=train_graph)
    try:
        with train_graph.as_default(), train_sess.as_default():
            kinn_test = KineticModel(model_params)
            mb = KineticNeuralNetworkBuilder(kinn=kinn_test, session=train_sess, n_channels=13)
            # train and test
            mb.build(optimizer='adam', plot=False, output_act=True)
            model = mb.model
            x_train_b = mb.blockify_seq_ohe(x_train)
            x_test_b = mb.blockify_seq_ohe(x_test)

            # NOTE: training runs for a fixed number of epochs with no callbacks.
            # ModelCheckpoint / EarlyStopping objects used to be constructed here
            # but were never passed to fit(), so they had no effect and were dropped.
            model.fit(x_train_b, y_train[:,1],
                      batch_size=64,
                      validation_split=0.2,
                      epochs=25, verbose=0)
            y_hat = model.predict(x_test_b).flatten()
            test_pcc = ss.pearsonr(y_hat, y_test[:,1])[0]
    finally:
        # Dropping the references alone does not release the session's
        # resources; close it explicitly, also when training fails.
        train_sess.close()
        del train_graph, train_sess
        gc.collect()

    # A NaN reward cannot be ranked against other rewards; score it like a
    # failed run.
    if np.isnan(test_pcc):
        test_pcc = 0.0
    return test_pcc

## A fancy For-Loop that does the work for `amber.architect.trainEnv`

In [9]:
# trainEnv parameters
samps_per_gen = 10   # how many arcs to sample in each generation; important
max_gen = 500        # upper bound on the number of generations to run
epsilon = 0.05       # convergence threshold on the posterior shift; the check using it is currently commented out in the loop
patience = 100       # stop after this many consecutive generations without a new best reward
n_warmup_gen = -1    # generations with an index below this skip controller training; -1 means no warmup

In [10]:
def compute_eps(model_space_probs, old_probs=None, n_samples=10000,
                percentiles=(10, 20, 30, 40, 50, 60, 70, 80, 90)):
    """Measure how far the sampled distributions moved since the last snapshot.

    Every distribution in ``model_space_probs`` is sampled ``n_samples`` times
    and summarised by the requested percentiles. The per-distribution score is
    the mean absolute difference between the new percentiles and those of the
    previous samples; the returned delta is the mean of these scores.

    Parameters
    ----------
    model_space_probs : mapping
        Maps a key to an object exposing ``sample(size=...)``.
    old_probs : mapping, optional
        Samples returned by a previous call, keyed like ``model_space_probs``.
        When ``None`` the score is the mean absolute value of the new
        percentiles themselves.
    n_samples : int, optional
        Number of draws per distribution (default 10000).
    percentiles : sequence of float, optional
        Percentiles used to summarise each sample set (default 10, 20, ..., 90).

    Returns
    -------
    delta : float
        Mean score over all distributions.
    samp_probs : dict
        The freshly drawn samples, suitable as ``old_probs`` for the next call.
    """
    q = list(percentiles)  # loop-invariant percentile grid
    deltas = []
    samp_probs = {}
    for key, dist in model_space_probs.items():
        samp_probs[key] = dist.sample(size=n_samples)
        new_q = np.percentile(samp_probs[key], q)
        if old_probs is None:
            deltas.append(np.mean(np.abs(new_q)))
        else:
            old_q = np.percentile(old_probs[key], q)
            deltas.append(np.mean(np.abs(old_q - new_q)))
    return np.mean(deltas), samp_probs

# get prior probas: sample every distribution once before any training so the
# first generation's Eps is measured against the prior (the delta is discarded)
_, old_probs = compute_eps(controller.model_space_probs)

In [None]:
# Search loop: each generation samples `samps_per_gen` architectures, scores
# them with `get_reward_pipeline`, stores the rewards in the controller buffer,
# then trains the controller and logs summary statistics.
hist = []          # one record per evaluated architecture
pc_cnt = 0         # consecutive generations without a new best reward
best_indv = 0      # best reward seen so far
stat_columns = ['Generation', 'GenAvg', 'Best', 'PostVar']
stat_records = []  # one record per trained generation
stat_df = pd.DataFrame(columns=stat_columns)
for generation in range(max_gen):
    try:
        start = time.time()
        has_impr = False
        n_failed = 0
        for _ in range(samps_per_gen):
            # get arc
            arc, _ = controller.get_action()
            # get reward; a ValueError from the pipeline is scored as 0 so the
            # search keeps going. Any other exception propagates.
            try:
                test_pcc = get_reward_pipeline(arc)
            except ValueError:
                n_failed += 1
                test_pcc = 0
            rate_df = None
            # update best, or increase patience counter
            if test_pcc > best_indv:
                best_indv = test_pcc
                has_impr = True
            # store
            controller.store(action=arc, reward=test_pcc)
            hist.append({'gen': generation, 'arc':arc, 'test_pcc': test_pcc, 'rate_df': rate_df})
        end = time.time()
        if n_failed:
            # Report swallowed failures once per generation instead of hiding them.
            print("Gen %i - %i / %i arcs raised ValueError and were scored 0" % (
                generation, n_failed, samps_per_gen), flush=True)
        if generation < n_warmup_gen:
            print("Gen %i < %i warmup.. skipped - Time %.2f" % (
                generation, n_warmup_gen, end - start), flush=True)
            continue
        controller.train(episode=generation, working_dir=".")
        delta, old_probs = compute_eps(controller.model_space_probs, old_probs)
        post_vars = [np.var(dist.sample(size=100)) for dist in controller.model_space_probs.values()]
        post_var = np.mean(post_vars)
        gen_avg = controller.buffer.r_bias
        # DataFrame.append is deprecated (removed in pandas 2.0) and copies the
        # whole frame on every call; collect records and rebuild the small
        # frame so `stat_df` stays current even if the loop is interrupted.
        stat_records.append({
            'Generation': generation,
            'GenAvg': gen_avg,
            'Best': best_indv,
            'PostVar': post_var
        })
        stat_df = pd.DataFrame(stat_records, columns=stat_columns)
        print("[%s] Gen %i - Mean fitness %.3f - Best %.4f - PostVar %.3f - Eps %.3f - Time %.2f" % (
            datetime.now().strftime("%H:%M:%S"),
            generation,
            gen_avg,
            best_indv,
            post_var,
            delta,
            end-start), flush=True)
        #if delta < epsilon:
        #    print("stop due to convergence criteria")
        #    break
        pc_cnt = 0 if has_impr else pc_cnt+1
        if pc_cnt >= patience:
            print("early-stop due to max patience w/o improvement")
            break
    except KeyboardInterrupt:
        print("user interrupted")
        break

datapoints:  4 / total:  11
[19:11:23] Gen 0 - Mean fitness 0.076 - Best 0.2721 - PostVar 4.657 - Eps 0.552 - Time 55.22
datapoints:  11 / total:  21
[19:12:31] Gen 1 - Mean fitness 0.200 - Best 0.2732 - PostVar 4.391 - Eps 0.376 - Time 62.86
datapoints:  18 / total:  31
[19:13:30] Gen 2 - Mean fitness 0.122 - Best 0.2732 - PostVar 4.237 - Eps 0.150 - Time 52.45
datapoints:  25 / total:  41
[19:14:36] Gen 3 - Mean fitness 0.109 - Best 0.2732 - PostVar 3.907 - Eps 0.176 - Time 60.24
datapoints:  27 / total:  51
[19:15:42] Gen 4 - Mean fitness 0.172 - Best 0.2767 - PostVar 4.419 - Eps 0.144 - Time 61.29
datapoints:  31 / total:  61
[19:16:56] Gen 5 - Mean fitness 0.206 - Best 0.2797 - PostVar 4.066 - Eps 0.144 - Time 67.59
datapoints:  41 / total:  71
[19:17:55] Gen 6 - Mean fitness 0.148 - Best 0.2797 - PostVar 4.227 - Eps 0.131 - Time 53.73
datapoints:  46 / total:  81
[19:19:03] Gen 7 - Mean fitness 0.188 - Best 0.2871 - PostVar 3.924 - Eps 0.150 - Time 61.60
datapoints:  52 / total: 

datapoints:  257 / total:  500
[20:34:04] Gen 66 - Mean fitness 0.233 - Best 0.2911 - PostVar 3.969 - Eps 0.118 - Time 85.86
datapoints:  360 / total:  500
[20:35:08] Gen 67 - Mean fitness 0.187 - Best 0.2911 - PostVar 4.081 - Eps 0.111 - Time 58.82
datapoints:  398 / total:  500
[20:36:26] Gen 68 - Mean fitness 0.143 - Best 0.2911 - PostVar 3.918 - Eps 0.052 - Time 71.31
datapoints:  269 / total:  500
[20:37:53] Gen 69 - Mean fitness 0.230 - Best 0.2911 - PostVar 4.055 - Eps 0.118 - Time 80.90
datapoints:  375 / total:  500
[20:39:08] Gen 70 - Mean fitness 0.172 - Best 0.2911 - PostVar 3.833 - Eps 0.118 - Time 69.33
datapoints:  315 / total:  500
[20:40:31] Gen 71 - Mean fitness 0.217 - Best 0.2911 - PostVar 4.181 - Eps 0.072 - Time 76.86
datapoints:  348 / total:  500
[20:41:55] Gen 72 - Mean fitness 0.197 - Best 0.2911 - PostVar 3.912 - Eps 0.065 - Time 78.33
datapoints:  388 / total:  500
[20:43:18] Gen 73 - Mean fitness 0.151 - Best 0.2911 - PostVar 3.558 - Eps 0.065 - Time 77.91


datapoints:  410 / total:  500
[22:14:44] Gen 132 - Mean fitness 0.156 - Best 0.2951 - PostVar 3.776 - Eps 0.020 - Time 96.38
datapoints:  350 / total:  500
[22:16:27] Gen 133 - Mean fitness 0.210 - Best 0.2951 - PostVar 4.130 - Eps 0.039 - Time 96.15
datapoints:  319 / total:  500
[22:18:19] Gen 134 - Mean fitness 0.221 - Best 0.2951 - PostVar 4.529 - Eps 0.020 - Time 105.02
datapoints:  262 / total:  500
[22:20:09] Gen 135 - Mean fitness 0.236 - Best 0.2951 - PostVar 4.299 - Eps 0.046 - Time 104.42
datapoints:  337 / total:  500
[22:21:50] Gen 136 - Mean fitness 0.215 - Best 0.2951 - PostVar 4.318 - Eps 0.026 - Time 94.61
datapoints:  288 / total:  500
[22:23:42] Gen 137 - Mean fitness 0.229 - Best 0.2951 - PostVar 4.258 - Eps 0.013 - Time 106.73
datapoints:  392 / total:  500
[22:25:12] Gen 138 - Mean fitness 0.180 - Best 0.2951 - PostVar 4.457 - Eps 0.065 - Time 83.37
datapoints:  418 / total:  500
[22:26:31] Gen 139 - Mean fitness 0.144 - Best 0.2951 - PostVar 3.858 - Eps 0.013 - 

datapoints:  411 / total:  500
[00:08:27] Gen 197 - Mean fitness 0.129 - Best 0.2961 - PostVar 4.223 - Eps 0.190 - Time 84.37
datapoints:  368 / total:  500
[00:10:33] Gen 198 - Mean fitness 0.193 - Best 0.2961 - PostVar 4.146 - Eps 0.065 - Time 119.73
datapoints:  326 / total:  500
[00:12:27] Gen 199 - Mean fitness 0.216 - Best 0.2961 - PostVar 4.126 - Eps 0.059 - Time 108.16
datapoints:  313 / total:  500
[00:14:20] Gen 200 - Mean fitness 0.223 - Best 0.2961 - PostVar 3.948 - Eps 0.033 - Time 108.10
datapoints:  345 / total:  500
[00:16:16] Gen 201 - Mean fitness 0.209 - Best 0.2961 - PostVar 4.140 - Eps 0.039 - Time 110.29
datapoints:  381 / total:  500
[00:18:16] Gen 202 - Mean fitness 0.189 - Best 0.2961 - PostVar 4.336 - Eps 0.072 - Time 114.11
datapoints:  253 / total:  500
[00:20:14] Gen 203 - Mean fitness 0.237 - Best 0.2961 - PostVar 3.903 - Eps 0.124 - Time 111.86
datapoints:  278 / total:  500
[00:22:10] Gen 204 - Mean fitness 0.232 - Best 0.2961 - PostVar 4.016 - Eps 0.046

In [None]:
# All evaluated architectures, best test-set Pearson correlation first.
pd.DataFrame(hist).sort_values('test_pcc', ascending=False)

In [None]:
# Write the search history to disk. Each architecture is flattened to a string
# of per-layer "start-end" windows joined by '|', where end = RANGE_ST + RANGE_D.
a = pd.DataFrame(hist)
arc_labels = []
for entry in a['arc']:
    windows = []
    for layer in entry:
        attrs = layer.Layer_attributes
        windows.append(f"{attrs['RANGE_ST']}-{attrs['RANGE_ST']+attrs['RANGE_D']}")
    arc_labels.append('|'.join(windows))
a['arc'] = arc_labels
# 'rate_df' is always None in the history records, so it is not exported.
a = a.drop(columns=['rate_df'])
a.to_csv("train_history.tsv", sep="\t", index=False)

In [None]:
%matplotlib inline

# Reward trajectory over the search: per-generation average reported by the
# controller buffer ('GenAvg') and the running best reward ('Best').
ax = stat_df.plot.line(x='Generation', y=['GenAvg', 'Best'])
ax.set_ylabel("Reward (Pearson correlation)")
ax.set_xlabel("Generation")

In [None]:
# Reference kinetic model; its 'Rates' entries supply the dashed ground-truth lines.
kinn_gr = KineticModel('../test/test_1/test_1_model_params.yaml')

# START SITE
# One panel per rate: posterior vs. prior samples of RANGE_ST, with the
# reference model's window start drawn as a dashed line.
fig, axs_ = plt.subplots(3,3, figsize=(15,15))
axs = [axs_[i][j] for i in range(len(axs_)) for j in range(len(axs_[i]))]
for k in controller.model_space_probs:
    if k[-1] == 'RANGE_ST':
        try:
            d = controller.model_space_probs[k].sample(size=1000)
        except Exception as err:
            # Skip distributions that cannot be sampled, but say so; a bare
            # `except:` would also swallow KeyboardInterrupt.
            print(f"Skipping {k}: could not sample posterior ({err!r})")
            continue
        ax = axs[k[0]]
        sns.distplot(d, label="Post", ax=ax)
        sns.distplot(controller.model_space_probs[k].prior_dist, label="Prior", ax=ax)
        if k[0] < 7:
            ground_truth = kinn_gr.model_params['Rates'][k[0]]['input_range'][0]
            ax.axvline(x=ground_truth, linestyle='--', color='grey')
            ax.set_title(
                ' '.join(['Rate ID', str(k[0]), '\nPosterior mean', str(np.mean(d)),
                          '\nGround truth', str(ground_truth)])
            )
        else:
            ax.set_title(
                ' '.join(['Rate ID', str(k[0]), '\nPosterior mean', str(np.mean(d))]))
        # The curves are labelled above; draw the legend so the labels show.
        ax.legend()
fig.tight_layout()

In [None]:
# CONV RANGE
# One panel per rate: posterior vs. prior samples of RANGE_D (window width),
# with the reference model's width (input_range[1] - input_range[0]) dashed.
fig, axs_ = plt.subplots(3,3, figsize=(15,15))
axs = [axs_[i][j] for i in range(len(axs_)) for j in range(len(axs_[i]))]
for k in controller.model_space_probs:
    if k[-1] == 'RANGE_D':
        d = controller.model_space_probs[k].sample(size=1000)
        ax = axs[k[0]]
        # Label the posterior as well so the two curves can be told apart.
        sns.distplot(d, label="Post", ax=ax)
        sns.distplot(controller.model_space_probs[k].prior_dist, label="Prior", ax=ax)
        if k[0] < 7:
            input_range = kinn_gr.model_params['Rates'][k[0]]['input_range']
            D = input_range[1] - input_range[0]
            ax.axvline(x=D, linestyle='--', color='grey')
            ax.set_title(
                ' '.join(['Rate ID', str(k[0]), '\nPosterior mean', str(np.mean(d)), '\nGround truth', str(D)])
            )
        else:
            ax.set_title(
                ' '.join(['Rate ID', str(k[0]), '\nPosterior mean', str(np.mean(d))]))
        # Labels only show up once the legend is drawn.
        ax.legend()
fig.tight_layout()

In [None]:
# EDGE PRESENCE
# One panel per rate that has a sampled EDGE variable: posterior samples
# overlaid on the prior samples, titled with the posterior mean.
fig, axs_ = plt.subplots(3,3, figsize=(15,15))
axs = [panel for row in axs_ for panel in row]
edge_keys = [key for key in controller.model_space_probs if key[-1] == 'EDGE']
for key in edge_keys:
    edge_dist = controller.model_space_probs[key]
    post_samples = edge_dist.sample(size=1000)
    panel = axs[key[0]]
    sns.distplot(post_samples, ax=panel)
    sns.distplot(edge_dist.prior_dist, ax=panel)
    title_parts = ['Rate ID', str(key[0]), '\nPosterior mean', str(np.mean(post_samples))]
    panel.set_title(' '.join(title_parts))
fig.tight_layout()

In [None]:
# bogus connections
# For every listed rate ID, draw one figure per sampled variable of that rate,
# comparing posterior samples with the prior samples.
bogus_conns = [7]
for b in bogus_conns:
    matching_keys = [k for k in controller.model_space_probs if k[0] == b]
    for k in matching_keys:
        fig, ax = plt.subplots()
        bogus_dist = controller.model_space_probs[k]
        sns.distplot(bogus_dist.sample(size=1000), label="post", ax=ax)
        sns.distplot(bogus_dist.prior_dist, ax=ax, label="prior")
        ax.set_title(k)