In [1]:
%pylab notebook

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers
import tensorflow.keras.backend as K

import primo.models
import primo.datasets
import primo.tools.sequences as seqtools

from copy import deepcopy

from primo.models.cas9_keras import log10_crispr_spec, log10_norm_crispr_spec, linear_crispr_spec

from primo.models.encoder import entropy_regularizer

gpus = tf.config.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)

Matplotlib created a temporary config/cache directory at /tmp/matplotlib-45heqc5j because the default path (/tf/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


Populating the interactive namespace from numpy and matplotlib
1 Physical GPUs, 1 Logical GPUs


In [2]:
train_dataset = primo.datasets.OpenImagesTrain(
    '/tf/open_images/train/', switch_every=10**5
)
validation_dataset = primo.datasets.OpenImagesVal('/tf/open_images/validation/')

def keras_batch_generator(dataset_batch_generator, similarity_threshold):
    # Yield datasets
    # TODO: Verify with Callie this understanding is correct https://github.com/uwmisl/cas9-similarity-search/issues/2
    while True:
        # This tuple contains:
        # indices: a positive integer uniquely identifying an image. This index is obtained by enumerating all the images in the dataset (before splitting them into test/train/validate datasets)
        # pairs:
        indices, pairs = next(dataset_batch_generator)
        # The Euclidean distances between the two vectors in each pair
        distances = np.sqrt(np.square(pairs[:,0,:] - pairs[:,1,:]).sum(1))
        # Whether or not the images in this pair should be considered 'similar'. This is a boolean value, represented by an int (0 or 1), and is determined by whether the aforementioned Euclidean distances between image feature vectors are under some pre-deterined "similarity threshold".
        similar = (distances < similarity_threshold).astype(int)
        # Yield a pair of sequences, and 0-or-1 indicating whether they're similar.
        yield pairs, similar

# To see how this value was derived, please consult the Materials and Methods subsection under Feature Extraction section.
similarity_threshold = 75
# Intuitively determined:
batch_size = 150
val_batch_size = 150

train_batch_generator = keras_batch_generator(
    train_dataset.balanced_pairs(batch_size, similarity_threshold),
    similarity_threshold
)

val_batch_generator = keras_batch_generator(
    validation_dataset.balanced_pairs(val_batch_size, similarity_threshold),
    similarity_threshold
)
        
train_inputs, train_targets = next(train_batch_generator)

switching to train_1.h5 and train_6.h5


In [15]:
OUTPUT_LEN = 20

subpen = np.array([-1.7449405080809126, -1.275485084790358, -1.8001827224086722, -1.9323849500279549, -1.6677722398632207, -1.6537370694565101, -1.8981469677400609, -1.0814292717607923, -1.3231152511430453, -0.99840146446464273, -1.2766126030502924, -1.073338813454068, -1.5614374592181826, -1.4737507320504855, -1.298392565410591, -1.0105000195452765, -0.43349702574711524, -0.11665543376814178, -0.17370266801790191, 0.2676084623705467, 0.051835157750172757, 0.08920809165894289, 0.075459598643889569, 0.046975071077932237]).astype(np.float32)
subpen = subpen[:20][::-1]
subtrans = np.array([[ 0.        ,  1.16616601,  0.96671383,  0.94917742],       [ 0.94076049,  0.        ,  1.18426595,  0.87129983],       [ 0.58224486,  1.11064886,  0.        ,  1.04707949],       [ 0.9633753 ,  0.98895548,  1.2293125 ,  0.        ]]).astype(np.float32)
# changes matrices to account for the nucleotide string being 'ACGT', not 'ATCG' as in seqtools.bases and the rest of the PRIMO package
finkel_bases = 'ACGT'
shift = np.array([finkel_bases.index(b) for b in seqtools.bases])
subtrans = subtrans[shift,:][:, shift]
    
def dotproduct_crispr_spec(seq_pairs):
    """Alternative cleave rate model
    
    Normalize the softmax inputs, and take the penalty at each position to be
    one minus the dot product of the two "base vectors", weighted according to the 
    position dependent substitution penalty from the finkelstein paper.
    
    This ignores the base dependent weighting -- i.e. difference between substituting
    A, T, C, or G -- but allows base outputs which are less confident but still
    similar to be evaluated as such during training. 
    
    With the previous function, an input like [0.8, 0.066, 0.066, 0.0.66] / [0.8, 0.066, 0.066, 0.0.66]
    at a single base results in a substantial penalty to cleave rate.
    """
    ref = seq_pairs[:, 0, :, :]
    obs = seq_pairs[:, 1, :, :]
    ref_norm, _ = tf.linalg.normalize(ref, axis=-1)
    obs_norm, _ = tf.linalg.normalize(obs, axis=-1)
    
    m1 = tf.einsum('bij,jk->bijk', ref, subtrans)
    m2 = tf.einsum('bij,bikj->bijk', obs, m1)
    base_factors = tf.reduce_sum(tf.reduce_sum(m2, axis=-1), axis=-1)
    # 1 - Dot product
    x = 1 -  tf.reduce_sum(ref_norm*obs_norm, axis=-1)
    
    scores = tf.reduce_sum(x * base_factors * tf.constant(subpen.flatten(), dtype=tf.float32), -1)
    scores = tf.minimum(0.0, scores)
    return scores

def dotproduct_linearized(mid_point=None):
    """Returns a predictor function which will scale the log10 scores such that the
    given `mid_point` value is 0.5.
    
    mid_point is the output of log10_crispr_spec, and shoudl be on range [log10_lb, log10_ub]
    
    If mid_point is none, cleave rate is linearized; i.e. return 10**log_10_crispr_spec(x)
    """
    power = 10.0
    if mid_point is not None:
        power = 0.5 ** (1 / mid_point)

    def f(seq_pairs):
        """
        seq_pairs batch_size x 2 x SEQLEN x 4
        """
        scores = power ** dotproduct_crispr_spec(seq_pairs)
        confidence = tf.reduce_mean(tf.reduce_max(seq_pairs, -1), axis=2)
        confidence = tf.reduce_prod(confidence, axis=1)
        return 0.5 + (scores - 0.5) * confidence

    return f

TRAIN_PREDICTOR = tf.function(dotproduct_linearized(-1.5))
REAL_PREDICTOR = tf.function(linear_crispr_spec(-1.5))

encoder = tf.keras.Sequential([
    layers.Dense(2048, activation = 'relu', activity_regularizer=tf.keras.regularizers.l2(0.0001)),
    layers.Dropout(0.2),
    layers.Dense(1024, activation = 'relu', activity_regularizer=tf.keras.regularizers.l2(0.0001)),
    layers.Dropout(0.2),
    layers.Dense(OUTPUT_LEN * 4, activation='relu'),
    layers.Reshape([OUTPUT_LEN, 4]),
    layers.Activation('softmax'),
#     layers.Lambda(
#         # Just using the identity because we don't want to transform the softmaxxed output,
#         # we just want to make sure we learn an output encoding that's regularized (i.e. not crazy complex/over-fitting)
#         lambda x: x,

#         # In inference mode, this does nothing (just passes identity), but when training, this regularizes
#         # the activations.
#         # Using an "entropy" regulator because we passed the output through a softmax.
#         activity_regularizer=entropy_regularizer(
#             0.000
#         )
#     )

], name='encoder')

#encoder = tf.keras.models.load_model('/tf/primo/data/models/encoder_1site_20220118_1.h5')

# from primo.models.encoder import ClusteringLayer
# import pickle
# with open('/tf/primo/data/clusters_kmeans_64.pkl', 'rb') as f:
#     kmodel = pickle.loads(f.read())
# kmeans = kmodel.cluster_centers_
# encoder = tf.keras.Sequential([
#     ClusteringLayer(kmeans.shape[0], kmeans, input_dim=4096, alpha=25.0),
#     layers.Dense(1024, activation = 'relu'),
#     layers.Dense(OUTPUT_LEN * 4, activation='relu'),
#     layers.Reshape([OUTPUT_LEN, 4]),
#     layers.Activation('softmax'),
# ], name='encoder')

X_pairs = layers.Input([2, 4096])

# Essentially, we started with ]a batch of feature-vector pairs...
# ...And turned them into a pair of feature-vector batches.
X1, X2 = layers.Lambda(lambda X: (X[:,0,:], X[:,1,:]))(X_pairs)

distances = layers.Lambda(lambda Xs: tf.sqrt(tf.reduce_sum(tf.square(Xs[0]-Xs[1]), axis=1)))([X1,X2])

# Independently transforms the batches of feature vectors into soft-max encoded DNA sequences.
S1 = encoder(X1)
S2 = encoder(X2)

# Glue them back together! Back into a batch of feature vector pairs.
S_pairs = layers.Lambda(
    lambda Ss: tf.stack(Ss, axis=-1)
)([S1,S2])

# Dimensions: (batch_size x 80 x 4 x 2 ) (i.e. batch size x DNA length x # of nucleotides x 2)

# Swaps dimensions for the predictor, which wants (batch-size x 2 x DNA length x 4)
S_pairs_T = layers.Lambda(lambda S: tf.transpose(S, [0, 3, 1, 2]))(S_pairs)

# y_h: Estimated output
y_h = layers.Lambda(tf.function(TRAIN_PREDICTOR))(S_pairs_T)
y_h_T = layers.Reshape([1])(y_h)

# Make layers for one hot scoring
S_pairs_T_OH = tf.one_hot(K.argmax(S_pairs_T, -1), 4)
y_h_OH = layers.Lambda(REAL_PREDICTOR)(S_pairs_T_OH)
y_h_t_OH = layers.Reshape([1])(y_h_OH)
y_h_OH_log = layers.Lambda(tf.function(log10_crispr_spec))(S_pairs_T_OH)

calcseq = tf.keras.Model(inputs=X_pairs, outputs=S_pairs_T)
calcdists = tf.keras.Model(inputs=X_pairs, outputs=distances)
encoder_trainer = tf.keras.Model(inputs=X_pairs, outputs=y_h_T)
onehotseqs = tf.keras.Model(inputs=X_pairs, outputs=S_pairs_T_OH)
onehotscore = tf.keras.Model(inputs=X_pairs, outputs=y_h_t_OH)
onehotscore_log = tf.keras.Model(inputs=X_pairs, outputs=y_h_OH_log)

print(encoder.summary())



Model: "encoder"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 2048)              8390656   
_________________________________________________________________
dropout_2 (Dropout)          (None, 2048)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 1024)              2098176   
_________________________________________________________________
dropout_3 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 80)                82000     
_________________________________________________________________
reshape_5 (Reshape)          (None, 20, 4)             0         
_________________________________________________________________
activation_1 (Activation)    (None, 20, 4)             0   

In [13]:
import secrets
import os
def UniquenessMetric(y_true, y_pred):
    """Use a single batch of inputs to measure sequence uniqueness while training"""
    seqs = calcseq(train_inputs)
    # Reduce seqs to a single base (e.g. [0.6, 0.2, 0.1, 0.1] -> [0])
    rounded_seqs = K.argmax(seqs[:, 0, :, :], axis=-1)
    # Now each sample is array of 20 integers, 0-3. Multiply each position by 4**n to make
    # a unique integer for each possible 20bp sequence
    hashed_seqs = tf.reduce_sum(rounded_seqs * (4 ** tf.range(0, 20, dtype=tf.int64)), axis=-1)
    # Get unique ones
    unique_seqs, _ = tf.unique(hashed_seqs)
    # Return ratio of unique to input
    return float(len(unique_seqs)) / len(train_inputs)

def ConfidenceMetric(batch):
    """Uses a single batch of inputs to measure sequence uniqueness while training"""
    def confidence(_y_true, _y_pred):
        seqs = calcseq(batch)
        # Get the highest probabilty value for each base
        max_seqs = tf.reduce_max(seqs, axis=-1)
        # Take mean along both batch and seqence dimensions
        return tf.reduce_mean(max_seqs)
    return confidence

class EarlyStopCallback(keras.callbacks.Callback):
    def __init__(self, **kwargs):
        uid = secrets.token_hex(2)
        self.__file = f'/tf/primo/signals/{uid}'
        print(f"Touch {self.__file} to terminate training early")
        super().__init__(**kwargs)


    def on_epoch_end(self, epoch, logs={}):
        if os.path.exists(self.__file):
          print (f"\nStopping after Epoch {epoch}")
          self.model.stop_training = True


In [16]:
import tensorflow_addons as tfa
encoder_trainer.compile(
    tf.keras.optimizers.Adagrad(1e-1),
    #'binary_crossentropy',
    tfa.losses.TripletSemiHardLoss(),
    run_eagerly=False,
    metrics=[
        UniquenessMetric,
        ConfidenceMetric(train_inputs),
        tf.keras.metrics.Precision(), 
        tf.keras.metrics.Recall(),
        'accuracy']
)
history = encoder_trainer.fit(
    train_batch_generator,
    validation_data=val_batch_generator,
    validation_steps=50,
    steps_per_epoch=500,
    epochs=100,
    callbacks=[EarlyStopCallback()],
)

Touch /tf/primo/signals/79df to terminate training early
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100

KeyboardInterrupt: 

In [7]:
encoder_trainer.compile(tf.keras.optimizers.Adagrad(1e-2, clipnorm=1.0), 'binary_crossentropy')
history = encoder_trainer.fit(train_inputs, train_targets, epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [102]:

encoder.save('/tf/primo/data/models/encoder_1site_20220118_1.h5')



In [11]:
fig = plt.figure(figsize=(8, 12))
lines = [k for k in history.history.keys() if not k.startswith('val_')]

axes = fig.subplots(int(len(lines)/2), 2).flatten()
for i in range(len(lines)):
    label = lines[i]
    try:
        ax = axes[i]
    except TypeError:
        ax = axes
    
    val_label = 'val_' + label

    ax.plot(history.history[label], label=label)
    if val_label in history.history:
        ax.plot(history.history[val_label], linestyle=':', label=val_label)
    ax.legend()

<IPython.core.display.Javascript object>

In [100]:
def confidence_by_base(sequences):
    """Takes a batch of softmax sequences, and returns vector of average confidence in each position
    
    sequences: Number of exmaple x Number of bases x 4
    
    Returns: 1 x number of bases
    """
    # reduce last dimension to max base number
    return np.mean(np.max(sequences, axis=-1), axis=0)

def entropy_by_base(sequences):
    x = np.sum(sequences, axis=0)
    x /= np.sum(x, axis=1)[:, None]
    entropy = -np.sum(x * np.log(x + 1e-10), axis=1)
    return entropy

def one_hot(nparray, depth = 0, on_value = 1, off_value = 0):
    if depth == 0:
        depth = np.max(nparray) + 1
    assert np.max(nparray) < depth, "the max index of nparray: {} is larger than depth: {}".format(np.max(nparray), depth)
    shape = nparray.shape
    out = np.ones((*shape, depth)) * off_value
    indices = []
    for i in range(nparray.ndim):
        tiles = [1] * nparray.ndim
        s = [1] * nparray.ndim
        s[i] = -1
        r = np.arange(shape[i]).reshape(s)
        if i > 0:
            tiles[i-1] = shape[i-1]
            r = np.tile(r, tiles)
        indices.append(r)
    indices.append(nparray)
    out[tuple(indices)] = on_value
    return out

seqs_softmax = calcseq.predict(train_inputs)[:, 0, :, :]
seqs_onehot = one_hot(seqs_softmax.argmax(-1))

confidence = confidence_by_base(seqs_softmax)
entropy = entropy_by_base(seqs_softmax)

seqlen = seqs_softmax.shape[1]
fig = plt.figure()
axes = fig.subplots(2,1)
ax = axes[0]
ax.bar(np.arange(0, seqlen), confidence)
ax.set_title('confidence')
ax = axes[1]
ax.bar(np.arange(0, seqlen), entropy)
ax.set_title('entropy')
ax.set_xlabel('sequence position')
plt.tight_layout()

oh_confidence = confidence_by_base(seqs_onehot)
oh_entropy = entropy_by_base(seqs_onehot)

subpen = np.array([-1.7449405080809126, -1.275485084790358, -1.8001827224086722, -1.9323849500279549, -1.6677722398632207, -1.6537370694565101, -1.8981469677400609, -1.0814292717607923, -1.3231152511430453, -0.99840146446464273, -1.2766126030502924, -1.073338813454068, -1.5614374592181826, -1.4737507320504855, -1.298392565410591, -1.0105000195452765, -0.43349702574711524, -0.11665543376814178, -0.17370266801790191, 0.2676084623705467, 0.051835157750172757, 0.08920809165894289, 0.075459598643889569, 0.046975071077932237]).astype(np.float32)
subpen = subpen[:20][::-1]

fig = plt.figure()
ax = fig.add_subplot(111)

ax.bar(np.arange(0, seqlen), oh_entropy)
ax.plot(-subpen)
ax.set_title('onehot entropy')
ax.set_xlabel('sequence position')
plt.tight_layout()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [92]:
idx = train_targets.argsort()

Y2 = onehotscore.predict(train_inputs)
Y = encoder_trainer.predict(train_inputs)
seqs = calcseq.predict(train_inputs)
plt.figure()

print(train_targets.shape)
print((Y[:, 0] - train_targets).shape)
plt.plot(train_targets[idx], '.')
plt.plot(Y[idx][:, 0], 'x')
plt.plot(Y2[idx][:, 0], '.')
plt.hlines(0.5, 0, len(seqs), linestyle=':')
plt.ylabel('score')
plt.xlabel('sample')
plt.title('random balanced training batch')
plt.legend(['truth', 'softmax', 'onehot'])

<IPython.core.display.Javascript object>

(750,)
(750,)


<matplotlib.legend.Legend at 0x7f10787e7978>

In [93]:
def get_batchs(n, generator):
    inputs = None
    target = None
    for _ in range(n):
        i, t = next(generator)
        if inputs is None:
            inputs = i
            targets = t
        else:
            inputs = np.concatenate([inputs, i])
            targets = np.concatenate([targets, t])
    return inputs, targets

#val_inputs, val_targets = next(val_batch_generator)
val_inputs, val_targets = get_batchs(1, val_batch_generator)

idx = val_targets.argsort()
Y = encoder_trainer.predict(val_inputs)
Y2 = onehotscore.predict(val_inputs)
plt.figure()
plt.plot(val_targets[idx], '.')
plt.plot(Y[idx][:, 0], 'x')
plt.plot(Y2[idx][:, 0], '.')
plt.hlines(0.5, 0, len(Y), linestyle=':')
plt.ylabel('score')
plt.xlabel('sample')
plt.title('random balanced validation batch')



<IPython.core.display.Javascript object>

Text(0.5, 1.0, 'random balanced validation batch')

In [101]:
val_inputs, val_targets = get_batchs(5, val_batch_generator)
train_inputs, train_targets = get_batchs(5, train_batch_generator)

def log_batch_results(inputs, targets, title):
    idx = targets.argsort()
    Y = np.maximum(-4.0, onehotscore_log.predict(inputs))

    RECALL_THRESH = -2.0

    recall = ((Y > RECALL_THRESH) * (targets > 0.5)).sum() / (targets > 0.5).sum()
    precision = ((Y > RECALL_THRESH) * (targets > 0.5)).sum() / (Y > RECALL_THRESH).sum()
    plt.figure()
    #plt.plot(val_targets[idx], '.')
    plt.plot(Y[idx], '.')
    plt.hlines(-2.0, 0, len(Y), color='black', linestyle=':')
    plt.vlines(len(inputs) / 2, -4, 0, color='black', linestyle=':')
    plt.ylabel('log10 rate')
    plt.xlabel('sample')
    plt.title(title + f"\nRecall: {recall:.2f}, precision: {precision:.2f}")

    print(f"Recall: {recall:.2f}, precision: {precision:.2f}")
    
log_batch_results(train_inputs, train_targets, 'training batch')
log_batch_results(val_inputs, val_targets, 'validation batch')

<IPython.core.display.Javascript object>

Recall: 0.89, precision: 0.77


<IPython.core.display.Javascript object>

Recall: 0.83, precision: 0.91


In [11]:
seqs = calcseq.predict(train_inputs)
all_seqs = seqs[:, 0, :, :] #np.concatenate([seqs[:, 0, :, :], seqs[:, 1, :, :]])

seq_strings = seqtools.onehots_to_seqs(all_seqs)
print(len(np.unique(all_seqs.argmax(-1), axis=0)))
unique = np.unique(seq_strings)
print(unique)
print(f"{len(unique)} / {len(all_seqs)}")



22
['ACAATGGACAATTCTATCCT' 'ACAATGGGCAATTCTATCCT' 'CCAATGGACAATGCTATCCT'
 'CCAATGGACAATTCTATCCT' 'CCAATGGGCAATTCTATCCT' 'GCAAGGTGATTTGCTATCCT'
 'GCAATGGACAATTCTATCCT' 'GCCATGGACAATTCTATCCT' 'TCAAAGTCATTGGCTATCCT'
 'TCAATGCACTATTCTATCCT' 'TCAATGGAAAATTCTATCCT' 'TCAATGGACAAGCCTATCCT'
 'TCAATGGACAATGCTATCCT' 'TCAATGGACAATTCTATCCT' 'TCAATGGACTATTCTATCCT'
 'TCAATGGCCAATGCTATCCT' 'TCAATGGCCATGTCTATCCT' 'TCAATGGGAAATGCTATCCT'
 'TCAATGTGCTTGTCTATCCT' 'TCCATGCACAATTCTATCCT' 'TCCATGGACAATTCTATCCT'
 'TTAATGGACAATTCTATCCT']
22 / 150


In [18]:
all_seqs[7]

array([[0.00333693, 0.9899893 , 0.00333693, 0.00333693],
       [0.22694145, 0.24425125, 0.27815652, 0.25065082],
       [0.23219538, 0.2819585 , 0.25365072, 0.23219538],
       [0.23225082, 0.6456381 , 0.06428759, 0.05782351],
       [0.5236062 , 0.10427291, 0.09229853, 0.27982232],
       [0.07805529, 0.03192632, 0.8580921 , 0.03192632],
       [0.00319229, 0.00319229, 0.00319229, 0.9904231 ],
       [0.05635713, 0.35138175, 0.535904  , 0.05635713],
       [0.21648732, 0.07278113, 0.22385861, 0.48687303],
       [0.00915403, 0.00915403, 0.9725379 , 0.00915403],
       [0.1738564 , 0.13498202, 0.12751456, 0.56364703],
       [0.00953625, 0.01210224, 0.969872  , 0.00848958],
       [0.98716694, 0.00427771, 0.00427771, 0.00427771],
       [0.00378281, 0.00378281, 0.98804945, 0.00438489],
       [0.0809309 , 0.16193402, 0.09562994, 0.66150516],
       [0.00678979, 0.00668144, 0.00668144, 0.9798473 ],
       [0.9866355 , 0.00445482, 0.00445482, 0.00445482],
       [0.98578656, 0.00437462,

In [9]:
# weights = [tensor for tensor in encoder_trainer.trainable_weights]# if encoder_trainer.get_layer(tensor.name[:-2]).trainable]
# optimizer = encoder_trainer.optimizer
# loss = tf.keras.metrics.BinaryCrossentropy(from_logits=False)(y_h_T, train_targets)
# g = optimizer.get_gradients(loss, weights)

with tf.GradientTape() as tape:
    # Doing the computation in the context of the gradient tape
    # For example computing loss
    pred = encoder_trainer(train_inputs)
    loss = tf.metrics.MSE(train_targets,pred)
    
# Getting the gradient of weight w.r.t loss 
grad = tape.gradient(loss, encoder_trainer.trainable_weights) 
print(grad)
#print([tf.norm(g).numpy() for g in grad])

[<tf.Tensor: shape=(4096, 2048), dtype=float32, numpy=
array([[-0.00668996, -0.00442752, -0.01292114, ..., -0.00297812,
         0.00866672,  0.00046856],
       [-0.014083  , -0.0085131 , -0.01540488, ..., -0.00662993,
         0.01435623,  0.00218737],
       [-0.00739208, -0.00459306, -0.00793888, ..., -0.00424623,
         0.00936238,  0.00140106],
       ...,
       [-0.00707141, -0.0031601 , -0.00536229, ..., -0.00655378,
         0.01212534,  0.0022773 ],
       [-0.01130053, -0.00968682, -0.01601881, ..., -0.00374336,
         0.00950455,  0.00092499],
       [-0.00697468, -0.0061232 , -0.00847617, ..., -0.0014009 ,
         0.00568788,  0.00132537]], dtype=float32)>, <tf.Tensor: shape=(2048,), dtype=float32, numpy=
array([-0.01711289, -0.01263236, -0.01965856, ..., -0.00607363,
        0.01729374,  0.0022029 ], dtype=float32)>, <tf.Tensor: shape=(2048, 2048), dtype=float32, numpy=
array([[ 5.5676320e-04,  7.4185044e-02,  7.9013333e-02, ...,
        -1.9020287e-02, -2.8113823e-

In [10]:
# summarize history for loss
plt.figure()
plt.plot(history.history['loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x7f7e06ef4710>

In [11]:
sample_input = train_inputs[0]
inspect_layers = [encoder.model.layers[0].output, encoder.model.layers[3].output]
layers_model = tf.keras.Model(inputs=encoder.model.input, outputs=inspect_layers)
Y = layers_model.predict(sample_input)
print(Y)
plt.figure()
plt.plot(Y[0].T, '.')

AttributeError: 'Sequential' object has no attribute 'model'

In [None]:
sim = primo.models.Simulator()
from primo.models.cas9 import crispr_specificity
crispr_specificity('TATGTAGAAAACTGCAAAAC', 'TATGTAGAAAACTGCAAAAC')

In [None]:
x = np.array([[all_seqs[0], all_seqs[1]]])
print(x.shape)
print(f"linear: {linear_crispr_spec(tf.convert_to_tensor(x, dtype = tf.float32))}")
print(f"log10norm: {log10_norm_crispr_spec(tf.convert_to_tensor(x, dtype = tf.float32))}")
all_seqs[0].shape

In [None]:
print(np.around(all_seqs[0], 4))