Encoder Training
==============

In [1]:
%pylab notebook

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers

import primo.models
import primo.datasets

from copy import deepcopy

from primo.models.encoder_trainer import cas9_loss


Matplotlib created a temporary config/cache directory at /tmp/matplotlib-ygg0zyjw because the default path (/tf/.cache/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


Populating the interactive namespace from numpy and matplotlib


Reserve space on the GPU for running simulations. It's important to do this before running any tensorflow code (which will take all available GPU memory):

In [2]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)


1 Physical GPUs, 1 Logical GPUs


Load up the training and validation datasets:

In [3]:
train_dataset = primo.datasets.OpenImagesTrain(
    '/tf/open_images/train/', switch_every=10**5
)

validation_dataset = primo.datasets.OpenImagesVal('/tf/open_images/validation/')

In [4]:
def keras_batch_generator(dataset_batch_generator, similarity_threshold):
    # Yield datasets
    # TODO: Verify with Callie this understanding is correct https://github.com/uwmisl/cas9-similarity-search/issues/2
    while True:
        # This tuple contains:
        # indices: a positive integer uniquely identifying an image. This index is obtained by enumerating all the images in the dataset (before splitting them into test/train/validate datasets)
        # pairs:
        indices, pairs = next(dataset_batch_generator)
        # The Euclidean distances between the two vectors in each pair
        distances = np.sqrt(np.square(pairs[:,0,:] - pairs[:,1,:]).sum(1))
        # Whether or not the images in this pair should be considered 'similar'. This is a boolean value, represented by an int (0 or 1), and is determined by whether the aforementioned Euclidean distances between image feature vectors are under some pre-deterined "similarity threshold".
        similar = (distances < similarity_threshold).astype(int)
        # Yield a pair of sequences, and 0-or-1 indicating whether they're similar.
        yield pairs, similar

In [5]:
# To see how this value was derived, please consult the Materials and Methods subsection under Feature Extraction section.
similarity_threshold = 75
# Intuitively determined:
encoder_training_dataset_batch_size = 100
# Intuitively determined:
encoder_validation_dataset_batch_size = 2500

encoder_train_batches = keras_batch_generator(
    train_dataset.balanced_pairs(encoder_training_dataset_batch_size, similarity_threshold),
    similarity_threshold
)

# encoder_val_batches = keras_batch_generator(
#     validation_dataset.random_pairs(encoder_validation_dataset_batch_size),
#     similarity_threshold
# )

encoder_val_batches = keras_batch_generator(
    validation_dataset.random_pairs(encoder_validation_dataset_batch_size),
    similarity_threshold
)

# TODO: The new predictor is the nucleaseq Cas9 predictor. https://github.com/uwmisl/cas9-similarity-search/issues/3
predictor_train_batch_size = 1000
predictor_train_batches = train_dataset.random_pairs(predictor_train_batch_size)

Create the models and stack them together with the trainer:

In [6]:
# Yield predictor here is a differentiable DNA hybridization yield predictor (originally learned from the Nupack simulator). Represented in brown to the right of the one-hot box.
![big](../../documentation/similarity_search_schematic.jpg)

/bin/sh: 1: Syntax error: word unexpected (expecting ")")


In [7]:
encoder = primo.models.Encoder('/tf/primo/data/models/encoder-function-P.h5')
#encoder = primo.models.Encoder()

# TODO: Replace the yield_predictor with the nucleaseq Cas9 predictor, use that here instead. https://github.com/uwmisl/cas9-similarity-search/issues/3 
#yield_predictor = primo.models.PredictorModel('/tf/primo/data/models/yield-model.h5')
yield_predictor = primo.models.PredictorFunction()
encoder.model.compile()
yield_predictor.model.compile()
encoder_trainer = primo.models.EncoderTrainer(encoder, yield_predictor)
encoder_trainer.model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 2, 4096)]    0                                            
__________________________________________________________________________________________________
lambda_1 (Lambda)               ((None, 4096), (None 0           input_1[0][0]                    
__________________________________________________________________________________________________
encoder (Sequential)            (None, 20, 4)        12750928    lambda_1[0][0]                   
                                                                 lambda_1[0][1]                   
__________________________________________________________________________________________________
lambda_3 (Lambda)               (None, 20, 4, 2)     0           encoder[0][0]              

Run the training!

In [8]:
#$encoder_trainer.model.compile(tf.keras.optimizers.Adagrad(1e-2), tf.keras.metrics.BinaryCrossentropy(from_logits=False))
encoder_trainer.model.compile(tf.keras.optimizers.Adagrad(1e-5), 'binary_crossentropy')
#encoder_trainer.calcseq.compile(tf.keras.optimizers.Adagrad(1e-3), cas9_loss)
#encoder_trainer.model.compile(tf.keras.optimizers.Adagrad(0), 'binary_crossentropy')

In [None]:
def compare_weights(w1, w2):
    for i in range(len(w1)):
        l1 = w1[i]
        l2 = w2[i]
        name = w1[i].name
        dist = np.linalg.norm(l1 - l2)
        max_change = np.max(l1 - l2)
        print(f"{name}: dist {dist}, max: {max_change}")

old_weights = None# deepcopy(encoder.model.trainable_weights)
    
import pandas as pd
query_features = pd.read_hdf("/tf/primo/data/queries/features.h5")
test_feature = query_features.loc['luis_lego']
class CustomCallback(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        global old_weights
        # Save the model
        filepath = f'/tf/primo/checkpoints/encoder-model.{epoch:02d}.h5'
        encoder.save(filepath)

        # Print outputs for checking
        onehots = encoder.model.predict(np.array([test_feature]))
        print(np.around(onehots, 4))
        
        # Print delta in weights to see if model is evolving
        new_weights = deepcopy(encoder.model.trainable_weights)
        if old_weights is not None:
            compare_weights(old_weights, new_weights)
        old_weights = new_weights

callbacks = [
    CustomCallback(),
]
# initial_results = encoder_trainer.model.evaluate(*next(encoder_train_batches))
# initial_val_results = encoder_trainer.model.evaluate(*next(encoder_val_batches))
# print(f"Initial loss: {initial_results:.2f}")
# print(f"Initial val loss: {initial_val_results:.2f}")
history = encoder_trainer.model.fit_generator(
    encoder_train_batches,
    steps_per_epoch = 1000,
#    epochs = 100,
#     steps_per_epoch = 500,
    epochs = 200,
    validation_data = encoder_val_batches,
    validation_steps = 1,
    verbose = 2,
    callbacks = callbacks,
)

switching to train_2.h5 and train_b.h5




In [None]:
encoder.save('/tf/primo/data/models/encoder-function-P.h5')

In [None]:
encoder_trainer.model.summary()


In [None]:
# summarize history for loss
plt.figure()
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')



In [None]:
Unique: 41 / 100
Unique: 50 / 100import primo.tools.filepath as filepaths
import primo.tools.sequences as seqtools
import pandas as pd
query_features_filepath = filepaths.get_query_features_path(isDocker=True)
query_features = pd.read_hdf(query_features_filepath)
query_seqs = encoder.encode_feature_seqs(query_features)
print(f"Query Seqs: {query_seqs}")

def seq_str_to_input(seq):
    return np.transpose(seqtools.seqs_to_onehots(seq), [1, 0, 2])

In [None]:
a = encoder_trainer.model.predict(np.array([[query_features.loc['callie_janelle'], query_features.loc['callie_janelle']]]))
print(f"Full model: {a}")

b = encoder_trainer.predictor.model.predict(np.array([
    np.concatenate([
        seq_str_to_input('TAAAAAAAAAAAAGAAAAAA'),
        seq_str_to_input('TAAAAAAAAAAAAGAAAAAA'),
    ]),
]))
print(f"Predictor with sequences: {b}")

In [None]:
encoder_trainer.calcdists.predict(np.array([[query_features.loc['callie_janelle'], query_features.loc['luis_lego']]]))

In [None]:
batch = next(encoder_train_batches)

In [None]:
pair = batch[0][12]
seqs = np.array([encoder.model.predict(pair)])
print(seqs.shape)
print(encoder_trainer.predictor.model.predict(seqs))
print(encoder_trainer.calcdists.predict(np.array([pair])))

In [None]:

encoder_trainer.predictor.model(np.array([
    np.concatenate([
        seq_str_to_input('TAAAAAAAAAAAAGAAAAAA'),
        seq_str_to_input('TAAAAAAAAAAAAGAAAAAA'),
    ]),
    np.concatenate([
        seq_str_to_input('GACATCAACGAACAAAGTAA'),
        seq_str_to_input('GAAAACAAAAAAAAAAAAAA'),
    ]),
]))
#print(np.transpose(seqtools.seqs_to_onehots('GAAAACAAAAAAAAAAAAAA'), [1, 0, 2]).shape)


In [None]:
encoder_trainer.model.predict(batch[0])