In [1]:
import tensorflow as tf
import numpy as np

# Set seeds
tf.set_random_seed(42)
np.random.seed(42)
   
# Import the mLSTM babbler model
from unirep import babbler1900 as babbler
    
# Where model weights are stored.
MODEL_WEIGHT_PATH = "./data/1900_weights"

In [2]:
tf.reset_default_graph()
batch_size = 12
model = babbler(batch_size=batch_size, model_path=MODEL_WEIGHT_PATH)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
dim is deprecated, use axis instead

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
Use tf.random.categorical instead.

In [None]:
# Check that representations are reproducible
import os
import pandas as pd
from tqdm import tqdm_notebook as tqdm

# Load the saved representations
path = "./data/stability_data"
output_path = os.path.join(path, "stability_with_unirep_fusion.hdf")
existing_seqs = pd.read_hdf(output_path, key="ids").reset_index(drop=True)
existing_reps = pd.read_hdf(output_path, key="reps").reset_index(drop=True)
assert existing_seqs.shape[0] == existing_reps.shape[0]
assert np.array_equal(existing_seqs.index, existing_reps.index)

# Create reprensetations for some seqs
for index, row in tqdm(existing_seqs.iterrows(), total=existing_seqs.shape[0]):
    check_rep_1 = model.get_rep(row["sequence"])
    check_rep_1 = np.concatenate((check_rep_1[0], check_rep_1[1], check_rep_1[2]))
    check_rep_2 = model.get_rep(row["sequence"])
    check_rep_2 = np.concatenate((check_rep_2[0], check_rep_2[1], check_rep_2[2]))
    true_rep = existing_reps.iloc[index].values

    if not np.allclose(true_rep, check_rep_1, atol=0.0002) or not np.allclose(check_rep_1, check_rep_2, atol=0.0002):
        true_check_diff = abs(np.sum(true_rep - check_rep_1))
        self_run_diff = abs(np.sum(check_rep_1 - check_rep_2))
        print("{}: {} difference with saved truth".format(index, true_check_diff))
        print("{}: {} difference with self comparison".format(index, self_run_diff))

In [9]:
# Save meta_graph
# graph_def = tf.Session().graph_def
# graph_def = tf.get_default_graph().as_graph_def() # Get the loaded babbler graph
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver()
    saver.save(sess, "./")

In [142]:
from unirep import mLSTMCell1900, tf_get_shape

# FIXME: Group input sequences by length

class babbler1900():

    def __init__(self,
                 model_path="./data/1900_weights",
                 batch_size=500
                 ):
        self._model_path = model_path
        self._batch_size = batch_size
        
        # Batch size dimensional placeholder which gives the
        # Lengths of the input sequence batch. Used to index into
        # The final_hidden output and select the stop codon -1
        # final hidden for the graph operation.
        self._rnn = mLSTMCell1900(1900,
                    model_path=self._model_path,
                        wn=True)
        zero_state = self._rnn.zero_state(self._batch_size, tf.float32)

        self._embed_matrix = tf.get_variable(
            "embed_matrix", dtype=tf.float32, initializer=np.load(os.path.join(self._model_path, "embed_matrix:0.npy"))
        )
        
        with tf.Session() as sess:
            self._zero_state = sess.run(zero_state)
        
    def get_reps(self, seqs):
        # Get the input sequences
        seq_ints = [aa_seq_to_int(seq.strip())[:-1] for seq in seqs]
        lengths = [len(x) for x in seq_ints]
        print(max(lengths))
        tf_tensor = tf.convert_to_tensor(seq_ints)
        dataset = tf.data.Dataset.from_tensor_slices(tf_tensor).batch(self._batch_size)
        iterator = dataset.make_one_shot_iterator()
        input_tensor = iterator.get_next()

        embed_cell = tf.nn.embedding_lookup(self._embed_matrix, input_tensor)
        _output, _final_state = tf.nn.dynamic_rnn(
            self._rnn,
            embed_cell,
            initial_state=self._zero_state,
            swap_memory=True,
            parallel_iterations=1
        )
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            final_state_, hs = sess.run([_final_state, _output])
            assert final_state_[0].shape[0] == self._batch_size

            final_cell, final_hidden = final_state_
            avg_hidden = np.array([np.mean(x, axis=0) for x in hs])
            together = np.concatenate((avg_hidden, final_hidden, final_cell), axis=1)
            return together

tf.reset_default_graph()
model = babbler1900(batch_size=5000)
path = "./data/stability_data"
df = pd.read_table(os.path.join(path, "ssm2_stability_scores.txt"))
seqs = df["sequence"].iloc[:5000].values
results = model.get_reps(seqs)
print(results.shape)
print(results)

# Check that representations are reproducible
import os
import pandas as pd
from tqdm import tqdm_notebook as tqdm

# Load the saved representations
path = "./data/stability_data"
output_path = os.path.join(path, "stability_with_unirep_fusion.hdf")
existing_seqs = pd.read_hdf(output_path, key="ids").reset_index(drop=True)
existing_reps = pd.read_hdf(output_path, key="reps").reset_index(drop=True)
assert existing_seqs.shape[0] == existing_reps.shape[0]
assert np.array_equal(existing_seqs.index, existing_reps.index)

# Create reprensetations for some seqs
for index, row in tqdm(existing_seqs.iterrows(), total=existing_seqs.shape[0]):
    check_rep = results[index]
    true_rep = existing_reps.iloc[index].values

    if not np.allclose(true_rep, check_rep, atol=0.00001):
        true_check_diff = abs(np.sum(true_rep - check_rep))
        print("{}: {} difference with saved truth".format(index, true_check_diff))

44
(5000, 5700)
[[ 0.0132201  -0.02335193  0.01794337 ...  0.5680594   0.92126364
   1.4379143 ]
 [ 0.01448829 -0.03285311  0.01742038 ...  0.6266354   1.5777107
   0.93737316]
 [ 0.01432922 -0.0293345   0.01375658 ...  0.5569489   1.8414272
   1.0451972 ]
 ...
 [ 0.01324046 -0.08780225  0.03289572 ...  1.2917025   1.671077
  -0.26092538]
 [ 0.01469793 -0.08446169  0.03067384 ...  1.0497465   2.312757
  -0.0273695 ]
 [ 0.01571836 -0.06809746  0.03865753 ...  1.0964143   0.9065295
  -0.22463574]]


HBox(children=(IntProgress(value=0, max=11380), HTML(value='')))

264: 1.8437343896948732e-05 difference with saved truth
319: 1.981104833248537e-05 difference with saved truth
356: 7.296836702153087e-05 difference with saved truth
400: 0.00018367973098065704 difference with saved truth
636: 0.0003032814711332321 difference with saved truth
668: 2.101020436384715e-05 difference with saved truth
702: 1.115216946345754e-06 difference with saved truth
788: 4.8540576244704425e-05 difference with saved truth
846: 4.7687361075077206e-05 difference with saved truth
869: 0.0002882756234612316 difference with saved truth
881: 0.0003204078529961407 difference with saved truth
882: 0.00016211306501645595 difference with saved truth
883: 0.00028316956013441086 difference with saved truth
904: 4.720847209682688e-05 difference with saved truth
912: 0.00014356020255945623 difference with saved truth
924: 3.420394205022603e-05 difference with saved truth
926: 5.047861486673355e-05 difference with saved truth
975: 0.00019094362505711615 difference with saved truth
10

2007: 0.0005706866504624486 difference with saved truth
2021: 6.293083424679935e-05 difference with saved truth
2022: 3.215122706023976e-05 difference with saved truth
2024: 0.00029548018937930465 difference with saved truth
2032: 4.583796180668287e-05 difference with saved truth
2037: 0.00025150779401883483 difference with saved truth
2040: 0.0005566130275838077 difference with saved truth
2041: 0.0001079081921488978 difference with saved truth
2043: 0.0006550320540554821 difference with saved truth
2045: 0.0004294779209885746 difference with saved truth
2046: 5.041695658292156e-06 difference with saved truth
2047: 0.0002736358728725463 difference with saved truth
2049: 0.00036644533975049853 difference with saved truth
2050: 0.0005854980554431677 difference with saved truth
2051: 0.000252632366027683 difference with saved truth
2052: 0.00037359638372436166 difference with saved truth
2053: 0.00030481701833195984 difference with saved truth
2054: 0.00034985330421477556 difference with

2658: 0.0001134863487095572 difference with saved truth
2675: 3.6972396628698334e-05 difference with saved truth
2705: 0.00018195361190009862 difference with saved truth
2732: 0.0003320676914881915 difference with saved truth
2741: 0.0003059689770452678 difference with saved truth
2742: 0.00029828280094079673 difference with saved truth
2743: 0.00011131382780149579 difference with saved truth
2748: 0.00010609444143483415 difference with saved truth
2750: 0.00025949961855076253 difference with saved truth
2754: 1.3978065908304416e-05 difference with saved truth
2757: 0.00014106612070463598 difference with saved truth
2761: 0.0001485230604885146 difference with saved truth
2780: 0.0001022981305141002 difference with saved truth
2791: 8.941145642893389e-05 difference with saved truth
2795: 0.0002622094179969281 difference with saved truth
2800: 0.0004355507844593376 difference with saved truth
2832: 7.446276867995039e-05 difference with saved truth
2842: 0.0002840459637809545 difference w

4469: 0.00021746309357695282 difference with saved truth
4471: 0.0003162219072692096 difference with saved truth
4472: 0.00014761167403776199 difference with saved truth
4483: 0.0001935485051944852 difference with saved truth
4507: 0.0002790364669635892 difference with saved truth
4522: 0.00012434150266926736 difference with saved truth
4534: 0.00031075667357072234 difference with saved truth
4570: 0.00023544261057395488 difference with saved truth
4613: 9.487812349107116e-05 difference with saved truth
4620: 5.890015745535493e-05 difference with saved truth
4634: 0.00011044720304198563 difference with saved truth
4700: 7.558657671324909e-05 difference with saved truth
4885: 1.4136901882011443e-05 difference with saved truth
4886: 0.00024612824199721217 difference with saved truth
4888: 0.00017460298840887845 difference with saved truth
4893: 0.00013193771883379668 difference with saved truth
4925: 8.322707435581833e-05 difference with saved truth
4938: 1.7963197024073452e-05 differenc

IndexError: index 5000 is out of bounds for axis 0 with size 5000

In [20]:
# Use tf.data and tf.Variable to feed data into the inference step
import os
import tensorflow as tf
from unirep import aa_seq_to_int, initialize_uninitialized
import pandas as pd

#using a placeholder
tf.set_random_seed(42)
np.random.seed(42)

# data = np.random.sample((100,2))
# tensor = tf.convert_to_tensor(data)
# dataset = tf.data.Dataset.from_tensor_slices(tensor)
# iter = dataset.make_one_shot_iterator()
# el = iter.get_next()
# with tf.Session() as sess:
#     result = sess.run(el) # output [0.37454012 0.95071431]
#     print(result)

# tf.reset_default_graph() # Reset the graph

def get_reps(model, seqs):

    
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
#         final_cell, final_hidden, hs = sess.run([final_cell_ts, final_hidden_ts, hs_ts])
        final_state_, hs = sess.run([model._inference_final_state, model._inference_output],
                                    feed_dict={
                                        model._minibatch_x_placeholder: [seq_ints[0]]
                                    }
                                   )

        final_cell, final_hidden = final_state_
        # Drop the batch dimension so it is just seq len by representation size
        final_cell = final_cell[0]
        final_hidden = final_hidden[0]
        hs = hs[0]
        avg_hidden = np.mean(hs, axis=0)
        return avg_hidden, final_hidden, final_cell
    
path = "./data/stability_data"
df = pd.read_table(os.path.join(path, "ssm2_stability_scores.txt"))
seqs = df["sequence"].iloc[:100].values
get_reps(model, seqs) # TODO: This is only the first 100 rows

(100,)
Tensor("Const_19:0", shape=(100, 44), dtype=int32)
<DatasetV1Adapter shapes: (44,), types: tf.int32>


(array([ 0.0132201 , -0.023352  ,  0.01794336, ...,  0.04402848,
         0.03012069,  0.047946  ], dtype=float32),
 array([ 0.00847428, -0.00885865,  0.03277024, ...,  0.07334112,
         0.07954072,  0.08330579], dtype=float32),
 array([ 1.7381716, -1.5599622,  6.092423 , ...,  0.5680599,  0.9212607,
         1.4379159], dtype=float32))

In [None]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm

path = "./data/stability_data"
output_path = os.path.join(path, "stability_with_unirep_fusion.hdf")

existing_output = pd.DataFrame(columns=["name", "sequence", "stability"])
if os.path.isfile(output_path):
    print("Reading existing output file...")
    existing_output = pd.read_hdf(output_path, key="ids")
    print("Got {} existing data points".format(existing_output.shape[0]))
    duplicates = existing_output.duplicated(subset=["sequence"])
    assert True not in duplicates.values

new_ids_output = pd.DataFrame(columns=["name", "sequence", "stability"])
new_reps_output = pd.DataFrame(columns=list(range(0, 5700)))

for filename in os.listdir(path):
    if filename.endswith(".txt"):
        print("Processing data from {}".format(filename))
        df = pd.read_table(os.path.join(path, filename))
        model.get_rep(df["sequence"].values)
        for index, row in tqdm(df.iterrows(), total=df.shape[0]): # TODO: Parallelize this
            if index != 0 and index % 20 == 0:
                assert new_ids_output.shape[0] == new_reps_output.shape[0]
                if new_ids_output.shape[0] > 0:
                    print("Appending {} points...".format(new_ids_output.shape[0]))
                    new_ids_output.to_hdf(output_path, index=False, mode="a", key="ids", format="table", append=True)
                    new_ids_output = pd.DataFrame(columns=["name", "sequence", "stability"])
                    new_reps_output.to_hdf(output_path, index=False, mode="a", key="reps", format="table", append=True)
                    new_reps_output = pd.DataFrame(columns=list(range(0, 5700)))
            # If there is no existing data or the existing data already contains this sequence, ignore it
            if existing_output.empty or not row["sequence"] in existing_output["sequence"].values:
                if model.is_valid_seq(row["sequence"], max_len=500):
                    unirep_fusion = model.get_rep(row["sequence"])
                    unirep_fusion = np.concatenate((unirep_fusion[0], unirep_fusion[1], unirep_fusion[2]))
                    print(unirep_fusion.shape)
                    if "consensus_stability_score" in df.columns:
                        stability_score = row["consensus_stability_score"]
                    else:
                        stability_score = row["stabilityscore"]
                    new_ids_output.loc[len(new_ids_output)]=[row["name"], row["sequence"], stability_score]
                    new_reps_output.loc[len(new_reps_output)]=unirep_fusion

In [None]:
ids = pd.read_hdf(output_path, key="ids")
print("{} points in ids".format(ids.shape[0]))
reps = pd.read_hdf(output_path, key="reps")
print("{} points in reps".format(reps.shape[0]))
print(reps.iloc[0])