In [1]:
import tensorflow as tf
import numpy as np

# Seed TensorFlow and NumPy from one shared value.
SEED = 42
tf.set_random_seed(SEED)
np.random.seed(SEED)

# Import the mLSTM babbler model
from unirep import babbler1900 as babbler

# Where model weights are stored.
MODEL_WEIGHT_PATH = "./data/1900_weights"

In [2]:
# Instantiate the babbler from the weights stored at MODEL_WEIGHT_PATH.
# batch_size stays a notebook-level name so it can be inspected or reused.
batch_size = 12
model = babbler(
    model_path=MODEL_WEIGHT_PATH,
    batch_size=batch_size,
)

  from ._conv import register_converters as _register_converters


In [72]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm

# Input directory with the tab-separated stability score files, and the HDF5
# file that accumulates results across runs under two keys:
#   "ids"  - one row per sequence: name, sequence, stability
#   "reps" - the representation vector for the same row position in "ids"
path = "./data/stability_data"
output_path = os.path.join(path, "stability_with_unirep_fusion.hdf")

ID_COLUMNS = ["name", "sequence", "stability"]
# Length of the vector obtained by concatenating the three arrays returned by
# model.get_rep (presumably 3 x 1900 for babbler1900 - TODO confirm).
REP_DIM = 5700
# Number of pending rows that triggers a write to disk.
FLUSH_EVERY = 20


def _empty_batches():
    """Return fresh, empty (ids, reps) frames with the on-disk column layout."""
    return pd.DataFrame(columns=ID_COLUMNS), pd.DataFrame(columns=list(range(0, REP_DIM)))


def _flush_batches(ids_batch, reps_batch):
    """Append the pending rows to the HDF5 file and return new empty batches.

    Nothing is written when no rows are pending. Both frames must hold the
    same number of rows so that "ids" and "reps" stay row-aligned on disk.
    """
    assert ids_batch.shape[0] == reps_batch.shape[0]
    if ids_batch.shape[0] > 0:
        print("Appending {} points...".format(ids_batch.shape[0]))
        ids_batch.to_hdf(output_path, index=False, mode="a", key="ids", format="table", append=True)
        reps_batch.to_hdf(output_path, index=False, mode="a", key="reps", format="table", append=True)
    return _empty_batches()


existing_output = pd.DataFrame(columns=ID_COLUMNS)
if os.path.isfile(output_path):
    print("Reading existing output file...")
    existing_output = pd.read_hdf(output_path, key="ids")
    print("Got {} existing data points".format(existing_output.shape[0]))
    duplicates = existing_output.duplicated(subset=["sequence"])
    assert not duplicates.any(), "existing output contains duplicated sequences"

# Every sequence that is already stored or has been added during this run.
# Tracking additions here (not just the snapshot read above) keeps the file
# free of duplicates even when a sequence occurs in several input rows/files,
# and makes the membership test a constant-time set lookup.
seen_sequences = set(existing_output["sequence"].values)

new_ids_output, new_reps_output = _empty_batches()

# sorted() makes the processing order independent of the filesystem, so the
# score kept for a sequence that appears in several files is deterministic.
for filename in sorted(os.listdir(path)):
    if not filename.endswith(".txt"):
        continue
    print("Processing data from {}".format(filename))
    df = pd.read_table(os.path.join(path, filename))
    # Use the consensus score when the file provides it, otherwise the plain
    # stability score column.
    if "consensus_stability_score" in df.columns:
        score_column = "consensus_stability_score"
    else:
        score_column = "stabilityscore"
    for index, row in tqdm(df.iterrows(), total=df.shape[0]): # TODO: Parallelize this
        sequence = row["sequence"]
        # Skip sequences that already have a stored representation.
        if sequence in seen_sequences:
            continue
        if not model.is_valid_seq(sequence, max_len=500):
            continue
        unirep_fusion = model.get_rep(sequence)
        unirep_fusion = np.concatenate((unirep_fusion[0], unirep_fusion[1], unirep_fusion[2]))
        new_ids_output.loc[len(new_ids_output)] = [row["name"], sequence, row[score_column]]
        new_reps_output.loc[len(new_reps_output)] = unirep_fusion
        seen_sequences.add(sequence)
        if new_ids_output.shape[0] >= FLUSH_EVERY:
            new_ids_output, new_reps_output = _flush_batches(new_ids_output, new_reps_output)
    # Write whatever is still pending for this file; without this the last
    # partial batch of the run would never reach the output file.
    new_ids_output, new_reps_output = _flush_batches(new_ids_output, new_reps_output)

Reading existing output file...
Got 2050 existing data points
Processing data from ssm2_stability_scores.txt


KeyboardInterrupt: 

In [71]:
# Sanity check of the stored output: read both tables back, report their row
# counts, and show the first stored representation vector.
ids = pd.read_hdf(output_path, key="ids")
n_ids = len(ids.index)
print("{} points in ids".format(n_ids))

reps = pd.read_hdf(output_path, key="reps")
n_reps = len(reps.index)
print("{} points in reps".format(n_reps))

first_rep = reps.iloc[0]
print(first_rep)

2050 points in ids
2050 points in reps
0       0.013220
1      -0.023352
2       0.017943
3       0.001995
4      -0.179529
5       0.010197
6      -0.149218
7       0.002266
8      -0.018254
9       0.072948
10      0.207161
11      0.010123
12      0.064938
13      0.053157
14      0.016196
15      0.009681
16      0.045279
17     -0.035806
18      0.137569
19     -0.026070
20     -0.000789
21      0.050852
22      0.058471
23     -0.086977
24      0.056679
25     -0.037811
26     -0.009353
27      0.025263
28      0.010998
29     -0.052333
          ...   
5670    0.624806
5671   -0.312815
5672    1.221013
5673   -0.288682
5674   -0.381718
5675   -2.431110
5676   -1.886887
5677    2.433378
5678   -0.459735
5679   -1.152914
5680   -0.564261
5681    0.883261
5682    1.340542
5683   -0.711250
5684   -0.617897
5685    2.087688
5686   -0.870234
5687    1.205188
5688   -2.492855
5689   -0.216529
5690    1.151649
5691    0.801495
5692   -1.201436
5693    2.847294
5694   -2.590188
5695   -1

In [1]:
from sklearn import linear_model

# X and Y were not defined anywhere before this cell, so a top-to-bottom run
# failed with NameError. Build them from the frames read back from the HDF5
# file: `reps` holds one representation vector per row and `ids` holds the
# matching stability value at the same row position.
# NOTE(review): using "stability" as the regression target is inferred from
# the stored columns - confirm this is the intended X/Y.
assert ids.shape[0] == reps.shape[0], "ids and reps must be row-aligned"
X = reps.astype("float64").values
Y = pd.to_numeric(ids["stability"]).values

# LassoLars usage: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoLars.html#sklearn.linear_model.LassoLars
reg = linear_model.LassoLars(alpha=0.01)
reg.fit(X, Y)