In [12]:
%load_ext autoreload
%autoreload 2
import multibind as mb
import numpy as np
import pandas as pd
import torch
import torch.nn as tnn
import torch.optim as topti
import torch.utils.data as tdata
import bindome as bd
bd.constants.ANNOTATIONS_DIRECTORY = '../annotations'
import matplotlib.pyplot as plt
import seaborn as sns
import logomaker
import os
import scipy
import pickle

# Use a GPU if available, as it should be faster.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device: " + str(device))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Using device: cpu


In [21]:
matlab_path = os.path.join(bd.constants.ANNOTATIONS_DIRECTORY, 'pbm', 'affreg', 'PbmDataHom6_norm.mat')
mat = scipy.io.loadmat(matlab_path)
data = mat['PbmData'][0]
seqs_dna =  data[0][5]
seqs_dna = [s[0][0] for s in seqs_dna]
# load the MSA sequences, one hot encoded
df, signal = bd.datasets.PBM.pbm_homeo_affreg()
# x, y = pickle.load(open('../../data/example_homeo_PbmData.pkl', 'rb'))
x, y = pickle.load(open('../annotations/pbm/example_homeo_PbmData.pkl', 'rb'))

In [22]:
x_input = np.stack(x).transpose((0, 2, 1)).astype(np.float32)
x_input = torch.Tensor(x_input)

x_target = torch.Tensor(np.stack(x).astype(np.float32))

In [97]:
class Autoencoder(tnn.Module):
    def __init__(self, input_size=21, seq_length=88):
        super().__init__()
        self.encoder = mb.models.BMPrediction(num_classes=1, input_size=input_size, hidden_size=2, num_layers=1, seq_length=seq_length)
        self.decoder = mb.models.Decoder(enc_size=input_size, seq_length=seq_length)

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

In [25]:
autoencoder = Autoencoder()
criterion = tnn.CrossEntropyLoss()
optimizer = torch.optim.Adam(autoencoder.parameters(), lr=0.001)

num_epochs = 1001

In [98]:
def train(autoencoder, num_epochs, x_input, x_target, criterion, optimizer):
    for epoch in range(num_epochs):
        outputs = autoencoder(x_input) #forward pass
        # print(outputs.shape)
        optimizer.zero_grad() #caluclate the gradient, manually setting to 0

        # obtain the loss function
        loss = criterion(outputs, x_target)

        loss.backward() #calculates the loss of the loss function

        optimizer.step() #improve from loss, i.e backprop
        if epoch % 200 == 0:
            print("Epoch: %d, loss: %1.5f" % (epoch, loss.item())) 

In [85]:
def cross_validation(x_input, x_target, k=5):
    val_losses = []
    for i in range(k):
        indices = np.array([j % k == i for j in range(x_input.shape[0])])
        x_input_train = x_input[~indices]
        x_input_test = x_input[indices]
        x_target_train = x_target[~indices]
        x_target_test = x_target[indices]
        
        autoencoder = Autoencoder(input_size=20, seq_length=50) #numbers should also be parameters
        criterion = tnn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(autoencoder.parameters(), lr=0.001)
        
        train(autoencoder, num_epochs, x_input_train, x_target_train, criterion, optimizer)
        
        pred = autoencoder(x_input_test)
        print(criterion(pred, x_target_test).item())
        val_losses += [criterion(pred, x_target_test).item()]
    return val_losses

In [61]:
cross_validation()

Epoch: 0, loss: 3.04454
Epoch: 200, loss: 0.60014
Epoch: 400, loss: 0.18317
Epoch: 600, loss: 0.09386
Epoch: 800, loss: 0.05768
Epoch: 1000, loss: 0.04241
11.242741584777832
Epoch: 0, loss: 3.04690
Epoch: 200, loss: 0.86632
Epoch: 400, loss: 0.76319
Epoch: 600, loss: 0.74991
Epoch: 800, loss: 0.74856
Epoch: 1000, loss: 0.74821
1.8671122789382935
Epoch: 0, loss: 3.04582
Epoch: 200, loss: 1.00713
Epoch: 400, loss: 1.00712
Epoch: 600, loss: 1.00712
Epoch: 800, loss: 1.00712
Epoch: 1000, loss: 1.00712
1.057201862335205
Epoch: 0, loss: 3.04357
Epoch: 200, loss: 0.75241
Epoch: 400, loss: 0.41967
Epoch: 600, loss: 0.25304
Epoch: 800, loss: 0.17557
Epoch: 1000, loss: 0.12737
5.542008876800537
Epoch: 0, loss: 3.04773
Epoch: 200, loss: 0.79510
Epoch: 400, loss: 0.52657
Epoch: 600, loss: 0.44165
Epoch: 800, loss: 0.37785
Epoch: 1000, loss: 0.34121
3.3183066844940186


[]

# Cross-validation with simulated data

In [62]:
n_seqs = 999
n_aa = 50


import random
import numpy as np
from matplotlib import rcParams
figsize = [5, 1]
random.seed(42)
aa_options = 'ACDEFGHIKLMNPQRSTVWY'
seq_arr = np.array(list(aa_options))
ref_seq = random.choices(sorted(seq_arr), k=n_aa)
ref_seq = ''.join(ref_seq)

import pandas as pd
import logomaker
np.random.seed(42)
ref_w = pd.DataFrame(np.random.random((4, 10)), index=['A', 'C', 'G', 'T'])
# logomaker.Logo(ref_w.T, shade_below=0.5, fade_below=0.5, figsize=figsize)
# plt.show()
# generate a random seq logo

final_seqs = []
final_w = []
curr_seq = ref_seq
curr_w = ref_w
final_seqs.append(curr_seq)
final_w.append(ref_w)

n_mutate = 2
for i in range(n_seqs):
    new_seq = curr_seq
    for mut_i in range(n_mutate):
        # mutate the protein sequence
        pi = np.random.randint(0, len(curr_seq))
        new_aa = aa_options[np.random.randint(0, len(aa_options))]
        # print(pi, new_aa)    

        new_seq = curr_seq[:pi] + new_aa + curr_seq[pi + 1:]
        # print(new_seq)

    final_seqs.append(new_seq)
    curr_seq = new_seq
    
    # mutate PWMs at one position
    pw = np.random.randint(0, ref_w.shape[1])
    new_w = pd.concat([curr_w.iloc[:,:pw],
                       pd.DataFrame(np.random.random((4, 1)), index=['A', 'C', 'G', 'T']),
                       curr_w.iloc[:,pw+1:]],
                     axis=1)
    new_w.columns = range(new_w.shape[1])
    # print(new_w)
    # logomaker.Logo(new_w.T, shade_below=0.5, fade_below=0.5, figsize=figsize)
    # plt.show()
    final_w.append(new_w)
    curr_w = new_w

In [66]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

def onehot_aa(seq, label_encoder=LabelEncoder(), onehot_encoder=OneHotEncoder(sparse=False)):
    seq_arr = np.array(list(seq + "ACDEFGHIKLMNPQRSTVWY"))
    seq_int = label_encoder.fit_transform(seq_arr)
    pre_onehot = onehot_encoder.fit_transform(seq_int.reshape(-1, 1))
    return (
        pre_onehot.T[:, :-20]
    ).astype(np.float32)

In [75]:
n_seq = len(final_seqs)
single_seq = "".join(final_seqs)
onehot = np.array(onehot_aa(single_seq))
onehot = np.array(np.split(onehot, n_seq, axis=1))

In [76]:
onehot.shape

(1000, 20, 50)

In [79]:
x_input = torch.Tensor(onehot.transpose((0, 2, 1)).astype(np.float32))

x_target = torch.Tensor(onehot.astype(np.float32))

In [100]:
val_losses = cross_validation(x_input, x_target)

Epoch: 0, loss: 2.99554
Epoch: 200, loss: 0.97880
Epoch: 400, loss: 0.45276
Epoch: 600, loss: 0.31102
Epoch: 800, loss: 0.26616
Epoch: 1000, loss: 0.22280
0.3763960599899292
Epoch: 0, loss: 2.99603
Epoch: 200, loss: 0.54630
Epoch: 400, loss: 0.34404
Epoch: 600, loss: 0.28986
Epoch: 800, loss: 0.24101
Epoch: 1000, loss: 0.21185
0.36328426003456116
Epoch: 0, loss: 2.99520
Epoch: 200, loss: 1.34428
Epoch: 400, loss: 0.89640
Epoch: 600, loss: 0.69996
Epoch: 800, loss: 0.56334
Epoch: 1000, loss: 0.50843
0.5557398200035095
Epoch: 0, loss: 2.99512
Epoch: 200, loss: 0.93819
Epoch: 400, loss: 0.67549
Epoch: 600, loss: 0.57986
Epoch: 800, loss: 0.54387
Epoch: 1000, loss: 0.52006
0.8059558868408203
Epoch: 0, loss: 2.99606
Epoch: 200, loss: 1.22230
Epoch: 400, loss: 0.77524
Epoch: 600, loss: 0.64947
Epoch: 800, loss: 0.53295
Epoch: 1000, loss: 0.44251
0.6242820024490356


[]

In [101]:
val_losses

[0.3763960599899292,
 0.36328426003456116,
 0.5557398200035095,
 0.8059558868408203,
 0.6242820024490356]