In [29]:
from Bio import SeqIO
import gzip
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
from sklearn.metrics import (
    average_precision_score, precision_recall_curve,
    roc_auc_score, roc_curve,
    matthews_corrcoef
)
from sklearn.model_selection import train_test_split
from time import time
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, TensorDataset

from utils.pytorchtools import EarlyStopping

In [2]:
# Adapted from:
# https://github.com/FunctionLab/selene/blob/master/models/danQ.py
class DanQ(nn.Module):
    """DanQ architecture (Quang & Xie, 2016)."""

    def __init__(self, sequence_length, n_features):
        """
        Parameters
        ----------
        sequence_length : int
            Input sequence length
        n_features : int
            Total number of features to predict
        """
        super(DanQ, self).__init__()

        self.nnet = nn.Sequential(
            nn.Conv1d(4, 320, kernel_size=26),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(kernel_size=13, stride=13),
            nn.Dropout(0.2)
        )

        self.bdlstm = nn.Sequential(
            nn.LSTM(320, 320, num_layers=1, batch_first=True, bidirectional=True)
        )

        self._n_channels = math.floor((sequence_length - 25) / 13)

        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(self._n_channels * 640, 925),
            nn.ReLU(inplace=True),
            nn.Linear(925, n_features),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Forward propagation of a batch."""
        out = self.nnet(x)
        reshape_out = out.transpose(0, 1).transpose(0, 2)
        out, _ = self.bdlstm(reshape_out)
        out = out.transpose(0, 1)
        reshape_out = out.contiguous().view(
            out.size(0), 640 * self._n_channels)
        predict = self.classifier(reshape_out)

        return(predict)

def get_criterion():
    """
    Specify the appropriate loss function (criterion) for this model.

    Returns
    -------
    torch.nn._Loss
    """
    return(nn.BCELoss())

def get_optimizer(params, lr=0.001):
    return(torch.optim.Adam(params, lr=lr))

In [3]:
def one_hot_encode(seq):
    """One hot encodes a sequence."""

    seq = seq.replace("A", "0")
    seq = seq.replace("C", "1")
    seq = seq.replace("G", "2")
    seq = seq.replace("T", "3")

    encoded_seq = np.zeros((4, len(seq)), dtype="float16")

    for i in range(len(seq)):
        if seq[i].isdigit():
            encoded_seq[int(seq[i]), i] = 1
        else:
            # i.e. Ns
            encoded_seq[:, i] = 0.25

    return(encoded_seq)

def one_hot_decode(encoded_seq):
    """Reverts a sequence's one hot encoding."""

    seq = []
    code = list("ACGT")
 
    for i in encoded_seq.transpose(1, 0):
        try:
            seq.append(code[int(np.where(i == 1)[0])])
        except:
            # i.e. N?
            seq.append("N")

    return("".join(seq))

def reverse_complement(encoded_seqs):
    """Reverse complements one hot encoding for a list of sequences."""
    return(encoded_seqs[..., ::-1, ::-1])

In [4]:
# Source of the example sequences:
# https://github.com/kundajelab/dragonn/tree/master/examples
# Parse FASTA sequences
pos_seqs = {}
neg_seqs = {}
#with gzip.open("../Data/pos_seqs.fa.gz", "rt") as handle:
with gzip.open("./examples/example_pos_sequences.fa.gz", "rt") as handle:
    for seq_record in SeqIO.parse(handle, "fasta"):
        pos_seqs[seq_record.id] = str(seq_record.seq).upper()
pos_seqs = pd.Series(pos_seqs)
#with gzip.open("../Data/neg_seqs.fa.gz", "rt") as handle:
with gzip.open("./examples/example_neg_sequences.fa.gz", "rt") as handle:
    for seq_record in SeqIO.parse(handle, "fasta"):
        neg_seqs[seq_record.id] = str(seq_record.seq).upper()
neg_seqs = pd.Series(neg_seqs)
pos_seqs

0       GGAGATTTGTGTATGTACTCTTCTTTCACGCATATGTGTGAGCAAA...
1       AGCCAGGCCCCCAAAATAACTTGCCAGATATGTCACCTGCTTCCCA...
2       ATCAGTGGAAATTTAAGAAAATACACATGGCCAGGCCCCAGCCCAA...
3       TTGTCCTGAATCGCCAGATTCAGGAGGCATAAAAACCAAAATAGAG...
4       AGAGAAGCAGCAGGACAGAGAGTGAGAGAAGGGGAGGGAGCAAAAG...
                              ...                        
2495    ATTAACTCAAGAATATACTGCTTCCTTGTTCTCTCCTTCTTGCCCC...
2496    AAAGTAGGCGGTTAGCCAGGCATGGTGTTGCACGCCTGTAGTCCCA...
2497    GGCTCTTATTTGGCCGGAGTGGAGTGACCAGGTCAGCGCCGCAGCT...
2498    CACCGGGGGTCCTCGAAGCGCACGAAGGCGAAGGGCACGAGGCCGT...
2499    TGGGGTGGGGGGCGTCGTCCCGTGGTGGCGCCGGCCGGGGTGGGGG...
Length: 2500, dtype: object

In [5]:
# One-hot encode sequences
pos_seqs_1_hot = pos_seqs.map(lambda x: one_hot_encode(x))
pos_seqs_1_hot = np.stack(pos_seqs_1_hot, axis=0)
neg_seqs_1_hot = neg_seqs.map(lambda x: one_hot_encode(x))
neg_seqs_1_hot = np.stack(neg_seqs_1_hot, axis=0)
pos_seqs_1_hot

array([[[0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [1., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 1., 0., 1.]],

       [[1., 0., 0., ..., 0., 1., 0.],
        [0., 0., 1., ..., 0., 0., 1.],
        [0., 1., 0., ..., 1., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 1., 0., 1.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 1., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 1., 0., 1.],
        [1., 1., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 1., 0., ..., 0., 1., 1.],
        [1., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 1., 1., 0.],
        [0., 1., 1., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]]], dtype=float16)

In [6]:
# Split sequences into train, validation and test
seed = 123
pos_train_seqs, pos_test_seqs = train_test_split(
    pos_seqs_1_hot, test_size=0.2, random_state=seed
)
pos_validation_seqs, pos_test_seqs = train_test_split(
    pos_test_seqs, test_size=0.5, random_state=seed
)
neg_train_seqs, neg_test_seqs = train_test_split(
    neg_seqs_1_hot, test_size=0.2, random_state=seed
)
neg_validation_seqs, neg_test_seqs = train_test_split(
    neg_test_seqs, test_size=0.5, random_state=seed
)
pos_train_seqs

array([[[0., 0., 1., ..., 1., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 1., 0., ..., 0., 1., 1.]],

       [[1., 1., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 1., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.],
        [0., 1., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 1.]],

       ...,

       [[1., 1., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 1.],
        [0., 0., 0., ..., 1., 0., 0.]],

       [[0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 1., 0., 1.],
        [1., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.]],

       [[0., 0., 1., ..., 1., 1., 1.],
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]], dtype=float16)

In [7]:
# Reverse complement train sequences
pos_train_seqs_rc = np.append(pos_train_seqs, reverse_complement(pos_train_seqs), axis=0)
neg_train_seqs_rc = np.append(neg_train_seqs, reverse_complement(pos_train_seqs), axis=0)
pos_train_seqs_rc

array([[[0., 0., 1., ..., 1., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 1., 0., ..., 0., 1., 1.]],

       [[1., 1., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 0., 0., ..., 1., 0., 0.]],

       [[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.],
        [0., 1., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 1.]],

       ...,

       [[0., 0., 1., ..., 0., 0., 0.],
        [1., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 1., 1., 1.]],

       [[0., 1., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 1.],
        [1., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 1.],
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 1., 1., ..., 1., 0., 0.]]], dtype=float16)

In [8]:
# Create TensorDatasets
X = np.concatenate((pos_train_seqs_rc, neg_train_seqs_rc))
y = np.concatenate(
    (np.ones((len(pos_train_seqs_rc), 1)), np.zeros((len(neg_train_seqs_rc), 1)))
)
train_dataset = TensorDataset(torch.Tensor(X), torch.Tensor(y))
X = np.concatenate((pos_validation_seqs, neg_validation_seqs))
y = np.concatenate(
    (np.ones((len(pos_validation_seqs), 1)), np.zeros((len(neg_validation_seqs), 1)))
)
validation_dataset = TensorDataset(torch.Tensor(X), torch.Tensor(y))
X = np.concatenate((pos_test_seqs, neg_test_seqs))
y = np.concatenate(
    (np.ones((len(pos_test_seqs), 1)), np.zeros((len(neg_test_seqs), 1)))
)
test_dataset = TensorDataset(torch.Tensor(X), torch.Tensor(y))
train_dataset

<torch.utils.data.dataset.TensorDataset at 0x7f93585e6370>

In [9]:
# Create DataLoaders
parameters = dict(batch_size=64, shuffle=True, num_workers=8)
train_dataloader = DataLoader(train_dataset, **parameters)
validation_dataloader = DataLoader(validation_dataset, **parameters)
test_dataloader = DataLoader(test_dataset, **parameters)
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f92ce8e9d00>

In [10]:
# Train and validate
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
max_epochs = 100
output_dir = "./examples/model/"
os.makedirs(output_dir, exist_ok=True)

# Initialize model, criterion, optimizer
model = DanQ(len(pos_seqs[0]), 1).to(device)
criterion = get_criterion()
optimizer = get_optimizer(model.parameters())
# scheduler = ReduceLROnPlateau(optimizer, "min", patience=10, verbose=True, factor=0.5)
state_dict = os.path.join(output_dir, "model.pth.tar")
early_stopping = EarlyStopping(20, True, path=state_dict)
train_losses = []
validation_losses = []

for epoch in range(1, max_epochs + 1):

    # Train
    t_time = time()
    model.train() # set the model in train mode
    train_losses.append([])
    for seqs, labels in train_dataloader:
        x = seqs.to(device) # shape = (batch_size, 4, 200)
        labels = labels.to(device)
        # Zero existing gradients so they don't add up
        optimizer.zero_grad()
        # Forward pass
        outputs = model(x)
        loss = criterion(outputs, labels) 
        # Backward and optimize
        loss.backward()
        optimizer.step()
        # Keep the loss
        train_losses[-1].append(loss.item())
    t_loss = np.average(train_losses[-1])
    t_time = time() - t_time

    # Validate
    v_time = time()
    model.eval() # set the model in evaluation mode
    validation_losses.append([])
    for seqs, labels in validation_dataloader:
        x = seqs.to(device) # shape = (batch_size, 4, 200)
        labels = labels.to(device)
        with torch.no_grad():
            # Forward pass
            outputs = model(x)
            loss = criterion(outputs, labels) 
            # Keep the loss
            validation_losses[-1].append(loss.item())
    v_loss = np.average(validation_losses[-1])
    v_time = time() - v_time

    print(f'[{epoch:>{3}}/{max_epochs:>{3}}] '
         +f'train_loss: {t_loss:.5f} ({t_time:.3f} sec) '
         +f'valid_loss: {v_loss:.5f} ({v_time:.3f} sec)')

    # # Adjust learning rate
    # scheduler.step(math.ceil(v_loss * 1000.0) / 1000.0)

    # EarlyStopping needs to check if the validation loss has decresed, 
    # and if it has, it will save the current model.
    early_stopping(v_loss, model)
    if early_stopping.early_stop:
        # Empty cache
        with torch.cuda.device(device):
            torch.cuda.empty_cache()
        print("Stop!!!")
        break

[  1/100] train_loss: 1.12768 (25.466 sec) valid_loss: 0.69347 (0.524 sec)
Validation loss decreased (inf --> 0.693471), saving model ...
[  2/100] train_loss: 0.69349 (25.487 sec) valid_loss: 0.69286 (0.530 sec)
Validation loss decreased (0.693471 --> 0.692862), saving model ...
[  3/100] train_loss: 0.69316 (25.475 sec) valid_loss: 0.69287 (0.522 sec)
EarlyStopping counter: 1 out of 20
[  4/100] train_loss: 0.69340 (25.475 sec) valid_loss: 0.69305 (0.521 sec)
EarlyStopping counter: 2 out of 20
[  5/100] train_loss: 0.69319 (25.467 sec) valid_loss: 0.69313 (0.522 sec)
EarlyStopping counter: 3 out of 20
[  6/100] train_loss: 0.69311 (25.459 sec) valid_loss: 0.69278 (0.520 sec)
Validation loss decreased (0.692862 --> 0.692784), saving model ...
[  7/100] train_loss: 0.69274 (25.521 sec) valid_loss: 0.69101 (0.523 sec)
Validation loss decreased (0.692784 --> 0.691014), saving model ...
[  8/100] train_loss: 0.69009 (25.553 sec) valid_loss: 0.67112 (0.529 sec)
Validation loss decreased (0

KeyboardInterrupt: 

In [21]:
# Test
y = None
predictions = None
state_dict = os.path.join(output_dir, "model.pth.tar")
model.load_state_dict(torch.load(state_dict))

model.eval() # set the model in evaluation mode
for seqs, labels in test_dataloader:
    x = seqs.to(device) # shape = (batch_size, 4, 200)
    labels = labels.to(device)
    with torch.no_grad():
        # Forward pass
        outputs = model(x)
        # Save predictions
        if predictions is None and y is None:
            predictions = outputs.data.cpu().numpy()
            y = labels.data.cpu().numpy()
        else:
            predictions = np.append(
                predictions, outputs.data.cpu().numpy(), axis=0
            )
            y = np.append(y, labels.data.cpu().numpy(), axis=0)
predictions

array([[9.41261649e-03],
       [8.27941418e-01],
       [9.99908805e-01],
       [4.03591692e-02],
       [3.12450588e-01],
       [4.07138795e-01],
       [1.29629960e-02],
       [4.47772117e-03],
       [9.99733508e-01],
       [2.50003394e-02],
       [3.01574767e-01],
       [7.30029225e-01],
       [9.48388502e-03],
       [4.35647994e-01],
       [2.25777894e-01],
       [7.05034360e-02],
       [9.93765593e-01],
       [5.13630390e-01],
       [3.70470583e-02],
       [7.81637728e-01],
       [9.96638656e-01],
       [1.93989560e-01],
       [5.20085752e-01],
       [9.35907662e-01],
       [4.16361727e-02],
       [1.26960963e-01],
       [9.95359600e-01],
       [9.95212674e-01],
       [6.57874525e-01],
       [2.19868682e-02],
       [5.83609641e-01],
       [7.72888422e-01],
       [1.74071923e-01],
       [5.55980384e-01],
       [9.86062229e-01],
       [3.70097198e-02],
       [6.21218383e-01],
       [4.91056085e-01],
       [9.99880314e-01],
       [5.50869107e-01],


In [31]:
# Metrics
metrics = dict(AUCPR=None, AUCROC=None, MCC=None)
p = predictions.flatten()
l = y.flatten()

# Plot
def __visualize_metric(data, columns, metric, score):

    # Metric to DataFrame
    df = pd.DataFrame(data, columns=columns)
    # Seaborn aesthetics
    sns.set_context("paper", font_scale=1.5, rc={"lines.linewidth": 1.5})
    sns.set_palette(sns.color_palette(["#1965B0"]))
    # Plot metric
    kwargs = dict(estimator=None, ci=None)
    g = sns.lineplot(x=columns[0], y=columns[1], data=df, **kwargs)
    # Add metric score
    kwargs = dict(horizontalalignment="center", verticalalignment="center")
    plt.text(.5, 0, "%s = %.5f" % (metric, score), **kwargs)
    # Remove spines
    sns.despine()
    # Save & close
    fig = g.get_figure()
    fig.savefig(os.path.join(output_dir, "%s.png" % metric))
    plt.close(fig)

# Metrics to DataFrame
for metric in metrics:
    if metric == "AUCPR":
        score = average_precision_score(l, p)
        prec, recall, _ = precision_recall_curve(l, p)
        # i.e. precision = 0, recall = 1
        prec = np.insert(prec, 0, 0., axis=0)
        recall = np.insert(recall, 0, 1., axis=0)
        data = list(zip(recall, prec))
        __visualize_metric(data, ["Recall", "Precision"], metric, score)
    elif metric == "AUCROC":
        score = roc_auc_score(l, p)
        fpr, tpr, _ = roc_curve(l, p)
        data = list(zip(fpr, tpr))
        __visualize_metric(data, ["FPR", "TPR"], metric, score)
    elif metric == "MCC":
        score = matthews_corrcoef(l, np.rint(p))
    metrics[metric] = score

print(f'Final performance metrics: '
     +f'AUCROC: {metrics["AUCROC"]:.5f}, '
     +f'AUCPR: {metrics["AUCPR"]:.5f}, '
     +f'MCC: {metrics["MCC"]:.5f}')

Final performance metrics: AUCROC: 0.89542, AUCPR: 0.88929, MCC: 0.61649


In [46]:
fwd = pos_train[:1]
fwd

array([[[0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0.,
         1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
         0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0.,
         0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
         1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
         1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
         0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 1., 1., 0., 1., 0., 1., 0., 1., 1., 1., 0., 1.,
         

In [None]:

def __visualize_metric(self, data, y, metric):

    # Metric to DataFrame
    df = pd.DataFrame(data, columns=y)

    # Seaborn aesthetics
    sns.set_context("paper", font_scale=1.5, rc={"lines.linewidth": 1.5})
    sns.set_palette(sns.color_palette(["#1965B0"]))

    # Plot metric
    kwargs = dict(estimator=None, ci=None)
    g = sns.lineplot(x=y[0], y=y[1], data=df, **kwargs)

    # Add metric score
    kwargs = dict(horizontalalignment="center", verticalalignment="center")
    plt.text(.5, 0, "%s = %.5f" % (metric, self.metrics[metric]), **kwargs)

    # Remove spines
    sns.despine()

    # Save & close
    fig = g.get_figure()
    fig.savefig(os.path.join(self.output_dir, "%s.png" % metric))
    plt.close(fig)

In [48]:
one_hot_decode(fwd[0])

'GCATCCACACACCCTCAGATGCTTCCTTTGACGCCCTCTGCTGTGCCCCTAGACACCCCTATCCCGCCACTGGCTGAAGCTGGACTTTGGAGCCATCTGCCTCCCTTGCCTGCGTCCACACCCCGCGCCAGTCCTCAGCCTCCAAGCCCATCTCAGTCGGACCCTTTCTCATTCCTGCCACTCGCTGCCTGTTCCAGGCC'

In [47]:
rev = reverse_complement_one_hot_encoding(fwd)
rev

array([[[0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 1., 0.,
         0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 1.,
         0., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1.,
         0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0.,
         0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0.,
         0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 1., 0.,
         0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0.,
         1., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0.,
         

In [49]:
one_hot_decode(rev[0])

'GGCCTGGAACAGGCAGCGAGTGGCAGGAATGAGAAAGGGTCCGACTGAGATGGGCTTGGAGGCTGAGGACTGGCGCGGGGTGTGGACGCAGGCAAGGGAGGCAGATGGCTCCAAAGTCCAGCTTCAGCCAGTGGCGGGATAGGGGTGTCTAGGGGCACAGCAGAGGGCGTCAAAGGAAGCATCTGAGGGTGTGTGGATGC'