In [None]:
# Mount Google Drive at /gdrive; the path constants defined in the next cell all live under this mount point.
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
# Project locations on the mounted Drive. Every directory string keeps its trailing
# slash so that file names can be appended by plain concatenation.
root_path = '/gdrive/My Drive/Colab Data/CRISPR Off Target/'
data_dir = f"{root_path}2018_DeepCRISPR/"
data_path = f"{data_dir}all_off_target.csv"
resource_dir = f"{data_dir}Resources/"

In [None]:
import os
import random
import torch
import numpy as np
import copy

seed = 12345

# Ask cuDNN for deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Seed every random number generator the notebook touches with the same value.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here presumably only affects child processes - confirm if hash ordering matters.
os.environ['PYTHONHASHSEED'] = str(seed)

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [None]:
cd $resource_dir

/gdrive/.shortcut-targets-by-id/1-CPBoDSc88CelqHVwU-GHHjHASJq0kkO/CRISPR Off Target/2018_DeepCRISPR/Resources


In [None]:
# List the files in the current working directory.
!ls

 1LSTM_1_1_OHE4.weights        BiLSTM_1_1_OHE4.weights
 1LSTM_1_1_TME256.weights      BiLSTM_1_1_TME256.weights
 1LSTM_2_1_OHE4.weights        BiLSTM_2_1_OHE4.weights
 1LSTM_2_1_OHE4.weights.zip    BiLSTM_2_1_TME256.weights
 1LSTM_2_1_TME256.weights      BiLSTM_255_1_OHE4.weights
 1LSTM_255_1_OHE4.weights      BiLSTM_255_1_TME256.weights
 1LSTM_255_1_TME256.weights   'Copy of Results and Analysis CRISPR.gdoc'
 2LSTM_1_1_OHE4.weights        ega_stat
 2LSTM_1_1_TME256.weights      Figures
 2LSTM_2_1_OHE4.weights        ga_stat
 2LSTM_2_1_TME256.weights      rnn_torchviz
 2LSTM_255_1_OHE4.weights      rnn_torchviz.png
 2LSTM_255_1_TME256.weights    test_bpe.csv
 basic_test.csv		       test.csv
 basic_train.csv	       train_bpe.csv
 best_lstm_model_ohe_23_8      train.csv
 best_model_attribution.pkl    w2v_d_model.bin
 best_model_attributions.pkl   w2v_model.bin


# Models

In [None]:
import torch.nn as nn

class LSTM_Model_Generic(nn.Module):
    """(Bi)LSTM encoder followed by a halving MLP head and a 2-logit output layer.

    ``config`` is a dict with these keys (all read in ``__init__``):
        vocab_size: if > 0, inputs are passed through ``nn.Embedding`` (padding_idx=0) first.
        emb_size: feature size of each time step fed to the LSTM.
        hidden_size: LSTM hidden size per direction.
        lstm_layers: number of stacked LSTM layers.
        bi_lstm: whether the LSTM is bidirectional.
        reshape: when no embedding is used, view a (batch, seq) input as (batch, seq, 1).
        number_hidder_layers: (sic) number of Linear->ReLU->Dropout blocks; each halves the width.
        dropout_prob: dropout probability inside those blocks.

    Sub-module names (``embedding``, ``lstm``, ``hidden_layers``, ``output``) define the
    state_dict keys and must stay stable for saved weights to load.
    """

    def __init__(self, config):
        super().__init__()

        self.vocab_size = config["vocab_size"]
        self.emb_size = config["emb_size"]
        self.hidden_size = config["hidden_size"]
        self.lstm_layers = config["lstm_layers"]
        self.bi_lstm = config["bi_lstm"]
        self.reshape = config["reshape"]

        # The misspelled key is part of the existing config contract; do not rename it here.
        self.number_hidden_layers = config["number_hidder_layers"]
        self.dropout_prob = config["dropout_prob"]

        # Width of the LSTM output per time step (both directions are concatenated).
        self.hidden_shape = self.hidden_size * 2 if self.bi_lstm else self.hidden_size

        self.embedding = None
        if self.vocab_size > 0:
            self.embedding = nn.Embedding(self.vocab_size, self.emb_size, padding_idx=0)

        self.lstm = nn.LSTM(self.emb_size, self.hidden_size, num_layers=self.lstm_layers,
                            batch_first=True, bidirectional=self.bi_lstm)

        # Build the MLP head; each block halves the feature width.
        start_size = self.hidden_shape
        blocks = []
        for _ in range(self.number_hidden_layers):
            blocks.append(nn.Sequential(
                nn.Linear(start_size, start_size // 2),
                nn.ReLU(),
                nn.Dropout(self.dropout_prob)))
            start_size = start_size // 2

        self.hidden_layers = nn.ModuleList(blocks)
        self.output = nn.Linear(start_size, 2)

    def forward(self, x):
        """Return a (batch, 2) tensor of logits for a batch-first input ``x``."""
        # Follow the device the parameters live on instead of a notebook-level global.
        x = x.to(self.output.weight.device)

        if self.embedding is not None:
            # Embedding lookups need integer indices; .long() converts without leaving the device.
            x = self.embedding(x.long())
        elif self.reshape:
            x = x.view(x.shape[0], x.shape[1], 1)

        # With no initial state given, nn.LSTM starts from all-zero hidden and cell states,
        # which is what the explicit zero tensors used to provide.
        x, _ = self.lstm(x)

        # Keep only the last time step's output.
        # NOTE(review): for a bidirectional LSTM the backward half of this vector has only
        # processed the final token - confirm this is intended before changing it, since
        # saved weights were trained with this read-out.
        x = x[:, -1, :]

        for layer in self.hidden_layers:
            x = layer(x)
        return self.output(x)

In [None]:
import torch.nn as nn

class RNN_Model_Generic(nn.Module):
    """Recurrent encoder (LSTM, GRU or plain RNN) with a halving MLP head and 2-logit output.

    Args:
        config: dict with these keys (all read in ``__init__``):
            vocab_size: if > 0, inputs are passed through ``nn.Embedding`` (padding_idx=0) first.
            emb_size: feature size of each time step fed to the recurrent layer.
            hidden_size: hidden size per direction.
            lstm_layers: number of stacked recurrent layers.
            bi_lstm: whether the recurrent layer is bidirectional.
            reshape: when no embedding is used, view a (batch, seq) input as (batch, seq, 1).
            number_hidder_layers: (sic) number of Linear->ReLU->Dropout blocks; each halves the width.
            dropout_prob: dropout probability inside those blocks.
        model_type: "LSTM" or "GRU"; any other value selects ``nn.RNN``.

    The recurrent layer is always stored as ``self.lstm`` regardless of its type, and the
    sub-module names define the state_dict keys, so they must stay stable for saved weights.
    """

    def __init__(self, config, model_type):
        super().__init__()

        self.model_type = model_type
        self.vocab_size = config["vocab_size"]
        self.emb_size = config["emb_size"]
        self.hidden_size = config["hidden_size"]
        self.lstm_layers = config["lstm_layers"]
        self.bi_lstm = config["bi_lstm"]
        self.reshape = config["reshape"]

        # The misspelled key is part of the existing config contract; do not rename it here.
        self.number_hidden_layers = config["number_hidder_layers"]
        self.dropout_prob = config["dropout_prob"]

        # Width of the recurrent output per time step (both directions are concatenated).
        self.hidden_shape = self.hidden_size * 2 if self.bi_lstm else self.hidden_size

        self.embedding = None
        if self.vocab_size > 0:
            self.embedding = nn.Embedding(self.vocab_size, self.emb_size, padding_idx=0)

        # All three layer classes share the same constructor arguments here.
        if model_type == "LSTM":
            recurrent_cls = nn.LSTM
        elif model_type == "GRU":
            recurrent_cls = nn.GRU
        else:
            recurrent_cls = nn.RNN
        self.lstm = recurrent_cls(self.emb_size, self.hidden_size, num_layers=self.lstm_layers,
                                  batch_first=True, bidirectional=self.bi_lstm)

        # Build the MLP head; each block halves the feature width.
        start_size = self.hidden_shape
        blocks = []
        for _ in range(self.number_hidden_layers):
            blocks.append(nn.Sequential(
                nn.Linear(start_size, start_size // 2),
                nn.ReLU(),
                nn.Dropout(self.dropout_prob)))
            start_size = start_size // 2

        self.hidden_layers = nn.ModuleList(blocks)
        self.output = nn.Linear(start_size, 2)

    def forward(self, x):
        """Return a (batch, 2) tensor of logits for a batch-first input ``x``."""
        # Follow the device the parameters live on instead of a notebook-level global.
        x = x.to(self.output.weight.device)

        if self.embedding is not None:
            # Embedding lookups need integer indices; .long() converts without leaving the device.
            x = self.embedding(x.long())
        elif self.reshape:
            x = x.view(x.shape[0], x.shape[1], 1)

        # With no initial state given, LSTM/GRU/RNN all start from zero states, which is
        # what the explicit zero tensors used to provide. The second return value
        # (final states) is not needed.
        x, _ = self.lstm(x)

        # Keep only the last time step's output.
        # NOTE(review): for a bidirectional layer the backward half of this vector has only
        # processed the final token - confirm this is intended before changing it, since
        # saved weights were trained with this read-out.
        x = x[:, -1, :]

        for layer in self.hidden_layers:
            x = layer(x)
        return self.output(x)

# Training and Evaluation Loop

In [None]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

class TrainerDataset(Dataset):
    """Map-style dataset that pairs each input sample with its label."""

    def __init__(self, inputs, targets):
        # Labels are converted to a tensor once up front; inputs are kept as given
        # and converted one sample at a time in __getitem__.
        self.targets = torch.from_numpy(targets)
        self.inputs = inputs

    def __len__(self):
        # The number of labels defines the dataset size.
        return len(self.targets)

    def __getitem__(self, idx):
        sample = torch.Tensor(self.inputs[idx])
        label = self.targets[idx]
        return sample, label

In [None]:
def trainer(config, train_x, train_y, num_epochs=100, batch_size=32, debug=False, lr=0.0001,model_type="LSTM", seed=12345):
    """Train an ``RNN_Model_Generic`` on class-balanced mini-batches and return it.

    Each optimisation step concatenates one half-batch of positives (label 1) with one
    half-batch of negatives (label 0). One epoch is one pass over the negatives; a fresh
    random half-batch of positives is drawn for every step.

    Args:
        config: model configuration dict forwarded to ``RNN_Model_Generic``.
        train_x: array of input samples, indexable by the label-derived index arrays.
        train_y: array of 0/1 labels aligned with ``train_x``.
        num_epochs: number of passes over the negative samples.
        batch_size: total batch size; each class contributes ``batch_size // 2`` samples.
        debug: if True, return right after the first progress line (printed every 200 steps).
        lr: Adam learning rate.
        model_type: forwarded to ``RNN_Model_Generic`` ("LSTM", "GRU", anything else -> RNN).
        seed: seed applied to all RNGs right before the model is created (default keeps
            the value that was previously hard-coded).

    Returns:
        The trained model, left in training mode on the global ``device``.

    Raises:
        ValueError: if ``batch_size`` is below 2, or if either class has no samples.
    """
    half_batch = batch_size // 2
    if half_batch < 1:
        raise ValueError(f"batch_size must be at least 2 to build balanced batches, got {batch_size}")

    train_x = np.asarray(train_x)
    train_y = np.asarray(train_y)

    train_pos_idx = np.where(train_y == 1)
    train_neg_idx = np.where(train_y == 0)

    # Without negatives the epoch loop would never run and an untrained model would be
    # returned silently; without positives the first step would die with StopIteration.
    if len(train_pos_idx[0]) == 0 or len(train_neg_idx[0]) == 0:
        raise ValueError(
            "trainer needs at least one positive (1) and one negative (0) sample; "
            f"got {len(train_pos_idx[0])} positive and {len(train_neg_idx[0])} negative")

    train_dataset_pos = TrainerDataset(train_x[train_pos_idx], train_y[train_pos_idx])
    train_dataloader_pos = DataLoader(train_dataset_pos, batch_size=half_batch, shuffle=True)
    train_dataset_neg = TrainerDataset(train_x[train_neg_idx], train_y[train_neg_idx])
    train_dataloader_neg = DataLoader(train_dataset_neg, batch_size=half_batch, shuffle=True)

    # Re-seed everything so weight initialisation and shuffling are repeatable per call.
    os.environ['PYTHONHASHSEED']=str(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    model = RNN_Model_Generic(config, model_type).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(),lr=lr)
    n_total_steps = len(train_dataloader_neg)
    model.train()

    for epoch in range(num_epochs):
        for i, (train_features_neg, train_labels_neg) in enumerate(train_dataloader_neg):
            # A new shuffled iterator per step yields a random positive half-batch, so the
            # positive class is resampled at every step. This is kept as-is because
            # changing it would alter the seeded training trajectory.
            train_features_pos, train_labels_pos = next(iter(train_dataloader_pos))
            train_features = torch.cat((train_features_pos, train_features_neg),0)
            train_labels = torch.cat((train_labels_pos, train_labels_neg),0)

            outputs = model(train_features.to(device))
            loss = criterion(outputs, train_labels.to(device))

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if (i+1) % 200 == 0:
                print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')
                if debug:
                    return model
    return model

In [None]:
def tester(model, test_x, test_y):
    """Run ``model`` over the test set in order and collect its raw outputs.

    Returns:
        (true_labels, results): two parallel lists with one entry per sample - the label
        tensors as yielded by the data loader, and the model's output rows moved to CPU.
    """
    loader = DataLoader(TrainerDataset(test_x, test_y), batch_size=128, shuffle=False)
    true_labels, results = [], []

    model.eval()
    with torch.no_grad():
        for batch_features, batch_labels in loader:
            batch_outputs = model(batch_features.to(device))
            # Iterating a batch tensor yields one row per sample.
            results += list(batch_outputs.detach().to("cpu"))
            true_labels += list(batch_labels)

    return true_labels, results

In [None]:
class Stats:
    """Plain container for binary-classification scores and confusion-matrix counts."""

    def __init__(self):
        # Scores: accuracy, precision, recall, F1, ROC AUC, PR AUC.
        self.acc = self.pre = self.re = self.f1 = 0
        self.roc = self.prc = 0
        # Confusion-matrix cells.
        self.tn = self.fp = self.fn = self.tp = 0

    def print(self):
        """Print every score with four decimals, then the 2x2 confusion matrix."""
        score_lines = (
            ('Accuracy: %.4f', self.acc),
            ('Precision: %.4f', self.pre),
            ('Recall: %.4f', self.re),
            ('F1 Score: %.4f', self.f1),
            ('ROC: %.4f', self.roc),
            ('PR AUC: %.4f', self.prc),
        )
        for template, value in score_lines:
            print(template % value)

        print("Confusion Matrix")
        # First row: actual negatives; second row: actual positives.
        for left, right in ((self.tn, self.fp), (self.fn, self.tp)):
            print(left, "\t", right)

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.metrics import accuracy_score

def eval_matrices(model, test_x, test_y, debug = True):
    """Evaluate ``model`` on a test set and return the metrics as a ``Stats`` object.

    The class-1 softmax probability is used as the score; samples with a score above
    0.5 are predicted positive.

    Args:
        model: trained model, passed to ``tester``.
        test_x: test inputs, passed to ``tester``.
        test_y: test labels, passed to ``tester``; the labels returned by ``tester``
            are what the metrics are computed against.
        debug: if True, also print the metrics, a classification report and the
            confusion matrix.

    Returns:
        Stats with accuracy, precision, recall, F1, ROC AUC, PR AUC and the
        confusion-matrix counts. Precision, recall and F1 are -1 when undefined
        (zero denominator).
    """
    true_y, results = tester(model, test_x, test_y)

    # Class-1 probability per sample. The explicit dim avoids the implicit-dimension
    # softmax deprecation warning; .double() keeps the float64 scores produced before.
    logits = torch.stack(results)
    pred_y = torch.softmax(logits, dim=1)[:, 1].double().numpy()
    pred_y_list = (pred_y > 0.5).astype(int)
    test_y = np.array([y.item() for y in true_y])

    # labels=[0, 1] guarantees a 2x2 matrix even when a class is absent from both
    # the labels and the predictions, so the four-way unpacking cannot fail.
    tn, fp, fn, tp = confusion_matrix(test_y, pred_y_list, labels=[0, 1]).ravel()
    precision, recall, _ = precision_recall_curve(test_y, pred_y)
    auc_score = auc(recall, precision)
    acc = accuracy_score(test_y, pred_y_list)
    roc = roc_auc_score(test_y, pred_y)

    # The counts are numpy integers, so dividing by zero yields nan plus a warning
    # rather than an exception; test the denominators explicitly.
    pr = tp / (tp + fp) if (tp + fp) > 0 else -1
    re = tp / (tp + fn) if (tp + fn) > 0 else -1
    if pr < 0 or re < 0:
        f1 = -1
    elif pr + re == 0:
        # No true positives, but both denominators are defined: F1 is 0.
        f1 = 0.0
    else:
        f1 = 2 * pr * re / (pr + re)

    stats = Stats()
    stats.acc = acc
    stats.pre = pr
    stats.re = re
    stats.f1 = f1
    stats.roc = roc
    stats.prc = auc_score
    stats.tn = tn
    stats.fp = fp
    stats.fn = fn
    stats.tp = tp

    if debug:
        print('Accuracy: %.4f' %acc)
        print('Precision: %.4f' %pr)
        print('Recall: %.4f' %re)
        print('F1 Score: %.4f' %f1)
        print('ROC:',roc)
        print('PR AUC: %.4f' % auc_score)

        print(classification_report(test_y, pred_y_list, digits=4))
        print("Confusion Matrix")
        print(confusion_matrix(test_y, pred_y_list))

    return stats

# Data Load

In [None]:
!pip install pickle5

Collecting pickle5
  Downloading pickle5-0.0.11.tar.gz (132 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/132.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.1/132.1 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pickle5
  Building wheel for pickle5 (setup.py) ... [?25l[?25hdone
  Created wheel for pickle5: filename=pickle5-0.0.11-cp310-cp310-linux_x86_64.whl size=256405 sha256=c690f0a23fd3578e98a1a74813eafb419f5a2b95b03312a1f74568075809983f
  Stored in directory: /root/.cache/pip/wheels/7d/14/ef/4aab19d27fa8e58772be5c71c16add0426acf9e1f64353235c
Successfully built pickle5
Installing collected packages: pickle5
Successfully installed pickle5-0.0.11


In [None]:
import pickle5 as pkl
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

in_file = data_dir + "Encoded Data/all_encoded_data.pkl"

# NOTE(review): unpickling can execute arbitrary code embedded in the file - only load
# archives that were produced by this project or come from a trusted source.
with open(in_file, "rb") as f:
    enc_dict = pkl.load(f)

# Use the 'enc_superposed' encoding as features and 'labels' as targets.
data_x = np.array(enc_dict['enc_superposed'])
data_y = np.array(enc_dict['labels'])

print(data_x.shape)
print(data_y.shape)


# Stratified 80/20 split with a fixed random_state so the split is repeatable.
train_x, test_x, train_y, test_y = train_test_split(data_x, data_y,
                                                    stratify=data_y,
                                                    test_size=0.20,
                                                    random_state=5)
print(train_x.shape, train_y.shape)
print(test_x.shape, test_y.shape)


(153233, 23, 4)
(153233,)
(122586, 23, 4) (122586,)
(30647, 23, 4) (30647,)


# Best RNN Model

In [None]:
# Hyper-parameters for the plain-RNN run. The model keys (including the misspelled
# 'number_hidder_layers') are read by the model constructor; 'batch_size', 'epochs'
# and 'learning_rate' are read by the training cell.
best_config = dict(
    vocab_size=0,
    emb_size=4,
    hidden_size=256,
    lstm_layers=2,
    bi_lstm=True,
    number_hidder_layers=0,
    dropout_prob=0.4,
    reshape=False,
    batch_size=256,
    epochs=60,
    learning_rate=1e-3,
)

In [None]:
# Train the nn.RNN variant with the configuration above, then score it on the held-out split.
model_type = "RNN"
# NOTE(review): `debug` is assigned here but not forwarded to trainer/eval_matrices -
# confirm whether it was meant to be passed along.
debug = False
config = best_config

model = trainer(
    config,
    train_x,
    train_y,
    num_epochs=config["epochs"],
    batch_size=config["batch_size"],
    lr=config["learning_rate"],
    model_type=model_type,
)
# eval_matrices prints its own report by default; stats.print() then repeats the headline numbers.
stats = eval_matrices(model, test_x, test_y)
stats.print()

Epoch [1/60], Step [200/954], Loss: 0.1600
Epoch [1/60], Step [400/954], Loss: 0.2942
Epoch [1/60], Step [600/954], Loss: 0.0585
Epoch [1/60], Step [800/954], Loss: 0.1347
Epoch [2/60], Step [200/954], Loss: 0.0197
Epoch [2/60], Step [400/954], Loss: 0.0395
Epoch [2/60], Step [600/954], Loss: 0.0436
Epoch [2/60], Step [800/954], Loss: 0.0199
Epoch [3/60], Step [200/954], Loss: 0.0214
Epoch [3/60], Step [400/954], Loss: 0.0526
Epoch [3/60], Step [600/954], Loss: 0.1540
Epoch [3/60], Step [800/954], Loss: 0.2036
Epoch [4/60], Step [200/954], Loss: 0.1292
Epoch [4/60], Step [400/954], Loss: 0.0871
Epoch [4/60], Step [600/954], Loss: 0.0296
Epoch [4/60], Step [800/954], Loss: 0.0187
Epoch [5/60], Step [200/954], Loss: 0.0092
Epoch [5/60], Step [400/954], Loss: 0.0050
Epoch [5/60], Step [600/954], Loss: 0.0066
Epoch [5/60], Step [800/954], Loss: 0.0083
Epoch [6/60], Step [200/954], Loss: 0.0276
Epoch [6/60], Step [400/954], Loss: 0.0071
Epoch [6/60], Step [600/954], Loss: 0.0088
Epoch [6/60

  predictions = [torch.nn.functional.softmax(r) for r in results]


Accuracy: 0.9892
Precision: 0.2679
Recall: 0.8855
F1 Score: 0.4113
ROC: 0.9873685334886266
PR AUC: 0.5711
              precision    recall  f1-score   support

           0     0.9995    0.9896    0.9945     30516
           1     0.2679    0.8855    0.4113       131

    accuracy                         0.9892     30647
   macro avg     0.6337    0.9376    0.7029     30647
weighted avg     0.9964    0.9892    0.9920     30647

Confusion Matrix
[[30199   317]
 [   15   116]]
Accuracy: 0.9892
Precision: 0.2679
Recall: 0.8855
F1 Score: 0.4113
ROC: 0.9874
PR AUC: 0.5711
Confusion Matrix
30199 	 317
15 	 116


In [None]:
# Save only the learned parameters; the relative path resolves against the current working directory.
torch.save(obj=model.state_dict(), f="best_rnn_model.pth")

# Best LSTM Model

In [None]:
# Hyper-parameters for the LSTM run. The model keys (including the misspelled
# 'number_hidder_layers') are read by the model constructor; 'batch_size', 'epochs'
# and 'learning_rate' are read by the training cell.
best_config = dict(
    vocab_size=0,
    emb_size=4,
    hidden_size=512,
    lstm_layers=1,
    bi_lstm=True,
    number_hidder_layers=2,
    dropout_prob=0.4,
    reshape=False,
    batch_size=64,
    epochs=50,
    learning_rate=1e-4,
)

In [None]:
# Train the nn.LSTM variant with the configuration above, then score it on the held-out split.
model_type = "LSTM"
# NOTE(review): `debug` is assigned here but not forwarded to trainer/eval_matrices -
# confirm whether it was meant to be passed along.
debug = False
config = best_config

model = trainer(
    config,
    train_x,
    train_y,
    num_epochs=config["epochs"],
    batch_size=config["batch_size"],
    lr=config["learning_rate"],
    model_type=model_type,
)
# eval_matrices prints its own report by default; stats.print() then repeats the headline numbers.
stats = eval_matrices(model, test_x, test_y)
stats.print()

Epoch [1/50], Step [200/3815], Loss: 0.4032
Epoch [1/50], Step [400/3815], Loss: 0.3179
Epoch [1/50], Step [600/3815], Loss: 0.2288
Epoch [1/50], Step [800/3815], Loss: 0.1329
Epoch [1/50], Step [1000/3815], Loss: 0.1317
Epoch [1/50], Step [1200/3815], Loss: 0.1568
Epoch [1/50], Step [1400/3815], Loss: 0.1962
Epoch [1/50], Step [1600/3815], Loss: 0.1526
Epoch [1/50], Step [1800/3815], Loss: 0.1736
Epoch [1/50], Step [2000/3815], Loss: 0.1103
Epoch [1/50], Step [2200/3815], Loss: 0.1300
Epoch [1/50], Step [2400/3815], Loss: 0.1748
Epoch [1/50], Step [2600/3815], Loss: 0.1881
Epoch [1/50], Step [2800/3815], Loss: 0.1425
Epoch [1/50], Step [3000/3815], Loss: 0.1693
Epoch [1/50], Step [3200/3815], Loss: 0.1734
Epoch [1/50], Step [3400/3815], Loss: 0.1572
Epoch [1/50], Step [3600/3815], Loss: 0.2549
Epoch [1/50], Step [3800/3815], Loss: 0.1542
Epoch [2/50], Step [200/3815], Loss: 0.1485
Epoch [2/50], Step [400/3815], Loss: 0.0825
Epoch [2/50], Step [600/3815], Loss: 0.0781
Epoch [2/50], Ste

  predictions = [torch.nn.functional.softmax(r) for r in results]


Accuracy: 0.9974
Precision: 0.7339
Recall: 0.6107
F1 Score: 0.6667
ROC: 0.9897963676169378
PR AUC: 0.7208
              precision    recall  f1-score   support

           0     0.9983    0.9990    0.9987     30516
           1     0.7339    0.6107    0.6667       131

    accuracy                         0.9974     30647
   macro avg     0.8661    0.8049    0.8327     30647
weighted avg     0.9972    0.9974    0.9973     30647

Confusion Matrix
[[30487    29]
 [   51    80]]
Accuracy: 0.9974
Precision: 0.7339
Recall: 0.6107
F1 Score: 0.6667
ROC: 0.9898
PR AUC: 0.7208
Confusion Matrix
30487 	 29
51 	 80


In [None]:
# Save only the learned parameters; the relative path resolves against the current working directory.
torch.save(obj=model.state_dict(), f="best_lstm_model.pth")

# Best GRU Model

In [None]:
# Hyper-parameters for the GRU run. The model keys (including the misspelled
# 'number_hidder_layers') are read by the model constructor; 'batch_size', 'epochs'
# and 'learning_rate' are read by the training cell.
best_config = dict(
    vocab_size=0,
    emb_size=4,
    hidden_size=128,
    lstm_layers=2,
    bi_lstm=True,
    number_hidder_layers=0,
    dropout_prob=0.1,
    reshape=False,
    batch_size=64,
    epochs=30,
    learning_rate=5e-4,
)

In [None]:
# Train the nn.GRU variant with the configuration above, then score it on the held-out split.
model_type = "GRU"
# NOTE(review): `debug` is assigned here but not forwarded to trainer/eval_matrices -
# confirm whether it was meant to be passed along.
debug = False
config = best_config

model = trainer(
    config,
    train_x,
    train_y,
    num_epochs=config["epochs"],
    batch_size=config["batch_size"],
    lr=config["learning_rate"],
    model_type=model_type,
)
# eval_matrices prints its own report by default; stats.print() then repeats the headline numbers.
stats = eval_matrices(model, test_x, test_y)
stats.print()

Epoch [1/30], Step [200/3815], Loss: 0.3177
Epoch [1/30], Step [400/3815], Loss: 0.1547
Epoch [1/30], Step [600/3815], Loss: 0.1019
Epoch [1/30], Step [800/3815], Loss: 0.2982
Epoch [1/30], Step [1000/3815], Loss: 0.1347
Epoch [1/30], Step [1200/3815], Loss: 0.0453
Epoch [1/30], Step [1400/3815], Loss: 0.1297
Epoch [1/30], Step [1600/3815], Loss: 0.0604
Epoch [1/30], Step [1800/3815], Loss: 0.0694
Epoch [1/30], Step [2000/3815], Loss: 0.0249
Epoch [1/30], Step [2200/3815], Loss: 0.0134
Epoch [1/30], Step [2400/3815], Loss: 0.0717
Epoch [1/30], Step [2600/3815], Loss: 0.0448
Epoch [1/30], Step [2800/3815], Loss: 0.0221
Epoch [1/30], Step [3000/3815], Loss: 0.0228
Epoch [1/30], Step [3200/3815], Loss: 0.0460
Epoch [1/30], Step [3400/3815], Loss: 0.0824
Epoch [1/30], Step [3600/3815], Loss: 0.0059
Epoch [1/30], Step [3800/3815], Loss: 0.0364
Epoch [2/30], Step [200/3815], Loss: 0.0387
Epoch [2/30], Step [400/3815], Loss: 0.0726
Epoch [2/30], Step [600/3815], Loss: 0.0085
Epoch [2/30], Ste

  predictions = [torch.nn.functional.softmax(r) for r in results]


Accuracy: 0.9972
Precision: 0.6891
Recall: 0.6260
F1 Score: 0.6560
ROC: 0.9907548186460062
PR AUC: 0.6859
              precision    recall  f1-score   support

           0     0.9984    0.9988    0.9986     30516
           1     0.6891    0.6260    0.6560       131

    accuracy                         0.9972     30647
   macro avg     0.8437    0.8124    0.8273     30647
weighted avg     0.9971    0.9972    0.9971     30647

Confusion Matrix
[[30479    37]
 [   49    82]]
Accuracy: 0.9972
Precision: 0.6891
Recall: 0.6260
F1 Score: 0.6560
ROC: 0.9908
PR AUC: 0.6859
Confusion Matrix
30479 	 37
49 	 82


In [None]:
# Save only the learned parameters; the relative path resolves against the current working directory.
torch.save(obj=model.state_dict(), f="best_gru_model.pth")

# Best Model Interpretation