## Packages

In [None]:
import random
import numpy as np
import os
import torch
import torch.nn as nn
#from pytorch_transformers import BertModel, BertTokenizer, BertConfig, WarmupLinearSchedule 
import re
import pandas as pd 
import json
from torch.utils.data import Dataset
from torch.utils.data import DataLoader, SubsetRandomSampler
import pickle
from sklearn import metrics
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import f1_score, recall_score
from scipy import stats

In [None]:
from tqdm import tqdm_notebook, trange

def seed_everything(seed = 42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    and configures cuDNN for deterministic behaviour.

    Args:
        seed: integer seed applied to every generator (default 42).
    """
    random.seed(seed)
    # NOTE: hash randomization of the *running* interpreter is fixed at
    # start-up; this value is only picked up by processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different convolution algorithms from run to run,
    # so it has to be off for the deterministic flag to be meaningful.
    torch.backends.cudnn.benchmark = False
# For reproducible results
seed_everything()

In [None]:
import matplotlib as mpl
# The bare 'seaborn' style name was deprecated in Matplotlib 3.6 and removed
# in later releases; the same style sheet now ships as 'seaborn-v0_8'.
# Prefer the new name when it exists and fall back for older installations.
mpl.style.use('seaborn-v0_8' if 'seaborn-v0_8' in mpl.style.available else 'seaborn')

In [None]:
# Mount Google Drive at /content/gdrive/ (this import only exists inside a
# Google Colab runtime).
from google.colab import drive
drive.mount('/content/gdrive/')

In [None]:
# IPython magic: change the working directory to the project folder on the
# mounted drive, so the relative 'data/...' and 'model/...' paths used below
# resolve against it.
%cd /content/gdrive/My Drive/seq

## Data Preprocessing

In [None]:
class MyDataset(Dataset):
    """Dataset that pairs a feature container with a target container.

    Items are looked up in both containers with the same index.
    """

    def __init__(self, X, Y):
        """Keep references to features ``X`` and targets ``Y`` (no copy)."""
        self.data, self.target = X, Y

    def __len__(self):
        """Return the number of items, taken from the feature container."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the ``(feature, target)`` pair stored at ``index``."""
        return self.data[index], self.target[index]

In [None]:
# Open the .npz archive holding the dataset (path is relative to the
# current working directory).
npzfile = np.load('data/norm/Astrocytes_norm.npz')

In [None]:
# Pull the two arrays out of the archive. Judging by how they are used
# below, 'arr_0' holds the model inputs and 'arr_1' the regression targets
# -- TODO confirm against the script that wrote the archive.
X, y = npzfile['arr_0'], npzfile['arr_1']

In [None]:
# Shuffle inputs and targets together; the fixed random_state keeps the
# train/valid/test split below reproducible.
subX, subY = shuffle(X, y, random_state=0)

In [None]:
# 60 / 20 / 20 split of the shuffled data into train / validation / test.
# The two cut points are computed once and reused for inputs and targets.
_n_samples = len(subY)
_train_end, _valid_end = int(_n_samples * 0.6), int(_n_samples * 0.8)
trainX, trainY = subX[:_train_end], subY[:_train_end]
validX, validY = subX[_train_end:_valid_end], subY[_train_end:_valid_end]
testX, testY = subX[_valid_end:], subY[_valid_end:]

### Convert to Torch Data

In [None]:
# Wrap every NumPy split as a torch tensor (torch.from_numpy shares memory
# with the source array rather than copying it).
train_X, train_y, valid_X, valid_y, test_X, test_y = map(
    torch.from_numpy,
    (trainX, trainY, validX, validY, testX, testY),
)

In [None]:
# One MyDataset per split, each pairing the inputs with their targets.
train_dataset, valid_dataset, test_dataset = (
    MyDataset(features, targets)
    for features, targets in (
        (train_X, train_y),
        (valid_X, valid_y),
        (test_X, test_y),
    )
)

## Helper Functions

In [None]:
# Losses at or above this value are never stored as "best".
_INITIAL_BEST_LOSS = 10000
# Lowest validation loss seen so far, keyed by ``save_model_time`` so that
# every run directory tracks its own best checkpoint.
_BEST_VALID_LOSS = {}


def bestmodel(model_name,save_model_time,valid_loss):
    """Save ``model_name`` if ``valid_loss`` is the lowest seen for this run.

    The best loss is remembered between calls (per ``save_model_time``), so
    the checkpoint on disk is only overwritten by a strictly better model.
    Both the pickled module and its ``state_dict`` are written under
    ``model/model{save_model_time}/``; the directory is created if missing.

    Args:
        model_name: the ``torch.nn.Module`` to checkpoint.
        save_model_time: run identifier used in the output directory name.
        valid_loss: validation loss of the current epoch.

    Returns:
        Always ``True``.
    """
    out_dir = 'model/model{save_model_time}'.format(save_model_time=save_model_time)
    if valid_loss < _BEST_VALID_LOSS.get(save_model_time, _INITIAL_BEST_LOSS):
        _BEST_VALID_LOSS[save_model_time] = valid_loss
        os.makedirs(out_dir, exist_ok=True)
        torch.save(model_name, out_dir + '/bestmodel.pkl')
        torch.save(model_name.state_dict(), out_dir + '/net_params_bestmodel.pkl')
    return True

## Training and Validating

In [None]:
# Identifier of this training run; checkpoints go to model/model<id>/.
save_model_time = '0'
mkpath = 'model/model%s'% save_model_time
# Create the checkpoint directory up front, otherwise the first torch.save
# fails on a fresh checkout. exist_ok=True makes re-running this cell safe.
os.makedirs(mkpath, exist_ok=True)

In [None]:
class TrainHelper():
    '''
    Helper class that bundles a model with its optimizer, loss and data
    loaders, and runs the train / validate loop.

    Args:
        model: the ``torch.nn.Module`` to train; moved to CUDA when available.
        train_set: dataset iterated (shuffled) for the optimisation steps.
        test_set: dataset used for the per-epoch validation pass.
        opts: dict with the keys ``'epochs'``, ``'lr'`` and ``'batch_size'``.

    Module-level names used while training: the lists ``train_loss`` and
    ``test_loss`` (per-epoch means are appended to them), the run id
    ``save_model_time`` and the function ``bestmodel``.
    '''

    def __init__(self,model,train_set,test_set,opts):
        self.model = model  # neural net

        # device agnostic code snippet
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model.to(self.device)

        self.epochs = opts['epochs']
        self.optimizer = torch.optim.Adam(model.parameters(), opts['lr'])
        self.criterion = torch.nn.MSELoss()
        # Use the datasets handed in by the caller rather than notebook globals.
        self.train_loader = torch.utils.data.DataLoader(dataset=train_set,
                                                        batch_size=opts['batch_size'],
                                                        shuffle=True)
        # No shuffling for validation: the reported loss is a mean of
        # per-batch means, so a fixed batch composition keeps it comparable
        # from epoch to epoch.
        self.valid_loader = torch.utils.data.DataLoader(dataset=test_set,
                                                        batch_size=opts['batch_size'],
                                                        shuffle=False)

    def train(self):
        '''
        Run ``self.epochs`` epochs of training, validating after each one.

        A checkpoint (pickled module and ``state_dict``) is written after the
        first epoch with index 0, and after every 5th epoch with index
        ``(epoch + 1) // 5``.
        '''
        ckpt_dir = 'model/model{save_model_time}'.format(save_model_time=save_model_time)
        os.makedirs(ckpt_dir, exist_ok=True)
        for epoch in range(self.epochs):
            # test() switches the model to eval mode at the end of every
            # epoch, so training mode has to be restored each time.
            self.model.train()
            self.tr_loss = []
            for data, labels in tqdm_notebook(self.train_loader,
                                              total = len(self.train_loader)):

                data, labels = data.to(self.device),labels.to(self.device)
                self.optimizer.zero_grad()
                outputs = self.model(data)
                # add a trailing axis so the labels line up with the outputs
                labels = labels.unsqueeze(1)
                loss = self.criterion(outputs.float(), labels.float())
                loss.backward()
                self.optimizer.step()
                self.tr_loss.append(loss.item())
            if (epoch+1) % 5 == 0 or epoch == 0: # save the model every 5 epochs
                ckpt_idx = (epoch + 1) // 5
                torch.save(self.model, 'model/model{save_model_time}/net_{epoch}.pkl'.format(save_model_time=save_model_time,epoch=ckpt_idx))
                torch.save(self.model.state_dict(), 'model/model{save_model_time}/net_params_{epoch}.pkl'.format(save_model_time=save_model_time,epoch=ckpt_idx))

            self.test(epoch) # run through the validation set

    def test(self,epoch):
        '''
        Evaluate on the validation loader and record this epoch's losses.

        Appends the mean validation and training loss to the module-level
        ``test_loss`` / ``train_loss`` lists, passes the model to
        ``bestmodel`` and prints a one-line summary.

        Args:
            epoch: zero-based index of the epoch that just finished.
        '''
        self.model.eval()    # puts model in eval mode
        self.test_loss = []
        self.test_accuracy = []

        # turn off gradient calculation to speed up calcs and reduce memory
        with torch.no_grad():
            for data, labels in self.valid_loader:
                data, labels = data.to(self.device),labels.to(self.device)
                outputs = self.model(data)
                labels = labels.unsqueeze(1)
                # same float casts as in train(), so label dtype cannot
                # make the loss call fail only during validation
                loss = self.criterion(outputs.float(), labels.float())
                self.test_loss.append(loss.item())

        epoch_valid_loss = np.mean(self.test_loss)
        epoch_train_loss = np.mean(self.tr_loss)
        test_loss.append(epoch_valid_loss)
        train_loss.append(epoch_train_loss)
        bestmodel(self.model,save_model_time,epoch_valid_loss) # find best model
        print('epoch: {}, train loss: {}, test loss: {}'.format(
        epoch+1, epoch_train_loss, epoch_valid_loss))

## Testing

In [None]:
# Re-shuffle the training tensors and keep their first 2000 rows as a small
# sub-dataset; the held-out test split gets its own loader (batches of 100).
train_X, train_y = shuffle(train_X, train_y, random_state=0)
train_X_sub, train_y_sub = train_X[:2000], train_y[:2000]
sub_dataset = MyDataset(train_X_sub, train_y_sub)
test_loader = DataLoader(test_dataset, batch_size=100, shuffle=True)

In [None]:
def get_list_con(model, loader=None):
    """Run ``model`` over a loader and collect labels and predictions.

    The model is moved to the same device the batches are sent to and put in
    eval mode for the pass; its previous train/eval mode is restored
    afterwards.

    Args:
        model: the ``torch.nn.Module`` to evaluate.
        loader: iterable of ``(data, labels)`` batches. Defaults to the
            module-level ``test_loader``.

    Returns:
        ``(true, pred)``: two lists in matching order, the labels and the
        first output column of the model for every sample.
    """
    if loader is None:
        loader = test_loader
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    was_training = model.training
    model.to(device)
    model.eval()
    pred, true = [], []
    try:
        # turn off gradient calculation to speed up calcs and reduce memory
        with torch.no_grad():
            for data, labels in loader:
                outputs = model(data.to(device))
                # column 0 of the output is the predicted value
                pred.extend(outputs[:, 0].tolist())
                true.extend(labels.tolist())
    finally:
        model.train(was_training)
    return true, pred

### AUC

In [None]:
def getAUC(model):
    """Return the ROC AUC of ``model``'s predictions.

    The score comes from ``sklearn.metrics.roc_auc_score`` with
    ``average='weighted'``.

    NOTE(review): ``get_list_cat`` is not defined in the visible part of this
    notebook (only ``get_list_con`` is). Confirm it exists before calling
    this function, otherwise the first line raises ``NameError``.
    """
    labels, predicts = get_list_cat(model)
    score = metrics.roc_auc_score(labels, predicts, average='weighted')
    return score

### Pearson R

In [None]:
def getR(model):
    """Return the Pearson correlation between labels and predictions.

    Labels and predictions are gathered with ``get_list_con(model)``; only
    the correlation coefficient is returned, the p-value is dropped.
    """
    observed, predicted = get_list_con(model)
    return stats.pearsonr(observed, predicted)[0]

### Plot Train Versus Test Loss

In [None]:
def pltloss(train_loss, test_loss, epoch):
    """Plot the two loss curves against the epoch index on a new figure.

    Args:
        train_loss: one loss value per epoch, drawn in green.
        test_loss: one loss value per epoch, drawn in blue.
        epoch: number of epochs; the x axis runs from 0 to ``epoch - 1``.
    """
    x_axis = list(range(epoch))
    plt.figure()
    curves = (
        (train_loss, 'g', 'Training loss'),
        (test_loss, 'b', 'Testing loss'),
    )
    for values, fmt, name in curves:
        plt.plot(x_axis, values, fmt, label=name)
    plt.title('Training and Testing Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

### Plot R

In [None]:
def pltR(r, epoch):
    """Plot Pearson R values against the epochs 5, 10, ..., up to ``epoch``.

    Args:
        r: one R value per plotted epoch (every 5th epoch).
        epoch: total number of epochs; sets the last tick on the x axis.
    """
    # every multiple of 5 that does not exceed `epoch`, starting at 5
    x_axis = list(range(5, epoch + 1, 5))
    plt.figure()
    plt.plot(x_axis, r, 'g', label='Pearson R')
    plt.title('R Score Over Time')
    plt.xlabel('Epochs')
    plt.ylabel('R')
    plt.legend()
    plt.show()

### Plot Predicted Versus Label

In [None]:
def plotcomp(model):
    """Scatter a random sample of predictions against their true labels.

    Up to 50 samples are drawn at random from the output of
    ``get_list_con(model)`` and plotted together with the ``y = x`` line,
    which marks where a perfect prediction would fall. Both axes share the
    same range so that the line is a true diagonal.

    Args:
        model: the ``torch.nn.Module`` to evaluate.

    Raises:
        ValueError: if there are no samples to plot.
    """
    labels, predicts = get_list_con(model)
    if not labels:
        raise ValueError('plotcomp needs at least one sample to plot')
    # never ask for more points than exist (random.sample would raise)
    sample_size = min(50, len(labels))
    idx_sele = random.sample(range(len(labels)), sample_size)
    label_sele = [labels[i] for i in idx_sele]
    pred_sele = [predicts[i] for i in idx_sele]
    # one shared upper bound for both axes and for the reference line
    upper = max(max(pred_sele), max(label_sele))
    plt.figure()
    plt.scatter(pred_sele, label_sele, c='b', marker='+', label='Test samples')
    plt.plot([0, upper], [0, upper], color = 'black', linewidth = 1,
             label='Perfect prediction (y = x)')
    plt.title('Actual Values vs Predicted Values')
    plt.xlabel('Predicted Values')
    plt.ylabel('Actual Values')
    plt.xlim(0, upper)
    plt.ylim(0, upper)
    plt.legend()
    plt.show()

## Models

In [None]:
class CNN(nn.Module):
    """Small 1-D convolutional network with a sigmoid output.

    Layout: Conv1d(k=3) -> ReLU -> MaxPool1d(4) -> Conv1d(k=3) -> ReLU ->
    MaxPool1d(4) -> flatten -> Linear -> Sigmoid.
    """

    # Input width of the final linear layer when the sequence length is not
    # part of ``input_size`` (the value that used to be hard-coded).
    _DEFAULT_FLAT_FEATURES = 2304

    def __init__(self, input_size, num_classes):
        """
        init convolution and activation layers
        Args:
            input_size: shape of one sample without the batch axis,
                ``(channels, length)``. If only ``(channels,)`` is given, the
                linear layer falls back to 2304 input features.
            num_classes: number of output units.

        Raises:
            ValueError: if ``length`` is too short to survive both
                convolution / pooling stages.
        """
        super(CNN, self).__init__()

        self.conv1 = torch.nn.Conv1d(input_size[0], 32, 3)
        self.relu = torch.nn.ReLU()
        self.conv2 = torch.nn.Conv1d(32, 64, 3)
        self.pool = torch.nn.MaxPool1d(4)
        self.fc1 = torch.nn.Linear(self._flat_features(input_size), num_classes)
        self.sig = nn.Sigmoid()

    @classmethod
    def _flat_features(cls, input_size):
        """Return the number of features entering ``fc1`` for ``input_size``."""
        if len(input_size) < 2:
            return cls._DEFAULT_FLAT_FEATURES
        length = int(input_size[1])
        for _ in range(2):
            # an unpadded kernel-3 convolution drops 2 positions, then
            # MaxPool1d(4) floor-divides the length by 4
            length = (length - 2) // 4
        if length < 1:
            raise ValueError(
                'input length {} is too short for this network'.format(input_size[1]))
        return 64 * length  # 64 = output channels of conv2

    def forward(self, x):
        """
        forward function describes how input tensor is transformed to output tensor
        Args:
            x: input batch of shape (N, channels, length)

        Returns:
            tensor of shape (N, num_classes) with values in (0, 1)
        """
        x = self.conv1(x)
        x = self.relu(x)
        x = self.pool(x)
        x = self.conv2(x)
        x = self.relu(x)
        x = self.pool(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = self.sig(x)

        return x

In [None]:
# `classes` was never assigned anywhere in this notebook. The training loop
# compares the network output against labels.unsqueeze(1), i.e. one value
# per sample, so the network needs a single output unit.
classes = 1
cnn = CNN(train_X.shape[1:], classes)
cnn

In [None]:
# Training hyper-parameters: learning rate, number of epochs, batch size,
# plus a 'loss_fxn' tag.
opts = dict(
    lr=5e-4,
    epochs=50,
    batch_size=100,
    loss_fxn='c',
)

In [None]:
# Per-epoch loss histories; TrainHelper.test appends to these module-level
# lists, so they have to exist before training starts.
train_loss = []
test_loss = []
CNNTrainer = TrainHelper(cnn, train_dataset, valid_dataset, opts)

In [None]:
# Run the full training loop (opts['epochs'] epochs, validating after each).
CNNTrainer.train()

### Check for Output

In [None]:
# Pearson R on the test set for every periodic checkpoint.
# Checkpoints written every 5 epochs are numbered 1 .. epochs // 5 (index 0
# is the extra snapshot taken after the first epoch), which matches the
# 5, 10, ... x axis that pltR draws.
r_list = []
eval_device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
for num in range(1, opts['epochs'] // 5 + 1):
    params_path = 'model/model' + save_model_time + '/net_params_' + str(num) + '.pkl'
    # map_location lets GPU-saved weights load on a CPU-only machine
    cnn.load_state_dict(torch.load(params_path, map_location=eval_device))
    cnn.to(eval_device)
    r_list.append(getR(cnn))

In [None]:
# Plot the per-epoch training and validation loss histories.
pltloss(train_loss, test_loss, opts['epochs'])

In [None]:
# Plot the Pearson R values collected in r_list.
pltR(r_list, opts['epochs'])

In [None]:
# Scatter a random sample of test predictions against their true labels.
plotcomp(cnn)