In [1]:
from IPython.display import HTML

# Markup for a button that shows/hides the notebook's input cells. The script
# calls jQuery's `$` and targets `div.input`, so it only has an effect in
# front-ends that provide both.
toggle_markup = '''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>'''

# Last expression of the cell, so the notebook renders it.
HTML(toggle_markup)

In [2]:
import sys
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install scikit-learn
!{sys.executable} -m pip install torch
!{sys.executable} -m pip install torchmetrics
!{sys.executable} -m pip install statsmodels
!{sys.executable} -m pip install scipy
!{sys.executable} -m pip install GPUtil
!{sys.executable} -m pip install gc

[31mERROR: Could not find a version that satisfies the requirement gc (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for gc[0m[31m
[0m

In [3]:
import numpy as np
import pandas as pd
import torch
import sklearn
import gc
import time
import random
import os, sys, math


import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import StratifiedKFold

from GPUtil import showUtilization as gpu_usage
from numba import cuda  

from torchmetrics.classification import BinaryPrecision
from torchmetrics.classification import BinaryRecall
from torchmetrics.classification import BinaryF1Score
from torchmetrics.classification import BinaryPrecisionRecallCurve
from torchmetrics.classification import ROC
from torchmetrics.classification import BinaryConfusionMatrix

from torch.nn.parallel import DistributedDataParallel
from torch.nn.parallel import DataParallel
from torch.cuda.amp import GradScaler, autocast

In [4]:
# simulate labels
# Seed every random number generator the notebook relies on, not just NumPy:
#   * `random` is what upsample_minority_class draws from,
#   * torch drives weight initialisation, dropout and DataLoader shuffling.
# NOTE(review): GPU runs may still vary slightly (cudnn.benchmark is enabled
# further down) -- confirm if bit-exact reproducibility is required.
SEED = 12345
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)

N = 10000   # number of simulated samples
d = 2500    # sequence length of each sample
# Binary labels with roughly 2% positives.
y = pd.Series(np.random.binomial(n = 1, p = 0.02, size = N))
print("fraction of positives: ", sum(y)/len(y))
# One (d, 4) standard-normal encoding per sample.
X = np.random.normal(loc = 0, scale = 1, size = (N, d, 4))

fraction of positives:  0.0208


In [5]:
# Record the installed PyTorch release alongside the results.
print(f"{torch.__version__}")

2.0.1


In [6]:
def free_gpu_cache(verbose=True):
    """
    Run the Python garbage collector and release PyTorch's cached CUDA memory.

    When ``verbose`` is true, the GPU utilisation table is printed before and
    after the cleanup.
    """

    def _report(banner):
        # Print a heading followed by the GPUtil utilisation table.
        print(banner)
        gpu_usage()

    if verbose:
        _report("Initial GPU Usage")

    # Collect unreachable Python objects first, then empty the CUDA cache.
    gc.collect()
    torch.cuda.empty_cache()

    if verbose:
        _report("GPU Usage after emptying the cache")

In [7]:
class EncodingDataset(torch.utils.data.Dataset):
    """
    Dataset wrapper pairing per-sample encodings with their labels so they can
    be batched by a PyTorch DataLoader.
    """

    def __init__(self, X, y):
        # Conv1d consumes batches shaped [B x C x T] (B = minibatch size,
        # C = number of channels, T = sequence length), so every sample is
        # transposed before it is copied into a float tensor.
        self.X = [
            torch.tensor(np.array(sample.transpose()), dtype=torch.float)
            for sample in X
        ]

        # Each label becomes a float tensor with one extra trailing axis,
        # i.e. a 1-D tensor instead of a 0-D one.
        self.y = [
            torch.tensor(np.array(label), dtype=torch.float).unsqueeze(dim=-1)
            for label in y
        ]

    def __len__(self):
        # Number of samples held by the dataset.
        return len(self.X)

    def __getitem__(self, i):
        # (encoding, label) pair for sample i.
        return self.X[i], self.y[i]

In [8]:
class one_layer(nn.Module):
    """
    CNN with a single Conv1d -> BatchNorm -> ReLU -> MaxPool(2) -> Dropout
    stage followed by two linear layers that emit one logit per sample.
    """

    def __init__(self, kernel_size, in_channels, conv_size, l1, l2, dropout):
        super().__init__()
        # Convolutional stage: stride 1, no padding.
        self.conv1 = nn.Conv1d(in_channels=in_channels,
                               out_channels=conv_size,
                               kernel_size=kernel_size,
                               stride=1,
                               padding=0)
        self.conv1_batchnorm = nn.BatchNorm1d(num_features=conv_size)
        self.conv1_dropout = nn.Dropout(p=dropout)

        # Dense head. fc1 receives the flattened convolutional output, so l1
        # has to match that flattened size.
        self.fc1 = nn.Linear(in_features=l1, out_features=l2)
        self.fc2 = nn.Linear(in_features=l2, out_features=1)

    def forward(self, x):
        features = self.conv1(x)
        features = torch.relu(self.conv1_batchnorm(features))
        features = F.max_pool1d(features, kernel_size=2)
        features = self.conv1_dropout(features)

        # Collapse everything except the batch axis before the dense head.
        hidden = torch.relu(self.fc1(torch.flatten(features, start_dim=1)))
        return self.fc2(hidden)
    
class two_layer(nn.Module):
    """
    CNN with two Conv1d -> BatchNorm -> ReLU -> MaxPool(2) -> Dropout stages
    followed by two linear layers that emit one logit per sample.

    ``dim`` is the input sequence length; it is used to work out the
    flattened size fed to the first linear layer.
    """

    def __init__(self, kernel_size, in_channels, conv_size, l2, dropout, dim):
        super().__init__()
        padding, stride = 0, 1

        def after_conv(length):
            # Sequence length after a dilation-1 convolution.
            return math.floor((length + 2*padding - 1*(kernel_size - 1) - 1)/stride) + 1

        def after_pool(length):
            # Sequence length after the width-2 max-pool applied in forward().
            return math.floor((length + 2*padding - 1*(2 - 1) - 1)/2) + 1

        # Stage 1
        self.conv1 = nn.Conv1d(in_channels, conv_size, kernel_size, stride=stride, padding=padding)
        self.conv1_batchnorm = nn.BatchNorm1d(conv_size)
        self.conv1_dropout = nn.Dropout(p=dropout)
        seq_len = after_pool(after_conv(dim))

        # Stage 2
        self.conv2 = nn.Conv1d(conv_size, conv_size, kernel_size, stride=1, padding=padding)
        self.conv2_batchnorm = nn.BatchNorm1d(conv_size)
        self.conv2_dropout = nn.Dropout(p=dropout)
        seq_len = after_pool(after_conv(seq_len))

        # Dense head: channels * remaining positions -> l2 -> 1
        self.fc1 = nn.Linear(conv_size * seq_len, l2)
        self.fc2 = nn.Linear(l2, 1)

    def forward(self, x):
        stages = (
            (self.conv1, self.conv1_batchnorm, self.conv1_dropout),
            (self.conv2, self.conv2_batchnorm, self.conv2_dropout),
        )
        out = x
        for conv, batchnorm, dropout in stages:
            out = batchnorm(conv(out))
            out = F.max_pool1d(torch.relu(out), 2)
            out = dropout(out)

        # Collapse everything except the batch axis before the dense head.
        out = out.flatten(1)
        out = torch.relu(self.fc1(out))
        return self.fc2(out)

class three_layer(nn.Module):
    """
    CNN with three Conv1d -> BatchNorm -> ReLU -> MaxPool(2) -> Dropout stages
    followed by two linear layers that emit one logit per sample.

    ``dim`` is the input sequence length; it is used to work out the
    flattened size fed to the first linear layer.
    """

    def __init__(self, kernel_size, in_channels, conv_size, l2, dropout, dim):
        super().__init__()
        padding, stride = 0, 1

        def after_conv(length):
            # Sequence length after a dilation-1 convolution.
            return math.floor((length + 2*padding - 1*(kernel_size - 1) - 1)/stride) + 1

        def after_pool(length):
            # Sequence length after the width-2 max-pool applied in forward().
            return math.floor((length + 2*padding - 1*(2 - 1) - 1)/2) + 1

        # Stage 1
        self.conv1 = nn.Conv1d(in_channels, conv_size, kernel_size, stride=stride, padding=padding)
        self.conv1_batchnorm = nn.BatchNorm1d(conv_size)
        self.conv1_dropout = nn.Dropout(p=dropout)
        seq_len = after_pool(after_conv(dim))

        # Stage 2
        self.conv2 = nn.Conv1d(conv_size, conv_size, kernel_size, stride=stride, padding=padding)
        self.conv2_batchnorm = nn.BatchNorm1d(conv_size)
        self.conv2_dropout = nn.Dropout(p=dropout)
        seq_len = after_pool(after_conv(seq_len))

        # Stage 3
        self.conv3 = nn.Conv1d(conv_size, conv_size, kernel_size, stride=stride, padding=padding)
        self.conv3_batchnorm = nn.BatchNorm1d(conv_size)
        self.conv3_dropout = nn.Dropout(p=dropout)
        seq_len = after_pool(after_conv(seq_len))

        # Dense head: channels * remaining positions -> l2 -> 1
        self.fc1 = nn.Linear(conv_size * seq_len, l2)
        self.fc2 = nn.Linear(l2, 1)

    def forward(self, x):
        stages = (
            (self.conv1, self.conv1_batchnorm, self.conv1_dropout),
            (self.conv2, self.conv2_batchnorm, self.conv2_dropout),
            (self.conv3, self.conv3_batchnorm, self.conv3_dropout),
        )
        out = x
        for conv, batchnorm, dropout in stages:
            out = batchnorm(conv(out))
            out = F.max_pool1d(torch.relu(out), 2)
            out = dropout(out)

        # Collapse everything except the batch axis before the dense head.
        out = out.flatten(1)
        out = torch.relu(self.fc1(out))
        return self.fc2(out)

In [9]:
def train_model(n_epochs, optimizer, model, loss_fn, train_loader_batch):
    """
    Train ``model`` for ``n_epochs`` passes over ``train_loader_batch``.

    Parameters
    ----------
    n_epochs : int
        Number of full passes over the loader.
    optimizer : torch.optim.Optimizer
        Optimizer already bound to ``model``'s parameters.
    model : torch.nn.Module
        Network to train; it is updated in place.
    loss_fn : callable
        Called as ``loss_fn(outputs, labels)``; must return a scalar tensor.
    train_loader_batch : iterable
        Yields ``(encodings, labels)`` batches.

    Returns
    -------
    list of float
        Mean batch loss of every epoch, in order (``nan`` for an epoch in
        which the loader yielded no batch).
    """
    # Move batches to wherever the model already lives instead of relying on
    # a notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # Dropout / BatchNorm must be in training mode regardless of what the
    # caller did with the model beforehand.
    model.train()

    epoch_losses = []
    for epoch in range(1, n_epochs + 1):
        running_loss = 0.0
        n_batches = 0
        for encodings, labels in train_loader_batch:
            encodings = encodings.to(device=target_device)
            labels = labels.to(device=target_device)

            outputs = model(encodings)
            loss = loss_fn(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            n_batches += 1

        epoch_losses.append(running_loss / n_batches if n_batches else float("nan"))

    return epoch_losses
            
def evaluate_model(model, train_loader_eval, test_loader_eval):
    """
    Run ``model`` over a training loader and a validation loader and collect
    probabilities, hard predictions and labels for both.

    The model is switched to evaluation mode while predicting, so Dropout is
    disabled and BatchNorm uses its running statistics; its previous
    train/eval mode is restored afterwards.

    Parameters
    ----------
    model : torch.nn.Module
        Network producing one logit per sample.
    train_loader_eval, test_loader_eval : iterable
        Each yields ``(encodings, labels)`` batches.

    Returns
    -------
    dict
        Keys ``train_probs``, ``train_preds``, ``train_labels``,
        ``val_probs``, ``val_preds``, ``val_labels``. Probabilities are
        sigmoid outputs; predictions are the rounded probabilities and labels
        are both cast to int64. Rows follow the order in which the loader
        yielded them.
    """
    # Move batches to wherever the model already lives instead of relying on
    # a notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    def _predict(loader):
        # Collect per-batch tensors and concatenate once at the end.
        probs_chunks, preds_chunks, labels_chunks = [], [], []
        with torch.no_grad():
            for encodings, labels in loader:
                logits = model(encodings.to(device=target_device))
                probs = torch.sigmoid(logits)
                probs_chunks.append(probs)
                preds_chunks.append(torch.round(probs).type(torch.int64))
                labels_chunks.append(labels.to(device=target_device).type(torch.int64))
        return torch.cat(probs_chunks), torch.cat(preds_chunks), torch.cat(labels_chunks)

    was_training = model.training
    model.eval()
    try:
        train_probs, train_preds, train_labels = _predict(train_loader_eval)
        val_probs, val_preds, val_labels = _predict(test_loader_eval)
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)

    model_performance = {"train_probs": train_probs,
                         "train_preds": train_preds,
                         "train_labels": train_labels,
                         "val_probs": val_probs,
                         "val_preds": val_preds,
                         "val_labels": val_labels,
                        }

    return model_performance

In [10]:
def upsample_minority_class(X, y, upsampling_fraction):
    """
    Upsample the minority class (label 1) by sampling with replacement.

    Parameters
    ----------
    X : sequence
        Encodings, one per sample.
    y : sequence
        Binary labels aligned with ``X`` (1 = minority class, 0 = majority).
    upsampling_fraction : float
        Target share of label-1 samples in the output, in ``[0, 1)``. For
        example 0.3 means about 30% of the returned samples are label 1.
        A value of 0 disables upsampling.

    Returns
    -------
    (list, list)
        Upsampled encodings and matching labels, ordered as
        ``[extra minority draws, original minority, original majority]``.
        When ``upsampling_fraction`` is 0 the inputs are returned unchanged.
        If the minority share already meets the target, nothing is added.

    Raises
    ------
    ValueError
        If ``upsampling_fraction`` is outside ``[0, 1)``, or if extra samples
        are required but ``y`` contains no label-1 sample to draw from.
    """
    if not 0 <= upsampling_fraction < 1:
        raise ValueError("upsampling_fraction must be in [0, 1), got %r" % (upsampling_fraction,))

    if upsampling_fraction == 0:
        return (X, y)

    minority_list = [x for x, label in zip(X, y) if label == 1]
    majority_list = [x for x, label in zip(X, y) if label == 0]

    # Solve (n_minority + k) / (n_total + k) = upsampling_fraction for k.
    n_samples = round((upsampling_fraction*len(X) - len(minority_list))/(1-upsampling_fraction))
    n_samples = max(0, n_samples)  # already balanced enough: add nothing

    if n_samples > 0 and not minority_list:
        raise ValueError("cannot upsample: y contains no minority (label 1) samples")

    # Draw the extra samples from the minority class only.
    extra_minority = random.choices(minority_list, k=n_samples) if n_samples else []

    upsampled_encodings_full = extra_minority + minority_list + majority_list
    upsampled_labels_full = [1] * (n_samples + len(minority_list)) + [0] * len(majority_list)

    return (upsampled_encodings_full, upsampled_labels_full)

In [11]:
def _build_cnn(model_name, kernel_size, in_channels, conv_size, layer2, dropout, dim):
    """
    Instantiate the architecture selected by ``model_name``
    ("1layer", "2layer" or "3layer").

    Raises ValueError for any other name, so a typo in the grid cannot
    silently reuse the model left over from a previous configuration.
    """
    if model_name == "1layer":
        # one_layer does not compute its own flattened size: a stride-1,
        # unpadded convolution leaves dim - kernel_size + 1 positions and the
        # width-2 max-pool halves that (rounding down).
        pooled_len = (dim - kernel_size + 1) // 2
        return one_layer(kernel_size = kernel_size,
                         in_channels = in_channels,
                         conv_size = conv_size,
                         l1 = conv_size * pooled_len,
                         l2 = layer2,
                         dropout = dropout)
    if model_name == "2layer":
        return two_layer(kernel_size = kernel_size,
                         in_channels = in_channels,
                         conv_size = conv_size,
                         l2 = layer2,
                         dropout = dropout,
                         dim = dim)
    if model_name == "3layer":
        return three_layer(kernel_size = kernel_size,
                           in_channels = in_channels,
                           conv_size = conv_size,
                           l2 = layer2,
                           dropout = dropout,
                           dim = dim)
    raise ValueError(f"Unknown model_name {model_name!r}; expected '1layer', '2layer' or '3layer'.")


def _score_fold(model_performance):
    """
    Score one CV fold from the dictionary returned by evaluate_model.

    Returns ``{"train": {...}, "val": {...}}`` where each inner dict holds
    precision_score, recall_score, f1_score, pr_auc_score, roc_auc_score and
    conf_matrix, each computed from that split's own tensors.
    """
    metric_device = model_performance["train_preds"].device

    precision = BinaryPrecision(threshold=0.5).to(metric_device)
    recall = BinaryRecall(threshold=0.5).to(metric_device)
    f1_score = BinaryF1Score().to(metric_device)
    pr_curve = BinaryPrecisionRecallCurve(thresholds=None).to(metric_device)
    roc_curve = ROC(task="binary", thresholds=None).to(metric_device)
    confmat = BinaryConfusionMatrix().to(metric_device)

    fold_scores = {}
    for split in ("train", "val"):
        probs = model_performance[split + "_probs"]
        preds = model_performance[split + "_preds"]
        labels = model_performance[split + "_labels"]

        precision_PR, recall_PR, _ = pr_curve(probs, labels)
        fpr, tpr, _ = roc_curve(probs, labels)

        fold_scores[split] = {
            "precision_score": precision(preds, labels).cpu().numpy(),
            "recall_score": recall(preds, labels).cpu().numpy(),
            "f1_score": f1_score(preds, labels).cpu().numpy(),
            "pr_auc_score": sklearn.metrics.auc(recall_PR.cpu().numpy(),
                                                precision_PR.cpu().numpy()),
            "roc_auc_score": sklearn.metrics.auc(fpr.cpu().numpy(),
                                                 tpr.cpu().numpy()),
            "conf_matrix": confmat(preds, labels).cpu().numpy(),
        }
    return fold_scores


def CNN_hyperparameter_tuning_CV(param_grid):
    """
    Grid-search CNN hyperparameters with 5-fold stratified cross-validation.

    Inputs: ``param_grid``, a dict mapping each of the keys model_name,
    weight_decay, learning_rate, batch_size, dropout, conv_size, n_epochs,
    layer2, upsample_frac and kernel_size to the list of values to try.

    Every combination of values is trained and scored on each fold. Only the
    training part of a fold is upsampled (using that combination's
    ``upsample_frac``); validation data is left untouched. The encodings,
    labels and torch device are read from the notebook-level variables
    ``X``, ``Y`` and ``device``.

    Returns a list with one dict per combination holding the hyperparameters,
    the fold-averaged train_*/val_* precision, recall, F1, PR-AUC, ROC-AUC and
    confusion matrix, and the elapsed time in seconds. After every combination
    the running results are also written to 'out/CNN/_params_running_v2.csv'.
    """
    import itertools  # not part of the notebook's import cell

    running_csv = 'out/CNN/_params_running_v2.csv'
    # Make sure the first checkpoint does not fail on a missing directory
    # after a full round of training.
    os.makedirs(os.path.dirname(running_csv), exist_ok=True)

    # Same nesting order as the grid is meant to be walked: model_name is the
    # slowest-changing key, kernel_size the fastest.
    grid_keys = ('model_name', 'weight_decay', 'learning_rate', 'batch_size', 'dropout',
                 'conv_size', 'n_epochs', 'layer2', 'upsample_frac', 'kernel_size')
    metric_names = ('precision_score', 'recall_score', 'f1_score',
                    'pr_auc_score', 'roc_auc_score', 'conf_matrix')

    score_tracker = []
    for combo in itertools.product(*(param_grid[key] for key in grid_keys)):
        config = dict(zip(grid_keys, combo))

        free_gpu_cache(verbose=False)
        start = time.time()

        metrics = {"n_epochs": config["n_epochs"],
                   "weight_decay": config["weight_decay"],
                   "learning_rate": config["learning_rate"],
                   "batch_size": config["batch_size"],
                   "dropout": config["dropout"],
                   "conv_size": config["conv_size"],
                   "model_name": config["model_name"],
                   "layer2": config["layer2"],
                   "upsample_frac": config["upsample_frac"],
                   "kernel_size": config["kernel_size"]}

        per_fold = {split: {name: [] for name in metric_names}
                    for split in ("train", "val")}

        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        for train_fold_index, val_fold_index in cv.split(X, Y):
            # Get the training data
            X_train_fold = [X[i] for i in train_fold_index]
            Y_train_fold = [Y[i] for i in train_fold_index]

            # Get the validation data
            X_val_fold = [X[i] for i in val_fold_index]
            Y_val_fold = [Y[i] for i in val_fold_index]

            # Upsample only the training section, with the fraction under test
            X_train_fold_upsample, Y_train_fold_upsample = upsample_minority_class(
                X_train_fold, Y_train_fold, upsampling_fraction=config["upsample_frac"])
            del X_train_fold
            del Y_train_fold

            train_data = EncodingDataset(X_train_fold_upsample, Y_train_fold_upsample)
            val_data = EncodingDataset(X_val_fold, Y_val_fold)

            train_loader_batch = torch.utils.data.DataLoader(
                train_data, batch_size=config["batch_size"], shuffle=True, num_workers=8)
            test_loader_batch = torch.utils.data.DataLoader(
                val_data, batch_size=config["batch_size"], num_workers=8)

            # Raw samples are (sequence length, channels); EncodingDataset
            # transposes them for Conv1d.
            sample_shape = X_train_fold_upsample[0].shape
            model = _build_cnn(model_name = config["model_name"],
                               kernel_size = config["kernel_size"],
                               in_channels = sample_shape[1],
                               conv_size = config["conv_size"],
                               layer2 = config["layer2"],
                               dropout = config["dropout"],
                               dim = sample_shape[0])
            model.to(device)

            optimizer = optim.AdamW(model.parameters(),
                                    lr=config["learning_rate"],
                                    weight_decay=config["weight_decay"])
            loss_fn = nn.BCEWithLogitsLoss()

            # Fit the model on the upsampled training data
            train_model(
                n_epochs = config["n_epochs"],
                optimizer = optimizer,
                model = model,
                loss_fn = loss_fn,
                train_loader_batch = train_loader_batch
            )

            model_performance = evaluate_model(model, train_loader_batch, test_loader_batch)

            # Score on the upsampled training data and the non-upsampled
            # validation data.
            for split, split_scores in _score_fold(model_performance).items():
                for name, value in split_scores.items():
                    per_fold[split][name].append(value)
        # end CV loop

        stop = time.time()

        # Average over folds (element-wise for the confusion matrices).
        for split in ("train", "val"):
            for name in metric_names:
                metrics[f"{split}_{name}"] = np.mean(per_fold[split][name], axis=0)
        metrics["time"] = stop-start

        score_tracker.append(metrics)

        # Checkpoint after every configuration so a crash loses little work.
        score_tracker_csv = pd.DataFrame(score_tracker)
        score_tracker_csv.to_csv(running_csv)
    return score_tracker

In [12]:
# NOTE: every configuration below is evaluated with 5-fold CV.
# Two candidate values per hyperparameter.
param_grid = dict(
    model_name=["2layer", "3layer"],
    weight_decay=[0.01, 0.05],
    learning_rate=[1e-3, 1e-6],
    batch_size=[2, 8],
    dropout=[0, 0.2],
    conv_size=[5, 50],
    n_epochs=[10, 30],
    layer2=[10, 20],
    upsample_frac=[0, 0.3],
    kernel_size=[5, 10],
)
# total number of configurations: 2^10 = 1024

In [None]:

device = (torch.device('cuda') if torch.cuda.is_available()
          else torch.device('cpu'))
print(f"Training on device {device}.")

# Turn the simulated data into plain lists with one entry per sample.
# list(...) iterates over the first axis of the array and, unlike indexing
# through X.shape, still works if this cell is re-run after X has already
# become a list.
X = list(X)
Y = list(y)

#SUBSAMPLING FOR DEBUGGING
#X = X[0:100]
#Y = Y[0:100]

#RUN CNN W/ HYPERPARAMETER TUNING ON THIS ENCODING
torch.backends.cudnn.benchmark = True
score_tracker = CNN_hyperparameter_tuning_CV(param_grid)
score_tracker_csv = pd.DataFrame(score_tracker)

# `encoding_method` is not assigned anywhere in the visible notebook; fall
# back to an empty tag so the finished search is not lost to a NameError at
# the very last line.
try:
    encoding_tag = encoding_method
except NameError:
    encoding_tag = ''

# NOTE(review): the running checkpoint is written under 'out/CNN/' while this
# file goes to 'Out/CNN/' -- different directories on a case-sensitive file
# system. Confirm which one is intended.
final_dir = 'Out/CNN/'
os.makedirs(final_dir, exist_ok=True)
score_tracker_csv.to_csv(final_dir + encoding_tag + '_params_final.csv')

Training on device cuda.
