# Google Drive Mount


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Direcoty & Libraries

In [None]:
%cd /content/drive/My\ Drive/Colab\ Notebooks

/content/drive/My Drive/Colab Notebooks


In [None]:
#Importing Libraries and Packages
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, IterableDataset
from torch.utils.tensorboard import SummaryWriter

# calculate train time, writing train data to files etc.
import os 
import logging
import pandas as pd
import numpy as np
import time
from pathlib import Path
from random import randint
from scipy import signal
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

import pdb

# Dataset

In [None]:
# Data split
class Split_data:
    def __init__(self, dir_path: str):

        self.files = tuple(Path(dir_path).glob("**/*.csv"))

    def __len__(self):
        return len(self.files)

    def splitset(self,split_ratio,random_seed,shuffle_dataset = True):
        dataset_size = len(self.files)
        indices = list(range(dataset_size))
        split = int(np.floor(split_ratio * len(self.files)))
        if shuffle_dataset:
            np.random.seed(random_seed)
            np.random.shuffle(indices)
        train_indices, val_indices = indices[split:], indices[:split]
        return train_indices,val_indices

# Extracting data from csv files
class CoolDataset(IterableDataset):

    def __init__(self, dir_path: str, seq_length: str, input_size: str, samples_per_events: str, indices, convolution=False):

        super().__init__()
        self.files = tuple(Path(dir_path).glob("**/*.csv"))
        self.indices = indices
        self.seq_length = seq_length
        self.input_size = input_size
        self.SAMPLES_PER_EVENT = samples_per_events
        self.window = signal.gaussian(8, std=3)
        self.convolution = convolution

        assert seq_length % 2 == 0, "Please pass an even seq length"

    def __iter__(self):

        # Initialise Counter for events and files
        self.file_nr = 0
        self.event_in_file = 0
        self._sample_nr = 0

        return self

    def __next__(self):
        # Reads the current file and looks for event
        df = self.read_file(self.files[self.indices[self.file_nr]])  # could be cached so you dont read it anew every iteration
        events = df[df["FO"] ==1]
        if events.shape[0] > 0:
            if self._sample_nr < self.SAMPLES_PER_EVENT:
                # just give back the current event again, with different sampling, until we have generated
                # SAMPLES_PER_EVENT such samples
                # pdb.set_trace()
                if self._sample_nr == 0:
                    event_frame = events.iloc[self.event_in_file].name
                if self._sample_nr == 1:
                    event_frame = events.iloc[self.event_in_file].name
                if self._sample_nr == 2:
                    event_frame = events.iloc[self.event_in_file].name
                if self._sample_nr == 3:
                    event_frame = events.iloc[self.event_in_file].name
                if self._sample_nr == 4:
                    event_frame = events.iloc[self.event_in_file].name + 1
                if self._sample_nr == 5:
                    event_frame = events.iloc[self.event_in_file].name + 1
                if self._sample_nr == 6:
                    event_frame = events.iloc[self.event_in_file].name - 1
                if self._sample_nr == 7:
                    event_frame = events.iloc[self.event_in_file].name - 1
                if self._sample_nr == 8:
                    event_frame = events.iloc[self.event_in_file].name - 2
                if self._sample_nr == 9:
                    event_frame = events.iloc[self.event_in_file].name + 2
                 
                input_data, output_data = self.sample_seq_around_event_frame(df, event_frame)
                self._sample_nr += 1
            # else:
            elif self._sample_nr == self.SAMPLES_PER_EVENT:
                
                # pdb.set_trace()
                self.event_in_file += 1  # work on the next event in this file
                self._sample_nr = 0  # reset for the next event
                
                # check whether we are done with this file
                # otherwise we return the next event on the beginning of the next iteration
                if self.event_in_file >= len(events):
                    self.file_nr += 1
                    # If there still are files to run, it resets the variables
                    if self.file_nr < len(self.indices):
                        # pdb.set_trace()
                        logging.info("File is complete. Going to new file...")
                        self.event_in_file = 0
                        self._sample_nr = 0
                        return next(self)
                    else:
                        # processed the last file, we are done
                        logging.info("File is complete. All files done. Stopping...")
                        raise StopIteration
                elif self.event_in_file < len(events):

                    # pdb.set_trace()

                    event_frame = events.iloc[self.event_in_file].name
                    input_data, output_data = self.sample_seq_around_event_frame(df, event_frame)
                    self._sample_nr += 1
        else:
            logging.info("No events detected")
            # pdb.set_trace()
            self.file_nr += 1
            self.event_in_file = 0
            self._sample_nr = 0
            return next(self)

        return input_data, output_data

    def sample_seq_around_event_frame(self, df, event_idx):

        # Returns inputs with event data
        start_idx = event_idx - randint(10, self.seq_length / 2)
        if start_idx > 0:
            end_idx = start_idx + self.seq_length
            if end_idx <= len(df):
                input = df.iloc[start_idx:end_idx, 0:self.input_size].to_numpy()
                output = df.iloc[start_idx:end_idx]['FO'].to_numpy()
            elif end_idx > len(df):
                end_idx = len(df)
                start_idx = end_idx - self.seq_length
                input = df.iloc[start_idx:end_idx, 0:self.input_size].to_numpy()
                output = df.iloc[start_idx:end_idx]['FO'].to_numpy()
        elif start_idx <= 0:
            start_idx = event_idx
            end_idx = start_idx + self.seq_length
            input = df.iloc[start_idx:end_idx, 0:self.input_size].to_numpy()
            output = df.iloc[start_idx:end_idx]['FO'].to_numpy()
            if end_idx <= len(df):
                input = df.iloc[start_idx:end_idx, 0:self.input_size].to_numpy()
                output = df.iloc[start_idx:end_idx]['FO'].to_numpy()
            elif end_idx > len(df):
                end_idx = len(df)
                start_idx = end_idx - self.seq_length
                input = df.iloc[start_idx:end_idx, 0:self.input_size].to_numpy()
                output = df.iloc[start_idx:end_idx]['FO'].to_numpy()

        # Converted to Tensor
        if self.convolution:
            output = signal.convolve(output, self.window, mode='same')

        input_data = input
        output_data = output

        assert input_data.shape[0] == output_data.shape[0] == self.seq_length

        return input_data, output_data

    def read_file(self, f):

        df = pd.read_csv(open(f, "r"))
        fname = os.path.basename(f)
        if fname[0:2] == 'RT':
            df = df.drop(['Unnamed: 0','ID',
                          'High_RTOE_X','RTOE_X','RTOE_Y','RTOE_Z','V_RTOE_X','V_RTOE_Y','V_RTOE_Z',
                          'High_RHEE_X',#'RHEE_X','RHEE_Y','RHEE_Z','V_RHEE_X','V_RHEE_Y','V_RHEE_Z',
                          'High_RANK_X','RANK_X','RANK_Y','RANK_Z','V_RANK_X','V_RANK_Y','V_RANK_Z',
                          'High_RHLX_X',#'RHLX_X','RHLX_Y','RHLX_Z','V_RHLX_X','V_RHLX_Y','V_RHLX_Z',
                          'High_RPMT5_X','RPMT5_X','RPMT5_Y','RPMT5_Z','V_RPMT5_X','V_RPMT5_Y','V_RPMT5_Z'], axis=1)#'RPMT5_X','RPMT5_Y','RPMT5_Z','V_RPMT5_X','V_RPMT5_Y','V_RPMT5_Z'
        elif fname[0:2] == 'LT':
            df = df.drop(['Unnamed: 0','ID',
                          'High_LTOE_X','LTOE_X','LTOE_Y','LTOE_Z','V_LTOE_X','V_LTOE_Y','V_LTOE_Z',
                          'High_LHEE_X',#LHEE_X','LHEE_Y','LHEE_Z','V_LHEE_X','V_LHEE_Y','V_LHEE_Z',
                          'High_LANK_X','LANK_X','LANK_Y','LANK_Z','V_LANK_X','V_LANK_Y','V_LANK_Z',
                          'High_LHLX_X',#'LHLX_X','LHLX_Y','LHLX_Z','V_LHLX_X','V_LHLX_Y','V_LHLX_Z',
                          'High_LPMT5_X','LPMT5_X','LPMT5_Y','LPMT5_Z','V_LPMT5_X','V_LPMT5_Y','V_LPMT5_Z'], axis=1)#'LPMT5_X','LPMT5_Y','LPMT5_Z','V_LPMT5_X','V_LPMT5_Y','V_LPMT5_Z'
        return df

# Model

In [None]:
class Network(nn.Module):
    # TO DO
    def __init__(self, config):
        super(Network, self).__init__()

        # Model construct Configuration
        self.input_size = config.input_size
        self.hidden_size = config.hidden_size
        self.output_size = config.output_size
        self.batch_size = config.batch_size
        self.num_layers = config.num_layers
        self.drop_out = config.drop_out
        self.seq_length = config.seq_length
        self.device = config.device
        self.lstm = nn.LSTM(self.input_size, self.hidden_size, self.num_layers, dropout = self.drop_out, batch_first=True,bidirectional=True)
        self.linear = nn.Linear(self.hidden_size*2, self.output_size,bias= True)
        torch.nn.init.xavier_uniform_(self.linear.weight)

    def forward(self, x):

        hidden, cell = self.init_hidden()
        out, _ = self.lstm(x, (hidden, cell))
        logits = self.linear(out)

        return logits[:, :, -1]

    def init_hidden(self):

        weight = next((self.parameters())).data
        hidden, cell = (weight.new(self.num_layers*2, self.batch_size, self.hidden_size).zero_().to(self.device),
                        weight.new(self.num_layers*2, self.batch_size, self.hidden_size).zero_().to(self.device))
        return hidden, cell


# Early Stop

In [None]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    # def __init__(self, patience: str,, delta: str, Name: str, verbose=False):
    def __init__(self, patience: str, delta: str, Name: str,  verbose = False,):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        self.val_loss_min = np.Inf
        self.delta = delta
        self.Name = Name

    def __call__(self, val_loss, model):

        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}\n')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model\n')
        torch.save(model.state_dict(), f"Models/2markers/FO/HLXHEE/v1/{self.Name}-FScheckpoint.pt",_use_new_zipfile_serialization=False)
        self.val_loss_min = val_loss

# Training and Validation

In [None]:
class Trainer:

    def __init__(self,input_size, batch_size,seq_length,hidden_size,num_layers,drop_out,lr,epoch,config):

        # System configuration
        self.device = config.device
        self.log_interval = config.log_interval
        self.output_size = config.output_size
        self.seed = config.seed
        self.validation_split = config.validation_split
        self.convolution = config.convolution
        torch.manual_seed(self.seed)

        # Hyper Parameters
        config.input_size = input_size
        config.batch_size = batch_size
        config.hidden_size = hidden_size
        config.seq_length = seq_length
        config.num_layers = num_layers
        config.drop_out = drop_out
        config.lr = lr

        # Model Construction
        self.model = Network(config).to(self.device)
        self.model = self.model.to(self.device)
        print(self.model)

        # Optimizer and Loss
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.pos_weight = torch.ones([config.weight_length])  
        self.pos_weight_factor = self.pos_weight * config.weight_factor
        self.criterion = nn.BCEWithLogitsLoss(pos_weight= self.pos_weight_factor).to(self.device)
       
        # Initialise the early_stopping object
        self.Name = f"FO-{config.weight_factor}-WF-{hidden_size}-HS-{num_layers}-NL-{lr}-LR-{epoch}-epochs"
        self.early_stopping = EarlyStopping(patience=model_config.patience, verbose=True,delta = model_config.delta,Name = self.Name)
        print(self.Name)

        # DataLoader
        dataset = Split_data(r"data/iteration1/train/")
        train_idx, val_idx = dataset.splitset(self.validation_split,self.seed)
        self.train_loader = DataLoader(CoolDataset(r"data/iteration1/train/", config.seq_length, config.input_size, config.samples_per_event, train_idx,convolution=self.convolution), batch_size=config.batch_size, drop_last=True, shuffle=False)
        self.val_loader = DataLoader(CoolDataset(r"data/iteration1/train/", config.seq_length, config.input_size, config.samples_per_event, val_idx,convolution=self.convolution), batch_size=config.batch_size, drop_last=True, shuffle=False)

        self.globaliter = 0     
        train_log_dir = 'logs/tensorboard_FO/2markers/train/HLXHEE'+ self.Name
        val_log_dir = 'logs/tensorboard_FO/2markers/val/HLXHEE' + self.Name
        self.train_summary_writer = SummaryWriter(train_log_dir)
        self.val_summary_writer =  SummaryWriter(val_log_dir)

    def train(self,epoch):
      
      self.model.train()
      start = time.time()
      with self.train_summary_writer:
        for batch_idx, (data, target) in enumerate(self.train_loader):
              
              self.optimizer.zero_grad()

              data, target = data.to(self.device), target.to(self.device)
              predictions = self.model(data.float())
              loss = self.criterion(predictions.float(), target.float())
              loss.backward()
              
              self.optimizer.step()
              pred = torch.sigmoid(predictions.detach())
              correct_indx_positive = pred[target > 0.5]  # Should have batch_size * 1's
              correct_indx_negative = pred[target <= 0.5]  # Should have (batch_size*seq_length-batch_size) * 0'

              TPR = len(correct_indx_positive[correct_indx_positive > 0.5]) / len(correct_indx_positive)
              TNR = len(correct_indx_negative[correct_indx_negative <= 0.5]) / len(correct_indx_negative)
              TPR = TPR * 100
              TNR = TNR * 100


              self.globaliter += 1
              
              if batch_idx % self.log_interval == 0:

                  print('Train Epoch: {}\tLoss: {:.6f}\tTPR: {}\tTNR: {}\tTime: {:.6f}'.format(epoch,loss.item(), TPR, TNR, (time.time() - start)))
                  self.train_summary_writer.add_scalar('Loss', loss.item(),self.globaliter)
  
    def val(self):
        
        self.model.eval()
        val_loss = 0
        start = time.time()
        with self.val_summary_writer:
          with torch.no_grad():
            for data, target in self.val_loader:

                  data, target = data.to(self.device), target.to(self.device)

                  predictions = self.model(data.float())

                  val_loss = self.criterion(predictions.float(), target.float())

                  pred = torch.sigmoid(predictions.detach())
                  correct_indx_positive = pred[target > 0.5]  # Should have batch_size * 1's
                  correct_indx_negative = pred[target <= 0.5]  # Should have (batch_size*seq_length-batch_size) * 0'

                  TPR = len(correct_indx_positive[correct_indx_positive > 0.5]) / len(correct_indx_positive)
                  TNR = len(correct_indx_negative[correct_indx_negative <= 0.5]) / len(correct_indx_negative)
                  TPR = TPR * 100
                  TNR = TNR * 100


            print('Val_loss: {:.6f}\tTPR: {}\tTNR: {}\tTime: {:.6f}\n'.format(val_loss.item(), TPR, TNR, (time.time() - start)))
            self.val_summary_writer.add_scalar('val_loss', val_loss.item(),self.globaliter)
            return val_loss, TPR, TNR, self.Name

# Hyperparmeter Setting

In [None]:
def main(hparam, model_config):
    for input_size in hparam['input_size']:
        for batch_size in hparam['batch_size']:
            for seq_length in hparam['seq_length']:
                for hidden_size in hparam['hidden_size']:
                    for lr in hparam['lr']:
                        for num_layers in hparam['num_layers']:
                            for drop_out in hparam['drop_out']:
                                for epochs in hparam['epochs']:
                                    trainer = Trainer(input_size, batch_size, seq_length, hidden_size, num_layers, drop_out, lr, epochs, model_config)
                                    TPR = dict()
                                    TNR = dict()
                                    for epoch in range(epochs):
                                        epoch_start = time.time()
                                        trainer.train(epoch)
                                        # pdb.set_trace()
                                        val_loss, TPR[epoch], TNR[epoch],Name = trainer.val()
                                        trainer.early_stopping(val_loss, trainer.model)
                                        if trainer.early_stopping.early_stop:
                                            print("Early stopping")
                                            break
                                        print(f"Epoch Duration {time.time() - epoch_start}")
                                    # csv file save
                                    ROC = pd.DataFrame({'TPR':pd.Series(TPR),'TNR':pd.Series(TNR)})
                                    ROC.to_csv(f"Models/2markers/FO/HLXHEE/v1/{Name}-ROC.csv", index=False)

# Main

In [9]:
class Config:

    def __init__(self, **kwargs):
        for key, value in kwargs.items():
            setattr(self, key, value)
            
if __name__ == '__main__':
    
    if torch.cuda.is_available():
      device = torch.device("cuda") 
      print("Running on the GPU")
    else:
      device = torch.device("cpu")
      print("Running on the CPU")


    model_config = Config(
        device=device,
        # Early Stop
        patience=20,
        delta=0.001,
        log_interval=1000,
        # Dataset Configuration
        validation_split=0.2,
        seed=2,
        samples_per_event=1,
        convolution = True,
        output_size=1,
        # Weight loss
        weight_length=128,
        weight_factor=13,
    )

    hparam = {
        
        'input_size': [12],   # 2 markers:12 3 markers: 18, 4 markers: 24, 5 markers:30, 6 markers:36, 7 markers:42, 8 markers:48
        'batch_size': [64],
        'seq_length': [128],
        'hidden_size': [256,512,1024],
        'num_layers': [2,5,10],
        'drop_out' : [0.5],
        'lr': [0.00001],
        'epochs': [100],
    }

    main(hparam, model_config)

Running on the GPU
Network(
  (lstm): LSTM(12, 256, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (linear): Linear(in_features=512, out_features=1, bias=True)
)
FO-13-WF-256-HS-2-NL-1e-05-LR-100-epochs
Train Epoch: 0	Loss: 1.033759	TPR: 72.23382045929019	TNR: 56.90392843251652	Time: 16.219748
Val_loss: 0.863723	TPR: 63.44969199178645	TNR: 92.00519143413368	Time: 94.184358

Validation loss decreased (inf --> 0.863723).  Saving model

Epoch Duration 483.7441711425781
Train Epoch: 1	Loss: 0.874351	TPR: 68.3083511777302	TNR: 91.01618122977347	Time: 0.912091
Val_loss: 0.744291	TPR: 67.14876033057851	TNR: 91.17799688635185	Time: 7.506114

Validation loss decreased (0.863723 --> 0.744291).  Saving model

Epoch Duration 39.89317989349365
Train Epoch: 2	Loss: 0.780639	TPR: 68.23770491803278	TNR: 92.34164070612668	Time: 0.919925
Val_loss: 0.641811	TPR: 72.23382045929019	TNR: 91.81900687151561	Time: 7.271188

Validation loss decreased (0.744291 --> 0.641811).  Saving model

E