# Import Libraries

In [None]:
import numpy as np

import pickle

from tqdm import tqdm

from imblearn.over_sampling import SMOTE

import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader, TensorDataset

from sklearn.metrics import confusion_matrix, accuracy_score, recall_score

In [None]:
# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
# The value is kept as a plain string because it is passed to `.to(...)` later.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Load Data

In [None]:
#### load data from the output file #####
path_output = '../Transformer/data/output/data_sequence_pm.pt'

# The file holds a 4-tuple: (X_train, y_train, X_test, y_test).
# map_location='cpu' lets the file be opened on a machine without CUDA even if
# the tensors were saved from a GPU; batches are moved to `device` later, inside
# the training / evaluation loops.
# NOTE(review): torch.load unpickles the file -- only load files you trust.
X_train, y_train, X_test, y_test = torch.load(path_output, map_location='cpu')

# Data Processing

In [None]:
##### sampling: smote #####
# SMOTE runs on host memory, so make sure both tensors live on the CPU.
X_train, y_train = X_train.cpu(), y_train.cpu()

# `window` and `feature_size` are reused later to build the model.
sample_size, window, feature_size = X_train.shape

# SMOTE expects one flat feature vector per sample:
# [num_samples, window, feature_size] -> [num_samples, window * feature_size]
X_train = X_train.flatten(start_dim=1)

# sampling_strategy=1.0 oversamples the minority class up to the majority size.
smote = SMOTE(sampling_strategy=1.0, random_state=0)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Restore the sequence layout and convert back to float32 tensors;
# labels become a column vector [num_samples, 1].
X_train = torch.tensor(X_train.reshape(-1, window, feature_size), dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)

In [None]:
##### create datasets #####
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)


##### create dataLoaders #####
# Only the training batches are shuffled; the test loader keeps the stored order.
train_loader = DataLoader(train_dataset, batch_size=1000, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=500)

# Modeling

## Architecture

In [None]:
class Time2Vector(nn.Module):
    """Learned time embedding with one linear and one periodic channel.

    Every time step gets its own scalar weight and bias for each of the two
    channels, so all four parameters have shape ``[seq_len]``.
    """

    def __init__(self, seq_len):
        super().__init__()

        self.seq_len = seq_len

        # Creation order (linear weight, linear bias, periodic weight,
        # periodic bias) is kept so that seeded initialisation is unchanged.
        self.weights_linear = nn.Parameter(torch.randn(seq_len))
        self.bias_linear = nn.Parameter(torch.randn(seq_len))

        self.weights_periodic = nn.Parameter(torch.randn(seq_len))
        self.bias_periodic = nn.Parameter(torch.randn(seq_len))

    def forward(self, x):
        """Return the time embedding of ``x``.

        Args:
            x: tensor of shape ``[batch_size, seq_len, num_features]``.

        Returns:
            Tensor of shape ``[batch_size, seq_len, 2]``; channel 0 is the
            linear component, channel 1 the sinusoidal one.
        """
        # Collapse the feature axis to one scalar per time step: [batch_size, seq_len]
        step_mean = x.mean(dim=-1)

        linear_part = self.weights_linear * step_mean + self.bias_linear
        periodic_part = torch.sin(step_mean * self.weights_periodic + self.bias_periodic)

        # Put the two components side by side on a new trailing axis.
        return torch.stack((linear_part, periodic_part), dim=-1)

In [None]:
class SingleAttention(nn.Module):
    """One scaled dot-product attention head with its own Q/K/V projections."""

    def __init__(self, d_k, d_v, input_features):
        super().__init__()

        self.d_k = d_k

        # Projections are created in query, key, value order.
        self.query = nn.Linear(input_features, d_k)
        self.key = nn.Linear(input_features, d_k)
        self.value = nn.Linear(input_features, d_v)

    def forward(self, inputs):
        """Apply attention.

        Args:
            inputs: indexable of three tensors (query source, key source,
                value source), each ``[batch_size, seq_len, input_features]``.

        Returns:
            Tensor of shape ``[batch_size, seq_len, d_v]``.
        """
        queries = self.query(inputs[0])  # [batch_size, seq_len, d_k]
        keys = self.key(inputs[1])       # [batch_size, seq_len, d_k]
        values = self.value(inputs[2])   # [batch_size, seq_len, d_v]

        # Similarity of every query step with every key step, scaled by sqrt(d_k).
        scores = torch.bmm(queries, keys.transpose(1, 2)) / np.sqrt(self.d_k)
        # Normalise over the key axis so each query's weights sum to one.
        weights = torch.softmax(scores, dim=-1)

        return torch.bmm(weights, values)

In [None]:
class MultiAttention(nn.Module):
    """Several independent attention heads followed by an output projection."""

    def __init__(self, d_k, d_v, n_heads, input_features):
        super().__init__()

        self.n_heads = n_heads

        heads = []
        for _ in range(n_heads):
            heads.append(SingleAttention(d_k, d_v, input_features))
        self.attn_heads = nn.ModuleList(heads)

        # Maps the concatenated head outputs back to the input width.
        self.linear = nn.Linear(n_heads * d_v, input_features)

    def forward(self, inputs):
        """Run every head on ``inputs`` and merge the results.

        Args:
            inputs: the (query, key, value) triple passed unchanged to each head.

        Returns:
            Tensor of shape ``[batch_size, seq_len, input_features]``.
        """
        # Each head yields [batch_size, seq_len, d_v]; joining them on the last
        # axis gives [batch_size, seq_len, n_heads * d_v].
        per_head = tuple(head(inputs) for head in self.attn_heads)
        merged = torch.cat(per_head, dim=-1)

        return self.linear(merged)

In [None]:
class TransformerEncoderLayer(nn.Module):
    """Encoder block: multi-head attention and a position-wise feed-forward
    network, each followed by dropout, a residual connection and LayerNorm.

    Args:
        d_k: query/key width of every attention head.
        d_v: value width of every attention head.
        n_heads: number of attention heads.
        ff_dim: hidden width of the feed-forward network.
        input_features: number of raw input features. Two is added internally
            because the layer is fed the raw features concatenated with a
            2-channel time embedding.
        dropout: dropout probability used after attention and feed-forward.
    """

    def __init__(self, d_k, d_v, n_heads, ff_dim, input_features, dropout=0.1):
        super(TransformerEncoderLayer, self).__init__()

        # Actual channel width seen by this layer (raw features + 2 time channels).
        input_features = input_features + 2

        self.attn_multi = MultiAttention(d_k, d_v, n_heads, input_features)

        self.attn_dropout = nn.Dropout(dropout)
        self.attn_normalize = nn.LayerNorm(normalized_shape=(input_features,))

        # kernel_size=1 convolutions act as per-time-step linear layers.
        self.ff_conv1D_1 = nn.Conv1d(in_channels=input_features, out_channels=ff_dim, kernel_size=1)
        self.ff_conv1D_2 = nn.Conv1d(in_channels=ff_dim, out_channels=input_features, kernel_size=1)

        self.ff_dropout = nn.Dropout(dropout)
        self.ff_normalize = nn.LayerNorm(normalized_shape=(input_features,))

    def forward(self, inputs):
        """Apply the encoder block.

        Args:
            inputs: (query, key, value) triple of tensors shaped
                ``[batch_size, seq_len, input_features + 2]``; ``inputs[0]`` is
                also used as the residual branch.

        Returns:
            Tensor of shape ``[batch_size, seq_len, input_features + 2]``.
        """
        # --- attention sub-layer -------------------------------------------
        attn_layer = self.attn_multi(inputs)  # [batch_size, seq_len, channels]
        attn_layer = self.attn_dropout(attn_layer)
        attn_layer = self.attn_normalize(attn_layer + inputs[0])

        # --- feed-forward sub-layer ----------------------------------------
        # Conv1d expects channels first: [batch_size, channels, seq_len]
        ff_layer = self.ff_conv1D_1(attn_layer.transpose(1, 2))  # [batch_size, ff_dim, seq_len]
        # Non-linearity between the two projections; without it the two
        # convolutions collapse into a single linear map.
        ff_layer = torch.relu(ff_layer)
        ff_layer = self.ff_conv1D_2(ff_layer)  # [batch_size, channels, seq_len]
        ff_layer = self.ff_dropout(ff_layer)
        # Back to [batch_size, seq_len, channels] for the residual + LayerNorm.
        ff_layer = self.ff_normalize(ff_layer.transpose(1, 2) + attn_layer)

        return ff_layer

In [None]:
class TimeTransformer(nn.Module):
    """Sequence classifier: time embedding, four encoder layers, average
    pooling over time and a two-layer head ending in a sigmoid.
    """

    def __init__(self, seq_len, d_k, d_v, n_heads, ff_dim, input_features, num_classes=1, dropout=0.1):
        super().__init__()

        self.time_embedding = Time2Vector(seq_len)

        # Four identically configured encoder layers. They stay separate
        # attributes so the state-dict keys remain attn_layer1 ... attn_layer4.
        encoder_args = (d_k, d_v, n_heads, ff_dim, input_features, dropout)
        self.attn_layer1 = TransformerEncoderLayer(*encoder_args)
        self.attn_layer2 = TransformerEncoderLayer(*encoder_args)
        self.attn_layer3 = TransformerEncoderLayer(*encoder_args)
        self.attn_layer4 = TransformerEncoderLayer(*encoder_args)

        self.global_avg_pooling = nn.AdaptiveAvgPool1d(1)
        self.dropout = nn.Dropout(dropout)

        # The head consumes the raw features plus the 2 time-embedding channels.
        self.fc1 = nn.Linear(input_features + 2, 16)
        self.fc2 = nn.Linear(16, num_classes)

    def forward(self, x):
        """Return sigmoid outputs of shape ``[batch_size, num_classes]``.

        Args:
            x: tensor of shape ``[batch_size, seq_len, input_features]``.
        """
        # Append the 2-channel time embedding: [batch_size, seq_len, input_features + 2]
        x = torch.cat([x, self.time_embedding(x)], dim=-1)

        # Self-attention: the same tensor serves as query, key and value.
        for encoder in (self.attn_layer1, self.attn_layer2, self.attn_layer3, self.attn_layer4):
            x = encoder((x, x, x))

        # Average over the time axis: [batch_size, input_features + 2]
        pooled = self.global_avg_pooling(x.transpose(1, 2)).squeeze(-1)

        hidden = torch.relu(self.fc1(self.dropout(pooled)))

        # Sigmoid is applied here, so the output is already a probability.
        return torch.sigmoid(self.fc2(self.dropout(hidden)))

## Model Training

In [None]:
# Build the classifier from the window length and feature count measured on the
# training tensor, then place it on the selected device.
model = TimeTransformer(
    seq_len=window,
    d_k=128,
    d_v=128,
    n_heads=5,                    # number of attention heads per encoder layer
    ff_dim=64,                    # hidden width of the feed-forward network
    input_features=feature_size,
    num_classes=1,                # binary classification -> single output
)

# `device` is 'cuda' exactly when CUDA is available; moving to 'cpu' is a no-op.
model = model.to(device)

In [None]:
# Binary cross-entropy on probabilities (the model already applies a sigmoid).
criterion = nn.BCELoss()

# Adam with a small learning rate and light L2 regularisation.
optimizer = optim.Adam(
    params=model.parameters(),
    lr=1e-4,
    weight_decay=1e-5,
)

In [None]:
##### function to determine recall and far #####
def get_metrics(y_true, y_pred):
    """Compute recall and false-alarm rate for binary predictions.

    Args:
        y_true: tensor of ground-truth labels (0 or 1), any shape.
        y_pred: tensor of predicted probabilities with the same number of
            elements; values are rounded to the nearest integer to obtain
            hard 0/1 predictions.

    Returns:
        Tuple ``(recall, false_alarm_rate)``. A rate is ``0.0`` when its
        denominator is zero (no positive / no negative samples).
    """
    # Hard 0/1 labels as flat integer arrays on the host.
    y_pred = torch.round(y_pred).cpu().numpy().astype(int).ravel()
    y_true = y_true.cpu().numpy().astype(int).ravel()

    # labels=[0, 1] forces a 2x2 matrix even when only one class occurs;
    # otherwise the matrix would be 1x1 and the unpacking below would fail.
    c_matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])

    tn, fp, fn, tp = c_matrix.ravel()

    # calculate recall (sensitivity)
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0.0

    # calculate false alarm rate (false positive rate)
    false_alarm_rate = fp / (fp + tn) if (fp + tn) != 0 else 0.0

    return recall, false_alarm_rate

In [None]:
##### training #####
# Trains `model` for a fixed number of epochs. After every epoch the model is
# evaluated on `test_loader` (used here as the validation set) and a checkpoint
# is written whenever validation recall AND false-alarm rate both improve.
# NOTE(review): this cell unpacks two values from get_metrics; a later cell
# redefines get_metrics to return three, so re-running this cell after that
# one will fail until the two-value version is defined again.
num_epochs = 25

# Best validation scores seen so far. The initial values act as a minimum bar:
# nothing is saved until recall > 0.7 and false-alarm rate < 0.4.
optimum_validation_recall = 0.7
optimum_validation_far = 0.4


for epoch in range(num_epochs):

    # training phase (dropout active)
    model.train()

    # running sum of per-batch losses and number of batches, for the epoch mean
    epoch_loss = 0.0
    num_batches = 0

    # lists to store true labels and predictions for each epoch
    y_true = []; y_pred = []

    # wrap train_loader with tqdm for a progress bar
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f'Epoch {epoch + 1}/{num_epochs}')


    for batch_idx, (X, y) in progress_bar:

        # move the batch to the same device as the model
        X = X.to(device); y = y.to(device)

        # standard optimisation step: clear gradients, forward, backward, update
        optimizer.zero_grad()
        outputs = model(X)

        # BCELoss needs floating-point targets
        loss = criterion(outputs, y.float())
        loss.backward()

        optimizer.step()

        # update running loss and batch count
        epoch_loss += loss.item()
        num_batches += 1

        # accumulate true and predicted values (detached: no autograd history kept)
        y_true.append(y.detach())
        y_pred.append(outputs.detach())



    progress_bar.close()

    # convert lists to tensors
    y_true = torch.cat(y_true); y_pred = torch.cat(y_pred)

    # training metrics come from the outputs produced during the updates above,
    # i.e. in train mode and with weights that changed from batch to batch
    train_recall, train_far = get_metrics(y_true, y_pred)



    # evaluation (dropout disabled)
    model.eval()

    validation_loss = 0.0
    validation_batches = 0

    # the same names are reused for the validation labels and predictions
    y_true = []; y_pred = []


    # no gradients are needed while scoring the validation data
    with torch.no_grad():

        for X, y in test_loader:

            X = X.to(device); y = y.to(device)

            outputs = model(X)
            loss = criterion(outputs, y.float())

            validation_loss += loss.item()
            validation_batches += 1

            y_true.append(y.detach())
            y_pred.append(outputs.detach())



    # calculate validation metrics
    y_true = torch.cat(y_true)
    y_pred = torch.cat(y_pred)

    validation_recall, validation_far = get_metrics(y_true, y_pred)

    # mean of the per-batch losses
    validation_loss = validation_loss / validation_batches



    print('Reports...........')

    print(f'Training ---> Loss: {epoch_loss / num_batches:.4f}, Recall: {train_recall:.4f}, False Alarm Rate: {train_far:.4f}')
    print(f'Validity ---> Loss: {validation_loss:.4f}, Recall: {validation_recall:.4f}, False Alarm Rate: {validation_far:.4f}')



    # save a checkpoint only when recall is higher AND the false-alarm rate is
    # lower than the best values seen so far; both records are then updated

    if (validation_recall > optimum_validation_recall) and (validation_far < optimum_validation_far):

        optimum_validation_recall = validation_recall
        optimum_validation_far = validation_far

        print('\nValidation Recall Improved, Saving Model...')

        # the checkpoint file is named after the 1-based epoch number
        torch.save(model.state_dict(), f'../Transformer/model/{epoch + 1}.pth')

In [None]:
# ##### load the best model after training is complete #####
# model.load_state_dict(torch.load('../Transformer/model/.pth'))

## Model Evaluation

In [None]:
##### function to determine recall and far #####
# NOTE: this definition replaces the earlier two-value get_metrics of the same
# name; it additionally returns the confusion matrix.
def get_metrics(y_true, y_pred):
    """Compute the confusion matrix, recall and false-alarm rate.

    Args:
        y_true: tensor of ground-truth labels (0 or 1), any shape.
        y_pred: tensor of predicted probabilities with the same number of
            elements; values are rounded to the nearest integer to obtain
            hard 0/1 predictions.

    Returns:
        Tuple ``(c_matrix, recall, false_alarm_rate)`` where ``c_matrix`` is
        the 2x2 matrix ``[[tn, fp], [fn, tp]]``. A rate is ``0.0`` when its
        denominator is zero (no positive / no negative samples).
    """
    # Hard 0/1 labels as flat integer arrays on the host.
    y_pred = torch.round(y_pred).cpu().numpy().astype(int).ravel()
    y_true = y_true.cpu().numpy().astype(int).ravel()

    # labels=[0, 1] forces a 2x2 matrix even when only one class occurs;
    # otherwise the matrix would be 1x1 and the unpacking below would fail.
    c_matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])

    tn, fp, fn, tp = c_matrix.ravel()

    # calculate recall (sensitivity)
    recall = tp / (tp + fn) if (tp + fn) != 0 else 0.0

    # calculate false alarm rate (false positive rate)
    false_alarm_rate = fp / (fp + tn) if (fp + tn) != 0 else 0.0

    return c_matrix, recall, false_alarm_rate

In [None]:
##### evaluate the model
# Scores the current `model` on every batch of `test_loader` and prints the
# confusion matrix, recall and false-alarm rate.
model.eval()  # disable dropout

# per-batch labels and predictions
y_true = []; y_pred = []


with torch.no_grad():  # inference only, no autograd bookkeeping

    for X, y in test_loader:

        X = X.to(device); y = y.to(device)

        outputs = model(X)

        # Keep one tensor per batch; extending the list with a tensor would
        # iterate it row by row and create one Python object per sample.
        y_pred.append(outputs)
        y_true.append(y)


# join the batches along the sample axis
y_true = torch.cat(y_true)
y_pred = torch.cat(y_pred)

# calculate metrics
c_matrix, recall, false_alarm_rate = get_metrics(y_true, y_pred)


print(c_matrix)
print(f'Recall: {recall:.4f}')
print(f'False Alarm Rate: {false_alarm_rate:.4f}')