In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import scipy.io as sio
import time
import os

import matplotlib.pyplot as plt
import seaborn as sn

In [None]:
# Report the PyTorch build and select the device used by the rest of the notebook.
print(torch.__version__)
print(torch.cuda.is_available())
if torch.cuda.is_available():
    # Only query the GPU name when CUDA is present; the call raises on CPU-only machines.
    print(torch.cuda.get_device_name(0))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
# --- Experiment switches ---
gan = False        # True: train on the pickle whose file name contains 'GAN'
sampler = False    # True (and gan False): draw training batches with a weighted sampler

# --- Optimisation hyper-parameters ---
batch_size = 128
num_epoch = 30
learning_rate = 1e-4   # initial step size handed to the optimizer

# Dataset

In [None]:
class GPVSDataset(Dataset):
    """Map-style dataset yielding one multivariate sequence and its label per DataFrame row.

    Every cell of the ``feature_columns`` columns is read as one array-like
    sequence. Presumably all sequences of a row have the same length so they
    can be stacked into a 2-D array -- TODO confirm against the stored data.
    """

    def __init__(self, df, feature_columns):
        """
        Args:
            df: DataFrame holding the feature columns and a numeric 'Fault_type' column.
            feature_columns: names of the columns that contain the per-channel sequences.
        """
        super().__init__()
        self.df = df
        self.sequences = self.df[feature_columns]
        self.labels = self.df['Fault_type']

    def __len__(self):
        """Number of rows (samples) in the dataset."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``{'sequence': float tensor, 'label': long tensor}`` for the row at position ``idx``."""
        # Stack the per-column sequences, then transpose so that the columns of the
        # result correspond to the feature columns.
        sequence = np.array([np.array(item) for item in self.sequences.iloc[idx, :]]).T
        # Positional lookup, consistent with the .iloc used for the sequences: a plain
        # ``self.labels[idx]`` is label-based and breaks when the DataFrame index is
        # not 0..n-1 (e.g. after filtering or a train/test split).
        label = self.labels.iloc[idx]
        return dict(
            sequence = torch.as_tensor(sequence, dtype=torch.float32),
            label = torch.tensor(label).long()
        )

In [None]:
dataset_folder = '/kaggle/input/gpvs-gan-dataset/gpvs_sl200_s15'
# Sorted so the "first match" picked below does not depend on the OS listing order.
filenames = sorted(os.listdir(dataset_folder))


def _load_split(tag):
    """Load the first pickle in ``dataset_folder`` whose file name contains ``tag``.

    Raises:
        FileNotFoundError: if no file name in the folder contains ``tag``.
    """
    matches = [filename for filename in filenames if tag in filename]
    if not matches:
        raise FileNotFoundError(f"No file containing '{tag}' found in {dataset_folder}")
    # NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
    return pd.read_pickle(os.path.join(dataset_folder, matches[0]))


# The training split is either the 'GAN' file or the 'TRAIN' file, depending on the switch.
train_df = _load_split('GAN' if gan else 'TRAIN')
val_df = _load_split('VALI')
test_df = _load_split('TEST')

# Fit the encoder on the training labels only, then apply the same mapping to every split.
le = LabelEncoder()
le.fit(train_df['Fault_type'])

train_df['Fault_type'] = le.transform(train_df['Fault_type'])
val_df['Fault_type'] = le.transform(val_df['Fault_type'])
test_df['Fault_type'] = le.transform(test_df['Fault_type'])

In [None]:
# Report how many rows each split contains (one line per split).
print(
    f'Train set lenght: {len(train_df)}',
    f'Validation set lenght: {len(val_df)}',
    f'Test set lenght: {len(test_df)}',
    sep='\n',
)

In [None]:
# Show the classes stored on the label encoder that was fitted on the training labels.
le.classes_

# DataLoader

In [None]:
def getSampler(df, label_column):
    """Build a class-balancing ``WeightedRandomSampler`` for the rows of ``df``.

    Each row receives the weight ``1 / (number of rows sharing its label)``, so
    every class has the same total probability of being drawn.

    Args:
        df: DataFrame with one row per sample. It is NOT modified.
        label_column: name of the column holding the class label.

    Returns:
        A sampler drawing ``len(df)`` indices per epoch, with replacement.
    """
    labels = df[label_column]
    class_counts = labels.value_counts()
    # Map every row to the inverse frequency of its class without adding a
    # helper column to the caller's DataFrame.
    sample_weights = 1.0 / labels.map(class_counts)

    weights = torch.as_tensor(sample_weights.to_numpy(), dtype=torch.double)

    sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights), replacement=True)
    return sampler

In [None]:
# Columns whose cells hold the per-channel sequences fed to the model.
feature_columns = ['Ipv', 'Vpv', 'Vdc', 'ia', 'ib', 'ic', 'va', 'vb', 'vc', 'Iabc', 'If', 'Vabc', 'Vf']

train_data = GPVSDataset(train_df, feature_columns)
if sampler and not gan:
    # Weighted sampling: the sampler decides the order, so shuffle stays off.
    train_sampler = getSampler(train_df, 'Fault_type')
    train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler, shuffle=False)
else:
    train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

val_data = GPVSDataset(val_df, feature_columns)
val_dataloader = DataLoader(val_data, batch_size=batch_size)

dataloaders = {
    'train': train_dataloader,
    'validation': val_dataloader
}

# Number of samples per split. len(dataloader) would give the number of batches instead.
dataset_sizes = {x: len(dataloaders[x].dataset) for x in ['train', 'validation']}

In [None]:
def countLabels(dataloader, class_names):
    """Count how many samples of each class a dataloader yields.

    Args:
        dataloader: iterable of batches; each batch is a mapping whose 'label'
            entry is a tensor of integer class indices.
        class_names: sequence of class names; position ``i`` names class index ``i``.

    Returns:
        dict mapping every class name to the number of labels equal to its index.
    """
    labels_count = dict.fromkeys(class_names, 0)

    for batch in dataloader:
        batch_labels = batch['label']
        for class_idx, class_name in enumerate(class_names):
            # Count the entries of this batch that belong to the current class.
            labels_count[class_name] += int((batch_labels == class_idx).sum())

    return labels_count

In [None]:
#labels_count = countLabels(train_dataloader, le.classes_)
#labels_count

# Model

In [None]:
class LSTM_FCN(nn.Module):
    """Stacked LSTM followed by a linear classification head.

    Args:
        n_features: size of each time step's feature vector (LSTM ``input_size``).
        n_classes: number of output logits.
        n_hidden: LSTM hidden size.
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability applied between stacked LSTM layers.
            Ignored when ``n_layers == 1``, because nn.LSTM only applies
            dropout between layers and warns otherwise.
    """

    def __init__(self, n_features, n_classes, n_hidden=256, n_layers=3, dropout=0.2):
        super().__init__()

        self.lstm = nn.LSTM(
            input_size = n_features,
            hidden_size = n_hidden,
            batch_first = True,
            num_layers = n_layers,  # Stack LSTMs
            dropout = dropout if n_layers > 1 else 0.0
        )

        self.classifier = nn.Linear(n_hidden, n_classes)

    def forward(self, x):
        """Return class logits for a batch-first input ``x``."""
        self.lstm.flatten_parameters()  # For distributed training

        _, (hidden, _) = self.lstm(x)
        # Feed the final hidden state of the last (top) LSTM layer to the
        # linear classification layer.
        out = hidden[-1]

        return self.classifier(out)

In [None]:
# Build the network with one input feature per feature column and one output
# per encoded class, then move it to the selected device.
model = LSTM_FCN(
    n_features=len(feature_columns),
    n_classes=len(le.classes_),
).to(device)

# Loss, optimizer, and a scheduler that is stepped with the validation loss.
criterion = nn.CrossEntropyLoss()
optimizer_ft = optim.Adam(model.parameters(), lr=learning_rate)
# NOTE(review): `verbose` is reported as deprecated in recent PyTorch releases -- confirm
# against the installed version before upgrading.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer_ft, patience=3, verbose=True)

In [None]:
def _run_epoch(dataloader, model_conv, criterionCNN, run_device, optimizer_conv=None):
    """Run one full pass over ``dataloader`` and return ``(mean_loss, accuracy, n_samples)``.

    Trains the model when ``optimizer_conv`` is given; otherwise the model is put
    in eval mode and the pass runs with gradient tracking disabled.
    """
    training = optimizer_conv is not None
    model_conv.train(training)

    running_loss = 0.0
    running_correct = 0
    total_size = 0

    with torch.set_grad_enabled(training):
        for data in dataloader:
            inputs = data['sequence'].float().to(run_device)
            labels = data['label'].to(run_device)

            if training:
                optimizer_conv.zero_grad()

            y = model_conv(inputs)
            preds = torch.argmax(y, dim=1)
            # Assumes the criterion returns the batch mean, hence the re-weighting
            # by the batch size below -- TODO confirm for custom criteria.
            lossCNN = criterionCNN(y, labels)

            if training:
                lossCNN.backward()
                optimizer_conv.step()

            n_batch = inputs.size(0)
            running_loss += lossCNN.item() * n_batch
            running_correct += torch.sum(preds == labels).item()
            total_size += n_batch

    return running_loss / total_size, running_correct / total_size, total_size


# Train the model and keep the weights that perform best on the validation set.
def train_loop_validation(dataloaders, startEpoch, numEpochs, model_conv, criterionCNN, optimizer_conv, scheduler,
                          best_acc, best_loss, best_epoca, outputPath):
    """Train for epochs ``startEpoch..numEpochs`` (inclusive), validating after each epoch.

    Files written under the ``outputPath`` prefix:
      * ``train_weights.pth``      -- weights after every epoch (to resume an interrupted run)
      * ``best_model_weights.pth`` -- weights of the best epoch on the validation set
      * ``learningRate.txt``, ``lossTrain.txt``, ``AccTrain.txt``, ``lossVal.txt``,
        ``AccVal.txt``             -- one appended line per epoch
      * ``check_point.mat``        -- best accuracy/loss/epoch and the last finished epoch

    Args:
        dataloaders: mapping with 'train' and 'validation' iterables; each batch is a
            mapping with a 'sequence' tensor and a 'label' tensor.
        startEpoch: first epoch number (used for logging and resuming).
        numEpochs: last epoch number, inclusive.
        model_conv: the network to train.
        criterionCNN: loss callable taking ``(logits, labels)``.
        optimizer_conv: optimizer updating ``model_conv``.
        scheduler: scheduler stepped once per epoch with the validation loss.
        best_acc, best_loss, best_epoca: best validation accuracy / loss / epoch so far.
        outputPath: string prefix prepended to every output file name.

    Returns:
        Tuple ``(best_acc, best_loss, best_epoca)`` after the last epoch.
    """
    # Run batches on whatever device the model lives on instead of relying on a
    # notebook global (or an unconditional .cuda(), which fails on CPU-only hosts).
    first_param = next(model_conv.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    for epochs in range(startEpoch, numEpochs + 1):
        since = time.time()

        modelLoss_epoch_train, modelAcc_epoch_train, totalSize = _run_epoch(
            dataloaders['train'], model_conv, criterionCNN, run_device, optimizer_conv)

        # Save the weights at every epoch so training can be resumed if it is interrupted.
        torch.save(model_conv.state_dict(), outputPath + 'train_weights.pth')

        modelLoss_epoch_val, modelAcc_epoch_val, totalSize_val = _run_epoch(
            dataloaders['validation'], model_conv, criterionCNN, run_device)

        time_elapsed = time.time() - since

        scheduler.step(modelLoss_epoch_val)

        print('[Epoch %d][TRAIN on %d [Loss: %.4f  ACC: %.4f]][VAL on %d [Loss: %.4f  ACC: %.4f]][TIME: %.0f m %.0f s]'
              % (epochs, totalSize, modelLoss_epoch_train, modelAcc_epoch_train, totalSize_val, modelLoss_epoch_val,
                 modelAcc_epoch_val, time_elapsed // 60, time_elapsed % 60))

        # Better validation accuracy wins; equal accuracy is broken by lower validation loss.
        if (modelAcc_epoch_val > best_acc) or (modelAcc_epoch_val == best_acc and modelLoss_epoch_val < best_loss):
            print('     .... Saving best weights ....')
            best_acc = modelAcc_epoch_val
            best_loss = modelLoss_epoch_val
            best_epoca = epochs
            # Save the best weights on the validation set.
            torch.save(model_conv.state_dict(), outputPath + 'best_model_weights.pth')

        # Append this epoch's metrics; the learning rate is read from the optimizer
        # that was passed in, after the scheduler step.
        epoch_logs = (
            ('learningRate.txt', optimizer_conv.param_groups[0]['lr']),
            ('lossTrain.txt', modelLoss_epoch_train),
            ('AccTrain.txt', modelAcc_epoch_train),
            ('lossVal.txt', modelLoss_epoch_val),
            ('AccVal.txt', modelAcc_epoch_val),
        )
        for log_name, value in epoch_logs:
            with open(outputPath + log_name, "a") as file_object:
                file_object.write(str(value) + '\n')

        sio.savemat(outputPath + 'check_point.mat', {'best_acc': best_acc,
                                                     'best_loss': best_loss,
                                                     'best_epoca': best_epoca,
                                                     'last_epoch': epochs})

    return best_acc, best_loss, best_epoca

In [None]:
startEpoch = 1
best_acc = 0
# Start from +inf: the training loop breaks accuracy ties with
# `modelLoss_epoch_val < best_loss`, which can never be true for a
# non-negative loss when the initial best loss is 0.
best_loss = float('inf')
best_epoca = 0
WeightPath = './'
train_loop_validation(dataloaders, startEpoch, num_epoch, model, criterion, optimizer_ft, scheduler, best_acc, best_loss, best_epoca, WeightPath)

In [None]:
def _read_floats(path):
    """Return the values of a text file that stores one float per line."""
    # The context manager closes the file even if a line fails to parse.
    with open(path, 'r') as handle:
        return [float(line) for line in handle]


WeightPath = './'

# Per-epoch histories appended by the training loop.
lossModel_Train = _read_floats(WeightPath + 'lossTrain.txt')
lossModel_val = _read_floats(WeightPath + 'lossVal.txt')
accModel_Train = _read_floats(WeightPath + 'AccTrain.txt')
accModel_val = _read_floats(WeightPath + 'AccVal.txt')
lrs = _read_floats(WeightPath + 'learningRate.txt')

plt.figure()
plt.title("Model: Training Vs Validation Losses")
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.plot(list(range(1, len(lossModel_Train) + 1)), lossModel_Train, color='r', label="Training Loss")
plt.plot(list(range(1, len(lossModel_val) + 1)), lossModel_val, color='g', label="Validation Loss")
plt.legend()
plt.savefig(WeightPath + 'LossTrainVal.png')

plt.figure()
plt.title("Training Vs Validation Accuracies")
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.plot(list(range(1, len(accModel_Train) + 1)), accModel_Train, color='r', label="Training Accuracy")
plt.plot(list(range(1, len(accModel_val) + 1)), accModel_val, color='g', label="Validation Accuracy")
plt.legend()
plt.savefig(WeightPath + 'AccTrainVal.png')

plt.figure()
plt.title("Learning Rate")
plt.xlabel('Epoch')
plt.ylabel('lr')
plt.plot(list(range(1, len(lrs) + 1)), lrs, color='b')
# Saved under its own name; reusing 'AccTrainVal.png' here would overwrite the accuracy plot.
plt.savefig(WeightPath + 'LearningRate.png')

## Load best weights

In [None]:
weight_path = './best_model_weights.pth'
# map_location remaps the stored tensors onto the active device, so weights saved
# on a GPU can also be loaded on a CPU-only machine.
checkpoint = torch.load(weight_path, map_location=device)
model.load_state_dict(checkpoint)

# Test

In [None]:
# Reset the index so the dataset's rows are addressed 0..n-1.
test_data = GPVSDataset(test_df.reset_index(drop=True), feature_columns)
# Batched, unshuffled inference: the row order of Test_results follows test_df.
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)

correct = 0
total = 0
model.eval()
true_labels = []
predicted_labels = []

# since we're not training, we don't need to calculate the gradients for our outputs
with torch.no_grad():
    for data in test_dataloader:
        inputs, labels = data['sequence'].to(device), data['label'].to(device)

        # calculate outputs by running the sequences through the network
        outputs = model(inputs)
        # the class with the highest score is what we choose as prediction
        predicted = torch.argmax(outputs, dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        true_labels.extend(labels.cpu().tolist())
        predicted_labels.extend(predicted.cpu().tolist())

# Build the results frame once; concatenating one row per sample is quadratic.
Test_results = pd.DataFrame({'label': true_labels, 'pred': predicted_labels})

# True division keeps the fractional part that floor division would drop.
print(f'Accuracy of the network on the test sequences: {100 * correct / total:.2f} %')

In [None]:
# Pass every encoded class explicitly so the matrix always has one row/column per
# entry of le.classes_, even if a class never appears in the test results.
class_ids = np.arange(len(le.classes_))
c = confusion_matrix(Test_results['label'], Test_results['pred'], labels=class_ids)
df_cm = pd.DataFrame(c, index=le.classes_, columns=le.classes_)
plt.figure(figsize=(20, 20))
# fmt='d' prints the counts as plain integers.
ax = sn.heatmap(df_cm, annot=True, fmt='d')
# Label the axes after drawing: heatmap sets the axis labels itself, which would
# replace labels assigned beforehand.
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

In [None]:
# Pass the encoded class ids explicitly so target_names always lines up with the
# reported labels, even when a class is missing from the test results.
print(classification_report(Test_results['label'], Test_results['pred'],
                            labels=np.arange(len(le.classes_)), target_names=le.classes_))