In [None]:
# # Crash on purpose to get more ram :
# import torch
# torch.tensor([10.]*10000000000)

In [None]:
import os
from IPython.display import clear_output

In [None]:
!pip install transformers -U 
!pip install sentencepiece
clear_output()

In [None]:
# Project folder; the path is relative and points into the Google Drive mounted in a later cell.
PROJECT_PATH = "drive/MyDrive/Thesis/experiments/SecReq"
# Directory passed to TrainingArguments(output_dir=...) for the training checkpoints.
CHECKPOINTS_PATH = os.path.join(PROJECT_PATH, "checkpoints_gpu")

## Data downloading

In [None]:
# Folder read by read_secreq(); the download cell below unzips the archive into a folder of the same name.
DATA_FOLDER = "SecReq"

In [None]:
# Download the SecReq archive from Dropbox, unpack it into ./SecReq and remove the zip file.
!wget -O SecReq.zip https://www.dropbox.com/sh/mcvx5ium0zx7bly/AABfJaFt0nWvjiNJs1RUYf_Pa?dl=1
!unzip SecReq.zip -d SecReq
!rm SecReq.zip
# Hide the shell output produced by the commands above.
clear_output()

In [None]:
from google.colab import drive

# Mount Google Drive; PROJECT_PATH is a relative path that points into this mount
# (presumably resolved against /content, the default Colab working directory — confirm).
drive.mount('/content/drive')

# Exported so that shell (`!`) commands can refer to $PROJECT_PATH.
os.environ["PROJECT_PATH"] = PROJECT_PATH

# Create the project folder together with any missing parent folders. A plain `mkdir`
# fails when an intermediate folder of the nested path does not exist yet.
os.makedirs(PROJECT_PATH, exist_ok=True)

Mounted at /content/drive


## Script

In [None]:
import dataclasses
import os
import re
import sys

import numpy as np
import pandas as pd
import torch

from torch.utils.data import Dataset
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    Trainer, TrainingArguments,
)

# Checkpoint name passed to from_pretrained() for both the tokenizer and the model.
MODEL_NAME = "t5-small"
# Maximum token length passed to the tokenizer when encoding inputs and targets.
MAX_LENGTH = 100

In [None]:
# Tokenizer matching MODEL_NAME; shared by the dataset class and the label-id mapping below.
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
# Hide the download progress output.
clear_output()

## Data Preparation


In [None]:
class SecReqDataset(Dataset):
  """Torch dataset of tokenized SecReq examples in T5 text-to-text form.

  The first column of the given dataframe is the requirement text and is
  prefixed with "security classification: ". When ``train`` is True the second
  column is used as the target text and tokenized into ``labels``.

  Args:
    original_dataframe: dataframe whose first column holds the text and, for
      ``train=True``, whose second column holds the target string.
    tokenizer: callable tokenizer (a ``T5Tokenizer`` in this notebook).
    train: whether target labels are available and should be encoded.
  """

  def __init__(self, original_dataframe, tokenizer, train=True):
    self.tokenizer = tokenizer
    self.train = train
    self._load_dataset(original_dataframe)

  def __getitem__(self, idx):
    """Return one example as a dict of tensors keyed like ``self.data``."""
    item = {}
    for key, values in self.data.items():
      value = values[idx]
      if isinstance(value, torch.Tensor):
        # The inference branch stores tensors; copy them directly instead of
        # going through torch.tensor(), which warns when given a tensor.
        item[key] = value.clone().detach()
      else:
        item[key] = torch.tensor(value)
    return item

  def __len__(self):
    return len(self.data["input_ids"])

  def _load_dataset(self, dataframe):
    """Format the raw dataframe and tokenize it into ``self.data``."""
    dataset_df = self._format2t5(dataframe)
    self.data = self._convert_to_features(dataset_df)

  def _format2t5(self, df):
    """Build a frame with an ``inputs`` column (and ``targets`` when training).

    Columns are read by position (``iloc``) so the result does not depend on
    the column labels; the original row index is preserved.
    """
    texts = df.iloc[:, 0]
    columns = {"inputs": [f"security classification: {text}" for text in texts]}
    if self.train:
      columns["targets"] = df.iloc[:, 1].to_list()
    return pd.DataFrame(columns, index=df.index)

  def _convert_to_features(self, df):
    """Tokenize the formatted frame and return the feature dict kept in ``self.data``."""
    inputs = df.inputs.to_list()
    if self.train:
      # `text_target` tokenizes the targets into `labels`. It replaces the
      # deprecated `prepare_seq2seq_batch` and keeps its settings: pad to the
      # longest sequence and truncate both sides to MAX_LENGTH.
      encodings = self.tokenizer(
          inputs,
          text_target=df.targets.to_list(),
          padding="longest",
          truncation=True,
          max_length=MAX_LENGTH,
      )
      keys = ('input_ids', 'attention_mask', 'labels')
    else:
      encodings = self.tokenizer(inputs, padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
      keys = ('input_ids', 'attention_mask')

    return {key: encodings[key] for key in keys}

In [None]:
def read_secreq(path):
    """Read every ';'-separated file in ``path`` into one (Text, Label) dataframe.

    Files are read in sorted name order so the row order is deterministic.
    Rows with a missing value are dropped and the remaining texts are stripped
    of surrounding whitespace. The per-file row indices are kept as they are,
    so the index of the result may contain duplicates.

    Args:
        path: directory containing the header-less data files.

    Returns:
        DataFrame with the columns "Text" and "Label"; empty when the directory
        contains no files.
    """
    columns = ["Text", "Label"]
    frames = [
        pd.read_csv(
            os.path.join(path, name),
            sep=";",
            header=None,
            names=columns,
            engine="python",
        )
        for name in sorted(os.listdir(path))
    ]
    if not frames:
        # pd.concat raises on an empty list; keep returning an empty frame instead.
        return pd.DataFrame(columns=columns)

    # DataFrame.append was removed in pandas 2.0, so concatenate all frames once.
    # Missing values are dropped before stripping so a NaN text cannot break the strip.
    dataset = pd.concat(frames).dropna()
    return dataset.assign(Text=dataset["Text"].astype(str).str.strip())

In [None]:
print("===Started data preprocessing===")

# Fixed seed so the train/validation split, and therefore the reported metrics,
# is reproducible across runs.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8

full_train = read_secreq(DATA_FOLDER)
# Each row goes to the training split with probability TRAIN_FRACTION;
# the remaining rows form the validation split.
msk = np.random.default_rng(SPLIT_SEED).random(len(full_train)) < TRAIN_FRACTION
# NOTE: the name `train` is rebound to the training function in a later cell.
train = full_train[msk]
valid = full_train[~msk]

# Both splits carry labels, hence train=True for the validation set as well.
train_dataset = SecReqDataset(train, tokenizer, True)
valid_dataset = SecReqDataset(valid, tokenizer, True)
# The tokenized datasets are saved so the training function can load them from disk.
torch.save(train_dataset, 'train_data.pt')
torch.save(valid_dataset, 'valid_data.pt')

print("===Finished data preprocessing===")

===Started data preprocessing===
===Finished data preprocessing===


In [None]:
len(valid_dataset)

113

In [None]:
SEC = "sec"
NONSEC = "nonsec"
OTHER_LABEL = "other"

# Token ids of the two valid target strings, padded to a common length.
# The tokenizer is called directly instead of the deprecated `prepare_seq2seq_batch`,
# with the same padding/truncation settings that method applied.
# NOTE(review): the lookup in compute_metrics only matches if these sequences have the same
# padded length as the dataset labels — presumably true while the targets are only
# "sec"/"nonsec"; confirm.
sec_idxs, non_sec_idxs = tokenizer([SEC, NONSEC], padding="longest", truncation=True)['input_ids']

# Maps a full token-id sequence to the binary class id (1 = sec, 0 = nonsec).
idxs_to_label = {
    tuple(sec_idxs): 1,
    tuple(non_sec_idxs): 0,
}

## Training

In [None]:
print("===Started model loading===")
# Use the GPU when one is available; fall back to the CPU instead of failing on `.cuda()`.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
print("===Finished model loading===")

===Started model loading===


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1197.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=242065649.0, style=ProgressStyle(descri…


===Finished model loading===


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Turn generated token ids into binary labels and score them.

    Each label/prediction row is looked up in ``idxs_to_label``; a row that is
    not one of the known token sequences maps to -1. Accuracy is computed over
    all rows, so an unrecognised prediction counts as an error. Precision,
    recall and F1 are computed only over rows whose prediction was recognised.

    Args:
        pred: evaluation output exposing ``label_ids`` and ``predictions``;
            ``predictions[0]`` is arg-maxed over its last axis
            (presumably the logits — confirm).

    Returns:
        dict with 'accuracy', 'f1', 'precision', 'recall' and
        'wrong_predictions' (number of unrecognised predictions).
    """
    labels = pred.label_ids
    preds = pred.predictions[0].argmax(-1)

    def _convert_to_labels(idxs):
      return idxs_to_label.get(tuple(idxs), -1)

    # `np.int` was removed in NumPy 1.24; the builtin `int` is the equivalent dtype.
    targets = np.fromiter(map(_convert_to_labels, labels), dtype=int)
    predictions = np.fromiter(map(_convert_to_labels, preds), dtype=int)
    wrong_predictions = np.where(predictions == -1)[0]
    wrong_predictions_number = wrong_predictions.shape[0]

    # Accuracy is taken before filtering so unrecognised outputs lower it.
    acc = accuracy_score(targets, predictions)

    valid_targets = np.delete(targets, wrong_predictions)
    valid_predictions = np.delete(predictions, wrong_predictions)
    if valid_predictions.size == 0:
        # No prediction could be decoded: report zeros instead of scoring empty arrays.
        precision = recall = f1 = 0.0
    else:
        precision, recall, f1, _ = precision_recall_fscore_support(
            valid_targets, valid_predictions, average='binary', zero_division=0
        )

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'wrong_predictions': wrong_predictions_number,
    }

In [None]:
def train():
  """Fine-tune the module-level ``model`` on the saved SecReq datasets.

  Loads 'train_data.pt' and 'valid_data.pt' from the working directory, runs a
  ``Trainer`` that writes checkpoints to ``CHECKPOINTS_PATH`` and evaluates with
  ``compute_metrics``.

  Returns:
    The same module-level ``model`` object after training.
  """
  # NOTE(review): newer transformers releases may expect `eval_strategy` instead of
  # `evaluation_strategy` — confirm against the installed version.
  training_args = TrainingArguments(
      output_dir=CHECKPOINTS_PATH,
      num_train_epochs=400,
      save_steps=1000,
      save_total_limit=5,
      warmup_steps=300,
      weight_decay=0.01,
      logging_dir='./logs',
      logging_steps=10,
      evaluation_strategy="steps",
      # evaluation_strategy="epoch",
  )

  # The files hold pickled SecReqDataset objects written by this notebook, not plain
  # tensors, so they need full unpickling. Recent PyTorch versions default to
  # `weights_only=True`, which rejects such objects.
  train_dataset = torch.load('train_data.pt', weights_only=False)
  valid_dataset = torch.load('valid_data.pt', weights_only=False)

  print("===Started model training===")
  trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
  )

  trainer.train()
  print("===Finished model training===")

  return model

model = train()

===Started model training===


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Wrong Predictions,Runtime,Samples Per Second
10,13.0509,11.816146,0.0,0.0,0.0,0.0,113,0.4205,268.698
20,13.1274,11.635314,0.0,0.0,0.0,0.0,113,0.4653,242.866
30,13.2076,11.337351,0.0,0.0,0.0,0.0,113,0.4272,264.492
40,11.7471,10.928904,0.0,0.0,0.0,0.0,113,0.4773,236.754
50,11.8955,10.445702,0.0,0.0,0.0,0.0,113,0.4346,260.004
60,10.9894,9.883946,0.0,0.0,0.0,0.0,113,0.4919,229.712
70,10.4201,9.197437,0.0,0.0,0.0,0.0,113,0.4527,249.624
80,9.9664,8.5437,0.0,0.0,0.0,0.0,113,0.4931,229.148
90,8.9244,7.806835,0.0,0.0,0.0,0.0,113,0.4916,229.882
100,7.5792,6.930985,0.0,0.0,0.0,0.0,113,0.4948,228.357


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


===Finished model training===


In [None]:
# Store the trained model under the project folder.
# NOTE(review): `save_pretrained` presumably writes a directory rather than a single file, so
# "t5-small.pt" would be a folder name — confirm. The tokenizer is not saved by this cell.
model.save_pretrained(os.path.join(PROJECT_PATH, "t5-small.pt"))