In [None]:
# # Crash on purpose to get more ram :
# import torch
# torch.tensor([10.]*10000000000)

In [1]:
import os
from IPython.display import clear_output

In [2]:
!pip install transformers -U 
!pip install sentencepiece
clear_output()

In [3]:
PROJECT_PATH = "drive/MyDrive/Thesis/experiments/Full_Train"
CHECKPOINTS_PATH = os.path.join(PROJECT_PATH, "checkpoints_gpu")

## Data downloading

In [4]:
DATA_PATH = "data.csv"

In [5]:
!wget -O data.csv https://raw.githubusercontent.com/reconrus/Security_Requirements_Extraction/a7261a4fbb77d0eafdee8369697590e4551bb7fc/result.csv

--2021-03-09 23:56:23--  https://raw.githubusercontent.com/reconrus/Security_Requirements_Extraction/a7261a4fbb77d0eafdee8369697590e4551bb7fc/result.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 407363 (398K) [text/plain]
Saving to: ‘data.csv’


2021-03-09 23:56:23 (46.1 MB/s) - ‘data.csv’ saved [407363/407363]



In [6]:
from google.colab import drive
drive.mount('/content/drive')
os.environ["PROJECT_PATH"] = PROJECT_PATH
! [ ! -d $PROJECT_PATH ] && mkdir $PROJECT_PATH

Mounted at /content/drive


## Script

In [7]:
import dataclasses
import os
import re
import sys

import numpy as np
import pandas as pd
import torch

from torch.utils.data import Dataset
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    Trainer, TrainingArguments,
)

MODEL_NAME = "t5-small"
MAX_LENGTH = 100

In [8]:
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
clear_output()

## Data Preparation


In [9]:
class SecReqDataset(Dataset):
  def __init__(self, original_dataframe, tokenizer, train=True):
    self.tokenizer = tokenizer
    self.train = train
    self._load_dataset(original_dataframe)

  def __getitem__(self, idx):
    item = {key: torch.tensor(val[idx]) for key, val in self.data.items()}
    return item

  def __len__(self):
    return len(self.data["input_ids"])

  def _load_dataset(self, dataframe):
    dataset_df = self._format2t5(dataframe)
    self.data = self._convert_to_features(dataset_df)

  def _format2t5(self, df): 
    def _process(x):
      input = f"security classification: {x[0]}"
      if self.train:
        return pd.Series({"inputs": input, "targets": x[1]})
      else: 
        return pd.Series({"inputs": input})
    return df.apply(_process, axis=1)

  def _convert_to_features(self, df):
      if self.train:
        encodings = self.tokenizer.prepare_seq2seq_batch(df.inputs.to_list(), df.targets.to_list(), max_length=MAX_LENGTH)
        encodings = {
            'input_ids': encodings['input_ids'], 
            'attention_mask': encodings['attention_mask'],
            'labels': encodings['labels'],
        }
      else:
        encodings = self.tokenizer(df.inputs.to_list(), padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors="pt")
        encodings = {
            'input_ids': encodings['input_ids'], 
            'attention_mask': encodings['attention_mask'],
        }

      return encodings

In [24]:
def read_dataset(path):
    resulting_dataset = pd.read_csv(path, sep="\t")
    resulting_dataset['Label'].replace('xyz', 'sec', inplace=True)
    return resulting_dataset

In [11]:
SEC_LABEL = "sec"
NONSEC_LABEL = "nonsec"
OTHER_LABEL = "other"
SEC_IDX = 1
NON_SEC_IDX = 0
sec_idxs, non_sec_idxs = tokenizer.prepare_seq2seq_batch(["sec", "nonsec"])['input_ids']

idxs_to_label = {
    tuple(sec_idxs): SEC_IDX,
    tuple(non_sec_idxs): NON_SEC_IDX,
}

## Training

In [12]:
def load_model():
  print("===Started model loading===")
  # model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).cuda()
  model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
  print("===Finished model loading===")
  return model

In [13]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions[0].argmax(-1)

    def _convert_to_labels(idxs):
      label = idxs_to_label.get(tuple(idxs), SEC_IDX)
      # label = idxs_to_label.get(tuple(idxs), -1)
      return label

    targets = np.fromiter(map(_convert_to_labels, labels), dtype=np.int)
    predictions  = np.fromiter(map(_convert_to_labels, preds), dtype=np.int)
    wrong_predictions = np.where((predictions == -1))[0]
    wrong_predictions_number = wrong_predictions.shape[0]

    acc = accuracy_score(targets, predictions)
    targets = np.delete(targets, wrong_predictions)
    predictions = np.delete(predictions, wrong_predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(targets, predictions, average='binary')

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'wrong_predictions': wrong_predictions_number,
    }

In [14]:
def train(model, train_dataset, valid_dataset, epochs=10, evaluation_strategy="epoch"):
  training_args = TrainingArguments(    
      output_dir=CHECKPOINTS_PATH,    
      num_train_epochs=epochs,
      # save_steps=1000,
      save_total_limit=0,
      warmup_steps=300,              
      weight_decay=0.01,              
      logging_dir='./logs',            
      logging_steps=10,
      # evaluation_strategy="steps",
      evaluation_strategy=evaluation_strategy,
  )

  # train_dataset  = torch.load('train_data.pt')
  # valid_dataset = torch.load('valid_data.pt')
  
  print("===Started model training===")
  trainer = Trainer(
    model=model,                        
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
  )

  trainer.train()
  print("===Finished model training===")
  
  evaluation = trainer.evaluate() if valid_dataset else None
  return evaluation

In [25]:
full_train = read_dataset(DATA_PATH)

In [None]:
from collections import defaultdict
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10, random_state=3)

metrics = defaultdict(list)

for train_index, valid_index in skf.split(full_train['Text'], full_train['Label']):
  train_df = full_train.iloc[train_index]
  valid_df = full_train.iloc[valid_index]
  train_dataset = SecReqDataset(train_df, tokenizer, True)
  valid_dataset = SecReqDataset(valid_df, tokenizer, True)
  # torch.save(train_dataset, 'train_data.pt')
  # torch.save(valid_dataset, 'valid_data.pt')

  model = load_model()
  evaluation = train(model, train_dataset, valid_dataset, 1)
  for key, value in evaluation.items():
    metrics[key].append(value)

for key, value in metrics.items():
  print(f'{key}. mean: {np.mean(value)}, std: {np.std(value)}')



===Started model loading===
===Finished model loading===
===Started model training===


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Wrong Predictions,Runtime,Samples Per Second
1,0.1614,0.269762,0.955975,0.920455,0.852632,1.0,0,1.1897,267.291


===Finished model training===


===Started model loading===
===Finished model loading===
===Started model training===


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Wrong Predictions,Runtime,Samples Per Second
1,0.1277,0.193003,0.987421,0.975904,0.952941,1.0,0,1.3026,244.119


===Finished model training===


===Started model loading===
===Finished model loading===
===Started model training===


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Wrong Predictions,Runtime,Samples Per Second
1,0.1826,0.186058,1.0,1.0,1.0,1.0,0,1.17,271.803


===Finished model training===


===Started model loading===
===Finished model loading===
===Started model training===


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Wrong Predictions,Runtime,Samples Per Second
1,0.2476,0.139136,0.996845,0.993865,0.987805,1.0,0,1.1679,271.426


===Finished model training===


===Started model loading===
===Finished model loading===
===Started model training===


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Wrong Predictions,Runtime,Samples Per Second
1,0.2201,0.04189,1.0,1.0,1.0,1.0,0,1.2181,260.241


===Finished model training===


===Started model loading===
===Finished model loading===
===Started model training===


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Wrong Predictions,Runtime,Samples Per Second
1,0.2321,0.02113,0.993691,0.987654,0.97561,1.0,0,1.1574,273.88


===Finished model training===


===Started model loading===
===Finished model loading===
===Started model training===


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Wrong Predictions,Runtime,Samples Per Second
1,0.1973,0.033575,0.968454,0.941176,0.888889,1.0,0,1.2368,256.315


===Finished model training===


===Started model loading===
===Finished model loading===
===Started model training===


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Wrong Predictions,Runtime,Samples Per Second
1,0.185,0.030668,0.996845,0.993789,0.987654,1.0,0,1.2385,255.957


===Finished model training===


===Started model loading===
===Finished model loading===
===Started model training===


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Wrong Predictions,Runtime,Samples Per Second
1,0.1862,0.01662,1.0,1.0,1.0,1.0,0,1.1836,267.824


===Finished model training===


===Started model loading===
===Finished model loading===
===Started model training===


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Wrong Predictions,Runtime,Samples Per Second
1,0.1825,0.076698,0.996845,0.993789,0.987654,1.0,0,1.2246,258.857


===Finished model training===


eval_loss. mean: 0.10085404440760612, std: 0.08527051412406432
eval_accuracy. mean: 0.9896077614427714, std: 0.014426513777892603
eval_f1. mean: 0.9806631621914665, std: 0.026260999484271033
eval_precision. mean: 0.9633184920428495, std: 0.04885772172884868
eval_recall. mean: 1.0, std: 0.0
eval_wrong_predictions. mean: 0.0, std: 0.0
eval_runtime. mean: 1.25256, std: 0.06128928454469017
eval_samples_per_second. mean: 253.9342, std: 12.555689927678209
epoch. mean: 1.0, std: 0.0


In [50]:
model = load_model()

===Started model loading===
===Finished model loading===


In [49]:
!wget -O pure.csv https://www.dropbox.com/s/k9w0d1f9xo0x3nb/pure_all.csv?dl=0
pure = read_dataset("pure.csv")
clear_output()

In [54]:
train_with_pure = pd.concat([full_train, pure], ignore_index=True).dropna()

In [55]:
train_with_pure['Text'] = train_with_pure['Text'].map(str)
train_with_pure['Label'] = train_with_pure['Label'].map(str)
train_with_pure['Label'].unique()

array(['sec', 'nonsec'], dtype=object)

In [57]:
train_dataset = SecReqDataset(train_with_pure, tokenizer, True)
evaluation = train(model, train_dataset, None, 1, 'no')

===Started model training===


Step,Training Loss
10,10.5291
20,10.1673
30,9.934
40,9.4186
50,9.6993
60,9.0547
70,8.3363
80,7.5736
90,7.2908
100,6.4717


===Finished model training===


In [58]:
model.save_pretrained(os.path.join(PROJECT_PATH, "t5-small-with-pure.pt"))