## GPU Check

In [17]:
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


## Imports

In [2]:
# import libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import math
import seaborn as sns
from collections import defaultdict
import torch
from torch import optim
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn

from transformers import BertTokenizer, BertModel
from tqdm import tqdm

## Data Processing

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
df = pd.read_pickle("/content/drive/MyDrive/data_df")
df.head()

Unnamed: 0,descriptions,vulnerability_type,severity,vulnerability_type_description
0,ip_input.c in BSD-derived TCP/IP implementatio...,CWE-20,MEDIUM,Improper Input Validation
1,ICMP redirect messages may crash or lock up a ...,CWE-20,MEDIUM,Improper Input Validation
2,Denial of service in Windows NT Local Security...,CWE-20,HIGH,Improper Input Validation
3,An attacker can conduct a denial of service in...,CWE-20,HIGH,Improper Input Validation
4,Denial of service in IIS 4.0 via a flood of HT...,CWE-20,MEDIUM,Improper Input Validation


In [5]:
X = df.drop(['vulnerability_type', 'severity', 'vulnerability_type_description'], axis=1)
y = df['vulnerability_type']

x_train, _, y_train, _ = train_test_split(X, y, train_size=50000, stratify=y)
sampled_data = pd.concat([x_train, y_train], axis=1)

# encode the labels
sampled_data['vulnerability_type'] = LabelEncoder().fit_transform(sampled_data['vulnerability_type'])

In [6]:
sampled_data.head()

Unnamed: 0,descriptions,vulnerability_type
3985,Stack-based buffer overflow in hpc.c in dvips ...,0
13956,Windows File Protection (WFP) in Windows 2000 ...,20
24943,PHP remote file inclusion vulnerability in new...,20
43842,The av-centerd SOAP service in AlienVault OSSI...,19
12749,"Buffer overflow in (1) gv 3.5.8 and earlier, (...",20


In [7]:
sampled_data.groupby(['vulnerability_type']).size()

vulnerability_type
0      2710
1       528
2      1053
3       647
4      2051
5      1526
6      1922
7       817
8      1386
9       440
10      716
11      739
12      295
13      443
14      822
15     1905
16     7828
17      293
18     4578
19     1105
20    14745
21     3451
dtype: int64

In [8]:
len(sampled_data['vulnerability_type'].unique())

22

## Config

In [9]:
MAX_LEN = 512
NUM_LABELS = len(sampled_data['vulnerability_type'].unique())
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 8
EPOCHS = 5
LEARNING_RATE = 3e-05
TOKENIZER = BertTokenizer.from_pretrained('bert-base-uncased')
MODEL_PATH = f"bert_vulnerability_epoch_{EPOCHS}_model.pt"

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

## Custom Dataset

In [10]:
class VulnerabilityDataset:
    def __init__(self, dataframe):
        self.tokenizer = TOKENIZER
        self.data = dataframe
        self.descriptions = self.data.descriptions.tolist()
        self.targets = self.data.vulnerability_type.tolist()
        self.max_len = MAX_LEN

    def __len__(self):
        return len(self.descriptions)

    def __getitem__(self, index):
        descriptions = str(self.descriptions[index]).lower()

        inputs = self.tokenizer.encode_plus(
            descriptions,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation = True
        )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.long)
        }

## Model

In [11]:
class VulnerabilityClf(torch.nn.Module):
    def __init__(self):
        super(VulnerabilityClf, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.drop = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768, NUM_LABELS)

    def forward(self, ids, mask, token_type_ids):
        # Using the pooled outputs from bert
        outputs = self.bert(ids, attention_mask=mask,token_type_ids=token_type_ids)
        output = outputs.pooler_output
        output = self.drop(output)
        output = self.linear(output)
        return output

## Engine

#### Loss Function

In [12]:
def loss_fn(outputs, targets):
    return torch.nn.CrossEntropyLoss()(outputs, targets)

#### Train Function

In [13]:
def train(epoch, model, training_loader, device, optimizer):
    model.train()
    for _, data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.long)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _ % 10000 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

#### Validation Function

In [14]:
def validation(epoch, model, testing_loader, device):
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.long)
            outputs = model(ids, mask, token_type_ids)

            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.argmax(outputs, axis=1).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

## Run

























                                                                                             

In [15]:
def run(df):
    # Creating Dataset and Dataloaders
    train_size = 0.8
    train_dataset = df.sample(frac=train_size, random_state=200)
    test_dataset = df.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    print("FULL Dataset: {}".format(df.shape))
    print("TRAIN Dataset: {}".format(train_dataset.shape))
    print("TEST Dataset: {}".format(test_dataset.shape))

    training_set = VulnerabilityDataset(train_dataset)
    testing_set = VulnerabilityDataset(test_dataset)

    train_params = {
        'batch_size': TRAIN_BATCH_SIZE,
        'shuffle': True,
        'num_workers': 0
        }

    test_params = {
        'batch_size': VALID_BATCH_SIZE,
        'shuffle': True,
        'num_workers': 0
        }

    training_loader = DataLoader(training_set, **train_params)
    testing_loader = DataLoader(testing_set, **test_params)

    # GPU check and setting the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print(torch.cuda.get_device_name(0))

    model = VulnerabilityClf()
    model.to(device)

    # Model parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    # Optimizer and Scheduler
    optimizer = torch.optim.AdamW(optimizer_parameters, lr=3e-5)
    # num_training_steps = int(len(train_dataset) / config.TRAIN_BATCH_SIZE * config.EPOCHS)

    # scheduler = get_linear_schedule_with_warmup(
    #     optimizer,
    #     num_warmup_steps=0,
    #     num_training_steps=num_training_steps
    # )

    # Training loop
    best_val_acc = 0
    print("--Started Training...")
    for epoch in range(EPOCHS):
        train(epoch, model, training_loader, device, optimizer)
        outputs, targets = validation(epoch, model, testing_loader, device)
        accuracy = accuracy_score(targets, outputs)
        print(f"Epoch: {epoch}")
        print(f"--Validation accuracy: {accuracy}\n")
        if accuracy > best_val_acc:
            torch.save(model.state_dict(), MODEL_PATH)
            best_val_acc = accuracy

In [16]:
run(sampled_data)

FULL Dataset: (50000, 2)
TRAIN Dataset: (40000, 2)
TEST Dataset: (10000, 2)


RuntimeError: ignored