In [84]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import math
import os
import pandas as pd
import torch.nn as nn

In [85]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [86]:
config = {
    "num_labels": 7,
    "hidden_dropout_prob": 0.15,
    "hidden_size": 768,
    "max_length": 512,
}

training_parameters = {
    "batch_size": 2,
    "epochs": 1,
    "output_folder": "model_weight",
    "output_file": "model.pt",
    "learning_rate": 2e-5,
    "print_after_steps": 100,
    "save_steps": 500,

}

In [87]:
# from transformers import BertTokenizer, BertModel
from transformers import AutoTokenizer, AutoModel
class ReviewDataset(Dataset):
    def __init__(self, df):
        self.df = df
        self.tokenizer = AutoTokenizer.from_pretrained('jackaduma/SecBERT')

    def __getitem__(self, index):
        review = self.df.iloc[index]["text"]
        sentiment = self.df.iloc[index]["label"]
        sentiment_dict = {'000 - Normal': 0,
          '126 - Path Traversal': 1,
          '242 - Code Injection': 2,
          '153 - Input Data Manipulation': 3,
          '310 - Scanning for Vulnerable Software': 4,
          '194 - Fake the Source of Data': 5,
          '34 - HTTP Response Splitting': 6}
        label = sentiment_dict[sentiment]
        encoded_input = self.tokenizer.encode_plus(
                review,
                add_special_tokens=True,
                max_length= config["max_length"],
                pad_to_max_length=True,
                return_overflowing_tokens=True,
            )
        if "num_truncated_tokens" in encoded_input and encoded_input["num_truncated_tokens"] > 0:
            # print("Attention! you are cropping tokens")
            pass

        input_ids = encoded_input["input_ids"]
        attention_mask = encoded_input["attention_mask"] if "attention_mask" in encoded_input else None

        token_type_ids = encoded_input["token_type_ids"] if "token_type_ids" in encoded_input else None



        data_input = {
            "input_ids": torch.tensor(input_ids),
            "attention_mask": torch.tensor(attention_mask),
            "token_type_ids": torch.tensor(token_type_ids),
            "label": torch.tensor(label),
        }

        return data_input["input_ids"], data_input["attention_mask"], data_input["token_type_ids"], data_input["label"]



    def __len__(self):
        return self.df.shape[0]

In [88]:
df_train = pd.read_csv('../dataset/dataset_capec_combine.csv')
df_train.head()

Unnamed: 0,text,label
0,GET /blog/index.php/2020/04/04/voluptatum-repr...,000 - Normal
1,GET /blog/xmlrpc.php?rsd,000 - Normal
2,GET /blog/index.php/2020/04/04/nihil-tenetur-e...,000 - Normal
3,GET /blog/index.php/2020/04/04/explicabo-qui-f...,000 - Normal
4,GET /blog/index.php/2020/04/04/explicabo-qui-f...,000 - Normal


In [89]:
# Optional (not effect very much)
df_train['text'] = df_train['text'].str.replace('/',' ')
df_train.head()

Unnamed: 0,text,label
0,GET blog index.php 2020 04 04 voluptatum-repr...,000 - Normal
1,GET blog xmlrpc.php?rsd,000 - Normal
2,GET blog index.php 2020 04 04 nihil-tenetur-e...,000 - Normal
3,GET blog index.php 2020 04 04 explicabo-qui-f...,000 - Normal
4,GET blog index.php 2020 04 04 explicabo-qui-f...,000 - Normal


In [90]:
## Reduce data for testing
df_242 = df_train[(df_train['label'] == '242 - Code Injection')]
df_242 = df_242.sample(frac = 1)
df_242 = df_242[:50000]
df_000 = df_train[(df_train['label'] == '000 - Normal')]
df_000 = df_000.sample(frac = 1)
df_000 = df_000[:50000]

df_sub = df_train[(df_train['label'] != '000 - Normal') & (df_train['label'] != '242 - Code Injection')]

df_train = pd.concat([df_train,df_242,df_000], ignore_index=True)

In [91]:
## prep
source_dataset = ReviewDataset(df_train)
source_dataloader = DataLoader(dataset = source_dataset, batch_size = training_parameters["batch_size"], shuffle = True, num_workers = 2)


In [92]:
df_transfer = pd.read_csv('../dataset/dataset_capec_transfer.csv')
df_transfer.head()

Unnamed: 0,text,label
0,POST /vendor/phpunit/phpunit/src/Util/PHP/eval...,153 - Input Data Manipulation
1,POST /cgi-bin/ViewLog.asp remote_submit_Flag=...,153 - Input Data Manipulation
2,GET /.svn/wc.db,153 - Input Data Manipulation
3,GET /blog/.svn/wc.db,153 - Input Data Manipulation
4,GET /blog/index.php/my-account/.svn/wc.db,153 - Input Data Manipulation


In [93]:
# Optional (not effect very much)
df_transfer['text'] = df_transfer['text'].str.replace('/',' ')
df_transfer.head()

Unnamed: 0,text,label
0,POST vendor phpunit phpunit src Util PHP eval...,153 - Input Data Manipulation
1,POST cgi-bin ViewLog.asp remote_submit_Flag=...,153 - Input Data Manipulation
2,GET .svn wc.db,153 - Input Data Manipulation
3,GET blog .svn wc.db,153 - Input Data Manipulation
4,GET blog index.php my-account .svn wc.db,153 - Input Data Manipulation


In [94]:
target_dataset = ReviewDataset(df_transfer)
target_dataloader = DataLoader(dataset = target_dataset, batch_size = training_parameters["batch_size"], shuffle = True, num_workers = 2)

In [95]:
from torch.autograd import Function


class GradientReversalFn(Function):
    @staticmethod
    def forward(ctx, x, alpha):
        ctx.alpha = alpha
        
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        output = grad_output.neg() * ctx.alpha

        return output, None

In [96]:
import torch
import torch.nn as nn
import torch.optim as optim

class DomainAdaptationModel(nn.Module):
    def __init__(self):
        super(DomainAdaptationModel, self).__init__()
        
        num_labels = config["num_labels"]
        self.bert = AutoModel.from_pretrained('jackaduma/SecBERT')
        self.dropout = nn.Dropout(config["hidden_dropout_prob"])
        self.sentiment_classifier = nn.Sequential(
            nn.Linear(config["hidden_size"], num_labels),
            nn.LogSoftmax(dim=1),
        )
        self.domain_classifier = nn.Sequential(
            nn.Linear(config["hidden_size"], 2),
            nn.LogSoftmax(dim=1),
        )


    def forward(
          self,
          input_ids=None,
          attention_mask=None,
          token_type_ids=None,
          labels=None,
          grl_lambda = 1.0, 
          ):

        outputs = self.bert(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
            )

#         pooled_output = outputs[1] # For bert-base-uncase
        pooled_output = outputs.pooler_output 
        pooled_output = self.dropout(pooled_output)


        reversed_pooled_output = GradientReversalFn.apply(pooled_output, grl_lambda)

        sentiment_pred = self.sentiment_classifier(pooled_output)
        domain_pred = self.domain_classifier(reversed_pooled_output)

        return sentiment_pred.to(device), domain_pred.to(device)

In [97]:
def compute_accuracy(logits, labels):
    
    predicted_labels_dict = {
      0: 0,
      1: 0,
      2: 0,
      3: 0,
      4: 0,
      5: 0,
      6: 0,
    }
    
    predicted_label = logits.max(dim = 1)[1]
    
    for pred in predicted_label:
        # print(pred.item())
        predicted_labels_dict[pred.item()] += 1
    acc = (predicted_label == labels).float().mean()
    
    return acc, predicted_labels_dict

In [98]:
def evaluate(model, dataset = "transfer", percentage = 5):
    with torch.no_grad():
        predicted_labels_dict = {                                                   
          0: 0,
          1: 0,
          2: 0,
          3: 0,
          4: 0,
          5: 0,
          6: 0,                                                                   
        }
        
        dev_df = pd.read_csv("../dataset/dataset_capec_" + dataset + ".csv")
        data_size = dev_df.shape[0]
        selected_for_evaluation = int(data_size*percentage/100)
        dev_df = dev_df.head(selected_for_evaluation)
        dataset = ReviewDataset(dev_df)

        dataloader = DataLoader(dataset = dataset, batch_size = training_parameters["batch_size"], shuffle = True, num_workers = 2)

        mean_accuracy = 0.0
        total_batches = len(dataloader)
        
        for input_ids, attention_mask, token_type_ids, labels in dataloader:
            inputs = {
                "input_ids": input_ids.squeeze(axis=1),
                "attention_mask": attention_mask.squeeze(axis=1),
                "token_type_ids" : token_type_ids.squeeze(axis=1),
                "labels": labels,
            }
            for k, v in inputs.items():
                inputs[k] = v.to(device)


            sentiment_pred, _ = model(**inputs)
            predicted_label = sentiment_pred.max(dim = 1)[1]
            accuracy, predicted_labels = compute_accuracy(sentiment_pred, inputs["labels"])
            mean_accuracy += accuracy
            for i in range(7): 
              predicted_labels_dict[i] += predicted_labels[i]

        print(predicted_label)
    return mean_accuracy/total_batches

In [99]:
lr = training_parameters["learning_rate"]
n_epochs = training_parameters["epochs"]

model = DomainAdaptationModel()
model.to(device)

optimizer = optim.Adam(model.parameters(), lr)

loss_fn_sentiment_classifier = torch.nn.NLLLoss()
loss_fn_domain_classifier = torch.nn.NLLLoss()
'''
In one training step we will update the model using both the source labeled data and target unlabeled data
We will run it till the batches last for any of these datasets

In our case target dataset has more data. Hence, we will leverage the entire source dataset for training

If we use the same approach in a case where the source dataset has more data then the target dataset then we will
under-utilize the labeled source dataset. In such a scenario it is better to reload the target dataset when it finishes
This will ensure that we are utilizing the entire source dataset to train our model.
'''

max_batches = min(len(source_dataloader), len(target_dataloader))

for epoch_idx in range(n_epochs):
    
    source_iterator = iter(source_dataloader)
    target_iterator = iter(target_dataloader)

    for batch_idx in range(max_batches):
        
        p = float(batch_idx + epoch_idx * max_batches) / (training_parameters["epochs"] * max_batches)
        grl_lambda = 2. / (1. + np.exp(-10 * p)) - 1
        grl_lambda = torch.tensor(grl_lambda)
        
        model.train()
        
        if(batch_idx%training_parameters["print_after_steps"] == 0 ):
            print("Training Step:", batch_idx)
        
        optimizer.zero_grad()
        
        # Souce dataset training update
        input_ids, attention_mask, token_type_ids, labels = next(source_iterator)
        inputs = {
            "input_ids": input_ids.squeeze(axis=1),
            "attention_mask": attention_mask.squeeze(axis=1),
            "token_type_ids" : token_type_ids.squeeze(axis=1),
            "labels" : labels,
            "grl_lambda" : grl_lambda,
        }

        for k, v in inputs.items():
            inputs[k] = v.to(device)
    
        sentiment_pred, domain_pred = model(**inputs)
        loss_s_sentiment = loss_fn_sentiment_classifier(sentiment_pred, inputs["labels"])
        y_s_domain = torch.zeros(training_parameters["batch_size"], dtype=torch.long).to(device)
        loss_s_domain = loss_fn_domain_classifier(domain_pred, y_s_domain)


        # Target dataset training update 
        input_ids, attention_mask, token_type_ids, labels = next(target_iterator)
        inputs = {
            "input_ids": input_ids.squeeze(axis=1),
            "attention_mask": attention_mask.squeeze(axis=1),
            "token_type_ids" : token_type_ids.squeeze(axis=1),
            "labels" : labels,
            "grl_lambda" : grl_lambda,
        }

        for k, v in inputs.items():
            inputs[k] = v.to(device)
    
        _, domain_pred = model(**inputs)
        
        # Note that we are not using the sentiment predictions here for updating the weights
        y_t_domain = torch.ones(input_ids.shape[0], dtype=torch.long).to(device)
        # print(domain_pred.shape, y_t_domain.shape)
        loss_t_domain = loss_fn_domain_classifier(domain_pred, y_t_domain)

        # Combining the loss 

        loss = loss_s_sentiment + loss_s_domain + loss_t_domain
        loss.backward()
        optimizer.step()

    # Evaluate the model after every epoch
    
    # torch.save(model.state_dict(), os.path.join(training_parameters["output_folder"], "epoch_" + str(epoch_idx)  +  training_parameters["output_file"] ))
    torch.save(model, os.path.join(training_parameters["output_folder"], "epoch_" + str(epoch_idx)  +  training_parameters["output_file"] ))
#     accuracy = evaluate(model, dataset = "combine", percentage = 1).item()
#     print("Accuracy on amazon after epoch " + str(epoch_idx) + " is " + str(accuracy))

    # accuracy = evaluate(model, dataset = "transfer", percentage = 100).item()
    # print("Accuracy on transfer dataset after epoch " + str(epoch_idx) + " is " + str(accuracy))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

Training Step: 0
Training Step: 100
Training Step: 200
Training Step: 300
Training Step: 400
Training Step: 500
Training Step: 600
Training Step: 700
Training Step: 800
Training Step: 900
Training Step: 1000
Training Step: 1100
Training Step: 1200
Training Step: 1300
Training Step: 1400
Training Step: 1500
Training Step: 1600
Training Step: 1700
Training Step: 1800
Training Step: 1900
Training Step: 2000
Training Step: 2100
Training Step: 2200
Training Step: 2300
Training Step: 2400
Training Step: 2500
Training Step: 2600
Training Step: 2700
Training Step: 2800
Training Step: 2900
Training Step: 3000
Training Step: 3100
Training Step: 3200
Training Step: 3300
Training Step: 3400
Training Step: 3500
Training Step: 3600
Training Step: 3700
Training Step: 3800
Training Step: 3900
Training Step: 4000
Training Step: 4100
Training Step: 4200
Training Step: 4300
Training Step: 4400
Training Step: 4500
Training Step: 4600
Training Step: 4700
Training Step: 4800
Training Step: 4900
Training Ste

In [100]:
# accuracy = evaluate(model, dataset = "transfer", percentage = 100).item()
# print("Accuracy on transfer dataset after epoch " + str(epoch_idx) + " is " + str(accuracy))

In [106]:
weight_path = "model_weight/epoch_0model.pt"

model_test = torch.load(weight_path)
model_test.eval()

DomainAdaptationModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=0)
      (position_embeddings): Embedding(514, 768)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-5): 6 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [107]:
accuracy = evaluate(model_test, dataset = "transfer", percentage = 100).item()
print("Accuracy on transfer dataset is " + str(accuracy))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


tensor([2], device='cuda:0')
Accuracy on transfer dataset is 0.5252960920333862


In [108]:
def pred(model, dataset = "transfer", percentage = 5):
    all_pred = []
    with torch.no_grad():
        
        dev_df = pd.read_csv("../dataset/dataset_capec_" + dataset + ".csv")
        data_size = dev_df.shape[0]
        selected_for_evaluation = int(data_size*percentage/100)
        dev_df = dev_df.head(selected_for_evaluation)
        dataset = ReviewDataset(dev_df)

        dataloader = DataLoader(dataset = dataset, batch_size = training_parameters["batch_size"], shuffle = True, num_workers = 2)
        
        for input_ids, attention_mask, token_type_ids, labels in dataloader:
            inputs = {
                "input_ids": input_ids.squeeze(axis=1),
                "attention_mask": attention_mask.squeeze(axis=1),
                "token_type_ids" : token_type_ids.squeeze(axis=1),
                "labels": labels,
            }
            for k, v in inputs.items():
                inputs[k] = v.to(device)

            sentiment_pred, _ = model(**inputs)
            pred_label = sentiment_pred.max(dim = 1)[1]
            for pred in pred_label:
                all_pred.append(pred.item())
    return all_pred

In [109]:
pred_list = pred(model_test, dataset = "transfer", percentage = 10)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [110]:
pred_list

[1,
 0,
 0,
 2,
 0,
 1,
 2,
 0,
 0,
 1,
 0,
 0,
 2,
 1,
 0,
 1,
 4,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 2,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 4,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 2,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 2,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 2,
 1,
 1,
 1,
 4,
 0,
 0,
 0,
 0,
 2,
 0,
 1,
 0,
 2,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 2,
 1,
 2,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 2,
 1,
 0,
 1,
 0,
 1,
 1,
 2,
 2,
 0,
 0,
 1,
 1,
 4,
 1,
 1,
 1,
 2,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 2,
 0,
 1,
