<a href="https://colab.research.google.com/github/tungduong03/App-Chat/blob/main/secBERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer

In [2]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Code_Injection_Dataset

Mounted at /content/drive
/content/drive/MyDrive/Code_Injection_Dataset


In [3]:
df = pd.read_csv('dataset_capec.csv')  # Đọc file CSV

In [4]:
# Đếm số lượng record cho mỗi loại label
label_counts = df['label'].value_counts()

# Lọc các label có số lượng record > 20000
labels_above_20000 = label_counts[label_counts > 20000].index

# Lấy 20,000 record đầu tiên cho các label đó
df_above_20000 = df[df['label'].isin(labels_above_20000)]
df_above_20000 = df_above_20000.groupby('label').head(20000)

# Lấy các record còn lại (cho các label không có số lượng lớn hơn 20000)
df_below_20000 = df[~df['label'].isin(labels_above_20000)]

# Ghép các record lại với nhau
df_combined = pd.concat([df_below_20000, df_above_20000])

# Xáo trộn dữ liệu sau khi ghép
df_combined = df_combined.sample(frac=1).reset_index(drop=True)

df = df_combined

In [5]:
# Phân chia dữ liệu train-test
X_train, X_test, y_train, y_test = train_test_split(df['text'],
                                                    df['label'],
                                                    test_size=0.2,
                                                    shuffle=True) # shuffle=True

In [6]:
# Encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)  # Encode train labels
y_test_encoded = label_encoder.transform(y_test)        # Encode test labels

# In danh sách nhãn đã mã hóa
print(label_encoder.classes_)  # ['000 - Normal', '001 - XSS', ...]
num_classes = len(label_encoder.classes_)  # Số lượng lớp
num_classes

['000 - Normal' '126 - Path Traversal' '153 - Input Data Manipulation'
 '194 - Fake the Source of Data' '242 - Code Injection'
 '272 - Protocol Manipulation' '310 - Scanning for Vulnerable Software'
 '34 - HTTP Response Splitting' '66 - SQL Injection']


9

In [7]:
# Tải Tokenizer của SecBERT
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("jackaduma/SecBERT")  # SecBERT

#from transformers import AutoTokenizer, AutoModelForMaskedLM

#tokenizer = AutoTokenizer.from_pretrained("jackaduma/SecBERT")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/378k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

In [8]:
# Token hóa văn bản
# Tokenize dữ liệu train và test
def preprocess_data(texts, tokenizer, max_length=128):
    # Convert the Pandas Series to a list of strings
    texts = texts.tolist()  # This line is added to fix the error
    return tokenizer(
        texts,
        padding='max_length',        # Thêm padding
        truncation=True,             # Cắt ngắn văn bản
        max_length=max_length,       # Độ dài tối đa
        return_tensors='tf'          # Trả về Tensor
    )

X_train_tokens = preprocess_data(X_train, tokenizer)
X_test_tokens = preprocess_data(X_test, tokenizer)

In [9]:
from transformers import TFBertModel
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Model

# Tải mô hình SecBERT
secbert_model = TFBertModel.from_pretrained("jackaduma/SecBERT")

# Đóng băng các lớp SecBERT (tùy chọn, có thể fine-tune)
secbert_model.trainable = False

model.safetensors:   0%|          | 0.00/336M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'bert.embeddings.position_ids', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


**secBERT + fine-tune + epoch = 5**

In [10]:
from tensorflow.keras.layers import Lambda

# Bật fine-tune
secbert_model.trainable = True

# Input layers
input_ids = tf.keras.Input(shape=(128,), dtype=tf.int32, name="input_ids")
attention_mask = tf.keras.Input(shape=(128,), dtype=tf.int32, name="attention_mask")

# Gọi SecBERT qua Lambda layer, specify output_shape
bert_outputs = Lambda(
    lambda x: secbert_model(input_ids=x[0], attention_mask=x[1])[0][:, 0, :],
    output_shape=(768,)  # Specify the output shape here
)([input_ids, attention_mask])

# Thêm các lớp Dense
x = Dense(128, activation="relu")(bert_outputs)
x = Dropout(0.5)(x)
output = Dense(num_classes, activation="softmax")(x)

# Tạo mô hình
model = Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile mô hình
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [12]:
# huấn luyện mô hình
history = model.fit(
    {
        "input_ids": X_train_tokens["input_ids"],
        "attention_mask": X_train_tokens["attention_mask"]
    },
    y_train_encoded,
    validation_split=0.2,
    epochs=5,
    batch_size=128
)

Epoch 1/5
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m379s[0m 593ms/step - accuracy: 0.3357 - loss: 1.7659 - val_accuracy: 0.4237 - val_loss: 1.5100
Epoch 2/5
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m363s[0m 580ms/step - accuracy: 0.3988 - loss: 1.5650 - val_accuracy: 0.4367 - val_loss: 1.4744
Epoch 3/5
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m352s[0m 580ms/step - accuracy: 0.4057 - loss: 1.5442 - val_accuracy: 0.4355 - val_loss: 1.4651
Epoch 4/5
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m351s[0m 579ms/step - accuracy: 0.4061 - loss: 1.5325 - val_accuracy: 0.4396 - val_loss: 1.4744
Epoch 5/5
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m394s[0m 599ms/step - accuracy: 0.4150 - loss: 1.5222 - val_accuracy: 0.4357 - val_loss: 1.4585


In [17]:
from sklearn.metrics import classification_report

# Dự đoán trên tập validation hoặc test
y_pred = model.predict({
    "input_ids": X_test_tokens["input_ids"],
    "attention_mask": X_test_tokens["attention_mask"]
})

# Lấy nhãn dự đoán từ xác suất (với softmax)
y_pred_labels = y_pred.argmax(axis=1)

label_names = label_encoder.classes_  # Lấy danh sách tên nhãn

# Tính toán precision, recall, f1-score
print(classification_report(y_test_encoded, y_pred_labels, target_names=label_names))


[1m758/758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 123ms/step
                                        precision    recall  f1-score   support

                          000 - Normal       0.35      0.59      0.44      3974
                  126 - Path Traversal       0.83      0.75      0.79      3545
         153 - Input Data Manipulation       0.00      0.00      0.00       272
         194 - Fake the Source of Data       0.31      0.44      0.37      3965
                  242 - Code Injection       0.45      0.63      0.52      2751
           272 - Protocol Manipulation       0.00      0.00      0.00      1381
310 - Scanning for Vulnerable Software       0.77      0.47      0.58       478
          34 - HTTP Response Splitting       0.43      0.19      0.26      3810
                    66 - SQL Injection       0.39      0.28      0.32      4067

                              accuracy                           0.43     24243
                             macro avg  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
test_loss, test_acc = model.evaluate(
    {"input_ids": X_test_tokens["input_ids"], "attention_mask": X_test_tokens["attention_mask"]},
    y_test_encoded
)
print(f"Test accuracy: {test_acc}")


[1m758/758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 132ms/step - accuracy: 0.4314 - loss: 1.4606
Test accuracy: 0.4344346821308136


**secBERT + no-fine-tune + epoch=5**

In [18]:
from tensorflow.keras.layers import Lambda

# Bật fine-tune
secbert_model.trainable = False

# Input layers
input_ids = tf.keras.Input(shape=(128,), dtype=tf.int32, name="input_ids")
attention_mask = tf.keras.Input(shape=(128,), dtype=tf.int32, name="attention_mask")

# Gọi SecBERT qua Lambda layer, specify output_shape
bert_outputs = Lambda(
    lambda x: secbert_model(input_ids=x[0], attention_mask=x[1])[0][:, 0, :],
    output_shape=(768,)  # Specify the output shape here
)([input_ids, attention_mask])

# Thêm các lớp Dense
x = Dense(128, activation="relu")(bert_outputs)
x = Dropout(0.5)(x)
output = Dense(num_classes, activation="softmax")(x)

# Tạo mô hình
model = Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile mô hình
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [19]:
# huấn luyện mô hình
history = model.fit(
    {
        "input_ids": X_train_tokens["input_ids"],
        "attention_mask": X_train_tokens["attention_mask"]
    },
    y_train_encoded,
    validation_split=0.2,
    epochs=5,
    batch_size=128
)

Epoch 1/5
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m376s[0m 601ms/step - accuracy: 0.3367 - loss: 1.7535 - val_accuracy: 0.4174 - val_loss: 1.4904
Epoch 2/5
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m365s[0m 579ms/step - accuracy: 0.4052 - loss: 1.5509 - val_accuracy: 0.4403 - val_loss: 1.4671
Epoch 3/5
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m382s[0m 579ms/step - accuracy: 0.4136 - loss: 1.5389 - val_accuracy: 0.4350 - val_loss: 1.4730
Epoch 4/5
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m394s[0m 599ms/step - accuracy: 0.4161 - loss: 1.5305 - val_accuracy: 0.4340 - val_loss: 1.4654
Epoch 5/5
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m370s[0m 580ms/step - accuracy: 0.4141 - loss: 1.5242 - val_accuracy: 0.4349 - val_loss: 1.4582


In [20]:
test_loss, test_acc = model.evaluate(
    {"input_ids": X_test_tokens["input_ids"], "attention_mask": X_test_tokens["attention_mask"]},
    y_test_encoded
)
print(f"Test accuracy: {test_acc}")

[1m758/758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 126ms/step - accuracy: 0.4315 - loss: 1.4583
Test accuracy: 0.43340346217155457


In [21]:
from sklearn.metrics import classification_report

# Dự đoán trên tập validation hoặc test
y_pred = model.predict({
    "input_ids": X_test_tokens["input_ids"],
    "attention_mask": X_test_tokens["attention_mask"]
})

# Lấy nhãn dự đoán từ xác suất (với softmax)
y_pred_labels = y_pred.argmax(axis=1)

label_names = label_encoder.classes_  # Lấy danh sách tên nhãn

# Tính toán precision, recall, f1-score
print(classification_report(y_test_encoded, y_pred_labels, target_names=label_names))

[1m758/758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 130ms/step
                                        precision    recall  f1-score   support

                          000 - Normal       0.34      0.51      0.40      3974
                  126 - Path Traversal       0.80      0.77      0.78      3545
         153 - Input Data Manipulation       0.00      0.00      0.00       272
         194 - Fake the Source of Data       0.36      0.22      0.27      3965
                  242 - Code Injection       0.44      0.60      0.51      2751
           272 - Protocol Manipulation       0.00      0.00      0.00      1381
310 - Scanning for Vulnerable Software       0.77      0.47      0.58       478
          34 - HTTP Response Splitting       0.43      0.24      0.31      3810
                    66 - SQL Injection       0.33      0.52      0.41      4067

                              accuracy                           0.43     24243
                             macro avg 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**secBERT + fine-tune + learning-rate + epoch=5 + drop=0.25 + dense=256**

In [22]:
from tensorflow.keras.layers import Lambda

# Bật fine-tune
secbert_model.trainable = True

# Input layers
input_ids = tf.keras.Input(shape=(128,), dtype=tf.int32, name="input_ids")
attention_mask = tf.keras.Input(shape=(128,), dtype=tf.int32, name="attention_mask")

# Gọi SecBERT qua Lambda layer, specify output_shape
bert_outputs = Lambda(
    lambda x: secbert_model(input_ids=x[0], attention_mask=x[1])[0][:, 0, :],
    output_shape=(768,)  # Specify the output shape here
)([input_ids, attention_mask])

# Thêm các lớp Dense
x = Dense(256, activation="relu")(bert_outputs)
x = Dropout(0.25)(x)
output = Dense(num_classes, activation="softmax")(x)

# Tạo mô hình
model = Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile mô hình
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
              loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [23]:
# huấn luyện mô hình
history = model.fit(
    {
        "input_ids": X_train_tokens["input_ids"],
        "attention_mask": X_train_tokens["attention_mask"]
    },
    y_train_encoded,
    validation_split=0.2,
    epochs=5,
    batch_size=128
)

Epoch 1/5
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m382s[0m 611ms/step - accuracy: 0.2871 - loss: 1.9249 - val_accuracy: 0.4305 - val_loss: 1.5298
Epoch 2/5
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m370s[0m 599ms/step - accuracy: 0.4152 - loss: 1.5466 - val_accuracy: 0.4355 - val_loss: 1.4827
Epoch 3/5
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m371s[0m 580ms/step - accuracy: 0.4290 - loss: 1.4980 - val_accuracy: 0.4369 - val_loss: 1.4639
Epoch 4/5
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m394s[0m 600ms/step - accuracy: 0.4315 - loss: 1.4783 - val_accuracy: 0.4452 - val_loss: 1.4524
Epoch 5/5
[1m607/607[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m370s[0m 580ms/step - accuracy: 0.4336 - loss: 1.4715 - val_accuracy: 0.4407 - val_loss: 1.4474


In [24]:
test_loss, test_acc = model.evaluate(
    {"input_ids": X_test_tokens["input_ids"], "attention_mask": X_test_tokens["attention_mask"]},
    y_test_encoded
)
print(f"Test accuracy: {test_acc}")

[1m758/758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 129ms/step - accuracy: 0.4340 - loss: 1.4546
Test accuracy: 0.43583714962005615


In [None]:
from sklearn.metrics import classification_report

# Dự đoán trên tập validation hoặc test
y_pred = model.predict({
    "input_ids": X_test_tokens["input_ids"],
    "attention_mask": X_test_tokens["attention_mask"]
})

# Lấy nhãn dự đoán từ xác suất (với softmax)
y_pred_labels = y_pred.argmax(axis=1)

label_names = label_encoder.classes_  # Lấy danh sách tên nhãn

# Tính toán precision, recall, f1-score
print(classification_report(y_test_encoded, y_pred_labels, target_names=label_names))

**Thêm lớp Dense và Epoch**

In [25]:
from tensorflow.keras.layers import Lambda

# Bật fine-tune
secbert_model.trainable = True

# Input layers
input_ids = tf.keras.Input(shape=(128,), dtype=tf.int32, name="input_ids")
attention_mask = tf.keras.Input(shape=(128,), dtype=tf.int32, name="attention_mask")

# Gọi SecBERT qua Lambda layer, specify output_shape
bert_outputs = Lambda(
    lambda x: secbert_model(input_ids=x[0], attention_mask=x[1])[0][:, 0, :],
    output_shape=(768,)  # Specify the output shape here
)([input_ids, attention_mask])

# Thêm các lớp Dense
x = Dense(512, activation="relu")(bert_outputs)
x = Dropout(0.25)(x)
x = Dense(256, activation="relu")(x)
x = Dropout(0.25)(x)
output = Dense(num_classes, activation="softmax")(x)

# Tạo mô hình
model = Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile mô hình
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
              loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [26]:
# huấn luyện mô hình
history = model.fit(
    {
        "input_ids": X_train_tokens["input_ids"],
        "attention_mask": X_train_tokens["attention_mask"]
    },
    y_train_encoded,
    validation_split=0.2,
    epochs=7,
    batch_size=64
)

Epoch 1/7
[1m1213/1213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m466s[0m 371ms/step - accuracy: 0.3250 - loss: 1.7938 - val_accuracy: 0.4326 - val_loss: 1.4811
Epoch 2/7
[1m1213/1213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m427s[0m 315ms/step - accuracy: 0.4244 - loss: 1.5060 - val_accuracy: 0.4424 - val_loss: 1.4455
Epoch 3/7
[1m1213/1213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m443s[0m 316ms/step - accuracy: 0.4325 - loss: 1.4756 - val_accuracy: 0.4437 - val_loss: 1.4331
Epoch 4/7
[1m1213/1213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m435s[0m 310ms/step - accuracy: 0.4378 - loss: 1.4525 - val_accuracy: 0.4454 - val_loss: 1.4282
Epoch 5/7
[1m1213/1213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m381s[0m 309ms/step - accuracy: 0.4340 - loss: 1.4483 - val_accuracy: 0.4465 - val_loss: 1.4191
Epoch 6/7
[1m1213/1213[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m389s[0m 315ms/step - accuracy: 0.4390 - loss: 1.4373 - val_accuracy: 0.4437 - val_loss: 1.415

In [27]:
test_loss, test_acc = model.evaluate(
    {"input_ids": X_test_tokens["input_ids"], "attention_mask": X_test_tokens["attention_mask"]},
    y_test_encoded
)
print(f"Test accuracy: {test_acc}")

[1m758/758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 127ms/step - accuracy: 0.4429 - loss: 1.4158
Test accuracy: 0.44454067945480347


In [28]:
from sklearn.metrics import classification_report

# Dự đoán trên tập validation hoặc test
y_pred = model.predict({
    "input_ids": X_test_tokens["input_ids"],
    "attention_mask": X_test_tokens["attention_mask"]
})

# Lấy nhãn dự đoán từ xác suất (với softmax)
y_pred_labels = y_pred.argmax(axis=1)

label_names = label_encoder.classes_  # Lấy danh sách tên nhãn

# Tính toán precision, recall, f1-score
print(classification_report(y_test_encoded, y_pred_labels, target_names=label_names))

[1m758/758[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 130ms/step
                                        precision    recall  f1-score   support

                          000 - Normal       0.35      0.43      0.38      3974
                  126 - Path Traversal       0.83      0.75      0.79      3545
         153 - Input Data Manipulation       0.22      0.01      0.01       272
         194 - Fake the Source of Data       0.36      0.31      0.33      3965
                  242 - Code Injection       0.44      0.64      0.52      2751
           272 - Protocol Manipulation       0.38      0.09      0.15      1381
310 - Scanning for Vulnerable Software       0.64      0.65      0.65       478
          34 - HTTP Response Splitting       0.36      0.35      0.35      3810
                    66 - SQL Injection       0.39      0.40      0.40      4067

                              accuracy                           0.44     24243
                             macro avg 

In [29]:
from sklearn.metrics import confusion_matrix

# Tính confusion matrix
conf_matrix = confusion_matrix(y_test_encoded, y_pred_labels)
print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[1714   85    0  420  126  134   94  487  914]
 [ 372 2660    1   55  100   12    4   56  285]
 [  88    0    2   34   41    0    0   74   33]
 [ 662   16    5 1233  737   23    0  885  404]
 [ 200  378    0   54 1751    0   12  290   66]
 [ 602    1    1  150   40  128   36   93  330]
 [  58    0    0   23    0   27  311    0   59]
 [ 563   55    0  738  690    0    1 1334  429]
 [ 676    0    0  764  450   14   25  494 1644]]
