In [1]:
import pandas as pd
import torch
import matplotlib.pyplot as plt
from itertools import product
from transformers import BertTokenizer, AdamW
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler
from torch.optim.lr_scheduler import CosineAnnealingLR
from pys.functions import CustomBertModel, create_dataset, train_with_validation, test
from pys.params import batch_size, learning_rate, num_epochs

In [None]:
# Load the raw CSV inputs: main dataset, baseline (used as validation),
# augmented samples, and the CVE data (used as test).
dataset_df = pd.read_csv('../csv/dataset.csv')
baseline_df = pd.read_csv('../csv/baseline.csv')
dataset_aug_df = pd.read_csv('../csv/dataset_aug.csv')
test_df = pd.read_csv('../csv/cve_data.csv')

baseline_descriptions = baseline_df['Example Description'].tolist()

# Remove from the main dataset every row whose description also appears in
# the baseline, so validation examples are not part of `filtered_dataset_df`.
is_baseline_row = dataset_df['Example Description'].isin(baseline_descriptions)
filtered_dataset_df = dataset_df[~is_baseline_row]

# NOTE(review): `dataset_aug_df` is concatenated without the baseline filter
# applied above; confirm it cannot contain baseline descriptions.
train_df = pd.concat([filtered_dataset_df, dataset_aug_df], ignore_index=True)
val_df = baseline_df.copy()
test_df = test_df.copy()

train_labels = train_df['Artifact Id']
val_labels = val_df['Artifact Id']
test_labels = test_df['Artifact Id']

train_labels_counts = train_labels.value_counts()
val_labels_counts = val_labels.value_counts()
test_labels_counts = test_labels.value_counts()

# Collect the distinct labels across all three splits in order of first
# appearance. A plain `set` must not be used here: the iteration order of a
# set of strings changes between interpreter processes (hash randomization),
# which would assign different class indices on every kernel restart.
all_unique_labels = pd.concat(
    [train_labels, val_labels, test_labels], ignore_index=True
).unique().tolist()

# label -> contiguous class index, shared by train, validation and test.
label_mapping = {label: idx for idx, label in enumerate(all_unique_labels)}

# Normalise every split to a fresh 0..n-1 index (reassignment, not in-place).
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [2]:
# Destination path for the trained weights.
output_model_path = "../models/model_cve_test_baseline_val.pth"

# Cosine-annealing schedule settings, passed to CosineAnnealingLR as
# `eta_min` (lower bound of the learning rate) and `T_max`.
eta_min = 1e-5
T_max = 50

In [4]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Build one dataset per split with the shared tokenizer and label mapping.
train_dataset = create_dataset(train_df, tokenizer, label_mapping)
val_dataset = create_dataset(val_df, tokenizer, label_mapping)
test_dataset = create_dataset(test_df, tokenizer, label_mapping)

model = CustomBertModel(num_labels=len(label_mapping))
# Override the dropout probability on `model.bert.dropout` and echo it back.
# NOTE(review): presumably this is only the classifier-head dropout of the
# wrapped model, not the encoder's internal dropout layers; confirm in
# pys.functions.
model.bert.dropout.p = 0.3
print(model.bert.dropout.p)

# Move the model to its target device BEFORE building the optimizer, so the
# optimizer is constructed over the parameters that will actually be trained.
model.to(device)

# NOTE(review): `transformers.AdamW` is deprecated upstream in favour of
# `torch.optim.AdamW` (whose defaults differ, e.g. weight_decay); migrate
# deliberately.
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = CosineAnnealingLR(optimizer, T_max=T_max, eta_min=eta_min)

# Only the training data is shuffled. Validation and test are iterated in a
# fixed order so evaluation does not depend on (or consume) random state.
# NOTE(review): no random seed is set anywhere in the visible cells, so the
# training shuffle is not reproducible across runs.
train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_loader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)
test_loader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

# Train with per-epoch validation; the six returned histories are consumed by
# the plotting and reporting cells.
f1_train, f1_val, acc_train, acc_val, loss_train, loss_val = train_with_validation(
    model, train_loader, val_loader, optimizer, device, num_epochs, scheduler
)

# Final evaluation on the held-out test loader.
f1_test, acc_test = test(model, test_loader, device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0.1


In [None]:
def plot(acc_train, acc_val, f1_train, f1_val, loss_train, loss_val):
    """Draw train/validation accuracy, F1 and loss curves in a 1x3 figure.

    Each argument is a sequence of metric values, one entry per recorded
    epoch. The x-axis of every curve is derived from the length of that
    series (1..len), so the function does not depend on the global
    ``num_epochs`` and still works when fewer epochs were recorded.

    Args:
        acc_train: Training accuracy history.
        acc_val: Validation accuracy history.
        f1_train: Training F1 history.
        f1_val: Validation F1 history.
        loss_train: Training loss history.
        loss_val: Validation loss history.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # (panel title, y-axis label, legend suffix, train series, val series)
    panels = [
        ('Accuracy Over Epochs', 'Accuracy (%)', 'Accuracy', acc_train, acc_val),
        ('F1 Score Over Epochs', 'F1 Score', 'F1 Score', f1_train, f1_val),
        ('Loss Over Epochs', 'Loss', 'Loss', loss_train, loss_val),
    ]

    fig, axes = plt.subplots(1, len(panels), figsize=(20, 6))

    for ax, (title, ylabel, metric, train_values, val_values) in zip(axes, panels):
        # Epochs are numbered from 1 and sized to the data actually provided.
        ax.plot(range(1, len(train_values) + 1), train_values,
                color='blue', linestyle='-', label=f'Train {metric}')
        ax.plot(range(1, len(val_values) + 1), val_values,
                color='red', linestyle='-', label=f'Validation {metric}')
        ax.set_title(title)
        ax.set_xlabel('Epochs')
        ax.set_ylabel(ylabel)
        ax.legend()
        ax.grid(True)

    fig.suptitle("Training and Validation Metrics\n"
                 "dropout_cve_test_baseline_val.ipynb", fontsize=12)

    # Leave the top 5% of the canvas free for the figure-level title.
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()

In [None]:
# Render the three metric panels from the histories returned by training.
plot(
    acc_train=acc_train,
    acc_val=acc_val,
    f1_train=f1_train,
    f1_val=f1_val,
    loss_train=loss_train,
    loss_val=loss_val,
)

In [None]:
# Training summary: the last recorded value of each training history.
training_report = [
    "\n--- Training Results ---",
    f"Training F1 Score (Weighted): {f1_train[-1]:.4f}",
    f"Training Accuracy: {acc_train[-1]:.2f}%",
    f"Training Loss: {loss_train[-1]:.4f}",
]

# Validation summary: the last recorded value of each validation history.
validation_report = [
    "\n--- Validation Results ---",
    f"Validation F1 Score (Weighted): {f1_val[-1]:.4f}",
    f"Validation Accuracy: {acc_val[-1]:.2f}%",
    f"Validation Loss: {loss_val[-1]:.4f}",
]

# Test summary: the two scalars returned by the test(...) call.
test_report = [
    "\n--- Test Results ---",
    f"Test F1 Score (Weighted): {f1_test:.4f}",
    f"Test Accuracy: {acc_test:.2f}%",
]

# Emit one line per entry, in the same order as the sections above.
for report_line in training_report + validation_report + test_report:
    print(report_line)
