In [3]:
import pandas as pd
import torch
import matplotlib.pyplot as plt
from itertools import product
from transformers import BertTokenizer, AdamW
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler
from torch.optim.lr_scheduler import CosineAnnealingLR
from pys.functions import CustomBertModel, create_dataset, train, test
from pys.params import batch_size, learning_rate, num_epochs

In [4]:
train_df = pd.read_csv('../csv/merged_aug.csv')
test_df = pd.read_csv('../csv/cve_data.csv')

train_labels = train_df['Artifact Id']
test_labels = test_df['Artifact Id']

train_labels_counts = train_labels.value_counts()
test_labels_counts = test_labels.value_counts()

train_labels_list = train_labels_counts.tolist()
test_labels_list = test_labels_counts.tolist()

unique_train_labels = train_labels.unique().tolist()
unique_test_labels = test_labels.unique().tolist()

all_unique_labels = list(set(unique_train_labels + unique_test_labels))

label_mapping = {label: idx for idx, label in enumerate(all_unique_labels)}

train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)


In [5]:
print(label_mapping)

{'d3f:FileSystem': 0, 'd3f:Command': 1, 'd3f:Software': 2, 'd3f:Database': 3, 'd3f:DisplayServer': 4, 'd3f:BootLoader': 5, 'd3f:System Software': 6, 'd3f:InterprocessCommunication': 7, 'd3f:OperatingSystem': 8, 'd3f:HardwareDriver': 9}


In [3]:
T_max = 50
eta_min = 1e-5

In [1]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = create_dataset(train_df, tokenizer, label_mapping)
test_dataset = create_dataset(test_df, tokenizer, label_mapping)

model = CustomBertModel(num_labels=len(label_mapping))
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = CosineAnnealingLR(optimizer, T_max=T_max, eta_min=eta_min)
model.to(device)

train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
test_loader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

f1_train, acc_train, loss_train = train(
    model, train_loader, optimizer, device, num_epochs, scheduler
)

f1_test, acc_test = test(model, test_loader, device)



NameError: name 'torch' is not defined

In [None]:
# Print training results
print("\n--- Training Results ---")
print(f"Training F1 Score (Weighted): {f1_train[-1]:.4f}")
print(f"Training Accuracy: {acc_train[-1]:.2f}%")
print(f"Training Loss: {loss_train[-1]:.4f}")

# Print test results
print("\n--- Test Results ---")
print(f"Test F1 Score (Weighted): {f1_test:.4f}")
print(f"Test Accuracy: {acc_test:.2f}%")


--- Training Results ---


NameError: name 'f1_train' is not defined