In [17]:
!pip install accelerate -U



In [2]:
import pandas as pd
import transformers
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder

# Configuration for this cell
csv_path = 'Final.csv'  # Update this path
MODEL_NAME = 'jackaduma/SecBERT'  # one checkpoint id shared by tokenizer and model so they cannot drift apart
SEED = 42  # used for the train/test split and for weight initialisation

# Load the data
data = pd.read_csv(csv_path)

# Fail fast if the CSV lacks a column used below: 'id' is the label source and
# 'sentences' is the text read by CustomDataset.__getitem__. Without this check a
# missing 'sentences' column only surfaces once a dataset item is first accessed.
missing_columns = {'id', 'sentences'} - set(data.columns)
if missing_columns:
    raise KeyError(f"{csv_path} is missing required column(s): {sorted(missing_columns)}")

# Encode labels: map each distinct 'id' value to an integer class index
encoder = LabelEncoder()
data['encoded_labels'] = encoder.fit_transform(data['id'])

# Split the data into training and testing sets (80/20), keeping the class
# proportions the same in both splits via stratification
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=SEED,
    stratify=data['encoded_labels'],
)

# Seed Python/NumPy/PyTorch RNGs *before* building the model: the classification
# head is not part of the checkpoint and is randomly initialised by
# from_pretrained, so without a seed every run starts from different weights.
transformers.set_seed(SEED)

# Load tokenizer and model; the classifier gets one output per encoded class
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(encoder.classes_))
# Define a custom dataset class
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes a single table row on each lookup.

    Args:
        data: Table supporting ``len()`` and ``.iloc[idx]``; each row must
            expose a ``'sentences'`` entry (the text) and an
            ``'encoded_labels'`` entry (the integer class index).
        tokenizer: Callable invoked as ``tokenizer(text, **options)`` that
            returns a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, data, tokenizer, max_len):
        self.data, self.tokenizer, self.max_len = data, tokenizer, max_len

    def __len__(self):
        """Return the number of rows in the wrapped table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its flattened tensors plus label.

        Returns:
            dict with keys ``'input_ids'``, ``'attention_mask'`` (both
            flattened to 1-D) and ``'labels'`` (a ``torch.long`` tensor).
        """
        row = self.data.iloc[idx]

        # Pad/truncate to max_len and ask the tokenizer for PyTorch tensors.
        tokenizer_options = {
            'add_special_tokens': True,
            'max_length': self.max_len,
            'return_token_type_ids': False,
            'padding': 'max_length',
            'truncation': True,
            'return_attention_mask': True,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer(row['sentences'], **tokenizer_options)

        # Flatten each tokenizer tensor so the sample has no leading batch dimension.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(row['encoded_labels'], dtype=torch.long)
        return sample

# Token-length cap for each example and per-device batch size
MAX_LEN, BATCH_SIZE = 128, 8

# Wrap both splits in datasets that tokenize rows on access
train_dataset, test_dataset = (
    CustomDataset(split_frame, tokenizer, MAX_LEN)
    for split_frame in (train_data, test_data)
)

# Standalone loaders (shuffled for training, fixed order for the held-out split).
# They are not handed to the Trainer below, which receives the datasets directly.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

# Fine-tuning hyperparameters; evaluation is scheduled per epoch
training_args = TrainingArguments(
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    evaluation_strategy="epoch",
    output_dir='./results',
)

# Trainer wired to the model, both dataset splits and the tokenizer
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Run fine-tuning
trainer.train()

# Persist the fine-tuned weights and the tokenizer files side by side
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at jackaduma/SecBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,4.2933,3.4631
2,3.2165,2.861105
3,2.6774,2.668988


('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json')

In [3]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

# Define a function to compute metrics
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class indices) and
            ``predictions`` (per-class scores indexed as ``[sample, class]``,
            or a tuple whose first element is those scores).

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``.
    """
    labels = pred.label_ids

    # Some models return several output tensors; the class scores come first.
    scores = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions
    preds = np.argmax(scores, axis=1)

    # zero_division=0: classes that are never predicted have undefined precision.
    # The default ('warn') already scores them as 0 but emits a warning on every
    # evaluation; stating 0 explicitly keeps the numbers and drops the noise.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Build an evaluation Trainer around the same model, now with metric computation attached
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
)

# Score the held-out split and report the resulting metrics dictionary
eval_result = trainer.evaluate()
print(f"Evaluation Results: {eval_result}")


Evaluation Results: {'eval_loss': 2.6689882278442383, 'eval_accuracy': 0.4630113141862489, 'eval_f1': 0.3613972365698255, 'eval_precision': 0.3482484492497176, 'eval_recall': 0.4630113141862489, 'eval_runtime': 258.6075, 'eval_samples_per_second': 4.443, 'eval_steps_per_second': 0.557}


  _warn_prf(average, modifier, msg_start, len(result))
