## Log Classification Using Transformer

##### Codes were mainly referenced from https://www.analyticsvidhya.com/blog/2024/06/finetuning-llama-3-for-sequence-classification/

In [1]:
import bitsandbytes as bnb
print(bnb.__version__)  

0.44.1


In [3]:
#!pip install -q transformers accelerate trl bitsandbytes datasets evaluate
#!pip install -q peft scikit-learn

In [5]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import torch
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType




In [7]:
# Check if GPU is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


## Read the Data

In [10]:
df = pd.read_csv('./combined_logs_with_labels.csv',delimiter=',', encoding='latin-1')
pd.set_option('display.max_colwidth', None)
df.head()

Unnamed: 0,Log,Label
0,143 INFO dfs.DataNode$DataXceiver: Receiving block src: /10.250.19.102:54106 dest: /10.250.19.102:50010,Normal
1,35 INFO dfs.FSNamesystem: BLOCK* NameSystem.allocateBlock: /mnt/hadoop/mapred/system/job_200811092030_0001/job.jar.,Normal
2,143 INFO dfs.DataNode$DataXceiver: Receiving block src: /10.250.10.6:40524 dest: /10.250.10.6:50010,Normal
3,145 INFO dfs.DataNode$DataXceiver: Receiving block src: /10.250.14.224:42420 dest: /10.250.14.224:50010,Normal
4,145 INFO dfs.DataNode$PacketResponder: PacketResponder 1 for block terminating,Normal


## Model

In [13]:
# making it TF.Data and spliting into 2 datasets
df['Label'] = df['Label'].map({'Normal':0,'Anomaly':1})

##### Train-test split ensuring 80% of the anomolous data is within the train dataset

In [16]:
# Shuffle the entire DataFrame
df = df.sample(frac=1, random_state=38).reset_index(drop=True)  # Shuffle the entire dataset

# Total samples
total_samples = len(df)
train_size = int(total_samples * 0.8)  # 80% of the entire dataset for training

# Separate the classes
normal_samples = df[df['Label'] == 0]
anomaly_samples = df[df['Label'] == 1]

#print(total_samples) # 19412
#print(train_size) # 15529
#print(len(normal_samples)) # 17220
#print(len(anomaly_samples)) # 2192

# Determine the number of Anomaly samples for the training set
anomaly_train_size = int(len(anomaly_samples) * 0.8)  # 80% of Anomaly samples
normal_train_size = train_size - anomaly_train_size  # Remaining from Normals

# Sample from each class
normal_train = normal_samples.sample(normal_train_size, random_state=38)
anomaly_train = anomaly_samples.sample(anomaly_train_size, random_state=38)
#print(len(normal_train)) # 13776
#print(len(anomaly_train)) # 1753

# Combine the training samples
train_df = pd.concat([anomaly_train, normal_train])
# Remaining as test samples
test_df = pd.concat([anomaly_samples, normal_samples]).drop(train_df.index)

##### Converting pandas DataFrames into Hugging Face Dataset objects 

In [19]:
from datasets import DatasetDict, Dataset

dataset_train = Dataset.from_pandas(train_df)
# dataset_val = Dataset.from_pandas(val_df)
dataset_test = Dataset.from_pandas(test_df)

# Combine them into a single DatasetDict
dataset = DatasetDict({
    'train': dataset_train,
    #'validation': dataset_val,
    'test': dataset_test
})

##### Class weights for handling class imbalance 

In [60]:

normal = train_df['Label'].value_counts()[0]
anomaly = train_df['Label'].value_counts()[1]
total = normal + anomaly
weight_for_0 = (1 / normal) * (total) / 2.0
weight_for_1 = (1 / anomaly) * (total) / 2.0

class_weights = {0: weight_for_0, 1: weight_for_1}  # Create a dictionary

print("Weight for class 0: {:.2f}".format(weight_for_0))
print("Weight for class 1: {:.2f}".format(weight_for_1))

import torch

class_weights_tensor = torch.tensor(list(class_weights.values()), dtype=torch.float32)
#class_weights = (1/train_df['Label'].value_counts(normalize=True).sort_index()).tolist()
#class_weights = torch.tensor(class_weights).to(device)  # Move class weights to the GPU
#class_weights = class_weights / class_weights.sum()
#class_weights

Weight for class 0: 0.56
Weight for class 1: 4.43


##### Load the tokenizer

In [25]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [27]:
# Set a padding token 
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Using eos_token as pad_token

##### Data preprocessing function

In [30]:
def data_preprocessing(batch):
    # Tokenize the logs and include the labels
    tokenized = tokenizer(batch['Log'], truncation=True, max_length=512, padding=True)
    tokenized['labels'] = batch['Label']  # Add the labels to the tokenized output
    return tokenized

##### Tokenize the dataset

In [33]:
tokenized_data = dataset.map(data_preprocessing, batched=True, remove_columns=['Log', 'Label'])
tokenized_data.set_format("torch")

Map:   0%|          | 0/15529 [00:00<?, ? examples/s]

Map:   0%|          | 0/3883 [00:00<?, ? examples/s]

##### Data collator

In [36]:
from transformers import DataCollatorWithPadding

collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

##### Load the model

In [39]:
# Make sure the model's configuration also recognizes the pad token
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", 
    num_labels=2,
    pad_token_id=tokenizer.pad_token_id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


##### Define a custom Trainer class to incorporate class weights

In [42]:
from transformers import Trainer
import torch.nn as nn

class CustomTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # Forward pass
        labels = inputs.get("labels").to(device)  # Move labels to GPU
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Calculate the weighted loss
        loss_fct = nn.CrossEntropyLoss(weight=self.class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

##### Evaluation metrics

In [45]:
import evaluate

accuracy = evaluate.load("accuracy")

import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

##### Define training arguments

In [48]:
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=8,   # batch size per device during training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                 # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)



##### Initialize custom Trainer

In [62]:
trainer = CustomTrainer(
    model=model.to(device),
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['test'],  
    tokenizer=tokenizer,
    data_collator=collate_fn,
    compute_metrics = compute_metrics,
    class_weights=class_weights_tensor.to(device)
)

  super().__init__(*args, **kwargs)


##### Start the training

In [64]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6627,0.217304,0.94257


TrainOutput(global_step=1942, training_loss=0.6529541709733673, metrics={'train_runtime': 100.4602, 'train_samples_per_second': 154.579, 'train_steps_per_second': 19.331, 'total_flos': 299278522765896.0, 'train_loss': 0.6529541709733673, 'epoch': 1.0})

##### Save the model

In [66]:
trainer.save_model('./transformer_model')

##### Prediction

In [69]:
from transformers import pipeline

classifier = pipeline(
    "text-classification",  
    model="./transformer_model",
    tokenizer="./transformer_model",
    device_map="cuda", #cuda for gpt
)


In [81]:
# This is supposed to be a Normal log
text = ['INFO dfs.FSNamesystem: BLOCK* NameSystem.addStoredBlock: blockMap updated: 10.251.66.102:50010 is added to size 67108864']

predictions = classifier(text)
print(predictions)

[{'label': 'LABEL_0', 'score': 0.9820976853370667}]


In [83]:
# This is supposed to be an Anomaly log
text = ['INFO dfs.FSNamesystem: BLOCK* NameSystem.allocateBlock: /user/root/sortrand/_temporary/_task_200811092030_0002_r_000074_2/part-00074.']

predictions = classifier(text)
print(predictions)

[{'label': 'LABEL_0', 'score': 0.9505198001861572}]


##### Although the accuracy for transformer is high, it does not seemed to work well on unseen data
