In [None]:
#!pip install accelerate -U

## Import Libraries ##

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F

## Load Data ##

In [None]:
# Load the Data
df = pd.read_csv('HDFS_sequence.csv', sep=',', quotechar='"', names=["text", "label"])
df = df[1:]
df

## Feature Extraction ##

In [4]:
# Feature extraction
X = list(df['text'])
y = list(df['label'])

In [5]:
# Get dummies(mapping)
y = pd.get_dummies(y, drop_first=True)['Normal']
y = y.astype(int)

In [6]:
#Make Anomalous as 1 and Normal as 0
for x in range(len(y)):
  if y[x] == 0:
    y[x] = 1
  else:
    y[x] = 0

## Train-test split ##

In [7]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

## Load the Model ##

In [None]:
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

## Get the Encodings ##

In [9]:
train_encodings = tokenizer(X_train, truncation=True, padding=True)
test_encodings = tokenizer(X_test, truncation=True, padding=True)

## Get DataSets ##

In [10]:
# Create datasets
class CustomDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        input_ids = self.inputs['input_ids'][idx]
        attention_mask = self.inputs['attention_mask'][idx]
        label = torch.tensor(self.labels.iloc[idx])

        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'label': label}

# Create an instance of the custom dataset
dataset_train = CustomDataset(train_encodings, y_train)
dataset_test = CustomDataset(test_encodings, y_test)

## Train ##

In [11]:
# Training Arguments
training_args = TrainingArguments(
    output_dir="./bert_base_model",
    evaluation_strategy="steps",
    eval_steps=100,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    save_steps=1000,
    save_total_limit=2,
    num_train_epochs=10,
    logging_dir="./logs",
)

In [12]:
# Trainer
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=dataset_train,         # training dataset
    eval_dataset=dataset_test             # evaluation dataset
)

trainer.train()

Step,Training Loss,Validation Loss
100,No log,0.187298
200,No log,0.178088


Step,Training Loss,Validation Loss
100,No log,0.187298
200,No log,0.178088
300,No log,0.167678
400,No log,0.053821
500,0.077300,0.005118
600,0.077300,0.000163
700,0.077300,0.000105
800,0.077300,9.4e-05


TrainOutput(global_step=800, training_loss=0.048560485057532785, metrics={'train_runtime': 1562.5067, 'train_samples_per_second': 1.024, 'train_steps_per_second': 0.512, 'total_flos': 27133327584000.0, 'train_loss': 0.048560485057532785, 'epoch': 10.0})

## Save the Model ##

In [13]:
model.save_pretrained("./fine_tuned_bert_model_for_HDFS")
tokenizer.save_pretrained("./fine_tuned_bert_model_for_HDFS")

('./fine_tuned_bert_model_for_HDFS/tokenizer_config.json',
 './fine_tuned_bert_model_for_HDFS/special_tokens_map.json',
 './fine_tuned_bert_model_for_HDFS/vocab.txt',
 './fine_tuned_bert_model_for_HDFS/added_tokens.json')

## Prediction ##

In [19]:
model_path = "./fine_tuned_bert_model_for_HDFS"
model_saved = BertForSequenceClassification.from_pretrained(model_path)
tokenizer_saved = BertTokenizer.from_pretrained(model_path)

test_encodings = tokenizer_saved(X_test, truncation=True, padding=True, return_tensors="pt")
#already have test_encodings
predictions = model_saved(**test_encodings)
logits = predictions.logits

# Apply softmax to get probabilities
probabilities = F.softmax(logits, dim=1)

# Get the predicted label (0 or 1)
predicted_labels = torch.argmax(probabilities, dim=1)
predicted_labels = predicted_labels.tolist()
# Print the predicted labels
print(predicted_labels)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


## Evaluation ##

In [None]:
y_test_labels = y_test.tolist()
# Create confusion matrix
cm = confusion_matrix(y_test_labels, predicted_labels)
# Calculate metrics
accuracy = accuracy_score(y_test_labels, predicted_labels)
precision = precision_score(y_test_labels, predicted_labels)
recall = recall_score(y_test_labels, predicted_labels)
f1 = f1_score(y_test_labels, predicted_labels)
print("Confusion Matrix:")
print(cm)
print("\nAccuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)