In [53]:
#!pip install accelerate -U

## Import Libraries ##

In [54]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import torch.nn.functional as F

## Load Data ##

In [55]:
# Read the HDFS sequence CSV, naming its data columns "text" and "label".
csv_path = '/content/HDFS_sequence.csv'
df = pd.read_csv(csv_path, names=["text", "label"], sep=',', quotechar='"')
# Drop the first parsed row (presumably the file's own header line, which is
# read as data because explicit `names` are supplied -- confirm against the file).
df = df[1:]
df

Unnamed: 0,text,label
1,"{'E26', 'E11', 'E9', 'E22', 'E5'}",Normal
2,"{'E2', 'E26', 'E11', 'E9', 'E22', 'E5'}",Normal
3,"{'E26', 'E11', 'E9', 'E22', 'E5'}",Normal
4,"{'E2', 'E26', 'E11', 'E9', 'E22', 'E5'}",Normal
5,"{'E26', 'E11', 'E9', 'E22', 'E5'}",Normal
...,...,...
4037,"{'E26', 'E11', 'E9', 'E22', 'E5'}",Anomaly
4038,"{'E2', 'E26', 'E11', 'E9', 'E22', 'E5'}",Normal
4039,"{'E26', 'E11', 'E9', 'E22', 'E5'}",Normal
4040,"{'E26', 'E11', 'E9', 'E22', 'E5'}",Normal


## Feature Extraction ##

In [56]:
# Pull the model inputs (event sequences) and the targets (labels) out of the
# frame as plain Python lists.
X = df['text'].tolist()
y = df['label'].tolist()

In [57]:
# One-hot encode the label list (first category dropped) and keep only the
# indicator column for the 'Normal' class.
label_indicators = pd.get_dummies(y, drop_first=True)
y = label_indicators['Normal']

In [58]:
# Encode the target as 1 = anomaly (any label other than 'Normal'), 0 = normal.
#
# This is derived straight from df['label'] instead of flipping the current `y`
# element by element, because:
#   * re-running the cell no longer inverts the labels a second time,
#   * the result is always int64 (writing ints one by one into a boolean
#     indicator Series upcasts it to object on recent pandas, which
#     torch.tensor(..., dtype=torch.long) cannot convert),
#   * it is a single vectorised operation rather than a Python-level loop.
# reset_index(drop=True) gives the 0..n-1 index that positional lookups expect.
y = df['label'].ne('Normal').astype('int64').reset_index(drop=True)


## Train-test split ##

In [59]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

## Access GPU ##

In [60]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## Load the Model ##

In [61]:
# The tokenizer and the classifier are both loaded from the same checkpoint.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)
# Place the model's weights on the selected device.
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Get the Encodings ##

In [64]:
# Tokenize both splits with identical settings: truncate over-long inputs,
# pad within each split, and return PyTorch tensors.
encode_options = dict(truncation=True, padding=True, return_tensors="pt")
train_encodings = tokenizer(X_train, **encode_options)
test_encodings = tokenizer(X_test, **encode_options)

## Move Data to GPU ##

In [65]:
# Put every encoding tensor on `device`, rebinding each name to a plain dict.
train_encodings = {
    field: tensor.to(device) for field, tensor in train_encodings.items()
}
test_encodings = {
    field: tensor.to(device) for field, tensor in test_encodings.items()
}
# Labels become int64 ("long") tensors created directly on the same device.
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.long, device=device)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.long, device=device)


## Create Datasets ##

In [66]:
# Create datasets
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    `inputs` must support ``inputs['input_ids'][i]`` and
    ``inputs['attention_mask'][i]``; `labels` must support ``len()`` and
    integer indexing.
    """

    # Encoding fields copied into every sample, in output order.
    _FIELDS = ('input_ids', 'attention_mask')

    def __init__(self, inputs, labels):
        self.labels = labels
        self.inputs = inputs

    def __len__(self):
        # The number of samples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict with input_ids, attention_mask and label."""
        sample = {field: self.inputs[field][idx] for field in self._FIELDS}
        sample['label'] = self.labels[idx]
        return sample

# Wrap each split's encodings and label tensor in a CustomDataset.
dataset_train, dataset_test = (
    CustomDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, y_train_tensor),
        (test_encodings, y_test_tensor),
    )
)

## Train ##

In [67]:
# Training Arguments
training_args = TrainingArguments(
    output_dir="./bert_base_model",      # checkpoints are written here
    evaluation_strategy="steps",         # evaluate on a step schedule ...
    eval_steps=100,                      # ... every 100 steps
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    save_steps=1000,                     # checkpoint every 1000 steps
    save_total_limit=2,                  # keep at most two checkpoints on disk
    num_train_epochs=10,
    logging_dir="./logs",
    report_to="tensorboard",
    # The datasets passed to the Trainer hold tensors that were already moved
    # to `device`. The Trainer's DataLoader pins host memory by default, and
    # pinning a CUDA tensor raises "cannot pin ... only dense CPU tensors can
    # be pinned", so pinning has to be switched off for training on a GPU.
    dataloader_pin_memory=False,
)

In [None]:
# Collect the Trainer's inputs first, then build it and start fine-tuning.
trainer_kwargs = dict(
    model=model,                    # the instantiated Transformers model to be trained
    args=training_args,             # training arguments, defined above
    train_dataset=dataset_train,    # training dataset
    eval_dataset=dataset_test,      # evaluation dataset
)
trainer = Trainer(**trainer_kwargs)

trainer.train()

## Save the Model ##

In [None]:
# Write the model and its tokenizer to the same directory so they can be
# reloaded together.
save_dir = "./fine_tuned_bert_model_for_HDFS"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

## Prediction ##

In [None]:
# Reload the tokenizer and classifier from the directory written by the save step.
model_path = "./fine_tuned_bert_model_for_HDFS"
tokenizer_saved = BertTokenizer.from_pretrained(model_path)
model_saved = BertForSequenceClassification.from_pretrained(model_path)
model_saved = model_saved.to(device)

In [None]:
# Tokenize the held-out texts with the reloaded tokenizer and move them to `device`.
test_encodings = tokenizer_saved(X_test, truncation=True, padding=True, return_tensors="pt").to(device)

# Inference only: make sure dropout is disabled and do not record the autograd
# graph. Without no_grad() the forward pass over the whole test set keeps every
# intermediate activation alive for a backward pass that never happens.
model_saved.eval()
with torch.no_grad():
    predictions = model_saved(**test_encodings)
logits = predictions.logits

# Apply softmax to get probabilities
probabilities = F.softmax(logits, dim=1)

# Get the predicted label (0 or 1)
predicted_labels = torch.argmax(probabilities, dim=1)
predicted_labels = predicted_labels.tolist()
# Print the predicted labels
print(predicted_labels)

## Evaluation ##

In [None]:
# Ground-truth labels of the held-out split as a plain list.
y_test_labels = y_test.tolist()

# Confusion matrix of true vs. predicted labels.
cm = confusion_matrix(y_test_labels, predicted_labels)

# Score the same (truth, prediction) pair with each metric in turn.
accuracy, precision, recall, f1 = (
    score_fn(y_test_labels, predicted_labels)
    for score_fn in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the results.
print("Confusion Matrix:", cm, sep="\n")
print("\nAccuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)