In [1]:
#!pip install accelerate -U

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

In [33]:
# Load the Data
df = pd.read_csv('/content/data.csv', sep=',', quotechar='"', names=["text", "label"])
df = df[1:201]
df

Unnamed: 0,text,label
1,"{'E26', 'E11', 'E9', 'E22', 'E5'}",Normal
2,"{'E2', 'E26', 'E11', 'E9', 'E22', 'E5'}",Normal
3,"{'E26', 'E11', 'E9', 'E22', 'E5'}",Normal
4,"{'E2', 'E26', 'E11', 'E9', 'E22', 'E5'}",Normal
5,"{'E26', 'E11', 'E9', 'E22', 'E5'}",Normal
...,...,...
196,"{'E26', 'E11', 'E9', 'E22', 'E5'}",Normal
197,"{'E26', 'E11', 'E9', 'E22', 'E5'}",Normal
198,"{'E26', 'E11', 'E9', 'E22', 'E5'}",Normal
199,"{'E2', 'E26', 'E11', 'E9', 'E22', 'E5'}",Normal


In [14]:
# Feature extraction
X = list(df['text'])
y = list(df['label'])

In [15]:
# Get dummies(mapping)
y = pd.get_dummies(y, drop_first=True)['Normal']

In [16]:
for x in range(len(y)):
  if y[x] == 0:
    y[x] = 1
  else:
    y[x] = 0


In [17]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [19]:
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
train_encodings = tokenizer(X_train, truncation=True, padding=True)
test_encodings = tokenizer(X_test, truncation=True, padding=True)

In [21]:
# Create datasets
class CustomDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        input_ids = self.inputs['input_ids'][idx]
        attention_mask = self.inputs['attention_mask'][idx]
        label = torch.tensor(self.labels.iloc[idx])

        return {'input_ids': input_ids, 'attention_mask': attention_mask, 'label': label}

# Create an instance of the custom dataset
dataset_train = CustomDataset(train_encodings, y_train)
dataset_test = CustomDataset(test_encodings, y_test)


In [None]:
## Train the model ##

In [22]:
# Training Arguments
training_args = TrainingArguments(
    output_dir="./bert_base_model",
    evaluation_strategy="steps",
    eval_steps=100,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    save_steps=1000,
    save_total_limit=2,
    num_train_epochs=10,
    logging_dir="./logs",
)

In [24]:
# Trainer
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=dataset_train,         # training dataset
    eval_dataset=dataset_test             # evaluation dataset
)

trainer.train()

Step,Training Loss,Validation Loss
100,No log,0.000948
200,No log,4e-06
300,No log,3e-06
400,No log,2e-06
500,0.008400,2e-06
600,0.008400,2e-06
700,0.008400,2e-06
800,0.008400,2e-06


TrainOutput(global_step=800, training_loss=0.005259044244230609, metrics={'train_runtime': 1777.0463, 'train_samples_per_second': 0.9, 'train_steps_per_second': 0.45, 'total_flos': 27133327584000.0, 'train_loss': 0.005259044244230609, 'epoch': 10.0})

In [None]:
## Evaluation ##

In [30]:
# Evaluate the model on the test data
results = trainer.evaluate()

# Print all keys and their values
for key, value in results.items():
    print(f'{key}: {value}')

# Find the key for accuracy
accuracy_key = [key for key in results.keys() if 'accuracy' in key.lower()]
if accuracy_key:
    accuracy = results[accuracy_key[0]]
    print(f'{accuracy_key[0]}: {accuracy * 100:.2f}%')
else:
    print("Accuracy key not found.")


eval_loss: 1.9103267732134555e-06
eval_runtime: 14.6982
eval_samples_per_second: 2.721
eval_steps_per_second: 1.361
epoch: 10.0
Accuracy key not found.


## Testing on Test dataset ##

In [None]:
output = trainer.predict(dataset_test)
print(output.predictions)

In [32]:
import torch.nn.functional as F

#predictions = F.softmax(torch.tensor(output.predictions), dim=1)
predictions = torch.argmax(F.softmax(torch.tensor(output.predictions), dim=1), dim=1)
# Print the probabilities
print(predictions.tolist())


[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


## Save the model ##

In [None]:
model.save_pretrained("./fine_tuned_bert_model")
tokenizer.save_pretrained("./fine_tuned_bert_model")


## New Data Output Prediction ##

In [43]:
# Load the fine-tuned BERT model and tokenizer
model_path = "./fine_tuned_bert_model"
model_saved = BertForSequenceClassification.from_pretrained(model_path)
tokenizer_saved = BertTokenizer.from_pretrained(model_path)

# New input without label
new_input = ["{'E1','E26','E11','E5'}", "{'E11','E32','E17','E45','E43'}","{'E1','E5','E43'}"]

# Tokenize the input
tokenized_input = tokenizer_saved(new_input, truncation=True, padding=True, return_tensors="pt")

# Forward pass to obtain logits
output = model_saved(**tokenized_input)
logits = output.logits

# Apply softmax to get probabilities
probabilities = F.softmax(logits, dim=1)

# Get the predicted label (0 or 1)
predicted_labels = torch.argmax(probabilities, dim=1)

# Print the predicted labels
print(predicted_labels.tolist())

[0, 0, 1]
