# Classification with BERT

In [1]:
MODEL = 'google-bert/bert-base-uncased'
TOKEN_LENGTH = 512

In [2]:
!pip install transformers datasets scikit-learn --quiet

In [14]:
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, pipeline
from sklearn.metrics import accuracy_score, classification_report
import torch
torch.cuda.empty_cache()
import gc

# model.cpu()
gc.collect()
torch.cuda.empty_cache()

## 1. Read Data

In [4]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
df_val = pd.read_csv("val.csv")

# Convert to Huggingface Dataset
dataset_train = Dataset.from_pandas(df_train)
dataset_val = Dataset.from_pandas(df_val)
dataset_test = Dataset.from_pandas(df_test)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=TOKEN_LENGTH)

dataset_train = dataset_train.map(tokenize_function, batched=True)
dataset_val = dataset_val.map(tokenize_function, batched=True)

# Set format for PyTorch
dataset_train.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
dataset_val.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/225 [00:00<?, ? examples/s]

## 2. Train Model

In [5]:

# Load pre-trained BERT model
num_classes = df_train['label'].nunique()  # Number of classes in the dataset
model = BertForSequenceClassification.from_pretrained(MODEL, num_labels=num_classes)

# Define TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


# Train the model
trainer.train()
# Evaluate the model
trainer.evaluate()
# Save the model
trainer.save_model('./bert')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.6308,1.55666,0.342222
2,1.4947,1.513391,0.368889
3,1.4406,1.497809,0.364444


## 3. Evaluate

In [6]:
df_test = pd.read_csv('test.csv')

In [10]:
classifier_zero_shot = pipeline('zero-shot-classification', model=MODEL, truncation=True, device=0)
label_mapping = {0: "1 star review", 1: "2 star review", 2: "3 star review", 3: "4 star review", 4: "5 star review"}
df_zero_shot = df_test.copy()
df_zero_shot['predictions'] = df_zero_shot['text'].apply(lambda x: classifier_zero_shot(x, candidate_labels=list(label_mapping.keys()))["labels"][0])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [11]:
classifier_fine_tuned = pipeline('text-classification', model="./bert", truncation=True, device=0)
df_fine_tuned = df_test.copy()
df_fine_tuned['predictions'] = df_fine_tuned['text'].apply(lambda x: classifier_fine_tuned(x)[0]['label'])

In [17]:
def get_performance_metrics(df_test):
  y_test = df_test.label.round()
  y_pred = df_test.predictions.round()

  print("\nClassification Report:")
  print(classification_report(y_test, y_pred))

  # print("Balanced Accuracy Score:", balanced_accuracy_score(y_test, y_pred))
  # print("Accuracy Score:", accuracy_score(y_test, y_pred))

In [18]:
# performance using off-the shelf model
get_performance_metrics(df_zero_shot)


Classification Report:
              precision    recall  f1-score   support

           0       0.13      0.36      0.19        96
           1       0.20      0.01      0.02       108
           2       0.07      0.01      0.02       100
           3       0.24      0.09      0.13       115
           4       0.18      0.32      0.23       106

    accuracy                           0.15       525
   macro avg       0.16      0.16      0.12       525
weighted avg       0.17      0.15      0.12       525



In [20]:
df_fine_tuned

Unnamed: 0,label,text,id,predictions
0,3,Its not a very helpful place but It was clean....,29178,LABEL_2
1,3,After the drive up from Phoenix for March Madn...,360024,LABEL_0
2,3,"Unlike my boyfriend Darren W, I'm not much of ...",3537,LABEL_4
3,0,The one at Alexis Nihon is really the worst. \...,625413,LABEL_0
4,3,I have no idea what this place is like at nigh...,485401,LABEL_2
...,...,...,...,...
520,3,One of the better dining experiences you'll ha...,522495,LABEL_4
521,0,"For the past few years, my employees and I att...",137395,LABEL_0
522,4,A damn good Popsicle. You can not go wrong wit...,601079,LABEL_0
523,1,"Love carne Asada fries,but when the fries are ...",555537,LABEL_4


In [19]:
# performance of a fine-tuned model
get_performance_metrics(df_fine_tuned)


Classification Report:


ValueError: Mix of label input types (string and number)