# Classification with BERT

In [1]:
MODEL = 'google-bert/bert-base-uncased'
TOKEN_LENGTH = 512

In [2]:
!pip install --upgrade "transformers==4.43.2" accelerate datasets scikit-learn --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
autogluon-multimodal 1.1.1 requires nvidia-ml-py3==7.352.0, which is not installed.
autogluon-multimodal 1.1.1 requires accelerate<0.22.0,>=0.21.0, but you have accelerate 1.1.1 which is incompatible.
autogluon-multimodal 1.1.1 requires jsonschema<4.22,>=4.18, but you have jsonschema 4.23.0 which is incompatible.
autogluon-multimodal 1.1.1 requires omegaconf<2.3.0,>=2.1.1, but you have omegaconf 2.3.0 which is incompatible.
autogluon-multimodal 1.1.1 requires scikit-learn<1.4.1,>=1.3.0, but you have scikit-learn 1.5.2 which is incompatible.
autogluon-multimodal 1.1.1 requires scipy<1.13,>=1.5.4, but you have scipy 1.14.1 which is incompatible.
autogluon-multimodal 1.1.1 requires transformers[sentencepiece]<4.41.0,>=4.38.0, but you have transformers 4.43.2 which is incompatible.
autogluon-timeseries 1.1.1 requ

In [3]:
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, pipeline
from sklearn.metrics import accuracy_score, classification_report
import torch

2024-11-17 13:07:59.045658: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-17 13:07:59.061804: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-17 13:07:59.066820: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-17 13:07:59.078620: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 1. Read Data

In [4]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
df_val = pd.read_csv("val.csv")

# Convert to Huggingface Dataset
dataset_train = Dataset.from_pandas(df_train)
dataset_val = Dataset.from_pandas(df_val)
dataset_test = Dataset.from_pandas(df_test)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=TOKEN_LENGTH)

dataset_train = dataset_train.map(tokenize_function, batched=True)
dataset_val = dataset_val.map(tokenize_function, batched=True)

# Set format for PyTorch
dataset_train.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
dataset_val.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


Map:   0%|          | 0/250 [00:00<?, ? examples/s]

Map:   0%|          | 0/225 [00:00<?, ? examples/s]

## 2. Train Model

In [5]:

# Load pre-trained BERT model
num_classes = df_train['label'].nunique()  # Number of classes in the dataset
model = BertForSequenceClassification.from_pretrained(MODEL, num_labels=num_classes)

# Define TrainingArguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


# Train the model
trainer.train()
# Evaluate the model
trainer.evaluate()
# Save the model
trainer.save_model('./bert')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.5921,1.552612,0.28
2,1.4687,1.488468,0.422222
3,1.3992,1.459183,0.435556


## 3. Evaluate

In [6]:
df_test = pd.read_csv('test.csv')

In [7]:
classifier_zero_shot = pipeline('zero-shot-classification', model=MODEL, truncation=True, device=0)
label_mapping = {0: "1 star review", 1: "2 star review", 2: "3 star review", 3: "4 star review", 4: "5 star review"}
df_zero_shot = df_test.copy()
df_zero_shot['predictions'] = df_zero_shot['text'].apply(lambda x: classifier_zero_shot(x, candidate_labels=list(label_mapping.keys()))["labels"][0])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [8]:
classifier_fine_tuned = pipeline('text-classification', model="./bert", truncation=True, device=0)
df_fine_tuned = df_test.copy()
df_fine_tuned['predictions'] = df_fine_tuned['text'].apply(lambda x: classifier_fine_tuned(x)[0]['label'])
df_fine_tuned['predictions'] = df_fine_tuned['predictions'].str.replace('LABEL_', '', regex=False).astype(int)


In [13]:
def get_performance_metrics(df_test):
  y_test = df_test.label.round()
  y_pred = df_test.predictions.round()

  print("\nClassification Report:")
  print(classification_report(y_test, y_pred))


In [14]:
# performance using off-the shelf model
get_performance_metrics(df_zero_shot)


Classification Report:
              precision    recall  f1-score   support

           0       0.10      0.05      0.07        96
           1       0.21      0.40      0.28       108
           2       0.20      0.25      0.22       100
           3       0.18      0.11      0.14       115
           4       0.15      0.10      0.12       106

    accuracy                           0.18       525
   macro avg       0.17      0.18      0.17       525
weighted avg       0.17      0.18      0.17       525



In [19]:
# performance of a fine-tuned model
get_performance_metrics(df_fine_tuned)


Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.51      0.52        96
           1       0.27      0.16      0.20       108
           2       0.33      0.52      0.40       100
           3       0.53      0.09      0.15       115
           4       0.43      0.77      0.55       106

    accuracy                           0.40       525
   macro avg       0.42      0.41      0.36       525
weighted avg       0.42      0.40      0.36       525

