<a href="https://colab.research.google.com/github/tmathema/NLP_series/blob/main/Text_classification_chatbot_arena_kaggle_competition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#pip install datasets
#!pip install evaluate



In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from transformers import pipeline
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score, classification_report, f1_score, recall_score, precision_score
import evaluate

Upload the data

In [4]:
train_data = pd.read_csv('/content/gdrive/MyDrive/NLP_series/text_classification/train.csv')
test_data = pd.read_csv('/content/gdrive/MyDrive/NLP_series/text_classification/test.csv')

Preprocess the data

In [5]:
test_data.head()

Unnamed: 0,id,prompt,response_a,response_b
0,136060,"[""I have three oranges today, I ate an orange ...","[""You have two oranges today.""]","[""You still have three oranges. Eating an oran..."
1,211333,"[""You are a mediator in a heated political deb...","[""Thank you for sharing the details of the sit...","[""Mr Reddy and Ms Blue both have valid points ..."
2,1233961,"[""How to initialize the classification head wh...","[""When you want to initialize the classificati...","[""To initialize the classification head when p..."


In [6]:
train_data = train_data.iloc[:10000,]

In [7]:
train_data.head()

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0
1,53567,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0
2,65089,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",0,0,1
3,96401,llama-2-13b-chat,mistral-7b-instruct,"[""How can I create a test set for a very rare ...","[""Creating a test set for a very rare category...","[""When building a classifier for a very rare c...",1,0,0
4,198779,koala-13b,gpt-3.5-turbo-0314,"[""What is the best way to travel from Tel-Aviv...","[""The best way to travel from Tel Aviv to Jeru...","[""The best way to travel from Tel-Aviv to Jeru...",0,1,0


In [8]:
def labeling(df):
  if df['winner_tie']==1:
    return 0
  if df['winner_model_a']==1:
    return 1
  elif df['winner_model_b']==1:
    return 2
  else:
    return np.NaN

train_data['label'] = train_data.apply(labeling, axis=1)

In [9]:
train_data['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,3503
2,3408
0,3089


Train validation split

In [10]:
X_train, X_val, y_train, y_val = train_test_split(train_data.drop(['winner_model_a', 'id', 'model_a', 'model_b', 'winner_model_b', 'winner_tie'], axis=1),
                                    train_data['label'],  random_state=1, test_size=0.25,  shuffle=True)

In [11]:
X_train.shape

(7500, 4)

In [12]:
X_val.shape

(2500, 4)

In [13]:
y_train.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
2,2600
1,2598
0,2302


In [14]:
y_val.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,905
2,808
0,787


In [15]:
def tokenize_batch(batch):
    text_a = [f"Prompt: {p} Response: {r}" for p, r in zip(batch["prompt"], batch["response_a"])]
    text_b = [f"Prompt: {p} Response: {r}" for p, r in zip(batch["prompt"], batch["response_b"])]

    return text_a, text_b


In [16]:
text_a, text_b = tokenize_batch(X_train.to_dict(orient="list"))
X_train["text_a"] = text_a
X_train["text_b"] = text_b


In [17]:
text_a, text_b = tokenize_batch(X_val.to_dict(orient="list"))
X_val["text_a"] = text_a
X_val["text_b"] = text_b

In [18]:
X_train.iloc[0]

Unnamed: 0,651
prompt,"[""Convert this into complex legalese:\n\""Hey w..."
response_a,"[""Greetings and Salutations,\n\nI hereby exten..."
response_b,"[""I am sorry, but the request to convert \""Hey..."
label,1
text_a,"Prompt: [""Convert this into complex legalese:\..."
text_b,"Prompt: [""Convert this into complex legalese:\..."


In [19]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(examples):
    return tokenizer(
        examples['text_a'],
        examples['text_b'],
        truncation=True,
        padding="max_length"
    )

# ✅ Use apply with axis=1 to process rows
tokenized_train = X_train.apply(tokenize_function, axis=1)
tokenized_val = X_val.apply(tokenize_function, axis=1)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [20]:

# Extract everything into lists
input_ids = [example.input_ids for example in tokenized_val]
attention_mask = [example.attention_mask for example in tokenized_val]
labels = y_val

# Create a single Dataset object
val_dataset = Dataset.from_dict({
    'input_ids': input_ids,
    'attention_mask': attention_mask,
    'labels': labels
})



In [22]:
# Extract everything into lists
input_ids = [example.input_ids for example in tokenized_train]
attention_mask = [example.attention_mask for example in tokenized_train]
labels = y_train
# Create a single Dataset object
train_dataset = Dataset.from_dict({
    'input_ids': input_ids,
    'attention_mask': attention_mask,
    'labels': labels
})

In [23]:
#dynamic padding
data_collator = DataCollatorWithPadding(tokenizer)

Define the model

In [24]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Initialise the trainer

In [25]:
#you can also define some training arguments as parameters
training_args = TrainingArguments("test-trainer")

Launch the trainer

In [26]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

  trainer = Trainer(


Train the model

In [27]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mtmathema2017[0m ([33mtmathema2017-university-of-cape-town[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,1.1201
1000,1.1065
1500,1.1026
2000,1.1012
2500,1.0989


TrainOutput(global_step=2814, training_loss=1.1048159514578335, metrics={'train_runtime': 1003.2996, 'train_samples_per_second': 22.426, 'train_steps_per_second': 2.805, 'total_flos': 5920051898880000.0, 'train_loss': 1.1048159514578335, 'epoch': 3.0})

Model validation

In [28]:
predictions = trainer.predict(val_dataset)
print(predictions.predictions.shape, predictions.label_ids.shape)

(2500, 3) (2500,)


In [29]:
preds = np.argmax(predictions.predictions, axis=-1)

In [30]:
print("Accuracy:", accuracy_score(y_val, preds))
print(classification_report(y_val, preds))


Accuracy: 0.3772
              precision    recall  f1-score   support

           0       0.38      0.51      0.43       787
           1       0.40      0.42      0.41       905
           2       0.34      0.21      0.26       808

    accuracy                           0.38      2500
   macro avg       0.37      0.38      0.37      2500
weighted avg       0.37      0.38      0.37      2500



In [34]:
f1_score(y_val, preds, average = 'macro')

0.3653851061939306

In [36]:
precision_score(y_val, preds, average = 'macro')

0.3705363204058207

With defined training arguments

In [37]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)




In [42]:
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
        "f1": f1.compute(predictions=predictions, references=labels, average='macro')["f1"],
    }


In [43]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [44]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,1.090985,0.3832,0.278425
2,1.088300,1.103709,0.3524,0.275667


KeyboardInterrupt: 