# PART III: CLASSIFICATION (fine-tuning Bert model)

## 1. Setting up for the fine-tuning.

In [1]:
# Load nessesary libraries.
import pandas as pd
import torch
from torch import nn
from transformers import BertForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support
from datasets import DatasetDict, Dataset
import plotly.figure_factory as ff
import warnings
warnings.filterwarnings('ignore')

  "class": algorithms.Blowfish,


In [2]:
# Download a compatible Pytorch version.
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Check whether CUDA is accessible.
cuda_available = torch.cuda.is_available()
cuda_device= torch.cuda.get_device_name(0)

if cuda_available == True:
    print('CUDA was successfully installed and compiled on my device.')
    print('CUDA device name is:', cuda_device)
else:
    print('Cuda in not available')

CUDA was successfully installed and compiled on my device.
CUDA device name is: NVIDIA GeForce GTX 1650


## 2 Preprocessing

In [3]:
# Import finalized dataset as pandas data frame.
df = pd.read_csv('data_jobads_final.csv', index_col=None)

# Apply the final touch.
df['job_description'] = df['job_description'].str.replace('\n', ' ')
df = df.dropna()
df = df.iloc[:,-2:]
df = df.reset_index(drop=True)

df.head(2)

Unnamed: 0,job_description,label
0,silver stream healthcare group offer great emp...,registered_nurse
1,create a better future for yourself recruitne...,registered_nurse


In [4]:
# Check the 'label' column for unique values.
labels = df['label'].unique().tolist()

print(labels)

['registered_nurse', 'electrician', 'data_analyst']


In [5]:
# Create 'id2label', 'label2id' variables for mapping the labels.
num_labels = len(labels)
id2label = {id:label for id, label in enumerate(labels)}
label2id = {label: id for id, label in enumerate(labels)}

print(id2label)
print(label2id)

{0: 'registered_nurse', 1: 'electrician', 2: 'data_analyst'}
{'registered_nurse': 0, 'electrician': 1, 'data_analyst': 2}


In [6]:
# Encode the 'label' column.
df['label_encoded'] = df.label.map(lambda x: label2id[x.strip()])

df.head(2)

Unnamed: 0,job_description,label,label_encoded
0,silver stream healthcare group offer great emp...,registered_nurse,0
1,create a better future for yourself recruitne...,registered_nurse,0


In [7]:
# Examine the distribution and value counts for the 'label_encoded' column.
print("the proportion of total label ID:".upper())
print(df['label_encoded'].value_counts(normalize=True).sort_index(), '\n')

print("the count of total label ID:".upper())
print(df['label_encoded'].value_counts())

THE PROPORTION OF TOTAL LABEL ID:
0    0.552316
1    0.125214
2    0.322470
Name: label_encoded, dtype: float64 

THE COUNT OF TOTAL LABEL ID:
0    644
2    376
1    146
Name: label_encoded, dtype: int64


## 3 Fine-tuning

In [8]:
# Split the DataFrame into training and testing sets and maintain label proportions.
train, validation_test = train_test_split(df, test_size=0.3, random_state=820, stratify=df['label'])
test, validation = train_test_split(validation_test, test_size=0.5, random_state=820, stratify=validation_test['label'])

print('TOTAL shape:', df.shape)
print('TRAINING shape:', train.shape)
print('VALIDATION shape:', validation.shape)
print('TEST shape:', test.shape)

TOTAL shape: (1166, 3)
TRAINING shape: (816, 3)
VALIDATION shape: (175, 3)
TEST shape: (175, 3)


In [9]:
# Convert each set to Dataset format.
train_dataset = Dataset.from_pandas(train)
val_dataset = Dataset.from_pandas(validation)
test_dataset = Dataset.from_pandas(test)

# Remove '__index_level_0__' feature.
train_dataset = train_dataset.remove_columns('__index_level_0__')
val_dataset = val_dataset.remove_columns('__index_level_0__')
test_dataset = test_dataset.remove_columns('__index_level_0__')

# Create DatasetDict variable.
jobads = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test' : test_dataset
    })

In [10]:
# Load a tokenizer from the 'bert-base-uncased' pretrained model.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [11]:
# Define a custom tokenization function for long and short text.
def custom_tokenize(batch, strategy="default", max_length=512):
    
    tokenized_outputs = {'input_ids': [], 'attention_mask': []}

    for text in batch['job_description']:
        # Tokenize using the default strategy if the text is shorter than the maximum length.
        if strategy == "default":
            inputs = tokenizer.encode_plus(text, max_length=max_length, truncation=True, padding="max_length", return_tensors='pt')
        # Tokenize using the default strategy if the text is longer than the maximum length.
        elif strategy == "head-tail":
            tokens = tokenizer.tokenize(text)

            head_length = int((max_length - 3) * 0.6)
            tail_length = (max_length - 3) - head_length
            # If the text is longer than the specified maximum length, split it into head and tail parts.
            if len(tokens) > max_length - 2:
                head_tokens = tokens[:head_length]
                tail_tokens = tokens[-tail_length:]
                input_ids = tokenizer.encode_plus(text=" ".join(head_tokens + tail_tokens), max_length=max_length, truncation=True, padding="max_length", return_tensors='pt')['input_ids']
                attention_mask = tokenizer.encode_plus(text=" ".join(head_tokens + tail_tokens), max_length=max_length, truncation=True, padding="max_length", return_tensors='pt')['attention_mask']
            # If the text is within the maximum length, tokenize it as is.
            else:
                encoded_plus = tokenizer.encode_plus(text, max_length=max_length, truncation=True, padding="max_length", return_tensors='pt')
                input_ids, attention_mask = encoded_plus['input_ids'], encoded_plus['attention_mask']
            inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
        else:

            pass

        # Append tokenized input and attention mask to the outputs.
        tokenized_outputs['input_ids'].append(inputs['input_ids'].squeeze().tolist())
        tokenized_outputs['attention_mask'].append(inputs['attention_mask'].squeeze().tolist())

    return tokenized_outputs

In [12]:
# Define a costum function for tokenization using the "head-tail" strategy.
def tokenize(examples):
    return custom_tokenize(examples, strategy="head-tail")

In [13]:
# Apply the function to the dataset
jobads_encoded = jobads.map(tokenize, batched=True, batch_size=None)

print(jobads_encoded)

Map:   0%|          | 0/816 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1121 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/175 [00:00<?, ? examples/s]

Map:   0%|          | 0/175 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['job_description', 'label', 'label_encoded', 'input_ids', 'attention_mask'],
        num_rows: 816
    })
    validation: Dataset({
        features: ['job_description', 'label', 'label_encoded', 'input_ids', 'attention_mask'],
        num_rows: 175
    })
    test: Dataset({
        features: ['job_description', 'label', 'label_encoded', 'input_ids', 'attention_mask'],
        num_rows: 175
    })
})


In [14]:
# Set the dataset format to PyTorch tensors.
jobads_encoded.set_format('torch', columns=['input_ids', 'attention_mask', 'label_encoded'])

In [15]:
# Calculate class weights.
labels = train['label_encoded'].unique()
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=labels,
                                     y=train['label_encoded'])

# Convert the computed class weights to a PyTorch tensor.
class_weights = torch.from_numpy(class_weights).float()

print(class_weights)

tensor([0.6031, 2.6667, 1.0342])


In [16]:
# Finalise the dataset.
jobads_encoded = jobads_encoded.rename_column('label_encoded', 'labels')

print(jobads_encoded)

DatasetDict({
    train: Dataset({
        features: ['job_description', 'label', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 816
    })
    validation: Dataset({
        features: ['job_description', 'label', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 175
    })
    test: Dataset({
        features: ['job_description', 'label', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 175
    })
})


In [17]:
# Define the custom trainer class.
class WeightedLossTrainer(Trainer):
    
    # Override the method for loss computation.
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        logits = outputs.get('logits')
        labels = inputs.get('labels')
        
        # Move class weights to device.
        class_weights_device = class_weights.to(model.device)
        
        # Calculate the loss using CrossEntropyLoss function with the computed class weights.
        loss_func = nn.CrossEntropyLoss(weight=class_weights_device)
        loss = loss_func(logits, labels)
        
        # Return a tuple containing loss and outputs if 'return_outputs' is True.
        return (loss, outputs) if return_outputs else loss

In [18]:
# Check if CUDA (GPU) is available and set the device accordingly, otherwise use CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize a BERT-based sequence classification model.
total_labels = 3

model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels=total_labels,
                                                      id2label=id2label,
                                                      label2id=label2id)
# Move the model to device.
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [19]:
# Define a costum function to compute accuracy, F1, precision, and recall for a given set of predictions.
def compute_metrics(pred):
  
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  
  precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
  acc = accuracy_score(labels, preds)
  
  return {'Accuracy': acc,
          'F1': f1,
          'Precision': precision,
          'Recall': recall}

In [20]:
# Define the batch size for training.
batch_size = 16

# Calculate the number of logging steps based on the dataset size and batch size.
logging_steps = len(jobads_encoded['train']) // batch_size

# Specify the directory where the trained model and logs will be saved.
output_dir = 'ft_bert_temuulen'

# Create an instance of TrainingArguments to configure the training process.
training_args = TrainingArguments(output_dir=output_dir,
                                  num_train_epochs=3,
                                  learning_rate=2e-5,
                                  per_device_eval_batch_size=batch_size,
                                  per_gpu_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy='epoch',
                                  logging_steps=logging_steps,
                                  save_strategy='epoch',
                                  fp16=True,
                                  load_best_model_at_end=True)

In [21]:
# Create an instance of the WeightedLossTrainer for training the model.
trainer = WeightedLossTrainer(model=model,
                              args=training_args,
                              train_dataset=jobads_encoded['train'],
                              eval_dataset=jobads_encoded['validation'],
                              tokenizer=tokenizer,
                              compute_metrics= compute_metrics)

In [22]:
# Start the training.
trainer.train()

  0%|          | 0/306 [00:00<?, ?it/s]

{'loss': 0.5984, 'learning_rate': 1.6797385620915034e-05, 'epoch': 0.5}


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


{'loss': 0.1097, 'learning_rate': 1.3529411764705885e-05, 'epoch': 1.0}


  0%|          | 0/11 [00:00<?, ?it/s]

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


{'eval_loss': 0.018117755651474, 'eval_Accuracy': 1.0, 'eval_F1': 1.0, 'eval_Precision': 1.0, 'eval_Recall': 1.0, 'eval_runtime': 62.7705, 'eval_samples_per_second': 2.788, 'eval_steps_per_second': 0.175, 'epoch': 1.0}
{'loss': 0.0253, 'learning_rate': 1.0196078431372549e-05, 'epoch': 1.5}


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


{'loss': 0.0139, 'learning_rate': 6.862745098039216e-06, 'epoch': 2.0}


  0%|          | 0/11 [00:00<?, ?it/s]

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


{'eval_loss': 0.004948759451508522, 'eval_Accuracy': 1.0, 'eval_F1': 1.0, 'eval_Precision': 1.0, 'eval_Recall': 1.0, 'eval_runtime': 63.0869, 'eval_samples_per_second': 2.774, 'eval_steps_per_second': 0.174, 'epoch': 2.0}
{'loss': 0.0081, 'learning_rate': 3.529411764705883e-06, 'epoch': 2.5}


Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


{'loss': 0.0048, 'learning_rate': 1.9607843137254904e-07, 'epoch': 3.0}


  0%|          | 0/11 [00:00<?, ?it/s]

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


{'eval_loss': 0.003226655535399914, 'eval_Accuracy': 1.0, 'eval_F1': 1.0, 'eval_Precision': 1.0, 'eval_Recall': 1.0, 'eval_runtime': 63.145, 'eval_samples_per_second': 2.771, 'eval_steps_per_second': 0.174, 'epoch': 3.0}
{'train_runtime': 2987.2688, 'train_samples_per_second': 0.819, 'train_steps_per_second': 0.102, 'train_loss': 0.12668609974625844, 'epoch': 3.0}


TrainOutput(global_step=306, training_loss=0.12668609974625844, metrics={'train_runtime': 2987.2688, 'train_samples_per_second': 0.819, 'train_steps_per_second': 0.102, 'train_loss': 0.12668609974625844, 'epoch': 3.0})

In [23]:
# Save the fine-tuned BERT model.
trainer.model.save_pretrained('ft_bert_temuulen2')

# Save the tokenizer used for fine-tuning to the 'ft_bert_temuulen3_tokenizer'.
tokenizer.save_pretrained('ft_bert_temuulen_tokenizer2')

('ft_bert_temuulen_tokenizer2\\tokenizer_config.json',
 'ft_bert_temuulen_tokenizer2\\special_tokens_map.json',
 'ft_bert_temuulen_tokenizer2\\vocab.txt',
 'ft_bert_temuulen_tokenizer2\\added_tokens.json',
 'ft_bert_temuulen_tokenizer2\\tokenizer.json')

## 4 Evaluating the fine tuned model.

In [24]:
# Specify the directory paths for the fine-tuned model and tokenizer.
model_path = 'ft_bert_temuulen2'
tokenizer_path = 'ft_bert_temuulen_tokenizer2'

# Load the BERT and the tokenizer.
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

In [25]:
# Load prepared an unseen dataset.
test_dataset = jobads_encoded['test'] 

# Create a Trainer instance.
trainer = Trainer(model=model, tokenizer=tokenizer)
predictions = trainer.predict(test_dataset)

# Convert predictions to numpy for further analysis.
labels = predictions.label_ids
preds = predictions.predictions.argmax(-1)

  0%|          | 0/22 [00:00<?, ?it/s]

In [26]:
# Plot Confusion Matrix.
cm_labels = ['registered nurse', 'electrician', 'data analyst']
cm_matrix = confusion_matrix(labels, preds)
cm_title = "CONFUSION MATRIX: fine-tuned 'bert-base-uncased' model for classification"

fig = ff.create_annotated_heatmap(z=cm_matrix, 
                                  x=cm_labels,
                                  y=cm_labels, 
                                  colorscale='balance', 
                                  showscale=True,
                                  annotation_text=cm_matrix)

fig.update_layout(width=700, 
                  height=700, 
                  title=cm_title, 
                  title_x=0.5,
                  xaxis=dict(title='Predicted Value', side='bottom'), 
                  yaxis_title='True Value')

fig.update_yaxes(tickangle=-90)  
    
fig.show()

# Print detailed classification report.
report = classification_report(labels, preds, output_dict=True)
report_title = "CLASSIFICATION REPORT: fine-tuned 'bert-base-uncased' model for classification"

print(report_title, '\n')
print(classification_report(labels, preds))
print(label2id)

CLASSIFICATION REPORT: fine-tuned 'bert-base-uncased' model for classification 

              precision    recall  f1-score   support

           0       1.00      0.99      0.99        96
           1       1.00      1.00      1.00        22
           2       0.98      1.00      0.99        57

    accuracy                           0.99       175
   macro avg       0.99      1.00      1.00       175
weighted avg       0.99      0.99      0.99       175

{'registered_nurse': 0, 'electrician': 1, 'data_analyst': 2}
