# PART III: CLASSIFICATION

## 0. GENERAL

### 0.1. Load the module

In [46]:
# Load nessesary libraries.
import sys
import time
import torch
import psutil
import gpustat
import platform
import warnings
import pandas as pd
import plotly.figure_factory as ff
from torch import nn
from transformers import Trainer
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import TrainingArguments
from transformers import BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score 
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from datasets import Dataset
from datasets import DatasetDict
warnings.filterwarnings('ignore')

### 0.2. Check the computational environment

In [47]:
# List the software and hardware configurations used for conducting the experiment.
print('WINDOWS VERSION:', platform.platform())
print('PYTHON VERSION:', sys.version)
print('CPU CORE:', psutil.cpu_count(logical=False))
print('CPU SPEED:', psutil.cpu_freq())
print('GPU:', gpustat.new_query().gpus[0].name)
print(f'RAM: {psutil.virtual_memory().total/(1024 ** 3):.2f} GB')
print(f"HARD DRIVE: {psutil.disk_usage('/').total/(1024 ** 3):.2f} GB")

WINDOWS VERSION: Windows-10-10.0.22631-SP0
PYTHON VERSION: 3.11.4 | packaged by Anaconda, Inc. | (main, Jul  5 2023, 13:38:37) [MSC v.1916 64 bit (AMD64)]
CPU CORE: 4
CPU SPEED: scpufreq(current=2496.0, min=0.0, max=2496.0)
GPU: NVIDIA GeForce GTX 1650
RAM: 31.87 GB
HARD DRIVE: 237.45 GB


### 0.3. Load the main dataset

*experiment dataset*

In [48]:
# Import dataset as pandas data frame.
df_experiment = pd.read_csv('data_jobads_final.csv', index_col=None)

# Apply small modifications.
df_experiment['job_description'] = df_experiment['job_description'].str.replace('\n', ' ')
df_experiment = df_experiment.dropna().reset_index(drop=True)
df_experiment = df_experiment.iloc[:,-2:]

print('The shape of the experiment dataset is:', df_experiment.shape)
df_experiment.head(2)

The shape of the experiment dataset is: (1166, 2)


Unnamed: 0,job_description,label
0,silver stream healthcare group offer great emp...,registered_nurse
1,create a better future for yourself recruitne...,registered_nurse


*benchmark dataset*

In [49]:
# Import datasets as pandas data frame.
df_benchmark_train = pd.read_csv('ag_news_train.csv', index_col=None)
df_benchmark_test = pd.read_csv('ag_news_test.csv', index_col=None)

In [50]:
print('The shape of the train dataset for benchmark is:', df_benchmark_train.shape)
print('The shape of the test dataset for benchmark is:', df_benchmark_test.shape)
df_benchmark_train.head(2)

The shape of the train dataset for benchmark is: (120000, 2)
The shape of the test dataset for benchmark is: (7600, 2)


Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black (Reut...,business
1,Carlyle Looks Toward Commercial Aerospace (Re...,business


## 1. CLASSIFICATION WITH FINE-TUNED BERT MODEL

### 1.1 Experiment

In [51]:
# Starting the timer to track the execution duration.
start = time.time()

*load the dataset*

In [52]:
# Create a copy of the original DataFrame for experimentation with BERT.
df_bert = df_experiment.copy()

df_bert.head(2)

Unnamed: 0,job_description,label
0,silver stream healthcare group offer great emp...,registered_nurse
1,create a better future for yourself recruitne...,registered_nurse


*preprocess the dataset*

In [54]:
# Create 'id2label', 'label2id' variables for mapping the labels.
labels = df_experiment['label'].unique().tolist()
num_labels = len(labels)
id2label = {id:label for id, label in enumerate(labels)}
label2id = {label: id for id, label in enumerate(labels)}

print(id2label)
print(label2id)

{0: 'registered_nurse', 1: 'electrician', 2: 'data_analyst'}
{'registered_nurse': 0, 'electrician': 1, 'data_analyst': 2}


In [55]:
# Encode the 'label' column.
df_experiment['label_encoded'] = df_experiment.label.map(lambda x: label2id[x.strip()])

df_experiment.head(2)

Unnamed: 0,job_description,label,label_encoded
0,silver stream healthcare group offer great emp...,registered_nurse,0
1,create a better future for yourself recruitne...,registered_nurse,0


In [56]:
# Examine the distribution and value counts for the 'label_encoded' column.
print("the proportion of total label ID:".upper())
print(df_experiment['label_encoded'].value_counts(normalize=True).sort_index(), '\n')

print("the count of total label ID:".upper())
print(df_experiment['label_encoded'].value_counts())

THE PROPORTION OF TOTAL LABEL ID:
0    0.552316
1    0.125214
2    0.322470
Name: label_encoded, dtype: float64 

THE COUNT OF TOTAL LABEL ID:
0    644
2    376
1    146
Name: label_encoded, dtype: int64


*initialize the gpu* (optional)

To enhance the effectiveness of managing matrix and tensor operations, the CUDA device was created. This capability represents a key advantage of utilizing the BERT model within the Torch framework.

In [57]:
# Check whether CUDA is accessible and, if so, create a CUDA device.
cuda_available = torch.cuda.is_available()
cuda_device= torch.cuda.get_device_name(0)

if cuda_available == True:
    device = torch.device('cuda')
    print('CUDA was successfully installed and compiled on my device.')
    print('CUDA device name is:', cuda_device)
else:
    print('Cuda in not available')

CUDA was successfully installed and compiled on my device.
CUDA device name is: NVIDIA GeForce GTX 1650


*train test split*

In [58]:
# Split the DataFrame into training and testing sets and maintain label proportions.
train, validation_test = train_test_split(df_experiment, test_size=0.3, random_state=630, stratify=df_experiment['label'])
test, validation = train_test_split(validation_test, test_size=0.5, random_state=630, stratify=validation_test['label'])

print('TOTAL shape:', df_experiment.shape)
print('TRAINING shape:', train.shape)
print('VALIDATION shape:', validation.shape)
print('TEST shape:', test.shape)

TOTAL shape: (1166, 3)
TRAINING shape: (816, 3)
VALIDATION shape: (175, 3)
TEST shape: (175, 3)


*convert to Dataset format*

In [59]:
# Convert each set to Dataset format.
train_dataset = Dataset.from_pandas(train)
val_dataset = Dataset.from_pandas(validation)
test_dataset = Dataset.from_pandas(test)

# Remove '__index_level_0__' feature.
train_dataset = train_dataset.remove_columns('__index_level_0__')
val_dataset = val_dataset.remove_columns('__index_level_0__')
test_dataset = test_dataset.remove_columns('__index_level_0__')

# Create DatasetDict variable.
jobads = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test' : test_dataset
    })

*initialize the model*

In [60]:
# Load a tokenizer from the 'bert-base-uncased' pretrained model.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Initialize a BERT-based sequence classification model.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels=3,
                                                      id2label=id2label,
                                                      label2id=label2id)
# Move the model to device.
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

*tokenize the text*

In [61]:
# Define a custom tokenization function for processing long and short texts.
def custom_tokenize(batch, strategy='default', max_length=512):
    
    tokenized_outputs = {'input_ids': [], 'attention_mask': []}

    for text in batch['job_description']:
        # Tokenize using the default strategy if the text is shorter than the maximum length.
        if strategy == 'default':
            inputs = tokenizer.encode_plus(text, max_length=max_length, truncation=True, padding='max_length', return_tensors='pt')
        # Tokenize using the default strategy if the text is longer than the maximum length.
        elif strategy == 'head-tail':
            tokens = tokenizer.tokenize(text)

            head_length = int((max_length - 3) * 0.6)
            tail_length = (max_length - 3) - head_length
            # If the text is longer than the specified maximum length, split it into head and tail parts.
            if len(tokens) > max_length - 2:
                head_tokens = tokens[:head_length]
                tail_tokens = tokens[-tail_length:]
                input_ids = tokenizer.encode_plus(text=' '.join(head_tokens + tail_tokens), max_length=max_length, truncation=True, padding='max_length', return_tensors='pt')['input_ids']
                attention_mask = tokenizer.encode_plus(text=" ".join(head_tokens + tail_tokens), max_length=max_length, truncation=True, padding='max_length', return_tensors='pt')['attention_mask']
            # If the text is within the maximum length, tokenize it as is.
            else:
                encoded_plus = tokenizer.encode_plus(text, max_length=max_length, truncation=True, padding='max_length', return_tensors='pt')
                input_ids, attention_mask = encoded_plus['input_ids'], encoded_plus['attention_mask']
            inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
        else:

            pass

        # Append tokenized input and attention mask to the outputs.
        tokenized_outputs['input_ids'].append(inputs['input_ids'].squeeze().tolist())
        tokenized_outputs['attention_mask'].append(inputs['attention_mask'].squeeze().tolist())

    return tokenized_outputs

In [62]:
# Define a costum function for tokenization using the 'head-tail' strategy.
def tokenize(examples):
    return custom_tokenize(examples, strategy='head-tail')

In [63]:
# Apply the function to the dataset.
jobads_encoded = jobads.map(tokenize, batched=True, batch_size=None)

print(jobads_encoded)

Map:   0%|          | 0/816 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (944 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/175 [00:00<?, ? examples/s]

Map:   0%|          | 0/175 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['job_description', 'label', 'label_encoded', 'input_ids', 'attention_mask'],
        num_rows: 816
    })
    validation: Dataset({
        features: ['job_description', 'label', 'label_encoded', 'input_ids', 'attention_mask'],
        num_rows: 175
    })
    test: Dataset({
        features: ['job_description', 'label', 'label_encoded', 'input_ids', 'attention_mask'],
        num_rows: 175
    })
})


In [64]:
# Set the dataset format to PyTorch tensors.
jobads_encoded.set_format('torch', columns=['input_ids', 'attention_mask', 'label_encoded'])

*manage the loss function with inbalanced text*

In [65]:
# Calculate class weights.
labels = train['label_encoded'].unique()
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=labels,
                                     y=train['label_encoded'])

# Convert the computed class weights to a PyTorch tensor.
class_weights = torch.from_numpy(class_weights).float()

print(class_weights)

tensor([0.6031, 1.0342, 2.6667])


In [66]:
# Finalise the dataset.
jobads_encoded = jobads_encoded.rename_column('label_encoded', 'labels')

print(jobads_encoded)

DatasetDict({
    train: Dataset({
        features: ['job_description', 'label', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 816
    })
    validation: Dataset({
        features: ['job_description', 'label', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 175
    })
    test: Dataset({
        features: ['job_description', 'label', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 175
    })
})


In [67]:
# Define the custom trainer class.
class WeightedLossTrainer(Trainer):
    
    # Override the method for loss computation.
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        logits = outputs.get('logits')
        labels = inputs.get('labels')
        
        # Move class weights to device.
        class_weights_device = class_weights.to(model.device)
        
        # Calculate the loss using CrossEntropyLoss function with the computed class weights.
        loss_func = nn.CrossEntropyLoss(weight=class_weights_device)
        loss = loss_func(logits, labels)
        
        # Return a tuple containing loss and outputs if 'return_outputs' is True.
        return (loss, outputs) if return_outputs else loss

*manage training argumets*

In [68]:
# Define a costum function to compute accuracy, F1, precision, and recall for a given set of predictions.
def compute_metrics(pred):
  
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  
  precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
  acc = accuracy_score(labels, preds)
  
  return {'Accuracy': acc,
          'F1': f1,
          'Precision': precision,
          'Recall': recall}

In [69]:
# Define the batch size for training.
batch_size = 128

# Calculate the number of logging steps based on the dataset size and batch size.
logging_steps = len(jobads_encoded['train']) // batch_size

# Specify the directory where the trained model and logs will be saved.
output_dir = 'ft_bert_temuulen'

# Create an instance of TrainingArguments to configure the training process.
training_args = TrainingArguments(output_dir=output_dir,
                                  num_train_epochs=3,
                                  learning_rate=2e-5,
                                  per_device_eval_batch_size=batch_size,
                                  per_gpu_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy='epoch',
                                  logging_steps=logging_steps,
                                  save_strategy='epoch',
                                  fp16=True,
                                  load_best_model_at_end=True)

*fine-tune the model*

In [70]:
# Create an instance of the WeightedLossTrainer for training the model.
trainer = WeightedLossTrainer(model=model,
                              args=training_args,
                              train_dataset=jobads_encoded['train'],
                              eval_dataset=jobads_encoded['validation'],
                              tokenizer=tokenizer,
                              compute_metrics= compute_metrics)

In [71]:
# Start the training.
trainer.train()

  0%|          | 0/306 [00:00<?, ?it/s]

KeyboardInterrupt: 

*save the model*

In [None]:
# Save the fine-tuned BERT model.
trainer.model.save_pretrained('ft_bert_temuulen2')

# Save the tokenizer used for fine-tuning to the 'ft_bert_temuulen3_tokenizer'.
tokenizer.save_pretrained('ft_bert_temuulen_tokenizer2')

*evaluate the model*

In [28]:
# Specify the directory paths for the fine-tuned model and tokenizer.
model_path = 'ft_bert_temuulen2'
tokenizer_path = 'ft_bert_temuulen_tokenizer2'

# Load the BERT and the tokenizer.
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

In [29]:
# Load prepared an unseen dataset.
test_dataset = jobads_encoded['test'] 

# Create a Trainer instance.
classifier = Trainer(model=model, tokenizer=tokenizer)
predictions = classifier.predict(test_dataset)

# Convert predictions to numpy for further analysis.
labels = predictions.label_ids
preds = predictions.predictions.argmax(-1)

  0%|          | 0/22 [00:00<?, ?it/s]

In [30]:
# Plot Confusion Matrix.
cm_labels = ['registered nurse', 'electrician', 'data analyst']
cm_matrix = confusion_matrix(labels, preds)
cm_title = "CONFUSION MATRIX: fine-tuned 'bert-base-uncased' model for classification"

fig = ff.create_annotated_heatmap(z=cm_matrix, 
                                  x=cm_labels,
                                  y=cm_labels, 
                                  colorscale='balance', 
                                  showscale=True,
                                  annotation_text=cm_matrix)

fig.update_layout(width=700, 
                  height=700, 
                  title=cm_title, 
                  title_x=0.5,
                  xaxis=dict(title='Predicted Value', side='bottom'), 
                  yaxis_title='True Value')

fig.update_yaxes(tickangle=-90)  
    
fig.show()

# Print detailed classification report.
report = classification_report(labels, preds, output_dict=True)
report_title = "CLASSIFICATION REPORT: fine-tuned 'bert-base-uncased' model for classification"

print(report_title, '\n')
print(classification_report(labels, preds))
print(label2id)

CLASSIFICATION REPORT: fine-tuned 'bert-base-uncased' model for classification 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        96
           1       1.00      1.00      1.00        22
           2       1.00      1.00      1.00        57

    accuracy                           1.00       175
   macro avg       1.00      1.00      1.00       175
weighted avg       1.00      1.00      1.00       175

{'registered_nurse': 0, 'electrician': 1, 'data_analyst': 2}


### 1.2 Test

In [72]:
# Initialize a text classification pipeline.
classifier = pipeline('text-classification', model='ft_bert_temuulen2', tokenizer='ft_bert_temuulen_tokenizer2')

In [73]:
# Test model with random text samples.
random_text = ["1. I promote health and help people who is sick", 
               "2. If your outlet isn't working and you don't have a light, you can ask me to fix it.", 
               "3. All day, I'm sitting in front of the screen, solving problems with my mouse and keyboard", 
               "4. I'm familiar with the process of providing injections to individuals.",
               "5. If the streetlights fail, I can replace them with new ones.",
               "6. I'm familiar with both Python and R, and I use these tools for my work."]

for x in range(len(random_text)):
    print(classifier(random_text[x]), '=', random_text[x])

[{'label': 'registered_nurse', 'score': 0.9776366353034973}] = 1. I promote health and help people who is sick
[{'label': 'electrician', 'score': 0.9333652257919312}] = 2. If your outlet isn't working and you don't have a light, you can ask me to fix it.
[{'label': 'data_analyst', 'score': 0.8441108465194702}] = 3. All day, I'm sitting in front of the screen, solving problems with my mouse and keyboard
[{'label': 'registered_nurse', 'score': 0.9447991251945496}] = 4. I'm familiar with the process of providing injections to individuals.
[{'label': 'electrician', 'score': 0.9593295454978943}] = 5. If the streetlights fail, I can replace them with new ones.
[{'label': 'data_analyst', 'score': 0.9338756203651428}] = 6. I'm familiar with both Python and R, and I use these tools for my work.


The random sentences employed to evaluate the fine-tuned Bert-base classification model were intentionally vague and concise, aiming to assess the model's capability in challenging scenarios. Nevertheless, the outcomes and associated scores demonstrate that the model excelled, accurately forecasting every sentence with significant confidence levels (average probability score: **0.9322**).

In [74]:
# Test model with job seekers' data.
# Load the experiment participants dataset.
df_jobseeker = pd.read_csv('data_jobseeker.csv', index_col=None)

# Apply minor modifications for further use.
df_jobseeker['combined_info'] = df_jobseeker.education + '. ' + df_jobseeker.skill + '. ' + df_jobseeker.experience + '.'
df_jobseeker.drop(['education', 'skill', 'experience'], axis=1, inplace=True)

for x in range(3):
    print(classifier(df_jobseeker.iat[x, -1]), '=', df_jobseeker.iat[x, -2])

[{'label': 'registered_nurse', 'score': 0.9979069232940674}] = registered nurse
[{'label': 'electrician', 'score': 0.9958240985870361}] = electrician
[{'label': 'data_analyst', 'score': 0.996567964553833}] = data analyst


All data from each participant in the experiment was classified and assigned into the appropriate labelled classes with outstanding results.(average probability score: **0.9968**)

### 1.3 Benchmarking

In [153]:
df_benchmark_train_bert = df_benchmark_train.copy()
df_benchmark_test_bert = df_benchmark_test.copy()

In [154]:
labels = df_benchmark_train_bert['label'].unique().tolist()
num_labels = len(labels)
id2label = {id:label for id, label in enumerate(labels)}
label2id = {label: id for id, label in enumerate(labels)}

print(id2label)
print(label2id)

{0: 'business', 1: 'sci/tech', 2: 'sports', 3: 'world'}
{'business': 0, 'sci/tech': 1, 'sports': 2, 'world': 3}


In [155]:
# Encode the 'label' column.
df_benchmark_train_bert['label_encoded'] = df_benchmark_train_bert.label.map(lambda x: label2id[x.strip()])
df_benchmark_test_bert['label_encoded'] = df_benchmark_test_bert.label.map(lambda x: label2id[x.strip()])

In [156]:
# Examine the distribution and value counts for the 'label_encoded' column.
print('the proportion of total label ID in a train dataset:'.upper())
print(df_benchmark_train_bert['label_encoded'].value_counts(normalize=True).sort_index(), '\n')

print('the count of total label ID in a train dataset:'.upper())
print(df_benchmark_train_bert['label_encoded'].value_counts())

THE PROPORTION OF TOTAL LABEL ID IN A TRAIN DATASET:
0    0.25
1    0.25
2    0.25
3    0.25
Name: label_encoded, dtype: float64 

THE COUNT OF TOTAL LABEL ID IN A TRAIN DATASET:
0    30000
1    30000
2    30000
3    30000
Name: label_encoded, dtype: int64


In [157]:
# Examine the distribution and value counts for the 'label_encoded' column.
print('the proportion of total label ID in a test dataset:'.upper())
print(df_benchmark_test_bert['label_encoded'].value_counts(normalize=True).sort_index(), '\n')

print('the count of total label ID in a test dataset:'.upper())
print(df_benchmark_test_bert['label_encoded'].value_counts())

THE PROPORTION OF TOTAL LABEL ID IN A TEST DATASET:
0    0.25
1    0.25
2    0.25
3    0.25
Name: label_encoded, dtype: float64 

THE COUNT OF TOTAL LABEL ID IN A TEST DATASET:
0    1900
1    1900
2    1900
3    1900
Name: label_encoded, dtype: int64


In [164]:
df_bert_drop, df_bert_use_train = train_test_split(df_benchmark_train_bert, 
                                                   test_size=0.01, random_state=630, 
                                                   stratify=df_benchmark_train_bert['label'])

df_bert_drop, df_bert_use_test = train_test_split(df_benchmark_test_bert, 
                                                   test_size=0.06, random_state=630, 
                                                   stratify=df_benchmark_test_bert['label'])

In [165]:
# Split the DataFrame into training and testing sets and maintain label proportions.
train, validation = train_test_split(df_bert_use_train, test_size=0.3, random_state=630)
test = df_bert_use_test
print('TRAINING shape:', train.shape)
print('VALIDATION shape:', validation.shape)
print('TEST shape:', test.shape)

TRAINING shape: (840, 3)
VALIDATION shape: (360, 3)
TEST shape: (456, 3)


In [166]:
# Convert each set to Dataset format.
train_dataset = Dataset.from_pandas(train)
val_dataset = Dataset.from_pandas(validation)
test_dataset = Dataset.from_pandas(test)

# Remove '__index_level_0__' feature.
train_dataset = train_dataset.remove_columns('__index_level_0__')
val_dataset = val_dataset.remove_columns('__index_level_0__')

# Create DatasetDict variable.
benchmark_text = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test' : test_dataset
    })

In [167]:
benchmark_text

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_encoded'],
        num_rows: 840
    })
    validation: Dataset({
        features: ['text', 'label', 'label_encoded'],
        num_rows: 360
    })
    test: Dataset({
        features: ['text', 'label', 'label_encoded', '__index_level_0__'],
        num_rows: 456
    })
})

In [168]:
# Load a tokenizer from the 'bert-base-uncased' pretrained model.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Initialize a BERT-based sequence classification model.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels=4,
                                                      id2label=id2label,
                                                      label2id=label2id)
# Move the model to device.
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [169]:
# Define a custom tokenization function for long and short text.
def custom_tokenize(batch, strategy='default', max_length=512):
    
    tokenized_outputs = {'input_ids': [], 'attention_mask': []}

    for text in batch['text']:
        # Tokenize using the default strategy if the text is shorter than the maximum length.
        if strategy == 'default':
            inputs = tokenizer.encode_plus(text, max_length=max_length, truncation=True, padding='max_length', return_tensors='pt')
        # Tokenize using the default strategy if the text is longer than the maximum length.
        elif strategy == 'head-tail':
            tokens = tokenizer.tokenize(text)

            head_length = int((max_length - 3) * 0.6)
            tail_length = (max_length - 3) - head_length
            # If the text is longer than the specified maximum length, split it into head and tail parts.
            if len(tokens) > max_length - 2:
                head_tokens = tokens[:head_length]
                tail_tokens = tokens[-tail_length:]
                input_ids = tokenizer.encode_plus(text=' '.join(head_tokens + tail_tokens), max_length=max_length, truncation=True, padding='max_length', return_tensors='pt')['input_ids']
                attention_mask = tokenizer.encode_plus(text=" ".join(head_tokens + tail_tokens), max_length=max_length, truncation=True, padding='max_length', return_tensors='pt')['attention_mask']
            # If the text is within the maximum length, tokenize it as is.
            else:
                encoded_plus = tokenizer.encode_plus(text, max_length=max_length, truncation=True, padding='max_length', return_tensors='pt')
                input_ids, attention_mask = encoded_plus['input_ids'], encoded_plus['attention_mask']
            inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
        else:

            pass

        # Append tokenized input and attention mask to the outputs.
        tokenized_outputs['input_ids'].append(inputs['input_ids'].squeeze().tolist())
        tokenized_outputs['attention_mask'].append(inputs['attention_mask'].squeeze().tolist())

    return tokenized_outputs

In [170]:
# Define a costum function for tokenization using the 'head-tail' strategy.
def tokenize(examples):
    return custom_tokenize(examples, strategy='head-tail')

In [171]:
# Apply the function to the dataset.
benchmark_text_encoded = benchmark_text.map(tokenize, batched=True, batch_size=None)

print(benchmark_text_encoded)

Map:   0%|          | 0/840 [00:00<?, ? examples/s]

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/456 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_encoded', 'input_ids', 'attention_mask'],
        num_rows: 840
    })
    validation: Dataset({
        features: ['text', 'label', 'label_encoded', 'input_ids', 'attention_mask'],
        num_rows: 360
    })
    test: Dataset({
        features: ['text', 'label', 'label_encoded', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 456
    })
})


In [172]:
# Set the dataset format to PyTorch tensors.
benchmark_text_encoded.set_format('torch', columns=['input_ids', 'attention_mask', 'label_encoded'])

In [173]:
# Calculate class weights.
labels = train['label_encoded'].unique()
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=labels,
                                     y=train['label_encoded'])

# Convert the computed class weights to a PyTorch tensor.
class_weights = torch.from_numpy(class_weights).float()

print(class_weights)

tensor([0.9589, 1.0000, 1.0345, 1.0096])


In [174]:
# Finalise the dataset.
benchmark_text_encoded = benchmark_text_encoded.rename_column('label_encoded', 'labels')

print(benchmark_text_encoded)

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 840
    })
    validation: Dataset({
        features: ['text', 'label', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 360
    })
    test: Dataset({
        features: ['text', 'label', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 456
    })
})


In [175]:
# Define the custom trainer class.
class WeightedLossTrainer(Trainer):
    
    # Override the method for loss computation.
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        logits = outputs.get('logits')
        labels = inputs.get('labels')
        
        # Move class weights to device.
        class_weights_device = class_weights.to(model.device)
        
        # Calculate the loss using CrossEntropyLoss function with the computed class weights.
        loss_func = nn.CrossEntropyLoss(weight=class_weights_device)
        loss = loss_func(logits, labels)
        
        # Return a tuple containing loss and outputs if 'return_outputs' is True.
        return (loss, outputs) if return_outputs else loss

In [176]:
# Define a costum function to compute accuracy, F1, precision, and recall for a given set of predictions.
def compute_metrics(pred):
  
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  
  precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
  acc = accuracy_score(labels, preds)
  
  return {'Accuracy': acc,
          'F1': f1,
          'Precision': precision,
          'Recall': recall}

In [177]:
# Define the batch size for training.
batch_size = 128

# Calculate the number of logging steps based on the dataset size and batch size.
logging_steps = len(benchmark_text_encoded['train']) // batch_size

# Specify the directory where the trained model and logs will be saved.
output_dir = 'ft_bert_temuulen_benchmark'

# Create an instance of TrainingArguments to configure the training process.
training_args = TrainingArguments(output_dir=output_dir,
                                  num_train_epochs=3,
                                  learning_rate=2e-5,
                                  per_device_eval_batch_size=batch_size,
                                  per_gpu_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy='epoch',
                                  logging_steps=logging_steps,
                                  save_strategy='epoch',
                                  fp16=True,
                                  load_best_model_at_end=True)

In [178]:
# Create an instance of the WeightedLossTrainer for training the model.
trainer = WeightedLossTrainer(model=model,
                              args=training_args,
                              train_dataset=benchmark_text_encoded['train'],
                              eval_dataset=benchmark_text_encoded['validation'],
                              tokenizer=tokenizer,
                              compute_metrics= compute_metrics)

In [179]:
# Start the training.
trainer.train()

  0%|          | 0/315 [00:00<?, ?it/s]

{'loss': 1.3578, 'learning_rate': 1.9682539682539684e-05, 'epoch': 0.06}
{'loss': 1.3775, 'learning_rate': 1.9301587301587303e-05, 'epoch': 0.11}
{'loss': 1.2501, 'learning_rate': 1.8984126984126986e-05, 'epoch': 0.17}
{'loss': 1.1891, 'learning_rate': 1.8603174603174605e-05, 'epoch': 0.23}
{'loss': 1.0946, 'learning_rate': 1.8222222222222224e-05, 'epoch': 0.29}
{'loss': 1.0213, 'learning_rate': 1.7841269841269843e-05, 'epoch': 0.34}
{'loss': 0.9149, 'learning_rate': 1.7460317460317463e-05, 'epoch': 0.4}
{'loss': 0.864, 'learning_rate': 1.707936507936508e-05, 'epoch': 0.46}
{'loss': 0.7577, 'learning_rate': 1.66984126984127e-05, 'epoch': 0.51}
{'loss': 0.7214, 'learning_rate': 1.631746031746032e-05, 'epoch': 0.57}
{'loss': 0.5501, 'learning_rate': 1.5936507936507936e-05, 'epoch': 0.63}
{'loss': 0.4762, 'learning_rate': 1.555555555555556e-05, 'epoch': 0.69}
{'loss': 0.583, 'learning_rate': 1.5174603174603176e-05, 'epoch': 0.74}
{'loss': 0.6202, 'learning_rate': 1.4793650793650795e-05, '

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


  0%|          | 0/3 [00:00<?, ?it/s]

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


{'eval_loss': 0.40253737568855286, 'eval_Accuracy': 0.8861111111111111, 'eval_F1': 0.8859683527665684, 'eval_Precision': 0.8930375197034629, 'eval_Recall': 0.8836991107348197, 'eval_runtime': 142.0429, 'eval_samples_per_second': 2.534, 'eval_steps_per_second': 0.021, 'epoch': 1.0}
{'loss': 0.3782, 'learning_rate': 1.326984126984127e-05, 'epoch': 1.03}
{'loss': 0.3353, 'learning_rate': 1.288888888888889e-05, 'epoch': 1.09}
{'loss': 0.2478, 'learning_rate': 1.2507936507936508e-05, 'epoch': 1.14}
{'loss': 0.2005, 'learning_rate': 1.2126984126984127e-05, 'epoch': 1.2}
{'loss': 0.2064, 'learning_rate': 1.1746031746031748e-05, 'epoch': 1.26}
{'loss': 0.4247, 'learning_rate': 1.1365079365079366e-05, 'epoch': 1.31}
{'loss': 0.3572, 'learning_rate': 1.0984126984126986e-05, 'epoch': 1.37}
{'loss': 0.468, 'learning_rate': 1.0603174603174604e-05, 'epoch': 1.43}
{'loss': 0.1808, 'learning_rate': 1.0222222222222223e-05, 'epoch': 1.49}
{'loss': 0.1702, 'learning_rate': 9.841269841269842e-06, 'epoch':

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


{'loss': 0.1491, 'learning_rate': 6.7936507936507944e-06, 'epoch': 2.0}


  0%|          | 0/3 [00:00<?, ?it/s]

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


{'eval_loss': 0.34270069003105164, 'eval_Accuracy': 0.8972222222222223, 'eval_F1': 0.8970306001188354, 'eval_Precision': 0.9007849315846249, 'eval_Recall': 0.8953352313915589, 'eval_runtime': 147.0154, 'eval_samples_per_second': 2.449, 'eval_steps_per_second': 0.02, 'epoch': 2.0}
{'loss': 0.0923, 'learning_rate': 6.412698412698414e-06, 'epoch': 2.06}
{'loss': 0.0995, 'learning_rate': 6.031746031746032e-06, 'epoch': 2.11}
{'loss': 0.1827, 'learning_rate': 5.650793650793651e-06, 'epoch': 2.17}
{'loss': 0.2233, 'learning_rate': 5.26984126984127e-06, 'epoch': 2.23}
{'loss': 0.0921, 'learning_rate': 4.888888888888889e-06, 'epoch': 2.29}
{'loss': 0.0501, 'learning_rate': 4.5079365079365085e-06, 'epoch': 2.34}
{'loss': 0.2301, 'learning_rate': 4.126984126984127e-06, 'epoch': 2.4}
{'loss': 0.0364, 'learning_rate': 3.7460317460317463e-06, 'epoch': 2.46}
{'loss': 0.2068, 'learning_rate': 3.3650793650793655e-06, 'epoch': 2.51}
{'loss': 0.1457, 'learning_rate': 2.984126984126984e-06, 'epoch': 2.57

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.
Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


  0%|          | 0/3 [00:00<?, ?it/s]

Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future version. Using `--per_device_eval_batch_size` is preferred.


{'eval_loss': 0.358962744474411, 'eval_Accuracy': 0.8916666666666667, 'eval_F1': 0.8918794877027946, 'eval_Precision': 0.8954196217494089, 'eval_Recall': 0.8902409786897311, 'eval_runtime': 158.9289, 'eval_samples_per_second': 2.265, 'eval_steps_per_second': 0.019, 'epoch': 3.0}
{'train_runtime': 4039.5725, 'train_samples_per_second': 0.624, 'train_steps_per_second': 0.078, 'train_loss': 0.4089018243172812, 'epoch': 3.0}


TrainOutput(global_step=315, training_loss=0.4089018243172812, metrics={'train_runtime': 4039.5725, 'train_samples_per_second': 0.624, 'train_steps_per_second': 0.078, 'train_loss': 0.4089018243172812, 'epoch': 3.0})

In [180]:
# Save the fine-tuned BERT model.
trainer.model.save_pretrained('ft_bert_temuulen2_benchmark')

# Save the tokenizer used for fine-tuning to the 'ft_bert_temuulen3_tokenizer'.
tokenizer.save_pretrained('ft_bert_temuulen_tokenizer2_benchmark')

('ft_bert_temuulen_tokenizer2_benchmark\\tokenizer_config.json',
 'ft_bert_temuulen_tokenizer2_benchmark\\special_tokens_map.json',
 'ft_bert_temuulen_tokenizer2_benchmark\\vocab.txt',
 'ft_bert_temuulen_tokenizer2_benchmark\\added_tokens.json',
 'ft_bert_temuulen_tokenizer2_benchmark\\tokenizer.json')

In [181]:
# Specify the directory paths for the fine-tuned model and tokenizer.
model_path = 'ft_bert_temuulen2_benchmark'
tokenizer_path = 'ft_bert_temuulen_tokenizer2_benchmark'

# Load the BERT and the tokenizer.
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

In [182]:
# Load prepared an unseen dataset.
test_dataset = benchmark_text_encoded['test'] 

# Create a Trainer instance.
classifier = Trainer(model=model, tokenizer=tokenizer)
predictions = classifier.predict(test_dataset)

# Convert predictions to numpy for further analysis.
labels = predictions.label_ids
preds = predictions.predictions.argmax(-1)

  0%|          | 0/57 [00:00<?, ?it/s]

In [184]:
# Plot Confusion Matrix.
cm_labels = ['0', '1', '2', '3']
cm_matrix = confusion_matrix(labels, preds)
cm_title = "CONFUSION MATRIX: fine-tuned 'bert-base-uncased' model for classification"

fig = ff.create_annotated_heatmap(z=cm_matrix, 
                                  x=cm_labels,
                                  y=cm_labels, 
                                  colorscale='balance', 
                                  showscale=True,
                                  annotation_text=cm_matrix)

fig.update_layout(width=700, 
                  height=700, 
                  title=cm_title, 
                  title_x=0.5,
                  xaxis=dict(title='Predicted Value', side='bottom'), 
                  yaxis_title='True Value')

fig.update_yaxes(tickangle=-90)  
    
fig.show()

# Print detailed classification report.
report = classification_report(labels, preds, output_dict=True)
report_title = "CLASSIFICATION REPORT: fine-tuned 'bert-base-uncased' model for classification"

print(report_title, '\n')
print(classification_report(labels, preds))
print(label2id)

CLASSIFICATION REPORT: fine-tuned 'bert-base-uncased' model for classification 

              precision    recall  f1-score   support

           0       0.84      0.81      0.82       114
           1       0.83      0.84      0.83       114
           2       0.99      1.00      1.00       114
           3       0.91      0.92      0.92       114

    accuracy                           0.89       456
   macro avg       0.89      0.89      0.89       456
weighted avg       0.89      0.89      0.89       456

{'business': 0, 'sci/tech': 1, 'sports': 2, 'world': 3}
