# BERT text classification: data validation of collected job description details.

In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, Dataset
from transformers import TFAutoModel, AutoTokenizer
import tensorflow as tf
import tensorflow_addons as tfa
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

 The versions of TensorFlow you are currently using is 2.15.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [2]:
# Import finalized dataset as pandas data frame.
df = pd.read_csv('data_jobads_final.csv', index_col=None)

# Replace newline characters in the 'job_description' column with a space.
df['job_description'] = df['job_description'].str.replace('\n', ' ')

# Remove None values.
df = df.dropna()

# Select and use only the last two columns for this evaluation.
df = df.iloc[:,-2:]

df.head(3)

Unnamed: 0,job_description,label
0,silver stream healthcare group offer great emp...,registered_nurse
1,create a better future for yourself !! recruit...,registered_nurse
2,"access healthcare, one of ireland’s leading he...",registered_nurse


In [37]:
df

Unnamed: 0,job_description,label,label_text
0,silver stream healthcare group offer great emp...,0,registered_nurse
1,create a better future for yourself !! recruit...,0,registered_nurse
2,"access healthcare, one of ireland’s leading he...",0,registered_nurse
3,are you a dedicated and compassionate staff nu...,0,registered_nurse
4,clinical research nurse - cardiology (cnm2) w...,0,registered_nurse
...,...,...,...
1234,the successful candidate will have exposure to...,2,data_analyst
1235,sector: fintech you will be a data-driven indi...,2,data_analyst
1236,our client are recognised as a market leader a...,2,data_analyst
1237,the role our operations analysts are responsib...,2,data_analyst


In [3]:
# Updated 'label' column values according to the specified mapping.
mapping = {'registered_nurse': 0, 'electrician': 1, 'data_analyst': 2}
df['label_text'] = df['label']
df['label'] = df['label'].replace(mapping) # or df['label'] = pd.factorize(df.label)[0]

df['label'].value_counts()

0    644
2    376
1    146
Name: label, dtype: int64

In [6]:
# Split the DataFrame into training and testing sets while maintaining label proportions.
train, validation = train_test_split(df, test_size=0.3, random_state=820, stratify=df['label'])

print('The shape of the TRAINING dataset is:', train.shape)
print('The shape of the VALIDATION dataset is:', validation.shape)

The shape of the TRAINING dataset is: (816, 3)
The shape of the VALIDATION dataset is: (350, 3)


In [7]:
# Convert DataFrames to Hugging Face Dataset.
train_dataset = Dataset.from_pandas(train)
val_dataset = Dataset.from_pandas(validation)

# Remove '__index_level_0__' feature
train_dataset = train_dataset.remove_columns('__index_level_0__')
val_dataset = val_dataset.remove_columns('__index_level_0__')

# Create DatasetDict.
jobads = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    })

In [8]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = TFAutoModel.from_pretrained('bert-base-uncased')




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [9]:
def tokenize(batch):
    return tokenizer(batch['job_description'], padding=True, truncation=True)

jobads_encoded = jobads.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/816 [00:00<?, ? examples/s]

Map:   0%|          | 0/350 [00:00<?, ? examples/s]

In [10]:
jobads_encoded

DatasetDict({
    train: Dataset({
        features: ['job_description', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 816
    })
    validation: Dataset({
        features: ['job_description', 'label', 'label_text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 350
    })
})

In [12]:
jobads_encoded.set_format('tf',
                          columns=['input_ids', 'attention_mask', 'token_type_ids', 'label'])

BATCH_SIZE = 32

def order(inp):
    '''
    This function will group all the inputs of BERT into single
    dictionary and then output it with labels.
    '''
    data = list(inp.values())
    return {
        'input_ids' : data[1],
        'attention_mask' : data[2],
        'token_type_ids' : data[3]
    }, data[0]
    
# Convert train split of 'jobads_encoded' to tensorflow format.
train_dataset = tf.data.Dataset.from_tensor_slices(jobads_encoded['train'][:])

# Set batch_size and shuffle.
train_dataset = train_dataset.batch(BATCH_SIZE).shuffle(100)

# Map the 'order' function.
train_dataset = train_dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)

# ... doing the same for test set ...
val_dataset = tf.data.Dataset.from_tensor_slices(jobads_encoded['validation'][:])
val_dataset = val_dataset.batch(BATCH_SIZE)
val_dataset = val_dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)

In [13]:
inp, out = next(iter(train_dataset)) # a batch from train_dataset
print(inp, '\n\n', out)

{'input_ids': <tf.Tensor: shape=(16, 512), dtype=int64, numpy=
array([[  101,  2054,  2017, ...,     0,     0,     0],
       [  101, 15563,  2024, ...,     0,     0,     0],
       [  101,  2194,  6337, ...,  2030,  5907,   102],
       ...,
       [  101,  2057,  2024, ...,     0,     0,     0],
       [  101,  3751,  2937, ...,     0,     0,     0],
       [  101,  2194, 19184, ...,     0,     0,     0]], dtype=int64)>, 'attention_mask': <tf.Tensor: shape=(16, 512), dtype=int64, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)>, 'token_type_ids': <tf.Tensor: shape=(16, 512), dtype=int64, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int64)>}

In [14]:
class BERTForClassification(tf.keras.Model):
    
    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert = bert_model
        self.fc = tf.keras.layers.Dense(num_classes, activation='softmax')
        
    def call(self, inputs):
        x = self.bert(inputs)[1]
        return self.fc(x)

In [15]:
classifier = BERTForClassification(model, num_classes=3)

classifier.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy']
)

In [16]:
history = classifier.fit(
    train_dataset,
    epochs=3
)

Epoch 1/3


Epoch 2/3
Epoch 3/3


In [18]:
evaluation_results = classifier.evaluate(val_dataset)



In [19]:
print('All Metrics:')
for metric_name, metric_value in zip(classifier.metrics_names, evaluation_results):
    print(f"{metric_name}: {metric_value:.4f}")


All Metrics:
loss: 0.0372
accuracy: 0.9914


In [23]:
classifier.save('fine_tuned_bert_multi_label_classifier_temuulen')









INFO:tensorflow:Assets written to: fine_tuned_bert_multi_label_classifier_temuulen\assets


INFO:tensorflow:Assets written to: fine_tuned_bert_multi_label_classifier_temuulen\assets










In [35]:
precision_metric = tf.keras.metrics.Precision(name='precision')
recall_metric = tf.keras.metrics.Recall(name='recall')
f1_metric = tfa.metrics.F1Score(num_classes=3, average='weighted', name='f1_score')

In [36]:
classifier.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy', precision_metric, recall_metric, f1_metric]
)

In [None]:
classifier.fit(train_dataset, epochs=3, validation_data=(val_data, val_labels))

# After training, you can evaluate on the test set and get the confusion matrix
test_loss, test_accuracy, test_precision, test_recall, test_f1 = classifier.evaluate(test_data, test_labels)
print(f'Test Accuracy: {test_accuracy}')
print(f'Test Precision: {test_precision}')
print(f'Test Recall: {test_recall}')
print(f'Test F1 Score: {test_f1}')

# Get confusion matrix
predictions = classifier.predict(test_data)
conf_matrix = confusion_matrix(test_labels, np.argmax(predictions, axis=-1))
print('Confusion Matrix:')
print(conf_matrix)

In [29]:
# Evaluate the model on the validation dataset
y_true = []  # List to store true labels
y_pred = []  # List to store predicted labels

for features, labels in val_dataset:
    predictions = classifier.predict(features)  # Assuming predict method returns class probabilities
    predicted_labels = np.round(predictions).astype(int)  # Round and convert to integer
    
    y_true.extend(labels.numpy())
    y_pred.extend(predicted_labels)

# Compute and print classification report
class_report = classification_report(y_true, y_pred, target_names=['Class 0', 'Class 1', 'Class 2'])
print('Classification Report:')
print(class_report)

# Compute and print accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f'Accuracy: {accuracy:.4f}')

# Compute and print precision, recall, and F1 score for each class
precision = precision_score(y_true, y_pred, average=None)
recall = recall_score(y_true, y_pred, average=None)
f1 = f1_score(y_true, y_pred, average=None)

for i in range(len(precision)):
    print(f'Class {i}: Precision={precision[i]:.4f}, Recall={recall[i]:.4f}, F1 Score={f1[i]:.4f}')



ValueError: Classification metrics can't handle a mix of multiclass and multilabel-indicator targets

In [30]:
!pip install tensorflow-addons

Collecting tensorflow-addons
  Downloading tensorflow_addons-0.22.0-cp311-cp311-win_amd64.whl.metadata (1.8 kB)
Collecting typeguard<3.0.0,>=2.7 (from tensorflow-addons)
  Downloading typeguard-2.13.3-py3-none-any.whl (17 kB)
Downloading tensorflow_addons-0.22.0-cp311-cp311-win_amd64.whl (719 kB)
   ---------------------------------------- 0.0/719.8 kB ? eta -:--:--
   - -------------------------------------- 30.7/719.8 kB 1.3 MB/s eta 0:00:01
   ------ --------------------------------- 112.6/719.8 kB 1.3 MB/s eta 0:00:01
   ------ ------------------------------- 122.9/719.8 kB 901.1 kB/s eta 0:00:01
   ------------- -------------------------- 245.8/719.8 kB 1.4 MB/s eta 0:00:01
   ----------------------- ---------------- 419.8/719.8 kB 1.9 MB/s eta 0:00:01
   ----------------------------- ---------- 522.2/719.8 kB 1.9 MB/s eta 0:00:01
   ---------------------------------------- 719.8/719.8 kB 2.3 MB/s eta 0:00:00
Installing collected packages: typeguard, tensorflow-addons
Successfully