# BERT text classification: data validation of collected job description details.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, Dataset
from transformers import TFAutoModel, AutoTokenizer
import tensorflow as tf
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder
import numpy as np
import torch



In [2]:
# Import finalized dataset as pandas data frame.
df = pd.read_csv('data_jobads_final.csv', index_col=None)

# Replace newline characters in the 'job_description' column with a space.
df['job_description'] = df['job_description'].str.replace('\n', ' ')

# Remove None values.
df = df.dropna()

# Select and use only the last two columns for this evaluation.
df = df.iloc[:,-2:]

df.head(3)

Unnamed: 0,job_description,label
0,silver stream healthcare group offer great emp...,registered_nurse
1,create a better future for yourself !! recruit...,registered_nurse
2,"access healthcare, one of ireland’s leading he...",registered_nurse


In [3]:
# Encode the labels
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

print(df['label_encoded'].value_counts(), '\n')
print(df['label'].value_counts())

2    644
0    376
1    146
Name: label_encoded, dtype: int64 

registered_nurse    644
data_analyst        376
electrician         146
Name: label, dtype: int64


In [4]:
# Split the DataFrame into training and testing sets while maintaining label proportions.
train, validation_test = train_test_split(df, test_size=0.3, random_state=820, stratify=df['label'])
test, validation = train_test_split(validation_test, test_size=0.5, random_state=820, stratify=validation_test['label'])

print('The shape of the TRAINING dataset is:', train.shape)
print('The shape of the VALIDATION dataset is:', validation.shape)
print('The shape of the TEST dataset is:', test.shape)

The shape of the TRAINING dataset is: (816, 3)
The shape of the VALIDATION dataset is: (175, 3)
The shape of the TEST dataset is: (175, 3)


In [5]:
# Convert DataFrames to Hugging Face Dataset.
train_dataset = Dataset.from_pandas(train)
val_dataset = Dataset.from_pandas(validation)
test_dataset = Dataset.from_pandas(test)

# Remove '__index_level_0__' feature
train_dataset = train_dataset.remove_columns('__index_level_0__')
val_dataset = val_dataset.remove_columns('__index_level_0__')
test_dataset = test_dataset.remove_columns('__index_level_0__')

# Create DatasetDict.
jobads = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test' : test_dataset
    })

In [6]:
example = jobads['train'][0]
example

{'job_description': "ttm healthcare have partnered with one of ireland's leading addiction and homeless services to recruit a registered nurse for their dublin based facility. this is a full time permanent contract working 39 hours per week monday to sunday from 8am to 8pm. weekends will be on rotation.  requirements: registered nurse with the nmbi. a minimum of 2 years of post qualification experience in a paid full-time capacity or part time equivalent. a relevant qualification in the area of addiction studies / mental health would be an advantage. capacity to develop positive relationships with clients and staff. must be able to demonstrate proficient it skills including microsoft word, excel, powerpoint and a good knowledge of crm system for reporting purposes. good communication skills. fulfilling and challenging environment competitive salaries sick pay scheme flexible working arrangements minimum 23 days annual leave defined contribution pension scheme (after probation) death in

In [7]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = TFAutoModel.from_pretrained('bert-base-uncased')

RuntimeError: Failed to import transformers.models.bert.modeling_tf_bert because of the following error (look up to see its traceback):
cannot import name 'cast' from partially initialized module 'keras.src.backend' (most likely due to a circular import) (c:\Users\temulenbd\anaconda3\Lib\site-packages\keras\src\backend\__init__.py)

In [9]:
def tokenize(batch):
    return tokenizer(batch['job_description'], padding=True, truncation=True, add_special_tokens=True)

jobads_encoded = jobads.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/816 [00:00<?, ? examples/s]

Map:   0%|          | 0/175 [00:00<?, ? examples/s]

Map:   0%|          | 0/175 [00:00<?, ? examples/s]

In [10]:
jobads_encoded

DatasetDict({
    train: Dataset({
        features: ['job_description', 'label', 'label_encoded', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 816
    })
    validation: Dataset({
        features: ['job_description', 'label', 'label_encoded', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 175
    })
    test: Dataset({
        features: ['job_description', 'label', 'label_encoded', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 175
    })
})

In [11]:
example = jobads_encoded['train'][0]

print(example['input_ids'])
print('\n')
print(tokenizer.decode(example['input_ids']))

[101, 23746, 2213, 9871, 2031, 12404, 2007, 2028, 1997, 3163, 1005, 1055, 2877, 13449, 1998, 11573, 2578, 2000, 13024, 1037, 5068, 6821, 2005, 2037, 5772, 2241, 4322, 1012, 2023, 2003, 1037, 2440, 2051, 4568, 3206, 2551, 4464, 2847, 2566, 2733, 6928, 2000, 4465, 2013, 1022, 3286, 2000, 1022, 9737, 1012, 13499, 2097, 2022, 2006, 9963, 1012, 5918, 1024, 5068, 6821, 2007, 1996, 13221, 5638, 1012, 1037, 6263, 1997, 1016, 2086, 1997, 2695, 8263, 3325, 1999, 1037, 3825, 2440, 1011, 2051, 3977, 2030, 2112, 2051, 5662, 1012, 1037, 7882, 8263, 1999, 1996, 2181, 1997, 13449, 2913, 1013, 5177, 2740, 2052, 2022, 2019, 5056, 1012, 3977, 2000, 4503, 3893, 6550, 2007, 7846, 1998, 3095, 1012, 2442, 2022, 2583, 2000, 10580, 27029, 2009, 4813, 2164, 7513, 2773, 1010, 24970, 1010, 2373, 8400, 1998, 1037, 2204, 3716, 1997, 13675, 2213, 2291, 2005, 7316, 5682, 1012, 2204, 4807, 4813, 1012, 21570, 1998, 10368, 4044, 6975, 20566, 5305, 3477, 5679, 12379, 2551, 7565, 6263, 2603, 2420, 3296, 2681, 4225, 6691, 

In [12]:
# TRANSFORMING DATASET FROM HUGGING FACE FORMAT TO TENSER FLOW FORMAT

jobads_encoded.set_format('tf',
                          columns=['input_ids', 'attention_mask', 'token_type_ids', 'label_encoded'])

BATCH_SIZE = 16

def order(inp):
    '''
    This function will group all the inputs of BERT into single
    dictionary and then output it with labels.
    '''
    data = list(inp.values())
    return {
        'input_ids' : data[1],
        'attention_mask' : data[2],
        'token_type_ids' : data[3]
    }, data[0]
    
# Convert train split of 'jobads_encoded' to tensorflow format.
train_dataset = tf.data.Dataset.from_tensor_slices(jobads_encoded['train'][:])

# Set batch_size and shuffle.
train_dataset = train_dataset.batch(BATCH_SIZE).shuffle(100)

# Map the 'order' function.
train_dataset = train_dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)

# ... doing the same for test set ...
val_dataset = tf.data.Dataset.from_tensor_slices(jobads_encoded['validation'][:])
val_dataset = val_dataset.batch(BATCH_SIZE)
val_dataset = val_dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)

test_dataset = tf.data.Dataset.from_tensor_slices(jobads_encoded['test'][:])
test_dataset = test_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.map(order, num_parallel_calls=tf.data.AUTOTUNE)

inp, out = next(iter(train_dataset)) # a batch from train_dataset
print(inp, '\n\n', out)

{'input_ids': <tf.Tensor: shape=(16, 512), dtype=int64, numpy=
array([[  101,  3105, 12827, ...,     0,     0,     0],
       [  101,  3229,  9871, ...,     0,     0,     0],
       [  101,  2486, 15680, ...,     0,     0,     0],
       ...,
       [  101,  2304, 16901, ...,  2968,  1998,   102],
       [  101, 20052, 15680, ...,     0,     0,     0],
       [  101,  7570,  6894, ...,     0,     0,     0]], dtype=int64)>, 'attention_mask': <tf.Tensor: shape=(16, 512), dtype=int64, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)>, 'token_type_ids': <tf.Tensor: shape=(16, 512), dtype=int64, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int64)>}

In [1]:
train_dataset

NameError: name 'train_dataset' is not defined

In [14]:
# TO CREAT THE MODEL USING SUBCLASSING IPA IN KERAS
class BERTForClassification(tf.keras.Model):
    
    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert = bert_model
        self.fc = tf.keras.layers.Dense(num_classes, activation='softmax') # CREATING LAST DENSE LAYER WITH 3 NEURONS AND LAYER WITH SOFTMAX(PROBABILITY DISTRIBUTION)
        
    def call(self, inputs):
        x = self.bert(inputs)[1] # FOR TEXT CLASSIFICATION WE ONLY NEED POOLER OUTPUT
        return self.fc(x)

In [15]:
classifier = BERTForClassification(model, num_classes=3)

classifier.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy']      
)

In [16]:
# Assuming df is your DataFrame with a column named 'label' representing the class labels
# Replace 'label' with the actual column name in your DataFrame

# Calculate class weights
labels = train['label_encoded'].unique()
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=labels,
                                     y=train['label_encoded'])

# Create a dictionary with class weights
class_weight_dict = dict(zip(labels, class_weights))

print("Class Weights:", class_weight_dict)

Class Weights: {2: 0.6031042128603105, 1: 2.6666666666666665, 0: 1.0342205323193916}


In [17]:
history = classifier.fit(
            train_dataset,
            epochs=3,
            class_weight=class_weight_dict
            )

Epoch 1/3




In [None]:
val_loss, val_accuracy = classifier.evaluate(val_dataset)
print(f'Validation Loss: {val_loss}')
print(f'Validation Accuracy: {val_accuracy}')

In [None]:
val_predictions = classifier.predict(val_dataset)
val_predicted_labels = np.argmax(val_predictions, axis=1)

In [None]:

# Access the validation set
validation_set = jobads_encoded['validation']

true_labels = validation_set['label']

In [None]:
from sklearn.metrics import classification_report

# Assuming you have the true labels for your validation set
true_val_labels = true_labels

print(classification_report(true_val_labels, val_predicted_labels))

In [None]:
test_loss, test_accuracy = classifier.evaluate(test_dataset)
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy}')

In [None]:

test_set = jobads_encoded['test']

true_labels = test_set['label']

In [None]:
test_predictions = classifier.predict(test_dataset)
test_predicted_labels = np.argmax(test_predictions, axis=1)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix



conf_matrix = confusion_matrix(true_labels, test_predicted_labels)


print(classification_report(true_labels, test_predicted_labels))
print(confusion_matrix(true_labels, test_predicted_labels))


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Set the context for the plot
sns.set_context('talk')

# Create a figure and axis
plt.figure(figsize=(7, 4))

# Create the heatmap
sns.heatmap(conf_matrix, annot=True, fmt='g', cmap='viridis', cbar=True)

# Add labels
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('CONFUSION MATRIX')

# Add tick labels if needed (e.g., for class names)
class_names = ['Class1', 'Class2', 'Class3']  # replace with your class names
plt.xticks(ticks=np.arange(len(class_names)) + 0.5, labels=class_names)
plt.yticks(ticks=np.arange(len(class_names)) + 0.5, labels=class_names, rotation=0)

# Show the plot
plt.show()

In [1]:
classifier.save('ft_bert_temulen1')

NameError: name 'classifier' is not defined

SIMILARIRY function

In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert_temuulen')
loaded_model = tf.saved_model.load('bert_temuulen')

In [None]:
# Function to obtain BERT vectors for a text value
def get_bert_vectors(text):
    tokens = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        outputs = loaded_model(**tokens)
    embeddings = outputs.last_hidden_state[:, 0, :]
    return embeddings

# Apply the function to each row in the DataFrame
df['bert_vectors'] = df['job_description'].apply(get_bert_vectors)

In [16]:
labels = df['label'].unique().tolist()
labels

['registered_nurse', 'electrician', 'data_analyst']

In [17]:
num_labels = len(labels)
id2label = {id:label for id, label in enumerate(labels)}
label2id = {label: id for id, label in enumerate(labels)}

In [22]:
from transformers import pipeline, BertForSequenceClassification, BertTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3, id2label=id2label, label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'Accuracy': acc,
        'F1' : f1,
        'Precision' : precision,
        'Recall' : recall
        }
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """
    Computes accuracy, F1, precision, and recall for a given set of predictions.
    
    Args:
        pred (obj): An object containing label_ids and predictions attributes.
            - label_ids (array-like): A 1D array of true class labels.
            - predictions (array-like): A 2D array where each row represents
              an observation, and each column represents the probability of 
              that observation belonging to a certain class.
              
    Returns:
        dict: A dictionary containing the following metrics:
            - Accuracy (float): The proportion of correctly classified instances.
            - F1 (float): The macro F1 score, which is the harmonic mean of precision
              and recall. Macro averaging calculates the metric independently for
              each class and then takes the average.
            - Precision (float): The macro precision, which is the number of true
              positives divided by the sum of true positives and false positives.
            - Recall (float): The macro recall, which is the number of true positives
              divided by the sum of true positives and false negatives.
    """
    # Extract true labels from the input object
    labels = pred.label_ids
    
    # Obtain predicted class labels by finding the column index with the maximum probability
    preds = pred.predictions.argmax(-1)
    
    # Compute macro precision, recall, and F1 score using sklearn's precision_recall_fscore_support function
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    
    # Calculate the accuracy score using sklearn's accuracy_score function
    acc = accuracy_score(labels, preds)
    
    # Return the computed metrics as a dictionary
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }


In [24]:
training_args = TrainingArguments(
    output_dir='ft_bert_temulen2',
    do_train=True,
    do_eval=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.01,
    logging_strategy='steps',
    logging_dir='multi_class_logs',
    logging_steps=50,
    evaluation_strategy='steps',
    eval_steps=50,
    save_strategy='steps',
    load_best_model_at_end=True
)

In [32]:
trainer = Trainer(
    # the pre-trained model that will be fine-tuned 
    model=model,
     # training arguments that we defined above                        
    args=training_args,                 
    train_dataset=,         
    eval_dataset=,            
    compute_metrics= compute_metrics)

In [10]:
import torch

In [11]:
print(torch.version.cuda)

11.8


In [12]:
print(torch.cuda.is_available())

True


In [12]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

Looking in indexes: https://download.pytorch.org/whl/cu121


In [5]:
import tensorflow as tf

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
if tf.config.list_physical_devices('GPU'):
    print("GPU is detected")
else:
    print("GPU not detected")

Num GPUs Available:  0
GPU not detected


In [6]:
!pip install tensorflow[and-cuda]

^C


Collecting tensorflow[and-cuda]
  Using cached tensorflow-2.15.0-cp311-cp311-win_amd64.whl.metadata (3.6 kB)
Collecting nvidia-cublas-cu12==12.2.5.6 (from tensorflow[and-cuda])
  Downloading nvidia_cublas_cu12-12.2.5.6-py3-none-win_amd64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.2.142 (from tensorflow[and-cuda])
  Downloading nvidia_cuda_cupti_cu12-12.2.142-py3-none-win_amd64.whl.metadata (1.6 kB)
Collecting nvidia-cuda-nvcc-cu12==12.2.140 (from tensorflow[and-cuda])
  Downloading nvidia_cuda_nvcc_cu12-12.2.140-py3-none-win_amd64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.2.140 (from tensorflow[and-cuda])
  Downloading nvidia_cuda_nvrtc_cu12-12.2.140-py3-none-win_amd64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.2.140 (from tensorflow[and-cuda])
  Downloading nvidia_cuda_runtime_cu12-12.2.140-py3-none-win_amd64.whl.metadata (1.5 kB)
Collecting nvidia-cudnn-cu12==8.9.4.25 (from tensorflow[and-cuda])
  Downloading nvidia_cudnn_cu12-8.9

  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.0 requires FuzzyTM>=0.4.0, which is not installed.
tables 3.8.0 requires blosc2~=2.0.0, which is not installed.
azure-core 1.29.7 requires typing-extensions>=4.6.0, but you have typing-extensions 4.5.0 which is incompatible.
pydantic 2.5.3 requires typing-extensions>=4.6.1, but you have typing-extensions 4.5.0 which is incompatible.
pydantic-core 2.14.6 requires typing-extensions!=4.7.0,>=4.6.0, but you 

In [13]:
!pip install tensorflow



In [15]:
!pip install sklearn

Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'


  error: subprocess-exited-with-error
  
  × python setup.py egg_info did not run successfully.
  │ exit code: 1
  ╰─> [15 lines of output]
      The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
      rather than 'sklearn' for pip commands.
      
      Here is how to fix this error in the main use cases:
      - use 'pip install scikit-learn' rather than 'pip install sklearn'
      - replace 'sklearn' by 'scikit-learn' in your pip requirements files
        (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
      - if the 'sklearn' package is used by one of your dependencies,
        it would be great if you take some time to track which package uses
        'sklearn' instead of 'scikit-learn' and report it to their issue tracker
      - as a last resort, set the environment variable
        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
      
      More information is available at
      https://github.com/scikit-learn/sklearn-pypi-packag