In [2]:
import pandas as pd
import torch
from transformers import pipeline, BertForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import DatasetDict, Dataset
from sklearn.utils.class_weight import compute_class_weight
from torch import nn
from sklearn.metrics import f1_score
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support



In [3]:
# Import finalized dataset as pandas data frame.
df = pd.read_csv('data_jobads_final.csv', index_col=None)

# Replace newline characters in the 'job_description' column with a space.
df['job_description'] = df['job_description'].str.replace('\n', ' ')

# Remove None values.
df = df.dropna()

# Select and use only the last two columns for this evaluation.
df = df.iloc[:,-2:]

df.head(3)

Unnamed: 0,job_description,label
0,silver stream healthcare group offer great emp...,registered_nurse
1,create a better future for yourself !! recruit...,registered_nurse
2,"access healthcare, one of ireland’s leading he...",registered_nurse


In [4]:
labels = df['label'].unique().tolist()
labels

['registered_nurse', 'electrician', 'data_analyst']

In [5]:
num_labels = len(labels)
id2label = {id:label for id, label in enumerate(labels)}
label2id = {label: id for id, label in enumerate(labels)}

In [6]:
df['label_encoded'] = df.label.map(lambda x: label2id[x.strip()])
df.head(3)

Unnamed: 0,job_description,label,label_encoded
0,silver stream healthcare group offer great emp...,registered_nurse,0
1,create a better future for yourself !! recruit...,registered_nurse,0
2,"access healthcare, one of ireland’s leading he...",registered_nurse,0


In [7]:
print(df['label_encoded'].value_counts(normalize=True).sort_index(), '\n')
print(df['label_encoded'].value_counts())

0    0.552316
1    0.125214
2    0.322470
Name: label_encoded, dtype: float64 

0    644
2    376
1    146
Name: label_encoded, dtype: int64


In [8]:
# Split the DataFrame into training and testing sets while maintaining label proportions.
train, validation_test = train_test_split(df, test_size=0.3, random_state=820, stratify=df['label'])
test, validation = train_test_split(validation_test, test_size=0.5, random_state=820, stratify=validation_test['label'])

print('The shape of the TRAINING dataset is:', train.shape)
print('The shape of the VALIDATION dataset is:', validation.shape)
print('The shape of the TEST dataset is:', test.shape)

The shape of the TRAINING dataset is: (816, 3)
The shape of the VALIDATION dataset is: (175, 3)
The shape of the TEST dataset is: (175, 3)


In [9]:
# Convert DataFrames to Hugging Face Dataset.
train_dataset = Dataset.from_pandas(train)
val_dataset = Dataset.from_pandas(validation)
test_dataset = Dataset.from_pandas(test)

# Remove '__index_level_0__' feature
train_dataset = train_dataset.remove_columns('__index_level_0__')
val_dataset = val_dataset.remove_columns('__index_level_0__')
test_dataset = test_dataset.remove_columns('__index_level_0__')

# Create DatasetDict.
jobads = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test' : test_dataset
    })

In [10]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [11]:
def tokenize(examples):
    return tokenizer(examples['job_description'], truncation=True, max_length=512)

jobads_encoded = jobads.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/816 [00:00<?, ? examples/s]

Map:   0%|          | 0/175 [00:00<?, ? examples/s]

Map:   0%|          | 0/175 [00:00<?, ? examples/s]

In [12]:
jobads_encoded

DatasetDict({
    train: Dataset({
        features: ['job_description', 'label', 'label_encoded', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 816
    })
    validation: Dataset({
        features: ['job_description', 'label', 'label_encoded', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 175
    })
    test: Dataset({
        features: ['job_description', 'label', 'label_encoded', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 175
    })
})

In [13]:
jobads_encoded.set_format('torch', columns=['input_ids', 'attention_mask', 'label_encoded'])

In [14]:
jobads_encoded

DatasetDict({
    train: Dataset({
        features: ['job_description', 'label', 'label_encoded', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 816
    })
    validation: Dataset({
        features: ['job_description', 'label', 'label_encoded', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 175
    })
    test: Dataset({
        features: ['job_description', 'label', 'label_encoded', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 175
    })
})

In [15]:
# Assuming df is your DataFrame with a column named 'label' representing the class labels
# Replace 'label' with the actual column name in your DataFrame

# Calculate class weights
labels = train['label_encoded'].unique()
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=labels,
                                     y=train['label_encoded'])

class_weights = torch.from_numpy(class_weights).float()
print(class_weights)

tensor([0.6031, 2.6667, 1.0342])


In [16]:
class_weights

tensor([0.6031, 2.6667, 1.0342])

In [17]:
jobads_encoded = jobads_encoded.rename_column('label_encoded', 'labels')


In [18]:
jobads_encoded

DatasetDict({
    train: Dataset({
        features: ['job_description', 'label', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 816
    })
    validation: Dataset({
        features: ['job_description', 'label', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 175
    })
    test: Dataset({
        features: ['job_description', 'label', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 175
    })
})

In [19]:
class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        logits = outputs.get('logits')
        labels = inputs.get('labels')
        class_weights_device = class_weights.to(model.device)
        loss_func = nn.CrossEntropyLoss(weight=class_weights_device)
        loss = loss_func(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels=3,
                                                      id2label=id2label,
                                                      label2id=label2id)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [24]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'Accuracy': acc,
        'F1' : f1,
        'Precision' : precision,
        'Recall' : recall
        }
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """
    Computes accuracy, F1, precision, and recall for a given set of predictions.
    
    Args:
        pred (obj): An object containing label_ids and predictions attributes.
            - label_ids (array-like): A 1D array of true class labels.
            - predictions (array-like): A 2D array where each row represents
              an observation, and each column represents the probability of 
              that observation belonging to a certain class.
              
    Returns:
        dict: A dictionary containing the following metrics:
            - Accuracy (float): The proportion of correctly classified instances.
            - F1 (float): The macro F1 score, which is the harmonic mean of precision
              and recall. Macro averaging calculates the metric independently for
              each class and then takes the average.
            - Precision (float): The macro precision, which is the number of true
              positives divided by the sum of true positives and false positives.
            - Recall (float): The macro recall, which is the number of true positives
              divided by the sum of true positives and false negatives.
    """
    # Extract true labels from the input object
    labels = pred.label_ids
    
    # Obtain predicted class labels by finding the column index with the maximum probability
    preds = pred.predictions.argmax(-1)
    
    # Compute macro precision, recall, and F1 score using sklearn's precision_recall_fscore_support function
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    
    # Calculate the accuracy score using sklearn's accuracy_score function
    acc = accuracy_score(labels, preds)
    
    # Return the computed metrics as a dictionary
    return {
        'Accuracy': acc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    }


In [25]:
batch_size = 16
logging_steps = len(jobads_encoded['train']) // batch_size
output_dir = 'ft_bert_temuulen3'
training_args = TrainingArguments(output_dir=output_dir,
                                  num_train_epochs=3,
                                  learning_rate=1e-5,
                                  per_device_eval_batch_size=batch_size,
                                  per_gpu_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy='epoch',
                                  logging_steps=logging_steps,
                                  save_strategy='epoch',
                                  fp16=True,
                                  load_best_model_at_end=True)

In [26]:
trainer = WeightedLossTrainer(model=model,
                              args=training_args,
                              train_dataset=jobads_encoded['train'],
                              eval_dataset=jobads_encoded['validation'],
                              tokenizer=tokenizer,
                              compute_metrics= compute_metrics)

In [27]:
trainer.train()

  0%|          | 0/306 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
trainer.model.save_pretrained('ft_bert_temuulen3')
tokenizer.save_pretrained('ft_bert_temuulen3_tokenizer')

('ft_bert_temuulen3_tokenizer\\tokenizer_config.json',
 'ft_bert_temuulen3_tokenizer\\special_tokens_map.json',
 'ft_bert_temuulen3_tokenizer\\vocab.txt',
 'ft_bert_temuulen3_tokenizer\\added_tokens.json',
 'ft_bert_temuulen3_tokenizer\\tokenizer.json')

In [None]:
classifier = pipeline('text-classification', model='ft_bert_temuulen3', tokenizer='ft_bert_temuulen3_tokenizer')

In [None]:
test_nurse = classifier("I promote health, prevent disease, and help people who is sick")
test_el = classifier("If you don't have any light at home, I'm here to help.")
test_da = classifier("All day, I'm sitting in front of the screen, solving problems with my mouse and keyboard")
print(test_nurse)
print(test_el)
print(test_da)

[{'label': 'registered_nurse', 'score': 0.5082383155822754}]
[{'label': 'electrician', 'score': 0.7804846167564392}]
[{'label': 'data_analyst', 'score': 0.4992416799068451}]


In [None]:
df_jobads = pd.read_csv('data_jobseeker.csv', index_col=None)

In [None]:
df_jobads

Unnamed: 0,participant,data_collection,date,location,preferred_position,education,skill,experience
0,user_1,voice call,2023-12-17 15:30:00,"dublin, ireland",registered nurse,bachelor's degree: critical care nursing,"patient care, wound care, medical procedures, ...",registered nurse: 3 years
1,user_2,voice call,2023-12-27 11:50:00,"dublin, ireland",electrician,"high school diploma, vocational electrician ce...","circuit testing, blueprint reading, fault find...",residential electrician's helper: 1 year
2,user_3,google form,2023-12-31 13:39:00,"dublin, ireland",data analyst,"degree: master of science in data analytics, b...","python, data mining and extraction, data analy...",entry level data analyst: 1 year; data coordin...
