In [1]:
# Mount Google Drive inside the Colab VM so the project folder (and its CSV files) is reachable.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [2]:
%cd /content/gdrive/My Drive/hr_subcate/

/content/gdrive/My Drive/hr_subcate


In [3]:
!ls

test_hr_subcate.csv  train_hr_subcate.csv


### Import train and test data for HR sub-category classification

### Dataframe to Dataset

In [None]:
!pip install datasets

In [4]:
import pandas as pd
# Read both splits from the current working directory; each CSV becomes a pandas DataFrame.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_hr_subcate.csv', 'test_hr_subcate.csv')
)

In [5]:
# Every column after the first (the description text) is a label column, so the
# row-wise sum is the number of labels attached to each description.
rowsums = train.iloc[:, 1:].sum(axis=1)

# How many descriptions carry no label at all.
no_label_count = rowsums.eq(0).sum()

# Keep only the rows with at least one label; the original row index is retained.
train_df_new = train.loc[rowsums.ne(0)]

print(
    f"Number of descriptions without any label: {no_label_count}",
    f"Shape of original DataFrame: {train.shape}",
    f"Shape of filtered DataFrame: {train_df_new.shape}",
    sep="\n",
)

Number of descriptions without any label: 29
Shape of original DataFrame: (7264, 88)
Shape of filtered DataFrame: (7235, 88)


In [6]:
# Every column after the first (the description text) is a label column, so the
# row-wise sum is the number of labels attached to each description.
rowsums = test.iloc[:, 1:].sum(axis=1)

# How many descriptions carry no label at all.
no_label_count = rowsums.eq(0).sum()

# Keep only the rows with at least one label; the original row index is retained.
test_df_new = test.loc[rowsums.ne(0)]

print(
    f"Number of descriptions without any label: {no_label_count}",
    f"Shape of original DataFrame: {test.shape}",
    f"Shape of filtered DataFrame: {test_df_new.shape}",
    sep="\n",
)

Number of descriptions without any label: 7
Shape of original DataFrame: (1816, 88)
Shape of filtered DataFrame: (1809, 88)


In [7]:
# From here on `train` / `test` refer to the filtered frames (rows with at least one label).
train, test = train_df_new, test_df_new

In [8]:
from datasets import Dataset
# Wrap each filtered DataFrame in a Hugging Face `datasets.Dataset`.
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train, test)
)

In [9]:
train_dataset

Dataset({
    features: ['Description', 'Viewing of pay slip in HRIS', 'TaaS | Sprout - Change in Personal Information', 'HRIS Log in Issue', 'ID/Badge Card Request', 'HRIS - Approver Account Creation\xa0', 'HRIS - Offboarding\xa0', 'HRIS - Work Information Update', 'EAA Updates\xa0', 'Other HRIS Concerns', 'Inaccurate Data Correction', 'Personal Data Updating', 'Complicated Report', 'Simple Report', 'Exit Management Concern', 'Employement Verification', 'OT Allowance reimbursement', 'Reimbursable allowance (part of compensation)', 'Endorsement for Salary Increase', 'Endorsement for Promotion', 'Endorsement for Lateral Movement', 'Endorsement for Communication Allowance', 'Other movement endorsement/process inquiries', 'Certificate of Employment Request', 'Non-Disclosure Agreement Request', 'Memo Request', 'Employment Contract / Consultancy Agreement request', 'Personal information update', 'Marriage Certificate/Birth Certificate/Death Certificate/Solo Parent ID upload', 'HRIS unlock r

### Label Encoding

In [10]:
import os
def department_label_encoding(train_dataset):
    """Build the label <-> integer id mappings from the dataset's columns.

    Every column counts as a label except the ``'Description'`` text column and
    any ``__index_level_N__`` column (the name ``Dataset.from_pandas`` gives to
    a preserved pandas index).  Selecting by name, instead of discarding
    whichever column happens to be last, means a genuine label is never lost
    when the dataset carries no index column.

    Args:
        train_dataset: object exposing ``column_names``, an ordered list of
            column-name strings (e.g. a ``datasets.Dataset``).

    Returns:
        Tuple ``(label2id, id2label)``: ``label2id`` maps each label name to its
        position in column order (starting at 0); ``id2label`` is its inverse.
        Both mappings are also printed.
    """
    labels = [
        col for col in train_dataset.column_names
        if col != 'Description' and not col.startswith('__index_level_')
    ]
    label2id = {label: idx for idx, label in enumerate(labels)}
    id2label = {idx: label for label, idx in label2id.items()}
    print(label2id, id2label)
    return label2id, id2label

In [11]:
label2id, id2label = department_label_encoding(train_dataset)

{'Viewing of pay slip in HRIS': 0, 'TaaS | Sprout - Change in Personal Information': 1, 'HRIS Log in Issue': 2, 'ID/Badge Card Request': 3, 'HRIS - Approver Account Creation\xa0': 4, 'HRIS - Offboarding\xa0': 5, 'HRIS - Work Information Update': 6, 'EAA Updates\xa0': 7, 'Other HRIS Concerns': 8, 'Inaccurate Data Correction': 9, 'Personal Data Updating': 10, 'Complicated Report': 11, 'Simple Report': 12, 'Exit Management Concern': 13, 'Employement Verification': 14, 'OT Allowance reimbursement': 15, 'Reimbursable allowance (part of compensation)': 16, 'Endorsement for Salary Increase': 17, 'Endorsement for Promotion': 18, 'Endorsement for Lateral Movement': 19, 'Endorsement for Communication Allowance': 20, 'Other movement endorsement/process inquiries': 21, 'Certificate of Employment Request': 22, 'Non-Disclosure Agreement Request': 23, 'Memo Request': 24, 'Employment Contract / Consultancy Agreement request': 25, 'Personal information update': 26, 'Marriage Certificate/Birth Certifica

### Import model

In [12]:
!pip install protobuf



In [16]:
from transformers import AutoTokenizer
# Hub repo id of the checkpoint used as the starting point (the tokenizer here, the model weights later).
model_path = 'tangminhanh/hr_cate'
# Fetch the tokenizer files published with that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path)

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

### Tokenization Node

In [17]:
# Ordered list of label columns: everything except the text column and the pandas index
# column that Dataset.from_pandas appends (named "__index_level_N__").  Filtering by name,
# rather than dropping whichever column happens to be last, keeps a real label from being
# lost when no index column is present.
departments = [
    col for col in train_dataset.column_names
    if col != 'Description' and not col.startswith('__index_level_')
]
departments

['Viewing of pay slip in HRIS',
 'TaaS | Sprout - Change in Personal Information',
 'HRIS Log in Issue',
 'ID/Badge Card Request',
 'HRIS - Approver Account Creation\xa0',
 'HRIS - Offboarding\xa0',
 'HRIS - Work Information Update',
 'EAA Updates\xa0',
 'Other HRIS Concerns',
 'Inaccurate Data Correction',
 'Personal Data Updating',
 'Complicated Report',
 'Simple Report',
 'Exit Management Concern',
 'Employement Verification',
 'OT Allowance reimbursement',
 'Reimbursable allowance (part of compensation)',
 'Endorsement for Salary Increase',
 'Endorsement for Promotion',
 'Endorsement for Lateral Movement',
 'Endorsement for Communication Allowance',
 'Other movement endorsement/process inquiries',
 'Certificate of Employment Request',
 'Non-Disclosure Agreement Request',
 'Memo Request',
 'Employment Contract / Consultancy Agreement request',
 'Personal information update',
 'Marriage Certificate/Birth Certificate/Death Certificate/Solo Parent ID upload',
 'HRIS unlock request',
 '

In [18]:
def preprocess_function(example):
  """Tokenize one row and attach its multi-hot label vector.

  Relies on the notebook-level `departments`, `label2id` and `tokenizer`.

  Args:
    example: mapping for a single row, holding the "Description" text and one
      entry per name in `departments`.

  Returns:
    The tokenizer output for the description (truncated / padded to 512 tokens)
    with an added 'labels' list of floats: 1. at the id of every department
    whose column equals 1, and 0. everywhere else.
  """
  multi_hot = [0.] * len(departments)
  for department in departments:
    if example[department] == 1:
      multi_hot[label2id[department]] = 1.

  encoded = tokenizer(example["Description"], truncation=True, padding='max_length', max_length=512)
  encoded['labels'] = multi_hot
  return encoded

In [19]:
# Run preprocess_function over every row of each split, adding the tokenizer fields
# and the multi-hot 'labels' vector to the datasets.
tokenized_train_dataset = train_dataset.map(preprocess_function)
tokenized_test_dataset = test_dataset.map(preprocess_function)

Map:   0%|          | 0/7235 [00:00<?, ? examples/s]

Map:   0%|          | 0/1809 [00:00<?, ? examples/s]

### Train Model

In [4]:
!pip install huggingface_hub
from huggingface_hub import notebook_login

# Interactive Hugging Face login widget; presumably the stored token is what authorises
# the later trainer.push_to_hub upload — confirm if running non-interactively.
notebook_login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!pip3 install transformers[torch] accelerate evaluate sentencepiece accelerate

In [20]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np

def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``, overflow-free.

    The naive formula exponentiates ``-x``, which overflows (with a NumPy
    RuntimeWarning) for strongly negative inputs.  Here only ``exp(-|x|)`` is
    ever computed, which always lies in [0, 1]:

    * ``x >= 0``: ``1 / (1 + exp(-x))``  (identical to the naive formula)
    * ``x <  0``: ``exp(x) / (1 + exp(x))``  (algebraically the same value)

    Args:
        x: scalar or NumPy array of real values (e.g. raw logits).

    Returns:
        Value(s) in [0, 1] with the same shape as ``x``; a NumPy scalar when
        ``x`` is a scalar.
    """
    x = np.asarray(x)
    # exp of a non-positive number can never overflow.
    decay = np.exp(-np.abs(x))
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with an empty tuple turns a 0-d result back into a NumPy scalar
    # (what scalar input produced before) and leaves real arrays untouched.
    return result[()]

def compute_metrics(eval_pred):
    """Score multi-label predictions for the Trainer.

    Args:
        eval_pred: pair ``(predictions, labels)`` where ``predictions`` are raw
            logits and ``labels`` the reference multi-hot vectors.

    Returns:
        Dict with "accuracy" plus micro-averaged "f1", "precision" and "recall",
        computed after squashing the logits with ``sigmoid`` and thresholding
        the resulting probabilities at 0.5.
    """
    logits, references = eval_pred
    # A label is predicted when its sigmoid probability exceeds 0.5.
    binarized = (sigmoid(logits) > 0.5).astype(int)

    micro_scorers = {"f1": f1_score, "precision": precision_score, "recall": recall_score}
    results = {"accuracy": accuracy_score(references, binarized)}
    for metric_name, scorer in micro_scorers.items():
        results[metric_name] = scorer(references, binarized, average='micro')
    return results

In [21]:
len(label2id)

87

In [22]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding

# Load the checkpoint with a multi-label classification head sized for the sub-category labels.
# ignore_mismatched_sizes=True lets the checkpoint's differently-sized classifier be replaced
# by a freshly initialised one instead of failing on the shape mismatch.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification",
    ignore_mismatched_sizes=True,
)

config.json:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/568M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at tangminhanh/hr_cate and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([24]) in the checkpoint and torch.Size([87]) in the model instantiated
- classifier.weight: found shape torch.Size([24, 768]) in the checkpoint and torch.Size([87, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# Hyperparameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir='./hr_subcate',  # training outputs go here, relative to the working directory
    eval_strategy="epoch",  # evaluate once at the end of every epoch
    learning_rate=2e-5,  # hand-adjusted value; the earlier setting is not recorded in this notebook
    per_device_train_batch_size=32,  # raised from an earlier setting (not shown here)
    per_device_eval_batch_size=32,  # raised from an earlier setting (not shown here)
    num_train_epochs=10,  # raised from an earlier setting (not shown here)
    weight_decay=0.01,
)

In [30]:
# Batch collator built on the tokenizer. The examples were already padded to max_length=512
# in preprocess_function, so there is little left for it to pad at batch time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,  # the test split doubles as the per-epoch evaluation set
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # accuracy plus micro-averaged F1 / precision / recall
)

In [31]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.014265,0.732449,0.80829,0.902655,0.731788
2,No log,0.013995,0.747374,0.810089,0.876123,0.753311
3,0.012200,0.012992,0.771697,0.828546,0.89393,0.772075
4,0.012200,0.013106,0.789386,0.831019,0.873479,0.792494
5,0.009200,0.012604,0.789939,0.838823,0.88882,0.79415
6,0.009200,0.012743,0.791045,0.838337,0.878935,0.801325
7,0.007300,0.012612,0.797678,0.842196,0.878824,0.808499
8,0.007300,0.012309,0.798784,0.843634,0.871689,0.817329
9,0.006100,0.012452,0.802653,0.84578,0.878194,0.815673
10,0.006100,0.012383,0.807076,0.847409,0.880428,0.816777


TrainOutput(global_step=2270, training_loss=0.008326226656657484, metrics={'train_runtime': 1313.4525, 'train_samples_per_second': 55.084, 'train_steps_per_second': 1.728, 'total_flos': 9598885655500800.0, 'train_loss': 0.008326226656657484, 'epoch': 10.0})

In [32]:
trainer.evaluate()

{'eval_loss': 0.012383392080664635,
 'eval_accuracy': 0.8070757324488668,
 'eval_f1': 0.8474091039221301,
 'eval_precision': 0.8804283164782868,
 'eval_recall': 0.8167770419426048,
 'eval_runtime': 10.8708,
 'eval_samples_per_second': 166.41,
 'eval_steps_per_second': 5.243,
 'epoch': 10.0}

In [None]:
# NOTE(review): 'hr_techgroup' does not match the 'hr_subcate' naming used everywhere else in
# this notebook (output_dir, Hub repo) — it looks carried over from another notebook; confirm
# the intended local directory before running this cell.
trainer.save_model('hr_techgroup')

In [33]:
# Upload the trained model to the Hugging Face Hub. The positional argument is used as the
# commit message (the returned CommitInfo shows commit_message='hr_subcate'), not as the repo name.
trainer.push_to_hub('hr_subcate')

events.out.tfevents.1721721939.9cdcb57939d9.2051.2:   0%|          | 0.00/19.5k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

events.out.tfevents.1721723264.9cdcb57939d9.2051.3:   0%|          | 0.00/560 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/568M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/tangminhanh/hr_subcate/commit/5d47b6082ed53296880d542a9cb28e8156da9b74', commit_message='hr_subcate', commit_description='', oid='5d47b6082ed53296880d542a9cb28e8156da9b74', pr_url=None, pr_revision=None, pr_num=None)