In [1]:
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
%cd /content/gdrive/My Drive/fin_train_test/

/content/gdrive/My Drive/fin_train_test


In [3]:
!ls

test  train


### Import train, test Tech Group data of Technology Services department

### Dataframe to Dataset

In [4]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━[0m [32m317.4/547.8 kB[0m [31m9.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m42.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-an

In [4]:
import pandas as pd
train = pd.read_csv('train/fin_tg.csv')
test = pd.read_csv('test/fin_tg.csv')

In [5]:
# Calculate the sum of each row to determine the number of labels per description
rowsums = train.iloc[:, 1:].sum(axis=1)

# Count the number of descriptions without any label
no_label_count = (rowsums == 0).sum()

# Filter the DataFrame to exclude rows with no labels
train_df_new = train[rowsums != 0]

print(f"Number of descriptions without any label: {no_label_count}")
print(f"Shape of original DataFrame: {train.shape}")
print(f"Shape of filtered DataFrame: {train_df_new.shape}")

Number of descriptions without any label: 11
Shape of original DataFrame: (4092, 4)
Shape of filtered DataFrame: (4081, 4)


In [6]:
# Calculate the sum of each row to determine the number of labels per description
rowsums = test.iloc[:, 1:].sum(axis=1)

# Count the number of descriptions without any label
no_label_count = (rowsums == 0).sum()

# Filter the DataFrame to exclude rows with no labels
test_df_new = test[rowsums != 0]

print(f"Number of descriptions without any label: {no_label_count}")
print(f"Shape of original DataFrame: {test.shape}")
print(f"Shape of filtered DataFrame: {test_df_new.shape}")

Number of descriptions without any label: 4
Shape of original DataFrame: (1024, 4)
Shape of filtered DataFrame: (1020, 4)


In [7]:
train = train_df_new
test = test_df_new

In [8]:
from datasets import Dataset
train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)

In [10]:
train_dataset

Dataset({
    features: ['Description', 'Billing', 'Finance - Accounts Payables', 'Finance - Treasury', '__index_level_0__'],
    num_rows: 4081
})

### Label Encoding

In [9]:
import os
def department_label_encoding(train_dataset):
    labels = [col for col in train_dataset.column_names if col != 'Description']
    labels = labels[:-1]
    label2id = {label: idx for idx, label in enumerate(labels)}
    id2label = {idx: label for label, idx in label2id.items()}
    print(label2id, id2label)
    return label2id, id2label

In [10]:
label2id, id2label = department_label_encoding(train_dataset)

{'Billing': 0, 'Finance - Accounts Payables': 1, 'Finance - Treasury': 2} {0: 'Billing', 1: 'Finance - Accounts Payables', 2: 'Finance - Treasury'}


### import model

In [11]:
!pip install protobuf



In [12]:
from transformers import AutoTokenizer
model_path = 'microsoft/deberta-v3-small'
tokenizer = AutoTokenizer.from_pretrained(model_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### Tokenization Node

In [13]:
departments = [col for col in train_dataset.column_names if col != 'Description']
departments = departments[:-1]
departments

['Billing', 'Finance - Accounts Payables', 'Finance - Treasury']

In [14]:
def preprocess_function(example):
  text = example["Description"]
  labels = [0. for _ in range(len(departments))]
  for department in departments:
      if example[department] == 1:
          label_id = label2id[department]
          labels[label_id] = 1.

  example = tokenizer(text, truncation=True, padding='max_length', max_length=512)
  example['labels'] = labels
  return example

In [15]:
tokenized_train_dataset = train_dataset.map(preprocess_function)
tokenized_test_dataset = test_dataset.map(preprocess_function)

Map:   0%|          | 0/4081 [00:00<?, ? examples/s]

Map:   0%|          | 0/1020 [00:00<?, ? examples/s]

### Train Model

In [16]:
!pip install huggingface_hub
from huggingface_hub import notebook_login

notebook_login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
!pip3 install transformers[torch] accelerate evaluate sentencepiece accelerate

Collecting accelerate
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_c

In [17]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = sigmoid(predictions)
    predictions = (predictions > 0.5).astype(int)

    # Compute metrics
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='micro')
    precision = precision_score(labels, predictions, average='micro')
    recall = recall_score(labels, predictions, average='micro')

    return {"accuracy": accuracy, "f1": f1, "precision": precision, "recall": recall}

In [18]:
len(label2id)

3

In [19]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding

model = AutoModelForSequenceClassification.from_pretrained(
   model_path, num_labels=len(label2id),
           id2label=id2label, label2id=label2id,
                       problem_type = "multi_label_classification", ignore_mismatched_sizes=True)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
training_args = TrainingArguments(
    output_dir='./fin_techgroup',
    eval_strategy="epoch",
    learning_rate=2e-5,  # Adjusted learning rate
    per_device_train_batch_size=64,  # Increased batch size
    per_device_eval_batch_size=128,  # Increased eval batch size
    num_train_epochs=8,  # Increased epochs
    weight_decay=0.01,
)

In [29]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [30]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.127868,0.931373,0.934506,0.931774,0.937255
2,No log,0.071112,0.966667,0.966667,0.966667,0.966667
3,No log,0.078557,0.961765,0.962782,0.96184,0.963725
4,No log,0.051289,0.977451,0.977451,0.977451,0.977451
5,No log,0.061594,0.971569,0.972073,0.971596,0.972549
6,No log,0.059648,0.976471,0.976471,0.976471,0.976471
7,No log,0.061245,0.976471,0.976471,0.976471,0.976471
8,0.072700,0.057071,0.976471,0.976471,0.976471,0.976471


TrainOutput(global_step=512, training_loss=0.07121447124518454, metrics={'train_runtime': 566.3229, 'train_samples_per_second': 57.649, 'train_steps_per_second': 0.904, 'total_flos': 4325026810503168.0, 'train_loss': 0.07121447124518454, 'epoch': 8.0})

In [31]:
trainer.evaluate()

{'eval_loss': 0.0570712648332119,
 'eval_accuracy': 0.9764705882352941,
 'eval_f1': 0.9764705882352941,
 'eval_precision': 0.9764705882352941,
 'eval_recall': 0.9764705882352941,
 'eval_runtime': 5.7294,
 'eval_samples_per_second': 178.028,
 'eval_steps_per_second': 1.396,
 'epoch': 8.0}

In [32]:
trainer.push_to_hub('fin_tg')

events.out.tfevents.1721293725.311a2b46fc59.11840.0:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

events.out.tfevents.1721294426.311a2b46fc59.11840.3:   0%|          | 0.00/560 [00:00<?, ?B/s]

Upload 7 LFS files:   0%|          | 0/7 [00:00<?, ?it/s]

events.out.tfevents.1721293759.311a2b46fc59.11840.2:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

events.out.tfevents.1721293741.311a2b46fc59.11840.1:   0%|          | 0.00/5.24k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/568M [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/tangminhanh/fin_techgroup/commit/3376beb6a0226823836e9509e0ca1cdb59817b80', commit_message='fin_tg', commit_description='', oid='3376beb6a0226823836e9509e0ca1cdb59817b80', pr_url=None, pr_revision=None, pr_num=None)