In [1]:
!pip install accelerate -U
!pip install transformers soundfile torchmetrics gdown

Collecting accelerate
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub (from accelerate)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface-hub, accelerate
Successfully installed accelerate-0.23.0 huggingface-hub-0.17.1
Collecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m31.6 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics
  Downloading torchmetrics-1.1.2-py3-none-any.whl (764 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m764.8/764.8 kB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.

In [None]:
# !mkdir ./dataset
import gdown
def drive_download(idx, output):
    """Download a Google Drive file by its file id.

    Args:
        idx: Google Drive file id; appended to the ``uc?id=`` download URL.
        output: Local path the file is written to.

    Missing parent directories of ``output`` are created first, so targets
    such as ``./dataset/train_data.zip`` work on a fresh runtime.
    """
    import os

    # Only path-like targets have a parent directory to prepare.
    if isinstance(output, (str, os.PathLike)):
        parent = os.path.dirname(output)
        if parent:
            os.makedirs(parent, exist_ok=True)
    url = 'https://drive.google.com/uc?id=' + idx
    gdown.download(url, output, quiet=False)

In [None]:
# Data download
# Fetch the training and public-test archives from Google Drive, in this order.
for file_id, target in (
    ("1ZBL3h6bHMmd8MIUNXqg72PucUkC9ZSWJ", "./dataset/train_data.zip"),
    ("1ZepptsTrVSjQEx-dpBBmQ2b7xYFLn_64", "./dataset/public_test.zip"),
):
    drive_download(file_id, target)
# drive_download("1K_07kix1OgBGO2FNPh-Lxqr1yLbtqFYt", "./dataset/train.jsonl")

Downloading...
From: https://drive.google.com/uc?id=1ZBL3h6bHMmd8MIUNXqg72PucUkC9ZSWJ
To: /content/dataset/train_data.zip
100%|██████████| 733M/733M [00:06<00:00, 106MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1ZepptsTrVSjQEx-dpBBmQ2b7xYFLn_64
To: /content/dataset/public_test.zip
100%|██████████| 131M/131M [00:01<00:00, 72.1MB/s]


In [None]:
!unzip ./dataset/public_test.zip -d ./dataset/test
!unzip ./dataset/train_data.zip -d ./dataset/train

In [None]:
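# Vocabulary, sentences and token labels.
# NOTE: the downloads below are commented out, but the data-loader cell further
# down reads ./train_token_labels_20230909.json (and is passed
# ./train_20230909.jsonl). Uncomment them on a fresh runtime.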
# drive_download("186Tv-dPED5QiIJy4sRvlsNsvYxLpXfWX", "./vn_base_vocab.json")
# drive_download("1KXn1A17ce7jNX1qQhx8l0DQK1KYhbmls", "./train_token_labels_20230909.json")
# drive_download("1K-oNwBu2svshAkmifU9wISKPMvKgeKy4", "./train_20230909.jsonl")

Downloading...
From: https://drive.google.com/uc?id=186Tv-dPED5QiIJy4sRvlsNsvYxLpXfWX
To: /content/vn_base_vocab.json
100%|██████████| 1.35k/1.35k [00:00<00:00, 5.00MB/s]
Downloading...
From: https://drive.google.com/uc?id=1KXn1A17ce7jNX1qQhx8l0DQK1KYhbmls
To: /content/train_token_labels_20230909.json
100%|██████████| 271k/271k [00:00<00:00, 3.62MB/s]
Downloading...
From: https://drive.google.com/uc?id=1K-oNwBu2svshAkmifU9wISKPMvKgeKy4
To: /content/train_20230909.jsonl
100%|██████████| 3.30M/3.30M [00:00<00:00, 21.0MB/s]


In [None]:
# Upload the project modules before running the import cell below:
# model.py, utils.py, dataset.py, trainer.py

In [1]:
import re
import torch
import utils
from trainer import Trainer, TrainerV2
from model import BertSLU, BertSLUV2
from functools import partial
from dataset import BertDataset
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer

In [2]:
def custom_collate(tokenizer, is_train, batch):
    """Collate dataset items into one padded batch.

    Args:
        tokenizer: Callable applied to the list of ``"text"`` fields with
            ``return_tensors="pt"`` and ``padding="longest"``; its result must
            expose an ``"input_ids"`` tensor.
        is_train: When false, labels are not read from the items and zero
            placeholders are returned in their place.
        batch: Sequence of dicts with a ``"text"`` key and, when ``is_train``
            is true, ``"token_label"`` (a list) and ``"intent_label"`` keys.

    Returns:
        Tuple ``(inputs, token_labels, intent_labels)``.
    """
    texts = [sample["text"] for sample in batch]
    inputs = tokenizer(texts, return_tensors="pt", padding="longest")
    input_ids = inputs["input_ids"]

    if is_train:
        width = input_ids.size(1)

        def pad_labels(labels):
            # Right-pad with -100 up to the padded sequence width.
            # NOTE(review): presumably -100 matches the loss's ignore index — confirm in trainer.py.
            return torch.tensor(labels + [-100] * (width - len(labels)))

        token_labels = torch.stack([pad_labels(sample["token_label"]) for sample in batch])
        intent_labels = torch.tensor([sample["intent_label"] for sample in batch])
        return inputs, token_labels, intent_labels

    # No labels available: return zero placeholders so the tuple shape stays the same.
    return inputs, torch.zeros_like(input_ids), torch.zeros(input_ids.size(0))

In [3]:
def get_loader(annotation_path, token_label_path, batch_size=2, test_size=0.3, seed=None):
    """Build the training and validation DataLoaders.

    Args:
        annotation_path: Accepted for interface compatibility; not read here.
        token_label_path: JSON file loaded with ``utils.load_json``. Each value
            is one sample and must provide a ``"sentence"`` entry.
        batch_size: Batch size used by both loaders.
        test_size: Fraction of samples held out for validation. With ``0`` the
            full dataset backs both loaders.
        seed: Optional integer seed for the train/validation split. ``None``
            (the default) keeps the previous behaviour of drawing the split
            from the global torch RNG.

    Returns:
        Tuple ``(train_loader, valid_loader)``.
    """
    tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
    all_data = list(utils.load_json(token_label_path).values())
    all_text = [i["sentence"] for i in all_data]
    dataset = BertDataset(all_text, all_data)
    N = len(dataset)
    print("Len dataset", N)

    if test_size == 0:
        # No hold-out requested: train and validate on the same full dataset.
        train_set = dataset
        valid_set = dataset
    else:
        train_size = int(N * (1 - test_size))
        # Only pass a generator when a seed was given, so the default call is unchanged.
        split_kwargs = {}
        if seed is not None:
            split_kwargs["generator"] = torch.Generator().manual_seed(seed)
        train_set, valid_set = torch.utils.data.random_split(
            dataset, [train_size, N - train_size], **split_kwargs
        )

    # Both loaders collate with is_train=True because validation also reads the labels.
    collate = partial(custom_collate, tokenizer, True)
    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate
    )
    valid_loader = DataLoader(
        valid_set,
        batch_size=batch_size,
        collate_fn=collate
    )
    return train_loader, valid_loader

In [4]:
def get_test_loader(test_path, batch_size=2):
    """Build the unshuffled DataLoader used for inference.

    Args:
        test_path: JSON file loaded with ``utils.load_json``; its keys are
            returned as ids and its values are fed to ``BertDataset``.
        batch_size: Batch size of the loader.

    Returns:
        Tuple ``(test_loader, id_seqs, seqs)`` where ``id_seqs`` are the keys
        and ``seqs`` the values of the loaded mapping, in the same order.
    """
    tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
    id_to_seq = utils.load_json(test_path)
    id_seqs = list(id_to_seq.keys())
    seqs = list(id_to_seq.values())
    # is_train=False: the collate function returns zero placeholders for the labels.
    collate = partial(custom_collate, tokenizer, False)
    test_loader = DataLoader(
        BertDataset(seqs),
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate,
    )
    return test_loader, id_seqs, seqs

In [5]:
# Build the train/validation loaders with batches of 32 (test_size left at its default).
train_loader, valid_loader = get_loader(
    annotation_path="./train_20230909.jsonl",
    token_label_path="./train_token_labels_20230909.json",
    batch_size=32,
)
print(f"Len train_loader: {len(train_loader)} - Len valid_loader: {len(valid_loader)}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Len dataset 7490
Len train_loader: 164 - Len valid_loader: 71


In [6]:
# Build the inference loader with batches of 32.
# NOTE(review): no visible cell downloads ./4gram_test_sentences_v3_32w.json — it
# presumably has to be uploaded beforehand; confirm.
test_loader, test_file_id, all_seqs = get_test_loader(
    test_path="./4gram_test_sentences_v3_32w.json",
    batch_size=32,
)
len(test_loader)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


41

In [8]:
!mkdir ./checkpoint

In [15]:
# Rename the checkpoint stored at ./checkpoint/slu_intent.pt to slu_token.pt.
# NOTE(review): this cell's execution count (15) is out of sequence with its
# neighbours; on a top-to-bottom run it executes before any training cell has
# written a checkpoint — confirm whether it is still needed.
!mv ./checkpoint/slu_intent.pt ./checkpoint/slu_token.pt

In [17]:
class config:
    """Hyper-parameters and output path read by the optimizer and training cells."""

    # Where the trainer is asked to write its checkpoint.
    checkpoint_path = "./checkpoint/slu_intent.pt"

    # Training length.
    epochs = 15
    warmup_steps = 1000  # not referenced by the cells visible in this notebook

    # AdamW settings.
    learning_rate = 5e-5
    adam_eps = 1e-8
    weight_decay = 0.005

In [18]:
# Intent model: BertSLUV2 in "intent_class" mode on top of PhoBERT, with its optimizer and loss.
# NOTE(review): 15 and 9 presumably are the intent / token label counts (they match
# the sizes of the INVERSE_MAP_* tables further down) — confirm against model.py.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
intent_model = BertSLUV2("intent_class", 15, 9, "vinai/phobert-base-v2")
print(f"Num of param:", sum(weight.numel() for weight in intent_model.parameters()))
optimizer = torch.optim.AdamW(
    intent_model.parameters(),
    lr=config.learning_rate,
    eps=config.adam_eps,
    weight_decay=config.weight_decay,
)
criterion = torch.nn.CrossEntropyLoss()

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Num of param: 135009807


In [19]:
# Fine-tune the intent model; fit() is handed the path to use for its checkpoint.
trainer = TrainerV2(intent_model, optimizer, criterion, amp=False, device=device)
trainer.fit(
    train_loader,
    valid_loader,
    epochs=config.epochs,
    checkpoint=config.checkpoint_path,
)

Running on: Tesla T4
Total update step: 2460
Epoch: 1
Train step: 164 / 164 - loss: 0.16437 - acc: 0.9630
Valid step: 71 / 71 - loss: 0.00003 - acc: 1.0000
	 => Train loss: 0.65861 - Train acc: 0.85
	 => Valid loss: 0.07309 - Valid acc: 0.99
	 => Time: 0:00:30/step - lr: : 5.000000e-05
[+] Save checkpoint successfully
Epoch: 2
Train step: 164 / 164 - loss: 0.00331 - acc: 1.0000
Valid step: 71 / 71 - loss: 0.00000 - acc: 1.0000
	 => Train loss: 0.07080 - Train acc: 0.99
	 => Valid loss: 0.04030 - Valid acc: 0.99
	 => Time: 0:00:30/step - lr: : 5.000000e-05
[+] Save checkpoint successfully
Epoch: 3
Train step: 164 / 164 - loss: 0.00000 - acc: 1.0000
Valid step: 71 / 71 - loss: 0.00000 - acc: 1.0000
	 => Train loss: 0.04880 - Train acc: 0.99
	 => Valid loss: 0.05464 - Valid acc: 0.99
	 => Time: 0:00:30/step - lr: : 5.000000e-05
[+] Save checkpoint successfully
Epoch: 4
Train step: 164 / 164 - loss: 0.00000 - acc: 1.0000
Valid step: 71 / 71 - loss: 0.00000 - acc: 1.0000
	 => Train loss: 0.

In [11]:
# Token model: BertSLUV2 in "token_class" mode on top of PhoBERT, with its optimizer and loss.
# `optimizer` and `criterion` are rebound here, replacing the ones built for the intent model.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
token_model = BertSLUV2("token_class", 15, 9, "vinai/phobert-base-v2")
print(f"Num of param:", sum(weight.numel() for weight in token_model.parameters()))
optimizer = torch.optim.AdamW(
    token_model.parameters(),
    lr=config.learning_rate,
    eps=config.adam_eps,
    weight_decay=config.weight_decay,
)
criterion = torch.nn.CrossEntropyLoss()

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Num of param: 135005193


In [None]:
# Fine-tune the token model. It is saved to its own file — the path the
# inference cell loads for token_model — rather than config.checkpoint_path,
# which points at the intent checkpoint and would be overwritten here.
trainer = TrainerV2(token_model, optimizer, criterion, amp=False, device=device)
trainer.fit(
    train_loader,
    valid_loader,
    epochs=config.epochs,
    checkpoint="./checkpoint/slu_token.pt",
)

# **Inference**

In [None]:
# Fetch checkpoint_bert.pt from Google Drive into the working directory.
# NOTE(review): the inference cell below loads ./checkpoint/slu_intent.pt and
# ./checkpoint/slu_token.pt, not this file — confirm whether it is still needed.
drive_download("1ZBpjMakFSDdShWbn1_BHstr--wRCnlv7", "./checkpoint_bert.pt")

Downloading...
From: https://drive.google.com/uc?id=1ZBpjMakFSDdShWbn1_BHstr--wRCnlv7
To: /content/checkpoint_bert.pt
100%|██████████| 540M/540M [00:06<00:00, 89.8MB/s]


In [None]:
# Copy checkpoint_bert.pt from Google Drive into the working directory.
# NOTE(review): relies on ./drive being mounted; the only drive.mount call in this
# notebook sits in a later cell, so this fails on a top-to-bottom run.
!cp -r ./drive/MyDrive/datasets/checkpoint_bert.pt ./

In [20]:
# Rebuild both models from the pretrained backbone, then restore each one's
# weights from its own checkpoint file through a TrainerV2 wrapper.
# NOTE(review): `optimizer`, `criterion` and `device` are whatever the training
# cells left in the kernel; the optimizer was built over a different model
# instance's parameters. Confirm TrainerV2 does not rely on it for test().
intent_model = BertSLUV2("intent_class", 15, 9, "vinai/phobert-base-v2")
token_model = BertSLUV2("token_class", 15, 9, "vinai/phobert-base-v2")
intent_trainer = TrainerV2(intent_model, optimizer, criterion, amp=False, device=device)
intent_trainer.load_checkpoint("./checkpoint/slu_intent.pt")
token_trainer = TrainerV2(token_model, optimizer, criterion, amp=False, device=device)
token_trainer.load_checkpoint("./checkpoint/slu_token.pt")

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[+] Model load successful
[+] Model load successful


In [21]:
# trainer = TrainerV2(model, optimizer, criterion, amp=False, device=device)
# intent_trainer.load_checkpoint("./checkpoint/checkpoint_bert.pt")
# Run both models over the test loader. Each test() result is unpacked as a pair:
# the second element of the intent run and the first element of the token run are kept.
_ , all_intents = intent_trainer.test(test_loader)
all_tokens, _ = token_trainer.test(test_loader)
# Displayed as a sanity check: both lists should have one entry per test sentence.
len(all_tokens), len(all_intents)

 41 / 41
 41 / 41


(1299, 1299)

In [22]:
# Tokenizer passed to convert_into_output to turn predicted token spans back into text.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [25]:
# Peek at the first five predicted intent ids.
all_intents[:5]

[6, 0, 14, 3, 1]

In [26]:
# Token-label id -> entity type. The position in the list is the id.
INVERSE_MAP_TOKENS = dict(enumerate([
    'word',             # 0
    'time at',          # 1
    'device',           # 2
    'changing value',   # 3
    'scene',            # 4
    'command',          # 5
    'location',         # 6
    'duration',         # 7
    'target number',    # 8
]))

# Intent id -> intent name as written to the prediction file. The position in the list is the id.
INVERSE_MAP_INTENTS = dict(enumerate([
    'Giảm độ sáng của thiết bị',        # 0
    'Đóng thiết bị',                    # 1
    'Hủy hoạt cảnh',                    # 2
    'Tắt thiết bị',                     # 3
    'Tăng âm lượng của thiết bị',       # 4
    'Giảm mức độ của thiết bị',         # 5
    'Bật thiết bị',                     # 6
    'Tăng mức độ của thiết bị',         # 7
    'Tăng nhiệt độ của thiết bị',       # 8
    'Kiểm tra tình trạng thiết bị',     # 9
    'Mở thiết bị',                      # 10
    'Giảm âm lượng của thiết bị',       # 11
    'Kích hoạt cảnh',                   # 12
    'Giảm nhiệt độ của thiết bị',       # 13
    'Tăng độ sáng của thiết bị',        # 14
]))

In [27]:
def collect_label(token):
    """Group a per-token label list into runs of identical non-zero labels.

    The first element of ``token`` is dropped before grouping, so the returned
    indices are relative to ``token[1:]``.

    Args:
        token: List of integer label ids for one sequence.

    Returns:
        List of ``[start, end, label]`` triples (``end`` inclusive), one per
        maximal run of equal labels whose label is not 0, in order.
    """
    # A trailing -1 sentinel closes the last run, so no special case is needed after the loop.
    labels = token[1:] + [-1]
    spans = []
    start = 0
    for pos in range(1, len(labels)):
        if labels[pos] == labels[start]:
            continue
        # The label changed at pos: the run start..pos-1 is complete.
        if labels[start] != 0:
            spans.append([start, pos - 1, labels[start]])
        start = pos
    return spans

def convert_into_output(all_tokens, all_intents, all_seqs, test_file_id, tokenizer):
    """Turn raw predictions into one result dict per test sentence.

    Args:
        all_tokens: Per-sentence lists of predicted token-label ids.
        all_intents: Per-sentence predicted intent ids.
        all_seqs: The input sentences, in the same order.
        test_file_id: The file id reported for each sentence.
        tokenizer: Tokenizer providing ``tokenize``, ``convert_tokens_to_ids``
            and ``decode``.

    Returns:
        List of dicts with keys ``"intent"``, ``"file"`` and ``"entities"``;
        each entity is ``{"type": ..., "filler": ...}``.
    """
    total = len(all_tokens)
    results = []
    for idx, token in enumerate(all_tokens):
        pieces = tokenizer.tokenize(all_seqs[idx])
        spans = collect_label(token)
        record = {
            "intent": INVERSE_MAP_INTENTS[all_intents[idx]],
            "file": test_file_id[idx],
        }
        # Each span is [start, end, label]; label 0 is skipped.
        record["entities"] = [
            {
                "type": INVERSE_MAP_TOKENS[label_id],
                "filler": tokenizer.decode(
                    tokenizer.convert_tokens_to_ids(pieces[first:last + 1]),
                    skip_special_tokens=True,
                ),
            }
            for first, last, label_id in spans
            if label_id != 0
        ]
        results.append(record)
        # In-place progress counter on a single console line.
        print("\r", end="")
        print(f"\r {idx+1} / {total}", end="")
    return results

In [28]:
# Assemble the final per-file predictions from the token and intent outputs.
ans = convert_into_output(all_tokens, all_intents, all_seqs, test_file_id, tokenizer)

 1299 / 1299

In [29]:
# Inspect the first five assembled predictions.
ans[:5]

[{'intent': 'Bật thiết bị',
  'file': 'qPANF1Bx3XpmuIEjlPUm9Ez.wav',
  'entities': [{'type': 'command', 'filler': 'bật'},
   {'type': 'time at', 'filler': '9 giờ 40 phút'}]},
 {'intent': 'Giảm độ sáng của thiết bị',
  'file': '8LcLj1sHy9xAZF4ibvlPFca.wav',
  'entities': [{'type': 'command', 'filler': 'giảm'},
   {'type': 'changing value', 'filler': '33%'}]},
 {'intent': 'Tăng độ sáng của thiết bị',
  'file': 'Z5G73Vc0YuNWlgV48QZYyQD.wav',
  'entities': [{'type': 'command', 'filler': 'tăng'},
   {'type': 'changing value', 'filler': '88%'}]},
 {'intent': 'Tắt thiết bị',
  'file': 'jTA7bLvVi4zxyXt8c3ePOTT.wav',
  'entities': [{'type': 'command', 'filler': 'tắt'},
   {'type': 'device', 'filler': 'bóng sân'}]},
 {'intent': 'Đóng thiết bị',
  'file': 'QaiOJwzIYKRxVrLDQHxODfn.wav',
  'entities': [{'type': 'command', 'filler': 'đóng'},
   {'type': 'device', 'filler': 'lò nướng'}]}]

In [30]:
import json

# Write one JSON object per line. ensure_ascii=False keeps the Vietnamese intent
# names and fillers as readable UTF-8 instead of \uXXXX escapes; the file is
# already opened with encoding="utf-8".
with open("./predictions.jsonl", "w", encoding="utf-8") as f:
    for line in ans:
        json.dump(line, f, ensure_ascii=False)
        f.write('\n')

In [None]:
# Mount Google Drive at /content/drive (Colab only).
# NOTE(review): an earlier cell already copies from ./drive, so on a top-to-bottom
# run this mount happens too late for it — consider moving this cell up.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Back up a checkpoint to Google Drive.
# NOTE(review): no visible cell writes ./checkpoint/checkpoint_bert.pt (the download
# cell writes ./checkpoint_bert.pt and the trainer is given other paths) — confirm
# the source path.
!cp -r ./checkpoint/checkpoint_bert.pt ./drive/MyDrive/datasets