In [None]:
!pip install accelerate -U
!pip install transformers soundfile torchmetrics gdown

In [None]:
# Upload the required files: model.py, utils.py, dataset.py, trainer.py, plus train_token_labels.json (for training) and test_sentences.json (for inference).

In [1]:
import re
import torch
import utils
from trainer import Trainer, TrainerV2
from model import BertSLU, BertSLUV2, BertSLUV3
from functools import partial
from dataset import BertDataset
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer

In [3]:
def custom_collate(tokenizer, is_train, batch):
    """Turn a list of dataset items into one padded batch.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(texts, return_tensors="pt", padding="longest")``; the
            value it returns must support ``["input_ids"]``.
        is_train: When falsy, all-zero placeholder labels are returned.
        batch: Items exposing a ``"text"`` entry. When ``is_train`` is truthy
            each item must also provide ``"token_label"`` (a list) and
            ``"intent_label"``.

    Returns:
        ``(inputs, token_labels, intent_labels)`` where ``inputs`` is whatever
        the tokenizer returned.
    """
    texts = [sample["text"] for sample in batch]
    inputs = tokenizer(texts, return_tensors="pt", padding="longest")
    input_ids = inputs["input_ids"]

    if not is_train:
        # No ground truth at inference time: return zero placeholders sized
        # from the tokenized batch.
        return inputs, torch.zeros_like(input_ids), torch.zeros(input_ids.size(0))

    width = input_ids.size(1)
    padded_rows = []
    for sample in batch:
        tags = sample["token_label"]
        # Right-pad every label row with -100 up to the padded batch width
        # (presumably so the loss ignores those positions -- confirm in trainer.py).
        padded_rows.append(torch.tensor(tags + [-100] * (width - len(tags))))

    token_labels = torch.stack(padded_rows)
    intent_labels = torch.tensor([sample["intent_label"] for sample in batch])
    return inputs, token_labels, intent_labels

In [4]:
def get_loader(annotation_path, token_label_path, batch_size=2, test_size=0.3, seed=None):
    """Build the training and validation loaders from the token-label JSON file.

    Args:
        annotation_path: Not used by this function; kept so existing calls
            keep working.
        token_label_path: Path handed to ``utils.load_json``. Every value of the
            loaded mapping must contain a ``"sentence"`` entry.
        batch_size: Batch size used for both loaders.
        test_size: Fraction of the dataset held out for validation. ``0`` means
            the full dataset is used for both training and validation.
        seed: Optional integer seed for the train/validation split. With the
            default ``None`` the split draws from torch's global RNG, exactly
            as before, so it differs between runs unless that RNG is seeded.

    Returns:
        ``(train_loader, valid_loader)``; only the training loader shuffles.
    """
    tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
    records = list(utils.load_json(token_label_path).values())
    sentences = [record["sentence"] for record in records]
    dataset = BertDataset(sentences, records)
    total = len(dataset)
    print("Len dataset", total)

    if test_size == 0:
        # Nothing to hold out: skip the (discarded) random split entirely.
        train_set = valid_set = dataset
    else:
        train_size = int(total * (1 - test_size))
        split_kwargs = {}
        if seed is not None:
            # A dedicated generator makes the split reproducible without
            # touching the global RNG state.
            split_kwargs["generator"] = torch.Generator().manual_seed(seed)
        train_set, valid_set = torch.utils.data.random_split(
            dataset, [train_size, total - train_size], **split_kwargs
        )

    # Both loaders collate in "train" mode because validation needs labels too.
    collate = partial(custom_collate, tokenizer, True)
    train_loader = DataLoader(
        train_set,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate,
    )
    valid_loader = DataLoader(
        valid_set,
        batch_size=batch_size,
        collate_fn=collate,
    )
    return train_loader, valid_loader

In [27]:
def get_test_loader(test_path, batch_size=2):
    """Build the inference loader from a JSON mapping loaded via ``utils.load_json``.

    Args:
        test_path: Path handed to ``utils.load_json``; the result must be a
            mapping (its keys and values are returned separately).
        batch_size: Batch size of the returned loader.

    Returns:
        ``(test_loader, id_seqs, seqs)`` where ``id_seqs`` are the mapping's
        keys and ``seqs`` its values, both in the mapping's iteration order.
        The loader does not shuffle, so batches follow that same order.
    """
    tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
    sequences = utils.load_json(test_path)
    id_seqs = list(sequences.keys())
    seqs = list(sequences.values())

    # Collate in inference mode: labels are zero placeholders.
    collate = partial(custom_collate, tokenizer, False)
    test_loader = DataLoader(
        BertDataset(seqs),
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate,
    )
    return test_loader, id_seqs, seqs

In [9]:
# Seed torch's global RNG before building the loaders: get_loader() splits
# train/validation with torch.utils.data.random_split, so without a seed every
# run (including one that resumes from a saved checkpoint) would validate on a
# different subset, possibly one the checkpoint was already trained on.
SPLIT_SEED = 42
torch.manual_seed(SPLIT_SEED)

train_loader, valid_loader = get_loader("./train_20230909.jsonl", "./train_token_labels_20230909.json", 32)
print(f"Len train_loader: {len(train_loader)} - Len valid_loader: {len(valid_loader)}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Len dataset 7490
Len train_loader: 164 - Len valid_loader: 71


In [28]:
# Build the inference loader (batch size 32) and keep the ids and raw sentences
# that get_test_loader returns alongside it; the last line shows the batch count.
test_loader, test_file_id, all_seqs = get_test_loader(
    "./4gram_test_sentences_v3_32w.json",
    batch_size=32,
)
len(test_loader)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


41

In [10]:
!mkdir ./checkpoint

In [22]:
class config:
    """Hyperparameters and checkpoint locations read by the training cells."""

    # --- schedule ---
    epochs = 15
    # Not referenced by any cell visible in this notebook; presumably consumed
    # elsewhere (e.g. trainer.py) -- TODO confirm.
    warmup_steps = 1000

    # --- AdamW settings (passed as lr / eps / weight_decay) ---
    learning_rate = 1e-5
    adam_eps = 1e-8
    weight_decay = 0.005

    # --- checkpoint files, one per model ---
    checkpoint_path_it = "./checkpoint/slu_intent.pt"  # intent model
    checkpoint_path_tk = "./checkpoint/slu_token.pt"  # token model

In [19]:
# Use the first GPU when one is visible, otherwise run on CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Intent model. The 15 / 9 arguments match the sizes of INVERSE_MAP_INTENTS and
# INVERSE_MAP_TOKENS -- presumably the class counts; confirm against model.py.
intent_model = BertSLUV3("intent_class", 15, 9, "vinai/phobert-base-v2")
print("Num of param:", sum(weight.numel() for weight in intent_model.parameters()))

# NOTE: `optimizer` / `criterion` are shared global names; the token-model cell
# rebinds them, so run this cell right before training the intent model.
optimizer = torch.optim.AdamW(
    intent_model.parameters(),
    lr=config.learning_rate,
    eps=config.adam_eps,
    weight_decay=config.weight_decay,
)
criterion = torch.nn.CrossEntropyLoss()

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Num of param: 157673999


In [23]:
# Train the intent model, resuming from its previously saved checkpoint.
trainer = TrainerV2(intent_model, optimizer, criterion, amp=False, device=device)

# Load from the same configured path that fit() is asked to checkpoint to,
# instead of a second hard-coded copy of that path.
# NOTE(review): on a fresh environment this file does not exist yet -- confirm
# how TrainerV2.load_checkpoint behaves in that case.
trainer.load_checkpoint(config.checkpoint_path_it)
trainer.fit(train_loader, valid_loader, epochs=config.epochs, checkpoint=config.checkpoint_path_it)

[+] Model load successful
Running on: Tesla T4
Total update step: 2460
Epoch: 1
Train step: 164 / 164 - loss: 0.00052 - acc: 1.0000
Valid step: 71 / 71 - loss: 0.00005 - acc: 1.0000
	 => Train loss: 0.00340 - Train acc: 1.00
	 => Valid loss: 0.04433 - Valid acc: 1.00
	 => Time: 0:00:39/step - lr: : 5.000000e-05
[+] Save checkpoint successfully
Epoch: 2
Train step: 164 / 164 - loss: 0.00001 - acc: 1.0000
Valid step: 71 / 71 - loss: 0.00003 - acc: 1.0000
	 => Train loss: 0.01269 - Train acc: 1.00
	 => Valid loss: 0.04624 - Valid acc: 1.00
	 => Time: 0:00:38/step - lr: : 5.000000e-05
[+] Save checkpoint successfully
Epoch: 3
Train step: 164 / 164 - loss: 0.00001 - acc: 1.0000
Valid step: 71 / 71 - loss: 0.00001 - acc: 1.0000
	 => Train loss: 0.00006 - Train acc: 1.00
	 => Valid loss: 0.06051 - Valid acc: 0.99
	 => Time: 0:00:39/step - lr: : 5.000000e-05
[+] Save checkpoint successfully
Epoch: 4
Train step: 164 / 164 - loss: 0.00624 - acc: 1.0000
Valid step: 71 / 71 - loss: 0.00145 - acc: 

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Train step: 70 / 164 - loss: 0.00003 - acc: 1.0000Traceback (most recent call last):
  File "/content/trainer.py", line 302, in fit
    self.forward(train_loader, "train")
  File "/content/trainer.py", line 276, in forward
    torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
  File "/usr/local/lib/python3.10/dist-packages/torch/nn/utils/clip_grad.py", line 76, in clip_grad_norm_
    torch._foreach_mul_(grads, clip_coef_clamped.to(device))  # type: ignore[call-overload]
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-23-3dde86392a67>", line 3, in <cell line: 3>
    trainer.fit(train_loader, valid_loader, epochs=config.epochs, checkpoint=config.checkpoint_path)
  File "/content/trainer.py", line 307, in fit
    sys.exit()

TypeError: ignored

In [14]:
# Use the first GPU when one is visible, otherwise run on CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Token model; same constructor arguments as the intent model except the mode string.
token_model = BertSLUV3("token_class", 15, 9, "vinai/phobert-base-v2")
print("Num of param:", sum(weight.numel() for weight in token_model.parameters()))

# NOTE: this rebinds the global `optimizer` / `criterion` also used by the
# intent-model cells; run this cell right before training the token model.
optimizer = torch.optim.AdamW(
    token_model.parameters(),
    lr=config.learning_rate,
    eps=config.adam_eps,
    weight_decay=config.weight_decay,
)
criterion = torch.nn.CrossEntropyLoss()

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Num of param: 135005193


In [None]:
# Train the token model from its freshly constructed state (no checkpoint is
# loaded here). `optimizer` must be the one built over token_model.parameters().
trainer = TrainerV2(token_model, optimizer, criterion, amp=False, device=device)
trainer.fit(
    train_loader,
    valid_loader,
    epochs=config.epochs,
    checkpoint=config.checkpoint_path_tk,
)

# **Inference**

In [12]:
# Copy the token-model checkpoint from Google Drive into the working directory.
# Requires Drive to be mounted under ./drive first (the mount cell sits further
# down in this notebook, so run it before this one).
!cp -r ./drive/MyDrive/checkpoint/slu_token.pt ./

In [25]:
# Rebuild both models and restore their trained weights for inference.
intent_model = BertSLUV3("intent_class", 15, 9, "vinai/phobert-base-v2")
token_model = BertSLUV3("token_class", 15, 9, "vinai/phobert-base-v2")

# Give each trainer an optimizer built over ITS OWN model's parameters. The
# leftover global `optimizer` from the training cells is bound to a different
# model instance, so sharing it would attach both trainers to the wrong weights.
intent_optimizer = torch.optim.AdamW(
    intent_model.parameters(),
    lr=config.learning_rate,
    eps=config.adam_eps,
    weight_decay=config.weight_decay,
)
token_optimizer = torch.optim.AdamW(
    token_model.parameters(),
    lr=config.learning_rate,
    eps=config.adam_eps,
    weight_decay=config.weight_decay,
)

intent_trainer = TrainerV2(intent_model, intent_optimizer, criterion, amp=False, device=device)
intent_trainer.load_checkpoint("./checkpoint/slu_intent.pt")

# The token checkpoint is read from the working directory (it is copied there
# from Drive), not from ./checkpoint.
token_trainer = TrainerV2(token_model, token_optimizer, criterion, amp=False, device=device)
token_trainer.load_checkpoint("./slu_token.pt")

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[+] Model load successful
[+] Model load successful


In [29]:
# Run both models over the test loader. TrainerV2.test returns a pair: the
# intent predictions are taken from the second slot of the intent model's
# result, the token predictions from the first slot of the token model's.
_, all_intents = intent_trainer.test(test_loader)
all_tokens, _ = token_trainer.test(test_loader)

# The conversion step indexes these lists in lockstep with all_seqs and
# test_file_id, so fail early (with the sizes) if any of them is out of step.
assert len(all_tokens) == len(all_intents) == len(all_seqs) == len(test_file_id), (
    len(all_tokens), len(all_intents), len(all_seqs), len(test_file_id)
)
len(all_tokens), len(all_intents)

 41 / 41
 41 / 41


(1299, 1299)

In [30]:
# Tokenizer passed to convert_into_output to re-tokenize the raw sentences;
# same pretrained name as the one the loader functions use.
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Peek at the first five predicted intent ids (keys of INVERSE_MAP_INTENTS).
all_intents[:5]

[6, 0, 14, 3, 1]

In [31]:
# Label id -> name lookups used when converting predictions to the output
# format. Ids are the list positions, so the ORDER of each list is significant.
INVERSE_MAP_TOKENS = dict(enumerate([
    'word',
    'time at',
    'device',
    'changing value',
    'scene',
    'command',
    'location',
    'duration',
    'target number',
]))

INVERSE_MAP_INTENTS = dict(enumerate([
    'Giảm độ sáng của thiết bị',
    'Đóng thiết bị',
    'Hủy hoạt cảnh',
    'Tắt thiết bị',
    'Tăng âm lượng của thiết bị',
    'Giảm mức độ của thiết bị',
    'Bật thiết bị',
    'Tăng mức độ của thiết bị',
    'Tăng nhiệt độ của thiết bị',
    'Kiểm tra tình trạng thiết bị',
    'Mở thiết bị',
    'Giảm âm lượng của thiết bị',
    'Kích hoạt cảnh',
    'Giảm nhiệt độ của thiết bị',
    'Tăng độ sáng của thiết bị',
]))

In [32]:
def collect_label(token):
    """Group a per-position label list into runs of equal, non-zero labels.

    The first element of ``token`` is dropped before grouping, so the returned
    positions are relative to ``token[1:]``. Label ``0`` is treated as
    background and never reported. The caller's list is not modified.

    Args:
        token: List of integer labels.

    Returns:
        A list of ``[start, end, label]`` triples (``end`` inclusive), in order
        of appearance.
    """
    body = token[1:]

    # Ignore the trailing background (0) positions.
    end = len(body)
    while end > 0 and body[end - 1] == 0:
        end -= 1

    # A -1 sentinel closes the final run so it is flushed inside the loop.
    padded = body[:end] + [-1]

    spans = []
    run_start = 0
    for pos in range(1, len(padded)):
        previous = padded[pos - 1]
        if padded[pos] == previous:
            continue
        # The label changed: emit the run that just ended unless it was background.
        if previous != 0:
            spans.append([run_start, pos - 1, previous])
        run_start = pos
    return spans

def convert_into_output(all_tokens, all_intents, all_seqs, test_file_id, tokenizer):
    """Convert raw predictions into the list of result dictionaries.

    Args:
        all_tokens: Per-sentence lists of predicted token label ids.
        all_intents: Per-sentence predicted intent ids (keys of
            ``INVERSE_MAP_INTENTS``).
        all_seqs: The raw sentences, in the same order as the predictions.
        test_file_id: The identifier written to ``"file"`` for each sentence.
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids`` and
            ``decode``.

    Returns:
        One dict per sentence with keys ``"intent"``, ``"file"`` and
        ``"entities"``; each entity is ``{"type": ..., "filler": ...}``.
        Progress is printed to stdout on a single, overwritten line.
    """
    total = len(all_tokens)
    ans = []
    for idx in range(total):
        # Re-tokenize the raw sentence; span positions from collect_label index
        # into this list.
        pieces = tokenizer.tokenize(all_seqs[idx])

        entities = []
        for start, end, label_id in collect_label(all_tokens[idx]):
            if label_id == 0:
                continue
            span = pieces[start:end + 1]
            if not span:
                # The predicted span lies entirely past the sentence's tokens
                # (e.g. on a special/padding position); emitting it would add an
                # entity with an empty filler.
                continue
            filler = tokenizer.decode(
                tokenizer.convert_tokens_to_ids(span), skip_special_tokens=True
            )
            entities.append({"type": INVERSE_MAP_TOKENS[label_id], "filler": filler})

        ans.append({
            "intent": INVERSE_MAP_INTENTS[all_intents[idx]],
            "file": test_file_id[idx],
            "entities": entities,
        })
        print(f"\r {idx+1} / {total}", end="")
    return ans

In [33]:
# Turn the raw predictions into the per-file result dictionaries.
ans = convert_into_output(all_tokens, all_intents, all_seqs, test_file_id, tokenizer)

 1299 / 1299

In [35]:
# Sanity-check the first five converted predictions.
ans[:5]

[{'intent': 'Bật thiết bị',
  'file': 'qPANF1Bx3XpmuIEjlPUm9Ez.wav',
  'entities': [{'type': 'command', 'filler': 'bật'},
   {'type': 'time at', 'filler': '9 giờ 40 phút'}]},
 {'intent': 'Giảm độ sáng của thiết bị',
  'file': '8LcLj1sHy9xAZF4ibvlPFca.wav',
  'entities': [{'type': 'command', 'filler': 'giảm'},
   {'type': 'changing value', 'filler': '33%'}]},
 {'intent': 'Tăng độ sáng của thiết bị',
  'file': 'Z5G73Vc0YuNWlgV48QZYyQD.wav',
  'entities': [{'type': 'command', 'filler': 'tăng'},
   {'type': 'changing value', 'filler': '88%'}]},
 {'intent': 'Tắt thiết bị',
  'file': 'jTA7bLvVi4zxyXt8c3ePOTT.wav',
  'entities': [{'type': 'command', 'filler': 'tắt'},
   {'type': 'device', 'filler': 'bóng sân'}]},
 {'intent': 'Đóng thiết bị',
  'file': 'QaiOJwzIYKRxVrLDQHxODfn.wav',
  'entities': [{'type': 'command', 'filler': 'đóng'},
   {'type': 'device', 'filler': 'lò nướng'}]}]

In [34]:
import json

# Write the predictions as JSON Lines: one JSON object per line.
# ensure_ascii=False keeps the Vietnamese text as real UTF-8 characters (the
# file is opened as UTF-8) instead of \uXXXX escapes; the parsed content is the same.
with open("./predictions.jsonl", "w", encoding="utf-8") as f:
    for record in ans:
        f.write(json.dumps(record, ensure_ascii=False))
        f.write('\n')

In [11]:
# Colab only: mount Google Drive at /content/drive. The `!cp` cells that read
# from or write to ./drive/MyDrive need this to have run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [36]:
# Back up the intent-model checkpoint to Google Drive (Drive must be mounted).
!cp -r ./checkpoint/slu_intent.pt ./drive/MyDrive/checkpoint