In [None]:
!pip install accelerate -U
!pip install transformers soundfile torchmetrics gdown

In [3]:
!mkdir ./dataset
import gdown
def drive_download(idx, output):
    url = 'https://drive.google.com/uc?id=' + idx
    gdown.download(url, output, quiet=False)

In [4]:
# Data download
drive_download("1ZBL3h6bHMmd8MIUNXqg72PucUkC9ZSWJ", "./dataset/train_data.zip")
drive_download("1ZepptsTrVSjQEx-dpBBmQ2b7xYFLn_64", "./dataset/public_test.zip")
# drive_download("1K_07kix1OgBGO2FNPh-Lxqr1yLbtqFYt", "./dataset/train.jsonl")

Downloading...
From: https://drive.google.com/uc?id=1ZBL3h6bHMmd8MIUNXqg72PucUkC9ZSWJ
To: /content/dataset/train_data.zip
100%|██████████| 733M/733M [00:06<00:00, 106MB/s] 
Downloading...
From: https://drive.google.com/uc?id=1ZepptsTrVSjQEx-dpBBmQ2b7xYFLn_64
To: /content/dataset/public_test.zip
100%|██████████| 131M/131M [00:01<00:00, 72.1MB/s]


In [None]:
!unzip ./dataset/public_test.zip -d ./dataset/test
!unzip ./dataset/train_data.zip -d ./dataset/train

In [20]:
# Data vocab, sentence, and token label
# drive_download("186Tv-dPED5QiIJy4sRvlsNsvYxLpXfWX", "./vn_base_vocab.json")
# drive_download("1KXn1A17ce7jNX1qQhx8l0DQK1KYhbmls", "./train_token_labels_20230909.json")
# drive_download("1K-oNwBu2svshAkmifU9wISKPMvKgeKy4", "./train_20230909.jsonl")

Downloading...
From: https://drive.google.com/uc?id=186Tv-dPED5QiIJy4sRvlsNsvYxLpXfWX
To: /content/vn_base_vocab.json
100%|██████████| 1.35k/1.35k [00:00<00:00, 5.00MB/s]
Downloading...
From: https://drive.google.com/uc?id=1KXn1A17ce7jNX1qQhx8l0DQK1KYhbmls
To: /content/train_token_labels_20230909.json
100%|██████████| 271k/271k [00:00<00:00, 3.62MB/s]
Downloading...
From: https://drive.google.com/uc?id=1K-oNwBu2svshAkmifU9wISKPMvKgeKy4
To: /content/train_20230909.jsonl
100%|██████████| 3.30M/3.30M [00:00<00:00, 21.0MB/s]


In [None]:
#  Upload py file: model.py, utils.py, dataset.py, trainer.py

In [6]:
import torch
import utils
from trainer import Trainer
from model import BertSLU
from functools import partial
from dataset import BertDataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

In [7]:
def custom_collate(tokenizer, is_train, batch):
    inputs = tokenizer([i["text"] for i in batch], return_tensors="pt", padding="longest")
    if not is_train:
        return inputs, torch.zeros_like(inputs["input_ids"]), torch.zeros(inputs["input_ids"].size(0))
    seq_len = inputs["input_ids"].size(1)
    token_labels = torch.stack([
        torch.tensor(i["token_label"] + [-100]*(seq_len - len(i["token_label"]))) for i in batch
    ])
    intent_labels = torch.tensor([i["intent_label"] for i in batch])
    return inputs, token_labels, intent_labels

In [21]:
def get_loader(annotation_path, token_label_path, batch_size=2, test_size=0.3):
    tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
    annotations = utils.load_annotation(annotation_path)
    all_intent = [i["intent"] for i in annotations]
    all_text = [i["sentence"] for i in annotations]
    all_label = utils.load_json(token_label_path)
    all_label = [all_label, all_intent]
    dataset = BertDataset(all_text, all_label, utils.MAP_INTENT)
    N = len(dataset)
    train_size = int(N * (1-test_size))
    train_set, valid_set = torch.utils.data.random_split(dataset, [train_size, N-train_size])
    train_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=partial(custom_collate, tokenizer, True)
    )
    valid_loader = DataLoader(
        valid_set,
        batch_size=batch_size,
        collate_fn=partial(custom_collate, tokenizer, True)
    )
    return train_loader, valid_loader

In [8]:
def get_test_loader(test_path, batch_size=2):
    tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
    sequences = utils.load_json(test_path)
    id_seqs = [k for k, v in sequences.items()]
    seqs = [v for k, v in sequences.items()]
    dataset = BertDataset(seqs)
    test_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=partial(custom_collate, tokenizer, False)
    )
    return test_loader, id_seqs, seqs

In [22]:
train_loader, valid_loader = get_loader("./train_20230909.jsonl", "./train_token_labels_20230909.json", 16)
print(f"Len train_loader: {len(train_loader)} - Len valid_loader: {len(valid_loader)}")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Len train_loader: 469 - Len valid_loader: 141


In [9]:
test_loader, test_file_id, all_seqs = get_test_loader("./w2v_lasted_test_sentences.json", 8)
len(test_loader)

Downloading (…)lve/main/config.json:   0%|          | 0.00/557 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

Downloading (…)solve/main/bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


163

In [10]:
class config:
    epochs = 20
    checkpoint_path = "./checkpoint/checkpoint_bert.pt"
    learning_rate = 5e-5
    adam_eps = 1e-8
    warmup_steps = 1000
    weight_decay = 0.005

In [23]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = BertSLU(15, 9, "vinai/phobert-base-v2")
print(f"Num of param:", sum(p.numel() for p in model.parameters()))
optimizer = torch.optim.AdamW(utils.weight_decay(model, config.weight_decay), lr=config.learning_rate, eps=config.adam_eps)
criterion = torch.nn.CrossEntropyLoss()

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Num of param: 135016728


In [None]:
!mkdir ./checkpoint

In [None]:
trainer = Trainer(model, optimizer, criterion, amp=True, device=device)
# trainer.load_checkpoint("./checkpoint/checkpoint_bert.pt") Load checkpoint if wanna continue training
trainer.fit(train_loader, valid_loader, epochs=config.epochs, checkpoint=config.checkpoint_path)

Inference

In [None]:
drive_download("1ZBpjMakFSDdShWbn1_BHstr--wRCnlv7", "./checkpoint_bert.pt")

In [12]:
!cp -r ./drive/MyDrive/datasets/checkpoint_bert.pt ./

In [13]:
trainer = Trainer(model, optimizer, criterion, amp=True, device=device)
trainer.load_checkpoint("./checkpoint_bert.pt")
all_tokens, all_intents = trainer.test(test_loader)
len(all_tokens), len(all_intents)

[+] Model load successful
 163 / 163


(1299, 1299)

In [14]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
INVERSE_MAP_TOKENS = {
    0: 'word',
    1: 'time at',
    2: 'device',
    3: 'changing value',
    4: 'scene',
    5: 'command',
    6: 'location',
    7: 'duration',
    8: 'target number'
 }

INVERSE_MAP_INTENTS = {
    0: 'Giảm độ sáng của thiết bị',
    1: 'Đóng thiết bị',
    2: 'Hủy hoạt cảnh',
    3: 'Tắt thiết bị',
    4: 'Tăng âm lượng của thiết bị',
    5: 'Giảm mức độ của thiết bị',
    6: 'Bật thiết bị',
    7: 'Tăng mức độ của thiết bị',
    8: 'Tăng nhiệt độ của thiết bị',
    9: 'Kiểm tra tình trạng thiết bị',
    10: 'Mở thiết bị',
    11: 'Giảm âm lượng của thiết bị',
    12: 'Kích hoạt cảnh',
    13: 'Giảm nhiệt độ của thiết bị',
    14: 'Tăng độ sáng của thiết bị'
}

In [16]:
def collect_label(token):
    token = token[1:]
    for i in range(len(token) - 1, -1, -1):
        if token[i] != 0:
            token = token[:i+1]
            break
    token += [-1]
    map_labels = []
    cur = 0
    val = token[0]
    for idx, i in enumerate(token[1:], 1):
        if i == val:
            continue
        else:
            if val != 0:
                map_labels.append([cur, idx-1, val])
            val = i
            cur = idx
    return map_labels

def convert_into_output(all_tokens, all_intents, all_seqs, test_file_id, tokenizer):
    ans = []
    for idx in range(len(all_tokens)):
        token = all_tokens[idx]
        intent = all_intents[idx]
        seq = tokenizer.tokenize(all_seqs[idx])
        labels = collect_label(token)
        tmp_ans = {
            "intent": INVERSE_MAP_INTENTS[intent],
            "file": test_file_id[idx]
        }
        entities = []
        # print(labels)
        # print(seq)
        # return
        for label in labels:
            if label[-1] == 0:
                continue
            sub_text = seq[label[0]: label[1]+1]
            sub_text = tokenizer.decode(
                tokenizer.convert_tokens_to_ids(sub_text), skip_special_tokens=True
            )
            tmp_add = {"type": INVERSE_MAP_TOKENS[label[-1]], "filler": sub_text}
            # check = list(filter(lambda x: tmp_add["type"] == x["type"] and tmp_add["filler"] == x["filler"], entities))
            # if len(check):
                # continue
            entities += [tmp_add]
        tmp_ans["entities"] = entities
        ans.append(tmp_ans)
        print("\r", end="")
        print(f"\r {idx+1} / {len(all_tokens)}", end="")
    return ans

In [17]:
ans = convert_into_output(all_tokens, all_intents, all_seqs, test_file_id, tokenizer)

 1299 / 1299

In [18]:
ans[:5]

[{'intent': 'Tăng độ sáng của thiết bị',
  'file': 'gkr2nW4Zxwv9ay6iR1od5jP.wav',
  'entities': [{'type': 'command', 'filler': 'tăng'},
   {'type': 'device', 'filler': 'đèn hắt'},
   {'type': 'target number', 'filler': '8%'}]},
 {'intent': 'Tắt thiết bị',
  'file': 'WoGuEH1SVfdNDpDGXxcTbHJ.wav',
  'entities': [{'type': 'command', 'filler': 'tắt'},
   {'type': 'device', 'filler': 'đèn treo tường'}]},
 {'intent': 'Mở thiết bị',
  'file': 'SDU8HKVUyOmIpvwzWcMgJW7.wav',
  'entities': [{'type': 'time at', 'filler': '6 giờ 8 phút'},
   {'type': 'command', 'filler': 'mở'},
   {'type': 'device', 'filler': 'máy sưởi'}]},
 {'intent': 'Đóng thiết bị',
  'file': 'QaiOJwzIYKRxVrLDQHxODfn.wav',
  'entities': [{'type': 'command', 'filler': 'đóng'},
   {'type': 'device', 'filler': 'lò nướng'}]},
 {'intent': 'Bật thiết bị',
  'file': 'vda3D3tnIOiwHHelu3W7PJM.wav',
  'entities': [{'type': 'command', 'filler': 'bật'},
   {'type': 'device', 'filler': 'đèn bếd'}]}]

In [19]:
import json

with open("./predictions.jsonl", "w", encoding="utf-8") as f:
    for line in ans:
        json.dump(line, f)
        f.write('\n')

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp -r ./checkpoint/checkpoint_bert.pt ./drive/MyDrive/datasets