In [1]:
!python -V

Python 3.8.20


In [3]:
import torch
print(torch.version.cuda)  # CUDA version reported by PyTorch (may differ from the driver's CUDA version that nvidia-smi shows)
print(torch.cuda.is_available())  # Whether PyTorch reports CUDA as available

11.8
True


In [None]:
# pip install -r requirements.txt
# conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia

# Process data

In [5]:
import json
from collections import defaultdict

def _read_json(path):
    """Parse one UTF-8 encoded JSON file and return the decoded object."""
    with open(path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)


# Raw inputs: the legal corpus plus the train / validation / test retrieval sets.
legal_corpus = _read_json("/home/hoang/DSI-transformers/data/12_7/original/legal_corpus_update.json")
train_data = _read_json("/home/hoang/DSI-transformers/data/12_7/original/train_12x7_retrieval.json")
val_data = _read_json("/home/hoang/DSI-transformers/data/12_7/original/validation_12x7_retrieval.json")
test_data = _read_json("/home/hoang/DSI-transformers/data/12_7/original/test_12x7_retrieval.json")

def process_data(corpus, data, name="train"):
    """Flatten the corpus and its linked questions into ``{"text_id", "text"}`` records.

    Every article in ``corpus`` receives a sequential id (0, 1, 2, ... in corpus
    order, rendered as a string). For each article the function emits:

    * one ``"document: ..."`` record built from the law id, article title and
      article text -- skipped when ``name == "test"``;
    * one ``"question: ..."`` record for every question in ``data["items"]``
      that lists the article among its ``relevant_articles`` (a question is
      emitted at most once per article, in ``data["items"]`` order).

    Args:
        corpus: Iterable of laws; each law provides ``"law_id"`` and
            ``"articles"``, and each article provides ``"article_id"``,
            ``"title"`` and ``"text"``.
        data: Mapping with an ``"items"`` list; each item provides
            ``"question_full"`` and ``"relevant_articles"`` (entries carrying
            ``"law_id"`` and ``"article_id"``).
        name: Split label. Used in the progress message; ``"test"`` suppresses
            the document records.

    Returns:
        list[dict]: The generated records, grouped by article in corpus order.
    """
    # Build the (law_id, article_id) -> questions lookup once, instead of
    # rescanning every question and every relevant article for each corpus
    # article. Iterating data["items"] in order keeps the emitted question
    # order unchanged.
    questions_by_article = {}
    for question in data["items"]:
        seen_keys = set()
        for rel_article in question["relevant_articles"]:
            key = (rel_article["law_id"], rel_article["article_id"])
            # A question that cites the same article twice is emitted only once.
            if key not in seen_keys:
                seen_keys.add(key)
                questions_by_article.setdefault(key, []).append(question)

    # Total number of articles; only needed for the progress percentage.
    total = sum(1 for laws in corpus for _ in laws["articles"])

    new_data = []
    doc_id = 0  # not named `id`, which would shadow the builtin
    for laws in corpus:
        for article in laws["articles"]:
            text_id = str(doc_id)
            if name != "test":
                new_data.append({
                    "text_id": text_id,
                    "text": f'document: {laws["law_id"]}\n{article["title"]}\n{article["text"]}'
                })
            for question in questions_by_article.get((laws["law_id"], article["article_id"]), ()):
                new_data.append({
                    "text_id": text_id,
                    "text": f'question: {question["question_full"]}'
                })
            doc_id += 1
            # '\r' makes each progress message overwrite the previous one.
            print(f"Processing {name}_data: {doc_id / total:.1%}", end='\r')
    return new_data

def save_data(data, name="train", output_dir="/home/hoang/DSI-transformers/data/12_7/processed"):
    """Write records to ``<output_dir>/<name>.jsonl``, one JSON object per line.

    Args:
        data: Iterable of JSON-serialisable records.
        name: Split name, used as the file stem.
        output_dir: Existing directory that receives the file. Defaults to the
            location this function previously had hard-coded.
    """
    output_path = f"{output_dir}/{name}.jsonl"
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file is
    # opened explicitly as UTF-8 (matching how the inputs are read) instead of
    # relying on the platform's locale-dependent default encoding.
    with open(output_path, "w", encoding="utf-8") as f:
        for line in data:
            json_line = json.dumps(line, ensure_ascii=False)
            f.write(json_line + "\n")
    print(f"Saved to {output_path}")

def group_items_by_text(data):
    """Merge records that share the same ``"text"``.

    Returns one ``{"text_id": [...], "text": ...}`` dictionary per distinct
    text, in order of first appearance; ``"text_id"`` holds the ids of all
    records carrying that text, in input order.
    """
    # Collect the ids of every record under its text.
    ids_by_text = {}
    for entry in data:
        ids_by_text.setdefault(entry["text"], []).append(entry["text_id"])

    # Convert the mapping into the list-of-dictionaries output format.
    grouped_data = []
    for text, ids in ids_by_text.items():
        grouped_data.append({"text_id": ids, "text": text})
    return grouped_data

# Build the three splits; the test split holds questions only, so identical
# questions are merged into one record that lists all of their ids.
new_train_data = process_data(legal_corpus, train_data, name="train")
new_val_data = process_data(legal_corpus, val_data, name="val")
new_test_data = group_items_by_text(process_data(legal_corpus, test_data, name="test"))

# Persist each split as <split>.jsonl.
for split_name, split_rows in (
    ("train", new_train_data),
    ("val", new_val_data),
    ("test", new_test_data),
):
    save_data(split_rows, split_name)

Saved to /home/hoang/DSI-transformers/data/12_7/processed/train.jsonl
Saved to /home/hoang/DSI-transformers/data/12_7/processed/val.jsonl
Saved to /home/hoang/DSI-transformers/data/12_7/processed/test.jsonl


In [23]:
!nvidia-smi

Thu Nov 14 02:21:06 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.06             Driver Version: 535.183.06   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        Off | 00000000:86:00.0 Off |                  Off |
| 38%   64C    P2             370W / 450W |  11351MiB / 24564MiB |    100%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090        Off | 00000000:AF:00.0 Off |  

# Train model

Note: number of epochs = ceil(max_steps / number of batches per epoch)

https://huggingface.co/google-t5/t5-large

Language(s) (NLP): English, French, Romanian, German

Method used:
- Model: t5-large
- Indexing: Inputs2Target
- Document representation: Direct Indexing (only the first 32 tokens of each document are considered)
- Docid representation for retrieval: Naively Structured String Identifiers

In [None]:
# Fine-tune t5-large on the processed train split, restricted to GPU 0.
# Settings passed: 10000 max steps, 500 warmup steps, per-device batch size 2,
# evaluation and checkpoint every 1000 steps.
# NOTE(review): flag meanings are inferred from their names; confirm against train.py.
%cd /home/hoang/DSI-transformers
!CUDA_VISIBLE_DEVICES=0 python train.py \
    --model_name_or_path t5-large \
    --train_data data/12_7/processed/train.jsonl \
    --eval_data data/12_7/processed/val.jsonl \
    --test_data data/12_7/processed/test.jsonl \
    --max_steps 10000 --warmup_steps 500 \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 2 \
    --eval_steps 1000 --save_steps 1000 \
    --wandb_name JP-t5-large


 73%|█████████████████████████████▊           | 307/422 [01:42<00:37,  3.03it/s][A
 73%|█████████████████████████████▉           | 308/422 [01:42<00:36,  3.08it/s][A
 73%|██████████████████████████████           | 309/422 [01:43<00:36,  3.09it/s][A
 73%|██████████████████████████████           | 310/422 [01:43<00:35,  3.14it/s][A
 74%|██████████████████████████████▏          | 311/422 [01:43<00:36,  3.06it/s][A
 74%|██████████████████████████████▎          | 312/422 [01:44<00:35,  3.07it/s][A
 74%|██████████████████████████████▍          | 313/422 [01:44<00:35,  3.08it/s][A
 74%|██████████████████████████████▌          | 314/422 [01:44<00:37,  2.91it/s][A
 75%|██████████████████████████████▌          | 315/422 [01:45<00:37,  2.85it/s][A
 75%|██████████████████████████████▋          | 316/422 [01:45<00:36,  2.93it/s][A
 75%|██████████████████████████████▊          | 317/422 [01:45<00:35,  2.94it/s][A
 75%|██████████████████████████████▉          | 318/422 [01:46<00:34,  3.03

# Evaluate

In [None]:
# Evaluate results/checkpoint-8000 on the processed test split, restricted to GPU 0,
# with per-device eval batch size 32 and --num_beams 200.
%cd /home/hoang/DSI-transformers
!CUDA_VISIBLE_DEVICES=0 python evaluate.py \
    --model_name_or_path results/checkpoint-8000 \
    --test_data data/12_7/processed/test.jsonl \
    --per_device_eval_batch_size 32 \
    --num_beams 200

/home/hoang/DSI-transformers
  state_dict = torch.load(resolved_archive_file, map_location="cpu")
Using custom data configuration default-532232b719c8004a
Reusing dataset json (cache/json/default-532232b719c8004a/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 229.47it/s]
Evaluating test queries:   0%|                            | 0/5 [00:00<?, ?it/s]
true_labels: ['125', '129', '134', '141', '147', '150', '164', '166', '167', '168', '169']
pred_labels: ['352', '369', '356', '346', '137', '372', '347', '326', '168', '371', '337', '349', '301', '345', '342', '348', '370', '338', '163', '328', '327', '308', '336', '339', '340', '325', '368', '343', '330', '344', '332', '314', '334', '355', '304', '321', '315', '309', '341', '357', '329', '358', '335', '396', '313', '240', '362', '238', '402', '252', '394', '302', '359', '237', '322', '374', '324', '407', '320', '365', '354', '317', '318', '397', 

In [1]:
# Same evaluation of results/checkpoint-8000 as the previous cell, but with
# per-device eval batch size 2 instead of 32.
%cd /home/hoang/DSI-transformers
!CUDA_VISIBLE_DEVICES=0 python evaluate.py \
    --model_name_or_path results/checkpoint-8000 \
    --test_data data/12_7/processed/test.jsonl \
    --per_device_eval_batch_size 2 \
    --num_beams 200

/home/hoang/DSI-transformers
  state_dict = torch.load(resolved_archive_file, map_location="cpu")
Using custom data configuration default-532232b719c8004a
Reusing dataset json (cache/json/default-532232b719c8004a/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 267.41it/s]
Evaluating test queries:   0%|                           | 0/65 [00:00<?, ?it/s]
true_labels: ['125', '129', '134', '141', '147', '150', '164', '166', '167', '168', '169']
pred_labels: ['352', '369', '356', '346', '137', '372', '347', '326', '168', '371', '337', '349', '301', '345', '342', '348', '370', '338', '163', '328', '327', '308', '336', '339', '340', '325', '368', '343', '330', '344', '332', '314', '334', '355', '304', '321', '315', '309', '341', '357', '329', '358', '335', '396', '313', '240', '362', '238', '402', '252', '394', '302', '359', '237', '322', '374', '324', '407', '320', '365', '354', '317', '318', '397', 

Lý giải nguyên nhân kết quả thấp:
- Model T5 không hỗ trợ tiếng Nhật (chỉ hỗ trợ tiếng Anh, Pháp, Romania và Đức).
- Sử dụng phương pháp direct indexing, tức chỉ xem xét 32 token đầu tiên của mỗi văn bản; trong khi đó phần đầu của dữ liệu luật hầu hết là tên đạo luật, chương, tên chương nên bị lặp lại khá nhiều.
- Document và query trong dữ liệu có quan hệ nhiều-nhiều (n-n), nên model sẽ bị mơ hồ khi index.