In [5]:
def download_and_setup_model(model_id, custom_save_path):
    """
    Download and setup the model in a custom location
    """
    import os
    from transformers import AutoTokenizer, AutoModel
    # Create directory if it doesn't exist
    os.makedirs(custom_save_path, exist_ok=True)
    
    # Download model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModel.from_pretrained(model_id)
    
    # Save to custom location
    tokenizer.save_pretrained(custom_save_path)
    model.save_pretrained(custom_save_path)
    
    return custom_save_path

custom_save_path = download_and_setup_model("NghiemAbe/Vi-Legal-Bi-Encoder-v2", "./embeddings/legal_roberta")

In [14]:
pwd

'/home/thiendc/projects/legal_retrieval'

In [23]:
from transformers import RobertaTokenizerFast, PhobertTokenizer
from tokenizers import ByteLevelBPETokenizer
from pathlib import Path
import tempfile
import os
import json
import shutil
def update_legal_tokenizer(
    base_model_path="/home/thiendc/projects/legal_retrieval/embeddings/legal_roberta",
    texts_list=None,
    output_dir="/home/thiendc/projects/legal_retrieval/embeddings/legal_roberta"):
    
    # Load the original config
    config_path = os.path.join(base_model_path, "config.json")
    with open(config_path, "r") as f:
        config = json.load(f)
    
    # Create a temporary directory to save texts
    with tempfile.TemporaryDirectory() as temp_dir:
        # Save texts to temporary file
        temp_file_path = os.path.join(temp_dir, "corpus.txt")
        with open(temp_file_path, "w", encoding="utf-8") as f:
            for text in texts_list:
                f.write(text + "\n")
        
        # Initialize tokenizer from the base model
        tokenizer = ByteLevelBPETokenizer.from_file(
            vocab_filename=os.path.join(base_model_path, "vocab.json"),
            merges_filename=os.path.join(base_model_path, "merges.txt")
        )
        
        # Train the tokenizer
        tokenizer.train(
            files=[temp_file_path],
            vocab_size=config["vocab_size"],
            min_frequency=2,
            special_tokens=[
                "<s>",
                "<pad>",
                "</s>",
                "<unk>",
                "<mask>"
            ]
        )
        
        # Ensure output directory exists
        os.makedirs(output_dir, exist_ok=True)
        
        # Save the updated tokenizer
        tokenizer.save_model(output_dir)
        
        # Convert to PhobertTokenizer
        updated_tokenizer = PhobertTokenizer(
            vocab_file=os.path.join(output_dir, "vocab.json"),
            merges_file=os.path.join(output_dir, "merges.txt"),
            bos_token="<s>",
            eos_token="</s>",
            sep_token="</s>",
            cls_token="<s>",
            unk_token="<unk>",
            pad_token="<pad>",
            mask_token="<mask>",
            max_len=config["max_position_embeddings"]
        )
        
        # Save tokenizer configuration
        updated_tokenizer.save_pretrained(output_dir)
        
        # Update and save the config
        tokenizer_config = {
            "tokenizer_class": "PhobertTokenizer",
            "model_max_length": config["max_position_embeddings"],
            "padding_side": "right",
            "truncation_side": "right",
            "bos_token": "<s>",
            "eos_token": "</s>",
            "unk_token": "<unk>",
            "sep_token": "</s>",
            "pad_token": "<pad>",
            "cls_token": "<s>",
            "mask_token": "<mask>",
        }
        
        config_path = os.path.join(output_dir, "tokenizer_config.json")
        with open(config_path, "w") as f:
            json.dump(tokenizer_config, f, indent=2)
            
        print(f"Updated tokenizer saved at: {output_dir}")
        print(f"Config saved at: {config_path}")
            
        return updated_tokenizer

In [12]:
from src.preprocessor.utils.dataset_level import read_json
list_corpus = read_json('/home/thiendc/projects/legal_retrieval/data/update_vocab.json')

In [24]:
updated_tokenizer = update_legal_tokenizer(texts_list= list_corpus)

# Test the tokenizer
test_text = "Your test legal text"
encoded = updated_tokenizer(test_text)
decoded = updated_tokenizer.decode(encoded["input_ids"])

print(f"Encoded: {encoded['input_ids']}")
print(f"Decoded: {decoded}")
print(f"Vocabulary size: {updated_tokenizer.vocab_size}")

# Verify compatibility
assert updated_tokenizer.vocab_size == 64001, "Vocabulary size mismatch"
assert updated_tokenizer.model_max_length == 258, "Max length mismatch"

Exception: Error while reading vocab & merges files: No such file or directory (os error 2)

In [None]:
os.path.join()

In [18]:
from src.preprocessor.utils.dataset_level import *
# import string
# old_vocab =read_txt('./data/vocab/phobert_vocab_raw.txt')
# vocabs = []
# for item in old_vocab:
#     item = item.split(" ")[0]
#     if len(item) > 1:
#         item = item.replace("@@", "")
#         for punc in string.punctuation:
#             if item.endswith(punc):
#                 item = item.replace(punc, "")
#     else:
#         item = item
#     vocabs.append(item)

In [26]:
from src.preprocessor.vocab.stopwords import STOP_WORDS
from src.preprocessor.vocab.roman_numerals_dict import ROMAN_DICT
from src.preprocessor.vocab.duties_dict import DUTIES
from src.preprocessor.vocab.legal_dict import LEGAL_DICT


corpus = set(read_json('./src/preprocessor/vocab/data/update_vocab.json'))
phobert_vocab = read_json('./src/preprocessor/vocab/data/phobert_vocab_processed.json')
DUTIES = set(DUTIES.values())
LEGAL_DICT = set(LEGAL_DICT.values())
# corpus.update(set(phobert_vocab))
corpus.update(ROMAN_DICT)
corpus.update(STOP_WORDS)
corpus.update(DUTIES)
corpus.update(LEGAL_DICT)
len(corpus)


13007

In [27]:
save_json(list(corpus), './src/preprocessor/vocab/data/update_vocab_v1.json')