In [1]:
# %%bash
# pip install langchain
# pip install langchain-text-splitters
# pip install --upgrade pymilvus
# pip install "pymilvus[model]"
# pip install -qU langchain-text-splitters
# pip install beautifulsoup4

In [2]:
from langchain.text_splitter import CharacterTextSplitter
from pymilvus.model.hybrid import BGEM3EmbeddingFunction
from bs4 import BeautifulSoup
from scipy import sparse
import pandas as pd
import numpy as np
from glob import glob
import re
import os
import torch
import gc

In [3]:
file_names = glob("../data/VBPL_old/part1/*")
file_names.sort()
file_names

['../data/VBPL_old/part1\\Văn Bản Pháp Luật_p100000-100019.parquet',
 '../data/VBPL_old/part1\\Văn Bản Pháp Luật_p100020-100039.parquet',
 '../data/VBPL_old/part1\\Văn Bản Pháp Luật_p100040-100059.parquet',
 '../data/VBPL_old/part1\\Văn Bản Pháp Luật_p100060-100079.parquet',
 '../data/VBPL_old/part1\\Văn Bản Pháp Luật_p100080-100099.parquet',
 '../data/VBPL_old/part1\\Văn Bản Pháp Luật_p100100-100119.parquet',
 '../data/VBPL_old/part1\\Văn Bản Pháp Luật_p100120-100139.parquet',
 '../data/VBPL_old/part1\\Văn Bản Pháp Luật_p100140-100159.parquet',
 '../data/VBPL_old/part1\\Văn Bản Pháp Luật_p100160-100179.parquet',
 '../data/VBPL_old/part1\\Văn Bản Pháp Luật_p100180-100199.parquet',
 '../data/VBPL_old/part1\\Văn Bản Pháp Luật_p100200-100219.parquet',
 '../data/VBPL_old/part1\\Văn Bản Pháp Luật_p100220-100239.parquet',
 '../data/VBPL_old/part1\\Văn Bản Pháp Luật_p100240-100259.parquet',
 '../data/VBPL_old/part1\\Văn Bản Ph

In [4]:
text_splitter = CharacterTextSplitter(
    chunk_size=4096,
    chunk_overlap=256,
    length_function=len,
)
bge_m3_ef = BGEM3EmbeddingFunction(use_fp16=False, device="cpu")




Fetching 30 files: 100%|██████████| 30/30 [00:00<?, ?it/s]
  colbert_state_dict = torch.load(os.path.join(model_dir, 'colbert_linear.pt'), map_location='cpu')
  sparse_state_dict = torch.load(os.path.join(model_dir, 'sparse_linear.pt'), map_location='cpu')


In [16]:
part = "encoded_data/part0"

for file_name in file_names:
    page_index = re.findall(r"_(p.*).parquet", file_name)[0]
    os.makedirs(f"{page_index}", exist_ok=True)
    
    print(file_name)
    print(page_index)
    data = pd.read_parquet(file_name)

    print("Encode title...")
    title_embeds = bge_m3_ef.encode_documents(data["title"].tolist())
    data["title_dense"] = title_embeds["dense"]
    sparse.save_npz(f"{page_index}/title_sparse.npz", title_embeds["sparse"])

    print("Get text and chunking text...")
    data["content_text"] = data["content"].apply(
        lambda x: text_splitter.split_text(BeautifulSoup(x, "html.parser").text)
    )
    data.to_parquet(f"{page_index}/encoded_data.parquet", index=False)

    print("Encode content text...")
    content_text = data["content_text"].explode().tolist()
    print("Length: ", len(content_text))
    content_dense_list = []
    content_sparse_list = []
    mini_batch = 200
    for i in range(0, len(content_text), mini_batch):
        torch.cuda.empty_cache()
        gc.collect()
        print(f"Encode content text: {i}-{i+mini_batch}")
        content_ = content_text[i : i + mini_batch]
        content_embeds = bge_m3_ef.encode_documents(content_)
        content_dense_list.extend(content_embeds["dense"])
        content_sparse_list.append(content_embeds["sparse"])

    content_sparse = sparse.vstack(content_sparse_list)
    sparse.save_npz(f"{page_index}/content_sparse.npz", content_sparse)
    with open(f"{page_index}/content_dense.npy", "wb") as file:
        np.save(file, content_dense_list, allow_pickle=False)
    print('='*30)
    print()
    print()
    
print("Done")

../data/VBPL_old/part1\Văn Bản Pháp Luật_p100000-100019.parquet
p100000-100019
Encode title...


AttributeError: 'list' object has no attribute 'shape'