In [1]:
import os
import json
from tqdm import tqdm

In [2]:
def file_open(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return the result.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's contents.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)

In [3]:
# Directory holding the parsed documents, and the suffix every dataset file shares.
base_path = '../data/document/역도'
suffix = '_dataset.json'

# Document titles; each one maps to '<title>_dataset.json' under base_path.
name_list = [
    '역도의 이해',
    '역도의 스포츠 과학적 원리',
    '역도경기 기술의 구조와 훈련법',
    '역도체력의 구조와 훈련법',
    '역도 훈련프로그램 구성 및 지도안',
]

# Build the file paths in the same order as name_list.
path_list = []
for name in name_list:
    path_list.append(os.path.join(base_path, f'{name}{suffix}'))

# Load every dataset, printing its path first so a failing file is easy to spot.
dataset_list = []
for path in path_list:
    print(path)
    file = file_open(path)
    dataset_list.append(file)

../data/document/역도\역도의 이해_dataset.json
../data/document/역도\역도의 스포츠 과학적 원리_dataset.json
../data/document/역도\역도경기 기술의 구조와 훈련법_dataset.json
../data/document/역도\역도체력의 구조와 훈련법_dataset.json
../data/document/역도\역도 훈련프로그램 구성 및 지도안_dataset.json


# 문서 전처리
## 검색 전용 데이터 생성
1. 불필요 정보 제거
   - figure, chart, table의 hypotheticalQuestions 제거(?)
   - filepaths 제거
   - category 제거 
2. 페이지 수정
3. tiktoken 토큰 수를 기준으로 분할
   - 최소 토큰 350, 최대 토큰 500 
   - overlap: 최대 150 토큰

### 검색 전용 Document 구조
- id: int
- metadata: Dict
  - filename: str
  - page: List[int]
- page_content: str
- type: Document

In [4]:
import tiktoken
import re
from typing import List, Tuple

def get_token_length(text: str, encoding_name: str = "cl100k_base") -> int:
    """Count how many tokens ``text`` produces under a tiktoken encoding.

    Args:
        text: The string to tokenize.
        encoding_name: Name passed to ``tiktoken.get_encoding``.

    Returns:
        The length of the encoded token sequence.
    """
    token_ids = tiktoken.get_encoding(encoding_name).encode(text)
    return len(token_ids)

def split_into_sentences(text: str) -> List[Tuple[str, int]]:
    """Split ``text`` into sentences and pair each one with its token count.

    The text is cut at whitespace that directly follows '.', '!' or '?'.
    Each piece is stripped, and pieces that are empty after stripping are
    discarded.

    Args:
        text: The text to split.

    Returns:
        A list of ``(sentence, token_count)`` tuples in document order, where
        ``token_count`` comes from ``get_token_length``.
    """
    pairs = []
    for piece in re.split(r'(?<=[.!?])\s+', text):
        cleaned = piece.strip()
        if not cleaned:
            continue
        pairs.append((cleaned, get_token_length(cleaned)))
    return pairs

def get_overlap_from_previous_chunk(
    last_chunk: dict,
    current_chunk_tokens: int,
    current_chunk_sentences: List[Tuple[str, int]],
    min_tokens: int,
    max_tokens: int
    ) -> Tuple[List[Tuple[str, int]], int]:
    """Prepend trailing sentences of the previous chunk to the current chunk.

    The previous chunk's text is re-split into sentences and scanned from the
    last sentence backwards.

    * When the current chunk is below ``min_tokens``, a sentence is borrowed
      only if it still fits under ``max_tokens``; a sentence that does not fit
      is skipped and the scan continues with earlier sentences. The scan stops
      as soon as the borrowed tokens cover the shortfall to ``min_tokens``.
    * Otherwise sentences are borrowed until the next one would push the total
      over ``max_tokens``.

    Note: a later cell of this notebook defines a function with the same name,
    which replaces this one once that cell has run.

    Args:
        last_chunk: Previous chunk; its ``'page_content'`` string is read.
        current_chunk_tokens: Token count of the current chunk.
        current_chunk_sentences: ``(sentence, token_count)`` pairs of the current chunk.
        min_tokens: Lower token bound for a chunk.
        max_tokens: Upper token bound for a chunk.

    Returns:
        ``(extended_sentences, extended_tokens)``: the borrowed sentences
        followed by the current ones, and the combined token count.
    """
    previous_sentences = split_into_sentences(last_chunk['page_content'])

    # Tokens still missing to reach the minimum, and room left below the maximum.
    shortfall = min_tokens - current_chunk_tokens
    headroom = max_tokens - current_chunk_tokens
    below_minimum = current_chunk_tokens < min_tokens

    borrowed = []          # collected newest-first, reversed before returning
    borrowed_tokens = 0

    for pair in reversed(previous_sentences):
        tok = pair[1]
        if below_minimum:
            # Skip (do not stop at) a sentence that does not fit.
            if borrowed_tokens + tok > headroom:
                continue
            borrowed.append(pair)
            borrowed_tokens += tok
            if borrowed_tokens >= shortfall:
                break
        else:
            if current_chunk_tokens + borrowed_tokens + tok > max_tokens:
                break
            borrowed.append(pair)
            borrowed_tokens += tok

    borrowed.reverse()
    return borrowed + current_chunk_sentences, current_chunk_tokens + borrowed_tokens


def _flush_pending_chunk(
    chunks: List[dict],
    pending_sentences: List[Tuple[str, int]],
    pending_tokens: int,
    chunk_id: int,
    min_tokens: int,
    max_tokens: int,
    ) -> int:
    """Append the pending sentences to ``chunks`` as one chunk; return the new chunk id.

    When a previous chunk exists, its trailing sentences are prepended as
    overlap via ``get_overlap_from_previous_chunk``. The first chunk is
    emitted without overlap.
    """
    if chunks:
        pending_sentences, _ = get_overlap_from_previous_chunk(
            last_chunk=chunks[-1],
            current_chunk_tokens=pending_tokens,
            current_chunk_sentences=pending_sentences,
            min_tokens=min_tokens,
            max_tokens=max_tokens,
        )

    chunk_id += 1
    chunks.append({
        'chunk_id': chunk_id,
        'page_content': ' '.join(s for s, _ in pending_sentences),
    })
    return chunk_id


def chunk_by_token_range_with_sentence_overlap(
    doc: dict,
    chunk_id: int,
    min_tokens: int = 350,
    max_tokens: int = 500,
    ) -> Tuple[List[dict], List[dict], int]:
    """Split one document into sentence-aligned text chunks plus table records.

    Text is gathered from ``doc['content']``: the caption (if present) and
    detail of every 'figure' / 'chart' / 'table' element, and the information
    string of every 'paragraph' element, each followed by a newline. The text
    is split into sentences, and sentences are accumulated until the pending
    chunk holds at least ``min_tokens`` tokens; the chunk is then emitted with
    overlap borrowed from the previous chunk.

    Fixes compared with the earlier version of this function:

    * Sentences still pending when the input ends are emitted as a final
      chunk instead of being silently dropped.
    * A pending chunk is also emitted when adding the next sentence would push
      it past ``max_tokens``. Previously, once the pending chunk reached
      ``max_tokens`` the flush condition could never fire again and the chunk
      grew without bound.
    * The local variable no longer shadows the ``json`` module name.

    Args:
        doc: Document dict; ``doc['content']`` is iterated, and each element's
            ``'category'`` and ``'information'`` entries are read.
        chunk_id: Last chunk id already used; ids assigned here continue from it.
        min_tokens: Pending token count at which a chunk is emitted.
        max_tokens: Upper bound for accumulated sentences. A single sentence
            longer than this becomes a chunk of its own.

    Returns:
        ``(chunks, json_list, chunk_id)``: text chunks as dicts with
        ``'chunk_id'`` and ``'page_content'``; table records as dicts with
        ``'chunk_id'``, ``'table'`` and ``'caption'``; and the last id assigned.
    """
    json_list = []
    text_parts = []
    for element in doc['content']:
        category = element['category']

        if category == 'table':
            table_caption = ''
            # NOTE(review): the caption is looked up on the element here but on
            # element['information'] below — confirm which key the data uses.
            if 'caption' in element:
                # Wrapped in blank lines so the caption can be used when the
                # table is passed to the LLM as context.
                table_caption = '\n\n' + element['caption'] + '\n\n'
            chunk_id += 1
            json_list.append({'chunk_id': chunk_id,
                              'table': element['information']['table_json'],
                              'caption': table_caption})

        if category in ['figure', 'chart', 'table']:
            info = element['information']
            if 'caption' in info:
                text_parts.append(info['caption'] + '\n')
            text_parts.append(info['detail'] + '\n')

        if category == 'paragraph':
            text_parts.append(element['information'] + '\n')

    sentences = split_into_sentences(''.join(text_parts))

    chunks = []
    pending_sentences = []
    pending_tokens = 0

    for sentence, token_count in sentences:
        # A single sentence longer than max_tokens becomes its own chunk.
        if token_count > max_tokens:
            if pending_sentences:
                chunk_id = _flush_pending_chunk(
                    chunks, pending_sentences, pending_tokens,
                    chunk_id, min_tokens, max_tokens,
                )
            chunk_id += 1
            chunks.append({
                'chunk_id': chunk_id,
                'page_content': sentence,
            })
            pending_sentences = []
            pending_tokens = 0
            continue

        # Emit the pending chunk once it has reached min_tokens, or when the
        # next sentence would push it past max_tokens.
        reached_minimum = pending_tokens >= min_tokens
        would_overflow = pending_tokens + token_count > max_tokens
        if pending_sentences and (reached_minimum or would_overflow):
            chunk_id = _flush_pending_chunk(
                chunks, pending_sentences, pending_tokens,
                chunk_id, min_tokens, max_tokens,
            )
            pending_sentences = []
            pending_tokens = 0

        pending_sentences.append((sentence, token_count))
        pending_tokens += token_count

    # Emit whatever is left so the end of the document is not lost.
    if pending_sentences:
        chunk_id = _flush_pending_chunk(
            chunks, pending_sentences, pending_tokens,
            chunk_id, min_tokens, max_tokens,
        )

    return chunks, json_list, chunk_id

In [5]:
def get_overlap_from_previous_chunk(
    last_chunk: dict,
    current_chunk_tokens: int,
    current_chunk_sentences: List[Tuple[str, int]],
    min_tokens: int,
    max_tokens: int,
    ) -> Tuple[List[Tuple[str, int]], int]:
    """Prepend as many trailing sentences of the previous chunk as fit under ``max_tokens``.

    The previous chunk's text is re-split into sentences and walked from the
    last sentence backwards. Sentences are borrowed until the next one would
    exceed the remaining token budget, so the overlap is always a contiguous
    tail of the previous chunk.

    This definition shares its name with one in an earlier cell and replaces
    it once this cell has run.

    Args:
        last_chunk: Previous chunk; its ``'page_content'`` string is read.
        current_chunk_tokens: Token count of the current chunk.
        current_chunk_sentences: ``(sentence, token_count)`` pairs of the current chunk.
        min_tokens: Accepted for signature compatibility; not used here.
        max_tokens: Upper token bound for the extended chunk.

    Returns:
        ``(extended_sentences, extended_tokens)``: the borrowed sentences
        followed by the current ones, and the combined token count.
    """
    previous_sentences = split_into_sentences(last_chunk['page_content'])

    # Remaining budget; clamped to zero if the current chunk already exceeds the maximum.
    budget = max_tokens - current_chunk_tokens
    if budget < 0:
        budget = 0

    overlap = []        # collected newest-first, reversed below
    overlap_tokens = 0
    for pair in reversed(previous_sentences):
        if overlap_tokens + pair[1] > budget:
            break
        overlap.append(pair)
        overlap_tokens += pair[1]
    overlap.reverse()

    return overlap + current_chunk_sentences, current_chunk_tokens + overlap_tokens

In [6]:
from langchain_core.documents import Document

def make_dataset_for_search(dataset, before_page, before_id, chunk_id, chunk_size_min=350, chunk_size_max=500):
    """Turn one parsed dataset into search-ready text chunks and table records.

    For every document in ``dataset['documents']`` the metadata is copied,
    ``'filepath'`` is replaced by ``'filename'`` (the part after the last '/'),
    and ``'pages'`` is replaced by ``'page'`` with ``before_page`` added to each
    entry. The document is then chunked, and every chunk receives a copy of
    that metadata plus its ``'chunk_id'``.

    ``chunk_size_min`` and ``chunk_size_max`` are now forwarded to the chunker;
    previously they were accepted but ignored.

    Args:
        dataset: Dict with a ``'documents'`` list; each document provides
            ``'meta'`` (with ``'filepath'`` and ``'pages'``) and the content
            consumed by the chunker.
        before_page: Offset added to every page number.
        before_id: Offset for table ``doc_id`` values; document ``i``
            (0-based) gets ``before_id + i + 1``.
        chunk_id: Last chunk id already used.
        chunk_size_min: Minimum token count per chunk.
        chunk_size_max: Maximum token count per chunk.

    Returns:
        ``(new_dataset, new_table_dataset, chunk_id)``: the text chunks, the
        table records, and the last chunk id assigned.
    """
    new_dataset = []
    new_table_dataset = []
    for i, doc in enumerate(tqdm(dataset['documents'])):
        # Work on a copy so the source dataset is left untouched.
        metadata = doc['meta'].copy()
        metadata['filename'] = metadata.pop('filepath').split('/')[-1]
        metadata['page'] = [page + before_page for page in metadata.pop('pages')]

        chunks, table_chunk, chunk_id = chunk_by_token_range_with_sentence_overlap(
            doc,
            chunk_id,
            min_tokens=chunk_size_min,
            max_tokens=chunk_size_max,
        )

        for chunk in chunks:
            chunk_metadata = metadata.copy()
            chunk_metadata['chunk_id'] = chunk['chunk_id']
            new_dataset.append({
                'metadata': chunk_metadata,
                'page_content': chunk['page_content'],
            })

        for table in table_chunk:
            new_table_dataset.append({
                'metadata': {
                    'chunk_id': table['chunk_id'],
                    'doc_id': before_id + i + 1,
                    'caption': table['caption'],
                },
                'page_content': table['table'],
            })

    return new_dataset, new_table_dataset, chunk_id


In [7]:
# Per-dataset lists of text chunks, and one flat list of table records.
new_dataset_list = []
new_table_dataset_list = []

# Running offsets carried from one dataset to the next.
before_page = 0
before_id = 0
chunk_id = 0

for dataset in dataset_list:
    new_dataset, new_table_dataset, chunk_id = make_dataset_for_search(dataset, before_page, before_id, chunk_id)
    new_dataset_list.append(new_dataset)
    new_table_dataset_list.extend(new_table_dataset)

    # Advance the document offset so table doc_id values stay unique across
    # datasets; without this every dataset restarts at doc_id 1.
    before_id += len(dataset['documents'])
    # NOTE(review): before_page is never advanced, so page numbers are not
    # offset between files. The intended offset is not visible here — confirm
    # whether pages should continue across files.

  0%|          | 0/32 [00:00<?, ?it/s]

100%|██████████| 32/32 [00:00<00:00, 69.44it/s]
100%|██████████| 81/81 [00:00<00:00, 697.43it/s]
100%|██████████| 104/104 [00:00<00:00, 1383.62it/s]
100%|██████████| 23/23 [00:00<00:00, 1623.18it/s]
100%|██████████| 36/36 [00:00<00:00, 1631.27it/s]


In [8]:
# with open('../data/document/역도/chunk_with_overlap.json', 'w', encoding='utf-8') as f:
#     json.dump(new_dataset_list, f, ensure_ascii=False, indent=4)

# with open('../data/document/역도/table_chunk.json', 'w', encoding='utf-8') as f:
#     json.dump(new_table_dataset_list, f, ensure_ascii=False, indent=4)