# 긴 문서 요약 방법 실습
- 문장이 너무 요약되면 기존의 의미전달이 안되므로 적절한 방벙을 제시함
- 어떠한 사이즈의 블럭단위로 자를 수 있는지 확인 가능
    - 블럭단위로 문장을 요약한 후 블럭을 합치는 작업 필요

In [1]:
!pip install openai


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
!pip install tiktoken # 토큰수 확인


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
import os
from typing import List, Tuple, Optional
from openai import OpenAI
import tiktoken
from tqdm import tqdm

In [None]:
# from google.colab import driver
# drive.mount('/content/drive')

In [17]:
# with open("content/drive/MyDrive/gdrive/data/artificial_intelligence_wikipedia.txt", "r") as file:
with open("./artificial_intelligence_wikipedia.txt", "r") as file:
    artificial_intelligence_wikipedia_text = file.read()
# print(artificial_inteliigence_wikipedia_text)

In [18]:
encoding = tiktoken.encoding_for_model('gpt-4-turbo')
len(encoding.encode(artificial_intelligence_wikipedia_text))

14630

In [8]:
from dotenv import load_dotenv
import os

load_dotenv()

True

In [21]:
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))


def get_chat_completion(messages, model='gpt-4-turbo'):
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0,
    )
    return response.choices[0].message.content

In [22]:
def tokenize(text: str) -> List[str]:
    encoding = tiktoken.encoding_for_model('gpt-4-turbo')
    return encoding.encode(text)


def chunk_on_delimiter(input_string: str,
                       max_tokens: int, delimiter: str) -> List[str]:
    chunks = input_string.split(delimiter)
    combined_chunks, _, dropped_chunk_count = combine_chunks_with_no_minimum(
        chunks, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True
    )
    if dropped_chunk_count > 0:
        print(f"warning: {dropped_chunk_count} chunks were dropped due to overflow")

    combined_chunks = [f"{chunk}{delimiter}" for chunk in combined_chunks]
    return combined_chunks

# 주어진 청크리스트를 받아서 각 조각을 합침 & 블럭 조합
def combine_chunks_with_no_minimum(
        chunks: List[str],
        max_tokens: int,
        chunk_delimiter="\n\n",
        header: Optional[str] = None,
        add_ellipsis_for_overflow=False,
) -> Tuple[List[str], List[int]]:
    dropped_chunk_count = 0
    output = []
    output_indices = [] # 인덱스 저장
    candidate = (
        [] if header is None else [header]
    )
    candidate_indices = []
    for chunk_i, chunk in enumerate(chunks):
        chunk_with_header = [chunk] if header is None else [header, chunk]
        if len(tokenize(chunk_delimiter.join(chunk_with_header))) > max_tokens:
            print(f"warning: chunk oveflow")
            if (
                add_ellipsis_for_overflow
                and len(tokenize(chunk_delimiter.join(candidate + ["..."]))) <= max_tokens
            ):
                candidate.append("...")
                dropped_chunk_count += 1
            continue
        extended_candidate_token_count = len(tokenize(chunk_delimiter.join(candidate + [chunk])))

        if extended_candidate_token_count > max_tokens:
            output.append(chunk_delimiter.join(candidate))
            output_indices.append(candidate_indices)
            candidate = chunk_with_header
            candidate_indices = [chunk_i]
        else:
            candidate.append(chunk)
            candidate_indices.append(chunk_i)

    if (header is not None and len(candidate) > 1) or (header is None and len(candidate) > 0):
        output.append(chunk_delimiter.join(candidate))
        output_indices.append(candidate_indices)
    return output, output_indices, dropped_chunk_count


In [23]:
def summarize(
        text: str,
        detail: float = 0,
        model: str = 'gpt-4-turbo',
        additional_instructions: Optional[str] = None,
        minimum_chunk_size: Optional[int] = 500,
        chunk_delimiter: str = ".",
        summarize_recursively=False,
        verbose=False
        ):
    assert 0 <= detail <= 1

    max_chunks = len(chunk_on_delimiter(text, minimum_chunk_size, chunk_delimiter))
    min_chunks = 1
    num_chunks = int(min_chunks + detail * (max_chunks - min_chunks))

    # 청크 크기 
    document_length = len(tokenize(text))
    chunk_size = max(minimum_chunk_size, document_length // num_chunks)
    text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter)
    if verbose:
        print(f"Splitting the text into {len(text_chunks)} chunks to be summarized.")
        print(f"Chunk lengths are {[len(tokenize(x)) for x in text_chunks]}")

    # set system message
    system_message_content = "Rewrite this text in summarized form."
    if additional_instructions is not None:
        system_message_content += f"\n\n{additional_instructions}"

    accumulated_summaries = []
    for chunk in tqdm(text_chunks):
        if summarize_recursively and accumulated_summaries:
            # Creating a structured prompt for recursive summarization
            accumulated_summaries_string = '\n\n'.join(accumulated_summaries)
            user_message_content = f"Previous summaries:\n\n{accumulated_summaries_string}\n\nText to summarize next:\n\n{chunk}"
        else:
            # Directly passing the chunk for summarization without recursive context
            user_message_content = chunk

        # Constructing messages based on whether recursive summarization is applied
        messages = [
            {"role": "system", "content": system_message_content},
            {"role": "user", "content": user_message_content}
        ]

        # Assuming this function gets the completion and works as expected
        response = get_chat_completion(messages, model=model)
        accumulated_summaries.append(response)

    # Compile final summary from partial summaries
    final_summary = '\n\n'.join(accumulated_summaries)

    return final_summary


detail을 0에서 1로 증가시키면 기본 문서에 대한 요약이 점차길어진다.
- detail 매개변수의 값이 높을수록 문서를 더 많은 수의 청크로 분할하기 때문에 더 자세한 요약이 생성된다.

In [24]:
summary_with_detail_0 = summarize(artificial_intelligence_wikipedia_text, detail=0, verbose=True)

Splitting the text into 1 chunks to be summarized.
Chunk lengths are [14631]


100%|██████████| 1/1 [00:14<00:00, 14.11s/it]


In [25]:
summary_with_detail_pt25 = summarize(artificial_intelligence_wikipedia_text, detail=0.25, verbose=True)

Splitting the text into 9 chunks to be summarized.
Chunk lengths are [1817, 1807, 1823, 1810, 1806, 1827, 1814, 1829, 103]


100%|██████████| 9/9 [01:33<00:00, 10.35s/it]


In [26]:
summary_with_detail_pt5 = summarize(artificial_intelligence_wikipedia_text, detail=0.5, verbose=True)

Splitting the text into 17 chunks to be summarized.
Chunk lengths are [897, 890, 914, 876, 893, 906, 893, 902, 909, 907, 905, 889, 902, 890, 901, 880, 287]


100%|██████████| 17/17 [02:56<00:00, 10.39s/it]


In [27]:
summary_with_detail_1 = summarize(artificial_intelligence_wikipedia_text, detail=1, verbose=True)

Splitting the text into 31 chunks to be summarized.
Chunk lengths are [492, 427, 485, 490, 496, 478, 473, 497, 496, 501, 499, 497, 493, 470, 472, 494, 489, 492, 481, 485, 471, 500, 486, 498, 478, 469, 498, 468, 493, 478, 103]


100%|██████████| 31/31 [04:38<00:00,  8.99s/it]


In [28]:
[len(tokenize(x)) for x in
 [summary_with_detail_0, summary_with_detail_pt25, summary_with_detail_pt5, summary_with_detail_1]
 ]

[286, 2538, 4531, 6748]

In [None]:
print(summary_with_detail_0)

In [None]:
# 개괄식으로 내용 요약
summary_with_additional_instructions = summarize(artificial_intelligence_wikipedia_text, detail=0.1,
                                                 additional_instructions="Write in point form and focus on numerical data.")
print(summary_with_additional_instructions)

In [None]:
# 재귀적으로 요약 -> 내용은 정확해짐 단 자원소모가 많음
recursive_summary = summarize(artificial_intelligence_wikipedia_text, detail=0.1, summarize_recursively=True)
print(recursive_summary)