In [1]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import chain
from langchain_core.prompts import load_prompt
from langchain_core.output_parsers import StrOutputParser
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain, StuffDocumentsChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

from dotenv import load_dotenv
load_dotenv()

True

### 실험설계
- heading 기반의 chunking 방법론으로 인해, chunk별 paragraph 수와 글자 수에 편차가 존재함
- 이러한 상황에서 최적의 summarization technique이 있을 수 있다고 가정
- 동일한 chunk들에 대해 여러 technique을 적용한 후 요약 결과를 평가

### 실험결과: stuff 방법이 적합
- heading 기반의 chunking 방법으로 인해, chunk 내의 paragraph들은 유사한 내용으로 구성됨
- map-reduce, map-refine은 긴 문서를 효율적으로 요약하는 기법이므로, 현재 chunk들의 크기에는 부적합
- 또한 긴 chunk에서도 좋은 성능을 보이지 못함(요약 품질 및 비용 측면)


참고자료: https://discuss.pytorch.kr/t/gn-1-llm/4609

In [None]:
# Build one LangChain Document per entry of state['documents'], pairing the
# joined heading text with the joined paragraph text.
# NOTE(review): `state` is not defined in any visible cell; it must already
# exist in the kernel before this cell runs.


def _paragraph_piece(content):
    """Return the text of one paragraph item.

    `information` is used as-is when it is a string; otherwise its 'detail'
    entry is used, with '' as the fallback when that key is absent.
    """
    information = content['information']
    if isinstance(information, str):
        return information
    return information.get('detail', '')


texts = []               # combined heading + paragraph text, one per document
none_paragraph = []      # indices of documents whose paragraph text is empty
docs_with_metadata = []  # Document objects consumed by the later cells

for i, document in enumerate(state['documents']):
    # Keep only the non-empty heading values and join them into one line.
    heading_text = ' '.join(filter(None, document['meta']['heading'].values()))

    # One text piece per content item whose category is 'paragraph'.
    paragraph_pieces = [
        _paragraph_piece(content)
        for content in document['content']
        if content['category'] == 'paragraph'
    ]
    paragraph_text = ' '.join(paragraph_pieces)
    has_paragraph = paragraph_text != ''

    if not has_paragraph:
        none_paragraph.append(i)

    combined_text = f"{heading_text}\n{paragraph_text}"
    texts.append(combined_text)

    metadata = dict(
        document_index=i,
        heading=heading_text,
        has_paragraph=has_paragraph,
        char_count=len(combined_text),
        # Number of paragraph items, regardless of whether their text is empty.
        paragraph_count=len(paragraph_pieces),
    )
    docs_with_metadata.append(Document(page_content=combined_text, metadata=metadata))

In [None]:
# Load the prompt templates used by the summarization techniques and by the
# evaluation step. The paths are relative, so they resolve against the
# notebook's current working directory.
map_prompt = load_prompt("../prompt/summary/map_20250225_01.yaml")
reduce_prompt = load_prompt("../prompt/summary/reduce_20250225_01.yaml")
refine_prompt = load_prompt("../prompt/summary/refine_20250225_01.yaml")
evaluation_prompt = load_prompt("../prompt/summary/summarization_evaluation_20250225_02.yaml")
stuff_prompt = load_prompt("../prompt/summary/stuff_20250317_01.yaml")

# Initialize the LLM shared by every summarization and evaluation call.
# count_tokens below also uses the "gpt-4" tokenizer; keep the two in sync
# if the model is changed.
llm = ChatOpenAI(temperature=0, model="gpt-4")

def count_tokens(text, model="gpt-4"):
    """Return how many tokens `text` encodes to under `model`'s tokenizer.

    Args:
        text: String to tokenize.
        model: Model name handed to ``tiktoken.encoding_for_model``. Defaults
            to "gpt-4", so existing single-argument calls behave as before.

    Returns:
        int: Length of the token sequence produced by the encoder.
    """
    # Imported inside the function, as in the original cell, so tiktoken is
    # only needed when token accounting is actually performed.
    import tiktoken

    encoder = tiktoken.encoding_for_model(model)
    return len(encoder.encode(text))

def apply_stuff(doc):
    """Summarize one document with a single "stuff" prompt.

    The whole `doc.page_content` is placed into `stuff_prompt` and sent to the
    LLM in one call.

    Args:
        doc: Object exposing `page_content` (text) and a `metadata` mapping
            that contains 'document_index'.

    Returns:
        dict: document index, technique label, summary text, and the token
        counts (total / input / output) measured with `count_tokens` on the
        rendered prompt and on the response text.
    """
    filled_prompt = stuff_prompt.format(document=doc.page_content)
    summary = llm.invoke(filled_prompt).content

    n_input = count_tokens(filled_prompt)
    n_output = count_tokens(summary)

    # Key order is kept because it determines the column order of the
    # DataFrame built from these records.
    return dict(
        document_index=doc.metadata['document_index'],
        summary_technique='stuff',
        summary_text=summary,
        token_total=n_input + n_output,
        token_input=n_input,
        token_output=n_output,
    )

In [None]:
def apply_map_reduce(doc):
    """Summarize one document with a map call followed by a reduce call.

    NOTE(review): the document is not split into sub-chunks here; the map
    prompt is rendered once over the whole `doc.page_content`, and the reduce
    prompt receives that single map output with an empty previous summary.

    Args:
        doc: Object exposing `page_content` and a `metadata` mapping that
            contains 'document_index' and 'heading'.

    Returns:
        dict: document index, technique label, final (reduce) summary, token
        counts summed over both calls, and the intermediate 'map_result'.
    """
    # Map step.
    map_input = map_prompt.format(heading=doc.metadata['heading'], documents=doc.page_content)
    map_output = llm.invoke(map_input).content

    # Reduce step over the single map output.
    reduce_input = reduce_prompt.format(previous_summary="", new_context=map_output)
    reduce_output = llm.invoke(reduce_input).content

    n_input = sum(count_tokens(text) for text in (map_input, reduce_input))
    n_output = sum(count_tokens(text) for text in (map_output, reduce_output))

    # Key order is kept because it determines the DataFrame column order.
    return dict(
        document_index=doc.metadata['document_index'],
        summary_technique='map-reduce',
        summary_text=reduce_output,
        token_total=n_input + n_output,
        token_input=n_input,
        token_output=n_output,
        map_result=map_output,
    )

In [None]:
def apply_map_refine(doc):
    """Summarize one document with a map call followed by a refine call.

    NOTE(review): as written, the map prompt runs once over the whole
    `doc.page_content` (no splitting), and the refine prompt receives that
    single map output as `summaries`.

    Args:
        doc: Object exposing `page_content` and a `metadata` mapping that
            contains 'document_index' and 'heading'.

    Returns:
        dict: document index, technique label, final (refined) summary, and
        token counts summed over both calls.
    """
    # Map step.
    map_input = map_prompt.format(heading=doc.metadata['heading'], documents=doc.page_content)
    map_output = llm.invoke(map_input).content

    # Refine step over the map output.
    refine_input = refine_prompt.format(summaries=map_output)
    refine_output = llm.invoke(refine_input).content

    n_input = sum(count_tokens(text) for text in (map_input, refine_input))
    n_output = sum(count_tokens(text) for text in (map_output, refine_output))

    # Key order is kept because it determines the DataFrame column order.
    return dict(
        document_index=doc.metadata['document_index'],
        summary_technique='map-refine',
        summary_text=refine_output,
        token_total=n_input + n_output,
        token_input=n_input,
        token_output=n_output,
    )

In [None]:
# Run all three summarization techniques on every selected document and save
# the per-technique results plus the map-step token accounting as CSV files.

# Documents left out of the experiment.
# NOTE(review): the reason these particular indices are excluded is not
# visible in this notebook; confirm and document it.
EXCLUDED_DOC_INDICES = {0, 1, 3, 7, 10}

summary_csv_path = '../data/experiment/summary/summary_df.csv'
map_csv_path = '../data/experiment/summary/map_df.csv'

# Create the output directory before any LLM call is made, so a missing
# directory cannot discard the results after the loop has finished.
os.makedirs(os.path.dirname(summary_csv_path), exist_ok=True)

summary_results = []
map_results = []

selected_docs = [
    doc for doc in docs_with_metadata
    if doc.metadata['document_index'] not in EXCLUDED_DOC_INDICES
]

for doc in tqdm(selected_docs):
    stuff_result = apply_stuff(doc)
    summary_results.append(stuff_result)

    map_reduce_result = apply_map_reduce(doc)
    summary_results.append(map_reduce_result)

    # Token accounting for the map step alone. The map prompt is rendered and
    # tokenized once here instead of being recomputed for every field.
    map_prompt_filled = map_prompt.format(heading=doc.metadata['heading'], documents=doc.page_content)
    map_token_input = count_tokens(map_prompt_filled)
    map_token_output = count_tokens(map_reduce_result['map_result'])
    map_results.append({
        'document_index': map_reduce_result['document_index'],
        'map_result': map_reduce_result['map_result'],
        'token_total': map_token_output + map_token_input,
        'token_input': map_token_input,
        'token_output': map_token_output
    })

    map_refine_result = apply_map_refine(doc)
    summary_results.append(map_refine_result)

summary_df = pd.DataFrame(summary_results)
map_df = pd.DataFrame(map_results)

summary_df.to_csv(summary_csv_path, index=False)
map_df.to_csv(map_csv_path, index=False)

In [None]:
# Attach per-document metadata (paragraph presence/count, character count) to
# every summary row, matching on document_index.
metadata_columns = ['document_index', 'has_paragraph', 'paragraph_count', 'char_count']
metadata_rows = [
    tuple(d.metadata[column] for column in metadata_columns)
    for d in docs_with_metadata
]
doc_metadata_df = pd.DataFrame(metadata_rows, columns=metadata_columns).set_index('document_index')
paragraph_analysis = summary_df.join(doc_metadata_df, on='document_index')

# Scatter of input vs. output tokens, one series per summarization technique.
# `techniques` is reused by the next plotting cell.
techniques = summary_df['summary_technique'].unique()

fig, ax = plt.subplots(figsize=(12, 8))
for technique in techniques:
    is_technique = summary_df['summary_technique'] == technique
    subset = summary_df[is_technique]
    ax.scatter(subset['token_input'], subset['token_output'], label=technique, alpha=0.7)

ax.set_xlabel('Input Tokens')
ax.set_ylabel('Output Tokens')
ax.set_title('Comparison of Token Usage by Summary Technique')
ax.legend()
ax.grid(True, alpha=0.3)
fig.savefig('../data/experiment/summary/token_comparison.png')

In [None]:
# Scatter of summary output size against the number of paragraphs in the
# source document, one series per summarization technique.
fig, ax = plt.subplots(figsize=(12, 8))
for technique in techniques:
    is_technique = paragraph_analysis['summary_technique'] == technique
    subset = paragraph_analysis[is_technique]
    ax.scatter(subset['paragraph_count'], subset['token_output'], label=technique, alpha=0.7)

ax.set_xlabel('Paragraph Count')
ax.set_ylabel('Output Tokens')
ax.set_title('Summary Output Size by Paragraph Count')
ax.legend()
ax.grid(True, alpha=0.3)
fig.savefig('../data/experiment/summary/paragraph_performance.png')

In [None]:
def evaluate_summary(original_doc, summary, format_str="1-6점"):
    """Ask the LLM to evaluate `summary` against the original document.

    Args:
        original_doc: Object exposing `page_content`, the source text.
        summary: Summary text to be evaluated.
        format_str: Value passed to the evaluation prompt's `format` field.
            The default "1-6점" is Korean for "1-6 points".

    Returns:
        The `content` of the LLM response to the rendered evaluation prompt.
    """
    prompt_fields = {
        'original_document': original_doc.page_content,
        'summary': summary,
        'format': format_str,
    }
    filled_prompt = evaluation_prompt.format(**prompt_fields)
    return llm.invoke(filled_prompt).content

# Evaluate a random sample of the generated summaries with the LLM and save
# the raw evaluation text per (document, technique) pair.

EVAL_SAMPLE_SIZE = 10
# Fixed seed so the same rows are evaluated on every Restart & Run All.
EVAL_SAMPLE_SEED = 42

# document_index comes from enumerate() when the documents are built, so it
# is unique and one dict lookup replaces a linear scan per sampled row.
docs_by_index = {d.metadata['document_index']: d for d in docs_with_metadata}

evaluation_results = []
sample_for_eval = summary_df.sample(
    n=min(EVAL_SAMPLE_SIZE, len(summary_df)),
    random_state=EVAL_SAMPLE_SEED,
)

for _, row in sample_for_eval.iterrows():
    doc_idx = row['document_index']
    # Raises KeyError if a summary row refers to a document that is not in
    # docs_with_metadata.
    original_doc = docs_by_index[doc_idx]

    eval_result = evaluate_summary(original_doc, row['summary_text'])

    evaluation_results.append({
        'document_index': doc_idx,
        'summary_technique': row['summary_technique'],
        'evaluation': eval_result
    })

eval_df = pd.DataFrame(evaluation_results)
eval_df.to_csv('../data/experiment/summary/evaluation_results.csv', index=False)