<a href="https://colab.research.google.com/github/dodamm111/PDF_GQ/blob/main/NLP_processing_try_Bllossom.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import json
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# 1. Bllossom-3B 모델과 토크나이저 로드
model_id = "Bllossom/llama-3.2-Korean-Bllossom-3B"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# 2. result.json 파일에서 모든 영역의 데이터를 추출 (평문, 도표, 표 등 모두 포함)
with open("result.json", "r", encoding="utf-8") as f:
    ocr_results = json.load(f)

all_texts = [entry["content"].strip() for entry in ocr_results]
full_text = "\n".join(all_texts)

# 3. 전체 텍스트를 문장 단위로 분할 (한국어 문장 종결어미를 기준)
sentences = re.split(r'(?<=[.?!])\s+', full_text)
sentences = [s.strip() for s in sentences if s.strip()]

# 4. 문장 단위로 청크 구성: 각 청크가 최대 max_tokens_per_chunk 이하의 토큰을 가지도록 결합
max_tokens_per_chunk = 512  # 적절히 조정 가능
chunks = []
current_chunk = ""
current_tokens = 0

for sentence in sentences:
    sentence_tokens = len(tokenizer.tokenize(sentence))
    # 청크에 문장을 추가했을 때 최대 토큰 수 초과하면 현재 청크를 저장 후 새 청크 시작
    if current_tokens + sentence_tokens > max_tokens_per_chunk:
        if current_chunk:
            chunks.append(current_chunk.strip())
        current_chunk = sentence + " "
        current_tokens = sentence_tokens
    else:
        current_chunk += sentence + " "
        current_tokens += sentence_tokens

if current_chunk:
    chunks.append(current_chunk.strip())

# 5. 각 청크에 대해 Bllossom-3B를 활용하여 요약 생성
summaries = []
for chunk in chunks:
    prompt = "다음 텍스트를 요약해줘:\n" + chunk + "\n요약:"
    inputs = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True).to(model.device)

    outputs = model.generate(
        inputs["input_ids"],
        max_new_tokens=200,
        do_sample=False,
        temperature=0.7,
        top_p=0.9,
        eos_token_id=tokenizer.eos_token_id
    )
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # 프롬프트 이후의 요약된 부분 추출 (없으면 전체 텍스트 사용)
    if "요약:" in generated_text:
        summary = generated_text.split("요약:")[-1].strip()
    else:
        summary = generated_text.strip()
    summaries.append(summary)

# 6. 청크별 요약 결과를 하나의 요약 텍스트로 결합 후, 문장 단위로 분할
combined_summary = " ".join(summaries)
final_sentences = re.split(r'(?<=[.?!])\s+', combined_summary)
final_sentences = [s.strip() for s in final_sentences if s.strip()]

# 7. 각 문장을 {context: "문장", answer: ""} 형태의 JSON 객체로 변환
output_data = [{"context": sentence, "answer": ""} for sentence in final_sentences]

# 8. 결과를 output_bllossom_chunks.json 파일에 저장
with open("output_bllossom_chunks.json", "w", encoding="utf-8") as f:
    json.dump(output_data, f, ensure_ascii=False, indent=2)

print("변환 완료! output_bllossom_chunks.json 파일을 확인하세요.")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


변환 완료! output_bllossom_chunks.json 파일을 확인하세요.
