In [None]:
!pip install accelerate
!pip install -i https://pypi.org/simple/ bitsandbytes
!pip install transformers[torch] -U

!pip install datasets
!pip install langchain
!pip install langchain_community
!pip install PyMuPDF
!pip install sentence-transformers
!pip install faiss-cpu
!pip install chromadb
!pip install transformers

In [None]:
!pip install transformers datasets bitsandbytes peft trl accelerate --upgrade -qqq
​

In [None]:
import os
import unicodedata
import warnings

import torch
import pandas as pd
from tqdm import tqdm  # progress bars
import fitz  # PyMuPDF

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    BitsAndBytesConfig,
    Gemma2ForCausalLM,
    TrainingArguments,
)
from accelerate import Accelerator
from peft import LoraConfig
from trl import SFTTrainer

# LangChain
from langchain_community.document_loaders import PyMuPDFLoader, TextLoader
from langchain_community.chat_models import ChatOllama
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS, Chroma
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from langchain_community.document_loaders import TextLoader
from langchain_core.prompts import ChatPromptTemplate

warnings.filterwarnings("ignore") # suppress all warning messages for the rest of the notebook

In [None]:
#네이버 뉴스 요약 데이터셋 로드
# dataset = load_dataset("/재난대비 강령")

def load_files_from_directory(directory_path):
    """Load every PDF and TXT file found directly inside a directory.

    Files ending in ``.pdf`` are read with ``PyMuPDFLoader`` and files ending
    in ``.txt`` with ``TextLoader``; any other entry is ignored. The scan is
    not recursive.

    Args:
        directory_path: Path of the directory to scan.

    Returns:
        A list containing everything the loaders returned, in file-name
        order. Empty when the directory holds no matching files.
    """
    data = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # result reproducible across runs and machines.
    for filename in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, filename)
        if filename.endswith('.pdf'):
            loader = PyMuPDFLoader(file_path)
        elif filename.endswith('.txt'):
            loader = TextLoader(file_path)
        else:
            continue  # unsupported file type
        data.extend(loader.load())
    # Return the accumulated list (the original returned the undefined name `dataset`).
    return data

## 양자화 모델: QLoRA, granada

In [None]:
# Hugging Face Hub id of the checkpoint used for both the model and the tokenizer.
BASE_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct"

# 4-bit (NF4, double-quantized) loading configuration; this is the "Q" in QLoRA.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# LoRA configuration: adapters are attached to the quantized model so that
# only the adapter parameters are trained.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    bias="none",
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    lora_alpha=8,
    r=8,
)

# Load the base model with the quantization config applied.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    quantization_config=bnb_config,
)

In [None]:
# Tokenizer belonging to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL,
    add_special_tokens=True,
)
# Pad on the right and reuse the EOS token as the padding token.
tokenizer.padding_side = 'right'
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Formats training examples into Llama-3 chat-style prompts.
def generate_prompts(example):
    """Build one training prompt per (document, summary) pair.

    Each prompt is a single user turn asking for a summary of the document,
    followed by an assistant turn containing the reference summary. Every
    header token is followed by a blank line, matching the Llama-3 chat
    format that ``apply_chat_template`` produces at inference time, so the
    model is trained on the same layout it is later prompted with.

    Args:
        example: Mapping with parallel sequences under the keys
            ``'document'`` and ``'summary'``.

    Returns:
        A list of prompt strings, one per entry in ``example['document']``.

    Raises:
        IndexError: If ``example['summary']`` is shorter than
            ``example['document']``.
    """
    documents = example['document']
    summaries = example['summary']
    return [
        "<|begin_of_text|>"
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"다음 글을 요약해주세요:\n{document}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
        f"{summaries[i]}<|eot_id|>"
        for i, document in enumerate(documents)
    ]

In [None]:
# Supervised fine-tuning of the quantized model with LoRA adapters.
# NOTE(review): `train_data` is not defined in any visible cell. It must be
# created before this cell runs and expose 'document' and 'summary' columns,
# which generate_prompts reads.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    max_seq_length=512,
    args=TrainingArguments(
        output_dir="outputs",
        num_train_epochs=1,
        # max_steps=300,
        per_device_train_batch_size=24,  # 24 samples per GPU per step
        gradient_accumulation_steps=4,   # apply gradients every 4 steps
        optim="paged_adamw_8bit",
        # 0.03 is a fraction of total training steps, so it belongs in
        # warmup_ratio; warmup_steps expects a step count.
        warmup_ratio=0.03,
        learning_rate=2e-4,
        # NOTE(review): bnb_config computes in bfloat16 while training uses
        # fp16; consider bf16=True on hardware that supports it.
        fp16=True,
        logging_steps=100,
        push_to_hub=False,
        report_to='none',
    ),
    peft_config=lora_config,            # LoRA settings
    formatting_func=generate_prompts,   # prompt template function (was the undefined name `prompts`)
)

trainer.train()

In [None]:
# Article text to summarize. Deliberately a plain (non-f) string so that any
# braces in pasted text are kept literally instead of being evaluated.
document = """
기사내용 복붙
"""

# Text-generation pipeline around the fine-tuned model and its tokenizer.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
)

messages = [
    {"role": "user", "content": f"""다음 글을 요약해주세요. 약 6문단 이상으로요. :\n{document}"""},
]

# Render the conversation into the model's chat format as plain text, ending
# with the assistant header so generation starts the assistant's reply.
prompt = pipe.tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Stop on either the tokenizer's EOS token or Llama-3's end-of-turn token;
# without explicit terminators the model keeps generating repeated tokens.
terminators = [
    pipe.tokenizer.eos_token_id,
    pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

outputs = pipe(
    prompt,
    do_sample=True,
    temperature=0.4,
    top_k=50,
    top_p=0.95,
    # The templated prompt already contains the special tokens; adding them
    # again during tokenization would insert a second <|begin_of_text|>.
    add_special_tokens=False,
    eos_token_id=terminators,
)

# The pipeline returns prompt + completion; print only the completion.
print(outputs[0]['generated_text'][len(prompt):])