# Import Library

In [1]:
import os
import unicodedata

import torch
import pandas as pd
from tqdm import tqdm
import fitz  # PyMuPDF
import pickle

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    BitsAndBytesConfig,
    TrainingArguments,
)
from accelerate import Accelerator

from trl import SFTConfig, SFTTrainer
from datasets import load_dataset, Dataset
import pickle
import wandb

# Langchain Í¥ÄÎ†®
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate 
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser
from peft import LoraConfig, get_peft_model

  from .autonotebook import tqdm as notebook_tqdm


# CONFIG

In [10]:
class CFG:
    # Root of Files
    BASE_DIRECTORY = './'
    DACON_TRAIN_CSV = './train.csv'
    DACON_TEST_CSV = './test.csv'

    EMBEDDINGS_ABOUT_TRAINING_DATASET = 'databases_multilingual-e5-base_training.pickle'
    EMBEDDINGS_ABOUT_TEST_DATASET = 'databases_multilingual-e5-base_test.pickle'

    EMBEDDING_CSV_FULL_TRAIN = './datas/stf_e5_base_full_train.csv'
    EMBEDDING_CSV_TRAIN = './datas/stf_e5_base_train.csv'
    EMBEDDING_CSV_VAL = './datas/stf_e5_base_val.csv'
    EMBEDDING_CSV_TEST = './datas/stf_e5_base_test.csv'

    # About Embedding
    EMBEDDING_MODEL = "intfloat/multilingual-e5-base"
    CHUNK_SIZE = 800
    CHUNK_OVERLAP = 50
    
    SEARCH_TYPE = "mmr"
    SEARCH_KWARGS_K = 3
    SEARCH_KWARGS_FETCH_K = 8

    # About Finetuning
    PRETRAINING_MODEL = "rtzr/ko-gemma-2-9b-it"

    LoRA_RANK = 16
    LoRA_ALPHA = 32
    LoRA_DROPOUT = 0.05

    TRAINING_RESUTL_DIR = './finetune_models/training_result'
    PER_TRAIN_BATCH_SIZE =2
    PER_EVAL_BATCH_SIZE = 2
    NUM_TRAIN_EPOCHS = 5
    LOGGING_DIR='./finetune_models/training_logs'
    LOGGING_STEPS=1000
    SAVE_STEPS=1000
    SAVING_FINETUNING_MODEL_DIR = "./finetune_models/gemma_ko_9b_ver1.01"

    PIPELINE_TEMPERATURE=0.2
    SEQ_MAX_LENGHT=450

    # About Submission
    SAMPLE_SUBMISSION = './sample_submission.csv'
    RESULT_SUBMISSION = './gemma_9b_ver1.02_submission.csv'


# DATA PREPARING 

### Vector DB

In [4]:
def process_pdf(file_path, chunk_size=CFG.CHUNK_SIZE, chunk_overlap=CFG.CHUNK_OVERLAP):
    """PDF ÌÖçÏä§Ìä∏ Ï∂îÏ∂ú ÌõÑ chunk Îã®ÏúÑÎ°ú ÎÇòÎàÑÍ∏∞"""
    # PDF ÌååÏùº Ïó¥Í∏∞
    doc = fitz.open(file_path)
    text = ''
    # Î™®Îì† ÌéòÏù¥ÏßÄÏùò ÌÖçÏä§Ìä∏ Ï∂îÏ∂ú
    for page in doc:
        text += page.get_text()
    # ÌÖçÏä§Ìä∏Î•º chunkÎ°ú Î∂ÑÌï†
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunk_temp = splitter.split_text(text)
    # Document Í∞ùÏ≤¥ Î¶¨Ïä§Ìä∏ ÏÉùÏÑ±
    chunks = [Document(page_content=t) for t in chunk_temp]
    return chunks


def create_vector_db(chunks, model_path=CFG.EMBEDDING_MODEL):
    """FAISS DB ÏÉùÏÑ±"""
    # ÏûÑÎ≤†Îî© Î™®Îç∏ ÏÑ§Ï†ï
    model_kwargs = {'device': 'cuda'}
    encode_kwargs = {'normalize_embeddings': True}
    embeddings = HuggingFaceEmbeddings(
        model_name=model_path,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs
    )
    # FAISS DB ÏÉùÏÑ± Î∞è Î∞òÌôò
    db = FAISS.from_documents(chunks, embedding=embeddings)
    return db

def normalize_path(path):
    """Í≤ΩÎ°ú Ïú†ÎãàÏΩîÎìú Ï†ïÍ∑úÌôî"""
    return unicodedata.normalize('NFC', path)


def process_pdfs_from_dataframe(df, base_directory):
    """ÎîïÏÖîÎÑàÎ¶¨Ïóê pdfÎ™ÖÏùÑ ÌÇ§Î°úÌï¥ÏÑú DB, retriever Ï†ÄÏû•"""
    pdf_databases = {}
    unique_paths = df['Source_path'].unique()
    
    for path in tqdm(unique_paths, desc="Processing PDFs"):
        # Í≤ΩÎ°ú Ï†ïÍ∑úÌôî Î∞è Ï†àÎåÄ Í≤ΩÎ°ú ÏÉùÏÑ±
        normalized_path = normalize_path(path)
        full_path = os.path.normpath(os.path.join(base_directory, normalized_path.lstrip('./'))) if not os.path.isabs(normalized_path) else normalized_path
        
        pdf_title = os.path.splitext(os.path.basename(full_path))[0]
        print(f"Processing {pdf_title}...")
        
        # PDF Ï≤òÎ¶¨ Î∞è Î≤°ÌÑ∞ DB ÏÉùÏÑ±
        chunks = process_pdf(full_path)
        db = create_vector_db(chunks)
        
        # Retriever ÏÉùÏÑ±
        retriever = db.as_retriever(search_type=CFG.SEARCH_TYPE, 
                                    search_kwargs={'k': 3, 'fetch_k': 8})
        
        # Í≤∞Í≥º Ï†ÄÏû•
        pdf_databases[pdf_title] = {
                'db': db,
                'retriever': retriever
        }
    return pdf_databases

### TRAININGÏóê Í¥ÄÌïú EMBEDDING DB ÏÉùÏÑ±

In [4]:
base_directory = CFG.BASE_DIRECTORY # Your Base Directory
train_df = pd.read_csv(CFG.DACON_TRAIN_CSV)
train_pdf_databases = process_pdfs_from_dataframe(train_df, base_directory)
pickle_file_path = os.path.join(base_directory, CFG.EMBEDDINGS_ABOUT_TRAINING_DATASET)

with open(pickle_file_path, 'wb') as f:
    pickle.dump(train_pdf_databases, f)

Processing PDFs:   0%|          | 0/16 [00:00<?, ?it/s]

Processing 1-1 2024 Ï£ºÏöî Ïû¨Ï†ïÌÜµÍ≥Ñ 1Í∂å...


  warn_deprecated(
Processing PDFs:   6%|‚ñã         | 1/16 [00:07<01:46,  7.09s/it]

Processing 2024 ÎÇòÎùºÏÇ¥Î¶º ÏòàÏÇ∞Í∞úÏöî...


Processing PDFs:  12%|‚ñà‚ñé        | 2/16 [00:14<01:39,  7.12s/it]

Processing Ïû¨Ï†ïÌÜµÍ≥ÑÌï¥ÏÑ§...


Processing PDFs:  19%|‚ñà‚ñâ        | 3/16 [00:18<01:18,  6.04s/it]

Processing Íµ≠ÌÜ†ÍµêÌÜµÎ∂Ä_Ï†ÑÏÑ∏ÏûÑÎåÄ(ÏúµÏûê)...


Processing PDFs:  25%|‚ñà‚ñà‚ñå       | 4/16 [00:22<01:00,  5.01s/it]

Processing Í≥†Ïö©ÎÖ∏ÎèôÎ∂Ä_Ï≤≠ÎÖÑÏùºÏûêÎ¶¨Ï∞ΩÏ∂úÏßÄÏõê...


Processing PDFs:  31%|‚ñà‚ñà‚ñà‚ñè      | 5/16 [00:26<00:51,  4.64s/it]

Processing Í≥†Ïö©ÎÖ∏ÎèôÎ∂Ä_ÎÇ¥ÏùºÎ∞∞ÏõÄÏπ¥Îìú(ÏùºÎ∞ò)...


Processing PDFs:  38%|‚ñà‚ñà‚ñà‚ñä      | 6/16 [00:30<00:44,  4.40s/it]

Processing Î≥¥Í±¥Î≥µÏßÄÎ∂Ä_ÎÖ∏Ïù∏ÏùºÏûêÎ¶¨ Î∞è ÏÇ¨ÌöåÌôúÎèôÏßÄÏõê...


Processing PDFs:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 7/16 [00:34<00:38,  4.31s/it]

Processing Ï§ëÏÜåÎ≤§Ï≤òÍ∏∞ÏóÖÎ∂Ä_Ï∞ΩÏóÖÏÇ¨ÏóÖÌôîÏßÄÏõê...


Processing PDFs:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 8/16 [00:38<00:35,  4.38s/it]

Processing Î≥¥Í±¥Î≥µÏßÄÎ∂Ä_ÏÉùÍ≥ÑÍ∏âÏó¨...


Processing PDFs:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã    | 9/16 [00:42<00:29,  4.17s/it]

Processing Íµ≠ÌÜ†ÍµêÌÜµÎ∂Ä_ÏÜåÍ∑úÎ™®Ï£ºÌÉùÏ†ïÎπÑÏÇ¨ÏóÖ...


Processing PDFs:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 10/16 [00:46<00:24,  4.11s/it]

Processing Íµ≠ÌÜ†ÍµêÌÜµÎ∂Ä_ÎØºÍ∞ÑÏûÑÎåÄ(ÏúµÏûê)...


Processing PDFs:  69%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ   | 11/16 [00:50<00:19,  3.95s/it]

Processing Í≥†Ïö©ÎÖ∏ÎèôÎ∂Ä_Ï°∞Í∏∞Ïû¨Ï∑®ÏóÖÏàòÎãπ...


Processing PDFs:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 12/16 [00:53<00:15,  3.84s/it]

Processing 2024ÎÖÑÎèÑ ÏÑ±Í≥ºÍ≥ÑÌöçÏÑú(Ï¥ùÍ¥ÑÌé∏)...


Processing PDFs:  81%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 13/16 [02:02<01:10, 23.60s/it]

Processing „ÄåFIS Ïù¥Ïäà & Ìè¨Ïª§Ïä§„Äç 23-3Ìò∏ „ÄäÏ°∞ÏÑ∏ÏßÄÏ∂ú Ïó∞Í≥ÑÍ¥ÄÎ¶¨„Äã...


Processing PDFs:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 14/16 [02:14<00:39, 19.89s/it]

Processing „ÄåFIS Ïù¥Ïäà & Ìè¨Ïª§Ïä§„Äç 22-3Ìò∏ „ÄäÏû¨Ï†ïÏúµÏûêÏÇ¨ÏóÖ„Äã...


Processing PDFs:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 15/16 [02:30<00:18, 18.86s/it]

Processing ÏõîÍ∞Ñ ÎÇòÎùºÏû¨Ï†ï 2023ÎÖÑ 12ÏõîÌò∏...


Processing PDFs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 16/16 [03:33<00:00, 13.33s/it]


### TESTÏóê Í¥ÄÌïú EMBEDDING DB DB ÏÉùÏÑ±

In [5]:
base_directory = CFG.BASE_DIRECTORY # Your Base Directory
test_df = pd.read_csv(CFG.DACON_TEST_CSV)
test_pdf_databases = process_pdfs_from_dataframe(test_df, base_directory)
pickle_file_path = os.path.join(base_directory, CFG.EMBEDDINGS_ABOUT_TEST_DATASET)

with open(pickle_file_path, 'wb') as f:
    pickle.dump(test_pdf_databases, f)

  warn_deprecated(


Processing Ï§ëÏÜåÎ≤§Ï≤òÍ∏∞ÏóÖÎ∂Ä_ÌòÅÏã†Ï∞ΩÏóÖÏÇ¨ÏóÖÌôîÏûêÍ∏à(ÏúµÏûê)...


Processing PDFs:  11%|‚ñà         | 1/9 [00:04<00:35,  4.40s/it]

Processing Î≥¥Í±¥Î≥µÏßÄÎ∂Ä_Î∂ÄÎ™®Í∏âÏó¨(ÏòÅÏïÑÏàòÎãπ) ÏßÄÏõê...


Processing PDFs:  22%|‚ñà‚ñà‚ñè       | 2/9 [00:08<00:29,  4.25s/it]

Processing Î≥¥Í±¥Î≥µÏßÄÎ∂Ä_ÎÖ∏Ïù∏Ïû•Í∏∞ÏöîÏñëÎ≥¥Ìóò ÏÇ¨ÏóÖÏö¥ÏòÅ...


Processing PDFs:  33%|‚ñà‚ñà‚ñà‚ñé      | 3/9 [00:12<00:23,  3.98s/it]

Processing ÏÇ∞ÏóÖÌÜµÏÉÅÏûêÏõêÎ∂Ä_ÏóêÎÑàÏßÄÎ∞îÏö∞Ï≤ò...


Processing PDFs:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 4/9 [00:16<00:20,  4.09s/it]

Processing Íµ≠ÌÜ†ÍµêÌÜµÎ∂Ä_ÌñâÎ≥µÏ£ºÌÉùÏ∂úÏûê...


Processing PDFs:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 5/9 [00:19<00:15,  3.80s/it]

Processing „ÄåFIS Ïù¥Ïäà & Ìè¨Ïª§Ïä§„Äç 22-4Ìò∏ „ÄäÏ§ëÏïô-ÏßÄÎ∞© Í∞Ñ Ïû¨Ï†ïÏ°∞Ï†ïÏ†úÎèÑ„Äã...


Processing PDFs:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 6/9 [00:23<00:11,  3.89s/it]

Processing „ÄåFIS Ïù¥Ïäà & Ìè¨Ïª§Ïä§„Äç 23-2Ìò∏ „ÄäÌïµÏã¨Ïû¨Ï†ïÏÇ¨ÏóÖ ÏÑ±Í≥ºÍ¥ÄÎ¶¨„Äã...


Processing PDFs:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 7/9 [00:28<00:08,  4.01s/it]

Processing „ÄåFIS Ïù¥Ïäà&Ìè¨Ïª§Ïä§„Äç 22-2Ìò∏ „ÄäÏû¨Ï†ïÏÑ±Í≥ºÍ¥ÄÎ¶¨Ï†úÎèÑ„Äã...


Processing PDFs:  89%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ | 8/9 [00:31<00:03,  3.79s/it]

Processing „ÄåFIS Ïù¥Ïäà & Ìè¨Ïª§Ïä§„Äç(Ïã†Í∑ú) ÌÜµÍ∂å Ï†ú1Ìò∏ „ÄäÏö∞Î∞úÎ∂ÄÏ±Ñ„Äã...


Processing PDFs: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9/9 [00:35<00:00,  3.93s/it]


### EMBEDDING Ï†ïÎ≥¥Î•º Ï†ÄÏû•Ìïú CSV ÌååÏùºÏùò ÏÉùÏÑ±ÏùÑ ÏúÑÌïú FUNCTIONS

In [5]:
def normalize_string(s):
    """Ïú†ÎãàÏΩîÎìú Ï†ïÍ∑úÌôî"""
    return unicodedata.normalize('NFC', s)

def format_docs(docs):
    """Í≤ÄÏÉâÎêú Î¨∏ÏÑúÎì§ÏùÑ ÌïòÎÇòÏùò Î¨∏ÏûêÏó¥Î°ú Ìè¨Îß∑ÌåÖ"""
    context = ""
    for doc in docs:
        context += doc.page_content
        context += '\n'
    return context

def extract_context(df, pdf_databases, data_type='train'):
    # Í≤∞Í≥ºÎ•º Ï†ÄÏû•Ìï† Î¶¨Ïä§Ìä∏ Ï¥àÍ∏∞Ìôî
    results = []

    # Î∞∞Ïπò ÏÇ¨Ïù¥Ï¶à ÏÑ§Ï†ï
    batch_size = 1  # ÏõêÌïòÎäî Î∞∞Ïπò ÌÅ¨Í∏∞Î°ú ÏÑ§Ï†ï

    # DataFrameÏùò Í∞Å ÌñâÏóê ÎåÄÌï¥ Ï≤òÎ¶¨
    for start in tqdm(range(0, len(df), batch_size), desc="Creating Q&A including RAG info"):
        # ÌòÑÏû¨ Î∞∞Ïπò ÏÑ†ÌÉù
        batch_rows = df.iloc[start:start + batch_size]

        # Î∞∞Ïπò ÎÇ¥Ïùò Í∞Å Ìñâ Ï≤òÎ¶¨
        for _, row in batch_rows.iterrows():
            # ÏÜåÏä§ Î¨∏ÏûêÏó¥ Ï†ïÍ∑úÌôî
            source = normalize_string(row['Source'])
            question = normalize_string(row['Question'])
            if data_type == 'train':
                answer = normalize_string(row['Answer'])

            # Ï†ïÍ∑úÌôîÎêú ÌÇ§Î°ú Îç∞Ïù¥ÌÑ∞Î≤†Ïù¥Ïä§ Í≤ÄÏÉâ
            normalized_keys = {normalize_string(k): v for k, v in pdf_databases.items()}
            retriever = normalized_keys[source]['retriever']

            context = format_docs(retriever.invoke(question))

            if data_type == 'train':
                results.append({
                    "Context" : context,
                    "Question": question,
                    "Answer"  : answer,
                })
            else:
                results.append({
                    "Context" : context,
                    "Question": question,
                })

    return results


### TRAININGÏóê Í¥ÄÌïú CSV ÌååÏùº ÏÉùÏÑ±

In [4]:
train_df = pd.read_csv(CFG.DACON_TRAIN_CSV)
with open(CFG.EMBEDDINGS_ABOUT_TRAINING_DATASET, 'rb') as f:
    train_pdf_databases = pickle.load(f)

  return torch.load(io.BytesIO(b))


In [5]:
train_context = extract_context(train_df, train_pdf_databases, data_type='train')

stf_full_train_df = pd.DataFrame(train_context)
stf_train_df = pd.DataFrame(train_context[:int(len(train_context)*0.8+0.5)])
stf_eval_df = pd.DataFrame(train_context[int(len(train_context)*0.2+0.5):])

stf_full_train_df.to_csv(CFG.EMBEDDING_CSV_FULL_TRAIN, index=False, encoding="UTF-8")
stf_train_df.to_csv(CFG.EMBEDDING_CSV_TRAIN, index=False, encoding="UTF-8")
stf_eval_df.to_csv(CFG.EMBEDDING_CSV_VAL, index=False, encoding="UTF-8")

Creating Q&A including RAG info: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 496/496 [00:17<00:00, 28.47it/s]


### TESTÏóê Í¥ÄÌïú CSV ÌååÏùº ÏÉùÏÑ±

In [3]:
test_df = pd.read_csv(CFG.DACON_TEST_CSV)
with open(CFG.EMBEDDINGS_ABOUT_TEST_DATASET, 'rb') as f:
    test_pdf_databases = pickle.load(f)

  return torch.load(io.BytesIO(b))


In [6]:
test_context = extract_context(test_df, test_pdf_databases, data_type='test')

stf_test_df = pd.DataFrame(test_context)

stf_test_df.to_csv(CFG.EMBEDDING_CSV_TEST, index=False, encoding="UTF-8")

Creating Q&A including RAG info: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 98/98 [00:03<00:00, 28.96it/s]


# MODELING

### Loading default Pre-trained Model

In [3]:
# 4ÎπÑÌä∏ ÏñëÏûêÌôî ÏÑ§Ï†ï
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Î™®Îç∏ ID 
model_id = CFG.PRETRAINING_MODEL

# ÌÜ†ÌÅ¨ÎÇòÏù¥Ï†Ä Î°úÎìú Î∞è ÏÑ§Ï†ï
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.use_default_system_prompt = False

# Î™®Îç∏ Î°úÎìú Î∞è ÏñëÏûêÌôî ÏÑ§Ï†ï Ï†ÅÏö©
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)

Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:13<00:00,  1.35s/it]


### Show how the model look like

In [4]:
for name, param in model.named_parameters():
    print(name, param.requires_grad)

model.embed_tokens.weight True
model.layers.0.self_attn.q_proj.weight False
model.layers.0.self_attn.k_proj.weight False
model.layers.0.self_attn.v_proj.weight False
model.layers.0.self_attn.o_proj.weight False
model.layers.0.mlp.gate_proj.weight False
model.layers.0.mlp.up_proj.weight False
model.layers.0.mlp.down_proj.weight False
model.layers.0.input_layernorm.weight True
model.layers.0.post_attention_layernorm.weight True
model.layers.0.pre_feedforward_layernorm.weight True
model.layers.0.post_feedforward_layernorm.weight True
model.layers.1.self_attn.q_proj.weight False
model.layers.1.self_attn.k_proj.weight False
model.layers.1.self_attn.v_proj.weight False
model.layers.1.self_attn.o_proj.weight False
model.layers.1.mlp.gate_proj.weight False
model.layers.1.mlp.up_proj.weight False
model.layers.1.mlp.down_proj.weight False
model.layers.1.input_layernorm.weight True
model.layers.1.post_attention_layernorm.weight True
model.layers.1.pre_feedforward_layernorm.weight True
model.layer

### Preparing Parammeters for Finetuning

In [7]:
# Load LoRA configuration
peft_config = LoraConfig(
    r=CFG.LoRA_RANK,
    lora_alpha=CFG.LoRA_ALPHA,
    lora_dropout=CFG.LoRA_DROPOUT,
    bias="none",
    target_modules=[
    "model.embed_tokens", # able
    #"model.layers.0.input_layernorm", # unable
    #"model.layers.0.post_attention_layernorm", # unable
    #"model.layers.0.pre_feedforward_layernorm", # unable
    #"model.layers.0.post_feedforward_layernorm", # unable
    #"model.norm" # unable
    ],
    task_type="CAUSAL_LM",
)        

training_args = TrainingArguments(
    output_dir=CFG.TRAINING_RESUTL_DIR,
    per_device_train_batch_size=CFG.PER_TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=CFG.PER_EVAL_BATCH_SIZE,
    num_train_epochs=CFG.NUM_TRAIN_EPOCHS,
    logging_dir=CFG.LOGGING_DIR,
    logging_steps=CFG.LOGGING_STEPS,
    save_steps=CFG.SAVE_STEPS,
    evaluation_strategy="steps",
)



### Finetuning and Saving Finetuned Model

In [8]:
def Finetune_llm_with_SFT_Trainer():

    def normalize_string(s):
        """Ïú†ÎãàÏΩîÎìú Ï†ïÍ∑úÌôî"""
        return unicodedata.normalize('NFC', s)
    
    # load dataset
    # train_dataset = load_dataset('csv', data_files=CFG.EMBEDDING_CSV_TRAIN)['train']
    # eval_dataset = load_dataset('csv', data_files=CFG.EMBEDDING_CSV_VAL)['train']  
    train_dataset = load_dataset('csv', data_files=CFG.EMBEDDING_CSV_FULL_TRAIN)['train']
    
    def formatting_prompts_func(example):
        output_texts = []
        for i in range(len(example)):
            text =  """Îã§Ïùå Ï†ïÎ≥¥Î•º Î∞îÌÉïÏúºÎ°ú ÏßàÎ¨∏Ïóê ÎãµÌïòÏÑ∏Ïöî. ÎãµÎ≥ÄÏùÄ Íº≠ Î¨∏Ïû•ÏúºÎ°ú ÌïòÏÑ∏Ïöî. Ï£ºÏñ¥Î•º Íº≠ Ï†ÅÏúºÏÑ∏Ïöî. :
# {example[Context]}
# 
# ÏßàÎ¨∏: {example[Question]}
# 
# ÎãµÎ≥Ä: {example[Answer]}
# """
            text = normalize_string(text)
            output_texts.append(text)
        return output_texts
    
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        peft_config = peft_config,
        formatting_func=formatting_prompts_func,
        train_dataset = train_dataset,
        # eval_dataset = eval_dataset,   
    )

    # Train model
    trainer.train()
    
    # Save trained model
    trainer.model.save_pretrained(CFG.SAVING_FINETUNING_MODEL_DIR)
    tokenizer.save_pretrained(CFG.SAVING_FINETUNING_MODEL_DIR)

In [9]:
Finetune_llm_with_SFT_Trainer()

Generating train split: 496 examples [00:00, 9350.04 examples/s]
Map:   0%|          | 0/496 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 496/496 [00:00<00:00, 24777.28 examples/s]
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtjwjddn15584[0m ([33mtjwjddn980117[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/10 [00:00<?, ?it/s]It is strongly recommended to train Gemma2 models with the `eager` attention implementation instead of `sdpa`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.
  attn_output = torch.nn.functional.scaled_dot_product_attention(
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:07<00:00,  1.43it/s]


{'train_runtime': 10.0235, 'train_samples_per_second': 1.496, 'train_steps_per_second': 0.998, 'train_loss': 2.4764156341552734, 'epoch': 5.0}


### Loading Finetuned Model (for Quantization) and Make Pipeline

In [5]:
def setup_llm_SFTTrainer_with_finetuning():
    # 4ÎπÑÌä∏ ÏñëÏûêÌôî ÏÑ§Ï†ï
    bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
    )
    
    # loading model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(CFG.SAVING_FINETUNING_MODEL_DIR, 
                                                 quantization_config=bnb_config,
                                                 trust_remote_code=True,
                                                 device_map="auto",)
    tokenizer = AutoTokenizer.from_pretrained(CFG.SAVING_FINETUNING_MODEL_DIR)

    text_generation_pipeline = pipeline(
        model=model,
        tokenizer=tokenizer,
        task="text-generation",
        temperature=CFG.PIPELINE_TEMPERATURE,
        return_full_text=False,
        max_new_tokens=CFG.SEQ_MAX_LENGHT, 
    )

    hf = HuggingFacePipeline(pipeline=text_generation_pipeline)
    return hf

In [6]:
llm = setup_llm_SFTTrainer_with_finetuning()

Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:07<00:00,  1.26it/s]
  warn_deprecated(


# EVALUATION

### Utils for Evaluation

In [7]:
def normalize_string(s):
    """Ïú†ÎãàÏΩîÎìú Ï†ïÍ∑úÌôî"""
    return unicodedata.normalize('NFC', s)

def format_docs(docs):
    """Í≤ÄÏÉâÎêú Î¨∏ÏÑúÎì§ÏùÑ ÌïòÎÇòÏùò Î¨∏ÏûêÏó¥Î°ú Ìè¨Îß∑ÌåÖ"""
    context = ""
    for doc in docs:
        context += doc.page_content
        context += '\n'
    return context

### evaluation

In [8]:
# Í≤∞Í≥ºÎ•º Ï†ÄÏû•Ìï† Î¶¨Ïä§Ìä∏ Ï¥àÍ∏∞Ìôî
results = []

# CSV ÌååÏùº ÏùΩÍ∏∞
df = pd.read_csv(CFG.EMBEDDING_CSV_TEST)

# DataFrameÏùò Í∞Å ÌñâÏóê ÎåÄÌï¥ Ï≤òÎ¶¨
for _, row in tqdm(df.iterrows(), total=len(df), desc="Answering Questions"):
    # ÏÜåÏä§ Î¨∏ÏûêÏó¥ Ï†ïÍ∑úÌôî
    context = normalize_string(row['Context'])
    question = normalize_string(row['Question'])

    # RAG Ï≤¥Ïù∏ Íµ¨ÏÑ±
    template = """Îã§Ïùå Ï†ïÎ≥¥Î•º Î∞îÌÉïÏúºÎ°ú ÏßàÎ¨∏Ïóê ÎãµÌïòÏÑ∏Ïöî. ÎãµÎ≥ÄÏùÄ Íº≠ Î¨∏Ïû•ÏúºÎ°ú ÌïòÏÑ∏Ïöî. Ï£ºÏñ¥Î•º Íº≠ Ï†ÅÏúºÏÑ∏Ïöî. :
    {context}

    ÏßàÎ¨∏: {question}

    ÎãµÎ≥Ä:
    """
    prompt = PromptTemplate.from_template(template)

    # RAG Ï≤¥Ïù∏ Ï†ïÏùò
    rag_chain = ( 
        prompt
        | llm
        | StrOutputParser()
    )

    # ÎãµÎ≥Ä Ï∂îÎ°†
    # print(f"Question: {question}")
    full_response = rag_chain.invoke({"context":context, "question":question})

    # print(f"Answer: {full_response}\n")
    
    # Í≤∞Í≥º Ï†ÄÏû•
    results.append({
        'Question': question,
        'Context': context,
        'Answer': full_response
    })

  attn_output = torch.nn.functional.scaled_dot_product_attention(
Answering Questions:  10%|‚ñà         | 10/98 [01:14<12:38,  8.62s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Answering Questions: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 98/98 [13:51<00:00,  8.48s/it]


# SUBMISSIOIN

In [11]:
# Ï†úÏ∂úÏö© ÏÉòÌîå ÌååÏùº Î°úÎìú
submit_df = pd.read_csv(CFG.SAMPLE_SUBMISSION)

# ÏÉùÏÑ±Îêú ÎãµÎ≥ÄÏùÑ Ï†úÏ∂ú DataFrameÏóê Ï∂îÍ∞Ä
submit_df['Answer'] = [item['Answer'] for item in results]
submit_df['Answer'] = submit_df['Answer'].fillna("Îç∞Ïù¥ÏΩò")     # Î™®Îç∏ÏóêÏÑú Îπà Í∞í (NaN) ÏÉùÏÑ± Ïãú Ï±ÑÏ†êÏóê Ïò§Î•òÍ∞Ä ÎÇ† Ïàò ÏûàÏùå [ Ï£ºÏùò ]

# Í≤∞Í≥ºÎ•º CSV ÌååÏùºÎ°ú Ï†ÄÏû•
submit_df.to_csv(CFG.RESULT_SUBMISSION, encoding='UTF-8-sig', index=False)