In [None]:
!nvidia-smi

Thu Oct 31 04:07:15 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0              48W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [1]:
path = '/content/drive/MyDrive/DACON/Finance/reprocessed/'
# path = '/content/drive/MyDrive/kdt-EST-AI/project/dacon_fis/src/'
base_directory = path # Your Base Directory

In [2]:
# 이름 정의 - sweep에선 사용되지 않음

model_name="gemma2"
embedding_model="large"
aug_name="NoAug"#"AugGPT" #NoAug, AugAEDA
chunk_size=256
table_process="tab_v1.7"
finetune_mode="dora" #lora, dora
MAX_LEN = 4096

dataset_name = f"kdt3/DACON-QA-{embedding_model}-ensemble-{table_process}-{aug_name}-{chunk_size}"
# train_name = f"kdt3/DACON-QA-{model_name}-{embedding_model}-ensemble-{table_process}-{aug_name}-{chunk_size}-dora"
train_name = "kdt3/1101_1_lr5e-5"
# fname = f"{model_name}_{embedding_model}_ensemble_{chunk_size}_{aug_name}_5epoch_dora.csv"
fname = "1101_1_lr5e-5.csv"

#wandb 관련 변수
import os


os.environ["WANDB_ENTITY"]='DACON-FinAI'
os.environ["WANDB_PROJECT"]="DACON_FinAI"
os.environ["WANDB_LOG_MODEL"] = "end"
wandb_run_name=f"{aug_name}-{model_name}-{finetune_mode}-{embedding_model}-seq2seq"

# 설명

## Question - Answering with Retrieval

본 대회의 과제는 중앙정부 재정 정보에 대한 **검색 기능**을 개선하고 활용도를 높이는 질의응답 알고리즘을 개발하는 것입니다. <br>이를 통해 방대한 재정 데이터를 일반 국민과 전문가 모두가 쉽게 접근하고 활용할 수 있도록 하는 것이 목표입니다. <br><br>
베이스라인에서는 평가 데이터셋만을 활용하여 source pdf 마다 Vector DB를 구축한 뒤 langchain 라이브러리와 llama-2-ko-7b 모델을 사용하여 RAG 프로세스를 통해 추론하는 과정을 담고 있습니다. <br>( train_set을 활용한 훈련 과정은 포함하지 않으며, test_set  에 대한 추론만 진행합니다. )

## Mount/Login

구글 드라이브를 마운트하고 허깅페이스에 로그인
- 이때 허깅페이스 토큰은 kdt3 그룹에 대해 읽기/쓰기 권한이 있는 토큰이어야 함

## Download Library
필요/사용 라이브러리 다운로드
이때 버전 문제로 설치를 한 뒤 세션을 한번 재시작해줘야 합니다
<br>(그리고 세션 완전히 끊기면 다운로드 후 재시작을 다시 해줘야...)

## Import Library
한번 재시작했으면 위 과정 없이 Import만 실행해주면 됩니다

## Vector DB
문서를 여러 조각(chunk)로 나누고, 임베딩 유사도를 통해 관련 조각을 찾을 수 있게 DB화하는 함수들이 정의되어 있습니다.

## DB 생성
Vector DB에서 정의된 함수들로 문서 DB를 만들어줍니다.<br><br>
이때 Train과 Test를 한번에 하려고 하면 코랩이 터질 확률이 높으므로 Train하고 Create Dataset까지 실행해 업로드 한 뒤 재시작해서 램을 비우고 Test를 하는 것이 좋습니다.<br> 또한 문서 임베딩을 어떤 모델로 할지 인자로 넘겨줄 수 있습니다

## Create Dataset
DB 생성에서 만든 db와 데이터 dataframe을 사용해 HuggingFace 데이터셋 생성 후 업로드

## Fine-Tuning
학습 데이터셋으로 모델에 대한 파인튜닝 진행 후 Huggingface에 업로드<br>
4비트 양자화 LoRA로 파인튜닝<br>
기반 모델 또는 넣어줄때 사용할 프롬프트, 학습 관련 하이퍼파라미터 수정 가능

## Langchain 을 이용한 추론
모델을 사용한 추론


## 실행
### 기본
Mount/Login -> Download Library -> 재시작 (처음 1번)
Mount/Login -> Import Library (이후)

### 데이터셋 만들기
기본 -> Vector DB -> DB 생성 -> Create Dataset에서 첫 셀 + Train/Valid/Test 중 해당하는 셀

### 모델 학습하기
기본 -> Fine-Tuning(업로드할 위치, 데이터셋 위치, 모델 링크 확인 필수)

### 학습된 모델로 추론하기
기본 -> Langchain을 이용한 추론(모델 링크, 데이터셋 위치 확인) -> Submission(저장할 파일명 확인)

# Mount/Login

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os

token_path = os.path.join(base_directory,'data','token')
with open(token_path,'r') as f:
    hf_token = f.readline().strip('\n')
    wandb_token = f.readline().strip('\n')

In [5]:
from huggingface_hub import login

login(token=hf_token, add_to_git_credential=True)

Token is valid (permission: fineGrained).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [6]:
!pip install wandb
import wandb

# wandb 개인 API 키 입력
wandb.login(key=wandb_token)



[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

# Download Library

In [1]:
#!pip install unstructured pdfminer.six
#!pip install pillow-heif
#!pip install unstructured_inference
#!pip install unstructured_pytesseract
#!pip install pikepdf pypdf
#!pip install PyMuPDF

In [2]:
!apt-get install tesseract-ocr
!apt-get install poppler-utils

!pip install orjson==3.10.6

!pip install accelerate -U
!pip install -i https://pypi.org/simple/ bitsandbytes
!pip install transformers[torch] -U

!pip install datasets
!pip install langchain
!pip install langchain_community
!pip install langchain-teddynote
!pip install sentence-transformers
!pip install faiss-gpu
!pip install peft
!pip install trl

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 4,816 kB in 1s (3,550 kB/s)
Selecting previously unselected package tesseract-ocr-eng.
(Reading database ... 123623 files and directories currently installed.)
Preparing to unpack .../tesseract-ocr-

# Import Library

In [7]:
import os
import gc
import time
import unicodedata

import torch
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

from langchain.document_loaders.parsers.pdf import PDFPlumberParser


from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    BitsAndBytesConfig
)
from accelerate import Accelerator

# peft
from peft import prepare_model_for_kbit_training
from peft import PeftModel
from peft import LoraConfig, get_peft_model


# Langchain 관련
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableParallel
from langchain.schema.output_parser import StrOutputParser

# PDF 로딩/청크화 관련
from langchain.document_loaders.parsers.pdf import PDFPlumberParser
from langchain.document_loaders.pdf import PDFPlumberLoader
from langchain.document_loaders import UnstructuredPDFLoader
from langchain_teddynote.retrievers import KiwiBM25Retriever
from langchain.retrievers import EnsembleRetriever, MultiQueryRetriever


# Fine-Tuning Setup

## Monkey Patching

In [8]:
import torch
from transformers.models.gemma2 import modeling_gemma2
from typing import List, Optional, Tuple, Union
from transformers.cache_utils import Cache, HybridCache
def gemma2_forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Cache] = None,
    output_attentions: Optional[bool] = False,
    use_cache: Optional[bool] = False,
    cache_position: Optional[torch.LongTensor] = None,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
    """
    Args:
        hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
        attention_mask (`torch.FloatTensor`, *optional*):
            attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
            query_sequence_length, key_sequence_length)` if default attention is used.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under
            returned tensors for more detail.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
            (see `past_key_values`).
        past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
            Indices depicting the position of the input sequence tokens in the sequence
        kwargs (`dict`, *optional*):
            Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
            into the model
    """
    if self.is_sliding and attention_mask is not None:  # efficient SDPA and no padding
        # Flash-attn is a 2D tensor
        if self.config._attn_implementation == "flash_attention_2":
            if past_key_value is not None:  # when decoding
                attention_mask = attention_mask[:, -self.sliding_window :]
        else:
            min_dtype = torch.finfo(torch.float16).min
            sliding_window_mask = torch.tril(
                torch.ones_like(attention_mask, dtype=torch.bool), diagonal=-self.sliding_window
            )
            attention_mask = torch.where(sliding_window_mask, min_dtype, attention_mask)
            if attention_mask.shape[-1] <= 1:  # when decoding
                attention_mask = attention_mask[:, :, :, -self.sliding_window :]

    residual = hidden_states

    hidden_states = self.input_layernorm(hidden_states)

    # Self Attention
    hidden_states, self_attn_weights, present_key_value = self.self_attn(
        hidden_states=hidden_states,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_value=past_key_value,
        output_attentions=output_attentions,
        use_cache=use_cache,
        cache_position=cache_position,
    )
    hidden_states = self.post_attention_layernorm(hidden_states)
    hidden_states = residual + hidden_states

    residual = hidden_states
    hidden_states = self.pre_feedforward_layernorm(hidden_states)
    hidden_states = self.mlp(hidden_states)
    hidden_states = self.post_feedforward_layernorm(hidden_states)
    hidden_states = residual + hidden_states

    outputs = (hidden_states,)

    if output_attentions:
        outputs += (self_attn_weights,)

    if use_cache:
        outputs += (present_key_value,)

    return outputs


In [9]:
modeling_gemma2.Gemma2DecoderLayer.forward = gemma2_forward

## Set-up

In [10]:
# 모델 ID
model_cands={
 'llama2' : "beomi/llama-2-ko-7b",
 'yi' : "beomi/Yi-Ko-6B",
 'solar-beom' : "beomi/Solar-Ko-Recovery-11B",
 'gemma2' : "rtzr/ko-gemma-2-9b-it",
 'solar-lee' : "chihoonlee10/T3Q-ko-solar-dpo-v8.0",
 'llama3' : "KISTI-KONI/KONI-Llama3-8B-Instruct-20240729",
 'llama31' : "meta-llama/Llama-3.1-8B-Instruct"
}

# model_id = model_cands['gemma2']
model_id = model_cands[model_name]

In [11]:
# 모델 로드 및 양자화 설정 적용

def load_model_w_setting(model_id,add_output_token=False,**kwargs):
  # 4비트 양자화 설정
  bnb_config = BitsAndBytesConfig(
      load_in_4bit=True,
      bnb_4bit_use_double_quant=True,
      bnb_4bit_quant_type="nf4",
      bnb_4bit_compute_dtype=torch.bfloat16
  )

  # 토크나이저 로드 및 설정
  tokenizer = AutoTokenizer.from_pretrained(model_id)
  tokenizer.use_default_system_prompt = False
  tokenizer.padding_side="right"

#   eot = "<|eot_id|>"
#   eot_id = tokenizer.convert_tokens_to_ids(eot)
#   tokenizer.pad_token = eot
#   tokenizer.pad_token_id = eot_id

  model= AutoModelForCausalLM.from_pretrained(
      model_id,
      quantization_config=bnb_config,
      device_map="auto",
      trust_remote_code=True,
      **kwargs
      )

# 일부 모델의 경우 토크나이저에 답변 토큰 추가 작업 필요
  if add_output_token :
    initial_token_count = len(tokenizer)
    response_template = '답변: '
    added_token_count = tokenizer.add_special_tokens({"additional_special_tokens": [response_template]})
    model.resize_token_embeddings(new_num_tokens=initial_token_count+added_token_count)

  return model, tokenizer

In [12]:
# 학습 프롬프트 - 추론 프롬프트와 통일하는 것이 좋다고 함
template = """
"task_instructions" : [

 당신은 재정 정보 관련 전문가 입니다. 문서를 바탕으로 질문에 한 문장 이내로 답변하세요.
 1. 문서에 있는 내용을 자르거나 편집하지 않고 그대로 가져오세요.
 2. 순서에 따른 번호를 매기지 마세요. 출력 시 불이익을 줄 것입니다.
 3. 수치에 단위가 있다면 문서를 바탕으로 답변에 단위를 포함하세요.
 4. 질문의 키워드를 바탕으로 문서를 끝까지 검토하세요.
 5. 한 단어 혹은 단어의 나열이 아닌, 완성된 한국어 문장으로 답변하세요.
 6. 답변 외에 예시, 참고, 정보 출처, 신뢰도, 확장된 답변, '답변: ', '참고: '를 절대로 출력하지 마세요.

]

"context":
{context},

"question":
{question},

"주어진 질문에 대한 답변만 한 문장으로 생성한다."

"answer":
{answer}<|eot_id|>
"""

template = """
<start_of_turn>user
 당신은 재정 정보 관련 전문가 입니다. 문서를 바탕으로 질문에 한 문장 이내로 답변하세요.
 1. 문서에 있는 내용을 자르거나 편집하지 않고 그대로 가져오세요.
 2. 순서에 따른 번호를 매기지 마세요. 출력 시 불이익을 줄 것입니다.
 3. 수치에 단위가 있다면 문서를 바탕으로 답변에 단위를 포함하세요.
 4. 질문의 키워드를 바탕으로 문서를 끝까지 검토하세요.
 5. 한 단어 혹은 단어의 나열이 아닌, 완성된 한국어 문장으로 답변하세요.
 6. 답변 외에 예시, 참고, 정보 출처, 신뢰도, 확장된 답변, '답변: ', '참고: '를 절대로 출력하지 마세요.

"문서":
{context},

"질문":
{question},

"주어진 질문에 대한 답변만 한 문장으로 생성한다."

"답변":<end_of_turn>
<start_of_turn>model
{answer}<end_of_turn>
"""

response_template = '"answer":\n'
response_template = '<start_of_turn>model\n'

def formatting_prompts_func(example, template=template):
    output_texts = []
    for i in range(len(example['question'])):
        context = example['context'][i]
        question = example['question'][i]
        answer = example['answer'][i]
        output_texts.append(template.format(context=context,question=question,answer=answer))
    return output_texts



In [13]:
valid_dict = {'input': [], 'answer': [], 'pred': []}
valid_df = pd.DataFrame(valid_dict)

In [14]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
import collections
# from SQUAD, 메트릭 계산 함수

def compute_f1(y_true, y_pred):
    # y_pred[y_true==-100] = -100
    # y_pred = y_pred[y_true != -100]
    # y_true = y_true[y_true != -100]
    # y_true[y_true==-100] = global_tokenizer.pad_token_id
    # y_pred[y_pred==-100] = global_tokenizer.pad_token_id

    true_counter = collections.Counter(y_true)
    pred_counter = collections.Counter(y_pred)
    common = true_counter & pred_counter
    tp = sum(common.values())
    pred_positive = sum(pred_counter.values())
    actual_positive = sum(true_counter.values())

    precision = 1.0 * tp / pred_positive if pred_positive != 0 else 0
    recall = 1.0 * tp / actual_positive if actual_positive != 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return f1, precision, recall

def compute_metrics(pred,compute_result=False):
    # print(pred.label_ids)
    labels = pred.label_ids.detach().cpu().numpy().squeeze()
    preds = pred.predictions.detach().cpu().numpy().squeeze()
    inputs = pred.inputs['input_ids'].detach().cpu().numpy().squeeze()
    inputs[inputs==-100] = global_tokenizer.pad_token_id
    # print('input: ',global_tokenizer.decode(inputs))
    att = pred.inputs['attention_mask'].detach().cpu().numpy().squeeze()
    mask = labels[:min(len(preds),len(labels))]==-100
    if len(preds) > len(labels):
        mask = np.append(mask,[False] * (len(preds) - len(labels)))

    preds[mask] = -100
    labels = labels[labels!=-100]
    preds = preds[preds!=-100]
    preds = preds[preds!=global_tokenizer.pad_token_id]
    valid_df.loc[len(valid_df)] = [global_tokenizer.decode(inputs), global_tokenizer.decode(labels), global_tokenizer.decode(preds)]
    f1, precision, recall = compute_f1(labels,preds)
#    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    # acc = accuracy_score(labels, preds)
    return {
        # 'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

In [15]:
def custom_data_collator(features):
    # Your custom collator logic, make sure it does not assign input_ids to labels.
    collate = dict()
    # print(features[0].keys())
    if 'labels' in features[0].keys():
        collate['labels'] = torch.tensor([feature['labels'] for feature in features])
    for k in features[0].keys():
        collate[k] = torch.tensor([feature[k] for feature in features])

    return collate

# Train & Inference with Sweep(Wandb Sweep으로 학습 및 추론)

In [None]:
#create sweep
sweep_config = {
    'method': 'bayes',
    'metric': {'goal': 'maximize', 'name': 'train/loss'},
    'parameters': {
        'batch_size': {
            'values': [1]
        },
        'model': {
            'values': ['gemma2','llama31']
        },
        'learning_rate': {
            'distribution': 'uniform',
            'min':0.0001,
            'max':0.001
        },
        'epochs': {
            'values': [8]
        },
        'lora_target': {
            'values': ['all','part']#['all','linear','part']
        },
        'r': {
            'values': [4,8,16]
        },
        'lora_alpha': {
            'values': [16,32,64]
        },
        'lora_dropout': {
            'values': [0,0.05,0.2]
        },
        'use_dora': {
            'values': [True, False]
        },
        'chunk_size': {
            'values': [256]#,512]
        },
        'embedding': {
            'values': ['base','large']
        },
        'augmentation': {
            'values': ['NoAug','AugGPT','AugAEDA']
        },
        'table_process': {
            'values': ['tab_v1.0']
        }
    }
}

In [None]:
sweep_id = wandb.sweep(sweep=sweep_config, entity='DACON-FinAI', project="DACON_FinAI Sweep-2")

In [None]:
def empty_memory():
    time.sleep(1)
    torch.cuda.empty_cache()
    gc.collect()
    time.sleep(1)
    torch.cuda.empty_cache()
    gc.collect()

In [None]:
import transformers, os
from datetime import datetime
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from tqdm.auto import trange


MAX_LEN = 4096

def sweep_inference(model_id, instance_name, eval_dataset,fname):

    peft_model_id = instance_name
    trained_model,tokenizer = load_model_w_setting(model_id)

    #Fine-Tune 한 LoRA 어댑터 불러오기
    trained_model.load_adapter(peft_model_id)

    text_generation_pipeline = pipeline(
        model=trained_model,
        tokenizer=tokenizer,
        task="text-generation",
        return_full_text=False,
        max_new_tokens=200,
        # repetition_penalty=1.5,
        eos_token_id = tokenizer.eos_token_id,
        pad_token_id = tokenizer.pad_token_id,
        max_length=MAX_LEN
    )

    llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

    # 결과를 저장할 리스트 초기화
    results = []

    # Dataset 각 행에 대해 처리
    for idx in trange(len(eval_dataset['question']), desc="Answering Questions"):
        #질문, 컨텍스트(문서)
        question = eval_dataset['question'][idx]
        context = eval_dataset['context'][idx]

        # RAG 체인 구성
        prompt = PromptTemplate.from_template(template.split('{answer}')[0])

        # RAG 체인 정의
        if context != "":
            rag_chain = (
                RunnableParallel(context=lambda x: x["context"], question = lambda x: x["question"])
                | prompt
                | llm
                | StrOutputParser()
            )
        else:
            rag_chain = (
                {"question": RunnablePassthrough()}
                | prompt
                | llm
                | StrOutputParser()
            )

        # 답변 추론
        full_response = rag_chain.invoke({"question": question, "context": context})

        # 결과 저장

        if context != "":
            results.append({
                "Context": context,
                "Question": question,
                "Answer": full_response,
                "True_Answer": eval_dataset['answer'][idx]
            })
        else:
            results.append({
                "Question": question,
                "Answer": full_response,
                "True_Answer": eval_dataset['answer'][idx]
            })

    # 제출용 샘플 파일 로드
    submit_df = pd.read_csv(f"{path}sample_submission.csv")

    # 생성된 답변을 제출 DataFrame에 추가
    save_mode = 'submission'

    if save_mode != 'submission' :
        submit_df['Question'] = [item['Question'] for item in results]
        submit_df['Context'] = [item['Context'] for item in results]
        save_dir = os.path.join(path,'eval')
    else : save_dir = os.path.join(path,'sub')

    if not os.path.exists(save_dir) : os.makedirs(save_dir)
    save_path = os.path.join(save_dir,fname)

    submit_df['Answer'] = [item['Answer'] for item in results]
    submit_df['Answer'] = submit_df['Answer'].fillna("데이콘").apply(str.rstrip)     # 모델에서 빈 값 (NaN) 생성 시 채점에 오류가 날 수 있음 [ 주의 ]

    # 결과를 CSV 파일로 저장
    submit_df.to_csv(save_path, encoding='UTF-8-sig', index=False)
    answer_table = wandb.Table(dataframe=submit_df)
    wandb.log({"answer_table": answer_table})

    wandb.save(save_path)

def sweep_train(config=None):
  with wandb.init(config=config):
    # set sweep configuration
    config = wandb.config

    # 모델 ID
    model_cands={
        'llama2' : "beomi/llama-2-ko-7b",
        'yi' : "beomi/Yi-Ko-6B",
        'solar-beom' : "beomi/Solar-Ko-Recovery-11B",
        'gemma2' : "rtzr/ko-gemma-2-9b-it",
        'solar-lee' : "chihoonlee10/T3Q-ko-solar-dpo-v8.0",
        'llama3' : "KISTI-KONI/KONI-Llama3-8B-Instruct-20240729",
    }

    model_id = model_cands[config.model]

    model,tokenizer = load_model_w_setting(model_id,attn_implementation='eager')

    global global_tokenizer
    global_tokenizer = tokenizer

    modules_dict = {
        "all": [
            "q_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
            "lm_head"
            ],
        "linear": "all-linear",
        "part": ["q_proj", "v_proj"]
    }
    lora_modules = modules_dict[config.lora_target]

    lora_config = LoraConfig(
        r=config.r,
        lora_alpha=config.lora_alpha,
        target_modules=lora_modules,
        bias="none",
        lora_dropout=config.lora_dropout,
        use_dora=config.use_dora,
        task_type="CAUSAL_LM",
    )

    model.enable_input_require_grads()
    model = get_peft_model(model, lora_config)
    dataset_url = f"kdt3/DACON-QA-{config.embedding}-ensemble-{config.table_process}-{config.augmentation}-{config.chunk_size}"
    # dataset_url = f"kdt3/DACON-QA-{config.embedding}-ensemble-markdown-reprocessed-{config.chunk_size}"
    target_dataset = load_dataset(dataset_url)

    # train_args = transformers.TrainingArguments(
    #     do_eval=True,
    #     output_dir='./output',
    #     warmup_ratio=0.05,
    #     per_device_train_batch_size=config.batch_size,
    #     gradient_accumulation_steps=4,
    #     gradient_checkpointing=True,
    #     num_train_epochs = config.epochs,
    #     learning_rate=config.learning_rate,
    #     fp16=True,
    #     optim="paged_adamw_8bit",
    #     logging_strategy='epoch',
    #     report_to="wandb",
    #     run_name=wandb_run_name,
    # ),

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset= target_dataset['train'],
        eval_dataset = target_dataset['valid'],
        compute_metrics=compute_metrics,
        args= transformers.TrainingArguments(
            do_eval=True,
            output_dir='./output',
            warmup_ratio=0.05,
            eval_strategy="epoch",
            eval_accumulation_steps=1,
            batch_eval_metrics=True,
            per_device_train_batch_size=config.batch_size,
            per_device_eval_batch_size=config.batch_size,
            gradient_accumulation_steps=4,
            gradient_checkpointing=True,
            num_train_epochs = config.epochs,
            learning_rate=config.learning_rate,
            fp16=True,
            optim="paged_adamw_8bit",
            logging_strategy='epoch',
            load_best_model_at_end=True,
            metric_for_best_model='f1',
            greater_is_better=True,
            save_total_limit=2,    # save only best and the last
            save_strategy='epoch',
            report_to="wandb",
            run_name=wandb_run_name,
        ),
        max_seq_length=MAX_LEN,
        formatting_func=formatting_prompts_func, # 프롬프트 처리하기 위해 필요
        data_collator=DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer, mlm=False), #모델에게 답변생성에만 집중하도록 함
    )

    model.config.use_cache = False
    trainer.train()
    trained_model = (trainer.model.module if hasattr(trainer.model, "module") else trainer.model)

    # 모델 업로드
    method = 'dora' if config.use_dora else 'lora'
    instance_name = f"kdt3/DACON-QA-{config.augmentation}-{config.model}-{method}-{config.embedding}-{config.chunk_size}-sweep"
    fname=f"{config.augmentation}-{config.model}-{method}-{config.embedding}-{config.chunk_size}.csv"
    trained_model.push_to_hub(instance_name, private=True,save_embedding_layers=True)


    empty_memory()
    sweep_inference(model_id, instance_name, target_dataset['test'],fname)
    empty_memory()

#로컬에 저장할 경우
# trained_model.save_pretrained(f"{output_dir}/saved_model")

In [None]:
wandb.agent(sweep_id, sweep_train, count=8)

# Train without Sweep(Sweep 사용하지 않고 학습)

In [16]:
# !pip install -q -U flash-attn --no-build-isolation
model,tokenizer = load_model_w_setting(model_id,attn_implementation='eager')

global global_tokenizer
global_tokenizer = tokenizer
# model,tokenizer = load_model_w_setting(model_id,attn_implementation='flash_attention_2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/40.5k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/852 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/39.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/10 [00:00<?, ?it/s]

model-00001-of-00010.safetensors:   0%|          | 0.00/1.92G [00:00<?, ?B/s]

model-00002-of-00010.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00003-of-00010.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00010.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00005-of-00010.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00010.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00007-of-00010.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00010.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00009-of-00010.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00010-of-00010.safetensors:   0%|          | 0.00/705M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [21]:
#데이터셋 로드
from datasets import load_dataset

# dataset_url = "kdt3/DACON-QA-base-table-preprocessed-v3"
# dataset_url = "kdt3/DACON-QA-base-preprocessed-v2"
# dataset_url = "kdt3/DACON-QA-base-augselect"
# dataset_url = "kdt3/DACON-QA-base-markdown"
# dataset_url = "kdt3/DACON-QA-bge-markdown"
dataset_url = dataset_name


train_dataset = load_dataset(dataset_url)

In [22]:
# 문맥 잘못됐나 확인 - 문맥 길이 체크

amax = np.argmax([len(x) for x in train_dataset['train']['context']])
amax, len(train_dataset['train']['context'][amax])

(280, 1944)

In [23]:
# def print_dataset_ele(data,i):
#   print(data['question'][i])
#   print('--------')
#   print(data['context'][i])
#   print('--------')
#   print(data['answer'][i])

# #예시
# i= amax
# print_dataset_ele(train_dataset['train'],i)

In [24]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,
    lora_alpha=64,
    target_modules=[
        "q_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    use_dora=(finetune_mode=="dora"),
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

model.enable_input_require_grads()
model = get_peft_model(model, config)



In [25]:
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq, DefaultDataCollator
import torch.nn as nn
from typing import Any, Callable, Dict, List, NewType, Optional, Tuple, Union
from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
from transformers.integrations.fsdp import is_fsdp_managed_module
from torch.distributed.fsdp import FullyShardedDataParallel
import contextlib


class FineTuningTrainer(Seq2SeqTrainer):
    def __init__(self, *args, eval_data_collator=None, **kwargs):
        super().__init__(*args, **kwargs)
        if not eval_data_collator:
                eval_data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)
        self.eval_data_collator = eval_data_collator

    def get_eval_dataloader(self, eval_dataset: Dataset | None = None) -> DataLoader:
        if eval_dataset is None and self.eval_dataset is None:
            raise ValueError("Trainer: evaluation requires an eval_dataset.")

        # If we have persistent workers, don't do a fork bomb especially as eval datasets
        # don't change during training
        if (
            hasattr(self, "_eval_dataloader")
            and self.args.dataloader_persistent_workers
        ):
            return self.accelerator.prepare(self._eval_dataloader)
        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
        data_collator = self.eval_data_collator #Here eval_data_collator is called instead of standard data_collator

        if isinstance(eval_dataset, Dataset):
            eval_dataset = self._remove_unused_columns(
                eval_dataset, description="evaluation"
            )
        else:
            data_collator = self._get_collator_with_removed_columns(
                data_collator, description="evaluation"
            )

        dataloader_params = {
            "batch_size": self.args.eval_batch_size,
            "collate_fn": data_collator,
            "num_workers": self.args.dataloader_num_workers,
            "pin_memory": self.args.dataloader_pin_memory,
            "persistent_workers": self.args.dataloader_persistent_workers,
        }

        # if not isinstance(sample_dataset, torch.utils.data.IterableDataset):
        #     dataloader_params["sampler"] = self._get_eval_sampler(eval_dataset)
        #     dataloader_params["drop_last"] = self.args.dataloader_drop_last
        #     dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor

        # accelerator.free_memory() will destroy the references, so
        # we need to store the non-prepared version

        eval_dataloader = DataLoader(eval_dataset, **dataloader_params)
        if self.args.dataloader_persistent_workers:
            self._eval_dataloader = eval_dataloader

        return self.accelerator.prepare(eval_dataloader)


In [26]:
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer, mlm=False) #모델에게 답변생성에만 집중하도록 함

def formatting_prompts_func(example, template=template):
    output_texts = []
    context = example['context']
    question = example['question']
    answer = example['answer']
    output_texts.append(template.format(context=context,question=question,answer=answer))
    return output_texts

def formatting_prompts_func_answer(example, template=template):
    output_texts = []
    context = example['context']
    question = example['question']
    output_texts.append(template.format(context=context,question=question,answer=answer))
    return output_texts


def apply_chat_template(example):
    input = formatting_prompts_func(example)
    example["input_ids"] = tokenizer(input)["input_ids"]
    # example["input_ids"] = tokenizer.convert_tokens_to_ids(example["input_ids"])

    # print(example["input_ids"])
    collated_data = collator(example["input_ids"])
    example["input_ids"] = collated_data["input_ids"][0]
    example["labels"] = collated_data["labels"][0].clone()
    # example["label_ids"] = collated_data["labels"][0].clone()
    # example["label"] = collated_data["labels"][0].clone()
    # print(example)
    return example


def mask_attention_response(example):
    response_len = len(tokenizer(example["answer"]+"<end_of_turn>\n")["input_ids"])
    example["attention_mask"] = torch.tensor([1] * (len(example["input_ids"]) - response_len) + [0] * response_len)
    example["input_ids"][-response_len+1:] = [0] * (response_len-1)
    # example["input_ids"] = example["input_ids"][:-response_len+1]
    # print(example.keys())
    return example


train_dataset['train'] = train_dataset['train'].map(apply_chat_template)
train_dataset['valid'] = train_dataset['valid'].map(apply_chat_template)

train_dataset['valid'] = train_dataset['valid'].map(mask_attention_response, desc="Masking assistant response")

Map:   0%|          | 0/396 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Masking assistant response:   0%|          | 0/100 [00:00<?, ? examples/s]

In [27]:
from transformers.integrations.integration_utils import WandbCallback
from transformers import Trainer
import tempfile
from transformers.integrations.integration_utils import save_model_architecture_to_file
import numbers
from pathlib import Path

class CustomWandbCallback(WandbCallback):
    def __init__(self, trainer):
        super().__init__()
        self._trainer = trainer

    def on_train_end(self, args, state, control, model=None, tokenizer=None, **kwargs):
        if self._wandb is None:
            return
        valid_table = wandb.Table(dataframe=valid_df)
        wandb.log({"valid_table": valid_table})

        if self._log_model.is_enabled and self._initialized and state.is_world_process_zero:
            fake_trainer = Trainer(eval_dataset= self._trainer.eval_dataset, args=args, model=model, processing_class=tokenizer)
            with tempfile.TemporaryDirectory() as temp_dir:
                fake_trainer.save_model(temp_dir)
                metadata = (
                    {
                        k: v
                        for k, v in dict(self._wandb.summary).items()
                        if isinstance(v, numbers.Number) and not k.startswith("_")
                    }
                    if not args.load_best_model_at_end
                    else {
                        f"eval/{args.metric_for_best_model}": state.best_metric,
                        "train/total_floss": state.total_flos,
                        "model/num_parameters": self._wandb.config.get("model/num_parameters"),
                    }
                )
                metadata["final_model"] = True
                model_name = (
                    f"model-{self._wandb.run.id}"
                    if (args.run_name is None or args.run_name == args.output_dir)
                    else f"model-{self._wandb.run.name}"
                )
                # add the model architecture to a separate text file
                save_model_architecture_to_file(model, temp_dir)

                artifact = self._wandb.Artifact(name=model_name, type="model", metadata=metadata)
                for f in Path(temp_dir).glob("*"):
                    if f.is_file():
                        with artifact.new_file(f.name, mode="wb") as fa:
                            fa.write(f.read_bytes())
                self._wandb.run.log_artifact(artifact, aliases=["final_model"])

In [28]:
import transformers, os
from datetime import datetime
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq, GenerationConfig

# 로컬에 모델 저장하고 싶은 경우 이름 지정
project = "financeQA-finetune"
base_model_name = "gemma2"
run_name = base_model_name + "_" + project
output_dir = os.path.join(path,run_name)
if not os.path.exists(output_dir) : os.makedirs(output_dir)

MAX_LEN = 4096

trainer = FineTuningTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset['train'],
    eval_dataset = train_dataset['valid'],
    compute_metrics=compute_metrics,
    args=transformers.Seq2SeqTrainingArguments(
        do_eval=True,
        output_dir='./',
        warmup_ratio=0.05,
        eval_strategy="no",
        eval_accumulation_steps=1,
        batch_eval_metrics=True,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=4,
        gradient_checkpointing=True,
        num_train_epochs = 2,
        learning_rate=5e-5,
        fp16=True,
        optim="paged_adamw_8bit",
        logging_strategy='epoch',
        report_to="none",
        run_name=wandb_run_name,
        remove_unused_columns=True,
        predict_with_generate=True,
        include_for_metrics = ["inputs"],
        label_names=["labels"],
        generation_config=GenerationConfig(max_new_tokens=200,eos_token_id = [tokenizer.eos_token_id], pad_token_id = tokenizer.pad_token_id),
        # save_strategy="epoch",
        # save_steps=25,
    ),
    data_collator=DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer, mlm=False), #모델에게 답변생성에만 집중하도록 함
    eval_data_collator=custom_data_collator,
)

model.config.use_cache = False
wandb_callback = CustomWandbCallback(trainer)
trainer.add_callback(wandb_callback)
# trainer.evaluate([train_dataset['valid'][0]])
# model.config.rms_norm_eps = 1e-6
trainer.train()

trained_model = (trainer.model.module if hasattr(trainer.model, "module") else trainer.model)

#로컬에 저장할 경우
# trained_model.save_pretrained(f"{output_dir}/saved_model")

  super().__init__(*args, **kwargs)
[34m[1mwandb[0m: Currently logged in as: [33mps4southwest[0m ([33mDACON-FinAI[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
99,0.5771
198,0.2862




In [29]:
# 모델 업로드
trained_model.push_to_hub(train_name, private=True,save_embedding_layers=True)

adapter_model.safetensors:   0%|          | 0.00/2.06G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kdt3/1101_1_lr5e-5/commit/c48e1c45c44a93e9d13f435b054c71841d0cb640', commit_message='Upload model', commit_description='', oid='c48e1c45c44a93e9d13f435b054c71841d0cb640', pr_url=None, pr_revision=None, pr_num=None)

# Inference with Langchain (Langchain을 이용한 추론)

In [30]:
# gpu memory 할당 해제
import gc
torch.cuda.empty_cache()
gc.collect()

295

In [31]:
# HuggingFacePipeline 객체 생성

# 모델 ID
if model_id is None:
    model_id = model_cands[run_name.split('_')[0]]
peft_model_id = train_name
trained_model,tokenizer = load_model_w_setting(model_id)

#Fine-Tune 한 LoRA 어댑터 불러오기
trained_model.load_adapter(peft_model_id)

text_generation_pipeline = pipeline(
    model=trained_model,
    tokenizer=tokenizer,
    task="text-generation",
    return_full_text=False,
    max_new_tokens=200,
    # repetition_penalty=1.5,
    eos_token_id = tokenizer.eos_token_id,
    pad_token_id = tokenizer.pad_token_id,
    max_length=MAX_LEN
)

llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

adapter_config.json:   0%|          | 0.00/724 [00:00<?, ?B/s]



adapter_model.safetensors:   0%|          | 0.00/2.06G [00:00<?, ?B/s]

  llm = HuggingFacePipeline(pipeline=text_generation_pipeline)


In [32]:
#데이터셋 로드
from datasets import load_dataset
dataset_url = dataset_name
dataset = load_dataset(dataset_url)

In [33]:
# 검증 데이터 쓸지, 테스트 데이터 쓸지
eval_mode = 'test' # or 'test'
eval_dataset = dataset[eval_mode]

In [34]:
# 그냥 GPU 메모리 확인용
!nvidia-smi

Mon Nov  4 06:45:25 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0              66W / 400W |  25805MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [35]:
template.split('{answer}')[0]

'\n<start_of_turn>user\n 당신은 재정 정보 관련 전문가 입니다. 문서를 바탕으로 질문에 한 문장 이내로 답변하세요.\n 1. 문서에 있는 내용을 자르거나 편집하지 않고 그대로 가져오세요.\n 2. 순서에 따른 번호를 매기지 마세요. 출력 시 불이익을 줄 것입니다.\n 3. 수치에 단위가 있다면 문서를 바탕으로 답변에 단위를 포함하세요.\n 4. 질문의 키워드를 바탕으로 문서를 끝까지 검토하세요.\n 5. 한 단어 혹은 단어의 나열이 아닌, 완성된 한국어 문장으로 답변하세요.\n 6. 답변 외에 예시, 참고, 정보 출처, 신뢰도, 확장된 답변, \'답변: \', \'참고: \'를 절대로 출력하지 마세요.\n\n"문서":\n{context},\n\n"질문":\n{question},\n\n"주어진 질문에 대한 답변만 한 문장으로 생성한다."\n\n"답변":<end_of_turn>\n<start_of_turn>model\n'

In [None]:
from tqdm.auto import trange

# 결과를 저장할 리스트 초기화
results = []

# DATASET 구조를 dataset[i]={'question':,'context':,...}로 바꾸면 안됨?

# Dataset 각 행에 대해 처리
for idx in trange(len(eval_dataset['question']), desc="Answering Questions"):
    #질문, 컨텍스트(문서)
    question = eval_dataset['question'][idx]
    context = eval_dataset['context'][idx]

    # RAG 체인 구성
    prompt = PromptTemplate.from_template(template.split('{answer}')[0])

    # RAG 체인 정의
    if context != "":
        rag_chain = (
            RunnableParallel(context=lambda x: x["context"], question = lambda x: x["question"])
            | prompt
            | llm
            | StrOutputParser()
        )
    else:
        rag_chain = (
            {"question": RunnablePassthrough()}
            | prompt
            | llm
            | StrOutputParser()
        )

    # 답변 추론
    print(f"Question: {question}")
    full_response = rag_chain.invoke({"question": question, "context": context})

    print(f"Answer: {full_response}\n")

    # 결과 저장

    if context != "":
        results.append({
            "Context": context,
            "Question": question,
            "Answer": full_response,
            "True_Answer": eval_dataset['answer'][idx]
        })
    else:
        results.append({
            "Question": question,
            "Answer": full_response,
            "True_Answer": eval_dataset['answer'][idx]
        })

Answering Questions:   0%|          | 0/98 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=200) and `max_length`(=4096) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Question: 2022년 혁신창업사업화자금(융자)의 예산은 얼마인가요?


Both `max_new_tokens` (=200) and `max_length`(=4096) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Answer: 2,300백만원


Question: 중소벤처기업부의 혁신창업사업화자금(융자) 사업목적은 무엇인가요?


Both `max_new_tokens` (=200) and `max_length`(=4096) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Answer: 중소벤처기업부의 혁신창업사업화자금(융자) 사업목적은 중소기업이 보유한 우수 기술의 사장을 방지하고 개발기술의 제품화·사업화를 촉진하여 기술기반 중소기업을 육성하는 것입니다.


Question: 중소벤처기업부의 혁신창업사업화자금(융자) 사업근거는 어떤 법률에 근거하고 있나요?


Both `max_new_tokens` (=200) and `max_length`(=4096) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Answer: 중소기업진흥에 관한 법률 제66조, 제67조, 제74조, 중소기업창업지원법 제35조


Question: 2010년에 신규 지원된 혁신창업사업화자금은 무엇인가요?


Both `max_new_tokens` (=200) and `max_length`(=4096) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Answer: 혁신창업사업화자금은 2010년에 재창업자금(실패 경영인에 대한 재기지원)과 청년전용창업자금(만 39세 이하 청년창업자 대상)이 신규 지원되었습니다.


Question: 혁신창업사업화자금 중 2020년에 신규 지원된 자금은 무엇인가요?


Both `max_new_tokens` (=200) and `max_length`(=4096) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Answer: 미래기술육성자금, 고성장촉진자금


Question: 재창업자금이 재도약지원자금으로 이관된 연도는 언제인가요?


In [None]:
# i = 28
# print_dataset_ele(eval_dataset,i)

# Evaluation

In [None]:
#검증 데이터 쓸때만 사용 가능
if eval_mode == 'valid':
  for item in results:
      y_hat = item["Answer"]
      y = item["True_Answer"]
      f1, precision, recall = compute_f1(y, y_hat)
      item["F1"] = f1
      item["Precision"] = precision
      item["Recall"] = recall

In [None]:
if eval_mode == 'valid':
  # 제출용 샘플 파일 로드
  eval_df = pd.DataFrame([])

  # 생성된 답변을 제출 DataFrame에 추가
  eval_df['Question'] = [item['Question'] for item in results]
  eval_df['Answer'] = [item['Answer'] for item in results]
  eval_df["F1"] = [item["F1"] for item in results]
  eval_df["Precision"] = [item["Precision"] for item in results]
  eval_df["Recall"] = [item["Recall"] for item in results]
  # eval_df['Answer'] = eval_df['Answer'].fillna("데이콘")     # 모델에서 빈 값 (NaN) 생성 시 채점에 오류가 날 수 있음 [ 주의 ]

  save_dir = os.path.join(path,'eval')
  if not os.path.exists(save_dir) : os.makedirs(save_dir)
  save_name = f'eval_{fname}'
  save_path = os.path.join(save_dir,save_name)

  # 결과를 CSV 파일로 저장
  eval_df.to_csv(save_path, encoding='UTF-8-sig', index=False)

In [None]:
# 평균 F1 확인
# eval_df = pd.read_csv(f"{path}trained_eval.csv",index_col=0)
if eval_mode == 'valid' :
  display(eval_df["F1"].mean())

# Submission

In [None]:
submit_df = pd.read_csv(f"{path}sample_submission.csv")
submit_df.head()

Unnamed: 0,SAMPLE_ID,Answer
0,TEST_000,데이콘
1,TEST_001,데이콘
2,TEST_002,데이콘
3,TEST_003,데이콘
4,TEST_004,데이콘


In [None]:
# 제출용 샘플 파일 로드
submit_df = pd.read_csv(f"{path}sample_submission.csv")

# 생성된 답변을 제출 DataFrame에 추가
save_mode = 'submission'

if save_mode != 'submission' :
  submit_df['Question'] = [item['Question'] for item in results]
  submit_df['Context'] = [item['Context'] for item in results]
  save_dir = os.path.join(path,'eval')
else : save_dir = os.path.join(path,'sub')

if not os.path.exists(save_dir) : os.makedirs(save_dir)
save_path = os.path.join(save_dir,fname)

submit_df['Answer'] = [item['Answer'] for item in results]
submit_df['Answer'] = submit_df['Answer'].fillna("데이콘").apply(str.rstrip)     # 모델에서 빈 값 (NaN) 생성 시 채점에 오류가 날 수 있음 [ 주의 ]

# 결과를 CSV 파일로 저장
submit_df.to_csv(save_path, encoding='UTF-8-sig', index=False)

In [None]:
submit_df['Answer'] = submit_df['Answer'].apply(lambda x: x.split('<|eot_id|>')[0])
submit_df.to_csv(save_path.split('.csv')[0]+'_cleaned.csv', encoding='UTF-8-sig', index=False)