In [None]:
# Mount Google Drive into the Colab filesystem; later cells read the dataset
# from, and write checkpoints to, paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# !pip install -U langchain-community
# !pip install -U transformers
# !pip install datasets
# !pip install pypdf
# !pip install -U bitsandbytes transformers accelerate
# !pip install peft
# # !pip install --upgrade --force-reinstall autoawq --extra-index-url https://download.pytorch.org/whl/cu121

In [None]:
# !pip uninstall -y autoawq
# !git clone https://github.com/casper-hansen/AutoAWQ.git
# %cd AutoAWQ
# !pip install .

In [None]:
import re
import json
import torch
import pandas as pd

from tqdm import tqdm
from datasets import Dataset, load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from peft import PeftModelForCausalLM
from peft import AutoPeftModelForCausalLM
from peft import prepare_model_for_kbit_training
from typing import List, Dict
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, pipeline, BitsAndBytesConfig
from transformers import EarlyStoppingCallback
from transformers import default_data_collator
# from autoawq import AWQConfig


from google.colab import files

import warnings
warnings.filterwarnings("ignore")

# Model Load, Train, and Save

In [None]:
# Load the JSON dataset stored on the mounted Drive. The result is indexed by
# split name; a later cell selects the "train" split.
qa_json_path = "/content/drive/MyDrive/1데이콘/2025금융AIChallenge금융AI모델경쟁/dataset/merged_qa.json"
dataset = load_dataset("json", data_files=qa_json_path)

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Base checkpoint that will be fine-tuned.
model_name = 'LGAI-EXAONE/EXAONE-3.5-7.8B-Instruct'


# Quantization: 4-bit NF4 weights with double quantization, bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_use_double_quant = True,
    bnb_4bit_compute_dtype = torch.bfloat16
)

# awq_config = AWQConfig(
#     bits=8,
#     group_size=128,
#     module_skip_list=["lm_head"]
# )

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map={"":0},
    trust_remote_code=True,
    quantization_config = bnb_config
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# The preprocessing cell pads every example to a fixed length, which requires
# a pad token. Configure the fallback here, where the tokenizer is created,
# so tokenization does not depend on a cell that runs after it.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Enable gradient checkpointing (saves memory).
model.gradient_checkpointing_enable()

# Prepare the quantized model for k-bit training.
model = prepare_model_for_kbit_training(model)

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

model-00008-of-00013.safetensors:   0%|          | 0.00/952M [00:00<?, ?B/s]

model-00003-of-00013.safetensors:   0%|          | 0.00/948M [00:00<?, ?B/s]

model-00005-of-00013.safetensors:   0%|          | 0.00/952M [00:00<?, ?B/s]

model-00002-of-00013.safetensors:   0%|          | 0.00/952M [00:00<?, ?B/s]

model-00001-of-00013.safetensors:   0%|          | 0.00/926M [00:00<?, ?B/s]

model-00007-of-00013.safetensors:   0%|          | 0.00/948M [00:00<?, ?B/s]

model-00004-of-00013.safetensors:   0%|          | 0.00/948M [00:00<?, ?B/s]

model-00006-of-00013.safetensors:   0%|          | 0.00/948M [00:00<?, ?B/s]

model-00009-of-00013.safetensors:   0%|          | 0.00/948M [00:00<?, ?B/s]

model-00010-of-00013.safetensors:   0%|          | 0.00/948M [00:00<?, ?B/s]

model-00011-of-00013.safetensors:   0%|          | 0.00/952M [00:00<?, ?B/s]

model-00012-of-00013.safetensors:   0%|          | 0.00/948M [00:00<?, ?B/s]

model-00013-of-00013.safetensors:   0%|          | 0.00/515M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/13 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [None]:
# for name, module in model.named_modules():
#     if "attn" in name.lower() or "attention" in name.lower():
#         print(name)

In [None]:
# LoRA configuration: rank-16 adapters (alpha 32, dropout 0.1, no bias terms)
# attached to the modules named below.
lora_target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj']

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=lora_target_modules,
)

# Apply LoRA to the model.
model = get_peft_model(model, lora_config)

# Report how many parameters are trainable.
model.print_trainable_parameters()

trainable params: 7,340,032 || all params: 5,892,399,104 || trainable%: 0.1246


In [None]:
# Prompt format definition
def format_prompt(ex):
    """Render one example as an instruction-style training prompt.

    Args:
        ex: Mapping with "instruction", "input" and "output" entries.

    Returns:
        A string with "### Instruction:" and "### Response:" sections, plus
        an "### Input:" section in between when ex["input"] is truthy.
    """
    extra_input = ex["input"]
    instruction_part = f"### Instruction:\n{ex['instruction']}\n\n"
    # The input section is omitted entirely for empty / missing inputs.
    input_part = f"### Input:\n{extra_input}\n\n" if extra_input else ""
    response_part = f"### Response:\n{ex['output']}"
    return instruction_part + input_part + response_part

# Tokenization function
def preprocess_function(ex, max_length=1024):
    """Tokenize one example into fixed-length causal-LM training features.

    The prompt built by ``format_prompt`` is terminated with the tokenizer's
    EOS token (when one is defined), then truncated / padded to ``max_length``.

    Args:
        ex: A single dataset record, passed through to ``format_prompt``.
        max_length: Fixed sequence length for truncation and padding.

    Returns:
        The tokenizer output with an added "labels" entry: a copy of
        "input_ids" in which padded positions are replaced by -100.
    """
    prompt = format_prompt(ex)

    # Terminate the response with EOS so the end of an answer is part of the
    # training target; skip if the text already ends with it.
    eos = tokenizer.eos_token
    if eos and not prompt.endswith(eos):
        prompt += eos

    tokenized = tokenizer(
        prompt,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_attention_mask=True,
    )

    # Exclude padding from the loss. Masking by attention_mask rather than by
    # token id keeps a real EOS in the target even if pad and EOS share an id.
    tokenized["labels"] = [
        token_id if is_real else -100
        for token_id, is_real in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

In [None]:
# Select the "train" split. The isinstance guard keeps this cell re-runnable:
# once `dataset` has already been rebound to the split, indexing it with
# "train" again would fail.
if not isinstance(dataset, Dataset):
    dataset = dataset["train"]

# Apply preprocessing one example at a time and drop the original columns so
# only the tokenized features remain.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=False,
    remove_columns=dataset.column_names
)

# train/test split
# tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2, shuffle=True, seed=810)

Map:   0%|          | 0/11759 [00:00<?, ? examples/s]

In [None]:
# Use EOS for padding only when the tokenizer has no pad token of its own.
# The dataset was already padded in an earlier cell, so overriding an existing
# pad token here would leave the tokenizer handed to the Trainer out of sync
# with the padding used in the training data.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Training arguments
training_args = TrainingArguments(
    output_dir = "/content/drive/MyDrive/1데이콘/2025금융AIChallenge금융AI모델경쟁/dataset/finetunning_model8",
    save_strategy = 'epoch',
    save_total_limit = 2,

    # Memory: 8 samples per device x 4 accumulation steps = 32 samples per
    # optimizer step on each device.
    per_device_train_batch_size = 8,
    # per_device_eval_batch_size = 16,
    gradient_accumulation_steps = 4,
    # optim = "paged_adamw_8bit",
    bf16 = True,

    # Optimization schedule
    learning_rate = 3e-5,
    num_train_epochs = 3,
    warmup_ratio = 0.05,
    weight_decay = 0.01,

    # Evaluation / checkpointing (evaluation is disabled for this run)
    eval_strategy="no",
    # eval_strategy = 'epoch',
    # eval_steps = 5,
    logging_steps = 100,
    # NOTE(review): save_steps presumably has no effect while save_strategy is
    # 'epoch' -- confirm before relying on it.
    save_steps = 500,

    # NOTE(review): these two look relevant only once evaluation and
    # load_best_model_at_end are enabled -- confirm.
    metric_for_best_model = 'eval_loss',
    greater_is_better = False,
    # load_best_model_at_end = True,
    remove_unused_columns=True,
)


# Training
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset,
    # eval_dataset = tokenized_dataset['test'],
    # NOTE(review): newer transformers releases rename this argument to
    # `processing_class` -- confirm against the installed version.
    tokenizer = tokenizer
    # callbacks= [EarlyStoppingCallback(early_stopping_patience=2)],
)

trainer.train()
# With no argument, the model is saved to training_args.output_dir.
trainer.save_model()