In [1]:
# Switch the kernel's working directory to the project checkout.
# NOTE(review): this is an absolute, machine-specific path. The project-local imports in the
# next cell (models.*, dataset.*) presumably rely on this working directory — confirm before
# replacing it with a configurable or relative path.
%cd /home/hjjang/SLM/SLM-llama/

/home/hjjang/SLM/SLM-llama


In [2]:
# %%
from transformers import AutoTokenizer, PreTrainedTokenizerBase,  TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
import numpy as np
from tqdm.notebook import tqdm, trange
from models.config import LlamaCLConfig
from transformers.modeling_utils import PreTrainedModel, unwrap_model
from models.slm import ScalableLM
import torch
import torch.nn as nn
from dataclasses import dataclass
from typing import Optional, Union, List, Any, Dict
from dataset.dataset import get_dataset_by_name, IGNORE_INDEX
from datasets.utils.logging import disable_progress_bar
import os
import re
import argparse
import bitsandbytes as bnb
# Sets TOKENIZERS_PARALLELISM to "false" for this process (presumably to avoid the
# tokenizers fork/parallelism warning — confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# "0" leaves CUDA launch blocking off. The CUDA error text recorded in the last cell's output
# suggests CUDA_LAUNCH_BLOCKING=1 when debugging a device-side assert.
# NOTE(review): this is set after `import torch`; confirm it is still read before CUDA initializes.
os.environ["CUDA_LAUNCH_BLOCKING"] = "0"

In [3]:
# %%
@dataclass
class DataCollatorWithPadding:
    """Collate tokenized examples into one right-padded batch of tensors.

    Each example must provide ``input_ids``, ``attention_mask``, ``raw_text`` and a label
    sequence stored under ``labels`` or ``label``. The returned batch holds:

    * ``task``: the collator's task name, unchanged.
    * ``raw_text``: the list of the examples' ``raw_text`` values.
    * ``input_ids``: padded with ``eos_token_id``.
    * ``attention_mask``: padded with ``0``.
    * ``labels``: padded with ``IGNORE_INDEX``.
    """

    # Token id used as the padding value for ``input_ids``.
    eos_token_id: int
    # Task name copied into every batch under the ``task`` key.
    task: str = "finance"

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Build a padded batch from ``features`` without modifying the input dicts."""
        # Inspect the first example's keys. Testing `'labels' in features` would look for the
        # string inside the list of examples and therefore never match.
        label_key = 'labels' if features and 'labels' in features[0] else 'label'

        def pad_column(key, padding_value):
            # Variable-length sequences are right-padded to the longest one in the batch.
            sequences = [torch.tensor(feature[key]) for feature in features]
            return nn.utils.rnn.pad_sequence(
                sequences, batch_first=True, padding_value=padding_value
            )

        return {
            "task": self.task,
            # Read (rather than pop) so the caller's example dicts are left intact.
            "raw_text": [feature['raw_text'] for feature in features],
            "input_ids": pad_column('input_ids', self.eos_token_id),
            "attention_mask": pad_column('attention_mask', 0),
            "labels": pad_column(label_key, IGNORE_INDEX),
        }


In [4]:
# Run configuration: edit these values to change the experiment.
task = "history"  # task name passed to get_dataset_by_name and to the data collator
lr = 3e-4  # learning rate
epoch = 5  # number of training epochs
batch_size = 1  # per-device batch size
# NOTE(review): this rebinds `tqdm`, shadowing the `tqdm` imported from tqdm.notebook in the
# imports cell. The name is kept because the TrainingArguments cell reads it
# (`disable_tqdm=not tqdm`); rename both places together if the progress-bar function is needed.
tqdm = True  # whether progress bars are enabled
deepspeed = "./default_offload_opt_param.json"  # DeepSpeed config path (its use is commented out in TrainingArguments)
output_dir = "./outputs/all"  # output directory (was assigned twice with the same value)

# Turn off the `datasets` library progress bars.
disable_progress_bar()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use the GPU when available, else the CPU
model_path = None  # optional checkpoint path; None builds the model without loading a checkpoint

In [5]:
from transformers import BitsAndBytesConfig

# 4-bit quantization settings (the previous comment said 8-bit, but load_in_4bit is what is set).
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
)

# The model is constructed the same way with or without a checkpoint; only weight loading differs.
config = LlamaCLConfig()
model = ScalableLM(config, quantization_config=quantization_config)
if model_path is not None:
    print("Load checkpoint from: {}".format(model_path))
    # map_location="cpu" avoids depending on the device the checkpoint was saved from;
    # load_state_dict then copies the values into the model's existing parameters.
    # NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
    model.load_state_dict(torch.load(model_path, map_location="cpu"), strict=False)

model.to(device)  # move the model to the selected device

# `device_map` and `quantization_config` are model-loading options and mean nothing for a
# tokenizer, so they are no longer passed here.
tokenizer = AutoTokenizer.from_pretrained(config.model_name)

dataset = get_dataset_by_name(task, tokenizer=tokenizer)
# Restrict the columns returned when indexing the dataset to the ones the collator reads.
dataset.set_format(
    columns=['input_ids', 'attention_mask', 'label', 'raw_text'])

dataset

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]





Dataset({
    features: ['question', 'choices', 'answer', 'input_ids', 'attention_mask', 'label', 'raw_text'],
    num_rows: 85907
})

In [6]:
def find_all_linear_names(model, linear_cls=None):
    """
    Collect the leaf names of the linear layers that LoRA adapters should target.

    :param model: module whose ``named_modules()`` are scanned
    :param linear_cls: layer class (or tuple of classes) to match; defaults to
        ``bnb.nn.Linear4bit`` when omitted, which is the original behaviour
    :return: sorted list of unique leaf module names (the last dotted component of each
        matching module's qualified name), with ``lm_head`` excluded
    """
    if linear_cls is None:
        # Resolved lazily so callers can pass another layer class instead.
        linear_cls = bnb.nn.Linear4bit

    leaf_names = {
        qualified_name.rsplit('.', 1)[-1]
        for qualified_name, module in model.named_modules()
        if isinstance(module, linear_cls)
    }
    # The output head is never used as a LoRA target.
    leaf_names.discard('lm_head')
    # Sorted so the result does not depend on set iteration order between runs.
    return sorted(leaf_names)

In [7]:
def print_trainable_parameters(model, use_4bit=False):
    """
    Prints the number of trainable parameters in the model.

    :param model: module whose ``named_parameters()`` are counted
    :param use_4bit: when True, the trainable count is integer-halved before printing.
        NOTE(review): the halving is kept as-is from the original; confirm it is the
        intended correction for 4-bit models.
    :return: None; one summary line is printed
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params
    if use_4bit:
        trainable_params //= 2

    # A model without parameters would otherwise raise ZeroDivisionError here.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"all params: {all_param:,d} || trainable params: {trainable_params:,d} || trainable%: {trainable_pct}"
    )

In [8]:
def create_peft_config(modules):
    """
    Build the LoRA configuration used to fine-tune the model.

    :param modules: Names of the modules to apply Lora to
    :return: the ``LoraConfig`` built from the fixed settings below
    """
    lora_settings = {
        "r": 16,  # dimension of the updated matrices
        "lora_alpha": 64,  # parameter for scaling
        "target_modules": modules,
        "lora_dropout": 0.1,  # dropout probability for layers
        "bias": "none",
        "task_type": "CAUSAL_LM",
    }
    return LoraConfig(**lora_settings)

In [9]:
# Prepare the quantized model for k-bit training and wrap it with LoRA adapters.
# NOTE(review): `model` is rebound twice in this cell, so re-running it would wrap an
# already-wrapped model — restart the kernel instead of re-running.
# Gradient checkpointing is left disabled (see the commented-out call and the flag below).
# model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)
# Names of the matching linear layers become the LoRA target modules.
modules = find_all_linear_names(model)
peft_config = create_peft_config(modules)
model = get_peft_model(model, peft_config)
# Print information about the percentage of trainable parameters
print_trainable_parameters(model, use_4bit=True)


all params: 3,690,210,560 || trainable params: 19,988,480 || trainable%: 0.5416623164180637


In [10]:
# Collator that pads each batch; input_ids are padded with the tokenizer's EOS token id.
data_collator = DataCollatorWithPadding(
    eos_token_id=tokenizer.eos_token_id,
    task=task)

training_args = TrainingArguments(
    output_dir=output_dir,
    optim='adamw_torch',
    learning_rate=lr,
    # The cosine schedule is set here instead of via `set_lr_scheduler(...)`: that helper also
    # assigns num_train_epochs/max_steps/warmup_steps from its own defaults, which silently
    # replaced `num_train_epochs=epoch` with 3.0.
    lr_scheduler_type="cosine",
    # The string "none" disables reporting integrations; `None` falls back to the default ("all").
    report_to="none",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch,
    weight_decay=0.01,
    save_strategy="epoch",
    warmup_ratio=0.03,
    logging_steps=10,
    remove_unused_columns=True,
    ddp_find_unused_parameters=True,
    gradient_accumulation_steps=8,
    save_total_limit=1,
    # NOTE(review): `tqdm` here is the boolean flag from the configuration cell, not the
    # progress-bar function imported at the top of the notebook.
    disable_tqdm=not tqdm,
    # deepspeed=deepspeed
)

In [11]:
# Assemble the Trainer from the model, training arguments, dataset, tokenizer and collator
# defined in the previous cells.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start training.
# NOTE(review): the output recorded below this cell ends in
# "RuntimeError: CUDA error: device-side assert triggered", raised inside a
# torch.nn.parallel replica and surfacing at a `print(attn_output)` call in models/llama.py.
# Because CUDA errors are reported asynchronously, that line is probably not the real origin —
# re-run with CUDA_LAUNCH_BLOCKING=1 (as the message suggests) to locate it.
trainer.train()

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


[2024-09-23 23:08:22,092] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/hjjang/miniconda3/envs/research/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `log2f@GLIBC_2.2.5'
/home/hjjang/miniconda3/envs/research/compiler_compat/ld: /usr/lib/x86_64-linux-gnu//libstdc++.so.6: undefined reference to `fesetround@GLIBC_2.2.5'
/home/hjjang/miniconda3/envs/research/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlopen'
/home/hjjang/miniconda3/envs/research/compiler_compat/ld: /usr/lib/x86_64-linux-gnu//libstdc++.so.6: undefined reference to `fegetround@GLIBC_2.2.5'
/home/hjjang/miniconda3/envs/research/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlclose'
/home/hjjang/miniconda3/envs/research/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlerror'
/home/hjjang/miniconda3/envs/research/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `dlsym'
collect2: error: ld returned 1 exit status
../aten/src/ATen/

RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/hjjang/miniconda3/envs/research/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 84, in _worker
    output = module(*input, **kwargs)
  File "/home/hjjang/miniconda3/envs/research/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/hjjang/miniconda3/envs/research/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/hjjang/miniconda3/envs/research/lib/python3.10/site-packages/peft/peft_model.py", line 1577, in forward
    return self.base_model(
  File "/home/hjjang/miniconda3/envs/research/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/hjjang/miniconda3/envs/research/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/hjjang/miniconda3/envs/research/lib/python3.10/site-packages/peft/tuners/tuners_utils.py", line 188, in forward
    return self.model.forward(*args, **kwargs)
  File "/home/hjjang/SLM/SLM-llama/models/slm.py", line 87, in forward
    outputs = self.model(
  File "/home/hjjang/miniconda3/envs/research/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/hjjang/miniconda3/envs/research/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/hjjang/miniconda3/envs/research/lib/python3.10/site-packages/accelerate/hooks.py", line 170, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/home/hjjang/SLM/SLM-llama/models/llama.py", line 906, in forward
    outputs = self.model(
  File "/home/hjjang/miniconda3/envs/research/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/hjjang/miniconda3/envs/research/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/hjjang/miniconda3/envs/research/lib/python3.10/site-packages/accelerate/hooks.py", line 170, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/home/hjjang/SLM/SLM-llama/models/llama.py", line 789, in forward
    layer_outputs = decoder_layer(
  File "/home/hjjang/miniconda3/envs/research/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/hjjang/miniconda3/envs/research/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/hjjang/miniconda3/envs/research/lib/python3.10/site-packages/accelerate/hooks.py", line 170, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/home/hjjang/SLM/SLM-llama/models/llama.py", line 495, in forward
    hidden_states, self_attn_weights, present_key_value = self.self_attn(
  File "/home/hjjang/miniconda3/envs/research/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/hjjang/miniconda3/envs/research/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1562, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/hjjang/miniconda3/envs/research/lib/python3.10/site-packages/accelerate/hooks.py", line 170, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/home/hjjang/SLM/SLM-llama/models/llama.py", line 441, in forward
    print(attn_output)
  File "/home/hjjang/miniconda3/envs/research/lib/python3.10/site-packages/torch/_tensor.py", line 463, in __repr__
    return torch._tensor_str._str(self, tensor_contents=tensor_contents)
  File "/home/hjjang/miniconda3/envs/research/lib/python3.10/site-packages/torch/_tensor_str.py", line 698, in _str
    return _str_intern(self, tensor_contents=tensor_contents)
  File "/home/hjjang/miniconda3/envs/research/lib/python3.10/site-packages/torch/_tensor_str.py", line 618, in _str_intern
    tensor_str = _tensor_str(self, indent)
  File "/home/hjjang/miniconda3/envs/research/lib/python3.10/site-packages/torch/_tensor_str.py", line 350, in _tensor_str
    formatter = _Formatter(get_summarized_data(self) if summarize else self)
  File "/home/hjjang/miniconda3/envs/research/lib/python3.10/site-packages/torch/_tensor_str.py", line 138, in __init__
    nonzero_finite_vals = torch.masked_select(
RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

