# sLLM Fine-tuning
- (https://huggingface.co/HuggingFaceTB/SmolLM2-135M-Instruct)



### Installation

In [1]:
!pip install transformers trl peft accelerate datasets wandb bitsandbytes --upgrade -qqq

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.1/44.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m116.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.9/310.9 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m35.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
!pip install vllm triton



### Mount + libraries

- 세션 초기화 할때 마다 여기 셀들을 실행해주세요.

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
path = '/content/drive/MyDrive/hackertone/'

In [4]:
import torch
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer
import pandas as pd
import numpy as np
import json
import re
from vllm import LLM, SamplingParams
import triton

## Dataset 전처리

In [5]:
def prompts(example):
    prompt_list = []
    for i in range(len(example['instruction'])):
        prompt_list.append(
f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>{example['instruction'][i]}<|eot_id|><|start_header_id|>assistant<|end_header_id|>{example['output'][i]}<|eot_id|>"""
        )
    return prompt_list

In [6]:
with open(path + "craw.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# 태그 파싱
pattern = r'<STYLE (style="([^"]+)")?(primary="([^"]+)")?(tone="([^"]+)")?>([^<]+)</STYLE>'

processed_data = []
result = {'instruction': [], 'output': []}
for data in data:
    match = re.match(pattern, data)
    if match:
        if match.group(2):
            result["instruction"].append(f"{match.group(2)} 스타일의 문장을 만들어줘.")
        if match.group(4):
            result["instruction"].append(f"{match.group(4)} 어체의 문장을 만들어줘.")
        if match.group(6):
            result["instruction"].append(f"{match.group(6)} 어체의 문장을 만들어줘.")
        result["output"].append(f"{match.group(7)}")

processed_data = result
df = pd.DataFrame(processed_data)
from datasets import Dataset
dataset = Dataset.from_pandas(df)

In [7]:
len(dataset)

6081

In [None]:
from datasets import Dataset

def prompts_json(json_file_path):
  with open(json_file_path, "r", encoding="utf-8") as f:
      data = json.load(f)

  # 태그 파싱
  pattern = r'<STYLE (style="([^"]+)")?(primary="([^"]+)")?(tone="([^"]+)")?>([^<]+)</STYLE>'

  result = {'instruction': []}
  for data in data:
      match = re.match(pattern, data)
      if match:
          result["instruction"].append(f"{match.group(7)}")
  return result

In [None]:
prompts = prompts_json(path + "craw.json")

In [None]:
instruction = '안녕, 성찰에 대해 글써줘.'

In [None]:
# 추론 수행
def generate_responses(instruction_msg, model, tokenizer):
  instruction = f'{instruction_msg}'

  messages = [
      {
        "role": "system",
        "content": '''당신은 훌륭한 글 작성 도우미입니다.
        사용자의 스타일(userStyle2)에 맞춰서 입력된 글을 다시 작성해주세요.
        You are great writing helper.
        Fit on user's wrting style(userStyle2) and write articles according to it.
        Write in Korean'''
      },
      {
        "role": "user",
        "content": instruction
      },
  ]

  prompt_message = tokenizer.apply_chat_template(
          messages,
          tokenize=False,
          add_generation_prompt=True,
  )

  # Tokenize the prompt_message and convert to a tensor
  input_ids = tokenizer(prompt_message, return_tensors="pt").input_ids.to(model.device)

  eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
  outputs = model.generate(
                input_ids,
                max_length=300,
                num_return_sequences=1,
                do_sample=True,
                top_k=50,
                top_p=0.95,
                temperature=0.7,
            )
  result = []
  generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
  result.append(generated_text)
  return result

# 추론 결과 생성 및 출력
response = generate_responses(prompts['instruction'][0], model, tokenizer)
print(response)

["system\n당신은 훌륭한 글 작성 도우미입니다.\n        사용자의 스타일(userStyle2)에 맞춰서 입력된 글을 다시 작성해주세요.\n        You are great writing helper.\n        Fit on user's wrting style(userStyle2) and write articles according to it.\n        Write in Korean\nuser\n안녕하십니까 형님들 오랜만에 돌아왔습니다.  코딩테스트, 다들 어떻게 준비하고 계십니까?\nassistant\n계십니까 준비하세요. 코딩"]


## Modeling


### 모델 불러오기

In [None]:
model_id = 'HuggingFaceTB/SmolLM2-135M-Instruct'

tokenizer = AutoTokenizer.from_pretrained(model_id, **{"low_cpu_mem_usage": True})
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


###LoRA

In [None]:
lora_config = LoraConfig(
    r=8,
    lora_alpha = 16,
    lora_dropout = 0.1,
    target_modules=['down_proj','o_proj','k_proj','q_proj','gate_proj','up_proj','v_proj'],
    bias="none",
    task_type="CAUSAL_LM",
    init_lora_weights="gaussian"
)

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 2,442,240 || all params: 136,957,248 || trainable%: 1.7832


### Tokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'

In [None]:
train_data = dataset

### Fine-Tuning

In [None]:
lora_trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    max_seq_length=256,
    tokenizer=tokenizer,
    args=TrainingArguments(
        output_dir="outputs",
        num_train_epochs = 2,
        max_steps=1000,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=2,
        optim="paged_adamw_8bit",
        warmup_steps=1,
        learning_rate=1e-5,
        fp16=True,
        logging_steps=50,
        push_to_hub=False,
        report_to='none',
    ),
    peft_config=lora_config,
    formatting_func=prompts,
)

lora_trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/6081 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
50,1.7479
100,1.5335
150,1.3585
200,1.191
250,1.0359
300,0.9576
350,0.9409
400,0.9174
450,0.9055
500,0.885


TrainOutput(global_step=1000, training_loss=1.0106663093566894, metrics={'train_runtime': 749.0924, 'train_samples_per_second': 10.68, 'train_steps_per_second': 1.335, 'total_flos': 1327659096775680.0, 'train_loss': 1.0106663093566894, 'epoch': 1.3149243918474687})

In [None]:
lora_model = "lora_model"
lora_trainer.model.save_pretrained(lora_model)
lora_trainer.tokenizer.save_pretrained(lora_model)

model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto')
model = PeftModel.from_pretrained(model, lora_model, device_map='auto')

model = model.merge_and_unload()
model.save_pretrained('SmolLM2-135M-ERAI')
tokenizer.save_pretrained('SmolLM2-135M-ERAI')

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


('SmolLM2-135M-ERAI-ver-test/tokenizer_config.json',
 'SmolLM2-135M-ERAI-ver-test/special_tokens_map.json',
 'SmolLM2-135M-ERAI-ver-test/vocab.json',
 'SmolLM2-135M-ERAI-ver-test/merges.txt',
 'SmolLM2-135M-ERAI-ver-test/added_tokens.json',
 'SmolLM2-135M-ERAI-ver-test/tokenizer.json')

## 추론
- 실행 전에 세션 초기화 해주세요.
- 이후 위에 Mount+Libraries 셀을 실행해주세요.

In [None]:
base_model = 'SmolLM2-135M-ERAI'
llm = LLM(model=base_model,
          # max_model_len=55000,
          gpu_memory_utilization=0.8)

tokenizer = AutoTokenizer.from_pretrained(base_model)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'

INFO 12-01 01:09:13 config.py:1861] Downcasting torch.float32 to torch.float16.
INFO 12-01 01:09:21 config.py:350] This model supports multiple tasks: {'generate', 'embedding'}. Defaulting to 'generate'.
INFO 12-01 01:09:21 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='SmolLM2-135M-ERAI-ver-test', speculative_config=None, tokenizer='SmolLM2-135M-ERAI-ver-test', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 12-01 01:09:22 model_runner.py:1077] Loading model weights took 0.2551 GB
INFO 12-01 01:09:23 worker.py:232] Memory profiling results: total_gpu_memory=22.17GiB initial_memory_usage=2.10GiB peak_torch_memory=2.28GiB memory_usage_post_profile=2.10GiB non_torch_memory=0.27GiB kv_cache_size=15.18GiB gpu_memory_utilization=0.80
INFO 12-01 01:09:23 gpu_executor.py:113] # GPU blocks: 44224, # CPU blocks: 11650
INFO 12-01 01:09:23 gpu_executor.py:117] Maximum concurrency for 8192 tokens per request: 86.38x
INFO 12-01 01:09:27 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 12-01 01:09:27 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO

In [None]:
instruction = '''프론트엔드 개발에 대해서 설명해줘'''

messages = [
    {
      "role": "system",
      "content": '''당신은 훌륭한 글 작성 도우미입니다.
       사용자의 스타일(userStyle2)에 맞춰서 입력된 글을 다시 작성해주세요.
       You are great writing helper.
       Fit on user's wrting style(userStyle2) and write articles according to it.
       Write in Korean'''
    },
    {
      "role": "user",
      "content": instruction
    },
]

prompt_message = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
)

eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]

outputs = llm.generate(prompt_message,
                       SamplingParams(stop_token_ids=eos_token_id, temperature=0.8, top_p=0.95,max_tokens=1024))

for output in outputs:
    propt = output.prompt
    generated_text = output.outputs[0].text
    print(generated_text)

Processed prompts: 100%|██████████| 1/1 [00:03<00:00,  3.41s/it, est. speed input: 131.90 toks/s, output: 300.81 toks/s]

그러나 모든 디자인 자세에 대한 개발할 수 있는 과정을 실행하는 작업을 어드마케이션이 드러나지 않습니다.
디자인 이러한 분야입니다. 
웹 애플리케이션에 대해 사용자가 많은 스타일을 다시 작성해 작성해주세요. 사용자가 다시 내는 작업을 보고서 받습니다.
사이트 서버를 사용하여 애플리케이션이 진실마를 들었습니다. 오브려는 분야는 프론트엔드를 사용하여 인터랙션을 상들고 내는 작업을 구현해주세요.
스타일을 보고서 받습니다. 오브려는 분야입니다.
사용자가 다시 내는 작업을 보고서 받습니다. 위에서 애플리케이션을 보고서 받습니다.
내는 작업을 보고서 받습니다. 또 무력을 줄 수 있습니다. 방법설과 스타일을 만들 수 있습니다.
애플리케이션을 종료하는 상황이 단축되게 되는데, 인터랙션 인식에서 특정의 사용자가 직접 상호작용하기 위해 많은 작업을 다시 들었습니다.
직접 상호작용하는 분야입니다. 제자를 성공적으로 보고서 받습니다. 상호작용과 더분에서 내는 작업을 받습니다.






## ONNX 변환

In [None]:
!pip install onnx onnxruntime

Collecting onnx
  Downloading onnx-1.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting onnxruntime
  Downloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnx-1.17.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m92.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m120.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# 모델과 토크나이저 로드
checkpoint = "SmolLM2-135M-ERAI-ver-test"
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

# 예제 입력 생성 (dummy input)
messages = [{"role": "user", "content": "translate 'I have a hat' to French"}]
input_text = tokenizer.apply_chat_template(messages, tokenize=True)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
inputs = inputs.to(torch.int)

# ONNX 파일로 변환
onnx_path = "/content/drive/MyDrive/hackertone/smollm2_135m_erai.onnx"

# Export 모델
torch.onnx.export(
    model,
    args=(inputs,),  # 모델에 전달될 입력
    f=onnx_path,  # 출력될 ONNX 파일 경로
    export_params=True,  # 학습된 파라미터 저장
    opset_version=17,  # ONNX opset version (최신 버전 사용 권장)
    input_names=["input_ids"],  # 입력 이름
    output_names=["output"],  # 출력 이름
    dynamic_axes={  # 동적 크기 설정
        "input_ids": {0: "batch_size", 1: "sequence_length"},
        "output": {0: "batch_size", 1: "sequence_length"}
    },
    verbose=True
)

print(f"ONNX 모델이 '{onnx_path}'에 저장되었습니다.")


AttributeError: module 'torch' has no attribute 'q4f16'