In [1]:
import warnings

# Install a blanket "ignore" filter so Python warnings raised by later cells
# are not shown in the notebook output.
warnings.filterwarnings(action='ignore')

## Make Datasets

In [2]:
%%capture
! pip install datasets

In [3]:
import torch
from datasets import load_dataset

def make_prompt(ddl, question, query=''):
  """Assemble the text-to-SQL prompt for one example.

  The prompt is a fixed Korean instruction line followed by three sections,
  `### DDL:`, `### Question:` and `### SQL:`, each holding the matching
  argument.

  Args:
    ddl: Table definition text placed under the `### DDL:` header.
    question: Natural-language question placed under `### Question:`.
    query: SQL text placed under `### SQL:`. Defaults to an empty string,
      which leaves the prompt ending right after the `### SQL:` header line.

  Returns:
    The assembled prompt string.
  """
  # Adjacent literals are concatenated by the parser; only the three
  # placeholder pieces need to be f-strings.
  return (
      "당신은 SQL을 생성하는 SQL 봇입니다. DDL의 테이블을 활용한 Question을 해결할 수 있는 SQL 쿼리를 생성하세요.\n"
      "\n"
      "### DDL:\n"
      f"{ddl}\n"
      "\n"
      "### Question:\n"
      f"{question}\n"
      "\n"
      "### SQL:\n"
      f"{query}"
  )

# Load the `test` split of the `origin` configuration and convert it to a
# pandas DataFrame so a prompt column can be added.
dataset = load_dataset("shangrilar/ko_text2sql", "origin")['test']
dataset = dataset.to_pandas()

# Build every prompt in one pass and assign the column once, instead of
# writing cell-by-cell with iterrows() + .loc. The query argument is left at
# its default, so each prompt ends at the "### SQL:" header.
dataset['prompt'] = [
    make_prompt(ddl, question)
    for ddl, question in zip(dataset['context'], dataset['question'])
]

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112 entries, 0 to 111
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   db_id     112 non-null    int64 
 1   context   112 non-null    object
 2   question  112 non-null    object
 3   answer    112 non-null    object
 4   prompt    112 non-null    object
dtypes: int64(1), object(4)
memory usage: 4.5+ KB


In [4]:
! nvidia-smi

Fri Jan 31 20:44:45 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   30C    P0             44W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

## 비교를 위한 HF Pipeline

In [5]:
%%capture
! pip install -U transformers bitsandbytes

# 세션 다시 시작

In [6]:
# Imported here as well so this cell still runs after the session restart
# that follows the package upgrade.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

model_id = "shangrilar/yi-ko-6b-text2sql"

# Passing `load_in_4bit=` directly to from_pretrained is deprecated; the
# quantization settings go into a BitsAndBytesConfig instead.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Text-generation pipeline used as the baseline for the timing comparison.
hf_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:  39%|###8      | 1.93G/4.96G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/9.74k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.28M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

Device set to use cuda:0


In [7]:
! nvidia-smi

Fri Jan 31 20:49:09 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   30C    P0             50W /  400W |    4819MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

## 시간 측정 (HF Pipeline)

In [8]:
import time

# Materialise the prompt list once so the timed region covers only generation.
prompts = dataset['prompt'].tolist()

# Time one full pass over all prompts for each pipeline batch size.
for batch_size in [1, 2, 4, 8, 16, 32]:
  # perf_counter() is the clock intended for measuring elapsed intervals;
  # time.time() is wall-clock time and can be adjusted while the loop runs.
  start_time = time.perf_counter()
  hf_pipeline(prompts, max_new_tokens=128, batch_size=batch_size)
  print(f'{batch_size}: {time.perf_counter() - start_time}')

1: 172.6132423877716
2: 167.4520239830017
4: 107.70461058616638
8: 67.56611514091492
16: 39.046319007873535
32: 26.62624740600586


In [9]:
! nvidia-smi

Fri Jan 31 20:58:50 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   45C    P0            122W /  400W |    9589MiB /  40960MiB |     66%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

## vLLM

In [10]:
%%capture
! pip install vllm

In [11]:
import gc
import torch

# Drop the notebook's references to the HF model, tokenizer and pipeline
# before the vLLM engine is created.
del model, tokenizer, hf_pipeline

# Collect the now-unreferenced objects, then ask PyTorch to release its
# cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [12]:
! nvidia-smi

Fri Jan 31 20:59:19 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   34C    P0             52W /  400W |    4837MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [13]:
import torch
from vllm import LLM, SamplingParams

# Checkpoint served by vLLM for the comparison run.
model_id = "shangrilar/yi-ko-6b-text2sql"

# float16 weights, with the model length limited to 1024 tokens.
llm = LLM(
    model=model_id,
    dtype=torch.float16,
    max_model_len=1024,
)

INFO 01-31 20:59:20 __init__.py:183] Automatically detected platform cuda.
INFO 01-31 20:59:34 config.py:520] This model supports multiple tasks: {'generate', 'reward', 'classify', 'embed', 'score'}. Defaulting to 'generate'.
INFO 01-31 20:59:34 llm_engine.py:232] Initializing an LLM engine (v0.7.0) with config: model='shangrilar/yi-ko-6b-text2sql', speculative_config=None, tokenizer='shangrilar/yi-ko-6b-text2sql', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_exe

Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


INFO 01-31 20:59:41 model_runner.py:1115] Loading model weights took 11.5128 GB
INFO 01-31 20:59:42 worker.py:266] Memory profiling takes 0.58 seconds
INFO 01-31 20:59:42 worker.py:266] the current vLLM instance can use total_gpu_memory (39.56GiB) x gpu_memory_utilization (0.90) = 35.60GiB
INFO 01-31 20:59:42 worker.py:266] model weights take 11.51GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 0.73GiB; the rest of the memory reserved for KV Cache is 23.36GiB.
INFO 01-31 20:59:42 executor_base.py:108] # CUDA blocks: 23919, # CPU blocks: 4096
INFO 01-31 20:59:42 executor_base.py:113] Maximum concurrency for 1024 tokens per request: 373.73x
INFO 01-31 20:59:44 model_runner.py:1430] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:27<00:00,  1.27it/s]

INFO 01-31 21:00:12 model_runner.py:1558] Graph capturing finished in 27 secs, took 0.24 GiB
INFO 01-31 21:00:12 llm_engine.py:429] init engine (profile, create kv cache, warmup model) took 30.93 seconds





In [14]:
! nvidia-smi

Fri Jan 31 21:17:08 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   30C    P0             50W /  400W |   36519MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [15]:
import time

# Loop-invariant inputs are built once, outside the timed region.
prompts = dataset['prompt'].tolist()
sampling_params = SamplingParams(temperature=1, top_p=1, max_tokens=128)

# Time one full pass over all prompts for each max_num_seqs setting.
for max_num_seqs in [1, 2, 4, 8, 16, 32]:
  # Reconfigure the already-built engine in place before starting the clock.
  # NOTE(review): presumably this caps how many sequences the scheduler runs
  # together; it is an engine-internal attribute, so confirm it still exists
  # when changing the vLLM version.
  llm.llm_engine.scheduler_config.max_num_seqs = max_num_seqs

  # perf_counter() is the clock intended for measuring elapsed intervals.
  start_time = time.perf_counter()
  outputs = llm.generate(prompts, sampling_params)
  print(f'{max_num_seqs}: {time.perf_counter() - start_time}')

Processed prompts: 100%|██████████| 112/112 [00:51<00:00,  2.16it/s, est. speed input: 425.62 toks/s, output: 83.54 toks/s]


1: 51.83793663978577


Processed prompts: 100%|██████████| 112/112 [00:27<00:00,  4.05it/s, est. speed input: 797.25 toks/s, output: 158.90 toks/s]


2: 27.70698094367981


Processed prompts: 100%|██████████| 112/112 [00:15<00:00,  7.43it/s, est. speed input: 1460.85 toks/s, output: 288.58 toks/s]


4: 15.15408730506897


Processed prompts: 100%|██████████| 112/112 [00:09<00:00, 12.35it/s, est. speed input: 2430.29 toks/s, output: 470.37 toks/s]


8: 9.139188289642334


Processed prompts: 100%|██████████| 112/112 [00:06<00:00, 17.40it/s, est. speed input: 3423.82 toks/s, output: 704.31 toks/s]


16: 6.507745027542114


Processed prompts: 100%|██████████| 112/112 [00:04<00:00, 23.40it/s, est. speed input: 4603.60 toks/s, output: 909.39 toks/s]

32: 4.8575615882873535



