In [1]:
!pip install vllm==0.8.5
!pip install datasets

Collecting vllm==0.8.5
  Downloading vllm-0.8.5-cp38-abi3-manylinux1_x86_64.whl.metadata (14 kB)
Collecting cachetools (from vllm==0.8.5)
  Downloading cachetools-6.0.0-py3-none-any.whl.metadata (5.4 kB)
Collecting sentencepiece (from vllm==0.8.5)
  Downloading sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting tqdm (from vllm==0.8.5)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting blake3 (from vllm==0.8.5)
  Downloading blake3-1.0.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting py-cpuinfo (from vllm==0.8.5)
  Downloading py_cpuinfo-9.0.0-py3-none-any.whl.metadata (794 bytes)
Collecting transformers>=4.51.1 (from vllm==0.8.5)
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub>=0.30.0 (from huggingface-hub[hf_xet]>=0.30.0->vllm==0.8.5)
  Downloading huggingface_hub-0.32.4-py3-none-any.whl.metadata (14 kB)
Collecting tokeniz

In [1]:
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

import os

# Select the V0 engine.
# NOTE(review): presumably required because later cells reach into
# `llm.llm_engine.model_executor.driver_worker` to attach hooks - confirm.
os.environ['VLLM_USE_V1'] = '0'

model_name = "Qwen/Qwen3-30B-A3B-FP8"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pick a batch size that avoids preemption as far as possible
# (original note: roughly 96 for 48 GB of VRAM?).
batch_size = 384
max_prompt_tokens = 2048  # the longest MMMLU prompt is 1500+ tokens
max_think_tokens = 1024
max_nonthink_tokens = 1024
temperature = 0.6
top_p = 0.95
num_gpus = 2

# One SamplingParams per generation mode; only the token budget differs.
sampling_params = {
    "think": SamplingParams(temperature=temperature, top_p=top_p,
                            max_tokens=max_think_tokens),
    "nonthink": SamplingParams(temperature=temperature, top_p=top_p,
                               max_tokens=max_nonthink_tokens),
}

# The context window must fit the prompt plus the *larger* of the two
# generation budgets, so changing either budget alone stays safe.
llm = LLM(model=model_name,
          max_model_len=max_prompt_tokens + max(max_think_tokens, max_nonthink_tokens),
          max_num_seqs=batch_size,
          tensor_parallel_size=num_gpus,
          gpu_memory_utilization=0.97,
          enforce_eager=True)

INFO 06-05 21:29:51 [__init__.py:239] Automatically detected platform cuda.
INFO 06-05 21:30:01 [config.py:717] This model supports multiple tasks: {'reward', 'classify', 'generate', 'score', 'embed'}. Defaulting to 'generate'.
INFO 06-05 21:30:02 [config.py:1770] Defaulting to use mp for distributed inference
INFO 06-05 21:30:02 [llm_engine.py:240] Initializing a V0 LLM engine (v0.8.5) with config: model='Qwen/Qwen3-30B-A3B-FP8', speculative_config=None, tokenizer='Qwen/Qwen3-30B-A3B-FP8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=3072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=fp8, enforce_eager=True, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning_backend=None), observability_config=Ob

model-00004-of-00007.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

[1;36m(VllmWorkerProcess pid=1957)[0;0m INFO 06-05 21:30:54 [weight_utils.py:281] Time spent downloading weights for Qwen/Qwen3-30B-A3B-FP8: 50.416948 seconds


model.safetensors.index.json:   0%|          | 0.00/3.57M [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/7 [00:00<?, ?it/s]


[1;36m(VllmWorkerProcess pid=1957)[0;0m INFO 06-05 21:31:05 [loader.py:458] Loading weights took 9.91 seconds
INFO 06-05 21:31:05 [loader.py:458] Loading weights took 9.88 seconds
[1;36m(VllmWorkerProcess pid=1957)[0;0m INFO 06-05 21:31:05 [model_runner.py:1140] Model loading took 14.5406 GiB and 61.398903 seconds
INFO 06-05 21:31:05 [model_runner.py:1140] Model loading took 14.5406 GiB and 61.258887 seconds
[1;36m(VllmWorkerProcess pid=1957)[0;0m INFO 06-05 21:31:11 [worker.py:287] Memory profiling takes 5.53 seconds
[1;36m(VllmWorkerProcess pid=1957)[0;0m INFO 06-05 21:31:11 [worker.py:287] the current vLLM instance can use total_gpu_memory (44.52GiB) x gpu_memory_utilization (0.97) = 43.18GiB
[1;36m(VllmWorkerProcess pid=1957)[0;0m INFO 06-05 21:31:11 [worker.py:287] model weights take 14.54GiB; non_torch_memory takes 0.39GiB; PyTorch activation peak memory takes 0.25GiB; the rest of the memory reserved for KV Cache is 28.00GiB.
INFO 06-05 21:31:11 [worker.py:287] Memory p

In [2]:
from datasets import load_dataset

# Korean MMMLU test split (config "KO_KR") and the English MMLU test split
# (config "all"); both are consumed row by row when building prompts.
dataset_ko = load_dataset(path="openai/MMMLU", name="KO_KR", split="test")
dataset_en = load_dataset(path="cais/mmlu", name="all", split="test")

def make_query_ko(tokenizer, data, think):
    """Render one Korean MMMLU row as a single-turn chat prompt.

    Args:
        tokenizer: object exposing `apply_chat_template`.
        data: mapping with the keys 'Question', 'A', 'B', 'C' and 'D'.
        think: forwarded to the chat template as `enable_thinking`.

    Returns:
        Whatever `apply_chat_template` returns for `tokenize=False` with the
        generation prompt appended.
    """
    question_block = f"{data['Question']}\n(A) {data['A']} (B) {data['B']} (C) {data['C']} (D) {data['D']}"
    messages = [{"role": "user", "content": question_block}]
    return tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
        enable_thinking=think,
    )
    
def make_query_en(tokenizer, data, think):
    """Render one English MMLU row as a single-turn chat prompt.

    Args:
        tokenizer: object exposing `apply_chat_template`.
        data: mapping with 'question' and an indexable 'choices' holding at
            least four options.
        think: forwarded to the chat template as `enable_thinking`.

    Returns:
        Whatever `apply_chat_template` returns for `tokenize=False` with the
        generation prompt appended.
    """
    options = data['choices']
    question_block = f"{data['question']}\n(A) {options[0]} (B) {options[1]} (C) {options[2]} (D) {options[3]}"
    messages = [{"role": "user", "content": question_block}]
    return tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
        enable_thinking=think,
    )
    

print("Preprocessing")

# Pair each language with its dataset and the prompt builder for its schema.
_sources = {"ko": (dataset_ko, make_query_ko), "en": (dataset_en, make_query_en)}

# prompts[lang][mode] -> list of chat-formatted prompt strings, one per row.
prompts = {
    code: {
        mode: [build(tokenizer, row, think=(mode == "think")) for row in rows]
        for mode in ("think", "nonthink")
    }
    for code, (rows, build) in _sources.items()
}

# NOTE(review): subjects are read from the Korean dataset only; reusing them
# for English rows assumes both test splits share the same row order - confirm.
subjects = dataset_ko['Subject']

print(prompts['ko']['nonthink'][0])

README.md:   0%|          | 0.00/3.01k [00:00<?, ?B/s]

mmlu_KO-KR.csv:   0%|          | 0.00/7.67M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/14042 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/53.2k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/138k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/3.50M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/408k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/76.5k [00:00<?, ?B/s]

auxiliary_train-00000-of-00001.parquet:   0%|          | 0.00/47.5M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/14042 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1531 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/285 [00:00<?, ? examples/s]

Generating auxiliary_train split:   0%|          | 0/99842 [00:00<?, ? examples/s]

Preprocessing
<|im_start|>user
Q에 대해 주어진 확대체 Q(sqrt(2), sqrt(3), sqrt(18))의 차수를 구하십시오.
(A) 0 (B) 4 (C) 2 (D) 6<|im_end|>
<|im_start|>assistant
<think>

</think>




In [None]:
import torch
from vllm.model_executor.models.qwen3_moe import Qwen3MoeSparseMoeBlock
from tqdm.auto import tqdm

# Hugging Face config object of the loaded model, as exposed by the engine.
model_config = llm.llm_engine.model_config.hf_config

# Routing-related sizes read once from the config and reused by later cells.
num_experts_per_tok, num_experts, num_layers = (
    model_config.num_experts_per_tok,
    model_config.num_experts,
    model_config.num_hidden_layers,
)

def make_hook(layer_id, topk = 8):
    """Build a forward hook that records top-`topk` indices for one layer.

    The returned hook takes the first element of the hooked module's output,
    selects the `topk` largest entries along the last dimension and appends
    their indices (moved to CPU) to the module-level list
    `batch_topk_expert_ids[layer_id]`.

    Args:
        layer_id: index into `batch_topk_expert_ids` to append to.
        topk: number of entries kept per row.
    """
    def hook(_module, _inputs, outputs):
        # `batch_topk_expert_ids` is looked up on every call, so rebinding the
        # global between runs redirects where the indices are collected.
        # assumes outputs[0] is the (tokens, experts) score tensor - TODO confirm
        with torch.no_grad():
            _, top_indices = torch.topk(outputs[0], k=topk, dim=-1)
            batch_topk_expert_ids[layer_id].append(top_indices.detach().cpu())

    return hook

def batch_topk_aligning(tokenizer, prompts, outputs, topk_experts):
    """Map the captured per-forward expert indices back onto each prompt.

    `topk_experts[layer]` is the list of tensors appended by the hooks, one
    tensor of shape (rows, K) per forward pass. The layout relied on here is:

    * Prefill passes come first. Prompt tokens are flattened in prompt order
      and cut into chunks of arbitrary size, each chunk ending on a prompt
      boundary.
    * Decode passes follow, one per generation step. Row r of a decode pass
      belongs to the r-th sequence still running at that step, so when a
      sequence finishes, every later sequence moves up by one row.

    The first output token is produced by the prefill pass, so a sequence with
    n output tokens has n - 1 decode rows; `output_expert_ids` therefore has
    n - 1 entries.

    Args:
        tokenizer: callable returning a mapping with 'input_ids' for a prompt.
        prompts: prompt strings, in the order passed to generation.
        outputs: generation results in the same order; each exposes
            `.outputs[0].text` and `.outputs[0].token_ids`.
        topk_experts: per-layer lists of captured index tensors.

    Returns:
        One dict per prompt with the keys 'prompt', 'prompt_token_ids',
        'prompt_expert_ids' ([T, L, K]), 'output', 'output_token_ids' and
        'output_expert_ids' ([T - 1, L, K]).

    Raises:
        RuntimeError: if the prompt token counts do not line up with the
            captured prefill chunks (preemption or another mismatch), or a
            sequence needs more decode steps than were captured.
    """
    first_layer = topk_experts[0]
    chunk_idx, offset = 0, 0

    # Prompt (prefill) tokens.
    result = []
    for p in prompts:
        prompt_token_ids = tokenizer(p)['input_ids']
        num_tokens = len(prompt_token_ids)
        end = offset + num_tokens

        # Preemption occurred, or something else went wrong.
        if chunk_idx >= len(first_layer) or end > len(first_layer[chunk_idx]):
            raise RuntimeError(
                f"Prompt token mismatch: ({chunk_idx}, {offset}) to ({chunk_idx}, {end}), {p}")

        per_layer = [layer_expert[chunk_idx][offset:end] for layer_expert in topk_experts]
        expert_ids = torch.stack(per_layer).transpose(0, 1).tolist()  # [L, T, K] -> [T, L, K]

        result.append({"prompt": p,
                       "prompt_token_ids": prompt_token_ids,
                       "prompt_expert_ids": expert_ids})

        offset = end
        if offset == len(first_layer[chunk_idx]):
            offset = 0
            chunk_idx += 1

    # A partly consumed chunk would otherwise be misread as the first decode step.
    if offset != 0:
        raise RuntimeError(
            f"Prompt token mismatch: prefill chunk {chunk_idx} has "
            f"{len(first_layer[chunk_idx]) - offset} rows left after the last prompt")

    # chunk_idx now points at the first decode pass.
    gen_base_idx = chunk_idx
    num_steps = len(first_layer) - gen_base_idx

    # next_row[step] is the row the next still-running sequence occupies at
    # that decode step. Sized from the captured data, not a global budget.
    next_row = [0] * num_steps

    print(gen_base_idx)

    for seq_idx, o in tqdm(enumerate(outputs), total = len(outputs)):
        text = o.outputs[0].text
        token_ids = o.outputs[0].token_ids

        expert_ids = []
        for step in range(len(token_ids) - 1):
            if step >= num_steps:
                raise RuntimeError(
                    f"Output {seq_idx} needs decode step {step} but only {num_steps} were captured")
            row = next_row[step]
            per_layer = [layer_expert[gen_base_idx + step][row] for layer_expert in topk_experts]
            next_row[step] += 1
            expert_ids.append(torch.stack(per_layer))  # [L, K]
        # torch.stack rejects an empty list (single-token outputs have no decode rows).
        expert_ids = torch.stack(expert_ids).tolist() if expert_ids else []  # [T, L, K]

        result[seq_idx]["output"] = text
        result[seq_idx]["output_token_ids"] = token_ids
        result[seq_idx]["output_expert_ids"] = expert_ids

    return result

def preemption_check(prompts, topk_experts):
    """Report whether the captured passes look like a preemption happened.

    Walks the prefill chunks of layer 0 with the prompt token counts to find
    the first decode pass, then checks that the row count never grows from one
    decode pass to the next. During generation the batch can only stay the
    same size or shrink (sequences finish); a growing pass means prompts were
    recomputed after a preemption.

    Known blind spot (from the original author): a recompute pass whose token
    count happens not to exceed the previous pass's row count goes unnoticed.
    This is unlikely, since preemption usually happens after hundreds to
    thousands of generated tokens.

    Args:
        prompts: prompt strings, in the order passed to generation. They are
            tokenized with the module-level `tokenizer`.
        topk_experts: per-layer lists of captured index tensors; only layer 0
            is inspected.

    Returns:
        True if a growing decode pass was found (its index and both row counts
        are printed), otherwise False.

    Raises:
        RuntimeError: if the prompt token counts do not line up with the
            captured prefill chunks.
    """
    first_layer = topk_experts[0]
    chunk_idx, offset = 0, 0

    for p in prompts:
        num_tokens = len(tokenizer(p)['input_ids'])
        end = offset + num_tokens

        if chunk_idx >= len(first_layer) or end > len(first_layer[chunk_idx]):
            raise RuntimeError(
                f"Prompt token mismatch: ({chunk_idx}, {offset}) to ({chunk_idx}, {end}), {p}")

        offset = end
        if offset == len(first_layer[chunk_idx]):
            offset = 0
            chunk_idx += 1

    # chunk_idx now points at the first decode pass.
    gen_base_idx = chunk_idx

    for step in range(gen_base_idx + 1, len(first_layer)):  # layer 0 only
        prev_rows = len(first_layer[step - 1])
        cur_rows = len(first_layer[step])
        if prev_rows < cur_rows:
            print(step, prev_rows, cur_rows)
            return True

    return False

In [5]:
import json

def _attach_routing_hooks():
    """Register a top-k capture hook on every layer that has a router gate.

    Returns the hook handles so the caller can remove them again.
    """
    decoder_layers = llm.llm_engine.model_executor.driver_worker.model_runner.model.model.layers
    return [
        layer.mlp.gate.register_forward_hook(make_hook(layer_id, num_experts_per_tok))
        for layer_id, layer in enumerate(decoder_layers)
        if hasattr(layer.mlp, 'gate')
    ]


def _generate_with_routing(batch_prompts, params):
    """Generate one batch while capturing routing; hooks are always removed.

    Returns (generation outputs, captured per-layer index lists).
    """
    global batch_topk_expert_ids
    # The hooks append to this module-level list, so start each run with a fresh one.
    batch_topk_expert_ids = [[] for _ in range(num_layers)]

    hooks = _attach_routing_hooks()
    try:
        batch_outputs = llm.generate(batch_prompts, params)
    finally:
        # Remove the hooks even if generation is interrupted, so none stay on the model.
        for h in hooks:
            h.remove()

    return batch_outputs, batch_topk_expert_ids


def _collect_batch(batch_prompts, params):
    """Return aligned routing records for a batch, halving it on preemption.

    If the capture looks preempted, the batch is split in two and each half is
    regenerated (and checked again) until the capture is clean.
    """
    if not batch_prompts:
        return []

    batch_outputs, captured = _generate_with_routing(batch_prompts, params)
    if not preemption_check(batch_prompts, captured):
        return batch_topk_aligning(tokenizer, batch_prompts, batch_outputs, captured)

    if len(batch_prompts) < 2:
        raise RuntimeError("Preemption reported for a single-prompt batch; cannot split further")

    mid = len(batch_prompts) // 2
    return _collect_batch(batch_prompts[:mid], params) + _collect_batch(batch_prompts[mid:], params)


for is_think in ['think', 'nonthink']:
    for lang in ['ko', 'en']:
        # NOTE(review): ko/think is skipped here although the upload cell reads
        # Qwen3_30B_A3B_MoE_ko_think.jsonl - presumably produced by an earlier run; confirm.
        if is_think == 'think' and lang == 'ko': continue
        print("############# {} - {} ###########".format(lang, is_think))

        run_prompts = prompts[lang][is_think]
        result = []

        for i in range(0, len(run_prompts), batch_size):
            print(f"{i} - {min(i+batch_size, len(run_prompts))}")
            result += _collect_batch(run_prompts[i:i+batch_size], sampling_params[is_think])

        # NOTE(review): `subjects` comes from the Korean dataset; using it for
        # English rows assumes both test splits share the same row order - confirm.
        for i, r in enumerate(result):
            r['subject'] = subjects[i]
            r['language'] = lang
            r['think'] = is_think

        print("Save the MoE...")

        with open(f"Qwen3_30B_A3B_MoE_{lang}_{is_think}.jsonl", "w", encoding="utf-8") as fout:
            for item in tqdm(result):
                fout.write(json.dumps(item, ensure_ascii=False) + "\n")

        del result

############# en - think ###########
0 - 384


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

10


  0%|          | 0/384 [00:00<?, ?it/s]

384 - 768


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

9


  0%|          | 0/384 [00:00<?, ?it/s]

768 - 1152


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

12


  0%|          | 0/384 [00:00<?, ?it/s]

1152 - 1536


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

13


  0%|          | 0/384 [00:00<?, ?it/s]

1536 - 1920


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

9


  0%|          | 0/384 [00:00<?, ?it/s]

1920 - 2304


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

9


  0%|          | 0/384 [00:00<?, ?it/s]

2304 - 2688


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

11


  0%|          | 0/384 [00:00<?, ?it/s]

2688 - 3072


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

11


  0%|          | 0/384 [00:00<?, ?it/s]

3072 - 3456


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

28


  0%|          | 0/384 [00:00<?, ?it/s]

3456 - 3840


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

9


  0%|          | 0/384 [00:00<?, ?it/s]

3840 - 4224


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

10


  0%|          | 0/384 [00:00<?, ?it/s]

4224 - 4608


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

11


  0%|          | 0/384 [00:00<?, ?it/s]

4608 - 4992


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

11


  0%|          | 0/384 [00:00<?, ?it/s]

4992 - 5376


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

9


  0%|          | 0/384 [00:00<?, ?it/s]

5376 - 5760


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

23


  0%|          | 0/384 [00:00<?, ?it/s]

5760 - 6144


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

40


  0%|          | 0/384 [00:00<?, ?it/s]

6144 - 6528


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

9


  0%|          | 0/384 [00:00<?, ?it/s]

6528 - 6912


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

10


  0%|          | 0/384 [00:00<?, ?it/s]

6912 - 7296


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

8


  0%|          | 0/384 [00:00<?, ?it/s]

7296 - 7680


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

7


  0%|          | 0/384 [00:00<?, ?it/s]

7680 - 8064


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

7


  0%|          | 0/384 [00:00<?, ?it/s]

8064 - 8448


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

9


  0%|          | 0/384 [00:00<?, ?it/s]

8448 - 8832


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

14


  0%|          | 0/384 [00:00<?, ?it/s]

8832 - 9216


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

15


  0%|          | 0/384 [00:00<?, ?it/s]

9216 - 9600


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

12


  0%|          | 0/384 [00:00<?, ?it/s]

9600 - 9984


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

9


  0%|          | 0/384 [00:00<?, ?it/s]

9984 - 10368


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

10


  0%|          | 0/384 [00:00<?, ?it/s]

10368 - 10752


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

21


  0%|          | 0/384 [00:00<?, ?it/s]

10752 - 11136


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

35


  0%|          | 0/384 [00:00<?, ?it/s]

11136 - 11520


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

35


  0%|          | 0/384 [00:00<?, ?it/s]

11520 - 11904


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

34


  0%|          | 0/384 [00:00<?, ?it/s]

11904 - 12288


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

32


  0%|          | 0/384 [00:00<?, ?it/s]

12288 - 12672


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

17


  0%|          | 0/384 [00:00<?, ?it/s]

12672 - 13056


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

11


  0%|          | 0/384 [00:00<?, ?it/s]

13056 - 13440


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

17


  0%|          | 0/384 [00:00<?, ?it/s]

13440 - 13824


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

9


  0%|          | 0/384 [00:00<?, ?it/s]

13824 - 14042


Processed prompts:   0%|          | 0/218 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

4


  0%|          | 0/218 [00:00<?, ?it/s]

Save the MoE...
############# ko - nonthink ###########
0 - 384


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

13


  0%|          | 0/384 [00:00<?, ?it/s]

384 - 768


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

13


  0%|          | 0/384 [00:00<?, ?it/s]

768 - 1152


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

17


  0%|          | 0/384 [00:00<?, ?it/s]

1152 - 1536


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

19


  0%|          | 0/384 [00:00<?, ?it/s]

1536 - 1920


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

13


  0%|          | 0/384 [00:00<?, ?it/s]

1920 - 2304


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

12


  0%|          | 0/384 [00:00<?, ?it/s]

2304 - 2688


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

15


  0%|          | 0/384 [00:00<?, ?it/s]

2688 - 3072


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

16


  0%|          | 0/384 [00:00<?, ?it/s]

3072 - 3456


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

40


  0%|          | 0/384 [00:00<?, ?it/s]

3456 - 3840


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

13


  0%|          | 0/384 [00:00<?, ?it/s]

3840 - 4224


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

14


  0%|          | 0/384 [00:00<?, ?it/s]

4224 - 4608


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

14


  0%|          | 0/384 [00:00<?, ?it/s]

4608 - 4992


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

17


  0%|          | 0/384 [00:00<?, ?it/s]

4992 - 5376


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

14


  0%|          | 0/384 [00:00<?, ?it/s]

5376 - 5760


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

33


  0%|          | 0/384 [00:00<?, ?it/s]

5760 - 6144


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

61


  0%|          | 0/384 [00:00<?, ?it/s]

6144 - 6528


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

13


  0%|          | 0/384 [00:00<?, ?it/s]

6528 - 6912


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

15


  0%|          | 0/384 [00:00<?, ?it/s]

6912 - 7296


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

12


  0%|          | 0/384 [00:00<?, ?it/s]

7296 - 7680


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

11


  0%|          | 0/384 [00:00<?, ?it/s]

7680 - 8064


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

11


  0%|          | 0/384 [00:00<?, ?it/s]

8064 - 8448


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

14


  0%|          | 0/384 [00:00<?, ?it/s]

8448 - 8832


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

22


  0%|          | 0/384 [00:00<?, ?it/s]

8832 - 9216


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

24


  0%|          | 0/384 [00:00<?, ?it/s]

9216 - 9600


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

20


  0%|          | 0/384 [00:00<?, ?it/s]

9600 - 9984


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

14


  0%|          | 0/384 [00:00<?, ?it/s]

9984 - 10368


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

14


  0%|          | 0/384 [00:00<?, ?it/s]

10368 - 10752


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

32


  0%|          | 0/384 [00:00<?, ?it/s]

10752 - 11136


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

57


  0%|          | 0/384 [00:00<?, ?it/s]

11136 - 11520


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

58


  0%|          | 0/384 [00:00<?, ?it/s]

11520 - 11904


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

57


  0%|          | 0/384 [00:00<?, ?it/s]

11904 - 12288


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

52


  0%|          | 0/384 [00:00<?, ?it/s]

12288 - 12672


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

25


  0%|          | 0/384 [00:00<?, ?it/s]

12672 - 13056


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

16


  0%|          | 0/384 [00:00<?, ?it/s]

13056 - 13440


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

27


  0%|          | 0/384 [00:00<?, ?it/s]

13440 - 13824


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

13


  0%|          | 0/384 [00:00<?, ?it/s]

13824 - 14042


Processed prompts:   0%|          | 0/218 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

6


  0%|          | 0/218 [00:00<?, ?it/s]

Save the MoE...
############# en - nonthink ###########
0 - 384


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

10


  0%|          | 0/384 [00:00<?, ?it/s]

384 - 768


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

10


  0%|          | 0/384 [00:00<?, ?it/s]

768 - 1152


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

13


  0%|          | 0/384 [00:00<?, ?it/s]

1152 - 1536


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

13


  0%|          | 0/384 [00:00<?, ?it/s]

1536 - 1920


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

10


  0%|          | 0/384 [00:00<?, ?it/s]

1920 - 2304


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

10


  0%|          | 0/384 [00:00<?, ?it/s]

2304 - 2688


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

12


  0%|          | 0/384 [00:00<?, ?it/s]

2688 - 3072


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

12


  0%|          | 0/384 [00:00<?, ?it/s]

3072 - 3456


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

29


  0%|          | 0/384 [00:00<?, ?it/s]

3456 - 3840


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

9


  0%|          | 0/384 [00:00<?, ?it/s]

3840 - 4224


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

10


  0%|          | 0/384 [00:00<?, ?it/s]

4224 - 4608


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

11


  0%|          | 0/384 [00:00<?, ?it/s]

4608 - 4992


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

12


  0%|          | 0/384 [00:00<?, ?it/s]

4992 - 5376


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

9


  0%|          | 0/384 [00:00<?, ?it/s]

5376 - 5760


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

23


  0%|          | 0/384 [00:00<?, ?it/s]

5760 - 6144


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

41


  0%|          | 0/384 [00:00<?, ?it/s]

6144 - 6528


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

9


  0%|          | 0/384 [00:00<?, ?it/s]

6528 - 6912


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

11


  0%|          | 0/384 [00:00<?, ?it/s]

6912 - 7296


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

9


  0%|          | 0/384 [00:00<?, ?it/s]

7296 - 7680


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

8


  0%|          | 0/384 [00:00<?, ?it/s]

7680 - 8064


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

8


  0%|          | 0/384 [00:00<?, ?it/s]

8064 - 8448


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

10


  0%|          | 0/384 [00:00<?, ?it/s]

8448 - 8832


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

14


  0%|          | 0/384 [00:00<?, ?it/s]

8832 - 9216


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

15


  0%|          | 0/384 [00:00<?, ?it/s]

9216 - 9600


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

13


  0%|          | 0/384 [00:00<?, ?it/s]

9600 - 9984


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

10


  0%|          | 0/384 [00:00<?, ?it/s]

9984 - 10368


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

10


  0%|          | 0/384 [00:00<?, ?it/s]

10368 - 10752


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

21


  0%|          | 0/384 [00:00<?, ?it/s]

10752 - 11136


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

35


  0%|          | 0/384 [00:00<?, ?it/s]

11136 - 11520


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

36


  0%|          | 0/384 [00:00<?, ?it/s]

11520 - 11904


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

36


  0%|          | 0/384 [00:00<?, ?it/s]

11904 - 12288


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

33


  0%|          | 0/384 [00:00<?, ?it/s]

12288 - 12672


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

17


  0%|          | 0/384 [00:00<?, ?it/s]

12672 - 13056


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

11


  0%|          | 0/384 [00:00<?, ?it/s]

13056 - 13440


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

17


  0%|          | 0/384 [00:00<?, ?it/s]

13440 - 13824


Processed prompts:   0%|          | 0/384 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

9


  0%|          | 0/384 [00:00<?, ?it/s]

13824 - 14042


Processed prompts:   0%|          | 0/218 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

4


  0%|          | 0/218 [00:00<?, ?it/s]

Save the MoE...


In [None]:
# Rescue helper for when the `hooks` handle list was lost before the hooks were removed.

def remove_all_hooks(model):
    """Clear the forward, forward-pre and backward hook registries of every submodule.

    This empties the registries wholesale, so hooks registered by anyone else
    on `model` are removed as well.
    """
    for module in model.modules():
        for registry in (module._forward_hooks,
                         module._forward_pre_hooks,
                         module._backward_hooks):
            registry.clear()

# Usage example: strip every hook from the model held by the driver worker.
remove_all_hooks(llm.llm_engine.model_executor.driver_worker.model_runner.model)


In [None]:
from datasets import load_dataset, concatenate_datasets
import json
from tqdm.auto import tqdm

import os

# Load the per-run JSONL dumps in a fixed order. The list must be created
# here: nothing earlier in the notebook defines it.
ds_list = []
for is_think in ['think', 'nonthink']:
    for lang in ['en', 'ko']:
        ds_list.append(load_dataset("json", data_files=f"Qwen3_30B_A3B_MoE_{lang}_{is_think}.jsonl", split="train"))

full_data = concatenate_datasets(ds_list)

# Read the destination and credential from the environment instead of
# hard-coding them in the notebook.
hub_repo_id = os.environ.get("HF_REPO_ID", "")
if not hub_repo_id:
    raise ValueError("Set the HF_REPO_ID environment variable to the target dataset repo")

# NOTE(review): with token=None the hub client presumably falls back to the
# locally stored login - confirm for the installed `datasets` version.
full_data.push_to_hub(hub_repo_id, private=True, token=os.environ.get("HF_TOKEN"))