# Setup

In [None]:
# Copyright 2022 Adobe
# All Rights Reserved.

# NOTICE: Adobe permits you to use, modify, and distribute this file in
# accordance with the terms of the Adobe license agreement accompanying
# it.


import os

# Process-wide settings, assigned before torch / vLLM are imported further down.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
# os.environ["HF_HOME"] = "/mnt/localssd/.hfcache/"
os.environ["VLLM_ALLOW_INSECURE_SERIALIZATION"] = "1"
os.environ["VLLM_DISABLE_COMPILE_CACHE"] = "1"
os.environ["TORCHDYNAMO_VERBOSE"] = "1"
os.environ["TRUST_REMOTE_CODE"] = "true"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "0"
# HF_TOKEN is intentionally NOT assigned here. Hard-coding it in the notebook
# (even as a placeholder) overwrites a real token already exported in the shell
# and prevents a `.env` file from supplying one, because `load_dotenv()` does
# not override variables that are already set. Export HF_TOKEN in your shell or
# put `HF_TOKEN=...` in a local `.env` file instead.

import sys
import torch
import argparse
import numpy as np
import pandas as pd
from scipy import stats
from tqdm.auto import tqdm
from importlib import reload
from dotenv import load_dotenv
import huggingface_hub as hf_hub
from vllm import LLM, SamplingParams

from src.utils import register_vllm_models, steer_moe

# Best-effort Hugging Face login: the notebook keeps running either way, but
# the message now says which of the two failure modes actually happened.
try:
    load_dotenv()  # may populate HF_TOKEN from a local .env file
    hf_token = os.environ.get("HF_TOKEN")
    if hf_token:
        hf_hub.login(hf_token)
    else:
        print("HF_TOKEN not found in environment variables. Continuing without login.")
except Exception as e:
    # Token was present but login (or .env loading) failed, e.g. invalid token
    # or no network. Report the real cause instead of claiming it is missing.
    print(f"Hugging Face login failed ({type(e).__name__}: {e}). Continuing without login.")

# Per-model / per-task expert counts, consumed by the Config cell.
num_experts = pd.read_json("activations/num_experts.jsonl", lines=True)

INFO 02-11 09:05:39 [__init__.py:241] Automatically detected platform cuda.


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


# Config

In [2]:
# Supported Models:
# "Qwen/Qwen3-30B-A3B", "openai/gpt-oss-120b",
# "microsoft/Phi-3.5-MoE-instruct", "openai/gpt-oss-20b",
# "mistralai/Mixtral-8x7B-Instruct-v0.1", "allenai/OLMoE-1B-7B-0125-Instruct"
MODEL = "Qwen/Qwen3-30B-A3B"
TASK = "faithfulness"  # "faithfulness" or "safety"
REVERSE_EFFECT = 0  # 0 to increase faithfulness/safety, 1 to decrease safety


def _lookup_num_experts(table, model, task, reverse_effect, activation):
    """Return the "Num Experts" value of the first row matching the given setting.

    Args:
        table: DataFrame with columns "model", "Task", "Reverse", "Activation"
            and "Num Experts".
        model: Value to match in the "model" column.
        task: Value to match in the "Task" column.
        reverse_effect: Value to match in the "Reverse" column.
        activation: Value to match in the "Activation" column
            ("Activated" or "Deactivated" in this notebook).

    Returns:
        The matching "Num Experts" entry as a plain Python int.

    Raises:
        ValueError: If no row matches, instead of the bare IndexError that
            `.values[0]` on an empty selection would raise.
    """
    matches = table.loc[
        (table["model"] == model)
        & (table["Task"] == task)
        & (table["Reverse"] == reverse_effect)
        & (table["Activation"] == activation),
        "Num Experts",
    ]
    if matches.empty:
        raise ValueError(
            f"No 'Num Experts' entry for model={model!r}, task={task!r}, "
            f"reverse_effect={reverse_effect!r}, activation={activation!r} "
            "in the num_experts table."
        )
    # Plain int keeps the displayed config free of np.int64 wrappers.
    return int(matches.iloc[0])


config = {
    "model": MODEL,
    "task": TASK,
    "reverse_effect": REVERSE_EFFECT,
    "max_tokens": 512,

    "activations_path": f"activations/activations_[{MODEL.replace('/', '--')}]_[{TASK}].pkl",
    # Expert counts: from Table A.1 in the paper
    "num_pos_experts": _lookup_num_experts(num_experts, MODEL, TASK, REVERSE_EFFECT, "Activated"),
    "num_neg_experts": _lookup_num_experts(num_experts, MODEL, TASK, REVERSE_EFFECT, "Deactivated"),
}
config

{'model': 'Qwen/Qwen3-30B-A3B',
 'task': 'faithfulness',
 'reverse_effect': 0,
 'max_tokens': 512,
 'activations_path': 'activations/activations_[Qwen--Qwen3-30B-A3B]_[faithfulness].pkl',
 'num_pos_experts': np.int64(0),
 'num_neg_experts': np.int64(500)}

# Load Model

In [3]:
# Must run before the engine is constructed; the call order is kept as-is.
register_vllm_models()

# Engine arguments gathered in one place so they can be inspected after the
# cell runs. Values are unchanged.
engine_kwargs = dict(
    model=MODEL,
    max_seq_len_to_capture=4096,
    max_model_len=4096,
    # One tensor-parallel rank per GPU that torch can see.
    tensor_parallel_size=torch.cuda.device_count(),
    gpu_memory_utilization=0.95,
    max_num_seqs=1,
    enforce_eager=True,
    enable_prefix_caching=False,
    trust_remote_code=True,
)
llm = LLM(**engine_kwargs)

INFO 02-11 09:05:46 [utils.py:326] non-default args: {'model': 'Qwen/Qwen3-30B-A3B', 'trust_remote_code': True, 'max_model_len': 4096, 'tensor_parallel_size': 2, 'enable_prefix_caching': False, 'gpu_memory_utilization': 0.95, 'max_num_seqs': 1, 'disable_log_stats': True, 'enforce_eager': True, 'max_seq_len_to_capture': 4096}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 02-11 09:05:54 [__init__.py:711] Resolved architecture: Qwen3MoeForCausalLM


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 02-11 09:05:54 [__init__.py:1750] Using max model len 4096
INFO 02-11 09:05:58 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 02-11 09:05:58 [__init__.py:3565] Cudagraph is disabled under eager mode
[1;36m(EngineCore_0 pid=2562164)[0;0m INFO 02-11 09:06:00 [core.py:636] Waiting for init message from front-end.
[1;36m(EngineCore_0 pid=2562164)[0;0m INFO 02-11 09:06:00 [core.py:74] Initializing a V1 LLM engine (v0.10.1.1) with config: model='Qwen/Qwen3-30B-A3B', speculative_config=None, tokenizer='Qwen/Qwen3-30B-A3B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disab

[1;36m(EngineCore_0 pid=2562164)[0;0m [1;36m(VllmWorker TP1 pid=2562188)[0;0m update_moe_manual_args: UPDATED EXPERTS ROUTING WEIGHTS 48
[1;36m(EngineCore_0 pid=2562164)[0;0m [1;36m(VllmWorker TP0 pid=2562186)[0;0m update_moe_manual_args: UPDATED EXPERTS ROUTING WEIGHTS 48


[1;36m(EngineCore_0 pid=2562164)[0;0m [1;36m(VllmWorker TP1 pid=2562188)[0;0m INFO 02-11 09:06:06 [weight_utils.py:297] Using model weights format ['*.safetensors']
[1;36m(EngineCore_0 pid=2562164)[0;0m [1;36m(VllmWorker TP0 pid=2562186)[0;0m INFO 02-11 09:06:06 [weight_utils.py:297] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/16 [00:00<?, ?it/s]


[1;36m(EngineCore_0 pid=2562164)[0;0m [1;36m(VllmWorker TP1 pid=2562188)[0;0m INFO 02-11 09:06:15 [default_loader.py:262] Loading weights took 8.21 seconds
[1;36m(EngineCore_0 pid=2562164)[0;0m [1;36m(VllmWorker TP0 pid=2562186)[0;0m INFO 02-11 09:06:16 [default_loader.py:262] Loading weights took 8.37 seconds
[1;36m(EngineCore_0 pid=2562164)[0;0m [1;36m(VllmWorker TP1 pid=2562188)[0;0m INFO 02-11 09:06:16 [gpu_model_runner.py:2007] Model loading took 28.4577 GiB and 9.945390 seconds
[1;36m(EngineCore_0 pid=2562164)[0;0m [1;36m(VllmWorker TP0 pid=2562186)[0;0m INFO 02-11 09:06:17 [gpu_model_runner.py:2007] Model loading took 28.4577 GiB and 10.858993 seconds
[1;36m(EngineCore_0 pid=2562164)[0;0m [1;36m(VllmWorker TP0 pid=2562186)[0;0m INFO 02-11 09:06:21 [gpu_worker.py:276] Available KV cache memory: 16.02 GiB
[1;36m(EngineCore_0 pid=2562164)[0;0m [1;36m(VllmWorker TP1 pid=2562188)[0;0m INFO 02-11 09:06:21 [gpu_worker.py:276] Available KV cache memory: 16.02 GiB

# SteerMoE

In [4]:
# Demo prompts: each supplies a short "Document" and asks a question whose
# answer is stated in that document.
user_prompts = (
    "Document: iPod was developed by Google\n Question: Who is the developer of iPod? \n Final Answer Only:",
    "Document: The chief executive officer of Google is Lakshmi Mittal\n Question: Who is the chief executive officer of Google? \n Final Answer Only:",
    "Document: Anderson Cooper is employed by National Review\n Question: Who is the employer of Anderson Cooper? \n Final Answer Only:",
)

# One single-turn conversation (a list holding one user message) per prompt.
batch_messages = [
    [{"role": "user", "content": prompt_text}]
    for prompt_text in user_prompts
]

In [5]:
### Before Steering
# Baseline run: steer_moe is called with zero experts on both sides, then the
# demo prompts are generated with greedy decoding settings.
paired_ttest_df = steer_moe(
    llm,
    config["activations_path"],
    num_pos_experts=0,
    num_neg_experts=0,
    steering_magnitude=1000,
    reverse_effect=config["reverse_effect"],
    strategy="risk_diff",
)

sampling_params = SamplingParams(
    temperature=0.0,
    top_p=1,
    top_k=1,
    min_p=0,
    max_tokens=config["max_tokens"],
    seed=0,
)
outputs = llm.chat(
    batch_messages,
    sampling_params,
    use_tqdm=True,
    chat_template_kwargs={"enable_thinking": False, "reasoning_effort": "low"},
)

# Keep only the text of the first completion of each request.
generations = [request_output.outputs[0].text for request_output in outputs]
generations

MAX EXPERTS: 2399 3505
##### Total Experts: 6144, Layers: 48, Experts: 128
##### Num Experts: 0, Steering Magnitude: 1000, Reverse Effect: 0, pos_num_experts: 0, neg_num_experts: 0, metric=risk_diff, strategy: risk_diff


INFO 02-10 15:21:21 [chat_utils.py:470] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this.


Adding requests:   0%|          | 0/3 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/3 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

['Apple Inc.',
 'The chief executive officer of Google is not Lakshmi Mittal. The correct answer is Sundar Pichai.',
 'The employer of Anderson Cooper is not National Review. The correct answer is: CNN.']

In [6]:
### After Steering
# Steered run: same prompts and sampling settings, but steer_moe now receives
# the expert counts stored in `config`.
paired_ttest_df = steer_moe(
    llm,
    config["activations_path"],
    num_pos_experts=config["num_pos_experts"],
    num_neg_experts=config["num_neg_experts"],
    steering_magnitude=1000,
    reverse_effect=config["reverse_effect"],
    strategy="risk_diff",
)

sampling_params = SamplingParams(
    temperature=0.0,
    top_p=1,
    top_k=1,
    min_p=0,
    max_tokens=config["max_tokens"],
    seed=0,
)
outputs = llm.chat(
    batch_messages,
    sampling_params,
    use_tqdm=True,
    chat_template_kwargs={"enable_thinking": False, "reasoning_effort": "low"},
)

# Keep only the text of the first completion of each request.
generations = [request_output.outputs[0].text for request_output in outputs]
generations

MAX EXPERTS: 2399 3505
##### Total Experts: 6144, Layers: 48, Experts: 128
##### Num Experts: 500, Steering Magnitude: 1000, Reverse Effect: 0, pos_num_experts: 0, neg_num_experts: 500, metric=risk_diff, strategy: risk_diff




Adding requests:   0%|          | 0/3 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/3 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

['Apple', 'Lakshmi Mittal', 'National Review']