### Colabで実装する場合のみ以下を最初に実装

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
%cd /content/drive/MyDrive/grpo_tes_3

print("Current directory:", os.getcwd())

/content/drive/MyDrive/grpo_tes_3
Current directory: /content/drive/MyDrive/grpo_tes_3


### Installation

In [3]:
%%capture
import os, importlib.util
!pip install --upgrade -qqq uv
if importlib.util.find_spec("torch") is None or "COLAB_" in "".join(os.environ.keys()):
    try: import numpy, PIL; get_numpy = f"numpy=={numpy.__version__}"; get_pil = f"pillow=={PIL.__version__}"
    except: get_numpy = "numpy"; get_pil = "pillow"
    !uv pip install -qqq \
        "torch>=2.8.0" "triton>=3.4.0" {get_numpy} {get_pil} torchvision bitsandbytes "transformers==4.56.2" \
        "unsloth_zoo[base] @ git+https://github.com/unslothai/unsloth-zoo" \
        "unsloth[base] @ git+https://github.com/unslothai/unsloth" \
        git+https://github.com/triton-lang/triton.git@05b2c186c1b6c9a08375389d5efe9cb4c401c075#subdirectory=python/triton_kernels
elif importlib.util.find_spec("unsloth") is None:
    !uv pip install -qqq unsloth
!uv pip install --upgrade --no-deps transformers==4.56.2 tokenizers trl==0.22.2 unsloth unsloth_zoo

### Unsloth

Unslothの加速カーネルを使用してGPT-OSS 20Bをロードし、このプロジェクトのGRPOハイパーパラメータを再利用します。


In [None]:
from unsloth import FastLanguageModel
import torch
from transformers import TextStreamer

max_seq_length = 5016

MODEL_ID = "unsloth/gpt-oss-20b"


dtype = None  # None lets Unsloth pick the optimal torch dtype

fourbit_models = [
    "unsloth/gpt-oss-20b-unsloth-bnb-4bit",
    "unsloth/gpt-oss-120b-unsloth-bnb-4bit",
    "unsloth/gpt-oss-20b",
    "unsloth/gpt-oss-120b",
]  # Reference list of compatible checkpoints

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_ID,
    dtype=dtype,
    max_seq_length=max_seq_length,
    load_in_4bit=True,  # 4-bit quantization keeps VRAM usage low
    full_finetuning=False,
    # token = "hf_...",  # Uncomment if using gated checkpoints
)

tokenizer.padding_side = "left"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Loaded {MODEL_ID} with sequence length {max_seq_length}.")


In [None]:
"""Reusable GRPO training utilities for GPT-OSS 20B finetuning with Unsloth."""

from __future__ import annotations

import random
from dataclasses import dataclass
from typing import Optional, Tuple

import torch
from torch.utils.data import IterableDataset
from unsloth import FastLanguageModel
from trl import GRPOConfig, GRPOTrainer
from transformers import PreTrainedTokenizerBase

from data_reward import load_prompt_dataset, reward_fn

MODEL_ID = "unsloth/gpt-oss-20b"
OUT = "runs/grpo_gptoss20b_lora4_tes"

TOTAL_STEPS = 10
PROMPTS_PER_STEP = 12
NUM_GENERATIONS = 8
MAX_PROMPT_LEN = 1000
MAX_COMPLETION_LEN = 4000
SEED = 42

# Disable Accelerate's batch dispatching so IterableDataset samples containing strings
# (our raw prompts) are not concatenated across processes, preventing TypeError.
ACCELERATOR_CONFIG = {"dispatch_batches": False}

MAX_SEQ_LENGTH = MAX_PROMPT_LEN + MAX_COMPLETION_LEN + 16
LORA_RANK = 4
LORA_ALPHA = 8
LORA_DROPOUT = 0.0
LOAD_IN_4BIT = True


@dataclass
class TrainingArtifacts:
    """Bundle components needed to run and resume GRPO training."""

    trainer: GRPOTrainer
    tokenizer: PreTrainedTokenizerBase


def _set_seed(seed: int) -> None:
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


class StepStream(IterableDataset):
    """Yield exactly *k* prompts per trainer step with reward columns."""

    KEEP_KEYS = {
        "prompt",
        "reward_action_0",
        "reward_action_1",
        "reward_action_2",
        "reward_action_3",
    }

    def __init__(self, base_dataset, k: int) -> None:
        super().__init__()
        self.base = base_dataset
        self.k = k
        self.n = len(base_dataset)
        self.keys = [key for key in self.KEEP_KEYS if key in getattr(base_dataset, "features", {})]

    def __iter__(self):  # type: ignore[override]
        while True:
            for idx in random.sample(range(self.n), self.k):
                row = self.base[idx]
                sample = {}
                for key in self.keys:
                    value = row[key]
                    if key == "prompt":
                        sample[key] = value
                    else:
                        sample[key] = torch.atleast_1d(torch.tensor(value, dtype=torch.float32))
                yield sample


def create_step_stream(prompts_per_step: int = PROMPTS_PER_STEP, dataset=None) -> StepStream:
    base = dataset if dataset is not None else load_prompt_dataset()
    return StepStream(base, prompts_per_step)


def build_model_and_tokenizer(
    model_id: str = MODEL_ID,
    seed: int = SEED,
    max_seq_length: int = MAX_SEQ_LENGTH,
) -> Tuple[torch.nn.Module, PreTrainedTokenizerBase]:
    """Load GPT-OSS with Unsloth kernels and wrap it with LoRA adapters."""

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_id,
        max_seq_length=max_seq_length,
        dtype=None,
        load_in_4bit=LOAD_IN_4BIT,
        full_finetuning=False,
    )
    tokenizer.padding_side = "left"
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = FastLanguageModel.get_peft_model(
        model,
        r=LORA_RANK,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=seed,
        use_rslora=False,
        loftq_config=None,
    )
    model.config.use_cache = False
    return model, tokenizer


def build_trainer(
    output_dir: str = OUT,
    model_id: str = MODEL_ID,
    total_steps: int = TOTAL_STEPS,
    prompts_per_step: int = PROMPTS_PER_STEP,
    num_generations: int = NUM_GENERATIONS,
    max_prompt_len: int = MAX_PROMPT_LEN,
    max_completion_len: int = MAX_COMPLETION_LEN,
    seed: int = SEED,
    dataset=None,
    learning_rate: float = 5e-5,
    model: Optional[torch.nn.Module] = None,
    tokenizer: Optional[PreTrainedTokenizerBase] = None,
) -> TrainingArtifacts:
    _set_seed(seed)
    if model is None or tokenizer is None:
        model, tokenizer = build_model_and_tokenizer(model_id=model_id, seed=seed)
    else:
        tokenizer.padding_side = "left"
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model.config.use_cache = False
    stream = create_step_stream(prompts_per_step=prompts_per_step, dataset=dataset)

    args = GRPOConfig(
        output_dir=output_dir,
        max_steps=total_steps,
        learning_rate=learning_rate,
        bf16=True,  ### colab仕様
        gradient_checkpointing=True,
        seed=seed,
        num_generations=num_generations,
        generation_batch_size=prompts_per_step * num_generations,
        per_device_train_batch_size=prompts_per_step,
        gradient_accumulation_steps=1,
        max_prompt_length=max_prompt_len,
        max_completion_length=max_completion_len,
        temperature=1.0,
        report_to=[],
        logging_steps=1,
        accelerator_config=ACCELERATOR_CONFIG,
    )

    trainer = GRPOTrainer(
        model=model,
        processing_class=tokenizer,
        args=args,
        reward_funcs=reward_fn,
        train_dataset=stream,
    )
    return TrainingArtifacts(trainer=trainer, tokenizer=tokenizer)


def run_training(artifacts: Optional[TrainingArtifacts] = None) -> TrainingArtifacts:
    if artifacts is None:
        artifacts = build_trainer()
    artifacts.trainer.train()
    artifacts.trainer.save_model(OUT)
    artifacts.tokenizer.save_pretrained(OUT)
    return artifacts


### LoRA

Unslothの最適化カーネルを使用してパラメータ効率の良いアダプターをアタッチします。


In [6]:
model = FastLanguageModel.get_peft_model(
    model,
    r=LORA_RANK,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=SEED,
    use_rslora=False,
    loftq_config=None,
)
model.config.use_cache = False
model.print_trainable_parameters()

Unsloth: Making `model.base_model.model.model` require gradients
trainable params: 1,990,656 || all params: 20,916,747,840 || trainable%: 0.0095


### Reasoning Effort

トレーニング前にモデルとトークナイザーのペアリングを検証するためのクイックサンプルを生成します。



In [6]:
messages = [
    {"role": "user", "content": "Provide a short checklist for evaluating energy-saving actions."},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="low",
).to(model.device)
_ = model.generate(**inputs, max_new_tokens=128, streamer=TextStreamer(tokenizer))


<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: 2025-10-28

Reasoning: low

# Valid channels: analysis, commentary, final. Channel must be included for every message.
Calls to these tools must go to the commentary channel: 'functions'.<|end|><|start|>user<|message|>Provide a short checklist for evaluating energy-saving actions.<|end|><|start|>assistant<|channel|>analysis<|message|>Need a short checklist. Provide bullet points.<|end|><|start|>assistant<|channel|>final<|message|>**Energy‑Saving Action Evaluation Checklist**

| # | Criterion | What to Check |
|---|-----------|---------------|
| 1 | **Energy Effectiveness** | Will the action reduce overall energy consumption (kWh)? |
| 2 | **Cost‑and‑Payback** | Cost of implementation vs. savings; calculate payback period. |
| 3 | **Install/Operation Complexity** | How easy is it to install, maintain, and operate? |
| 4 | **Lifecycle Impact** | Consider manufactur

### Dataset & Reward

Harmony形式のプロンプトデータセットをロードし、GRPO報酬関数で使用される報酬カラムをプレビューします。


In [None]:
dataset = load_prompt_dataset()
print(f"Total prompts: {len(dataset)}")
print(dataset[0])

Generating train split: 0 examples [00:00, ? examples/s]

Total prompts: 150
{'timestep': 8, 'timestamp': '2025-04-14T08:00:00', 'prompt': '<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.\nKnowledge cutoff: 2024-06\nCurrent date: 2025-10-26\n\nReasoning: medium\n\n# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>developer<|message|># Instructions\n\nYou are an optimisation agent that supervises a thermal energy storage (TES) plant. Your job is to minimise cumulative CO₂ emissions over the full simulation horizon by planning TES charge and discharge decisions.\nDeveloper instructions: respond using ASCII characters only. Return a single line formatted exactly as `[action_index]`, where action_index is an integer in {0, 1, 2, 3}. Do not include additional text, explanations, markdown, or keys.\n\n<|end|><|start|>user<|message|>Objective:\n- Minimise total CO₂ emissions = electricity consumption × time-varying CO₂ intensity over the horizon.\n\nCurrent 

### Streaming Prompts

`StepStream`は最適化ステップごとに正確に`PROMPTS_PER_STEP`個のプロンプトを生成し、公式のGRPOノートブックを反映します。



In [None]:
stream = create_step_stream(dataset=dataset)
preview = next(iter(stream))
print("Keys:", preview.keys())
print("Prompt snippet:", preview['prompt'][:200])
print("Reward tensors:", {k: v.tolist() for k, v in preview.items() if k != 'prompt'})

Keys: dict_keys(['reward_action_0', 'reward_action_2', 'reward_action_3', 'reward_action_1', 'prompt'])
Prompt snippet: <|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: 2025-10-26

Reasoning: medium

# Valid channels: analysis, commentary, fina
Reward tensors: {'reward_action_0': [0.8734728693962097], 'reward_action_2': [0.014982611872255802], 'reward_action_3': [0.0005119182169437408], 'reward_action_1': [0.11103261262178421]}


### Build Trainer

プロジェクトのデフォルト（TOTAL_STEPS=100, PROMPTS_PER_STEP=12, NUM_GENERATIONS=8）を使用してTRL `GRPOTrainer`を作成します。



In [11]:
artifacts = build_trainer(
    dataset=dataset,
    model=model,
    tokenizer=tokenizer,
)
trainer = artifacts.trainer
trainer.processing_class
print(f"Max steps: {trainer.args.max_steps}")
print(f"Generation batch size: {trainer.args.generation_batch_size}")


Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 12 to the `num_generations` of 8
Unsloth: Switching to float32 training since model cannot work with float16
Max steps: 100
Generation batch size: 96


### Training

ノートブック内でGRPOループを実行（vLLM無効）。


In [None]:
trainer_stats = trainer.train()


In [None]:
trainer.save_model(OUT)
tokenizer.save_pretrained(OUT)


### Quick Inference Check

Switch the model to evaluation mode and sample a response using the trained adapters.


In [None]:
model.eval()
messages = [
    {"role": "system", "content": "You are an analyst focusing on industrial CO₂ mitigation."},
    {"role": "user", "content": "Summarize the recommended action from the latest dataset row."},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="medium",
).to(model.device)
with torch.inference_mode():
    outputs = model.generate(**inputs, max_new_tokens=160)
print(tokenizer.decode(outputs[0], skip_special_tokens=False))


### Reload Saved Adapters

Load the adapters later by reconstructing the base model via Unsloth and attaching the saved LoRA weights.


In [None]:
if False:
    from peft import PeftModel
    reloaded_model, reloaded_tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_ID,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
        full_finetuning=False,
    )
    reloaded_model = FastLanguageModel.get_peft_model(
        reloaded_model,
        r=LORA_RANK,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=SEED,
        use_rslora=False,
        loftq_config=None,
    )
    reloaded_model = PeftModel.from_pretrained(reloaded_model, OUT)
    reloaded_model.eval()
