### Colabで実装する場合のみ以下を最初に実装

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
%cd /content/drive/MyDrive/grpo_tes_3

print("Current directory:", os.getcwd())

/content/drive/MyDrive/grpo_tes_3
Current directory: /content/drive/MyDrive/grpo_tes_3


### Installation

In [1]:
%%bash
set -euo pipefail

python -m pip install --quiet --upgrade uv

rm -rf "$HOME/.venvs/unsloth"
uv venv "$HOME/.venvs/unsloth"
source "$HOME/.venvs/unsloth/bin/activate"

uv pip install --quiet --upgrade numpy pillow ipykernel sentencepiece bitsandbytes triton
uv pip install --quiet --upgrade \
  torch torchvision torchaudio \
  --index-url https://download.pytorch.org/whl/cu121   # CPU-onlyならこの行を削る
uv pip install --quiet --upgrade \
  transformers==4.56.2 tokenizers==0.22.1 trl==0.22.2 unsloth unsloth_zoo

python -m ipykernel install --user --name unsloth-uv --display-name "Python (unsloth-uv)"

[0mUsing CPython 3.10.12 interpreter at: [36m/usr/bin/python[39m
Creating virtual environment at: [36m/home/ubuntu/.venvs/unsloth[39m
Activate with: [32msource /home/ubuntu/.venvs/unsloth/bin/activate[39m


Installed kernelspec unsloth-uv in /home/ubuntu/.local/share/jupyter/kernels/unsloth-uv


In [2]:
import sys; print(sys.executable)

/home/ubuntu/.venvs/unsloth/bin/python


### Unsloth

Unslothの加速カーネルを使用してGPT-OSS 20Bをロードし、このプロジェクトのGRPOハイパーパラメータを再利用します。


In [None]:
from unsloth import FastLanguageModel
import torch
from transformers import TextStreamer

max_seq_length = 4000

MODEL_ID = "unsloth/gpt-oss-20b"


dtype = None  # None lets Unsloth pick the optimal torch dtype

fourbit_models = [
    "unsloth/gpt-oss-20b-unsloth-bnb-4bit",
    "unsloth/gpt-oss-120b-unsloth-bnb-4bit",
    "unsloth/gpt-oss-20b",
    "unsloth/gpt-oss-120b",
]  # Reference list of compatible checkpoints

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_ID,
    dtype=dtype,
    max_seq_length=max_seq_length,
    load_in_4bit=True,  # 4-bit quantization keeps VRAM usage low
    full_finetuning=False,
    # token = "hf_...",  # Uncomment if using gated checkpoints
)

tokenizer.padding_side = "left"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Loaded {MODEL_ID} with sequence length {max_seq_length}.")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.10.11: Fast Gpt_Oss patching. Transformers: 4.56.2.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.495 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 8.0. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.37s/it]


Loaded unsloth/gpt-oss-20b with sequence length 5016.


In [None]:
"""Reusable GRPO training utilities for GPT-OSS 20B finetuning with Unsloth."""

from __future__ import annotations

import logging
import random
from dataclasses import dataclass
from typing import Optional, Tuple

import torch
from torch.utils.data import IterableDataset
from unsloth import FastLanguageModel
from trl import GRPOConfig, GRPOTrainer
from transformers import PreTrainedTokenizerBase

from data_reward import load_prompt_dataset, reward_fn

MODEL_ID = "unsloth/gpt-oss-20b"
OUT = "runs/grpo_gptoss20b_lora4_tes"

TOTAL_STEPS = 10
PROMPTS_PER_STEP = 1
NUM_GENERATIONS = 4
MAX_PROMPT_LEN = 1000
MAX_COMPLETION_LEN = 3000
GRADIENT_ACCUMULATION_STEPS = 4
SEED = 42

# Disable Accelerate's batch dispatching so IterableDataset samples containing strings
# (our raw prompts) are not concatenated across processes, preventing TypeError.
ACCELERATOR_CONFIG = {"dispatch_batches": False, "split_batches": True}

logger = logging.getLogger("train_grpo")
if not logger.handlers:
    _handler = logging.StreamHandler()
    _handler.setFormatter(logging.Formatter("%(asctime)s | %(levelname)s | %(message)s"))
    logger.addHandler(_handler)
logger.setLevel(logging.INFO)
logger.propagate = False

MAX_SEQ_LENGTH = MAX_PROMPT_LEN + MAX_COMPLETION_LEN + 16
LORA_RANK = 4
LORA_ALPHA = 8
LORA_DROPOUT = 0.0
LOAD_IN_4BIT = True


@dataclass
class TrainingArtifacts:
    """Bundle components needed to run and resume GRPO training."""

    trainer: GRPOTrainer
    tokenizer: PreTrainedTokenizerBase


def _set_seed(seed: int) -> None:
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


class StepStream(IterableDataset):
    """Yield prompts with reward tensors, sampled per trainer step."""

    KEEP_KEYS = {
        "prompt",
        "reward_action_0",
        "reward_action_1",
        "reward_action_2",
        "reward_action_3",
    }

    def __init__(self, base_dataset, prompts_per_step: int) -> None:
        super().__init__()
        self.base = base_dataset
        self.prompts_per_step = prompts_per_step
        self.n = len(base_dataset)
        self.keys = [key for key in self.KEEP_KEYS if key in getattr(base_dataset, "features", {})]

    def __iter__(self):  # type: ignore[override]
        while True:
            indices = random.sample(range(self.n), self.prompts_per_step)
            for idx in indices:
                row = self.base[idx]
                sample = {}
                for key in self.keys:
                    value = row[key]
                    if key == "prompt":
                        sample[key] = value
                    else:
                        sample[key] = torch.atleast_1d(torch.tensor(value, dtype=torch.float32))
                yield sample


def create_step_stream(
    prompts_per_step: int = PROMPTS_PER_STEP,
    dataset=None,
) -> StepStream:
    base = dataset if dataset is not None else load_prompt_dataset()
    return StepStream(base, prompts_per_step)


def build_model_and_tokenizer(
    model_id: str = MODEL_ID,
    seed: int = SEED,
    max_seq_length: int = MAX_SEQ_LENGTH,
) -> Tuple[torch.nn.Module, PreTrainedTokenizerBase]:
    """Load GPT-OSS with Unsloth kernels and wrap it with LoRA adapters."""

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_id,
        max_seq_length=max_seq_length,
        dtype=None,
        load_in_4bit=LOAD_IN_4BIT,
        full_finetuning=False,
    )
    tokenizer.padding_side = "left"
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = FastLanguageModel.get_peft_model(
        model,
        r=LORA_RANK,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=seed,
        use_rslora=False,
        loftq_config=None,
    )
    model.config.use_cache = False
    return model, tokenizer


def build_trainer(
    output_dir: str = OUT,
    model_id: str = MODEL_ID,
    total_steps: int = TOTAL_STEPS,
    prompts_per_step: int = PROMPTS_PER_STEP,
    num_generations: int = NUM_GENERATIONS,
    max_prompt_len: int = MAX_PROMPT_LEN,
    max_completion_len: int = MAX_COMPLETION_LEN,
    seed: int = SEED,
    dataset=None,
    learning_rate: float = 5e-5,
    model: Optional[torch.nn.Module] = None,
    tokenizer: Optional[PreTrainedTokenizerBase] = None,
) -> TrainingArtifacts:
    _set_seed(seed)
    if model is None or tokenizer is None:
        model, tokenizer = build_model_and_tokenizer(model_id=model_id, seed=seed)
    else:
        tokenizer.padding_side = "left"
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model.config.use_cache = False
    stream = create_step_stream(
        prompts_per_step=prompts_per_step,
        dataset=dataset,
    )
    logger.info(
        "StepStream configured | prompts_per_micro_step=%d | num_generations=%d | dataset_rows=%d | keep_keys=%s",
        prompts_per_step,
        num_generations,
        stream.n,
        stream.keys,
    )

    train_batch_size = prompts_per_step
    generation_batch_size = num_generations
    completions_per_micro_step = prompts_per_step * num_generations
    total_completions_per_update = completions_per_micro_step * GRADIENT_ACCUMULATION_STEPS

    args = GRPOConfig(
        output_dir=output_dir,
        max_steps=total_steps,
        learning_rate=learning_rate,
        bf16=True,
        gradient_checkpointing=True,
        seed=seed,
        num_generations=num_generations,
        generation_batch_size=generation_batch_size,
        per_device_train_batch_size=train_batch_size,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        max_prompt_length=max_prompt_len,
        max_completion_length=max_completion_len,
        report_to=[],
        logging_steps=1,
        accelerator_config=ACCELERATOR_CONFIG,
    )
    logger.info(
        "Generation config | num_generations=%d | generation_batch_size=%d | per_device_train_batch_size=%d | "
        "grad_accum=%d | split_batches=%s | completions_per_micro_step=%d | completions_per_update=%d",
        num_generations,
        generation_batch_size,
        train_batch_size,
        GRADIENT_ACCUMULATION_STEPS,
        ACCELERATOR_CONFIG.get("split_batches"),
        completions_per_micro_step,
        total_completions_per_update,
    )

    trainer = GRPOTrainer(
        model=model,
        processing_class=tokenizer,
        args=args,
        reward_funcs=reward_fn,
        train_dataset=stream,
    )
    return TrainingArtifacts(trainer=trainer, tokenizer=tokenizer)


def run_training(artifacts: Optional[TrainingArtifacts] = None) -> TrainingArtifacts:
    if artifacts is None:
        artifacts = build_trainer()
    artifacts.trainer.train()
    artifacts.trainer.save_model(OUT)
    artifacts.tokenizer.save_pretrained(OUT)
    return artifacts

### LoRA

Unslothの最適化カーネルを使用してパラメータ効率の良いアダプターをアタッチします。


In [5]:
model = FastLanguageModel.get_peft_model(
    model,
    r=LORA_RANK,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=SEED,
    use_rslora=False,
    loftq_config=None,
)
model.config.use_cache = False
model.print_trainable_parameters()

Unsloth: Making `model.base_model.model.model` require gradients
trainable params: 1,990,656 || all params: 20,916,747,840 || trainable%: 0.0095


### Reasoning Effort

トレーニング前にモデルとトークナイザーのペアリングを検証するためのクイックサンプルを生成します。



In [6]:
messages = [
    {"role": "user", "content": "Provide a short checklist for evaluating energy-saving actions."},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="low",
).to(model.device)
_ = model.generate(**inputs, max_new_tokens=128, streamer=TextStreamer(tokenizer))


<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: 2025-10-28

Reasoning: low

# Valid channels: analysis, commentary, final. Channel must be included for every message.
Calls to these tools must go to the commentary channel: 'functions'.<|end|><|start|>user<|message|>Provide a short checklist for evaluating energy-saving actions.<|end|><|start|>assistant<|channel|>analysis<|message|>Need short checklist.<|end|><|start|>assistant<|channel|>final<|message|>**Energy‑Saving Action Evaluation Checklist**

| # | Criteria | What to Check |
|---|-----------|---------------|
| 1 | **Energy Impact** | • How much energy will the action reduce? (kWh or %). |
| 2 | **Cost of Implementation** | • Initial cost, ongoing maintenance, and potential hidden costs. |
| 3 | **Payback Period** | • How long will it take to recover the investment? |
| 4 | **Return on Investment (ROI)** | • Net savings vs. cost (including taxes


### Dataset & Reward

Harmony形式のプロンプトデータセットをロードし、GRPO報酬関数で使用される報酬カラムをプレビューします。


In [7]:
dataset = load_prompt_dataset()
print(f"Total prompts: {len(dataset)}")
print(dataset[0])

Generating train split: 150 examples [00:00, 6265.77 examples/s]

Total prompts: 150
{'timestep': 8, 'timestamp': '2025-04-14T08:00:00', 'prompt': '<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.\nKnowledge cutoff: 2024-06\nCurrent date: 2025-10-26\n\nReasoning: medium\n\n# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>developer<|message|># Instructions\n\nYou are an optimisation agent that supervises a thermal energy storage (TES) plant. Your job is to minimise cumulative CO₂ emissions over the full simulation horizon by planning TES charge and discharge decisions.\nDeveloper instructions: respond using ASCII characters only. Return a single line formatted exactly as `[action_index]`, where action_index is an integer in {0, 1, 2, 3}. Do not include additional text, explanations, markdown, or keys.\n\n<|end|><|start|>user<|message|>Objective:\n- Minimise total CO₂ emissions = electricity consumption × time-varying CO₂ intensity over the horizon.\n\nCurrent 




### Streaming Prompts

`StepStream`は最適化ステップごとに正確に`PROMPTS_PER_STEP`個のプロンプトを生成し、公式のGRPOノートブックを反映します。



In [8]:
stream = create_step_stream(dataset=dataset)
preview = next(iter(stream))
print("Keys:", preview.keys())
print("Prompt snippet:", preview['prompt'][:200])
print("Reward tensors:", {k: v.tolist() for k, v in preview.items() if k != 'prompt'})

Keys: dict_keys(['reward_action_0', 'reward_action_2', 'prompt', 'reward_action_3', 'reward_action_1'])
Prompt snippet: <|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: 2025-10-26

Reasoning: medium

# Valid channels: analysis, commentary, fina
Reward tensors: {'reward_action_0': [0.8734728693962097], 'reward_action_2': [0.014982611872255802], 'reward_action_3': [0.0005119182169437408], 'reward_action_1': [0.11103261262178421]}


### Build Trainer

プロジェクトのデフォルト（TOTAL_STEPS=100, PROMPTS_PER_STEP=12, NUM_GENERATIONS=8）を使用してTRL `GRPOTrainer`を作成します。



In [9]:
artifacts = build_trainer(
    dataset=dataset,
    model=model,
    tokenizer=tokenizer,
)
trainer = artifacts.trainer
trainer.processing_class
print(f"Max steps: {trainer.args.max_steps}")
print(f"Generation batch size: {trainer.args.generation_batch_size}")


Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.
We will change the batch size of 12 to the `num_generations` of 8
Max steps: 10
Generation batch size: 96


### Training

ノートブック内でGRPOループを実行（vLLM無効）。


In [10]:
trainer_stats = trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 199998, 'pad_token_id': 200017}.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 80 | Num Epochs = 9,223,372,036,854,775,807 | Total steps = 10
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 1 x 1) = 8
 "-____-"     Trainable parameters = 1,990,656 of 20,916,747,840 (0.01% trained)
`generation_config` default values have been modified to match model-specific defaults: {'max_length': 131072}. If this is not desired, please set these values explicitly.


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.50 GiB. GPU 0 has a total capacity of 39.49 GiB of which 12.54 GiB is free. Including non-PyTorch memory, this process has 26.95 GiB memory in use. Of the allocated memory 26.26 GiB is allocated by PyTorch, with 62.00 MiB allocated in private pools (e.g., CUDA Graphs), and 99.19 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
trainer.save_model(OUT)
tokenizer.save_pretrained(OUT)


### Quick Inference Check

Switch the model to evaluation mode and sample a response using the trained adapters.


In [None]:
model.eval()
messages = [
    {"role": "system", "content": "You are an analyst focusing on industrial CO₂ mitigation."},
    {"role": "user", "content": "Summarize the recommended action from the latest dataset row."},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
    return_dict=True,
    reasoning_effort="medium",
).to(model.device)
with torch.inference_mode():
    outputs = model.generate(**inputs, max_new_tokens=160)
print(tokenizer.decode(outputs[0], skip_special_tokens=False))


### Reload Saved Adapters

Load the adapters later by reconstructing the base model via Unsloth and attaching the saved LoRA weights.


In [None]:
if False:
    from peft import PeftModel
    reloaded_model, reloaded_tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_ID,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
        full_finetuning=False,
    )
    reloaded_model = FastLanguageModel.get_peft_model(
        reloaded_model,
        r=LORA_RANK,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=SEED,
        use_rslora=False,
        loftq_config=None,
    )
    reloaded_model = PeftModel.from_pretrained(reloaded_model, OUT)
    reloaded_model.eval()
