In [5]:
import os
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, Mxfp4Config, TrainingArguments, AutoTokenizer
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer, PPOTrainer, PPOConfig

In [15]:
dir(TrainingArguments)

['_VALID_DICT_FIELDS',
 '__annotations__',
 '__class__',
 '__dataclass_fields__',
 '__dataclass_params__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__match_args__',
 '__module__',
 '__ne__',
 '__new__',
 '__post_init__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_dict_torch_dtype_to_str',
 '_n_gpu',
 '_no_sync_in_gradient_accumulation',
 '_setup_devices',
 'accelerator_config',
 'adafactor',
 'adam_beta1',
 'adam_beta2',
 'adam_epsilon',
 'auto_find_batch_size',
 'average_tokens_across_devices',
 'batch_eval_metrics',
 'bf16',
 'bf16_full_eval',
 'data_seed',
 'dataloader_drop_last',
 'dataloader_num_workers',
 'dataloader_persistent_workers',
 'dataloader_pin_memory',
 'dataloader_prefetch_factor',
 'ddp_backend',
 'ddp_broadcast_buf

In [2]:
dataset = load_dataset("json", data_files="/app/Files/training_set.jsonl", split="train")

In [15]:
dataset[0]

{'messages': [{'role': 'system',
   'content': 'You are a mathematician who is specialized in linear algebra and also statistics.'},
  {'role': 'user', 'content': 'What is a vector space in linear algebra?'},
  {'role': 'assistant',
   'content': 'A vector space is a collection of vectors where you can add them together and multiply them by scalars, following specific rules.'}]}

In [16]:
dataset[0]['messages']

[{'role': 'system',
  'content': 'You are a mathematician who is specialized in linear algebra and also statistics.'},
 {'role': 'user', 'content': 'What is a vector space in linear algebra?'},
 {'role': 'assistant',
  'content': 'A vector space is a collection of vectors where you can add them together and multiply them by scalars, following specific rules.'}]

In [3]:
tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")

In [6]:
messages = dataset[0]['messages']
convo = tokenizer.apply_chat_template(messages, tokenize=False)

In [7]:
convo

'<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.\nKnowledge cutoff: 2024-06\nCurrent date: 2025-08-24\n\nReasoning: medium\n\n# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>developer<|message|># Instructions\n\nYou are a mathematician who is specialized in linear algebra and also statistics.\n\n<|end|><|start|>user<|message|>What is a vector space in linear algebra?<|end|><|start|>assistant<|channel|>final<|message|>A vector space is a collection of vectors where you can add them together and multiply them by scalars, following specific rules.<|return|>'

In [4]:
quantization_config = Mxfp4Config(dequantize=True)

model_kwargs = dict(
    attn_implementation='eager', 
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
    use_cache=False, 
    device_map='auto')

model = AutoModelForCausalLM.from_pretrained('openai/gpt-oss-20b',**model_kwargs)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
messages = [{'role':'user', 'content':'what is a vector space?'}]

prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True,return_tensors='pt').to(model.device)

out = model.generate(prompt, max_new_tokens=512)
response = tokenizer.batch_decode(out)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [10]:
response

['<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.\nKnowledge cutoff: 2024-06\nCurrent date: 2025-08-24\n\nReasoning: medium\n\n# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>user<|message|>what is a vector space?<|end|><|start|>assistant<|channel|>analysis<|message|>The user asks: "what is a vector space?" They want a definition, perhaps with explanation. The context: maybe some background: vector space over a field. Provide definition, examples, properties, axioms. Might want simple explanation with example of R^n. Provide formal definition and discussion of key axioms. Also mention linear transformations, subspaces, basis, dimension. The user didn\'t specify any level or application. Likely a basic explanation is suitable. But should we anticipate advanced interest? Provide accessible explanation, with formal definition and intuitive explanation. And mention that vector spaces can be over 

In [5]:
peft_config = LoraConfig(
    r=8,                 
    lora_alpha=16,
    target_modules='all-linear', 
    target_parameters=[
        "7.mlp.experts.gate_up_proj",
        "7.mlp.experts.down_proj",
        "15.mlp.experts.gate_up_proj",
        "15.mlp.experts.down_proj",
        "23.mlp.experts.gate_up_proj",
        "23.mlp.experts.down_proj"
    ]
)

peft_model = get_peft_model(model, peft_config)



In [18]:
filepath = '/app/Files/OSS_Training_Output/'

training_args = TrainingArguments(    
    output_dir=os.path.join(filepath,"gpt-oss-20b-fine-tuning"),
    save_strategy="steps",
    save_steps=50,
    fp16=True,  
    learning_rate=2e-4,
    gradient_checkpointing=True,
    num_train_epochs=3,
    logging_steps=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4, 
    warmup_ratio=0.03,
    lr_scheduler_type="cosine_with_min_lr",
    lr_scheduler_kwargs={"min_lr_rate":0.1},
    report_to="none",
)

In [22]:
import trl
for mods in dir(trl):
    if "Trainer" in mods:
        print(mods)

AlignPropTrainer
BCOTrainer
CPOTrainer
DPOTrainer
GKDTrainer
GRPOTrainer
IterativeSFTTrainer
KTOTrainer
NashMDTrainer
ORPOTrainer
OnlineDPOTrainer
PPOTrainer
PRMTrainer
RLOOTrainer
RewardTrainer
SFTTrainer
XPOTrainer


In [25]:
help(SFTTrainer)

Help on class SFTTrainer in module trl.trainer.sft_trainer:

class SFTTrainer(transformers.trainer.Trainer)
 |  SFTTrainer(model: Union[str, torch.nn.modules.module.Module, transformers.modeling_utils.PreTrainedModel], args: Union[trl.trainer.sft_config.SFTConfig, transformers.training_args.TrainingArguments, NoneType] = None, data_collator: Optional[transformers.data.data_collator.DataCollator] = None, train_dataset: Union[datasets.arrow_dataset.Dataset, datasets.iterable_dataset.IterableDataset, NoneType] = None, eval_dataset: Union[datasets.arrow_dataset.Dataset, dict[str, datasets.arrow_dataset.Dataset], NoneType] = None, processing_class: Union[transformers.tokenization_utils_base.PreTrainedTokenizerBase, transformers.image_processing_utils.BaseImageProcessor, transformers.feature_extraction_utils.FeatureExtractionMixin, transformers.processing_utils.ProcessorMixin, NoneType] = None, compute_loss_func: Optional[Callable] = None, compute_metrics: Optional[Callable[[transformers.tra

In [19]:
trainer = SFTTrainer(
    model=peft_model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer
)

In [14]:
trainer.train()

Step,Training Loss
1,4.0187
2,3.9725
3,3.5964
4,3.0796
5,2.6557
6,2.267
7,1.8935
8,1.621
9,1.4731
10,1.2935


TrainOutput(global_step=26, training_loss=1.4067063652552092, metrics={'train_runtime': 438.338, 'train_samples_per_second': 0.933, 'train_steps_per_second': 0.059, 'total_flos': 8007579371450880.0, 'train_loss': 1.4067063652552092})

In [27]:
help(PPOTrainer)

Help on class PPOTrainer in module trl.trainer.ppo_trainer:

class PPOTrainer(transformers.trainer.Trainer)
 |  PPOTrainer(args: trl.trainer.ppo_config.PPOConfig, processing_class: Union[transformers.tokenization_utils_base.PreTrainedTokenizerBase, transformers.image_processing_utils.BaseImageProcessor, transformers.feature_extraction_utils.FeatureExtractionMixin, transformers.processing_utils.ProcessorMixin, NoneType], model: torch.nn.modules.module.Module, ref_model: Optional[torch.nn.modules.module.Module], reward_model: torch.nn.modules.module.Module, train_dataset: datasets.arrow_dataset.Dataset, value_model: torch.nn.modules.module.Module, data_collator: Optional[transformers.data.data_collator.DataCollatorWithPadding] = None, eval_dataset: Union[datasets.arrow_dataset.Dataset, dict[str, datasets.arrow_dataset.Dataset], NoneType] = None, optimizers: tuple[torch.optim.optimizer.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), callbacks: Optional[list[transformers.trai

In [10]:
help(PPOConfig)

Help on class PPOConfig in module trl.trainer.ppo_config:

class PPOConfig(trl.trainer.utils.OnPolicyConfig)
 |
 |  Configuration class for the [`PPOTrainer`].
 |
 |  This class includes only the parameters that are specific to PPO training. For a full list of training arguments,
 |  please refer to the [`~transformers.TrainingArguments`] and [`OnPolicyConfig`] documentation. Note that default
 |  values in this class may differ from those in [`~transformers.TrainingArguments`].
 |
 |  Using [`~transformers.HfArgumentParser`] we can turn this class into
 |  [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
 |  command line.
 |
 |  Parameters:
 |      exp_name (`str`, *optional*, defaults to `os.path.basename(__file__)[:-3]`):
 |          Name of this experiment.
 |      reward_model_path (`str`, *optional*, defaults to `"EleutherAI/pythia-160m"`):
 |          Path to the reward model.
 |      model_adapter_name (`str` or `None

In [5]:
!pip show trl

Name: trl
Version: 0.21.0
Summary: Train transformer language models with reinforcement learning.
Home-page: https://github.com/huggingface/trl
Author: Leandro von Werra
Author-email: leandro.vonwerra@gmail.com
License: 
Location: /usr/local/lib/python3.12/site-packages
Requires: accelerate, datasets, transformers
Required-by: 


In [10]:
from trl import PPOTrainer, AutoModelForCausalLMWithValueHead, PPOConfig
from transformers import AutoTokenizer, GenerationConfig

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")
tokenizer.pad_token = tokenizer.eos_token

# Define a simple processing class wrapper
class TokenizerProcessor:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, prompts, **kwargs):
        return self.tokenizer(prompts, return_tensors="pt", padding=True, truncation=True)

processing_class = TokenizerProcessor(tokenizer)


In [11]:
from trl import AutoModelForCausalLMWithValueHead # critic
from transformers import AutoModelForCausalLM, GenerationConfig

tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")

ppo_config = PPOConfig(
    batch_size=1,
    learning_rate=1e-5,
    cliprange=0.2,
    # ppo_epochs=2
)

def reward_fn(samples, **kwargs):
    return [len(s) for s in samples]

# nb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype="float16",
# )
# Policy (actor + critic in one)
policy_model = AutoModelForCausalLMWithValueHead.from_pretrained('openai/gpt-oss-20b', device_map='auto', offload_folder="offload",trust_remote_code=True)
# policy_model = AutoModelForCausalLM.from_pretrained('openai/gpt-oss-20b', device_map='auto')

# Frozen reference model for ratio calc
# ref_model = AutoModelForCausalLM.from_pretrained('openai/gpt-oss-20b', device_map='auto')
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained('openai/gpt-oss-20b', device_map='auto', offload_folder="offload",trust_remote_code=True)


# reward model (in practice: classifier trained on preference) scoring or placeholder
# reward_model = AutoModelForCausalLM.from_pretrained('openai/gpt-oss-20b', device_map='auto')
reward_model = AutoModelForCausalLMWithValueHead.from_pretrained('openai/gpt-oss-20b', device_map='auto', offload_folder="offload",trust_remote_code=True)
value_model = policy_model
# value_model = AutoModelForCausalLMWithValueHead.from_pretrained('openai/gpt-oss-20b', device_map='auto')
# value_model = AutoModelForCausalLM.from_pretrained('openai/gpt-oss-20b', device_map='auto')

for m in [ref_model, reward_model]:
    for param in m.parameters():
        param.requires_grad = False
        
if not hasattr(policy_model, "generation_config"):
    policy_model.generation_config = GenerationConfig(
        max_new_tokens=64,
        do_sample=True,
        top_k=50,
        top_p=0.95
    )

    
trainer = PPOTrainer(
    args=ppo_config,
    model=policy_model,
    value_model=value_model,
    reward_model=reward_model,
    ref_model=ref_model,
    # eval_dataset=,
    # optimizers=,       
    train_dataset=dataset,
    processing_class=processing_class
    # processing_class=tokenizer
    # tokenizer=tokenizer
)

Fetching 40 files:   0%|          | 0/40 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



Fetching 40 files:   0%|          | 0/40 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



Fetching 40 files:   0%|          | 0/40 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



AttributeError: 'AutoModelForCausalLMWithValueHead' object has no attribute 'base_model_prefix'

In [None]:
trainer.train()