In [1]:
%%capture
# Install latest transformers for Gemma 3N
# %%capture hides pip's output for this cell. Both installs pass --no-deps, so
# pip installs only the named package and leaves its dependencies untouched.
# NOTE(review): transformers is installed from the repository's default branch
# with no commit or tag pinned, so a fresh run may pick up a different build —
# consider pinning a commit/release for reproducibility.
!pip install --no-deps git+https://github.com/huggingface/transformers.git # Only for Gemma 3N
!pip install --no-deps --upgrade timm # Only for Gemma 3N

In [2]:
%%capture
# ==============================================================================
# CELL 1: Install all necessary libraries (Same as Colab)
# ==============================================================================
# %%capture hides pip's output for this cell.
# NOTE(review): apart from xformers and the lower bound on datasets, nothing
# here is version-pinned — results may differ between fresh environments.

# Experiment tracking (wandb) and evaluation tracing (weave).
!pip install wandb -qU
!pip install weave
# Installed with --no-deps so pip does not touch their dependencies.
!pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3  trl triton cut_cross_entropy 
!pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
!pip install -U peft
# Unsloth and unsloth-zoo are force-reinstalled from their git default branches.
!pip install --force-reinstall --no-deps git+https://github.com/unslothai/unsloth-zoo.git
!pip install --force-reinstall --no-deps git+https://github.com/unslothai/unsloth.git

In [3]:
# ==============================================================================
# CELL 2: Login to Weights & Biases
# ==============================================================================
import wandb
from kaggle_secrets import UserSecretsClient

# --- PRE-REQUISITE ---
# 1. Add your W&B API key as a secret in Kaggle with the label "wandb_api_key".
# 2. This keeps your key secure and private.
# ---------------------

# Best-effort login: read the key from Kaggle secrets and hand it to wandb.login().
# Any exception is reported with print() rather than re-raised, so this cell
# always runs to completion.
# NOTE(review): later cells call wandb.sweep() / wandb.agent(); presumably they
# will fail if this login did not succeed — confirm before relying on it.
try:
    user_secrets = UserSecretsClient()
    wandb_api_key = user_secrets.get_secret("wandb_api_key")
    wandb.login(key=wandb_api_key)
    print("✅ Successfully logged into Weights & Biases.")
except Exception as e:
    print("Could not log into W&B. Please ensure the 'wandb_api_key' secret is set in your Kaggle notebook.")
    print(f"Error: {e}")

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjdmasciano2[0m ([33mjdmasciano2-university-of-lagos[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


✅ Successfully logged into Weights & Biases.


In [4]:

# ==============================================================================
# CELL 3: Copy Datasets to Working Directory (No Changes)
# ==============================================================================
from datasets import Dataset, Image as HFImage
from pathlib import Path
import os

# --- Copy the Kaggle input datasets into the working directory ---
import shutil


def _copy_dataset(source_dir, target_dir):
    """Copy `source_dir` to `target_dir` unless `target_dir` already exists.

    Unlike a shell `cp`, a failed copy raises instead of being reported as
    complete, and a partially written target is removed so that the next run
    retries the copy rather than treating the leftovers as a finished copy.

    Args:
        source_dir: Directory to copy from.
        target_dir: Destination directory; must not exist for a copy to happen.

    Raises:
        Whatever `shutil.copytree` raises (e.g. FileNotFoundError when
        `source_dir` is missing).
    """
    if os.path.exists(target_dir):
        print(f"✅ Data already copied to {target_dir}")
        return

    print(f"Copying data from {source_dir} to {target_dir} for faster access...")
    try:
        shutil.copytree(source_dir, target_dir)
    except Exception:
        # Do not leave a half-copied directory behind: the existence check
        # above would otherwise skip the copy on the next run.
        shutil.rmtree(target_dir, ignore_errors=True)
        raise
    print("✅ Data copy complete.")


# Training images.
source_path = "/kaggle/input/maize-dataset/"
local_path = "/kaggle/working/local_datasets/"
_copy_dataset(source_path, local_path)

# Validation images.
source_path2 = "/kaggle/input/aura-mind-maize-validation/"
local_path2 = "/kaggle/working/validation_datasets/"
_copy_dataset(source_path2, local_path2)

Copying data from /kaggle/input/maize-dataset/ to /kaggle/working/local_datasets/ for faster access...
✅ Data copy complete.
Copying data from /kaggle/input/aura-mind-maize-validation/ to /kaggle/working/validation_datasets/ for faster access...
✅ Data copy complete.


In [5]:
# ==============================================================================
# CELL 4: Prepare the Dataset as a Python List (FINAL CORRECTED VERSION)
# ==============================================================================
from pathlib import Path
from PIL import Image
from tqdm import tqdm

# --- DEFINE THE FUNCTION FIRST ---
def create_conversation_dict(image_path, class_name):
    """Build one chat-formatted training sample for a single image.

    Args:
        image_path: Path of the image file to load.
        class_name: Key looked up in the module-level CLASS_NAME_MAPPING; keys
            that are not present fall back to "Unknown Maize Condition".

    Returns:
        A dict with a "messages" list holding two turns: a user turn (the
        classification prompt plus the RGB PIL image) and an assistant turn
        of the form "This is a <display name>.".
    """
    display_name = CLASS_NAME_MAPPING.get(class_name, "Unknown Maize Condition")

    # convert() returns a new image, so the file can be closed as soon as the
    # conversion is done instead of waiting for garbage collection.
    with Image.open(image_path) as source_image:
        pil_image = source_image.convert("RGB")

    return {
        "messages": [
            { "role": "user",
              "content": [
                {"type": "text", "text": "Classify the condition of this maize plant. Choose from: Healthy Maize Plant, Maize Phosphorus Deficiency."},
                # The PIL Image object goes directly here
                {"type": "image", "image": pil_image}
              ]
            },
            { "role": "assistant",
              "content": [
                {"type": "text", "text": f"This is a {display_name}."}
              ]
            },
        ]
    }

# --- THEN, DEFINE YOUR MAPPING ---
# Maps a class folder name to the label text used in the assistant reply.
CLASS_NAME_MAPPING = {
    "maize_healthy": "Healthy Maize Plant",
    "phosphorus_deficiency": "Maize Phosphorus Deficiency",
}

# --- FINALLY, RUN THE WORKFLOW ---

# 1. Collect the image files (pathlib.Path objects) below the dataset directory.
#    Sorted so the sample order does not depend on filesystem listing order.
dataset_path = Path("/kaggle/working/local_datasets/")
image_paths = sorted(list(dataset_path.glob("**/*.jpg")) + list(dataset_path.glob("**/*.jpeg")))
print(f"Found {len(image_paths)} images.")
if not image_paths:
    # Fail here with a clear message instead of an IndexError further down.
    raise FileNotFoundError(f"No .jpg/.jpeg images found under {dataset_path}")

# 2. Loop through the paths and create the final Python list directly
print("Creating the final dataset list...")
final_dataset_list = []
# Use tqdm for a progress bar
for path in tqdm(image_paths, desc="Processing images"):
    # The image's parent folder name is used as its class key.
    class_folder_name = path.parent.name
    final_dataset_list.append(create_conversation_dict(path, class_folder_name))

print("\n✅ Dataset preparation complete!")
print("\nExample of the final data format:")
# We print the structure to confirm the PIL Image object is now inside
print(final_dataset_list[0])

Found 176 images.
Creating the final dataset list...


Processing images: 100%|██████████| 176/176 [00:10<00:00, 17.34it/s]


✅ Dataset preparation complete!

Example of the final data format:
{'messages': [{'role': 'user', 'content': [{'type': 'text', 'text': 'Classify the condition of this maize plant. Choose from: Healthy Maize Plant, Maize Phosphorus Deficiency.'}, {'type': 'image', 'image': <PIL.Image.Image image mode=RGB size=1080x1920 at 0x78E9B8572E10>}]}, {'role': 'assistant', 'content': [{'type': 'text', 'text': 'This is a Healthy Maize Plant.'}]}]}





In [6]:
# ==============================================================================
# CELL 5: Define the W&B Sweep Configuration
# ==============================================================================
import yaml

# Search space for the W&B sweep: a search method, the metric to minimize,
# and the hyperparameters (with their candidate values / ranges) to explore.
sweep_config = dict(
    method='bayes',
    # Metric key as logged by the trainer during training.
    metric=dict(name='train/loss', goal='minimize'),
    parameters=dict(
        learning_rate=dict(distribution='log_uniform_values', min=5e-6, max=5e-4),
        num_train_epochs=dict(values=[10, 15, 18, 20]),
        lora_r=dict(values=[8, 16, 32]),
        # Multiplied by lora_r in train() to obtain lora_alpha.
        lora_alpha_multiplier=dict(values=[1, 2]),
    ),
)

print("Sweep Configuration:")
print(yaml.dump(sweep_config))

# Register the sweep with W&B and keep its id for the agent.
sweep_id = wandb.sweep(sweep_config, project="mc-s-and-e")

Sweep Configuration:
method: bayes
metric:
  goal: minimize
  name: train/loss
parameters:
  learning_rate:
    distribution: log_uniform_values
    max: 0.0005
    min: 5.0e-06
  lora_alpha_multiplier:
    values:
    - 1
    - 2
  lora_r:
    values:
    - 8
    - 16
    - 32
  num_train_epochs:
    values:
    - 10
    - 15
    - 18
    - 20

Create sweep with ID: amrl62cp
Sweep URL: https://wandb.ai/jdmasciano2-university-of-lagos/mc-s-and-e/sweeps/amrl62cp


In [7]:
# ==============================================================================
# CELL 6: EVALUATION FRAMEWORK (ULTIMATE SIMPLICITY VERSION)
# ==============================================================================
import weave
from PIL import Image
from pathlib import Path
from tqdm import tqdm
import torch
import transformers

# --- Build the evaluation examples from the validation images ---
# Maps a class folder name to the full target sentence expected from the model.
# NOTE(review): this rebinds CLASS_NAME_MAPPING, the same global that
# create_conversation_dict() reads, with differently shaped values — calling
# that function after this cell has run would produce different label text.
CLASS_NAME_MAPPING = {
    "maize_healthy": "This is a Healthy Maize Plant.",
    "phosphorus_deficiency": "This is a Maize Phosphorus Deficiency.",
}
print("Building evaluation dataset from validation files...")
validation_data_path = Path("/kaggle/working/validation_datasets/")
eval_image_paths = [
    *validation_data_path.glob("**/*.jpg"),
    *validation_data_path.glob("**/*.jpeg"),
]
eval_dataset = []
for path in tqdm(eval_image_paths, desc="Processing validation images"):
    class_folder_name = path.parent.name
    target_label = CLASS_NAME_MAPPING.get(class_folder_name)
    if not target_label:
        # Images whose parent folder is not a known class are left out.
        continue
    eval_dataset.append(
        dict(
            image_path=str(path),
            question="Classify the condition of this maize plant. Choose from: Healthy Maize Plant, Maize Phosphorus Deficiency.",
            target=target_label,
        )
    )
print(f"✅ Created an evaluation dataset with {len(eval_dataset)} examples.")

# --- Define the Weave Model (WITHOUT TYPE HINTS) ---
class MaizeExpertModel(weave.Model):
    """Weave model that answers a question about one image using `model` and `processor`."""

    # Deliberately loose annotations (the builtin `any`, not a concrete class):
    # per the original author's note, strict type hints triggered a Pydantic
    # validation error, so the fields rely on duck typing instead.
    model: any
    processor: any

    @weave.op()
    @torch.inference_mode()
    def predict(self, image_path: str, question: str) -> dict:
        """Generate an answer for `question` about the image at `image_path`.

        Returns:
            {"generated_text": <answer>}, where the answer is the text after
            the last "model\\n" marker in the decoded output, or
            "Could not parse." when that marker is absent.
        """
        rgb_image = Image.open(image_path).convert("RGB")
        user_turn = {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "image", "image": rgb_image},
            ],
        }
        chat_prompt = self.processor.tokenizer.apply_chat_template(
            [user_turn], tokenize=False, add_generation_prompt=True
        )
        model_inputs = self.processor(
            text=chat_prompt, images=rgb_image, return_tensors="pt"
        ).to(self.model.device)
        generated = self.model.generate(**model_inputs, max_new_tokens=20, use_cache=True)
        decoded = self.processor.batch_decode(generated, skip_special_tokens=True)[0]

        # Keep only what follows the last "model\n" marker in the decoded text.
        marker = "model\n"
        marker_pos = decoded.rfind(marker)
        if marker_pos == -1:
            return {"generated_text": "Could not parse."}
        return {"generated_text": decoded[marker_pos + len(marker):].strip()}

# --- 4. Define the Intelligent Scorer ---
@weave.op()
def calculate_accuracy(target: str, output: dict) -> dict:
    """
    Score one prediction by keyword matching, robust to phrasing changes.

    The target's class keyword ("healthy" or "phosphorus") is derived from
    `target`. A prediction is correct when it contains "maize" and the target
    keyword as a whole word, and does not also name the other class.

    Args:
        target: Expected answer, e.g. "This is a Healthy Maize Plant.".
        output: Model output dict with a "generated_text" entry. A missing or
            non-dict output, or a missing/None text, is scored as incorrect.

    Returns:
        {"accuracy": 1} for a correct prediction, otherwise {"accuracy": 0}.
    """
    import re  # local import keeps this block self-contained

    class_keywords = ("healthy", "phosphorus")

    # Tolerate a missing or malformed output instead of raising.
    prediction = ""
    if isinstance(output, dict):
        prediction = str(output.get("generated_text") or "").lower()

    # Ex: "This is a Healthy Maize Plant." -> "healthy"
    # Ex: "This is a Maize Phosphorus Deficiency." -> "phosphorus"
    target_lower = target.lower()
    target_keyword = next((kw for kw in class_keywords if kw in target_lower), None)
    if target_keyword is None:
        # Without a recognised class there is nothing to match; an empty
        # keyword would otherwise be "found" in every prediction.
        return {"accuracy": 0}

    def mentions(keyword: str) -> bool:
        # Whole-word match, so that "unhealthy" does not count as "healthy".
        return re.search(rf"\b{re.escape(keyword)}\b", prediction) is not None

    # A prediction naming both classes (e.g. echoing the prompt's option list)
    # has not made a choice and is scored as incorrect.
    names_other_class = any(mentions(kw) for kw in class_keywords if kw != target_keyword)

    is_correct = "maize" in prediction and mentions(target_keyword) and not names_other_class
    return {"accuracy": 1 if is_correct else 0}

# --- 5. Define the Async Evaluation Wrapper ---
async def evaluate_and_log(model, processor, eval_dataset):
    """Evaluate `model` on `eval_dataset` with Weave and print the results.

    Wraps the model and processor in a MaizeExpertModel and scores its
    predictions with calculate_accuracy.

    Args:
        model: Model object passed through to MaizeExpertModel.
        processor: Processor object passed through to MaizeExpertModel.
        eval_dataset: Examples handed to weave.Evaluation as its dataset.

    Returns:
        The results object produced by `evaluation.evaluate(...)`, so callers
        can use the scores instead of only seeing them printed.
    """
    print("\n🔬 Starting evaluation...")
    eval_model = MaizeExpertModel(model=model, processor=processor)
    evaluation = weave.Evaluation(dataset=eval_dataset, scorers=[calculate_accuracy])
    results = await evaluation.evaluate(eval_model)
    print(f"✅ Evaluation complete. Full results object: {results}")
    return results

print("✅ W&B Weave evaluation components are updated and ready.")

error: XDG_RUNTIME_DIR not set in the environment.
ALSA lib confmisc.c:855:(parse_card) cannot find card '0'
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_card_inum returned error: No such file or directory
ALSA lib confmisc.c:422:(snd_func_concat) error evaluating strings
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory
ALSA lib confmisc.c:1334:(snd_func_refer) error evaluating name
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory
ALSA lib conf.c:5701:(snd_config_expand) Evaluate error: No such file or directory
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM default
ALSA lib confmisc.c:855:(parse_card) cannot find card '0'
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_card_inum returned error: No such file or directory
ALSA lib confmisc.c:422:(snd_func_concat) error evaluating strings
ALSA lib conf.c:5178:(_snd_config_evalu

Building evaluation dataset from validation files...


Processing validation images: 100%|██████████| 21/21 [00:00<00:00, 118387.61it/s]

✅ Created an evaluation dataset with 21 examples.
✅ W&B Weave evaluation components are updated and ready.





In [8]:
# ==============================================================================
# CELL 7: Create the Main Training Function for the W&B Agent (CORRECTED)
# ==============================================================================
from unsloth import FastVisionModel, FastModel
from transformers import AutoProcessor
from trl import SFTTrainer, SFTConfig
from unsloth.trainer import UnslothVisionDataCollator
import torch
import gc # Import the garbage collector module
import asyncio 


def train():
    """
    Run one sweep trial; called by the W&B agent.

    Reads the hyperparameters from `wandb.config`, loads the 4-bit base model,
    attaches LoRA adapters, fine-tunes on `final_dataset_list`, saves the
    adapters and logs them as a W&B artifact, then evaluates on `eval_dataset`.

    The `try...finally` block guarantees that the run is closed and GPU memory
    is released whether the trial completes or fails. A failed trial is closed
    with a non-zero exit code so it is not recorded as a successful run; the
    original exception still propagates to the caller.
    """
    # Pre-bind every name the cleanup touches so `finally` is safe even when
    # an early step raises.
    model, tokenizer, processor, trainer, run = None, None, None, None, None
    succeeded = False

    try:
        run = wandb.init()

        WANDB_CONFIG = wandb.config
        lora_r_value = WANDB_CONFIG.lora_r
        learning_rate_value = WANDB_CONFIG.learning_rate
        epochs_value = WANDB_CONFIG.num_train_epochs
        # lora_alpha is swept as a multiple of the rank.
        lora_alpha_value = lora_r_value * WANDB_CONFIG.lora_alpha_multiplier

        MODEL_NAME = "unsloth/gemma-3n-E2B-it-unsloth-bnb-4bit"

        print("--- New W&B Run ---")
        print(f"Parameters: LR={learning_rate_value}, Epochs={epochs_value}, LoRA r={lora_r_value}, LoRA alpha={lora_alpha_value}")

        model, tokenizer = FastVisionModel.from_pretrained(
            model_name=MODEL_NAME,
            max_seq_length=2048, # max_seq_length is set here, not in SFTConfig
            dtype=None,
            load_in_4bit=True,
            # Place the whole model on the current CUDA device.
            device_map = {"": torch.cuda.current_device()}

        )
        processor = AutoProcessor.from_pretrained(MODEL_NAME)
        print("✅ Base model, tokenizer, and processor loaded.")

        model = FastVisionModel.get_peft_model(
            model,
            r=lora_r_value,
            lora_alpha=lora_alpha_value,
            finetune_vision_layers=True,
            finetune_language_layers=True,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        )
        print("✅ PEFT adapters added.")

        FastModel.for_training(model)
        trainer = SFTTrainer(
            model=model,
            train_dataset=final_dataset_list,
            # No processing_class is passed; the collator receives the
            # processor directly.
            data_collator=UnslothVisionDataCollator(model, processor=processor),
            args=SFTConfig(
                output_dir=f"./outputs_{run.name}",
                report_to="wandb",
                num_train_epochs=epochs_value,
                learning_rate=learning_rate_value,
                per_device_train_batch_size=2,
                gradient_accumulation_steps=4,
                gradient_checkpointing=False,
                remove_unused_columns=False,
                dataset_text_field="",
                dataset_kwargs={"skip_prepare_dataset": True},
                warmup_ratio=0.1,
                optim="adamw_torch_fused",
                save_strategy="no",
                seed=3407,
            ),
        )

        print(f"\n🔥 Starting training run: {run.name}...")
        trainer.train()
        print("✅ Training complete!")

        output_save_dir = f"/kaggle/working/maize_expert_adapters_{run.name}"
        model.save_pretrained(output_save_dir)
        tokenizer.save_pretrained(output_save_dir)
        print(f"✅ Model adapters saved to {output_save_dir}")

        artifact = wandb.Artifact(f'maize-adapters-{run.name}', type='model')
        artifact.add_dir(output_save_dir)
        run.log_artifact(artifact)
        print("✅ Adapters logged as a W&B Artifact.")

        # Switch out of training mode before generating predictions.
        FastVisionModel.for_inference(model)

        # evaluate_and_log is a coroutine; asyncio.run() drives it to
        # completion from this synchronous function.
        asyncio.run(evaluate_and_log(model, processor, eval_dataset))
        succeeded = True

    finally:
        print("\n🧹 Starting cleanup for next run...")
        if run:
            if succeeded:
                run.finish()
            else:
                # Mark the run as failed rather than reporting a clean exit.
                run.finish(exit_code=1)

        # Drop the references held by this frame so that the collector and
        # the CUDA cache flush below can actually release the memory.
        trainer = model = processor = tokenizer = None

        gc.collect()
        torch.cuda.empty_cache()
        print("✅ Memory cleared. Ready for the next agent run.")

# Execute the sweep agent
# Runs `train` for the sweep registered as `sweep_id`; count=5 caps this
# agent at five runs.
wandb.agent(sweep_id, function=train, count=5)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-07-30 14:47:07.070438: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753886827.433836      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753886827.539937      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!


[34m[1mwandb[0m: Agent Starting Run: scxaublf with config:
[34m[1mwandb[0m: 	learning_rate: 8.340839324354132e-06
[34m[1mwandb[0m: 	lora_alpha_multiplier: 2
[34m[1mwandb[0m: 	lora_r: 16
[34m[1mwandb[0m: 	num_train_epochs: 15
[34m[1mwandb[0m: Tracking run with wandb version 0.21.0
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250730_144735-scxaublf[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mcomic-sweep-1[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/jdmasciano2-university-of-lagos/mc-s-and-e[0m
[34m[1mwandb[0m: 🧹 View sweep at [34m[4mhttps://wandb.ai/jdmasciano2-university-of-lagos/mc-s-and-e/sweeps/amrl62cp[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/jdmasciano2-university-of-lagos/mc-s-and-e/runs/scxaublf[0m


--- New W&B Run ---
Parameters: LR=8.340839324354132e-06, Epochs=15, LoRA r=16, LoRA alpha=32
==((====))==  Unsloth 2025.7.11: Fast Gemma3N patching. Transformers: 4.55.0.dev0.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Gemma3N does not support SDPA - switching to eager!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/469M [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/2.65G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/210 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/98.0 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

preprocessor_config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.70M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/98.0 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

preprocessor_config.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.70M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

✅ Base model, tokenizer, and processor loaded.
Unsloth: Making `model.base_model.model.model.language_model` require gradients
✅ PEFT adapters added.
Unsloth: Model does not have a default image size - using 512

🔥 Starting training run: comic-sweep-1...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 176 | Num Epochs = 15 | Total steps = 165
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 22,904,832 of 5,462,343,104 (0.42% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,12.7373
2,12.8063
3,12.792
4,12.7929
5,12.8121
6,12.7731
7,12.8355
8,12.8667
9,12.8016
10,12.8448


✅ Training complete!


[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/maize_expert_adapters_comic-sweep-1)... 

✅ Model adapters saved to /kaggle/working/maize_expert_adapters_comic-sweep-1


Done. 0.4s


✅ Adapters logged as a W&B Artifact.

🔬 Starting evaluation...
✅ Evaluation complete. Full results object: {'calculate_accuracy': {'accuracy': {'mean': 1.0}}, 'model_latency': {'mean': 92.96465700013297}}

🧹 Starting cleanup for next run...


[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:         train/epoch ▁▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇████
[34m[1mwandb[0m:   train/global_step ▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇█████
[34m[1mwandb[0m:     train/grad_norm      █▅▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: train/learning_rate ▂▂▇███▇▇▇▆▆▆▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▁▁▁
[34m[1mwandb[0m:          train/loss █████▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:               total_flos 1.374190524647424e+16
[34m[1mwandb[0m:              train/epoch 15
[34m[1mwandb[0m:        train/global_step 165
[34m[1mwandb[0m:          train/grad_norm 3.27019
[34m[1mwandb[0m:      train/learning_rate 0.0
[34m[1mwandb[0m:               train/loss 0.0003
[34m[1mwandb[0m:               train_loss 1.40899
[34m[1mwandb[

✅ Memory cleared. Ready for the next agent run.


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: dr770umn with config:
[34m[1mwandb[0m: 	learning_rate: 1.5707354395781175e-05
[34m[1mwandb[0m: 	lora_alpha_multiplier: 2
[34m[1mwandb[0m: 	lora_r: 32
[34m[1mwandb[0m: 	num_train_epochs: 20
[34m[1mwandb[0m: Tracking run with wandb version 0.21.0
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250730_154051-dr770umn[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mstilted-sweep-2[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/jdmasciano2-university-of-lagos/mc-s-and-e[0m
[34m[1mwandb[0m: 🧹 View sweep at [34m[4mhttps://wandb.ai/jdmasciano2-university-of-lagos/mc-s-and-e/sweeps/amrl62cp[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/jdmasciano2-university-of-lagos/mc-s-and-e/runs/dr770umn[0m


--- New W&B Run ---
Parameters: LR=1.5707354395781175e-05, Epochs=20, LoRA r=32, LoRA alpha=64
==((====))==  Unsloth 2025.7.11: Fast Gemma3N patching. Transformers: 4.55.0.dev0.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Gemma3N does not support SDPA - switching to eager!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

✅ Base model, tokenizer, and processor loaded.
Unsloth: Making `model.base_model.model.model.language_model` require gradients
✅ PEFT adapters added.
Unsloth: Model does not have a default image size - using 512

🔥 Starting training run: stilted-sweep-2...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 176 | Num Epochs = 20 | Total steps = 220
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 45,809,664 of 5,485,247,936 (0.84% trained)


Step,Training Loss
1,12.7356
2,12.8063
3,12.792
4,12.7929
5,12.8121
6,12.7731
7,12.8355
8,12.8667
9,12.8016
10,12.8448


✅ Training complete!


[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/maize_expert_adapters_stilted-sweep-2)... 

✅ Model adapters saved to /kaggle/working/maize_expert_adapters_stilted-sweep-2


Done. 0.8s


✅ Adapters logged as a W&B Artifact.

🔬 Starting evaluation...
✅ Evaluation complete. Full results object: {'calculate_accuracy': {'accuracy': {'mean': 1.0}}, 'model_latency': {'mean': 67.47047442481632}}

🧹 Starting cleanup for next run...


[34m[1mwandb[0m: uploading data
[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:         train/epoch ▁▁▁▁▁▂▂▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇██
[34m[1mwandb[0m:   train/global_step ▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇█████
[34m[1mwandb[0m:     train/grad_norm      █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: train/learning_rate ▁▂▅▅▆████▇▇▆▆▆▆▅▅▅▅▅▅▅▅▅▅▄▄▄▄▃▂▂▂▂▂▁▁▁▁▁
[34m[1mwandb[0m:          train/loss ██████▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:               total_flos 1.846669784408064e+16
[34m[1mwandb[0m:              train/epoch 20
[34m[1mwandb[0m:        train/global_step 220
[34m[1mwandb[0m:          train/grad_norm 0.00028
[34m[1mwandb[0m:      train/learning_rate 0.0
[34m[1mwandb[0m:               train/loss 0.0001
[34m[1mwandb[0m:               

✅ Memory cleared. Ready for the next agent run.


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: y9u8bqtm with config:
[34m[1mwandb[0m: 	learning_rate: 1.1586228218033573e-05
[34m[1mwandb[0m: 	lora_alpha_multiplier: 2
[34m[1mwandb[0m: 	lora_r: 32
[34m[1mwandb[0m: 	num_train_epochs: 20
[34m[1mwandb[0m: Tracking run with wandb version 0.21.0
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250730_164712-y9u8bqtm[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33munique-sweep-3[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/jdmasciano2-university-of-lagos/mc-s-and-e[0m
[34m[1mwandb[0m: 🧹 View sweep at [34m[4mhttps://wandb.ai/jdmasciano2-university-of-lagos/mc-s-and-e/sweeps/amrl62cp[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/jdmasciano2-university-of-lagos/mc-s-and-e/runs/y9u8bqtm[0m


--- New W&B Run ---
Parameters: LR=1.1586228218033573e-05, Epochs=20, LoRA r=32, LoRA alpha=64
==((====))==  Unsloth 2025.7.11: Fast Gemma3N patching. Transformers: 4.55.0.dev0.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Gemma3N does not support SDPA - switching to eager!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

✅ Base model, tokenizer, and processor loaded.
Unsloth: Making `model.base_model.model.model.language_model` require gradients
✅ PEFT adapters added.
Unsloth: Model does not have a default image size - using 512

🔥 Starting training run: unique-sweep-3...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 176 | Num Epochs = 20 | Total steps = 220
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 45,809,664 of 5,485,247,936 (0.84% trained)


Step,Training Loss
1,12.735
2,12.8068
3,12.7921
4,12.7928
5,12.8118
6,12.7733
7,12.8337
8,12.8659
9,12.7994
10,12.8437


✅ Training complete!


[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/maize_expert_adapters_unique-sweep-3)... 

✅ Model adapters saved to /kaggle/working/maize_expert_adapters_unique-sweep-3


Done. 0.7s


✅ Adapters logged as a W&B Artifact.

🔬 Starting evaluation...
✅ Evaluation complete. Full results object: {'calculate_accuracy': {'accuracy': {'mean': 1.0}}, 'model_latency': {'mean': 7.0068366300492055}}

🧹 Starting cleanup for next run...


[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:         train/epoch ▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
[34m[1mwandb[0m:   train/global_step ▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▃▄▄▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇██
[34m[1mwandb[0m:     train/grad_norm     █▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: train/learning_rate ▃▅▆▇█▇▇▇▇▇▆▆▆▅▅▅▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▂▂▂▂▁▁▁▁▁
[34m[1mwandb[0m:          train/loss ███▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:               total_flos 1.846669784408064e+16
[34m[1mwandb[0m:              train/epoch 20
[34m[1mwandb[0m:        train/global_step 220
[34m[1mwandb[0m:          train/grad_norm 0.12241
[34m[1mwandb[0m:      train/learning_rate 0.0
[34m[1mwandb[0m:               train/loss 0.0001
[34m[1mwandb[0m:               train_loss 1.04808
[34m[1mwandb[

✅ Memory cleared. Ready for the next agent run.


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: m9qc9xf2 with config:
[34m[1mwandb[0m: 	learning_rate: 0.00010701553734896272
[34m[1mwandb[0m: 	lora_alpha_multiplier: 2
[34m[1mwandb[0m: 	lora_r: 32
[34m[1mwandb[0m: 	num_train_epochs: 20
[34m[1mwandb[0m: Tracking run with wandb version 0.21.0
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250730_175453-m9qc9xf2[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mvague-sweep-4[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/jdmasciano2-university-of-lagos/mc-s-and-e[0m
[34m[1mwandb[0m: 🧹 View sweep at [34m[4mhttps://wandb.ai/jdmasciano2-university-of-lagos/mc-s-and-e/sweeps/amrl62cp[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/jdmasciano2-university-of-lagos/mc-s-and-e/runs/m9qc9xf2[0m


--- New W&B Run ---
Parameters: LR=0.00010701553734896272, Epochs=20, LoRA r=32, LoRA alpha=64
==((====))==  Unsloth 2025.7.11: Fast Gemma3N patching. Transformers: 4.55.0.dev0.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Gemma3N does not support SDPA - switching to eager!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

✅ Base model, tokenizer, and processor loaded.
Unsloth: Making `model.base_model.model.model.language_model` require gradients
✅ PEFT adapters added.
Unsloth: Model does not have a default image size - using 512

🔥 Starting training run: vague-sweep-4...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 176 | Num Epochs = 20 | Total steps = 220
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 45,809,664 of 5,485,247,936 (0.84% trained)


Step,Training Loss
1,12.735
2,12.8068
3,12.7921
4,12.7928
5,12.8118
6,12.7733
7,12.8337
8,12.8659
9,12.7994
10,12.8437


✅ Training complete!


[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/maize_expert_adapters_vague-sweep-4)... 

✅ Model adapters saved to /kaggle/working/maize_expert_adapters_vague-sweep-4


Done. 0.7s


✅ Adapters logged as a W&B Artifact.

🔬 Starting evaluation...


Caching is incompatible with gradient checkpointing in Gemma3nTextDecoderLayer. Setting `use_cache=False`, `past_key_value=None`.
Caching is incompatible with gradient checkpointing in Gemma3nTextDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in Gemma3nTextDecoderLayer. Setting `use_cache=False`, `past_key_value=None`.
Caching is incompatible with gradient checkpointing in Gemma3nTextDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in Gemma3nTextDecoderLayer. Setting `use_cache=False`, `past_key_value=None`.
Caching is incompatible with gradient checkpointing in Gemma3nTextDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradient checkpointing in Gemma3nTextDecoderLayer. Setting `use_cache=False`, `past_key_value=None`.
Caching is incompatible with gradient checkpointing in Gemma3nTextDecoderLayer. Setting `past_key_value=None`.
Caching is incompatible with gradien

✅ Evaluation complete. Full results object: {'calculate_accuracy': {'accuracy': {'mean': 1.0}}, 'model_latency': {'mean': 9.932059639976138}}

🧹 Starting cleanup for next run...


[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:         train/epoch ▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▇▇▇▇▇▇▇▇█████
[34m[1mwandb[0m:   train/global_step ▁▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▅▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇█████
[34m[1mwandb[0m:     train/grad_norm     █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: train/learning_rate ▂▃▃▄▅███▇▇▆▆▆▆▆▆▅▅▅▅▅▅▄▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▁▁
[34m[1mwandb[0m:          train/loss ██████▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:               total_flos 1.846669784408064e+16
[34m[1mwandb[0m:              train/epoch 20
[34m[1mwandb[0m:        train/global_step 220
[34m[1mwandb[0m:          train/grad_norm 0.00032
[34m[1mwandb[0m:      train/learning_rate 0.0
[34m[1mwandb[0m:               train/loss 0.0001
[34m[1mwandb[0m:               train_loss 1.03961
[34m[1mwandb[

✅ Memory cleared. Ready for the next agent run.


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 18w5p7zm with config:
[34m[1mwandb[0m: 	learning_rate: 0.00044524843020146536
[34m[1mwandb[0m: 	lora_alpha_multiplier: 2
[34m[1mwandb[0m: 	lora_r: 32
[34m[1mwandb[0m: 	num_train_epochs: 20
[34m[1mwandb[0m: Tracking run with wandb version 0.21.0
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20250730_190127-18w5p7zm[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mamber-sweep-5[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/jdmasciano2-university-of-lagos/mc-s-and-e[0m
[34m[1mwandb[0m: 🧹 View sweep at [34m[4mhttps://wandb.ai/jdmasciano2-university-of-lagos/mc-s-and-e/sweeps/amrl62cp[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/jdmasciano2-university-of-lagos/mc-s-and-e/runs/18w5p7zm[0m


--- New W&B Run ---
Parameters: LR=0.00044524843020146536, Epochs=20, LoRA r=32, LoRA alpha=64
==((====))==  Unsloth 2025.7.11: Fast Gemma3N patching. Transformers: 4.55.0.dev0.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Gemma3N does not support SDPA - switching to eager!


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

✅ Base model, tokenizer, and processor loaded.
Unsloth: Making `model.base_model.model.model.language_model` require gradients
✅ PEFT adapters added.
Unsloth: Model does not have a default image size - using 512

🔥 Starting training run: amber-sweep-5...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 176 | Num Epochs = 20 | Total steps = 220
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 45,809,664 of 5,485,247,936 (0.84% trained)


Step,Training Loss
1,12.735
2,12.8068
3,12.7921
4,12.7928
5,12.8118
6,12.7733
7,12.8337
8,12.8659
9,12.7994
10,12.8437


✅ Training complete!


[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/maize_expert_adapters_amber-sweep-5)... 

✅ Model adapters saved to /kaggle/working/maize_expert_adapters_amber-sweep-5


Done. 0.7s


✅ Adapters logged as a W&B Artifact.

🔬 Starting evaluation...


Caching is incompatible with gradient checkpointing in Gemma3nTextDecoderLayer. Setting `use_cache=False`, `past_key_value=None`.
Caching is incompatible with gradient checkpointing in Gemma3nTextDecoderLayer. Setting `use_cache=False`, `past_key_value=None`.
Caching is incompatible with gradient checkpointing in Gemma3nTextDecoderLayer. Setting `use_cache=False`, `past_key_value=None`.
Caching is incompatible with gradient checkpointing in Gemma3nTextDecoderLayer. Setting `use_cache=False`, `past_key_value=None`.
Caching is incompatible with gradient checkpointing in Gemma3nTextDecoderLayer. Setting `use_cache=False`, `past_key_value=None`.
Caching is incompatible with gradient checkpointing in Gemma3nTextDecoderLayer. Setting `use_cache=False`, `past_key_value=None`.
Caching is incompatible with gradient checkpointing in Gemma3nTextDecoderLayer. Setting `use_cache=False`, `past_key_value=None`.
Caching is incompatible with gradient checkpointing in Gemma3nTextDecoderLayer. Setting `u

✅ Evaluation complete. Full results object: {'calculate_accuracy': {'accuracy': {'mean': 1.0}}, 'model_latency': {'mean': 13.788020304271154}}

🧹 Starting cleanup for next run...


[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:         train/epoch ▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇██
[34m[1mwandb[0m:   train/global_step ▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇███
[34m[1mwandb[0m:     train/grad_norm    █▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: train/learning_rate ███▇▇▇▇▇▆▆▆▆▆▆▆▅▅▅▅▅▅▅▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▁▁▁
[34m[1mwandb[0m:          train/loss ███▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:               total_flos 1.846669784408064e+16
[34m[1mwandb[0m:              train/epoch 20
[34m[1mwandb[0m:        train/global_step 220
[34m[1mwandb[0m:          train/grad_norm 0.0002
[34m[1mwandb[0m:      train/learning_rate 0.0
[34m[1mwandb[0m:               train/loss 0.0001
[34m[1mwandb[0m:               train_loss 1.042
[34m[1mwandb[0m:

✅ Memory cleared. Ready for the next agent run.
