<a href="https://colab.research.google.com/github/vadhri/ai-notebook/blob/main/mlops/pistachio-dataset/Experiment-Parameter-Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
%%writefile train.py

import argparse
import wandb
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score
from fastai.vision.all import *

# Defaults for sweep overrides or direct usage
default_config = SimpleNamespace(
    loss_function="log_loss",
    learning_rate="optimal",
    num_iterations=1000,
    log_interval=100,
    wandb_project="pistachio-classification",
    wandb_run_name="sgd_classifier_run",
    use_wandb=True
)

def parse_args():
    """Override default arguments"""
    parser = argparse.ArgumentParser(description="SGD classifier training script")
    parser.add_argument("--loss_function", type=str, default=default_config.loss_function)
    parser.add_argument("--learning_rate", type=str, default=default_config.learning_rate)
    parser.add_argument("--num_iterations", type=int, default=default_config.num_iterations)
    parser.add_argument("--log_interval", type=int, default=default_config.log_interval)
    parser.add_argument("--wandb_project", type=str, default=default_config.wandb_project)
    parser.add_argument("--wandb_run_name", type=str, default=default_config.wandb_run_name)
    parser.add_argument("--use_wandb", type=bool, default=default_config.use_wandb)
    args = parser.parse_args()
    vars(default_config).update(vars(args))
    return

def load_data():
  # python funcntion to check if direectory exists
    import os
    if not os.path.exists('/content/artifacts/pistachio-image-classification-data-split:v0/data_split.csv'):
      processed_data_at = wandb.use_artifact(f'pistachio-image-classification-data-split:latest')
      processed_dataset_dir = Path(processed_data_at.download())

    data_path = '/content/artifacts/pistachio-image-classification-data-split:v0/data_split.csv'
    df = pd.read_csv(data_path)

    X = df.drop(['Class', 'Stage'], axis=1)
    y = df['Class']

    X_train = X[df['Stage'] == 'train']
    y_train = y[df['Stage'] == 'train']
    X_val = X[df['Stage'] == 'val']
    y_val = y[df['Stage'] == 'val']
    X_test = X[df['Stage'] == 'test']
    y_test = y[df['Stage'] == 'test']

    return X_train, y_train, X_val, y_val, X_test, y_test

def run_experiment(config):
    run = None
    if config.use_wandb:
        run = wandb.init(project=config.wandb_project, name=config.wandb_run_name)

    X_train, y_train, X_val, y_val, X_test, y_test = load_data()

    le = LabelEncoder()
    y_train_enc = le.fit_transform(y_train)
    y_val_enc = le.transform(y_val)
    y_test_enc = le.transform(y_test)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    classes = np.unique(y_train_enc)
    class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train_enc)
    class_weight_dict = dict(zip(classes, class_weights))

    model = SGDClassifier(loss=config.loss_function, learning_rate=config.learning_rate, random_state=42, eta0=0.0001)

    results = []
    for i in range(config.num_iterations):
        model.partial_fit(X_train_scaled, y_train_enc, classes=classes)
        train_acc = accuracy_score(y_train_enc, model.predict(X_train_scaled))
        test_acc = accuracy_score(y_test_enc, model.predict(X_test_scaled))

        if i % config.log_interval == 0:
            print(f"Iteration {i+1}: Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}")

        if config.use_wandb:
            wandb.log({"iteration": i + 1, "train_accuracy": train_acc, "test_accuracy": test_acc})

        results.append({"iteration": i + 1, "train_accuracy": train_acc, "test_accuracy": test_acc})

    val_acc = accuracy_score(y_val_enc, model.predict(X_val_scaled))
    print(f"Final Validation Accuracy: {val_acc:.4f}")

    if config.use_wandb:
        wandb.log({"final_val_accuracy": val_acc})
        result_table = wandb.Table(columns=["iteration", "train_accuracy", "test_accuracy"])
        for res in results:
            result_table.add_data(res["iteration"], res["train_accuracy"], res["test_accuracy"])
        wandb.log({"training_results": result_table})
        # run.finish() # already commented out intentionally

    return {
        "model": model,
        "scaler": scaler,
        "label_encoder": le,
        "val_accuracy": val_acc,
        "training_log": results
    }

if __name__ == '__main__':
    parse_args()
    run_experiment(default_config)


Overwriting train.py


In [7]:
!python train.py --learning_rate=adaptive --loss_function=modified_huber --num_iterations=500 --wandb_project "pistachio-classification" --wandb_run_name "sgd_classifier_run"


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mvadhri-venkat[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.19.9
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/wandb/run-20250413_123307-hi71ff2v[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33msgd_classifier_run[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/vadhri-venkat/pistachio-classification[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/vadhri-venkat/pistachio-classification/runs/hi71ff2v[0m
[34m[1mwandb[0m: Downloading large artifact pistachio-image-classification-data-split:latest, 233.35MB. 4190 files... 
[34m[1mwandb[0m:   4190 of 4190 files downloaded.  
Done. 0:0:37.9
Iteration 1

In [None]:
%%writefile sweep.yaml
program: train.py

# Method can be grid, random or bayes
method: random

# Project this sweep is part of
project: pistachio-classification

# Metric to optimize
metric:
  name: final_val_accuracy
  goal: maximize

parameters:
  num_iterations:
    values: [1000, 1500, 2500]

  loss:
    values: ["log_loss", "hinge", "modified_huber"]

  learning_rate:
    values: ["constant", "optimal", "invscaling", "adaptive"]


In [9]:
!wandb login
!wandb sweep ./sweep.yaml

[34m[1mwandb[0m: Currently logged in as: [33mvadhri-venkat[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Creating sweep from: ./sweep.yaml
[34m[1mwandb[0m: Creating sweep with ID: [33mnieh006f[0m
[34m[1mwandb[0m: View sweep at: [34m[4mhttps://wandb.ai/vadhri-venkat/pistachio-classification/sweeps/nieh006f[0m
[34m[1mwandb[0m: Run sweep agent with: [33mwandb agent vadhri-venkat/pistachio-classification/nieh006f[0m


In [None]:
!wandb agent vadhri-venkat/pistachio-classification/arii9zn3

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[34m[1mwandb[0m:     train_accuracy ▃▁▃▅▆▆▆▇▆▇█▇▇▇▇▇▇███▇▇▇█████████████████
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m: final_val_accuracy 0.84186
[34m[1mwandb[0m:          iteration 1500
[34m[1mwandb[0m:      test_accuracy 0.88372
[34m[1mwandb[0m:     train_accuracy 0.87485
[34m[1mwandb[0m: 
[34m[1mwandb[0m: 🚀 View run [33msgd_classifier_run[0m at: [34m[4mhttps://wandb.ai/vadhri-venkat/pistachio-classification/runs/bfg87oog[0m
[34m[1mwandb[0m: ⭐️ View project at: [34m[4mhttps://wandb.ai/vadhri-venkat/pistachio-classification[0m
[34m[1mwandb[0m: Synced 5 W&B file(s), 1 media file(s), 1 artifact file(s) and 0 other file(s)
[34m[1mwandb[0m: Find logs at: [35m[1m./wandb/run-20250413_134629-bfg87oog/logs[0m
2025-04-13 13:46:44,800 - wandb.wandb_agent - INFO - Cleaning up finished run: bfg87oog
2025-04-13 13:46:45,004 - wandb.wandb_agent - INFO - Agent receiv