<a href="https://colab.research.google.com/github/vadhri/ai-notebook/blob/main/mlops/pistachio-dataset/Extended_Taining_options.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The explanation fo the results for the tail end case using random forest is as below.

[Wandb report](https://wandb.ai/vadhri-venkat/pistachio-classification/reports/Parameter-optimization--VmlldzoxMjI2NjY3Ng?accessToken=qdxrp7pn1hf3t5bb027c70hp6owkwqn3s4a8h1e346766q13nm7uwvawsczv1qo8)

In [1]:
%%writefile train.py

import argparse
import wandb
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score
from fastai.vision.all import *

from sklearn.model_selection import StratifiedKFold
import pandas as pd
import wandb
from pathlib import Path

# Defaults for sweep overrides or direct usage
default_config = SimpleNamespace(
    loss_function="log_loss",
    learning_rate="optimal",
    num_iterations=1000,
    log_interval=100,
    wandb_project="pistachio-classification",
    wandb_run_name="data-partioning-model-eval",
    use_wandb=True
)

def parse_args():
    """Override default arguments"""
    parser = argparse.ArgumentParser(description="SGD classifier training script")
    parser.add_argument("--loss_function", type=str, default=default_config.loss_function)
    parser.add_argument("--learning_rate", type=str, default=default_config.learning_rate)
    parser.add_argument("--num_iterations", type=int, default=default_config.num_iterations)
    parser.add_argument("--log_interval", type=int, default=default_config.log_interval)
    parser.add_argument("--wandb_project", type=str, default=default_config.wandb_project)
    parser.add_argument("--wandb_run_name", type=str, default=default_config.wandb_run_name)
    parser.add_argument("--use_wandb", type=bool, default=default_config.use_wandb)
    args = parser.parse_args()
    vars(default_config).update(vars(args))
    return

def load_data(n_splits=2):
# Check if the directory exists and download the processed dataset if not
    import os
    if not os.path.exists('./artifacts/pistachio-image-classification-data-split:v0/data_split.csv'):
        processed_data_at = wandb.use_artifact('pistachio-image-classification-data-split:latest')
        processed_dataset_dir = Path(processed_data_at.download())

    # Load the CSV with the data
    data_path = './artifacts/pistachio-image-classification-data-split:v0/data_split.csv'
    df = pd.read_csv(data_path)

    # Separate features and target variable
    X = df.drop(['Class', 'Stage'], axis=1)
    y = df['Class']

    # Initialize Stratified K-Fold
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Create lists to store each fold's splits
    X_train_list = []
    y_train_list = []
    X_val_list = []
    y_val_list = []
    X_test_list = []
    y_test_list = []

    # Loop through each fold and store the splits
    for train_idx, val_idx in skf.split(X, y):
        # Get the train and validation splits for this fold
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Convert y_train, y_val to NumPy arrays (flatten them if needed)
        y_train_flat = y_train.to_numpy()  # Ensure it's a 1D array
        y_val_flat = y_val.to_numpy()      # Ensure it's a 1D array

        # Test data remains constant, separate from the training/validation data
        X_test = X[df['Stage'] == 'test']
        y_test = y[df['Stage'] == 'test']
        y_test_flat = y_test.to_numpy()    # Ensure it's a 1D array

        # Append the current fold's splits to the corresponding lists as 1D arrays
        X_train_list.extend(X_train.to_numpy())  # Add each fold's X_train directly
        y_train_list.extend(y_train_flat)       # Add each fold's y_train directly
        X_val_list.extend(X_val.to_numpy())      # Add each fold's X_val directly
        y_val_list.extend(y_val_flat)           # Add each fold's y_val directly
        X_test_list.extend(X_test.to_numpy())    # Add each fold's X_test directly
        y_test_list.extend(y_test_flat)         # Add each fold's y_test directly

    # Return flattened lists (1D arrays) for all splits
    return X_train_list, y_train_list, X_val_list, y_val_list, X_test_list, y_test_list

def run_experiment(config):
    run = None
    if config.use_wandb:
        run = wandb.init(project=config.wandb_project, name=config.wandb_run_name)

    X_train, y_train, X_val, y_val, X_test, y_test = load_data()

    le = LabelEncoder()
    y_train_enc = le.fit_transform(y_train)
    y_val_enc = le.transform(y_val)
    y_test_enc = le.transform(y_test)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    classes = np.unique(y_train_enc)
    class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train_enc)
    class_weight_dict = dict(zip(classes, class_weights))

    model = SGDClassifier(loss=config.loss_function, learning_rate=config.learning_rate, random_state=42, eta0=0.0001)

    results = []
    for i in range(config.num_iterations):
        model.partial_fit(X_train_scaled, y_train_enc, classes=classes)
        train_acc = accuracy_score(y_train_enc, model.predict(X_train_scaled))
        test_acc = accuracy_score(y_test_enc, model.predict(X_test_scaled))

        if i % config.log_interval == 0:
            print(f"Iteration {i+1}: Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}")

        if config.use_wandb:
            wandb.log({"iteration": i + 1, "train_accuracy": train_acc, "test_accuracy": test_acc})

        results.append({"iteration": i + 1, "train_accuracy": train_acc, "test_accuracy": test_acc})

    val_acc = accuracy_score(y_val_enc, model.predict(X_val_scaled))
    print(f"Final Validation Accuracy: {val_acc:.4f}")

    if config.use_wandb:
        wandb.log({"final_val_accuracy": val_acc})
        result_table = wandb.Table(columns=["iteration", "train_accuracy", "test_accuracy"])
        for res in results:
            result_table.add_data(res["iteration"], res["train_accuracy"], res["test_accuracy"])
        wandb.log({"training_results": result_table})
        # run.finish() # already commented out intentionally

    return {
        "model": model,
        "scaler": scaler,
        "label_encoder": le,
        "val_accuracy": val_acc,
        "training_log": results
    }

if __name__ == '__main__':
    parse_args()
    run_experiment(default_config)


Writing train.py


In [2]:
!python train.py --learning_rate=adaptive --loss_function=modified_huber --num_iterations=500 --wandb_project "pistachio-classification" --wandb_run_name "sgd_classifier_run"


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice: 2
[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mvadhri-venkat[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracki

In [3]:
%%writefile sweep.yaml
program: train.py

# Method can be grid, random or bayes
method: random

# Project this sweep is part of
project: pistachio-classification

# Metric to optimize
metric:
  name: final_val_accuracy
  goal: maximize

parameters:
  num_iterations:
    values: [1000, 1500, 2500]

  loss:
    values: ["log_loss", "hinge", "modified_huber"]

  learning_rate:
    values: ["constant", "optimal", "invscaling", "adaptive"]


Writing sweep.yaml


In [4]:
!wandb login
!wandb sweep ./sweep.yaml

[34m[1mwandb[0m: Currently logged in as: [33mvadhri-venkat[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Creating sweep from: ./sweep.yaml
[34m[1mwandb[0m: Creating sweep with ID: [33mlsapjzjg[0m
[34m[1mwandb[0m: View sweep at: [34m[4mhttps://wandb.ai/vadhri-venkat/pistachio-classification/sweeps/lsapjzjg[0m
[34m[1mwandb[0m: Run sweep agent with: [33mwandb agent vadhri-venkat/pistachio-classification/lsapjzjg[0m


In [6]:
!wandb agent vadhri-venkat/pistachio-classification/lsapjzjg --count=50

[34m[1mwandb[0m: Starting wandb agent 🕵️
2025-04-13 16:00:20,765 - wandb.wandb_agent - INFO - Running runs: []
2025-04-13 16:00:21,104 - wandb.wandb_agent - INFO - Agent received command: run
2025-04-13 16:00:21,104 - wandb.wandb_agent - INFO - Agent starting run with config:
	learning_rate: invscaling
	loss: hinge
	num_iterations: 1500
2025-04-13 16:00:21,105 - wandb.wandb_agent - INFO - About to run command: /usr/bin/env python train.py --learning_rate=invscaling --loss=hinge --num_iterations=1500
2025-04-13 16:00:26,117 - wandb.wandb_agent - INFO - Running runs: ['bwkeew6e']
[34m[1mwandb[0m: Currently logged in as: [33mvadhri-venkat[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.19.9
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/wandb/run-20250413_160030-bwkeew6e[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Sync

In [9]:
%%writefile train.py

import argparse
import wandb
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score
from fastai.vision.all import *

from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import PCA

import pandas as pd
import wandb
from pathlib import Path

# Defaults for sweep overrides or direct usage
default_config = SimpleNamespace(
    loss_function="log_loss",
    learning_rate="optimal",
    num_iterations=1000,
    log_interval=100,
    wandb_project="pistachio-classification",
    wandb_run_name="data-partioning-model-eval",
    use_wandb=True
)

def parse_args():
    """Override default arguments"""
    parser = argparse.ArgumentParser(description="SGD classifier training script")
    parser.add_argument("--loss_function", type=str, default=default_config.loss_function)
    parser.add_argument("--learning_rate", type=str, default=default_config.learning_rate)
    parser.add_argument("--num_iterations", type=int, default=default_config.num_iterations)
    parser.add_argument("--log_interval", type=int, default=default_config.log_interval)
    parser.add_argument("--wandb_project", type=str, default=default_config.wandb_project)
    parser.add_argument("--wandb_run_name", type=str, default=default_config.wandb_run_name)
    parser.add_argument("--use_wandb", type=bool, default=default_config.use_wandb)
    args = parser.parse_args()
    vars(default_config).update(vars(args))
    return

def load_data(n_splits=2):
# Check if the directory exists and download the processed dataset if not
    import os
    if not os.path.exists('./artifacts/pistachio-image-classification-data-split:v0/data_split.csv'):
        processed_data_at = wandb.use_artifact('pistachio-image-classification-data-split:latest')
        processed_dataset_dir = Path(processed_data_at.download())

    # Load the CSV with the data
    data_path = './artifacts/pistachio-image-classification-data-split:v0/data_split.csv'
    df = pd.read_csv(data_path)

    # Separate features and target variable
    X = df.drop(['Class', 'Stage'], axis=1)
    y = df['Class']

    # Initialize Stratified K-Fold
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Create lists to store each fold's splits
    X_train_list = []
    y_train_list = []
    X_val_list = []
    y_val_list = []
    X_test_list = []
    y_test_list = []

    # Loop through each fold and store the splits
    for train_idx, val_idx in skf.split(X, y):
        # Get the train and validation splits for this fold
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Convert y_train, y_val to NumPy arrays (flatten them if needed)
        y_train_flat = y_train.to_numpy()  # Ensure it's a 1D array
        y_val_flat = y_val.to_numpy()      # Ensure it's a 1D array

        # Test data remains constant, separate from the training/validation data
        X_test = X[df['Stage'] == 'test']
        y_test = y[df['Stage'] == 'test']
        y_test_flat = y_test.to_numpy()    # Ensure it's a 1D array

        # Append the current fold's splits to the corresponding lists as 1D arrays
        X_train_list.extend(X_train.to_numpy())  # Add each fold's X_train directly
        y_train_list.extend(y_train_flat)       # Add each fold's y_train directly
        X_val_list.extend(X_val.to_numpy())      # Add each fold's X_val directly
        y_val_list.extend(y_val_flat)           # Add each fold's y_val directly
        X_test_list.extend(X_test.to_numpy())    # Add each fold's X_test directly
        y_test_list.extend(y_test_flat)         # Add each fold's y_test directly

    # Return flattened lists (1D arrays) for all splits
    return X_train_list, y_train_list, X_val_list, y_val_list, X_test_list, y_test_list

def run_experiment(config):
    run = None
    if config.use_wandb:
        run = wandb.init(project=config.wandb_project, name=config.wandb_run_name)

    X_train, y_train, X_val, y_val, X_test, y_test = load_data()

    # add pca
    pca = PCA(n_components=16)
    X_train = pca.fit_transform(X_train)
    X_val = pca.transform(X_val)

    le = LabelEncoder()
    y_train_enc = le.fit_transform(y_train)
    y_val_enc = le.transform(y_val)
    y_test_enc = le.transform(y_test)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    classes = np.unique(y_train_enc)
    class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train_enc)
    class_weight_dict = dict(zip(classes, class_weights))

    model = SGDClassifier(loss=config.loss_function, learning_rate=config.learning_rate, random_state=42, eta0=0.0001)

    results = []
    for i in range(config.num_iterations):
        model.partial_fit(X_train_scaled, y_train_enc, classes=classes)
        train_acc = accuracy_score(y_train_enc, model.predict(X_train_scaled))
        test_acc = accuracy_score(y_test_enc, model.predict(X_test_scaled))

        if i % config.log_interval == 0:
            print(f"Iteration {i+1}: Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}")

        if config.use_wandb:
            wandb.log({"iteration": i + 1, "train_accuracy": train_acc, "test_accuracy": test_acc})

        results.append({"iteration": i + 1, "train_accuracy": train_acc, "test_accuracy": test_acc})

    val_acc = accuracy_score(y_val_enc, model.predict(X_val_scaled))
    print(f"Final Validation Accuracy: {val_acc:.4f}")

    if config.use_wandb:
        wandb.log({"final_val_accuracy": val_acc})
        result_table = wandb.Table(columns=["iteration", "train_accuracy", "test_accuracy"])
        for res in results:
            result_table.add_data(res["iteration"], res["train_accuracy"], res["test_accuracy"])
        wandb.log({"training_results": result_table})
        # run.finish() # already commented out intentionally

    return {
        "model": model,
        "scaler": scaler,
        "label_encoder": le,
        "val_accuracy": val_acc,
        "training_log": results
    }

if __name__ == '__main__':
    parse_args()
    run_experiment(default_config)


Overwriting train.py


In [12]:
!python train.py --learning_rate=optimal --loss_function=hinge --num_iterations=1000 --wandb_project "pistachio-classification" --wandb_run_name "sgd_classifier_run"


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mvadhri-venkat[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.19.9
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/wandb/run-20250413_162838-p5vv2y0m[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33msgd_classifier_run[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/vadhri-venkat/pistachio-classification[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/vadhri-venkat/pistachio-classification/runs/p5vv2y0m[0m
Iteration 1: Train Acc: 0.8422, Test Acc: 0.4977
Iteration 101: Train Acc: 0.8650, Test Acc: 0.4977
Iteration 201: Train Acc: 0.8748, Test Acc: 0.5023
Iteration 301: Train Acc: 0.8748, Test Acc: 0.5023
I

In [19]:
%%writefile train_logistic_regression.py
import argparse
import wandb
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from pathlib import Path
import os
from sklearn.decomposition import PCA
from types import SimpleNamespace


# Defaults for sweep overrides or direct usage
default_config = SimpleNamespace(
    loss_function="lbfgs",  # Use a solver appropriate for Logistic Regression
    learning_rate=None,     # Learning rate is not relevant for Logistic Regression
    num_iterations=1000,
    log_interval=100,
    wandb_project="pistachio-classification",
    wandb_run_name="data-partioning-model-eval",
    use_wandb=True
)

def parse_args():
    """Override default arguments"""
    parser = argparse.ArgumentParser(description="Logistic Regression training script")
    parser.add_argument("--loss_function", type=str, default=default_config.loss_function)
    parser.add_argument("--learning_rate", type=str, default=default_config.learning_rate) # Keep for compatibility, but not used
    parser.add_argument("--num_iterations", type=int, default=default_config.num_iterations)
    parser.add_argument("--log_interval", type=int, default=default_config.log_interval)
    parser.add_argument("--wandb_project", type=str, default=default_config.wandb_project)
    parser.add_argument("--wandb_run_name", type=str, default=default_config.wandb_run_name)
    parser.add_argument("--use_wandb", type=bool, default=default_config.use_wandb)
    args = parser.parse_args()
    vars(default_config).update(vars(args))
    return

def load_data(n_splits=2):
# Check if the directory exists and download the processed dataset if not
    import os
    if not os.path.exists('./artifacts/pistachio-image-classification-data-split:v0/data_split.csv'):
        processed_data_at = wandb.use_artifact('pistachio-image-classification-data-split:latest')
        processed_dataset_dir = Path(processed_data_at.download())

    # Load the CSV with the data
    data_path = './artifacts/pistachio-image-classification-data-split:v0/data_split.csv'
    df = pd.read_csv(data_path)

    # Separate features and target variable
    X = df.drop(['Class', 'Stage'], axis=1)
    y = df['Class']

    # Initialize Stratified K-Fold
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Create lists to store each fold's splits
    X_train_list = []
    y_train_list = []
    X_val_list = []
    y_val_list = []
    X_test_list = []
    y_test_list = []

    # Loop through each fold and store the splits
    for train_idx, val_idx in skf.split(X, y):
        # Get the train and validation splits for this fold
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Convert y_train, y_val to NumPy arrays (flatten them if needed)
        y_train_flat = y_train.to_numpy()  # Ensure it's a 1D array
        y_val_flat = y_val.to_numpy()      # Ensure it's a 1D array

        # Test data remains constant, separate from the training/validation data
        X_test = X[df['Stage'] == 'test']
        y_test = y[df['Stage'] == 'test']
        y_test_flat = y_test.to_numpy()    # Ensure it's a 1D array

        # Append the current fold's splits to the corresponding lists as 1D arrays
        X_train_list.extend(X_train.to_numpy())  # Add each fold's X_train directly
        y_train_list.extend(y_train_flat)       # Add each fold's y_train directly
        X_val_list.extend(X_val.to_numpy())      # Add each fold's X_val directly
        y_val_list.extend(y_val_flat)           # Add each fold's y_val directly
        X_test_list.extend(X_test.to_numpy())    # Add each fold's X_test directly
        y_test_list.extend(y_test_flat)         # Add each fold's y_test directly

    # Return flattened lists (1D arrays) for all splits
    return X_train_list, y_train_list, X_val_list, y_val_list, X_test_list, y_test_list

def run_experiment(config):
    run = None
    if config.use_wandb:
        run = wandb.init(project=config.wandb_project, name=config.wandb_run_name)

    X_train, y_train, X_val, y_val, X_test, y_test = load_data()

    # add pca
    pca = PCA(n_components=28)
    X_train = pca.fit_transform(X_train)
    X_val = pca.transform(X_val)
    X_test = pca.transform(X_test) # Apply PCA to X_test


    le = LabelEncoder()
    y_train_enc = le.fit_transform(y_train)
    y_val_enc = le.transform(y_val)
    y_test_enc = le.transform(y_test)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    classes = np.unique(y_train_enc)
    class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train_enc)
    class_weight_dict = dict(zip(classes, class_weights))

    # Use LogisticRegression instead of SGDClassifier
    model = LogisticRegression(solver=config.loss_function, random_state=42, class_weight=class_weight_dict, max_iter=config.num_iterations) # Use the specified solver
    model.fit(X_train_scaled, y_train_enc)

    # ... (rest of the code remains largely the same, no need for partial_fit)
    train_acc = accuracy_score(y_train_enc, model.predict(X_train_scaled))
    test_acc = accuracy_score(y_test_enc, model.predict(X_test_scaled))
    val_acc = accuracy_score(y_val_enc, model.predict(X_val_scaled))


    print(f"Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}, val Acc: {val_acc:.4f}")


    if config.use_wandb:
      wandb.log({"train_accuracy": train_acc, "test_accuracy": test_acc, "final_val_accuracy": val_acc})

# ... (rest of the code, including if __name__ == '__main__' remains the same)

if __name__ == '__main__':
    parse_args()
    run_experiment(default_config)


Overwriting train_logistic_regression.py


In [23]:
!python train_logistic_regression.py --learning_rate=optimal --loss_function=lbfgs --num_iterations=1000 --wandb_project "pistachio-classification" --wandb_run_name "logistic_regression_run"


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mvadhri-venkat[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.19.9
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/wandb/run-20250413_164738-tyy42da1[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mlogistic_regression_run[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/vadhri-venkat/pistachio-classification[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/vadhri-venkat/pistachio-classification/runs/tyy42da1[0m
Train Acc: 0.8776, Test Acc: 0.8837, val Acc: 0.8776
[1;34mwandb[0m: 
[1;34mwandb[0m: 🚀 View run [33mlogistic_regression_run[0m at: [34mhttps://wandb.ai/vadhri-venkat/pistachio-classification

In [30]:
# prompt: Change the classifier to randomforest in the code above; give me full code
%%writefile train_random_forest.py

import argparse
import wandb
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier # Changed classifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from pathlib import Path
import os
from sklearn.decomposition import PCA
from types import SimpleNamespace

# Defaults for sweep overrides or direct usage
default_config = SimpleNamespace(
    n_estimators=100, # Add hyperparameter for RandomForest
    max_depth=None, # Add hyperparameter for RandomForest
    loss_function="log_loss", # Placeholder, not used for RandomForest
    learning_rate="optimal", # Placeholder, not used for RandomForest
    num_iterations=1000, # Placeholder, not used for RandomForest
    log_interval=100,
    wandb_project="pistachio-classification",
    wandb_run_name="data-partioning-model-eval",
    use_wandb=True
)

def parse_args():
    """Override default arguments"""
    parser = argparse.ArgumentParser(description="RandomForest classifier training script")
    parser.add_argument("--n_estimators", type=int, default=default_config.n_estimators)
    parser.add_argument("--max_depth", type=int, default=default_config.max_depth)
    parser.add_argument("--loss_function", type=str, default=default_config.loss_function)
    parser.add_argument("--learning_rate", type=str, default=default_config.learning_rate)
    parser.add_argument("--num_iterations", type=int, default=default_config.num_iterations)
    parser.add_argument("--log_interval", type=int, default=default_config.log_interval)
    parser.add_argument("--wandb_project", type=str, default=default_config.wandb_project)
    parser.add_argument("--wandb_run_name", type=str, default=default_config.wandb_run_name)
    parser.add_argument("--use_wandb", type=bool, default=default_config.use_wandb)
    args = parser.parse_args()
    vars(default_config).update(vars(args))
    return

def load_data(n_splits=2):
# Check if the directory exists and download the processed dataset if not
    import os
    if not os.path.exists('./artifacts/pistachio-image-classification-data-split:v0/data_split.csv'):
        processed_data_at = wandb.use_artifact('pistachio-image-classification-data-split:latest')
        processed_dataset_dir = Path(processed_data_at.download())

    # Load the CSV with the data
    data_path = './artifacts/pistachio-image-classification-data-split:v0/data_split.csv'
    df = pd.read_csv(data_path)

    # Separate features and target variable
    X = df.drop(['Class', 'Stage'], axis=1)
    y = df['Class']

    # Initialize Stratified K-Fold
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Create lists to store each fold's splits
    X_train_list = []
    y_train_list = []
    X_val_list = []
    y_val_list = []
    X_test_list = []
    y_test_list = []

    # Loop through each fold and store the splits
    for train_idx, val_idx in skf.split(X, y):
        # Get the train and validation splits for this fold
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Convert y_train, y_val to NumPy arrays (flatten them if needed)
        y_train_flat = y_train.to_numpy()  # Ensure it's a 1D array
        y_val_flat = y_val.to_numpy()      # Ensure it's a 1D array

        # Test data remains constant, separate from the training/validation data
        X_test = X[df['Stage'] == 'test']
        y_test = y[df['Stage'] == 'test']
        y_test_flat = y_test.to_numpy()    # Ensure it's a 1D array

        # Append the current fold's splits to the corresponding lists as 1D arrays
        X_train_list.extend(X_train.to_numpy())  # Add each fold's X_train directly
        y_train_list.extend(y_train_flat)       # Add each fold's y_train directly
        X_val_list.extend(X_val.to_numpy())      # Add each fold's X_val directly
        y_val_list.extend(y_val_flat)           # Add each fold's y_val directly
        X_test_list.extend(X_test.to_numpy())    # Add each fold's X_test directly
        y_test_list.extend(y_test_flat)         # Add each fold's y_test directly

    # Return flattened lists (1D arrays) for all splits
    return X_train_list, y_train_list, X_val_list, y_val_list, X_test_list, y_test_list

def run_experiment(config):
    run = None
    if config.use_wandb:
        run = wandb.init(project=config.wandb_project, name=config.wandb_run_name, config=config)

    X_train, y_train, X_val, y_val, X_test, y_test = load_data()

    # add pca
    pca = PCA(n_components=16)
    X_train = pca.fit_transform(X_train)
    X_val = pca.transform(X_val)
    X_test = pca.transform(X_test) # Apply PCA to X_test

    le = LabelEncoder()
    y_train_enc = le.fit_transform(y_train)
    y_val_enc = le.transform(y_val)
    y_test_enc = le.transform(y_test)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    classes = np.unique(y_train_enc)
    class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train_enc)
    class_weight_dict = dict(zip(classes, class_weights))

    # Initialize RandomForestClassifier with hyperparameters
    model = RandomForestClassifier(n_estimators=config.n_estimators, max_depth=config.max_depth, random_state=42, class_weight=class_weight_dict)
    model.fit(X_train_scaled, y_train_enc)

    train_acc = accuracy_score(y_train_enc, model.predict(X_train_scaled))
    test_acc = accuracy_score(y_test_enc, model.predict(X_test_scaled))
    val_acc = accuracy_score(y_val_enc, model.predict(X_val_scaled))

    print(f"Train Acc: {train_acc:.4f}, Test Acc: {test_acc:.4f}, val Acc: {val_acc:.4f}")

    if config.use_wandb:
        wandb.log({"train_accuracy": train_acc, "test_accuracy": test_acc, "final_val_accuracy": val_acc})

    return {
        "model": model,
        "scaler": scaler,
        "label_encoder": le,
        "val_accuracy": val_acc
    }

if __name__ == '__main__':
    parse_args()
    run_experiment(default_config)


Overwriting train_random_forest.py


In [31]:
!python train_random_forest.py --learning_rate=optimal --loss_function=lbfgs --num_iterations=1000 --wandb_project "pistachio-classification" --wandb_run_name "logistic_regression_run"


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mvadhri-venkat[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.19.9
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/wandb/run-20250413_165021-e6r4z5fv[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mlogistic_regression_run[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/vadhri-venkat/pistachio-classification[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/vadhri-venkat/pistachio-classification/runs/e6r4z5fv[0m
Train Acc: 1.0000, Test Acc: 1.0000, val Acc: 1.0000
[1;34mwandb[0m: 
[1;34mwandb[0m: 🚀 View run [33mlogistic_regression_run[0m at: [34mhttps://wandb.ai/vadhri-venkat/pistachio-classification

In [29]:
%%writefile sweep_random_forest.yaml

program: train_random_forest.py
method: random
metric:
  name: final_val_accuracy
  goal: maximize
parameters:
  n_estimators:
    values: [50, 100, 200]
  max_depth:
    values: [10, 20, None]
  pca_components:
    values: [16, 28, 64]


Writing sweep_random_forest.yaml


In [39]:
!wandb sweep ./sweep_random_forest.yaml

[34m[1mwandb[0m: Creating sweep from: ./sweep_random_forest.yaml
[34m[1mwandb[0m: Creating sweep with ID: [33moc1a3evy[0m
[34m[1mwandb[0m: View sweep at: [34m[4mhttps://wandb.ai/vadhri-venkat/pistachio-classification/sweeps/oc1a3evy[0m
[34m[1mwandb[0m: Run sweep agent with: [33mwandb agent vadhri-venkat/pistachio-classification/oc1a3evy[0m


In [40]:
!wandb agent vadhri-venkat/pistachio-classification/oc1a3evy --count=50

[34m[1mwandb[0m: Starting wandb agent 🕵️
2025-04-13 16:57:47,890 - wandb.wandb_agent - INFO - Running runs: []
2025-04-13 16:57:48,068 - wandb.wandb_agent - INFO - Agent received command: run
2025-04-13 16:57:48,068 - wandb.wandb_agent - INFO - Agent starting run with config:
	max_depth: 20
	n_estimators: 50
2025-04-13 16:57:48,069 - wandb.wandb_agent - INFO - About to run command: /usr/bin/env python train_random_forest.py --max_depth=20 --n_estimators=50
[34m[1mwandb[0m: Currently logged in as: [33mvadhri-venkat[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.19.9
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/wandb/run-20250413_165752-esfmqr1j[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mdata-partioning-model-eval[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/vadhri-venkat/p