In [1]:
# Install the W&B client into the kernel's own environment (%pip, not !pip),
# pinned to the version this notebook was executed with.
%pip install -q wandb==0.15.3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.15.3-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.31-py3-none-any.whl (184 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.3/184.3 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.25.0-py2.py3-none-any.whl (206 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m206.5/206.5 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting pathtools (from wandb)
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting

In [4]:
# Print the installed wandb CLI version as a sanity check on the install above.
!wandb --version

wandb, version 0.15.3


In [5]:
%%capture
# Download the homework preprocessing script into scripts_wb/ (-P sets the
# target directory; -nc skips the download when the file already exists).
# %%capture must stay on the first line of the cell; it hides wget's output.
!wget -nc https://github.com/DataTalksClub/mlops-zoomcamp/raw/main/cohorts/2023/02-experiment-tracking/homework-wandb/preprocess_data.py -P scripts_wb

In [7]:
# Run the downloaded preprocessing script, pointing it at ./data for input and
# ./output for results, under the 'mlops-zoomcamp' W&B project.
# NOTE(review): no visible cell populates ./data — presumably the raw files were
# fetched in a cell that has since been removed; confirm before Restart & Run All.
# NOTE(review): the captured output shows an interactive W&B login prompt, so this
# cell blocks on input when no API key is configured.
!python scripts_wb/preprocess_data.py \
  --wandb_project 'mlops-zoomcamp' \
  --raw_data_path ./data \
  --dest_path ./output

[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice: 1
[34m[1mwandb[0m: You chose 'Create a W&B account'
[34m[1mwandb[0m: Create an account here: https://wandb.ai/authorize?signup=true
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Tracking run with wandb version 0.15.3
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/content/wandb/run-20230606_173857-e6akbceo[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mquiet-waterfall-1[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/vidya-ratan96/mlops-zoomcamp[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/vidya-ratan96/mlops-zoomcamp/runs/e6akbceo[

In [8]:
from types import SimpleNamespace

# Notebook-wide settings, read by the training and sweep cells further down.
config = SimpleNamespace()
# W&B project that every run and sweep is logged to.
config.WANDB_PROJECT = "mlops-zoomcamp"
# None is passed through to W&B as the entity.
config.WANDB_ENTITY = None
# Fully qualified name (entity/project/artifact:version) of the dataset artifact.
config.ARTIFACT_NAME = "vidya-ratan96/mlops-zoomcamp/NYC-Taxi:v0"

In [9]:
import os
import pickle

import wandb

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


def load_pickle(filename: str):
    """Return the object stored in the pickle file at `filename`.

    The file is opened in binary mode and closed again before returning.
    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(filename, mode="rb") as source:
        payload = pickle.load(source)
    return payload


def run_train(
    wandb_project: str,
    wandb_entity: str,
    data_artifact: str,
    max_depth: int,
    random_state: int,
):
    """Train a random forest on the preprocessed dataset and track it in W&B.

    Starts a W&B run, downloads `data_artifact`, fits a
    `RandomForestRegressor`, logs the validation error and uploads the pickled
    model as an artifact. The run is left open; the caller closes it with
    `wandb.finish()`.

    Args:
        wandb_project: W&B project to log the run to.
        wandb_entity: W&B entity; this notebook passes None.
        data_artifact: Name of the `preprocessed_dataset` artifact that holds
            `train.pkl` and `val.pkl`.
        max_depth: `max_depth` for the random forest.
        random_state: Seed for the random forest.
    """
    # Initialize a Weights & Biases run
    wandb.init(
        project=wandb_project,
        entity=wandb_entity,
        job_type="train",
        config={"max_depth": max_depth, "random_state": random_state},
    )

    # Fetch the preprocessed dataset from artifacts
    artifact = wandb.use_artifact(data_artifact, type="preprocessed_dataset")
    data_path = artifact.download()

    # Each pickle unpacks into a (features, target) pair.
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

    # Fit the random forest and predict on the validation split
    rf = RandomForestRegressor(max_depth=max_depth, random_state=random_state)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)

    # Root mean squared error. Taking the square root here, rather than passing
    # `squared=False`, keeps this working on scikit-learn releases where that
    # argument has been removed. Equivalent for a single-output target
    # (assumed here — TODO confirm y is 1-D).
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    # The key stays "MSE" (although the value is an RMSE) because the sweep
    # metric in this notebook uses that name.
    wandb.log({"MSE": rmse})

    with open("regressor.pkl", "wb") as f:
        pickle.dump(rf, f)

    # Upload the pickled model as an artifact of type `model`
    regressor_model = wandb.Artifact("regressor", type="model")
    regressor_model.add_file("regressor.pkl")
    wandb.log_artifact(regressor_model)

In [10]:
# Baseline experiment: one random forest with fixed hyperparameters.
run_train(
    wandb_project=config.WANDB_PROJECT,
    wandb_entity=config.WANDB_ENTITY,
    data_artifact=config.ARTIFACT_NAME,
    max_depth=10,
    random_state=0,
)

[34m[1mwandb[0m: Currently logged in as: [33mvidya-ratan96[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m:   4 of 4 files downloaded.  


In [11]:
# Close the run that the baseline run_train() call started and left open.
wandb.finish()

VBox(children=(Label(value='1.429 MB of 1.437 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.994475…

0,1
MSE,▁

0,1
MSE,2.45398


In [12]:
import os
import pickle
from functools import partial

import wandb

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


def load_pickle(filename: str):
    """Read `filename` in binary mode and return the unpickled object.

    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(filename, mode="rb") as source:
        payload = pickle.load(source)
    return payload


def run_train(data_artifact: str):
    """Run one sweep trial: train a random forest and report its error to W&B.

    Intended to be called by `wandb.agent` (see `run_sweep`). The
    hyperparameters `max_depth`, `n_estimators`, `min_samples_split` and
    `min_samples_leaf` are read from `wandb.config`. The run is finished before
    returning.

    Args:
        data_artifact: Name of the `preprocessed_dataset` artifact that holds
            `train.pkl` and `val.pkl`.
    """
    wandb.init()
    # Named `params` so it does not shadow the notebook-level `config` namespace.
    params = wandb.config

    # Fetch the preprocessed dataset from artifacts
    dataset = wandb.use_artifact(data_artifact, type="preprocessed_dataset")
    data_path = dataset.download()

    # Each pickle unpacks into a (features, target) pair.
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

    # Fit the random forest with this trial's hyperparameters and predict on
    # the validation split. The seed is fixed so trials differ only by them.
    rf = RandomForestRegressor(
        max_depth=params.max_depth,
        n_estimators=params.n_estimators,
        min_samples_split=params.min_samples_split,
        min_samples_leaf=params.min_samples_leaf,
        random_state=0,
    )
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)

    # Root mean squared error. Taking the square root here, rather than passing
    # `squared=False`, keeps this working on scikit-learn releases where that
    # argument has been removed. Equivalent for a single-output target
    # (assumed here — TODO confirm y is 1-D).
    rmse = mean_squared_error(y_val, y_pred) ** 0.5
    # The key must stay "MSE": SWEEP_CONFIG optimizes the metric by that name.
    wandb.log({"MSE": rmse})

    with open("regressor.pkl", "wb") as f:
        pickle.dump(rf, f)

    # The run id in the name gives each trial its own model artifact.
    model_artifact = wandb.Artifact(f"{wandb.run.id}-model", type="model")
    model_artifact.add_file("regressor.pkl")
    wandb.log_artifact(model_artifact)
    wandb.finish()


# Sweep definition: Bayesian search that minimizes the logged "MSE" metric.
# Each hyperparameter is an integer drawn from "int_uniform" between its
# (min, max) bounds; the bounds are listed once and expanded below.
SWEEP_CONFIG = {
    "method": "bayes",
    "metric": {"name": "MSE", "goal": "minimize"},
    "parameters": {
        name: {"distribution": "int_uniform", "min": low, "max": high}
        for name, (low, high) in (
            ("max_depth", (1, 20)),
            ("n_estimators", (10, 50)),
            ("min_samples_split", (2, 10)),
            ("min_samples_leaf", (1, 4)),
        )
    },
}


def run_sweep(wandb_project: str, wandb_entity: str, data_artifact: str, count: int):
    """Create a sweep from `SWEEP_CONFIG` and run its trials with `wandb.agent`.

    Args:
        wandb_project: W&B project the sweep is created in.
        wandb_entity: W&B entity; this notebook passes None.
        data_artifact: Dataset artifact name forwarded to every `run_train` call.
        count: Passed to `wandb.agent` as the number of runs to execute.
    """
    # The agent calls the trial function with no arguments, so bind the
    # dataset name up front.
    trial = partial(run_train, data_artifact=data_artifact)
    sweep_id = wandb.sweep(
        SWEEP_CONFIG,
        project=wandb_project,
        entity=wandb_entity,
    )
    wandb.agent(sweep_id, function=trial, count=count)

In [13]:
# Hyperparameter search: five sweep trials against the same dataset artifact.
run_sweep(
    wandb_project=config.WANDB_PROJECT,
    wandb_entity=config.WANDB_ENTITY,
    data_artifact=config.ARTIFACT_NAME,
    count=5,
)

Create sweep with ID: jklyf9zq
Sweep URL: https://wandb.ai/vidya-ratan96/mlops-zoomcamp/sweeps/jklyf9zq


[34m[1mwandb[0m: Agent Starting Run: bczr8z7i with config:
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	min_samples_leaf: 1
[34m[1mwandb[0m: 	min_samples_split: 2
[34m[1mwandb[0m: 	n_estimators: 10


[34m[1mwandb[0m:   4 of 4 files downloaded.  


VBox(children=(Label(value='0.159 MB of 0.159 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
MSE,▁

0,1
MSE,2.45912


[34m[1mwandb[0m: Agent Starting Run: 9f8uun9h with config:
[34m[1mwandb[0m: 	max_depth: 16
[34m[1mwandb[0m: 	min_samples_leaf: 3
[34m[1mwandb[0m: 	min_samples_split: 3
[34m[1mwandb[0m: 	n_estimators: 24


[34m[1mwandb[0m:   4 of 4 files downloaded.  


VBox(children=(Label(value='2.221 MB of 2.221 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
MSE,▁

0,1
MSE,2.44731


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: nbtucaeo with config:
[34m[1mwandb[0m: 	max_depth: 4
[34m[1mwandb[0m: 	min_samples_leaf: 1
[34m[1mwandb[0m: 	min_samples_split: 7
[34m[1mwandb[0m: 	n_estimators: 33


[34m[1mwandb[0m:   4 of 4 files downloaded.  


VBox(children=(Label(value='0.057 MB of 0.057 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
MSE,▁

0,1
MSE,2.47379


[34m[1mwandb[0m: Agent Starting Run: p5qb3zx7 with config:
[34m[1mwandb[0m: 	max_depth: 17
[34m[1mwandb[0m: 	min_samples_leaf: 3
[34m[1mwandb[0m: 	min_samples_split: 4
[34m[1mwandb[0m: 	n_estimators: 26


[34m[1mwandb[0m:   4 of 4 files downloaded.  


VBox(children=(Label(value='2.692 MB of 2.692 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
MSE,▁

0,1
MSE,2.44794


[34m[1mwandb[0m: Agent Starting Run: 51xabrgt with config:
[34m[1mwandb[0m: 	max_depth: 20
[34m[1mwandb[0m: 	min_samples_leaf: 4
[34m[1mwandb[0m: 	min_samples_split: 2
[34m[1mwandb[0m: 	n_estimators: 29


[34m[1mwandb[0m:   4 of 4 files downloaded.  


VBox(children=(Label(value='3.602 MB of 3.602 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
MSE,▁

0,1
MSE,2.45172
