# Training model and deploy it into WANDB

In [1]:
import os
from pathlib import Path

p = Path.cwd()
os.chdir(p.parent)
os.getcwd()

'/root/repos/lightning/HousePricing'

In [2]:
from dotenv import dotenv_values
import os

envs = ["secret.env", "fit.env"]

for fenv in envs:
    file = os.path.join("env", fenv)
    config = dotenv_values(file)  # load sensitive variables
    print(config.keys())
    for c, v in config.items():
        os.environ[c] = v

odict_keys(['WANDB_API_KEY', 'WANDB_NAME'])
odict_keys(['WANDB_NOTEBOOK_NAME', 'ITERATIONS', 'MAX_EPOCHS', 'PATIENCE', 'BATCH_SIZE', 'LEARNING_RATE', 'VALIDATION_SIZE'])


In [3]:
import wandb
import os

wandb_key = os.environ["WANDB_API_KEY"]
wandb.login(key=wandb_key)

[34m[1mwandb[0m: Currently logged in as: [33mwilber-quito[0m ([33mdeepsat[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
import torch

torch.cuda.is_available()

True

In [5]:
import torch

torch.cuda.device_count()

1

In [6]:
import torch

torch.cuda.get_device_name()

'NVIDIA GeForce GTX 1650'

In [7]:
import os

max(1, os.cpu_count() - 1)

11

In [8]:
import os
import wandb
import torch
from lightning import Trainer
from pytorch_lightning.loggers import WandbLogger
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks.early_stopping import EarlyStopping
from src.model import NeuralNetwork
from src.data import HousePricingDataModule
from src.utils.utility import fake_name

project_name = os.environ["WANDB_NAME"]
iters = int(os.environ["ITERATIONS"])

patience = int(os.environ["PATIENCE"])
max_epochs = int(os.environ["MAX_EPOCHS"])
batch_size = int(os.environ["BATCH_SIZE"])
learning_rate = float(os.environ["LEARNING_RATE"])
validation_size = float(os.environ["VALIDATION_SIZE"])
accelerator = "gpu" if torch.cuda.is_available() else "cpu"

# Preparing data to be used
data_module = HousePricingDataModule(
    batch_size=batch_size,
    validation_size=validation_size,
)
data_module.prepare_data()

in_features = data_module.data_features()

# Setting up the training configuration
config = {
    "accelerator": accelerator,
    "max_epochs": max_epochs,
    "patience": patience,
    "lr": learning_rate,
    "batch_size": batch_size,
    "in_features": in_features,
    "validation_size": validation_size,
}

artifacts = list()

for i in range(iters):

    run_name = fake_name()

    wandb.init(
        job_type="train",
        name=run_name,
        project=project_name,
        config=config,
    )

    print(f"[INFO]: Fit config: {config}")

    # Defining the model to be training
    model = NeuralNetwork(input_size=wandb.config["in_features"], lr=wandb.config["lr"])

    # Defining the logger instance the lighning will use as default logging
    logger = WandbLogger()

    # Define how the model registry work
    checkpoint_callback = ModelCheckpoint(
        every_n_epochs=1,
        monitor="val_loss",
        mode="min",
        save_top_k=2,
        filename="house_pricing-{epoch:02d}-{val_loss:.2f}",
    )

    # Defining early stop configuration
    early_stopping_callback = EarlyStopping(
        monitor="val_loss",
        mode="min",
        verbose=False,
        patience=wandb.config["patience"],
    )

    # Defines the training instance
    trainer = Trainer(
        accelerator=wandb.config["accelerator"],
        max_epochs=wandb.config["max_epochs"],
        logger=logger,
        callbacks=[checkpoint_callback, early_stopping_callback],
    )

    trainer.fit(model, datamodule=data_module)

    artifacts_item = {
        "run_name": run_name,
        "best_model": checkpoint_callback.best_model_path,
        "fit_config": config,
    }
    artifacts.append(artifacts_item)

    wandb.finish()

[INFO]: Skipping downloading data. Data is already downloaded
[INFO]: Reloading set up data...


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


[INFO]: Fit config: {'accelerator': 'gpu', 'max_epochs': 1000, 'patience': 100, 'lr': 0.001, 'batch_size': 256, 'in_features': 244, 'validation_size': 0.1}
[INFO]: Input size: 244


/root/miniconda3/envs/pricehousing/lib/python3.9/site-packages/pytorch_lightning/loggers/wandb.py:391: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type       | Params
------------------------------------
0 | net  | Sequential | 215 K 
------------------------------------
215 K     Trainable params
0         Non-trainable params
215 K     Total params
0.864     Total estimated model params size (MB)


[INFO]: Skipping downloading data. Data is already downloaded
[INFO]: Reloading set up data...
[INFO]: Setting up fit dataset/s
[INFO]: Train dataset size: 1166
[INFO]: Validation dataset size: 129
Sanity Checking: |          | 0/? [00:00<?, ?it/s][INFO]: Validation dataloader size: 2
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

/root/miniconda3/envs/pricehousing/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:492: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
/root/miniconda3/envs/pricehousing/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


                                                                           

/root/miniconda3/envs/pricehousing/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/root/miniconda3/envs/pricehousing/lib/python3.9/site-packages/lightning/pytorch/loops/fit_loop.py:298: The number of training batches (5) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


[INFO]: Train dataloader size: 5
Training: |          | 0/? [00:00<?, ?it/s][INFO]: Logger save model dir: ./lightning_logs/mrsat7bm
Epoch 999: 100%|██████████| 5/5 [00:00<00:00, 37.91it/s, v_num=t7bm]

`Trainer.fit` stopped: `max_epochs=1000` reached.


Epoch 999: 100%|██████████| 5/5 [00:00<00:00, 36.89it/s, v_num=t7bm]


0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,999.0
train_loss,177689472.0
trainer/global_step,4999.0
val_loss,387328832.0


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/root/miniconda3/envs/pricehousing/lib/python3.9/site-packages/pytorch_lightning/loggers/wandb.py:391: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type       | Params
------------------------------------
0 | net  | Sequential | 215 K 
------------------------------------
215 K     Trainable params
0         Non-trainable params
215 K     Total params
0.864     Total estimated model params size (MB)


[INFO]: Fit config: {'accelerator': 'gpu', 'max_epochs': 1000, 'patience': 100, 'lr': 0.001, 'batch_size': 256, 'in_features': 244, 'validation_size': 0.1}
[INFO]: Input size: 244
[INFO]: Skipping downloading data. Data is already downloaded
[INFO]: Reloading set up data...
[INFO]: Setting up fit dataset/s
[INFO]: Train dataset size: 1166
[INFO]: Validation dataset size: 129
Sanity Checking: |          | 0/? [00:00<?, ?it/s][INFO]: Validation dataloader size: 2
                                                                            

/root/miniconda3/envs/pricehousing/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:492: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
/root/miniconda3/envs/pricehousing/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/root/miniconda3/envs/pricehousing/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/root/miniconda3/envs/pricehousing/lib/python3.9/site-packages/lightning/pytorch/loops/fit_lo

[INFO]: Train dataloader size: 5
Training: |          | 0/? [00:00<?, ?it/s][INFO]: Logger save model dir: ./lightning_logs/3478itnz
Epoch 931: 100%|██████████| 5/5 [00:00<00:00, 33.51it/s, v_num=itnz]


0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,931.0
train_loss,105884808.0
trainer/global_step,4659.0
val_loss,393353344.0


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/root/miniconda3/envs/pricehousing/lib/python3.9/site-packages/pytorch_lightning/loggers/wandb.py:391: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type       | Params
------------------------------------
0 | net  | Sequential | 215 K 
------------------------------------
215 K     Trainable params
0         Non-trainable params
215 K     Total params
0.864     Total estimated model params size (MB)


[INFO]: Fit config: {'accelerator': 'gpu', 'max_epochs': 1000, 'patience': 100, 'lr': 0.001, 'batch_size': 256, 'in_features': 244, 'validation_size': 0.1}
[INFO]: Input size: 244
[INFO]: Skipping downloading data. Data is already downloaded
[INFO]: Reloading set up data...
[INFO]: Setting up fit dataset/s
[INFO]: Train dataset size: 1166
[INFO]: Validation dataset size: 129
Sanity Checking: |          | 0/? [00:00<?, ?it/s][INFO]: Validation dataloader size: 2
                                                                            

/root/miniconda3/envs/pricehousing/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:492: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
/root/miniconda3/envs/pricehousing/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/root/miniconda3/envs/pricehousing/lib/python3.9/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/root/miniconda3/envs/pricehousing/lib/python3.9/site-packages/lightning/pytorch/loops/fit_lo

[INFO]: Train dataloader size: 5
Training: |          | 0/? [00:00<?, ?it/s][INFO]: Logger save model dir: ./lightning_logs/27yrkxwa
Epoch 918: 100%|██████████| 5/5 [00:00<00:00, 34.52it/s, v_num=kxwa]


0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train_loss,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,918.0
train_loss,109455904.0
trainer/global_step,4594.0
val_loss,409431008.0


In [9]:
tmp = list()
for data in artifacts:
    run_name = data["run_name"]
    best_model = data["best_model"]
    fit_config = data["fit_config"]
    fit_config = {f"fit_{key}": value for key, value in fit_config.items()}
    tmp.append({"run_name": run_name, "best_model": best_model, **fit_config})

In [10]:
import pandas as pd

pd.DataFrame(tmp)

Unnamed: 0,run_name,best_model,fit_accelerator,fit_max_epochs,fit_patience,fit_lr,fit_batch_size,fit_in_features,fit_validation_size
0,Mr._ZI1EUZUC,./lightning_logs/mrsat7bm/checkpoints/house_pr...,gpu,1000,100,0.001,256,244,0.1
1,Gerald_4LIP1BLK,./lightning_logs/3478itnz/checkpoints/house_pr...,gpu,1000,100,0.001,256,244,0.1
2,Stacey_Y41HGQZW,./lightning_logs/27yrkxwa/checkpoints/house_pr...,gpu,1000,100,0.001,256,244,0.1


In [11]:
import pandas as pd

pd.DataFrame(tmp).to_csv("artifacts.csv", index=None)