## **Training Pipeline**
This notebook trains the diffusion model using the training routine defined in the cells below.

In [1]:
# Snapshot of GPU state (utilisation and memory) before training starts.
!nvidia-smi


Sun Dec 10 18:32:15 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.43.04    Driver Version: 515.43.04    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  Off  | 00000000:00:06.0 Off |                    0 |
| N/A   36C    P0    26W / 250W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  Tesla V100-PCIE...  Off  | 00000000:00:07.0 Off |                    0 |
| N/A   31C    P0    24W / 250W |      0MiB / 16384MiB |      0%      Default |
|       

In [2]:
# NOTE(review): hard-coded PID from an earlier session. In the captured run
# the process no longer existed and the command reported "No such process".
!kill 22119

/bin/bash: line 0: kill: (22119) - No such process


In [3]:
import sys

# Make the project checkout importable from this notebook.
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path a second time.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
REPO_ROOT = "/home/jupyter-group3/reconstruction/reconstruction-deep-network"
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)

In [4]:
import numpy as np
import os
import torch
from torch.utils.data import Subset
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from argparse import ArgumentParser
import yaml

import reconstruction_deep_network
from reconstruction_deep_network.data_loader.custom_loader import CustomDataLoader
from reconstruction_deep_network.trainer.trainer import ModelTrainer

# Configure PyTorch's CUDA caching allocator through its environment variable
# (the OOM message further down refers to this same max_split_size_mb knob).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
# Select the 'medium' float32 matmul precision mode.
torch.set_float32_matmul_precision('medium')

In [5]:
# Turn on the cuDNN backend and its benchmark mode for this session.
torch.backends.cudnn.benchmark =  True
torch.backends.cudnn.enabled =  True

In [6]:
# Resolve project paths relative to the installed/importable package so the
# notebook does not depend on the current working directory.
module_path = reconstruction_deep_network.__path__[0]
root_dir = os.path.dirname(module_path)

# Data layout: <root>/data/v1/text_embeddings/null
data_path = os.path.join(root_dir, "data", "v1")
text_embeddings = os.path.join(data_path, "text_embeddings")
null_embeddings = os.path.join(text_embeddings, "null")

# exist_ok=True creates the directory (and any missing parents) only when
# needed, without the check-then-create race of a separate isdir() test.
os.makedirs(null_embeddings, exist_ok=True)

# YAML file later passed to the training routine as --main_config_path.
trainer_config_path = os.path.join(module_path, "trainer", "trainer_config.yaml")

In [7]:
def parse_args(args=None):
    """Build the training argument parser and parse ``args``.

    The parser contains every ``pl.Trainer`` argument plus the
    project-specific options declared below.

    Args:
        args: Optional list of command-line style tokens, e.g.
            ``["--batch_size", "1"]``. When ``None`` (the default) argparse
            falls back to ``sys.argv[1:]``, which is the previous behaviour.
            Pass an explicit list when calling from a notebook, where
            ``sys.argv`` holds the kernel's own arguments.

    Returns:
        The parsed namespace, post-processed by
        ``pl.Trainer.parse_argparser``.
    """
    parser = ArgumentParser()
    parser = pl.Trainer.add_argparse_args(parser)
    parser.add_argument("--main_config_path", type = str, dest = "main_config_path")
    parser.add_argument("--train_metadata", type = str, dest = "train_metadata")
    parser.add_argument("--val_metadata", type = str, dest = "val_metadata")
    # `main` reads args.num_views, so the option has to exist on this parser.
    parser.add_argument("--num_views", type = int, dest = "num_views")
    parser.add_argument("--num_workers", type = int, dest = "num_workers")
    parser.add_argument("--exp_name", dest = "exp_name", type = str)
    parser.add_argument("--batch_size", dest = "batch_size", type = int)
    parser.add_argument("--n_epochs", dest = "n_epochs", type = int)
    parser.add_argument("--learning_rate", dest = "learning_rate", type = float)
    parser.add_argument("--ckpt_path", dest = "ckpt_path", type = str)

    # Forward the caller-supplied token list instead of silently ignoring it.
    return pl.Trainer.parse_argparser(parser.parse_args(args))

In [8]:
def main(args, max_train_samples=100):
    """Run diffusion-model training with a PyTorch Lightning trainer.

    Args:
        args: Parsed argument namespace. This function reads
            ``main_config_path``, ``learning_rate``, ``n_epochs``,
            ``batch_size``, ``train_metadata``, ``num_views``,
            ``num_workers`` and ``ckpt_path``; the whole namespace is also
            forwarded to ``pl.Trainer.from_argparse_args``.
        max_train_samples: Number of leading dataset items to train on.
            Defaults to 100 (the previous hard-coded subset). Pass ``None``
            to train on the full dataset.

    Returns:
        None. Training is run for its side effects (checkpoints, logs).
    """
    config_file_path = args.main_config_path
    with open(config_file_path, 'r') as f:
        # NOTE(review): yaml.load with FullLoader accepts more than plain
        # data; if the config only holds scalars/lists/dicts, prefer
        # yaml.safe_load.
        config = yaml.load(f, Loader = yaml.FullLoader)

    # NOTE(review): `config` is not passed to ModelTrainer() below, so within
    # this function only the batch_size override (read when building the
    # DataLoader) has a visible effect — confirm how ModelTrainer obtains its
    # learning rate / epoch settings.
    config["train"]["learning_rate"] = args.learning_rate
    config["train"]["max_epochs"] = args.n_epochs
    config["train"]["batch_size"] = args.batch_size

    train_dataset = CustomDataLoader(mode = "train", debug = False, metadata_filename = args.train_metadata, num_views = args.num_views)
    if max_train_samples is not None:
        # Restrict training to the first `max_train_samples` items.
        train_indices = list(range(0, max_train_samples))
        train_dataset = Subset(train_dataset, train_indices)
    print(f"Size of train dataset: {len(train_dataset)}")
#     val_dataset = CustomDataLoader(mode = "val", debug = False, metadata_filename = args.val_metadata, num_views = args.num_views)    

    train_loader = torch.utils.data.DataLoader(
                    train_dataset,
                    batch_size = config["train"]["batch_size"],
                    shuffle = True,
                    num_workers = args.num_workers,
                    drop_last = True)

#     val_loader = torch.utils.data.DataLoader(
#                     val_dataset,
#                     batch_size = 1,
#                     shuffle = False,
#                     num_workers = args.num_workers,
#                     drop_last = False)

    model_trainer = ModelTrainer()

    print(f"Training for {model_trainer.max_epochs} epochs...")
    print(f"Diffusion Training timesteps: {model_trainer.scheduler.num_train_timesteps}")

    # The literal string "None" is accepted as a sentinel for "no checkpoint"
    # because command-line values arrive as strings.
    ckpt_path = None if args.ckpt_path == "None" else args.ckpt_path
    if ckpt_path is not None:
        # Load on CPU first; strict=False tolerates missing/unexpected keys.
        checkpoint = torch.load(ckpt_path, map_location='cpu')
        model_trainer.load_state_dict(checkpoint['state_dict'], strict=False)

    # auto_insert_metric_name=False: the template already spells out the
    # "epoch=" / "loss=" prefixes. With the default (True) Lightning prepends
    # the metric name again, producing names such as
    # "epoch=epoch=0-loss=train_loss=0.1843.ckpt".
    checkpoint_callback = ModelCheckpoint(save_top_k=1, monitor="train_loss",
                                          mode="min", save_last=True,
                                          filename='epoch={epoch}-loss={train_loss:.4f}',
                                          auto_insert_metric_name=False)

#     logger = TensorBoardLogger(
#         save_dir='logs/tb_logs', name=args.exp_name, default_hp_metric=False)

    # NOTE(review): the captured run prints a deprecation warning for the
    # NVIDIA/apex AMP integration; consider moving to native AMP and confirm
    # the intended `precision` setting.
    training_pipeline = pl.Trainer.from_argparse_args(
        args,
        callbacks=[checkpoint_callback],
#         limit_train_batches=1,
#         strategy = "ddp_notebook",
        amp_backend="apex",
        amp_level="O2"
        )

    training_pipeline.fit(model_trainer, train_loader)

In [9]:
# Project-specific options as (name, value type); each becomes "--<name>".
_PIPELINE_OPTIONS = (
    ("main_config_path", str),
    ("train_metadata", str),
    ("val_metadata", str),
    ("num_views", int),
    ("num_workers", int),
    ("exp_name", str),
    ("batch_size", int),
    ("n_epochs", int),
    ("learning_rate", float),
    ("ckpt_path", str),
)

# Start from the Trainer's own arguments, then register the project options.
parser = pl.Trainer.add_argparse_args(ArgumentParser())
for _opt_name, _opt_type in _PIPELINE_OPTIONS:
    parser.add_argument(f"--{_opt_name}", type = _opt_type, dest = _opt_name)

# Values for this run, given as strings exactly as they would appear on a
# command line ("None" for ckpt_path means "no checkpoint").
_RUN_VALUES = {
    "main_config_path": trainer_config_path,
    "train_metadata": "ir-20231129-train-split",
    "val_metadata": "ir-20231129-val-split",
    "num_views": "1",
    "num_workers": "12",
    "exp_name": "ir-training-pipeline-test",
    "batch_size": "1",
    "n_epochs": "10",
    "learning_rate": "0.0002",
    "ckpt_path": "None",
}

# Flatten to ["--name", value, "--name", value, ...] in declaration order.
_argv = [
    token
    for _key, _value in _RUN_VALUES.items()
    for token in (f"--{_key}", _value)
]

args = pl.Trainer.parse_argparser(parser.parse_args(_argv))

In [10]:
## set devices and epochs
args.accelerator = "gpu"
args.devices = 1
args.max_epochs = 30
args.num_sanity_val_steps=0 
# args.ckpt_path = "/home/jupyter-group3/reconstruction/reconstruction-deep-network/reconstruction_deep_network/notebooks/pipelines/lightning_logs/version_17/checkpoints/epoch=epoch=0-loss=train_loss=0.1843.ckpt"

In [11]:
# Sanity check: display the parsed number of views.
args.num_views

1

In [12]:
# Launch training with the namespace assembled above.
# NOTE(review): the captured output of this cell ends in a CUDA
# OutOfMemoryError before any training step completes.
main(args)

Size of train dataset: 100
Loading diffusion models...


  deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False)
  "The NVIDIA/apex AMP implementation has been deprecated upstream. Consequently, its integration inside"
  "The NVIDIA/apex AMP implementation has been deprecated upstream. Consequently, its integration inside"
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Training for 30 epochs...
Diffusion Training timesteps: 1000


  "Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning`"
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name          | Type               | Params
-----------------------------------------------------
0 | mv_base_model | MultiViewBaseModel | 865 M 
-----------------------------------------------------
865 M     Trainable params
0         Non-trainable params
865 M     Total params
3,463.643 Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 320.00 MiB (GPU 0; 15.78 GiB total capacity; 15.10 GiB already allocated; 43.44 MiB free; 15.13 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Re-check GPU state after the run above.
!nvidia-smi

In [None]:
# NOTE(review): hard-coded PID; it is only meaningful for the session in
# which it was looked up — verify before running.
!kill 2810


In [None]:
# Ask PyTorch to release cached GPU memory that is currently unused.
torch.cuda.empty_cache()

In [None]:
# Inspect the `gpus` value on the parsed namespace.
args.gpus

In [None]:
# Inspect the `devices` value on the parsed namespace.
args.devices