In [1]:
import gc
import shutil
from pathlib import Path

import torch
import pandas as pd

# import losses
# import train
# import inference
# from utils.log_utils import get_exp_name
# from optimizers.lion import Lion
# from models.transformer_encoder import Network
# from models.convlstm import Seq2Seq

In [2]:
data_dir = "/kaggle/input/yandex-cup-ml-23-nowcasting"
class Config:
    # logging
    logs_dir: Path = Path("/kaggle/working/")

    # data
    data_dir = Path(f"{data_dir}/")
    train_files = [
        f"{data_dir}/ML Cup 2023 Weather/train/2021-01-train.hdf5",
        f"{data_dir}/ML Cup 2023 Weather/train/2021-02-train.hdf5",
        f"{data_dir}/ML Cup 2023 Weather/train/2021-03-train.hdf5",
        f"{data_dir}/ML Cup 2023 Weather/train/2021-04-train.hdf5",
        f"{data_dir}/ML Cup 2023 Weather/train/2021-05-train.hdf5",
        f"{data_dir}/ML Cup 2023 Weather/train/2021-06-train.hdf5",
        f"{data_dir}/ML Cup 2023 Weather/train/2021-07-train.hdf5",
        f"{data_dir}/ML Cup 2023 Weather/train/2021-08-train.hdf5",
        f"{data_dir}/ML Cup 2023 Weather/train/2021-09-train.hdf5",
        f"{data_dir}/ML Cup 2023 Weather/train/2021-10-train.hdf5",
        f"{data_dir}/ML Cup 2023 Weather/train/2021-11-train.hdf5",
        f"{data_dir}/ML Cup 2023 Weather/train/2021-12-train.hdf5",
    ]
    valid_files = [
        # f"{data_dir}/ML Cup 2023 Weather/train/2021-02-train.hdf5",
        # f"{data_dir}/ML Cup 2023 Weather/train/2021-05-train.hdf5",
        # f"{data_dir}/ML Cup 2023 Weather/train/2021-08-train.hdf5",
        f"{data_dir}/ML Cup 2023 Weather/train/2021-11-train.hdf5",
    ]
    test_files = [f"{data_dir}/ML Cup 2023 Weather/2022-test-public.hdf5"]
    mode = "overlap"

    batch_size = 8  # 104 128
    eval_batch_size = 1
    num_workers = 12

    # aug
    mix_proba = 1.0
    mixup_alpha = 1.0
    cutmix_alpha = 1.0

    device = "cuda:0"
    use_amp = True
    clip_value = 1
    lr = 3e-5  # 3e-5  # lion: 3e-5  adamw: 1e-4
    min_lr = 1e-8

    n_epochs = 3

    label_smoothing = 0.0

# Data

In [3]:
import math
import random
from typing import Any

import torch
import numpy as np
import torchvision
from torch import Tensor


class RandomMixup(torch.nn.Module):
    """Randomly apply Mixup to the provided batch and targets.
    The class implements the data augmentations as described in the paper
    `"mixup: Beyond Empirical Risk Minimization" <https://arxiv.org/abs/1710.09412>`_.

    Args:
        num_classes (int): number of classes used for one-hot encoding.
        p (float): probability of the batch being transformed. Default value is 0.5.
        alpha (float): hyperparameter of the Beta distribution used for mixup.
            Default value is 1.0.
        inplace (bool): boolean to make this transform inplace. Default set to False.
    """

    def __init__(
        self,
        p: float = 1.0,
        alpha: float = 1.0,
        inplace: bool = False,
    ) -> None:
        super().__init__()

        if alpha <= 0:
            raise ValueError("Alpha param can't be zero.")

        self.p = p
        self.alpha = alpha
        self.inplace = inplace

    def forward(self, batch: Tensor, target: Tensor):
        """
        Args:
            batch (Tensor): Float tensor of size (B, C, H, W)
            target (Tensor): Integer tensor of size (B, )

        Returns:
            Tensor: Randomly transformed batch.
        """
        if not self.inplace:
            batch = batch.clone()
            target = target.clone()

        if torch.rand(1).item() >= self.p:
            return batch, target

        # It's faster to roll the batch by one instead of shuffling it to create image pairs
        batch_rolled = batch.roll(1, 0)
        target_rolled = target.roll(1, 0)

        # Implemented as on mixup paper, page 3.
        lambda_param = float(
            torch._sample_dirichlet(torch.tensor([self.alpha, self.alpha]))[0]
        )
        batch_rolled.mul_(1.0 - lambda_param)
        batch.mul_(lambda_param).add_(batch_rolled)

        target_rolled.mul_(1.0 - lambda_param)
        target.mul_(lambda_param).add_(target_rolled)

        return batch, target

    def __repr__(self) -> str:
        s = (
            f"{self.__class__.__name__}("
            f", p={self.p}"
            f", alpha={self.alpha}"
            f", inplace={self.inplace}"
            f")"
        )
        return s


class RandomCutmix(torch.nn.Module):
    """Randomly apply Cutmix to the provided batch and targets.
    The class implements the data augmentations as described in the paper
    `"CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features"
    <https://arxiv.org/abs/1905.04899>`_.

    Args:
        num_classes (int): number of classes used for one-hot encoding.
        p (float): probability of the batch being transformed. Default value is 0.5.
        alpha (float): hyperparameter of the Beta distribution used for cutmix.
            Default value is 1.0.
        inplace (bool): boolean to make this transform inplace. Default set to False.
    """

    def __init__(
        self,
        p: float = 1.0,
        alpha: float = 1.0,
        inplace: bool = False,
    ) -> None:
        super().__init__()
        self.p = p
        self.alpha = alpha
        self.inplace = inplace

    def forward(self, batch: Tensor, target: Tensor):
        """
        Args:
            batch (Tensor): Float tensor of size (B, C, H, W)
            target (Tensor): Integer tensor of size (B, )

        Returns:
            Tensor: Randomly transformed batch.
        """
        if not self.inplace:
            batch = batch.clone()
            target = target.clone()

        if torch.rand(1).item() >= self.p:
            return batch, target

        # It's faster to roll the batch by one instead of shuffling it to create image pairs
        batch_rolled = batch.roll(1, 0)
        target_rolled = target.roll(1, 0)

        # Implemented as on cutmix paper, page 12 (with minor corrections on typos).
        lambda_param = float(
            torch._sample_dirichlet(torch.tensor([self.alpha, self.alpha]))[0]
        )
        *_, H, W = torchvision.transforms.functional.get_dimensions(batch)

        r_x = torch.randint(W, (1,))
        r_y = torch.randint(H, (1,))

        r = 0.5 * math.sqrt(1.0 - lambda_param)
        r_w_half = int(r * W)
        r_h_half = int(r * H)

        x1 = int(torch.clamp(r_x - r_w_half, min=0))
        y1 = int(torch.clamp(r_y - r_h_half, min=0))
        x2 = int(torch.clamp(r_x + r_w_half, max=W))
        y2 = int(torch.clamp(r_y + r_h_half, max=H))

        batch[..., y1:y2, x1:x2] = batch_rolled[..., y1:y2, x1:x2]
        target[..., y1:y2, x1:x2] = target_rolled[..., y1:y2, x1:x2]

        # lambda_param = float(1.0 - (x2 - x1) * (y2 - y1) / (W * H))

        # target_rolled.mul_(1.0 - lambda_param)
        # target.mul_(lambda_param).add_(target_rolled)

        return batch, target

    def __repr__(self) -> str:
        s = (
            f"{self.__class__.__name__}("
            f", p={self.p}"
            f", alpha={self.alpha}"
            f", inplace={self.inplace}"
            f")"
        )
        return s


class RandomAugment:
    def __call__(self, emb: Tensor) -> Tensor:
        # concat samples

        if random.random() < 0.5:
            emb = emb.flip(0)

        if random.random() < 0.5:
            cuts = random.randint(1, 8)
            # cuts = random.randint(1, int(len(emb) * 0.2))
            idxs_to_cut = np.random.choice(len(emb), cuts)
            m = torch.ones(len(emb), dtype=torch.bool)
            m[idxs_to_cut] = False
            emb = emb[m]

        # shift
        n_shifts = random.randint(0, len(emb) // 2)
        emb = emb.roll(n_shifts, 0)

        return emb

In [4]:
import random
import json
from pathlib import Path

import h5py
import torch
import torchvision
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from sklearn import preprocessing


class RadarDataset(Dataset):
    def __init__(
        self,
        list_of_files,
        in_seq_len=4,
        out_seq_len=12,
        mode="overlap",
        with_time=False,
    ):
        self.in_seq_len = in_seq_len
        self.out_seq_len = out_seq_len
        self.seq_len = in_seq_len + out_seq_len
        self.with_time = with_time
        self.__prepare_timestamps_mapping(list_of_files)
        self.__prepare_sequences(mode)

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, index):
        data = []
        for timestamp in self.sequences[index]:
            with h5py.File(self.timestamp_to_file[timestamp]) as d:
                data.append(np.array(d[timestamp]["intensity"]))
        data = np.expand_dims(data, axis=1)
        data[data == -1e6] = 0
        data[data == -2e6] = -1
        inputs = data[: self.in_seq_len]
        targets = data[self.in_seq_len :]
        out = {"features": torch.from_numpy(inputs), "label": torch.from_numpy(targets)}
        if self.with_time:
            out["timestamp"] = torch.tensor(int(self.sequences[index][-1]))
        return out

    def __prepare_timestamps_mapping(self, list_of_files):
        self.timestamp_to_file = {}
        for filename in list_of_files:
            with h5py.File(filename) as d:
                self.timestamp_to_file = {
                    **self.timestamp_to_file,
                    **dict(map(lambda x: (x, filename), d.keys())),
                }

    def __prepare_sequences(self, mode):
        timestamps = np.unique(sorted(self.timestamp_to_file.keys()))
        if mode == "sequentially":
            self.sequences = [
                timestamps[index * self.seq_len : (index + 1) * self.seq_len]
                for index in range(len(timestamps) // self.seq_len)
            ]
        elif mode == "overlap":
            self.sequences = [
                timestamps[index : index + self.seq_len]
                for index in range(len(timestamps) - self.seq_len + 1)
            ]
        else:
            raise Exception(f"Unknown mode {mode}")
        self.sequences = list(
            filter(
                lambda x: int(x[-1]) - int(x[0]) == (self.seq_len - 1) * 600,
                self.sequences,
            )
        )


class Collator:
    def __init__(
        self,
        stage: str = "train",
        mix_proba: float = 0.5,
        mixup_alpha: float = 0.0,
        cutmix_alpha: float = 0.0,
    ) -> None:
        assert stage in ("train", "val", "test")
        self.stage = stage

        # mixup & cutmix augmentations
        self.mix_transform = None
        if (stage == "train") and (random.random() < mix_proba):
            mix_transforms = []
            if mixup_alpha > 0:
                mix_transforms.append(RandomMixup(alpha=mixup_alpha))
            if cutmix_alpha > 0:
                mix_transforms.append(RandomCutmix(alpha=cutmix_alpha))

            self.mix_transform = torchvision.transforms.RandomChoice(mix_transforms)

    def __call__(self, batch):
        seq_len, n_channels, height, width = batch[0]["features"].shape
        features = torch.zeros(len(batch), *batch[0]["features"].shape)

        has_label = "label" in batch[0]
        if has_label:
            labels = torch.zeros((len(batch), *batch[0]["label"].shape))
            # one_label = torch.zeros(len(batch))

        for idx, item in enumerate(batch):
            features[idx] = item["features"]
            if has_label:
                labels[idx] = item["label"]

        if has_label and self.mix_transform and (self.stage == "train"):
            features, labels = self.mix_transform(features, labels)

        out = {
            "features": features,
        }
        if has_label:
            out["label"] = labels

        return out




# Model

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class ConvLSTMCell(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, padding, activation):
        super().__init__()

        if activation == "tanh":
            self.activation = torch.tanh
        elif activation == "relu":
            self.activation = torch.relu

        self.conv = nn.Conv2d(
            in_channels=in_channels + out_channels,
            out_channels=4 * out_channels,
            kernel_size=kernel_size,
            padding=padding,
        )

    def forward(self, X, H_prev, C_prev):
        conv_output = self.conv(torch.cat([X, H_prev], dim=1))
        i_conv, f_conv, C_conv, o_conv = torch.chunk(conv_output, chunks=4, dim=1)
        input_gate = torch.sigmoid(i_conv)
        forget_gate = torch.sigmoid(f_conv)
        output_gate = torch.sigmoid(o_conv)
        C = forget_gate * C_prev + input_gate * self.activation(C_conv)
        H = output_gate * self.activation(C)
        return H, C


class ConvLSTM(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, padding, activation):
        super().__init__()
        self.out_channels = out_channels
        self.convLSTMCell = ConvLSTMCell(
            in_channels, out_channels, kernel_size, padding, activation
        )

    def forward(self, X):
        batch_size, seq_len, _, height, width = X.size()
        output = torch.zeros(
            batch_size,
            seq_len,
            self.out_channels,
            height,
            width,
            device=self.convLSTMCell.conv.weight.device,
        )
        H = torch.zeros(
            batch_size,
            self.out_channels,
            height,
            width,
            device=self.convLSTMCell.conv.weight.device,
        )
        C = torch.zeros(
            batch_size,
            self.out_channels,
            height,
            width,
            device=self.convLSTMCell.conv.weight.device,
        )
        for time_step in range(seq_len):
            H, C = self.convLSTMCell(X[:, time_step], H, C)
            output[:, time_step] = H
        return output


class Seq2Seq(nn.Module):
    def __init__(
        self,
        num_channels,
        num_kernels,
        kernel_size,
        padding,
        activation,
        num_layers,
        out_seq_len,
    ):
        super().__init__()
        self.out_seq_len = out_seq_len

        self.sequential = nn.Sequential()
        self.sequential.add_module(
            "convlstm1",
            ConvLSTM(
                in_channels=num_channels,
                out_channels=num_kernels,
                kernel_size=kernel_size,
                padding=padding,
                activation=activation,
            ),
        )
        for layer_index in range(2, num_layers + 1):
            self.sequential.add_module(
                f"convlstm{layer_index}",
                ConvLSTM(
                    in_channels=num_kernels,
                    out_channels=num_kernels,
                    kernel_size=kernel_size,
                    padding=padding,
                    activation=activation,
                ),
            )
        self.conv = nn.Conv2d(
            in_channels=num_kernels,
            out_channels=num_channels,
            kernel_size=kernel_size,
            padding=padding,
        )

    def forward(self, X, mask=None):
        batch_size, seq_len, num_channels, height, width = X.size()
        inputs = torch.zeros(
            batch_size,
            seq_len + self.out_seq_len - 1,
            num_channels,
            height,
            width,
            device=self.conv.weight.device,
        )
        inputs[:, :seq_len] = X
        output = self.sequential(inputs)
        output = torch.stack(
            [
                self.conv(output[:, index + seq_len - 1])
                for index in range(self.out_seq_len)
            ],
            dim=1,
        )
        return output

In [6]:
from typing import Any, Callable, Optional, Tuple

import torch
from torch import Tensor
from torch.optim.optimizer import Optimizer


def exists(val: Any):
    return val is not None


def update_fn(
    p, grad: Tensor, exp_avg: Tensor, lr: float, wd: float, beta1: float, beta2: float
):
    # stepweight decay
    p.data.mul_(1 - lr * wd)

    # weight update
    update = exp_avg.clone().mul_(beta1).add(grad, alpha=1 - beta1).sign_()
    p.add_(update, alpha=-lr)

    # decay the momentum running average coefficient
    exp_avg.mul_(beta2).add_(grad, alpha=1 - beta2)


class Lion(Optimizer):
    def __init__(
        self,
        params: Any,
        lr: float = 1e-4,
        betas: Tuple[float, float] = (0.9, 0.95),
        weight_decay: float = 1e-2,
    ):
        assert lr > 0.0
        assert all([0.0 <= beta <= 1.0 for beta in betas])

        defaults = dict(lr=lr, betas=betas, weight_decay=weight_decay)

        super().__init__(params, defaults)

        self.update_fn = update_fn

    @torch.no_grad()
    def step(self, closure: Optional[Callable] = None):
        loss = None
        if exists(closure):
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            for p in filter(lambda p: exists(p.grad), group["params"]):
                grad, lr, wd, beta1, beta2, state = (
                    p.grad,
                    group["lr"],
                    group["weight_decay"],
                    *group["betas"],
                    self.state[p],
                )

                # init state - exponential moving average of gradient values

                if len(state) == 0:
                    state["exp_avg"] = torch.zeros_like(p)

                exp_avg = state["exp_avg"]

                self.update_fn(p, grad, exp_avg, lr, wd, beta1, beta2)

        return loss

# Training

In [7]:
import torch
import numpy as np
from tqdm.auto import tqdm
from sklearn.metrics import average_precision_score

from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter


def train(
    cfg,
    net,
    train_dataloader,
    val_dataloader,
    optimizer,
    criterion,
    scheduler=None,
    label_smoothing=0.0,
    tb=None,  # tensorboard logger
):
    scaler = torch.cuda.amp.GradScaler(enabled=cfg.use_amp)
    alpha = 0.8
    best_val_score = 0

    for epoch in range(cfg.n_epochs):
        # Training
        net.train()
        train_loss = None
        train_score = np.zeros((12,), dtype=float)
        for batch_idx, data in enumerate(pbar := tqdm(train_dataloader)):
            # if batch_idx > 50:
            #     break
            optimizer.zero_grad()
            for key in data:
                data[key] = data[key].to(cfg.device)
            batch, targets = data["features"], data["label"]
            mask = data["mask"] if "mask" in data else None
            targets = targets * (1 - label_smoothing) + (label_smoothing / 2)

            with torch.cuda.amp.autocast(enabled=cfg.use_amp):
                logits = net(batch, mask)
                logits[targets == -1] = -1
                loss = criterion(logits, targets)

            scaler.scale(loss).backward()
            if cfg.clip_value is not None:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(net.parameters(), cfg.clip_value)
            scaler.step(optimizer)
            scaler.update()

            train_loss = (
                loss.item()
                if not train_loss
                else alpha * train_loss + (1 - alpha) * loss.item()
            )
            train_score += np.sum(
                (
                    np.square(
                        targets.detach().cpu().numpy() - logits.detach().cpu().numpy()
                    )
                )
                * (targets.detach().cpu().numpy() != -1),
                axis=(0, 2, 3, 4),
            )

            pbar.set_description(
                f"Epoch: {epoch} Loss: {train_loss:.6f} Score: {np.mean(np.sqrt(train_score / (batch_idx + 1))):.2f}"
            )

            del logits, batch, mask, targets

        if scheduler:
            scheduler.step()

        train_score /= batch_idx + 1
        train_score = np.mean(np.sqrt(train_score))
        print("Train Loss:", train_loss)
        print("Train score:", train_score)

        # Evaluation
        net.eval()
        val_loss = None
        val_score = np.zeros((12,), dtype=float)
        for batch_idx, data in enumerate(pbar := tqdm(val_dataloader)):
            # if batch_idx > 50:
            #     break
            with torch.no_grad():
                with torch.cuda.amp.autocast(enabled=cfg.use_amp):
                    for key in data:
                        data[key] = data[key].to(cfg.device)
                    batch, targets = data["features"], data["label"]
                    mask = data["mask"] if "mask" in data else None

                    logits = net(batch, mask)
                    logits[targets == -1] = -1
                    loss = criterion(logits, targets)

                val_loss = (
                    loss.item()
                    if not val_loss
                    else alpha * val_loss + (1 - alpha) * loss.item()
                )
                val_score += np.sum(
                    (
                        np.square(
                            targets.detach().cpu().numpy()
                            - logits.detach().cpu().numpy()
                        )
                    )
                    * (targets.detach().cpu().numpy() != -1),
                    axis=(0, 2, 3, 4),
                )

                pbar.set_description(
                    f"Epoch: {epoch} Loss: {val_loss:.6f} Score: {np.mean(np.sqrt(val_score / (batch_idx + 1))):.2f}"
                )

        val_score /= batch_idx + 1
        val_score = np.mean(np.sqrt(val_score))
        print("LR:", optimizer.param_groups[0]["lr"])
        print("Val Loss:", val_loss)
        print("Val score:", val_score)

        (cfg.logs_dir / "weights").mkdir(exist_ok=True)
        torch.save(net.state_dict(), cfg.logs_dir / "weights" / "last.pt")
        if val_score > best_val_score:
            print(f"Score improved from {best_val_score:.4f} to {val_score:.4f}")
            best_val_score = val_score
            torch.save(net.state_dict(), cfg.logs_dir / "weights" / "best.pt")
        with open(cfg.logs_dir / "weights" / "best_score.txt", "w") as f:
            f.write(
                f"Epoch: {epoch} \nTrain AP: {train_score} \nVal AP: {val_score} \nBest val score: {best_val_score}"
            )
        print()

        if tb is not None:
            tb.add_scalar("Loss/train", train_loss, epoch)
            tb.add_scalar("Loss/val", val_loss, epoch)
            tb.add_scalar("Score/train", train_score, epoch)
            tb.add_scalar("Score/val", val_score, epoch)


def run_training(cfg, net, optimizer, criterion, scheduler):
    tb = SummaryWriter(log_dir=cfg.logs_dir)

    train_dataset = RadarDataset(cfg.train_files, mode=cfg.mode)
    val_dataset = RadarDataset(cfg.valid_files, mode=cfg.mode)
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=cfg.batch_size,
        shuffle=True,
        num_workers=cfg.num_workers,
        collate_fn=Collator(
            "train",
            cfg.mix_proba,
            cfg.mixup_alpha,
            cfg.cutmix_alpha,
        ),
        drop_last=False,
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=cfg.eval_batch_size,
        shuffle=False,
        num_workers=cfg.num_workers,
        collate_fn=Collator(
            "val",
            cfg.mix_proba,
            cfg.mixup_alpha,
            cfg.cutmix_alpha,
        ),
        drop_last=False,
    )

    net.to(cfg.device)

    train(
        cfg,
        net,
        train_dataloader,
        val_dataloader,
        optimizer,
        criterion,
        scheduler,
        cfg.label_smoothing,
        tb,
    )

    del train_dataset, train_dataloader, val_dataset, val_dataloader


# Inference

In [8]:
from pathlib import Path

import torch
import h5py
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from sklearn.metrics import average_precision_score


def process_test(model, device, test_loader, output_file="prediction.hdf5"):
    model.eval()
    with torch.no_grad():
        for index, item in enumerate(tqdm(test_loader)):
            inputs, last_input_timestamp = item["features"], item["timestamp"]
            output = model(inputs.to(device))
            output = output.cpu().numpy()
            with h5py.File(output_file, mode="a") as f_out:
                for index in range(output.shape[1]):
                    timestamp_out = str(
                        int(last_input_timestamp[-1]) + 600 * (index + 1)
                    )
                    f_out.create_group(timestamp_out)
                    f_out[timestamp_out].create_dataset(
                        "intensity", data=output[0, index, 0]
                    )


def run_inference(cfg, net):
    test_dataset = RadarDataset(cfg.test_files, out_seq_len=0, with_time=True)
    test_dataloader = DataLoader(
        test_dataset,
        batch_size=cfg.eval_batch_size,
        shuffle=False,
        num_workers=cfg.num_workers,
        # collate_fn=Collator(
        #     "test",
        #     cfg.num_labels,
        #     mix_proba=0,
        #     max_crop_size=max_crop_size,
        #     is_fixed_crop=is_fixed_crop,
        # ),
    )
    net.to(cfg.device)
    if (cfg.logs_dir / "prediction.hdf5").exists():
        (cfg.logs_dir / "prediction.hdf5").unlink()
    process_test(net, cfg.device, test_dataloader, cfg.logs_dir / "prediction.hdf5")


# Run

In [9]:
cfg = Config()

net = Seq2Seq(
    num_channels=1,
    num_kernels=32,
    kernel_size=(3, 3),
    padding=(1, 1),
    activation="relu",
    num_layers=1,
    out_seq_len=12,
)

In [10]:
try:
    print("Logging dir:", cfg.logs_dir)

    criterion = torch.nn.MSELoss()

    # optimizer = torch.optim.AdamW(net.parameters(), lr=cfg.lr)
    optimizer = Lion(net.parameters(), lr=cfg.lr, weight_decay=0.01)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=cfg.n_epochs, eta_min=cfg.min_lr
    )
    run_training(
        cfg,
        net,
        optimizer,
        criterion,
        scheduler,
    )

    del criterion, optimizer, scheduler
    torch.cuda.empty_cache()
    gc.collect()
except KeyboardInterrupt:
    print("Stop training")

Logging dir: /kaggle/working




  0%|          | 0/6401 [00:00<?, ?it/s]

Train Loss: 0.18821787908222454
Train score: 343.6676802412995


  0%|          | 0/4155 [00:00<?, ?it/s]

LR: 2.25025e-05
Val Loss: 0.0056676393865231775
Val score: 57.36043998065622
Score improved from 0.0000 to 57.3604



  0%|          | 0/6401 [00:00<?, ?it/s]

Train Loss: 0.18482099303318247
Train score: 339.9560927494965


  0%|          | 0/4155 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fdfb8534550>Traceback (most recent call last):

  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
    if w.is_alive():
  File "/opt/conda/lib/python3.10/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fdfb8534550>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1479, in __del__
    self._shutdown_workers()
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1462, in _shutdown_workers
    if w.is_alive():
  File "/op

LR: 7.507500000000003e-06
Val Loss: 0.004868088762148794
Val score: 56.357297260379966



  0%|          | 0/6401 [00:00<?, ?it/s]

Train Loss: 0.252333953468237
Train score: 337.4972546585689


  0%|          | 0/4155 [00:00<?, ?it/s]

LR: 1e-08
Val Loss: 0.004707685397304068
Val score: 56.20484753395967



In [11]:
# INFER
print("INFERENCE")
net.load_state_dict(
    torch.load(cfg.logs_dir / "weights/last.pt", map_location="cpu")
)

run_inference(cfg, net)

INFERENCE


  0%|          | 0/96 [00:00<?, ?it/s]