- base version: exp002
- add: 重複画像のラベルデータを後処理で埋める

In [1]:
import os
from typing import *

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from src.utils import Jbl


# IPython magic (not plain Python): render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's "whitegrid" style to plots drawn after this point.
sns.set_style("whitegrid")

In [20]:
from dataclasses import dataclass, field
from glob import glob

import torch


@dataclass
class InputPath:
    """Relative paths of the input files used by this notebook."""
    # NOTE: the f-strings below are evaluated once, while the class body runs,
    # using the default `_prefix`. Passing another `_prefix` to the constructor
    # therefore does NOT change the other fields.
    _prefix: str = "../input"
    # .npz archive of training images; its length is used as the train/test offset later in the notebook.
    train_images: str = f"{_prefix}/christ-train-imgs.npz"
    # presumably the training labels matching `train_images` — not read in the visible cells.
    train_labels: str = f"{_prefix}/christ-train-labels.npz"
    # presumably the test images — not read in the visible cells.
    test_images: str = f"{_prefix}/christ-test-imgs.npz"
    # Path prefix under the input directory; its consumer is not visible here.
    images_prefix: str = f"{_prefix}/data/train/0"
        
        
@dataclass
class OutputPath:
    """Relative paths of the output directories written by this notebook."""
    # NOTE: `model` and `submission` are computed once from the default `prefix`
    # while the class body runs; overriding `prefix` on an instance does not
    # update them. The notebook reads these as class attributes.
    prefix: str = "../output"
    # Directory listed by the run-name guard to find already-used run names.
    model: str = f"{prefix}/model"
    # Directory that submission CSV files are read from and written to.
    submission: str = f"{prefix}/submission"

        
@dataclass
class Basic:
    """Run-level settings: experiment name, debug switch, seed and device."""
    # Experiment identifier; compared against the prefixes of saved model files
    # to refuse reusing an existing name.
    run_name: str = "exp004"
    # Read as a class attribute by `Params.epochs` to pick a short run.
    is_debug: bool = False
    # presumably the random seed used for reproducibility — its consumer is not visible here.
    seed: int = 42
    # Chosen once, when the class body runs: "cuda" if torch reports CUDA as available, else "cpu".
    device: str = "cuda" if torch.cuda.is_available() else "cpu"
        
        
@dataclass
class Kfold:
    """Cross-validation split settings (the code that consumes them is not visible here)."""
    # presumably the number of folds — TODO confirm against the splitter code.
    number: int = 5
    # Splitter selector; "skf" presumably means StratifiedKFold — TODO confirm.
    method: str = "skf"
    shuffle: bool = True
    # default_factory gives every instance its own list instead of one shared list.
    columns: List[str] = field(default_factory=lambda: ["target"])
        
        
@dataclass
class Adam:
    """Optimizer settings container.

    Field names mirror the arguments of ``torch.optim.Adam``; presumably they
    are forwarded to it, but the code doing so is not visible here.
    """
    # Selector string naming the optimizer to build.
    name: str = "Adam"
    lr: float = 1e-5
    weight_decay: float = 0
    amsgrad: bool = False
        
        
@dataclass
class ReduceLROnPlateau:
    """Learning-rate scheduler settings container.

    Field names mirror the arguments of
    ``torch.optim.lr_scheduler.ReduceLROnPlateau``; presumably they are
    forwarded to it, but the code doing so is not visible here.
    """
    # Selector string naming the scheduler to build.
    name: str = "ReduceLROnPlateau"
    mode: str = "min"
    factor: float = 0.1
    patience: int = 5
    verbose: bool = True
    eps: float = 1e-8

        
@dataclass
class Params:
    """Training and inference hyper-parameters for one experiment.

    ``optimizer`` and ``scheduler`` are created through ``default_factory`` so
    that every ``Params`` instance owns its own settings object. Using a
    dataclass instance directly as the default shares one object between all
    instances, and Python 3.11+ rejects such unhashable defaults with
    ``ValueError`` when the class is created. As a consequence these two
    fields are available on instances (``Params().optimizer``) but not as
    class attributes.
    """

    model_name: str = "resnet34"
    batch_size: int = 64
    test_batch_size: int = 256
    # Evaluated once, while the class body runs, from the class-level default
    # ``Basic.is_debug``; a ``Basic(is_debug=True)`` instance does not change it.
    epochs: int = 3 if Basic.is_debug else 100
    image_size: int = 224
    num_workers: int = 0
    # presumably the number of output classes — TODO confirm against the model code.
    target_size: int = 13
    # Intended type: Union[Adam]
    optimizer: Adam = field(default_factory=Adam)
    # Intended type: Union[CosineAnnealingLR, CosineAnnealingWarmRestarts, ReduceLROnPlateau]
    scheduler: ReduceLROnPlateau = field(default_factory=ReduceLROnPlateau)
    pretrained: bool = True
    num_aug: int = 5
    num_tta: int = 2
    early_stopping_rounds: int = 10


# @dataclass
# class DinoParams:
#     arch: str = "vit_small"
#     embed_dim: int = 384
#     patch_size: int = 16
#     checkpoint: str = f"../output/model/dino/{Basic.run_name}/checkpoint1540.pth"
        

@dataclass
class ModelConfig:
    """Top-level configuration bundling the run, k-fold and training settings.

    Each section is created through ``default_factory`` so that every
    ``ModelConfig`` instance gets fresh ``Basic`` / ``Kfold`` / ``Params``
    objects. Using dataclass instances directly as defaults shares them
    between all instances, and Python 3.11+ rejects such unhashable defaults
    with ``ValueError`` when the class is created. The sections are therefore
    instance attributes only (``ModelConfig().basic``), not class attributes.
    """

    basic: Basic = field(default_factory=Basic)
    kfold: Kfold = field(default_factory=Kfold)
    params: Params = field(default_factory=Params)
#     dino_params: DinoParams = DinoParams()

In [4]:
# for x in os.listdir(OutputPath.model):
#     if x.startswith(f"{Basic.run_name}_"):
#         os.remove(f"{OutputPath.model}/{x}")

In [5]:
# Run names that already own a "<run>_0.pth" file in the model directory
# (presumably the fold-0 checkpoint of each earlier experiment).
past_sessions = [
    file_name.partition("_")[0]
    for file_name in os.listdir(OutputPath.model)
    if file_name.endswith("_0.pth")
]
# Stop here if the current run name was already used, so an earlier
# experiment's files are not silently mixed with or replaced by this run's.
assert Basic.run_name not in past_sessions

In [7]:
import numpy as np


def load_npz(path: str) -> np.ndarray:
    """Load the array stored under ``"arr_0"`` in a ``.npz`` archive.

    ``"arr_0"`` is the key NumPy assigns to the first array passed
    positionally to ``np.savez``.

    Args:
        path: Location of the ``.npz`` file.

    Returns:
        The array saved under ``"arr_0"``, fully read into memory.

    Raises:
        KeyError: If the archive has no ``"arr_0"`` entry.
    """
    # np.load on an .npz returns a lazy NpzFile that keeps the file open;
    # the context manager closes it once the array has been read.
    with np.load(path) as archive:
        return archive["arr_0"]

In [13]:
# NOTE(review): `fix_seed` is neither defined nor imported in the visible part of
# this notebook — presumably it came from a cell that is not shown; confirm
# before running the notebook top to bottom on a fresh kernel.
fix_seed()
# Build the path and model configuration objects with their default values.
input_path = InputPath()
model_config = ModelConfig()

In [40]:
# Number of training images; used below as the offset between the stored
# duplicate-image keys and the submission ids.
train_length = len(load_npz(input_path.train_images))
# Predictions of the base experiment (exp002) that are post-processed here.
sub = pd.read_csv(f"{OutputPath.submission}/submission_exp002.csv")
duplicate_image_target_dict: Dict[int, int] = Jbl.load(f"{OutputPath.prefix}/duplicate_image_target_dict.jbl")
# Shift every key down by the training-set size. Presumably the stored keys
# index the concatenated train+test images, so the shifted keys are test ids
# — TODO confirm against the code that produced the .jbl file.
duplicate_image_target_dict = {
    image_index - train_length: target
    for image_index, target in duplicate_image_target_dict.items()
}
# Use the known label when the id is in the duplicate mapping; otherwise keep
# the model's prediction.
y_ = [
    duplicate_image_target_dict.get(image_id, predicted)
    for image_id, predicted in zip(sub["id"], sub["y"])
]
# Notebook display: show old and new labels side by side for the affected rows.
# `.iloc` selects by position, so this assumes row position equals id — TODO confirm.
sub.assign(y_=y_).iloc[list(duplicate_image_target_dict)]

Unnamed: 0,id,y,y_
455,455,3,3
118,118,5,5
162,162,7,7
186,186,2,2
413,413,2,2
259,259,7,7
139,139,2,2
36,36,9,9
341,341,9,9
299,299,5,5


In [42]:
# Write the post-processed labels as this run's submission. The file name is
# derived from Basic.run_name (currently "exp004") instead of being typed out
# again, so a copy of this notebook with a new run name cannot overwrite this
# run's submission file.
sub.assign(y=y_).to_csv(f"{OutputPath.submission}/submission_{Basic.run_name}.csv", index=False)