In [1]:
import sys
# True when the Kaggle web client module is loaded, i.e. the notebook runs on Kaggle.
is_kaggle_notebook = "kaggle_web_client" in sys.modules
if is_kaggle_notebook:
    # Remove the preinstalled timm package.
    # NOTE(review): presumably so the timm sources appended to sys.path below are imported instead — confirm.
    %pip uninstall timm -y
    
# Add package source trees stored under /kaggle/input to the import path.
sys.path.append('/kaggle/input/pretrainedmodels/pretrainedmodels-0.7.4')
sys.path.append('/kaggle/input/efficientnet-pytorch/EfficientNet-PyTorch-master')
sys.path.append('/kaggle/input/timm-pytorch-image-models/pytorch-image-models-master')
sys.path.append('/kaggle/input/segmentation-models-pytorch/segmentation_models.pytorch-master')


In [2]:
import os
import warnings
from glob import glob
from pathlib import Path
import shutil

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from tqdm import tqdm
import matplotlib.pyplot as plt
import albumentations as A
from albumentations.pytorch import ToTensorV2
from torch.utils.data import DataLoader, Dataset
import segmentation_models_pytorch as smp

# Suppress every warning category for the rest of the session.
warnings.simplefilter("ignore")
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [43]:
class CFG:
    """Static configuration namespace read by the cells of this notebook."""

    # ============== comp exp name =============
    comp_name = 'contrail'
    # Root directory and folder name of the competition data; joined with "/" by the cells below.
    comp_dir_path = '/kaggle/input/'
    comp_folder_name = 'google-research-identify-contrails-reduce-global-warming'

    # Directory where create_dataset writes the preprocessed .npy files and the CSV index.
    dataset_path = "/kaggle/working/dataset_test/ash_color/"

    # Checkpoints that are ensembled (local development layout).
    pth_paths=[
                "/kaggle/working/notebook/experiment/v3/model1_iter1/model1_iter1/model1_iter1_fold0.pth",
                "/kaggle/working/notebook/experiment/v3/model1_iter1/model1_iter1/model1_iter1_fold1.pth",
                "/kaggle/working/notebook/experiment/v3/model1_iter2/model1_iter2/model1_iter2_fold0.pth",
                "/kaggle/working/notebook/experiment/v3/model1_iter2/model1_iter2/model1_iter2_fold1.pth",
                "/kaggle/working/notebook/experiment/v3/model1_iter2/model1_iter2/model1_iter2_fold2.pth",
                "/kaggle/working/notebook/experiment/v3/model1_iter3/model1_iter3/model1_iter3_fold0.pth",
                "/kaggle/working/notebook/experiment/v3/model1_iter3/model1_iter3/model1_iter3_fold1.pth",
                "/kaggle/working/notebook/experiment/v3/model1_iter3/model1_iter3/model1_iter3_fold2.pth",
                "/kaggle/working/notebook/experiment/v3/model2_iter1/model2_iter1/model2_iter1_fold0.pth",
                "/kaggle/working/notebook/experiment/v3/model2_iter1/model2_iter1/model2_iter1_fold1.pth",
                "/kaggle/working/notebook/experiment/v3/model2_iter1/model2_iter1/model2_iter1_fold2.pth",
                "/kaggle/working/notebook/experiment/v3/model3_iter1/model3_iter1/model3_iter1_fold0.pth",
                "/kaggle/working/notebook/experiment/v3/model3_iter1/model3_iter1/model3_iter1_fold1.pth",
                "/kaggle/working/notebook/experiment/v3/model3_iter1/model3_iter1/model3_iter1_fold2.pth",
               ]
    # On Kaggle the list above is replaced by every .pth file found under the attached dataset.
    if is_kaggle_notebook:
        pth_paths=glob("/kaggle/input/modelsv1/**/*.pth",recursive=True)
    
    # ============== model =============
    # NOTE(review): TTA is not read by any cell visible in this notebook.
    TTA = is_kaggle_notebook
    # Cutoff applied to the model-averaged probability map when building the submission masks.
    # NOTE(review): the threshold-search cell printed an optimum of 0.43 on validation;
    # confirm that 5e-8 is intentional before submitting.
    thresh = 0.00000005
    # ============== training cfg =============
    # Batch size used by both the validation and the test DataLoader.
    valid_batch_size = 32

    # ============== fixed =============
    num_workers = 4
    # NOTE(review): seed is not applied anywhere in the visible cells.
    seed = 42

    # ============== augmentation =============

    # Transform list handed to albumentations.Compose by ContrailsDataset (tensor conversion only).
    valid_aug_list = [
        ToTensorV2(transpose_mask=True),
    ]

# Dataset Preprocess

In [44]:
def read_record(record_id, directory, mode):
    """Load the band arrays (and the label mask when available) of one record.

    Args:
        record_id: Name of the record's sub-directory inside ``directory``.
        directory: Directory holding one sub-directory per record.
        mode: ``"train"`` or ``"validation"`` loads the three bands plus
            ``human_pixel_masks``; ``"test"`` loads the three bands only.

    Returns:
        dict mapping each file stem (e.g. ``"band_11"``) to the array returned
        by ``np.load`` for ``<directory>/<record_id>/<stem>.npy``.

    Raises:
        ValueError: if ``mode`` is not one of the supported values. Previously
            this surfaced as an UnboundLocalError on ``bands_mask``.
    """
    bands = ["band_11", "band_14", "band_15"]
    if mode in ("train", "validation"):
        names = bands + ["human_pixel_masks"]
    elif mode == "test":
        names = bands
    else:
        raise ValueError(
            f"Unsupported mode {mode!r}; expected 'train', 'validation' or 'test'."
        )

    return {
        name: np.load(os.path.join(directory, record_id, name + ".npy"))
        for name in names
    }


def normalize_range(data, bounds):
    """Linearly rescale ``data`` so that ``bounds[0]`` maps to 0 and ``bounds[1]`` maps to 1.

    Values outside the bounds are not clipped here.
    """
    lower = bounds[0]
    span = bounds[1] - lower
    shifted = data - lower
    return shifted / span


def get_false_color(record_data):
    """Build a 3-channel false-color image from bands 11, 14 and 15.

    Channels (stacked on axis 2, then clipped to [0, 1]):
        0: band_15 - band_14, scaled with bounds (-4, 2)
        1: band_14 - band_11, scaled with bounds (-4, 5)
        2: band_14, scaled with bounds (243, 303)

    Only index 4 of the last axis of the stacked array is returned.
    NOTE(review): presumably that axis is the time sequence and index 4 the
    labelled frame — confirm against the data description.
    """
    t11_bounds = (243, 303)
    cloud_top_tdiff_bounds = (-4, 5)
    tdiff_bounds = (-4, 2)
    frame_index = 4

    band_11 = record_data["band_11"]
    band_14 = record_data["band_14"]
    band_15 = record_data["band_15"]

    channels = [
        normalize_range(band_15 - band_14, tdiff_bounds),
        normalize_range(band_14 - band_11, cloud_top_tdiff_bounds),
        normalize_range(band_14, t11_bounds),
    ]
    stacked = np.clip(np.stack(channels, axis=2), 0, 1)
    return stacked[..., frame_index]

In [45]:
def create_dataset(data_dir, save_dir, mode):
    """Convert every record of one split into a single .npy file plus a CSV index.

    For each sub-directory of ``<data_dir>/<mode>`` the false-color image is
    computed and saved as float16 to ``<save_dir>/<record_id>.npy``. For the
    "train" and "validation" splits ``human_pixel_masks`` is appended as extra
    channel(s). The index ``<save_dir>/<mode>_df.csv`` has the columns
    ``record_id`` and ``path``.

    Args:
        data_dir: Directory containing the ``train`` / ``validation`` / ``test`` folders.
        save_dir: Output directory (created if missing); a trailing slash is optional.
        mode: ``"train"``, ``"validation"`` or ``"test"``.

    Raises:
        ValueError: if ``mode`` is not supported (checked before any file is touched).
    """
    if mode not in ("train", "validation", "test"):
        raise ValueError(
            f"Unsupported mode {mode!r}; expected 'train', 'validation' or 'test'."
        )

    input_dir = f"{data_dir}/{mode}"
    ids = os.listdir(input_dir)
    os.makedirs(save_dir, exist_ok=True)

    # Build the index with the same path construction that is used for saving,
    # so the CSV stays correct whether or not save_dir ends with a slash.
    df = pd.DataFrame(ids, columns=['record_id'])
    df['path'] = [os.path.join(save_dir, f"{record_id}.npy") for record_id in ids]
    df.to_csv(os.path.join(save_dir, f"{mode}_df.csv"), index=False)

    for record_id in tqdm(ids):
        data = read_record(str(record_id), input_dir, mode)
        images = get_false_color(data)
        if mode == "test":
            array = np.dstack([images])
        else:
            array = np.dstack([images, data['human_pixel_masks']])
        # float16 halves the on-disk size of the preprocessed arrays.
        array = array.astype(np.float16)

        np.save(os.path.join(save_dir, f"{record_id}.npy"), array)


# Competition data root, then preprocess the "test" split into CFG.dataset_path.
data_dir = f'{CFG.comp_dir_path}/{CFG.comp_folder_name}'
create_dataset(data_dir, CFG.dataset_path, "test")

100%|██████████| 2/2 [00:00<00:00, 27.16it/s]


# Dataset

In [46]:
class fastnumpyio:
    """Minimal .npy reader that parses the header at fixed byte offsets.

    The parser assumes the layout ``np.save`` produces for small headers:
    a 128-byte preamble whose dict starts with ``'descr'`` (a 3-character dtype
    string such as ``<f2``), followed by ``'fortran_order': False`` and
    ``'shape'``. Fortran-ordered arrays and headers longer than 128 bytes are
    not supported.
    """

    @staticmethod
    def load(file):
        """Read the array stored in the .npy file at path ``file``.

        Returns:
            An ``np.ndarray`` backed by the bytes read from the file (read-only).
        """
        # The context manager closes the handle even if parsing fails.
        with open(file, "rb") as handle:
            header = handle.read(128)
            descr = str(header[19:25], 'utf-8').replace("'", "").replace(" ", "")
            shape_text = (
                str(header[60:120], 'utf-8')
                .replace(', }', '')
                .replace('(', '')
                .replace(')', '')
            )
            # Skip empty fragments so 1-D shapes such as "(5,)" parse correctly.
            shape = tuple(int(num) for num in shape_text.split(',') if num.strip())
            datasize = np.lib.format.descr_to_dtype(descr).itemsize
            for dimension in shape:
                datasize *= dimension
            payload = handle.read(datasize)
        return np.ndarray(shape, dtype=descr, buffer=payload)


class ContrailsDataset(Dataset):
    """Dataset over preprocessed .npy files listed in a DataFrame.

    Supported modes:
        ``'valid'``: rows need ``image_path`` and ``label_path``; items are
            ``(image, label)`` float tensors.
        ``'test'``: rows need ``path`` and ``record_id``; items are
            ``(image, record_id)``.

    Args:
        df: DataFrame with one row per sample.
        transform: List of albumentations transforms, wrapped in ``A.Compose``.
        mode: ``'valid'`` or ``'test'``. Any other value (including the default
            ``'train'``) makes ``__getitem__`` raise ``ValueError``.
    """

    def __init__(self, df, transform, mode='train'):
        self.df = df
        self.transform = A.Compose(transform)
        self.mode = mode

    def __getitem__(self, index):
        row = self.df.iloc[index]

        if self.mode == 'valid':
            image = fastnumpyio.load(str(row["image_path"])).astype("float32")
            label = fastnumpyio.load(str(row["label_path"])).astype("float32")
            data = self.transform(image=image, mask=label)
            # as_tensor avoids the copy (and warning) that torch.tensor() causes
            # when the transform already returned a tensor.
            image = torch.as_tensor(data['image'])
            label = data['mask']
            return image.float(), label.float()

        if self.mode == 'test':
            image = fastnumpyio.load(str(row.path))
            data = self.transform(image=image)
            image = torch.as_tensor(data['image'])
            return image.float(), row.record_id

        # Fail loudly instead of returning None, which only surfaces later as an
        # obscure collate error inside the DataLoader.
        raise ValueError(
            f"Unsupported mode {self.mode!r}; expected 'valid' or 'test'."
        )

    def __len__(self):
        return len(self.df)

In [47]:
# Load the index CSV written by create_dataset for the test split and display it.
test_df = pd.read_csv(f"{CFG.dataset_path}/test_df.csv")
test_df


Unnamed: 0,record_id,path
0,1000834164244036115,/kaggle/working/dataset_test/ash_color/1000834...
1,1002653297254493116,/kaggle/working/dataset_test/ash_color/1002653...


In [48]:
# Test dataset/loader; the loader uses the default (non-shuffled) order.
dataset_test = ContrailsDataset(test_df, CFG.valid_aug_list, mode='test')
dataloader_test = DataLoader(dataset_test, batch_size=CFG.valid_batch_size, num_workers=CFG.num_workers)

# Sanity check: dataset size plus shape and dtype of the first image.
print(f"""
{len(dataset_test) = }
test_image_shape : {dataset_test[0][0].shape}
test_image_dtype : {dataset_test[0][0].dtype}
""")




len(dataset_test) = 2
test_image_shape : torch.Size([3, 256, 256])
test_image_dtype : torch.float32



# Model

In [49]:
class CustomModel(nn.Module):
    """Thin ``nn.Module`` wrapper around a segmentation_models_pytorch network.

    Args:
        model_arch: Architecture name passed to ``smp.create_model``.
        backbone: Encoder name.
        in_chans: Number of input channels.
        target_size: Accepted but not used; the number of output classes is fixed at 4.
        weight: Encoder weights identifier (``None`` for no pretrained weights).
    """

    def __init__(self, model_arch, backbone, in_chans, target_size, weight):
        super().__init__()

        smp_options = dict(
            encoder_name=backbone,
            encoder_weights=weight,
            in_channels=in_chans,
            classes=4,
            activation=None,
        )
        self.model = smp.create_model(model_arch, **smp_options)

    def forward(self, image):
        """Return the wrapped network's raw output (no activation is applied)."""
        return self.model(image)


def build_model(model_arch, backbone, in_chans, target_size, weight="imagenet", dataparallel=True):
    """Create a ``CustomModel`` and optionally wrap it in ``nn.DataParallel``.

    Args:
        model_arch, backbone, in_chans, target_size, weight: Forwarded to ``CustomModel``.
        dataparallel: When True the model is wrapped in ``nn.DataParallel`` over
            all visible CUDA devices, so its state-dict keys carry the
            ``module.`` prefix.

    Returns:
        The (possibly wrapped) model. It is not moved to a device here; callers
        are expected to call ``.to(device)`` themselves.
    """
    model = CustomModel(model_arch, backbone, in_chans, target_size, weight)
    if dataparallel:
        device_ids = list(range(torch.cuda.device_count()))
        model = nn.DataParallel(model, device_ids=device_ids)
    return model

# Decide threshold

In [50]:
class Dice(nn.Module):
    """Global Dice coefficient computed over all elements of the inputs.

    Args:
        use_sigmoid: When True, a sigmoid is applied to ``inputs`` first.
    """

    def __init__(self, use_sigmoid=True):
        super().__init__()
        self.sigmoid = nn.Sigmoid()
        self.use_sigmoid = use_sigmoid

    def forward(self, inputs, targets, smooth=1):
        """Return ``(2 * sum(inputs * targets) + smooth) / (sum(inputs) + sum(targets) + smooth)``.

        Both tensors are flattened first, so any matching number of elements works.
        """
        if self.use_sigmoid:
            inputs = self.sigmoid(inputs)

        # reshape (unlike view) also accepts non-contiguous tensors such as
        # transposed or sliced inputs.
        inputs = inputs.reshape(-1)
        targets = targets.reshape(-1)

        intersection = (inputs * targets).sum()
        dice = (2.0 * intersection + smooth) / (inputs.sum() + targets.sum() + smooth)

        return dice


def calc_dice_score(pred, true, thresh: float) -> float:
    """Binarize ``pred`` at ``thresh`` and return its Dice score against ``true``.

    Args:
        pred: NumPy array of scores; values strictly greater than ``thresh`` become 1.
        true: Tensor of ground-truth values with the same number of elements.
        thresh: Binarization cutoff.
    """
    binary_pred = torch.from_numpy(np.where(pred > thresh, 1, 0))
    binary_pred = torch.flatten(binary_pred)
    metric = Dice(use_sigmoid=False)
    score = metric(true, binary_pred)
    return score.item()


def calc_optim_thresh(pred, true, threshs_to_test):
    """Return the best Dice score and the threshold that produced it.

    Args:
        pred: NumPy array of scores.
        true: Tensor of ground-truth values.
        threshs_to_test: Iterable of candidate thresholds, tried in order; on
            ties the earliest candidate wins.

    Returns:
        ``(best_dice, best_thresh)``. When no candidate is given, or none beats
        the initial score of -1, the result is ``(-1, None)`` rather than an
        UnboundLocalError.
    """
    best_dice = -1
    best_thresh = None
    for thresh in threshs_to_test:
        dice = calc_dice_score(pred, true, thresh)
        if dice > best_dice:
            best_dice = dice
            best_thresh = thresh
    return best_dice, best_thresh

In [51]:
# Local runs only: build the validation loader used by the threshold search.
if not is_kaggle_notebook:
    valid_df = pd.read_csv(f"/kaggle/working/dataset_train/pseud_ashcolor_4label/validation_df.csv")
    # Drop rows with any missing value before building the dataset.
    valid_df = valid_df.dropna()
    dataset_valid = ContrailsDataset(valid_df, CFG.valid_aug_list, "valid")
    dataloader_valid = DataLoader(dataset_valid, batch_size=CFG.valid_batch_size, num_workers = CFG.num_workers)
    
    # Sanity check: dataset size plus shape and dtype of the first image and mask.
    print(f"""
    {len(dataset_valid) = }
    valid_image_shape : {dataset_valid[0][0].shape}
    valid_mask_shape  : {dataset_valid[0][1].shape}
    valid_image_dtype : {dataset_valid[0][0].dtype}
    valid_mask_dtype : {dataset_valid[0][1].dtype}
    """)


    len(dataset_valid) = 1856
    valid_image_shape : torch.Size([3, 256, 256])
    valid_mask_shape  : torch.Size([1, 256, 256])
    valid_image_dtype : torch.float32
    valid_mask_dtype : torch.float32
    


In [52]:
# Local runs only: average the sigmoid output of channel 2 over all checkpoints
# on the validation set, then pick the threshold with the best global Dice.
# NOTE(review): the selected `thresh` is only printed; CFG.thresh is not updated.
if not is_kaggle_notebook:
    thresholds_to_test = [round(x * 0.01, 2) for x in range(1, 101, 2)]

    # Running sum of per-model predictions, so only one model's flattened output
    # is held in memory at a time.
    pred_sum = None

    for pth_path in CFG.pth_paths:
        cum_true = []
        cum_pred = []
        # map_location lets checkpoints saved on another device load here.
        pth = torch.load(pth_path, map_location=device)
        try:
            model = build_model(pth["model_arch"], pth["backbone"], pth["in_chans"], pth["target_size"], weight=None)
            model.load_state_dict(pth['model'])
        except RuntimeError:
            # State dict does not match the DataParallel-wrapped model; retry unwrapped.
            model = build_model(pth["model_arch"], pth["backbone"], pth["in_chans"], pth["target_size"], weight=None, dataparallel=False)
            model.load_state_dict(pth['model'])
        model.to(device)
        model.eval()

        for images, masks in tqdm(dataloader_valid):
            # Use the shared `device` so the cell also runs without CUDA.
            images = images.to(device)
            with torch.no_grad():
                preds = model(images)[:, 2]
                preds = torch.sigmoid(preds)
            cum_pred.append(preds.cpu().numpy())
            cum_true.append(masks.numpy())

        cum_pred = (np.concatenate(cum_pred, axis=0)).flatten()
        if pred_sum is None:
            pred_sum = cum_pred
        else:
            pred_sum += cum_pred

    cum_cum_pred = pred_sum / len(CFG.pth_paths)
    # Ground truth is the same for every model; the last pass's copy is used.
    cum_true = torch.flatten(torch.from_numpy(np.concatenate(cum_true, axis=0)))
    dice_score_, thresh = calc_optim_thresh(cum_cum_pred, cum_true, thresholds_to_test)
    print(f"score : {dice_score_:.4f}\tthresh : {thresh}\n")

100%|██████████| 58/58 [00:06<00:00,  8.48it/s]
100%|██████████| 58/58 [00:06<00:00,  8.47it/s]
100%|██████████| 58/58 [00:07<00:00,  8.16it/s]
100%|██████████| 58/58 [00:07<00:00,  8.12it/s]
100%|██████████| 58/58 [00:07<00:00,  8.02it/s]
100%|██████████| 58/58 [00:07<00:00,  8.16it/s]
100%|██████████| 58/58 [00:07<00:00,  8.15it/s]
100%|██████████| 58/58 [00:07<00:00,  8.21it/s]
100%|██████████| 58/58 [00:10<00:00,  5.31it/s]
100%|██████████| 58/58 [00:11<00:00,  5.27it/s]
100%|██████████| 58/58 [00:10<00:00,  5.30it/s]
100%|██████████| 58/58 [00:09<00:00,  6.14it/s]
100%|██████████| 58/58 [00:09<00:00,  6.18it/s]
100%|██████████| 58/58 [00:09<00:00,  6.17it/s]


score : 0.6880	thresh : 0.43



# Inference

In [33]:
def list_to_string(x):
    """Join the items of ``x`` with single spaces; return ``'-'`` when ``x`` is empty.

    Each item is converted with ``str`` individually rather than through the
    list's repr, so NumPy integer scalars render as plain digits regardless of
    the NumPy version's scalar repr.
    """
    if not x:
        return '-'
    return ' '.join(str(item) for item in x)


def rle_encode(x, fg_val=1):
    """Run-length encode the positions of ``x`` that equal ``fg_val``.

    The array is traversed in column-major order (down, then right).

    Args:
        x: 2-D array (mask).
        fg_val: Value treated as foreground.

    Returns:
        Flat list of Python ints ``[start_1, length_1, start_2, length_2, ...]``
        with 1-based start positions; an empty list when nothing matches.
    """
    dots = np.where(
        x.T.flatten() == fg_val)[0]  # .T sets Fortran order down-then-right
    run_lengths = []
    prev = -2
    # tolist() yields plain Python ints, so the result contains no NumPy scalars.
    for b in dots.tolist():
        if b > prev + 1:
            # Start a new run: 1-based start, length filled in just below.
            run_lengths.extend((b + 1, 0))
        run_lengths[-1] += 1
        prev = b
    return run_lengths

In [35]:
# Run every checkpoint over the test loader and store its probability map for
# each record in a per-model column ("pred_<i>") of the submission frame.
submission = pd.read_csv(f'{CFG.comp_dir_path}/{CFG.comp_folder_name}/sample_submission.csv', index_col='record_id')

for i_pth, pth_path in enumerate(CFG.pth_paths):
    # map_location lets checkpoints saved on another device load here.
    pth = torch.load(pth_path, map_location=device)
    try:
        model = build_model(pth["model_arch"], pth["backbone"], pth["in_chans"], pth["target_size"], weight=None)
        model.load_state_dict(pth['model'])
    except RuntimeError:
        # State dict does not match the DataParallel-wrapped model; retry unwrapped.
        model = build_model(pth["model_arch"], pth["backbone"], pth["in_chans"], pth["target_size"], weight=None, dataparallel=False)
        model.load_state_dict(pth['model'])
    model.to(device)
    model.eval()

    pred_column = f'pred_{str(i_pth)}'
    for images, record_ids in tqdm(dataloader_test):
        # Use the shared `device` so the cell also runs without CUDA.
        images = images.to(device)
        n_batch = images.shape[0]

        with torch.no_grad():
            # Channel 2 of the 4-channel output is the one that gets thresholded.
            preds = model(images)[:, 2]
            preds = torch.unsqueeze(preds, 1)
        preds = torch.sigmoid(preds).cpu().detach().numpy()

        for num in range(n_batch):
            pred = preds[num, 0, :, :]
            record_id = int(record_ids[num])
            # The string placeholder is written first and then replaced by the
            # array wrapped in a list; the next cell reads it back with `[0]`.
            # NOTE(review): presumably the placeholder forces an object-dtype
            # column — confirm before changing this storage scheme.
            submission.loc[record_id, pred_column] = "tmp"
            submission.loc[record_id, pred_column] = [pred]
            

100%|██████████| 1/1 [00:00<00:00,  1.83it/s]
100%|██████████| 1/1 [00:00<00:00,  2.19it/s]
100%|██████████| 1/1 [00:00<00:00,  2.12it/s]
100%|██████████| 1/1 [00:00<00:00,  2.16it/s]
100%|██████████| 1/1 [00:00<00:00,  2.06it/s]
100%|██████████| 1/1 [00:00<00:00,  2.07it/s]
100%|██████████| 1/1 [00:00<00:00,  2.22it/s]
100%|██████████| 1/1 [00:00<00:00,  2.18it/s]


In [36]:
# Average the per-model probability maps of each record, threshold the mean,
# and write the run-length encoding into the first column of the submission.
for i, row in enumerate(submission.itertuples()):
    pred = np.zeros((256, 256))
    for i_pth in range(len(CFG.pth_paths)):
        column_name = f'pred_{str(i_pth)}'
        # itertuples puts the index at position 0, so with one original data
        # column the pred_<i> value sits at position i + 2.
        # NOTE(review): assumes sample_submission.csv has exactly one column besides record_id.
        i_pred = row[i_pth+2][0]
        pred += i_pred
    pred = pred/len(CFG.pth_paths)
    # NOTE(review): CFG.thresh is 5e-8 while the validation search printed 0.43 — confirm.
    pred_thresh = np.where(pred > CFG.thresh, 1, 0)

    submission.iloc[i, 0] = list_to_string(rle_encode(pred_thresh))
# Keep only the first column; the pred_<i> helper columns are dropped.
cols_to_drop = submission.columns[1:]
submission = submission.drop(columns=cols_to_drop)
# On Kaggle: delete the preprocessed test data and write the submission file.
# submission.csv is not written on local runs.
if is_kaggle_notebook:
    shutil.rmtree(CFG.dataset_path)
    submission.to_csv('submission.csv')