In [1]:
import sys
is_kaggle_notebook = "kaggle_web_client" in sys.modules
if is_kaggle_notebook:
    %pip uninstall timm -y
    
sys.path.append('/kaggle/input/pretrainedmodels/pretrainedmodels-0.7.4')
sys.path.append('/kaggle/input/efficientnet-pytorch/EfficientNet-PyTorch-master')
sys.path.append('/kaggle/input/timm-pytorch-image-models/pytorch-image-models-master')
sys.path.append('/kaggle/input/segmentation-models-pytorch/segmentation_models.pytorch-master')


In [6]:
import os
import warnings
from glob import glob
from pathlib import Path
import shutil

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from tqdm import tqdm
import matplotlib.pyplot as plt
import albumentations as A
from albumentations.pytorch import ToTensorV2
from torch.utils.data import DataLoader, Dataset
import segmentation_models_pytorch as smp

# Silence all warnings for the rest of the notebook.
warnings.simplefilter("ignore")

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [3]:
class CFG:
    """Notebook-wide configuration constants for the inference run.

    The class is used as a plain namespace: attributes are read directly
    (``CFG.dataset_path``, ``CFG.exp_name``, ...) and it is never instantiated
    in the visible cells.
    """
    # Not read anywhere in the visible cells.
    debug = False
    # ============== comp exp name =============
    # Competition identifiers; not read anywhere in the visible cells.
    comp_name = 'contrail'
    comp_dir_path = '/kaggle/input/'
    comp_folder_name = 'google-research-identify-contrails-reduce-global-warming'

    # Directory holding the preprocessed test arrays and test_df.csv; it is
    # read when loading test_df and removed at the end of the notebook.
    dataset_path = "/kaggle/working/dataset_test/ash_color/"

    # Names both the checkpoint folder and the checkpoint file
    # (<exp_name>/<exp_name>.pth).
    exp_name = "model01"

    # ============== pred target =============
    # NOTE(review): presumably a test-time-augmentation switch; it is not read
    # anywhere in the visible cells - confirm before relying on it.
    TTA = is_kaggle_notebook

    # ============== training cfg =============
    # Batch size of the test DataLoader.
    valid_batch_size = 32

    # ============== fixed =============
    # Worker processes of the test DataLoader.
    num_workers = 4
    # Not read anywhere in the visible cells.
    seed = 42

    # ============== augmentation =============

    # Transform list handed to ContrailsDataset, which wraps it in A.Compose.
    valid_aug_list = [
        ToTensorV2(transpose_mask=True),
    ]

# Dataset Preprocessing

In [4]:
def read_record(record_id, directory, mode):
    """Load the ``.npy`` arrays that belong to one record.

    Args:
        record_id: Name of the record's sub-directory inside ``directory``.
        directory: Directory that contains one sub-directory per record.
        mode: ``"train"`` or ``"validation"`` (bands plus
            ``human_pixel_masks``) or ``"test"`` (bands only).

    Returns:
        dict mapping each loaded file stem (``"band_11"``, ``"band_14"``,
        ``"band_15"`` and, outside test mode, ``"human_pixel_masks"``) to the
        array returned by ``np.load``.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    bands = ["band_11", "band_14", "band_15"]
    names_by_mode = {
        "train": bands + ["human_pixel_masks"],
        "validation": bands + ["human_pixel_masks"],
        "test": bands,
    }
    # Fail with a clear message instead of an UnboundLocalError further down.
    if mode not in names_by_mode:
        raise ValueError(
            f"mode must be one of {sorted(names_by_mode)}, got {mode!r}"
        )

    return {
        name: np.load(os.path.join(directory, record_id, name + ".npy"))
        for name in names_by_mode[mode]
    }


def normalize_range(data, bounds):
    """Linearly rescale ``data`` so that ``bounds[0]`` maps to 0 and ``bounds[1]`` to 1.

    Values outside ``bounds`` are not clipped here.
    """
    lower = bounds[0]
    span = bounds[1] - lower
    return (data - lower) / span


def get_false_color(record_data):
    """Build a three-channel false-color image from the infrared bands.

    The channels are, in order: ``band_15 - band_14``, ``band_14 - band_11``
    and ``band_14``, each rescaled with ``normalize_range`` and clipped to
    ``[0, 1]``. Only index 4 of the last axis is returned.

    Args:
        record_data: Mapping with ``"band_11"``, ``"band_14"`` and
            ``"band_15"`` arrays.
            # NOTE(review): presumably (H, W, T) with time last - confirm.

    Returns:
        The clipped false-color array at index 4 of the last axis.
    """
    t11_bounds = (243, 303)
    cloud_top_tdiff_bounds = (-4, 5)
    tdiff_bounds = (-4, 2)
    frame_index = 4  # called N_TIMES_BEFORE in the reference code

    band_15 = record_data["band_15"]
    band_14 = record_data["band_14"]
    band_11 = record_data["band_11"]

    channels = [
        normalize_range(band_15 - band_14, tdiff_bounds),
        normalize_range(band_14 - band_11, cloud_top_tdiff_bounds),
        normalize_range(band_14, t11_bounds),
    ]
    # Stacking on axis 2 puts the channel axis before the last input axis.
    stacked = np.stack(channels, axis=2)
    return np.clip(stacked, 0, 1)[..., frame_index]

In [5]:
def create_dataset(data_dir, save_dir, mode):
    """Convert every record of one split into a float16 ``.npy`` file.

    For each sub-directory of ``<data_dir>/<mode>`` the false-color image is
    computed and, outside test mode, ``human_pixel_masks`` is appended along
    the last axis. An index file ``<mode>_df.csv`` with the columns
    ``record_id`` and ``path`` is written to ``save_dir``.

    Args:
        data_dir: Root directory that contains the ``<mode>`` folder.
        save_dir: Output directory (created if missing). A trailing slash is
            optional.
        mode: ``"train"``, ``"validation"`` or ``"test"``.

    Raises:
        ValueError: If ``mode`` is not one of the supported values.
    """
    if mode not in ("train", "validation", "test"):
        raise ValueError(
            f"mode must be 'train', 'validation' or 'test', got {mode!r}"
        )

    input_dir = os.path.join(data_dir, mode)
    # os.listdir order is arbitrary; sort so the index file is reproducible.
    ids = sorted(os.listdir(input_dir))
    os.makedirs(save_dir, exist_ok=True)

    # Build the paths with os.path.join so the CSV agrees with the files
    # written below whether or not save_dir ends with a separator.
    df = pd.DataFrame(ids, columns=['record_id'])
    df['path'] = [os.path.join(save_dir, f"{record_id}.npy") for record_id in ids]
    df.to_csv(os.path.join(save_dir, f"{mode}_df.csv"), index=False)

    for record_id in tqdm(ids):
        data = read_record(str(record_id), input_dir, mode)
        layers = [get_false_color(data)]
        if mode != "test":
            layers.append(data['human_pixel_masks'])
        array = np.dstack(layers).astype(np.float16)
        np.save(os.path.join(save_dir, f"{record_id}.npy"), array)

        
# Preprocess the competition's test split. Both locations are taken from CFG
# so they cannot drift from the cells that later read CFG.dataset_path, and
# the name `dataset_test` stays reserved for the Dataset object built later.
data_dir = os.path.join(CFG.comp_dir_path, CFG.comp_folder_name)
create_dataset(data_dir, CFG.dataset_path, "test")

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/google-research-identify-contrails-reduce-global-warming//test'

# Dataset

In [6]:
class fastnumpyio:
    """Minimal ``.npy`` reader that parses the header at fixed byte offsets."""

    @staticmethod
    def load(file):
        """Read one array from the ``.npy`` file at path ``file``.

        The header is not parsed generically: the dtype string is taken from
        bytes 19-24 and the shape from bytes 60-119 of a 128-byte header.
        # NOTE(review): this appears to match what np.save writes for a
        # C-ordered, multi-dimensional array with a three-character dtype code
        # (e.g. '<f2'); other layouts are not detected - confirm before
        # reusing on files from another source.

        Args:
            file: Path of the ``.npy`` file.

        Returns:
            ``np.ndarray`` built on top of the bytes read from the file.
        """
        # The context manager closes the handle even if header parsing fails.
        with open(file, "rb") as handle:
            header = handle.read(128)
            descr = str(header[19:25], 'utf-8').replace("'", "").replace(" ", "")
            shape_text = (
                str(header[60:120], 'utf-8')
                .replace(', }', '')
                .replace('(', '')
                .replace(')', '')
            )
            shape = tuple(int(num) for num in shape_text.split(','))
            # Payload size in bytes = item size * number of elements.
            datasize = np.lib.format.descr_to_dtype(descr).itemsize
            for dimension in shape:
                datasize *= dimension
            payload = handle.read(datasize)
        return np.ndarray(shape, dtype=descr, buffer=payload)


class ContrailsDataset(Dataset):
    """Dataset over preprocessed ``.npy`` files listed in a DataFrame.

    Args:
        df: DataFrame with a ``path`` column (file to load) and a
            ``record_id`` column.
        transform: List of albumentations transforms; wrapped in ``A.Compose``.
        mode: ``'train'`` yields ``(image, label)`` where the label is the
            last channel of the stored array; ``'test'`` yields
            ``(image, record_id)``.
    """

    def __init__(self, df, transform, mode='train'):
        self.df = df
        self.transform = A.Compose(transform)
        self.mode = mode

    def __getitem__(self, index):
        """Load, transform and return the sample at position ``index``.

        Raises:
            ValueError: If ``self.mode`` is neither ``'train'`` nor ``'test'``.
        """
        row = self.df.iloc[index]
        path = row.path
        record_id = row.record_id
        npy = fastnumpyio.load(str(path))

        if self.mode == 'train':
            # The last channel of the stored array is the mask.
            image = npy[..., :-1]
            label = npy[..., -1]
            data = self.transform(image=image, mask=label)
            image = data['image']
            label = data['mask']
            label = np.expand_dims(label, 0)
            image = torch.tensor(image)
            label = torch.tensor(label)
            return image.float(), label.float()

        if self.mode == 'test':
            image = npy
            data = self.transform(image=image)
            image = data['image']
            image = torch.tensor(image)
            return image.float(), record_id

        # Previously this fell through and returned None, which only surfaced
        # later as an opaque collate error inside the DataLoader.
        raise ValueError(
            f"Unsupported mode {self.mode!r}; expected 'train' or 'test'"
        )

    def __len__(self):
        return len(self.df)

In [7]:
# Index of the preprocessed test records (one row per record).
test_df = pd.read_csv(os.path.join(CFG.dataset_path, "test_df.csv"))
# Show only the first rows; the full test set can be large.
test_df.head()

Unnamed: 0,record_id,path
0,1000834164244036115,/kaggle/working/dataset_test/ash_color/1000834...
1,1002653297254493116,/kaggle/working/dataset_test/ash_color/1002653...


In [8]:
# Wrap the test index in a Dataset and batch it for inference.
dataset_test = ContrailsDataset(test_df, CFG.valid_aug_list, mode='test')
dataloader_test = DataLoader(
    dataset_test,
    batch_size=CFG.valid_batch_size,
    num_workers=CFG.num_workers,
)

# Sanity check: dataset size plus shape and dtype of the first image.
print(
    "",
    f"{len(dataset_test) = }",
    f"test_image_shape : {dataset_test[0][0].shape}",
    f"test_image_dtype : {dataset_test[0][0].dtype}",
    "",
    sep="\n",
)




len(dataset_test) = 2
test_image_shape : torch.Size([3, 256, 256])
test_image_dtype : torch.float32



# Model

In [9]:
class CustomModel(nn.Module):
    """``nn.Module`` wrapper around a model built by ``smp.create_model``."""

    def __init__(self, model_arch, backbone, in_chans, target_size, weight):
        """Create the wrapped network.

        Args:
            model_arch: Architecture name passed to ``smp.create_model``.
            backbone: Passed as ``encoder_name``.
            in_chans: Passed as ``in_channels``.
            target_size: Passed as ``classes``.
            weight: Passed as ``encoder_weights``.
        """
        super().__init__()

        # activation=None: the inference loop applies the sigmoid itself.
        smp_kwargs = dict(
            encoder_name=backbone,
            encoder_weights=weight,
            in_channels=in_chans,
            classes=target_size,
            activation=None,
        )
        self.model = smp.create_model(model_arch, **smp_kwargs)

    def forward(self, image):
        """Return the wrapped model's output for ``image``."""
        return self.model(image)


def build_model(model_arch, backbone, in_chans, target_size, weight="imagenet"):
    """Print the chosen architecture and backbone, then return a ``CustomModel``.

    All arguments are forwarded unchanged to ``CustomModel``.
    """
    for label, value in (('model_arch: ', model_arch), ('backbone: ', backbone)):
        print(label, value)
    return CustomModel(model_arch, backbone, in_chans, target_size, weight)

In [10]:
# Prefer a checkpoint in the local experiment folder; otherwise fall back to
# the Kaggle input dataset of the same name.
if os.path.exists(f'./{CFG.exp_name}/{CFG.exp_name}.pth'):
    pth_path = f'./{CFG.exp_name}/{CFG.exp_name}.pth'
else:
    pth_path = f"/kaggle/input/{CFG.exp_name}/{CFG.exp_name}.pth"
# map_location keeps a checkpoint that was saved on a GPU loadable when
# `device` is the CPU.
# NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
pth = torch.load(pth_path, map_location=device)


# The checkpoint carries its own architecture description and the decision
# threshold alongside the weights; weight=None skips downloading encoder weights.
model = build_model(pth["model_arch"], pth["backbone"], pth["in_chans"], pth["target_size"], weight=None)
model.load_state_dict(pth['model'])
thresh = pth['thresh']

model = nn.DataParallel(model, device_ids=list(range(torch.cuda.device_count())))
model.to(device)
model.eval();

model_arch:  Unet
backbone:  efficientnet-b0


# Inference

In [11]:
def list_to_string(x):
    """Format a run-length list as a space-separated string.

    Args:
        x: Sequence of numbers, e.g. the output of ``rle_encode``.

    Returns:
        The elements joined by single spaces, or ``'-'`` when ``x`` is empty.
    """
    if not x:
        return '-'
    # Join str() of each element rather than post-processing str(list):
    # str(list) uses repr() of the elements, and NumPy 2 scalars repr as
    # e.g. "np.int64(5)", which would corrupt the encoded string.
    return ' '.join(str(value) for value in x)


def rle_encode(x, fg_val=1):
    """Run-length encode the pixels of ``x`` that equal ``fg_val``.

    Pixels are numbered from 1 in column-major order (down, then right).

    Args:
        x: 2-D array-like mask supporting ``.T``.
        fg_val: Value that marks a foreground pixel.

    Returns:
        Flat list of Python ints ``[start, length, start, length, ...]``;
        empty when no pixel matches.
    """
    # .T followed by flatten walks the mask column by column.
    dots = np.where(x.T.flatten() == fg_val)[0]
    run_lengths = []
    prev = -2
    # tolist() yields plain Python ints, so the result never contains NumPy
    # scalars (their repr differs between NumPy versions).
    for b in dots.tolist():
        if b > prev + 1:
            # Gap since the previous foreground pixel: open a new run.
            run_lengths.extend((b + 1, 0))
        run_lengths[-1] += 1
        prev = b
    return run_lengths

In [12]:
# Start from the sample submission so the index holds the expected record ids.
submission = pd.read_csv('/kaggle/input/google-research-identify-contrails-reduce-global-warming/sample_submission.csv', index_col='record_id')

for images, record_ids in tqdm(dataloader_test, total=len(dataloader_test)):
    # Use the notebook-wide `device` so the loop also runs without a GPU.
    images = images.to(device)
    with torch.no_grad():
        preds = model(images)
    preds = torch.sigmoid(preds).cpu().detach().numpy()
    # Binarise the probabilities with the threshold stored in the checkpoint.
    preds_thresh = np.where(preds > thresh, 1, 0)

    for num in range(images.shape[0]):
        pred = preds_thresh[num, 0, :, :]
        record_id = int(record_ids[num])
        submission.loc[record_id, 'encoded_pixels'] = list_to_string(rle_encode(pred))
submission

Unnamed: 0_level_0,encoded_pixels
record_id,Unnamed: 1_level_1
1000834164244036115,40193 3 40449 4 40707 4 40964 4 41221 5 41479 ...
1002653297254493116,-


In [13]:
if is_kaggle_notebook:
    # Write the submission first so a failure while cleaning up the
    # intermediate files cannot prevent it from being saved.
    submission.to_csv('submission.csv')
    # Remove the preprocessed test arrays from the working directory.
    shutil.rmtree(CFG.dataset_path)