In [1]:
import os
import sys
import glob

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# loader.py
from typing import List, Tuple

import h5py
import numpy as np


def loader(
    hdf5_file: str, folds: List[int]
) -> Tuple[np.ndarray, np.ndarray, np.ndarray, List[str]]:
    """Load the samples of the selected folds that have a pre-fire image.

    Every top-level group of ``hdf5_file`` is visited. A group is kept only
    when its ``fold`` attribute is contained in ``folds`` and it holds a
    ``pre_fire`` dataset; all other groups are skipped.

    Args:
        hdf5_file: Path of the HDF5 file to read.
        folds: Fold ids to keep.

    Returns:
        ``(post, pre, masks, names)``. The first three are int32 arrays built
        by stacking the kept ``post_fire``, ``pre_fire`` and ``mask`` datasets
        along a new leading axis; ``names`` lists the kept group names in the
        same order.

    Raises:
        ValueError: If no group matches the selection, or if the kept datasets
            of one kind do not all share the same shape.
    """
    post = []
    pre = []
    masks = []
    names = []

    # Read hdf5 file and filter by fold
    with h5py.File(hdf5_file, "r") as f:
        for uuid, values in f.items():
            if values.attrs["fold"] not in folds:
                continue
            if "pre_fire" not in values:
                continue

            post.append(values["post_fire"][...])
            pre.append(values["pre_fire"][...])
            masks.append(values["mask"][...])
            names.append(uuid)

    # np.stack cannot stack an empty list; fail with a message that says why.
    if not names:
        raise ValueError(
            f"no group with a 'pre_fire' dataset found for folds {folds!r} "
            f"in {hdf5_file!r}"
        )

    # Stack first, then cast: this avoids np.stack's `dtype=` keyword, which
    # only exists in NumPy >= 1.24. "same_kind" is the casting rule np.stack
    # itself applies by default when `dtype=` is given.
    post_arr, pre_arr, mask_arr = (
        np.stack(items, axis=0).astype(np.int32, casting="same_kind", copy=False)
        for items in (post, pre, masks)
    )

    return post_arr, pre_arr, mask_arr, names

In [3]:
# Data locations, relative to the notebook's working directory.
# Both paths are built with os.path.join so the separator is never hard-coded.
PATH_ROOT = os.path.join("..", "data")
PATH_DATASET = os.path.join(PATH_ROOT, "train_eval.hdf5")

In [4]:
# Read every group of the dataset into parallel lists: entry i of each list
# describes the same group.
names = []
pres = []
posts = []
masks = []
folds = []
comments = []
defects = []

with h5py.File(PATH_DATASET, "r") as fp:
    for uuid, values in fp.items():
        # 'post_fire' and 'mask' are read unconditionally. A group lacking
        # either one raises KeyError here instead of silently leaving the
        # image lists shorter than (and misaligned with) `names`.
        post_img = values["post_fire"][...]
        mask_img = values["mask"][...]

        # A group without 'pre_fire' is flagged as a defect and gets an
        # all-zero placeholder with the post-fire image's shape and dtype, so
        # that `pres` can still be stacked.
        defect = "pre_fire" not in values
        if defect:
            pre_img = np.zeros(post_img.shape, dtype=post_img.dtype)
        else:
            pre_img = values["pre_fire"][...]

        names.append(uuid)
        posts.append(post_img)
        pres.append(pre_img)
        masks.append(mask_img)
        folds.append(values.attrs["fold"])
        comments.append(values.attrs["comments"])
        defects.append(defect)

# One array per image kind, with the group index as the leading axis.
pres = np.stack(pres)
posts = np.stack(posts)
masks = np.stack(masks)

In [5]:
# Sanity check: list lengths and stacked array shapes on one line.
# NOTE(review): len(folds) is reported twice (second and last value) — confirm
# whether the last one was meant to be a different list.
print(
    " ".join(
        str(item)
        for item in (
            len(names),
            len(folds),
            pres.shape,
            posts.shape,
            masks.shape,
            len(folds),
        )
    )
)

534 534 (534, 512, 512, 12) (534, 512, 512, 12) (534, 512, 512, 1) 534


In [6]:
# One row per dataset group; columns come from the parallel lists built above.
train_eval_df = pd.DataFrame(
    dict(
        name=names,
        fold=folds,
        comment=comments,
        defect=defects,
    )
)
train_eval_df.shape

(534, 4)

In [7]:
# Show the table ordered by fold, then defect flag, then name (all ascending).
train_eval_df.sort_values(by=["fold", "defect", "name"], ascending=True)

Unnamed: 0,name,fold,comment,defect
29,0ff249de-784d-4760-a039-ed9f92eca60c_0,0,"[5, 11]",False
30,0ff249de-784d-4760-a039-ed9f92eca60c_1,0,"[5, 11]",False
31,0ff249de-784d-4760-a039-ed9f92eca60c_2,0,"[5, 11]",False
34,1087ccdb-44af-4043-845c-d79e00fef8bc_0,0,"[2, 11]",False
75,270908a1-a110-4167-896e-bc0a847b3d31_0,0,"[2, 11]",False
...,...,...,...,...
347,a3e72d63-ec0c-4c85-a611-74a2567d551f_0,4,[-1],True
413,bfdb5b07-95e9-480f-bab8-491778cc7430_0,4,[11],True
474,e37635d0-82e8-4003-a594-8010c9cfe3b8_0,4,"[2, 11]",True
478,e4d4dbcb-dd92-40cf-a7fe-fda8dd35f367_0,4,[11],True


# sample_submission.csv

In [8]:
# Locate the file through PATH_ROOT so that all data files are found via the
# same setting as the training dataset, instead of a second hard-coded path.
sample_submission_df = pd.read_csv(os.path.join(PATH_ROOT, "sample_submission.csv"))
print(sample_submission_df.shape)

(449, 3)


In [9]:
# Preview the first five rows of the sample submission.
display(sample_submission_df.head(n=5))

Unnamed: 0,id,rle_mask,index
0,0ff249de-784d-4760-a039-ed9f92eca60c_0,912,0
1,0ff249de-784d-4760-a039-ed9f92eca60c_0,676,1
2,0ff249de-784d-4760-a039-ed9f92eca60c_0,363,2
3,0ff249de-784d-4760-a039-ed9f92eca60c_0,941,3
4,0ff249de-784d-4760-a039-ed9f92eca60c_0,259252,4


In [10]:
# Distinct ids occurring in the sample submission, and how many there are.
sample_submission_unique_id = set(sample_submission_df["id"].tolist())
print(len(sample_submission_unique_id))

78


In [11]:
# Names of the fold-0 rows whose `defect` flag is False.
fold_0_has_pre_unique_id = set(
    train_eval_df.loc[
        (train_eval_df["fold"] == 0) & ~train_eval_df["defect"],
        "name",
    ]
)

In [12]:
# True when both id sets contain exactly the same elements
# (i.e. their symmetric difference is empty).
not (sample_submission_unique_id ^ fold_0_has_pre_unique_id)

True

# predictions.csv

In [13]:
# Run create_sample_submission.py from inside ../data, then list
# ../data/predictions.csv.
# The `cd` and the `python` call share one `!` line (joined by `;`), so the
# script is launched after changing into ../data.
# NOTE(review): predictions.csv is presumably written by that script — confirm.
!cd ../data; python ./create_sample_submission.py
!ls -l ../data/predictions.csv

-rw-r--r--  1 yokoya  staff  1828187  5 13 11:16 ../data/predictions.csv


In [14]:
# Locate the file through PATH_ROOT so that all data files are found via the
# same setting as the training dataset, instead of a second hard-coded path.
prediction_df = pd.read_csv(os.path.join(PATH_ROOT, "predictions.csv"))
print(prediction_df.shape)

(39130, 3)


In [15]:
# Preview the first five rows of the predictions table.
display(prediction_df.head(n=5))

Unnamed: 0,id,rle_mask,index
0,06181a53-1181-427c-9f60-55040bde0a9a_0,51300,0
1,06181a53-1181-427c-9f60-55040bde0a9a_0,150,1
2,06181a53-1181-427c-9f60-55040bde0a9a_0,362,2
3,06181a53-1181-427c-9f60-55040bde0a9a_0,150,3
4,06181a53-1181-427c-9f60-55040bde0a9a_0,362,4


In [16]:
# Distinct ids occurring in the predictions, and how many there are.
prediction_unique_id = set(prediction_df["id"].tolist())
print(len(prediction_unique_id))

130


In [17]:
# Names of all fold-0 rows, regardless of their `defect` flag.
fold_0_unique_id = set(train_eval_df.loc[train_eval_df["fold"] == 0, "name"])

In [18]:
# True when both id sets contain exactly the same elements
# (i.e. their symmetric difference is empty).
not (prediction_unique_id ^ fold_0_unique_id)

True