In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from tqdm.notebook import tqdm
from datetime import datetime
import json,itertools
from typing import Optional
from sklearn.model_selection import StratifiedKFold

# Load the training table (one row per annotation) and report its
# (rows, columns) size as a quick sanity check.
train_csv_path = '/home/yiw/kg/sart/input/train.csv'
train_df = pd.read_csv(train_csv_path)
print(train_df.shape)

def rle_decode(mask_rle, shape):
    '''
    Decode a run-length-encoded mask string into a binary image.

    mask_rle: whitespace-separated integers read as alternating
              "start length" pairs; starts are 1-based offsets into the
              flattened image
    shape: (height, width) of the array to return
    Returns a numpy uint8 array of that shape, 1 - mask, 0 - background
    '''
    tokens = mask_rle.split()
    # Even positions hold the run starts, odd positions the run lengths.
    run_starts = np.asarray(tokens[0::2], dtype=int) - 1  # to 0-based offsets
    run_lengths = np.asarray(tokens[1::2], dtype=int)
    run_ends = run_starts + run_lengths

    flat_mask = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, end in zip(run_starts, run_ends):
        flat_mask[begin:end] = 1

    # Row-major reshape, needed to align with the RLE direction.
    return flat_mask.reshape(shape)

# From https://newbedev.com/encode-numpy-array-using-uncompressed-rle-for-coco-dataset
def binary_mask_to_rle(binary_mask):
    '''
    Encode a mask as an uncompressed COCO-style run-length dictionary.

    binary_mask: array-like mask; every non-zero pixel is treated as foreground
    Returns {'counts': [...], 'size': [dim0, dim1, ...]} where 'counts' holds
    the alternating background/foreground run lengths of the mask flattened in
    column-major (Fortran) order. The first count always refers to background,
    so a mask whose first pixel is foreground starts with a 0. All counts are
    plain Python ints, so the result can be passed to json.dump directly.
    An empty mask yields an empty 'counts' list.
    '''
    mask = np.asarray(binary_mask)
    # Normalise to booleans so masks stored as 0/255 or True/False get the
    # same polarity as 0/1 masks.
    flat = mask.ravel(order='F') != 0

    rle = {'counts': [], 'size': list(mask.shape)}
    if flat.size == 0:
        return rle

    # A new run starts wherever a pixel differs from its predecessor.
    change_points = np.flatnonzero(flat[1:] != flat[:-1]) + 1
    run_edges = np.concatenate(([0], change_points, [flat.size]))
    counts = np.diff(run_edges).tolist()

    # COCO counts start with a background run; emit an empty one if needed.
    if flat[0]:
        counts.insert(0, 0)

    rle['counts'] = counts
    return rle

def coco_structure(train_df):
    '''
    Convert a per-annotation DataFrame into a COCO-style dataset dictionary.

    train_df: DataFrame with one annotation per row. The columns read here are
              id (image id), annotation (RLE string decoded by rle_decode),
              width, height and cell_type.
    Returns a dict with three keys:
        'categories'  - one {'name', 'id'} entry per distinct cell_type
        'images'      - one entry per distinct image id, with file_name
                        'train/<id>.png'
        'annotations' - one entry per row, carrying the uncompressed RLE
                        segmentation, an [x, y, width, height] bbox, the pixel
                        area, and the image/category ids
    Raises ValueError if an annotation decodes to an empty mask.
    '''
    # Sort the class names so the name -> id mapping does not depend on row
    # order; otherwise separately converted subsets (e.g. train and validation
    # folds) can assign different ids to the same cell type.
    cat_ids = {name: cat_id for cat_id, name in enumerate(sorted(train_df.cell_type.unique()), start=1)}
    cats = [{'name': name, 'id': cat_id} for name, cat_id in cat_ids.items()]

    images = [
        {
            'id': image_id,
            'width': int(row.width),
            'height': int(row.height),
            'file_name': f'train/{image_id}.png',
        }
        for image_id, row in train_df.groupby('id').agg('first').iterrows()
    ]

    annotations = []
    # Annotation ids are sequential and 1-based: they stay unique even when the
    # DataFrame index is not, and they avoid id 0, which pycocotools' COCOeval
    # treats as "no matching ground truth".
    rows = tqdm(train_df.iterrows(), total=len(train_df))
    for ann_id, (idx, row) in enumerate(rows, start=1):
        mk = rle_decode(row.annotation, (row.height, row.width))
        ys, xs = np.where(mk)
        if xs.size == 0:
            raise ValueError(
                f'annotation at index {idx!r} (image {row.id!r}) decodes to an empty mask'
            )
        x1, x2 = xs.min(), xs.max()
        y1, y2 = ys.min(), ys.max()
        annotations.append({
            'segmentation': binary_mask_to_rle(mk),
            # COCO boxes are [x, y, width, height]; +1 because both ends are inclusive.
            'bbox': [int(x1), int(y1), int(x2 - x1 + 1), int(y2 - y1 + 1)],
            'area': int(np.sum(mk)),
            'image_id': row.id,
            'category_id': cat_ids[row.cell_type],
            'iscrowd': 0,
            'id': ann_id,
        })
    return {'categories': cats, 'images': images, 'annotations': annotations}

# Collapse the annotation table to one row per image id (first non-null value
# of every other column) and turn the id back into an ordinary column.
train_meta = train_df.groupby('id').agg('first').reset_index()
train_meta

(73585, 9)


Unnamed: 0,id,annotation,width,height,cell_type,plate_time,sample_date,sample_id,elapsed_timedelta
0,0030fd0e6378,118145 6 118849 7 119553 8 120257 8 120961 9 1...,704,520,shsy5y,11h30m00s,2019-06-16,shsy5y[diff]_E10-4_Vessel-714_Ph_3,0 days 11:30:00
1,0140b3c8f445,32499 3 33201 7 33902 9 34604 10 35306 11 3600...,704,520,astro,09h00m00s,2020-09-13,astros[cereb]_F8-3_Vessel-361_Ph_4,0 days 09:00:00
2,01ae5a43a2ab,241026 3 241726 9 242427 13 243130 14 243834 1...,704,520,cort,13h30m00s,2020-11-04,cort[oka-high]_B5-1_Vessel-377_Ph_1,0 days 13:30:00
3,026b3c2c4b32,170753 5 171454 12 172158 13 172862 13 173565 ...,704,520,cort,19h30m00s,2020-11-04,cort[oka-low]_H6-2_Vessel-377_Ph_2,0 days 19:30:00
4,029e5b3b89c7,139142 7 139845 10 140548 13 141251 15 141955 ...,704,520,cort,13h30m00s,2020-10-27,cort[pre-treat]_B8-2_Vessel-377_Ph_2,0 days 13:30:00
...,...,...,...,...,...,...,...,...,...
601,fd98d82784a1,335979 1 336681 3 337385 3 338088 3 338792 3 3...,704,520,astro,13h00m00s,2020-09-14,astro[hippo]_G1-4_Vessel-361_Ph_3,0 days 13:00:00
602,fe33dfcf4ebd,165826 7 166526 14 167199 45 167901 46 168603 ...,704,520,astro,13h00m00s,2020-09-15,astro[hippo]_F2-4_Vessel-361_Ph_1,0 days 13:00:00
603,fe3e30f849f4,343210 3 343912 4 344610 9 345312 10 346014 11...,704,520,shsy5y,11h30m00s,2019-06-15,shsy5y[diff]_D10-4_Vessel-714_Ph_3,0 days 11:30:00
604,ffc2ead3e8cc,252827 1 253531 3 254235 5 254940 5 255644 7 2...,704,520,astro,09h00m00s,2020-09-13,astros[cereb]_F11-4_Vessel-361_Ph_1,0 days 09:00:00


In [2]:
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# skf.split yields positional row numbers, so collect the fold labels in a
# positional array instead of writing through label-based .loc; this stays
# correct even if train_meta does not carry a default 0..n-1 index, and the
# column is uint8 from the start rather than passing through float/NaN.
fold_labels = np.zeros(len(train_meta), dtype=np.uint8)
for fold, (_, val_idx) in enumerate(skf.split(X=train_meta, y=train_meta['cell_type']), 1):
    fold_labels[val_idx] = fold
train_meta['fold'] = fold_labels

# Every image must have been placed in exactly one validation fold (1..n_splits).
assert (train_meta['fold'] > 0).all()

# Only the last expression of a cell is displayed, so print the fold sizes explicitly.
print(train_meta.groupby('fold').size())
train_meta.head(10)

Unnamed: 0,id,annotation,width,height,cell_type,plate_time,sample_date,sample_id,elapsed_timedelta,fold
0,0030fd0e6378,118145 6 118849 7 119553 8 120257 8 120961 9 1...,704,520,shsy5y,11h30m00s,2019-06-16,shsy5y[diff]_E10-4_Vessel-714_Ph_3,0 days 11:30:00,3
1,0140b3c8f445,32499 3 33201 7 33902 9 34604 10 35306 11 3600...,704,520,astro,09h00m00s,2020-09-13,astros[cereb]_F8-3_Vessel-361_Ph_4,0 days 09:00:00,2
2,01ae5a43a2ab,241026 3 241726 9 242427 13 243130 14 243834 1...,704,520,cort,13h30m00s,2020-11-04,cort[oka-high]_B5-1_Vessel-377_Ph_1,0 days 13:30:00,1
3,026b3c2c4b32,170753 5 171454 12 172158 13 172862 13 173565 ...,704,520,cort,19h30m00s,2020-11-04,cort[oka-low]_H6-2_Vessel-377_Ph_2,0 days 19:30:00,4
4,029e5b3b89c7,139142 7 139845 10 140548 13 141251 15 141955 ...,704,520,cort,13h30m00s,2020-10-27,cort[pre-treat]_B8-2_Vessel-377_Ph_2,0 days 13:30:00,5
5,0323e81d23d9,244004 3 244705 8 245407 12 246109 15 246811 1...,704,520,cort,13h30m00s,2020-11-07,cort[density]_B11-4_Vessel-376_Ph_4,0 days 13:30:00,2
6,03b27b381a5f,219219 3 219922 6 220625 8 221328 10 222031 11...,704,520,cort,19h30m00s,2020-11-04,cort[6-OHDA]_B1-2_Vessel-377_Ph_4,0 days 19:30:00,4
7,042c17cd9143,34611 1 35314 3 36016 5 36719 7 37422 8 38124 ...,704,520,shsy5y,11h30m00s,2019-06-14,shsy5y[diff]_E8-4_Vessel-714_Ph_1,0 days 11:30:00,5
8,042dc0e561a4,191692 8 192395 11 193097 14 193800 14 194503 ...,704,520,shsy5y,11h30m00s,2019-06-15,shsy5y[diff]_D5-2_Vessel-714_Ph_1,0 days 11:30:00,2
9,04928f0866b0,128782 4 129476 14 130179 15 130883 14 131586 ...,704,520,cort,13h30m00s,2020-11-03,cort[pre-treat]_A4-2_Vessel-377_Ph_3,0 days 13:30:00,3


In [3]:
# Persist the per-image metadata table to train_meta.csv in the working
# directory; index=False keeps the row index out of the file.
train_meta.to_csv("train_meta.csv", index=False)

In [5]:
fold_selected = 1

# Images assigned to fold_selected form the validation set; all others train.
is_valid_fold = train_meta["fold"] == fold_selected
train_ids = train_meta.loc[~is_valid_fold, "id"]
test_ids = train_meta.loc[is_valid_fold, "id"]

# Split the per-annotation table by image id so no image straddles both sets.
df_train = train_df[train_df.id.isin(train_ids)]
df_valid = train_df[train_df.id.isin(test_ids)]

# Name the CSV after the selected fold (as the COCO JSON file names do) so
# choosing another fold does not overwrite the fold-1 file.
df_train.to_csv(f"train_fold{fold_selected}.csv", index=False)
# train_json = coco_structure(df_train)
valid_json = coco_structure(df_valid)

FileNotFoundError: [Errno 2] No such file or directory: 'folds/train_fold1.csv'

In [None]:
# with open(f'coco_cell_train_fold{fold_selected}.json', 'w', encoding='utf-8') as f:
#     json.dump(train_json, f, ensure_ascii=True, indent=4)

# Write the validation-fold COCO dictionary as indented, ASCII-only JSON.
valid_json_path = f'coco_cell_valid_fold{fold_selected}.json'
with open(valid_json_path, 'w', encoding='utf-8') as json_file:
    json.dump(valid_json, json_file, ensure_ascii=True, indent=4)

In [14]:
# NOTE: this rebinds train_meta to a fresh one-row-per-image table built from
# train_df with only three columns, so a `fold` column assigned to the previous
# train_meta is not part of it.
train_meta = (
    train_df
    .drop_duplicates(subset=['id'])
    .reset_index(drop=True)
)
train_meta = train_meta[["id", "width", "height"]]

# Look up a single image by id to inspect its recorded dimensions.
row = train_meta[train_meta.id == "fe3e30f849f4"]
row

Unnamed: 0,id,width,height
603,fe3e30f849f4,704,520
