In [68]:
import shutil
from pathlib import Path

import pandas as pd

In [69]:
# Root of the local data folder and the VinBigData export that lives inside it.
data_dir = Path('../data')
vin_big_data_dir = data_dir.joinpath('vinbigdata_512x512')

In [70]:
# Load the annotation table and drop every row labelled class 14
# (presumably the "no finding" placeholder -- TODO confirm against the dataset card).
df = (
    pd.read_csv(vin_big_data_dir / 'train.csv')
    .loc[lambda frame: frame['class_id'] != 14]
    .reset_index(drop=True)
)


In [71]:
# Box centre and box size expressed as fractions of the image width / height.
df['x_center_norm'] = (df['x_min'] + df['x_max']) / (2 * df['width'])
df['y_center_norm'] = (df['y_min'] + df['y_max']) / (2 * df['height'])
df['bbox_width_norm'] = (df['x_max'] - df['x_min']) / df['width']
df['bbox_height_norm'] = (df['y_max'] - df['y_min']) / df['height']

# One label line per box: "<class_id> <x_center> <y_center> <width> <height>",
# with the four geometry values printed to six decimals.
df['yolo_format'] = [
    f"{class_id} {x_center:.6f} {y_center:.6f} {box_w:.6f} {box_h:.6f}"
    for class_id, x_center, y_center, box_w, box_h in zip(
        df['class_id'],
        df['x_center_norm'],
        df['y_center_norm'],
        df['bbox_width_norm'],
        df['bbox_height_norm'],
    )
]


In [72]:
# Show the annotation frame together with the derived normalized-box and yolo_format columns.
df

Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,width,height,x_center_norm,y_center_norm,bbox_width_norm,bbox_height_norm,yolo_format
0,9a5094b2563a1ef3ff50dc5c7ff71345,Cardiomegaly,3,R10,691.0,1375.0,1653.0,1831.0,2080,2336,0.563462,0.686216,0.462500,0.195205,3 0.563462 0.686216 0.462500 0.195205
1,051132a778e61a86eb147c7c6f564dfe,Aortic enlargement,0,R10,1264.0,743.0,1611.0,1019.0,2304,2880,0.623915,0.305903,0.150608,0.095833,0 0.623915 0.305903 0.150608 0.095833
2,1c32170b4af4ce1a3030eb8167753b06,Pleural thickening,11,R9,627.0,357.0,947.0,433.0,2540,3072,0.309843,0.128581,0.125984,0.024740,11 0.309843 0.128581 0.125984 0.024740
3,0c7a38f293d5f5e4846aa4ca6db4daf1,ILD,5,R17,1347.0,245.0,2188.0,2169.0,2285,2555,0.773523,0.472407,0.368053,0.753033,5 0.773523 0.472407 0.368053 0.753033
4,47ed17dcb2cbeec15182ed335a8b5a9e,Nodule/Mass,8,R9,557.0,2352.0,675.0,2484.0,2568,3353,0.239875,0.721145,0.045950,0.039368,8 0.239875 0.721145 0.045950 0.039368
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36091,b53d1dd80e99ca6bcef9d592f65d3321,Pleural effusion,10,R9,240.0,1550.0,562.0,2001.0,2304,2880,0.174045,0.616493,0.139757,0.156597,10 0.174045 0.616493 0.139757 0.156597
36092,26d1d5a0ef2e692c6340e74859ffdc53,Pulmonary fibrosis,13,R10,1163.0,787.0,1338.0,941.0,3072,3072,0.407064,0.281250,0.056966,0.050130,13 0.407064 0.281250 0.056966 0.050130
36093,22672ab82c290c20b86863291e25ef6c,ILD,5,R9,299.0,664.0,794.0,1508.0,2048,2500,0.266846,0.434400,0.241699,0.337600,5 0.266846 0.434400 0.241699 0.337600
36094,db169d0be36123bd55b866d6aa73983b,Other lesion,9,R8,6.0,670.0,272.0,1736.0,2304,2880,0.060330,0.417708,0.115451,0.370139,9 0.060330 0.417708 0.115451 0.370139


In [73]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold

# Image-level 5-fold assignment, stratified on a label built from two per-image
# statistics: the number of distinct classes and the number of boxes (bucketed by 15).
# Note: the recorded run emitted a scikit-learn warning that the least populated
# stratum has a single member, i.e. fewer than n_splits.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=101)

# One row per image, indexed by image_id.
df_folds = df.groupby('image_id').size().to_frame('bbox_count')
df_folds['object_count'] = df.groupby('image_id')['class_id'].nunique()

# Stratum label of the form "<distinct classes>_<boxes // 15>".
df_folds['stratify_group'] = (
    df_folds['object_count'].astype(str)
    + '_'
    + (df_folds['bbox_count'] // 15).astype(str)
)

# Every image gets the number of the fold in which it is part of the validation indices.
df_folds['fold'] = 0
fold_column = df_folds.columns.get_loc('fold')
for fold_number, (train_index, val_index) in enumerate(
        skf.split(X=df_folds.index, y=df_folds['stratify_group'])):
    df_folds.iloc[val_index, fold_column] = fold_number

# example with fold 0: fold 0 -> validation, fold 1 -> test, remaining folds -> train
df_folds = df_folds.reset_index()

df_valid = df.merge(df_folds.loc[df_folds['fold'] == 0], on='image_id')

df_train = df.merge(df_folds.loc[~df_folds['fold'].isin([0, 1])], on='image_id')

df_test = df.merge(df_folds.loc[df_folds['fold'] == 1], on='image_id')


The least populated class in y has only 1 members, which is less than n_splits=5.



In [74]:
import plotly.express as px

# Per-split histogram of class names, to compare the class balance of the three splits.
x = 'class_name'
train_fig, val_fig, test_fig = (
    px.histogram(split_df.sort_values(by=x), x=x, title=split_title)
    for split_df, split_title in (
        (df_train, 'train  distribution'),
        (df_valid, 'valid distribution'),
        (df_test, 'test distribution'),
    )
)

# Show the histograms
train_fig.show()
val_fig.show()
test_fig.show()

In [75]:
# Number of boxes per class in the test split.
df_test.sort_values(by='class_id')['class_id'].value_counts()

class_id
0     1473
3     1145
11     981
13     919
8      485
7      463
10     451
9      443
2      235
6      231
5      204
4       87
1       39
12      37
Name: count, dtype: int64

In [76]:
# Number of boxes per class in the validation split.
df_valid.sort_values(by='class_id')['class_id'].value_counts()


class_id
0     1438
3     1006
11     990
13     920
10     488
7      474
9      459
8      433
6      270
2      236
5      199
4      131
1       60
12      52
Name: count, dtype: int64

In [77]:
# Sanity check: total rows across the three splits (compare with the row count of df).
sum(len(split_df) for split_df in (df_valid, df_train, df_test))

36096

In [78]:
# Row count of the full annotation frame, for comparison with the split total.
df.shape[0]

36096

In [79]:


# Destination folder for the YOLO-formatted copy of the dataset.
dataset_dir = data_dir.joinpath('vin_big_data_512x512_yolo')


In [80]:

from typing import Literal
from tqdm import tqdm


def make_sample_set(set_df: pd.DataFrame, yolo_dataset_dir: Path, orig_dataset_dir: Path,
                    set_type: Literal['train', 'test', 'val'] = 'train'):
    """Materialize one split of the dataset in YOLO folder layout.

    For every ``<image_id>.png`` file in ``orig_dataset_dir`` that has at least one
    row in ``set_df``, writes ``<yolo_dataset_dir>/<set_type>/labels/<image_id>.txt``
    (one ``yolo_format`` entry per line, in the row order of ``set_df``) and copies
    the image to ``<yolo_dataset_dir>/<set_type>/images/``.

    Args:
        set_df: Annotation rows of the split; must contain the columns
            ``image_id`` and ``yolo_format`` (string label lines).
        yolo_dataset_dir: Root of the output dataset; created if missing.
        orig_dataset_dir: Directory holding the source ``.png`` images.
        set_type: Name of the split sub-folder.

    Files in ``orig_dataset_dir`` that are not ``.png`` or have no annotation rows
    are skipped; annotated images missing from the directory are silently ignored.
    """
    labels_dir = yolo_dataset_dir / set_type / 'labels'
    images_dir = yolo_dataset_dir / set_type / 'images'

    labels_dir.mkdir(exist_ok=True, parents=True)
    images_dir.mkdir(exist_ok=True, parents=True)

    # Build the image_id -> label lines table once; filtering the whole frame for
    # every file costs O(files * rows). groupby keeps the original row order
    # inside each group, so the label files are unchanged.
    entries_by_image = {
        img_id: entries.tolist()
        for img_id, entries in set_df.groupby('image_id', sort=False)['yolo_format']
    }

    for img_path in tqdm(list(orig_dataset_dir.iterdir()), desc=f'make yolo set for {set_type}'):
        # Only real "<image_id>.png" files count. Splitting the name on '.png'
        # would also accept names such as "<image_id>.png.bak".
        if img_path.suffix != '.png':
            continue

        img_id = img_path.stem
        entries = entries_by_image.get(img_id)
        if not entries:
            continue

        label_path = labels_dir / f'{img_id}.txt'
        with open(label_path, 'w') as f:
            f.writelines(entry + '\n' for entry in entries)

        shutil.copyfile(img_path, images_dir / img_path.name)



In [82]:
# Export the three splits; all of them read their images from the same source folder.
for split_name, split_df in (('train', df_train), ('val', df_valid), ('test', df_test)):
    make_sample_set(
        split_df,
        dataset_dir,
        orig_dataset_dir=vin_big_data_dir / 'train',
        set_type=split_name,
    )


make yolo set for train: 100%|██████████| 15000/15000 [00:16<00:00, 920.16it/s]
make yolo set for val: 100%|██████████| 15000/15000 [00:10<00:00, 1407.41it/s]
make yolo set for test: 100%|██████████| 15000/15000 [00:10<00:00, 1397.37it/s]
