In [1]:
# Approach

# 1- Separte out class 0 and create folds (df0)
# 2- Separate out class 1 and create 7 folds that are stratified by domain (df1)
# 3- Concat df0 and df1

In [2]:
import pandas as pd
import numpy as np
import os
import cv2

from tqdm import tqdm

import ast
import matplotlib.pyplot as plt

In [3]:
base_path = '../input/global-wheat-head-dataset-2021/gwhd_2021/'

In [4]:
os.listdir('../input/global-wheat-head-dataset-2021/gwhd_2021/')

['competition_train.csv',
 'metadata_dataset.csv',
 'images',
 'competition_test.csv',
 'competition_val.csv']

In [5]:
NUM_FOLDS = 7

CHOSEN_FOLD = 0

## Load the data

In [6]:
# Load the train data

path = base_path + 'competition_train.csv'
df_train = pd.read_csv(path)

# Create a new column
df_train['source'] = 'train'

print(df_train.shape)

df_train.head()

(3657, 4)


Unnamed: 0,image_name,BoxesString,domain,source
0,4563856cc6d75c670eafd86d5eb7245fbe8f273c28f9e3...,99 692 160 764;641 27 697 115;935 978 1012 102...,Arvalis_1,train
1,a2a15938845d9812de03bd44799c4b1bf856a8ad11752e...,230 143 321 222;928 929 1015 1004;485 557 604 ...,Arvalis_1,train
2,401f89a2bb6ab63e3f406bd59b9cadccfe953230feb6cd...,440 239 544 288;333 538 429 594;913 171 963 20...,Arvalis_1,train
3,0a3937653483c36dfb4d957b6f82ae96dbdc7ba36cc3d8...,112 274 188 303;892 812 958 847;0 889 67 928;1...,Arvalis_1,train
4,be1652110a44acd24b42784356e965ce84a04893c3f1bb...,810 204 863 314;360 231 425 408;452 348 510 47...,Arvalis_1,train


In [7]:
# Load the val data

path = base_path + 'competition_val.csv'
df_val = pd.read_csv(path)

# Create a new column
df_val['source'] = 'val'

print(df_val.shape)

df_val.head()

(1476, 4)


Unnamed: 0,image_name,BoxesString,domain,source
0,e6b6a900e5c54cd5d8b0649768c361512cff1813409319...,0 30 109 119;453 617 561 661;450 441 652 493;5...,Usask_1,val
1,53889799be1319296f102fa09a512463c27316a428bb9f...,38 370 88 443;924 824 964 874;0 191 39 265;828...,Usask_1,val
2,ec8f9365ca93eb9dab075c64e5c8b32edd4f8993e17b89...,884 932 1024 990;828 53 1024 206;607 633 681 7...,Usask_1,val
3,48bab0505514c876207b4ede1c60cc906947ca02bb4277...,284 114 352 181;421 0 533 90;910 326 954 409;8...,Usask_1,val
4,44bf4657132a886bea1b74e105c3aadfa41ba7a2ae9d7a...,94 0 241 183;173 217 318 338;401 371 606 472;3...,Usask_1,val


In [8]:
# Load the test data

path = base_path + 'competition_test.csv'
df_test = pd.read_csv(path)

# Create a new column
df_test['source'] = 'test'

print(df_test.shape)

df_test.head()

(1382, 4)


Unnamed: 0,image_name,BoxesString,domain,source
0,255b6ca9fea63f44125e5174bc932470b604c760430715...,481 820 604 922;655 957 732 1024;930 926 1013 ...,UQ_7,test
1,7f5eb37cab658de6fd0d688bf27f16e423794fed6184d8...,896 911 977 955;800 898 821 941;770 867 804 90...,UQ_7,test
2,7bcfff43b356f4a94948367782aa704a37ff4579baf45d...,892 993 922 1024;844 943 871 965;758 926 801 9...,UQ_7,test
3,e535384eda9d0f9c6ac57dd9397d5d614e4cad48c144d8...,648 886 722 1024;392 950 507 1024;876 677 981 ...,UQ_7,test
4,66e9fa7379fd7b7fd64024ac1b03b8e56f9ad020c10635...,559 939 623 1009;775 875 829 919;853 883 888 9...,UQ_7,test


## Concat the data

In [9]:
# Concat the data

df_data = pd.concat([df_train, df_val, df_test], axis=0)

print(df_data.shape)

df_data.head()

(6515, 4)


Unnamed: 0,image_name,BoxesString,domain,source
0,4563856cc6d75c670eafd86d5eb7245fbe8f273c28f9e3...,99 692 160 764;641 27 697 115;935 978 1012 102...,Arvalis_1,train
1,a2a15938845d9812de03bd44799c4b1bf856a8ad11752e...,230 143 321 222;928 929 1015 1004;485 557 604 ...,Arvalis_1,train
2,401f89a2bb6ab63e3f406bd59b9cadccfe953230feb6cd...,440 239 544 288;333 538 429 594;913 171 963 20...,Arvalis_1,train
3,0a3937653483c36dfb4d957b6f82ae96dbdc7ba36cc3d8...,112 274 188 303;892 812 958 847;0 889 67 928;1...,Arvalis_1,train
4,be1652110a44acd24b42784356e965ce84a04893c3f1bb...,810 204 863 314;360 231 425 408;452 348 510 47...,Arvalis_1,train


## Remove the duplicates

EDA shoed that the first duplicate image, for each image_name, had wrong bboxes. Therefore, we will keep the last image and delete the first one.

In [10]:
# Keep the second of two duplicates for each duplicate image_name
df_data = df_data.drop_duplicates(subset='image_name', keep='last')

df_data = df_data.reset_index(drop=True)

df_data.shape

(6512, 4)

## Get the path to each image

In [11]:
# Get the path to each image

def get_path(x):
    
    image_name = x
    path = base_path + 'images/' + image_name
    
    return path


df_data['path'] = df_data['image_name'].apply(get_path)

#df_data.head()

## Create the image height and width column

EDA showed that all images are 1024x1024.

In [12]:
df_data['height'] = 1024
df_data['width'] = 1024

#df_data.head()

## Add a target column

In [13]:
def set_target(x):
    
    if x == 'no_box':
        return 0
    else:
        return 1
    
df_data['target'] = df_data['BoxesString'].apply(set_target)

#df_data.head()

In [14]:
df_data['target'].value_counts()

1    6387
0     125
Name: target, dtype: int64

## Add a column called image_id

In [15]:
def get_image_id(x):
    
    image_id = x.split('.')[0]
    
    return image_id

df_data['image_id'] = df_data['image_name'].apply(get_image_id)
    
#df_data.head()

## Create the folds

In [16]:
# Filter out target 0
df0 = df_data[df_data['target'] == 0]
df0 = df0.reset_index(drop=True)

# Filter out target 1
df1 = df_data[df_data['target'] == 1]
df1 = df1.reset_index(drop=True)

In [17]:
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedGroupKFold

skf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=101)

for fold, ( _, val_) in enumerate(skf.split(X=df0)):
      df0.loc[val_ , "fold"] = fold
        
df0['fold'].value_counts()

5.0    18
0.0    18
2.0    18
1.0    18
4.0    18
3.0    18
6.0    17
Name: fold, dtype: int64

In [18]:
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedGroupKFold

skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=101)

for fold, ( _, val_) in enumerate(skf.split(X=df1, y=df1.domain)):
      df1.loc[val_ , "fold"] = fold
        
df1['fold'].value_counts()



1.0    913
2.0    913
0.0    913
3.0    912
5.0    912
6.0    912
4.0    912
Name: fold, dtype: int64

In [19]:
# Concat df0 and df1

df_data = pd.concat([df0, df1], axis=0)
df_data = df_data.reset_index(drop=True)

df_data.shape

(6512, 10)

In [20]:
# How to choose the fold to train on.

fold_index = CHOSEN_FOLD

df_train = df_data[df_data['fold'] != fold_index]
df_val = df_data[df_data['fold'] == fold_index]

print('Train')
print(len(df_train))
print(df_train['target'].value_counts())
print('')
print('Val')
print(len(df_val))
print(df_val['target'].value_counts())

Train
5581
1    5474
0     107
Name: target, dtype: int64

Val
931
1    913
0     18
Name: target, dtype: int64


## Review df_train and df_val

In [21]:
df_train.head()

Unnamed: 0,image_name,BoxesString,domain,source,path,height,width,target,image_id,fold
0,6b06b7b2a64a2bf87daa760f754b44efaaeb69da6454d6...,no_box,Arvalis_5,train,../input/global-wheat-head-dataset-2021/gwhd_2...,1024,1024,0,6b06b7b2a64a2bf87daa760f754b44efaaeb69da6454d6...,5.0
2,e966dfbaa9bec815940faae79159e88be160cc5985e54d...,no_box,Arvalis_5,train,../input/global-wheat-head-dataset-2021/gwhd_2...,1024,1024,0,e966dfbaa9bec815940faae79159e88be160cc5985e54d...,2.0
3,032d6a431f3609706829fcaa4e98d4e5b0430d182e9cf1...,no_box,Arvalis_5,train,../input/global-wheat-head-dataset-2021/gwhd_2...,1024,1024,0,032d6a431f3609706829fcaa4e98d4e5b0430d182e9cf1...,2.0
4,041e2003e4f0afef241d3e29c73ad00daed660cf167646...,no_box,Arvalis_5,train,../input/global-wheat-head-dataset-2021/gwhd_2...,1024,1024,0,041e2003e4f0afef241d3e29c73ad00daed660cf167646...,6.0
5,6513f833f9aecc3d3bc4d79aa679ce6feb186abee40847...,no_box,Arvalis_5,train,../input/global-wheat-head-dataset-2021/gwhd_2...,1024,1024,0,6513f833f9aecc3d3bc4d79aa679ce6feb186abee40847...,5.0


In [22]:
df_val.head()

Unnamed: 0,image_name,BoxesString,domain,source,path,height,width,target,image_id,fold
1,899d237a8b101212498178338d81eb53b3e612fd940b40...,no_box,Arvalis_5,train,../input/global-wheat-head-dataset-2021/gwhd_2...,1024,1024,0,899d237a8b101212498178338d81eb53b3e612fd940b40...,0.0
13,17528cba85011bb2c82d5e223407c0a2c599bc18f35b6d...,no_box,Arvalis_5,train,../input/global-wheat-head-dataset-2021/gwhd_2...,1024,1024,0,17528cba85011bb2c82d5e223407c0a2c599bc18f35b6d...,0.0
16,43338b7802c69c618c4a80089a23f339314da3e0d5b88b...,no_box,Arvalis_5,train,../input/global-wheat-head-dataset-2021/gwhd_2...,1024,1024,0,43338b7802c69c618c4a80089a23f339314da3e0d5b88b...,0.0
21,5aba771ffdafdc2460ecb00524239776068b77318e5704...,no_box,Arvalis_5,train,../input/global-wheat-head-dataset-2021/gwhd_2...,1024,1024,0,5aba771ffdafdc2460ecb00524239776068b77318e5704...,0.0
25,059a41e6b8db4d29360d9dd224690f788712bb3a979b5c...,no_box,Arvalis_5,train,../input/global-wheat-head-dataset-2021/gwhd_2...,1024,1024,0,059a41e6b8db4d29360d9dd224690f788712bb3a979b5c...,0.0


In [23]:
df_train['domain'].value_counts()

ETHZ_1            640
Arvalis_3         504
Utokyo_1          461
Utokyo_2          391
Arvalis_5         381
Rres_1            370
Arvalis_2         344
Arvalis_4         175
Usask_1           171
Inrae_1           151
Arvalis_6         137
Terraref_1        125
Utokyo_3          101
Terraref_2         91
UQ_10              91
NAU_2              86
KSU_2              86
KSU_1              86
NAU_3              85
NMBU_2             84
KSU_3              81
UQ_11              72
NMBU_1             70
CIMMYT_2           67
CIMMYT_1           58
Arvalis_1          56
Ukyoto_1           52
KSU_4              52
Arvalis_11         51
Arvalis_10         50
CIMMYT_3           50
UQ_8               35
UQ_9               29
Arvalis_9          28
ULiège-GxABT_1     26
UQ_6               26
UQ_5               26
UQ_4               26
ARC_1              26
Arvalis_12         25
Arvalis_7          21
UQ_1               19
NAU_1              17
Arvalis_8          17
UQ_7               14
UQ_2      

In [24]:
df_val['domain'].value_counts()

ETHZ_1            107
Arvalis_3          84
Utokyo_1           77
Arvalis_5          67
Utokyo_2           65
Rres_1             62
Arvalis_2          57
Arvalis_4          29
Usask_1            29
Inrae_1            25
Arvalis_6          23
Utokyo_3           19
Terraref_1         19
UQ_10              15
Terraref_2         15
NAU_3              15
NAU_2              14
KSU_2              14
KSU_1              14
NMBU_2             14
KSU_3              14
UQ_11              12
NMBU_1             12
Arvalis_1          10
CIMMYT_2           10
CIMMYT_1           10
CIMMYT_3           10
Arvalis_11          9
Ukyoto_1            8
KSU_4               8
Arvalis_10          8
UQ_8                6
UQ_9                4
ARC_1               4
UQ_6                4
UQ_5                4
UQ_4                4
ULiège-GxABT_1      4
Arvalis_12          4
Arvalis_9           4
UQ_7                3
UQ_1                3
Arvalis_8           3
Arvalis_7           3
NAU_1               3
UQ_2      

## Put each bbox on a seperate row

In [25]:
# Filter out the rows that don't have bboxes

df_none = df_data[df_data['BoxesString'] == 'no_box']
df_box = df_data[df_data['BoxesString'] != 'no_box']

print(df_none.shape)
print(df_box.shape)

(125, 10)
(6387, 10)


In [26]:
df_box = df_box.reset_index(drop=True)

for i in tqdm(range(0, len(df_box)), total=len(df_box)):

    # Get the bbox string
    image_id = df_box.loc[i, 'image_id']
    box_str = df_box.loc[i, 'BoxesString']
    
    domain = df_box.loc[i, 'domain']
    target = df_box.loc[i, 'target']
    height = df_box.loc[i, 'height']
    width = df_box.loc[i, 'width']
    
    source = df_box.loc[i, 'source']
    path = df_box.loc[i, 'path']
    
    fold = df_box.loc[i, 'fold']


    # Create a list of bbox coords.
    # Each list item is a string
    box_list = box_str.split(';')

    num_rows = len(box_list)

    # Create lists that are the same length as num_rows.
    # All items in the list are the same.
    image_id_list = [image_id] * num_rows
    domain_list = [domain] * num_rows
    target_list = [target] * num_rows
    
    height_list = [height] * num_rows
    width_list = [width] * num_rows
    
    source_list = [source] * num_rows
    path_list = [path] * num_rows
    
    fold_list = [fold] * num_rows

    # Create a dict that we will later use to create a dataframe
    info_dict = {
        'image_id': image_id_list,
        'BoxesString': box_list,
        'domain': domain_list,
        'target': target_list,
        'height': height_list,
        'width': width_list,
        'source': source_list,
        'path': path_list,
        'fold': fold_list
    }

    df = pd.DataFrame(info_dict)

    if i == 0:
        df_fin = df
    else:
        df_fin = pd.concat([df_fin, df], axis=0)
        
df_fin.shape

100%|██████████| 6387/6387 [01:15<00:00, 84.41it/s]


(275371, 9)

In [27]:
# Concat df_none to df_fin

# Choose only the columns we want
df_none1 = df_none[df_fin.columns]

df_rows = pd.concat([df_fin, df_none1], axis=0)

print(df_rows.shape)

df_rows.head()

(275496, 9)


Unnamed: 0,image_id,BoxesString,domain,target,height,width,source,path,fold
0,4563856cc6d75c670eafd86d5eb7245fbe8f273c28f9e3...,99 692 160 764,Arvalis_1,1,1024,1024,train,../input/global-wheat-head-dataset-2021/gwhd_2...,3.0
1,4563856cc6d75c670eafd86d5eb7245fbe8f273c28f9e3...,641 27 697 115,Arvalis_1,1,1024,1024,train,../input/global-wheat-head-dataset-2021/gwhd_2...,3.0
2,4563856cc6d75c670eafd86d5eb7245fbe8f273c28f9e3...,935 978 1012 1020,Arvalis_1,1,1024,1024,train,../input/global-wheat-head-dataset-2021/gwhd_2...,3.0
3,4563856cc6d75c670eafd86d5eb7245fbe8f273c28f9e3...,377 834 463 890,Arvalis_1,1,1024,1024,train,../input/global-wheat-head-dataset-2021/gwhd_2...,3.0
4,4563856cc6d75c670eafd86d5eb7245fbe8f273c28f9e3...,654 797 696 837,Arvalis_1,1,1024,1024,train,../input/global-wheat-head-dataset-2021/gwhd_2...,3.0


In [28]:
df_rows['target'].value_counts()

1    275371
0       125
Name: target, dtype: int64

## Save the dataframe

In [29]:
path = 'df_rows_w_folds.csv'
df_rows.to_csv(path, index=False)


In [30]:
!ls

__notebook__.ipynb  df_rows_w_folds.csv
