# Step-to-Step to preprocessing the datasets

### Download description files

In [1]:
!mkdir -p /tmp/dataset
!wget https://storage.googleapis.com/openimages/2018_04/class-descriptions-boxable.csv -O /tmp/dataset/class-descriptions-boxable.csv
#!wget https://storage.googleapis.com/openimages/2018_04/train/train-annotations-bbox.csv -O /tmp/dataset/train-annotations-bbox.csv
!wget https://storage.googleapis.com/openimages/2018_04/validation/validation-annotations-bbox.csv -O /tmp/dataset/validation-annotations-bbox.csv
#!wget https://storage.googleapis.com/openimages/2018_04/test/test-annotations-bbox.csv -O /tmp/dataset/test-annotations-bbox.csv

--2019-05-10 11:18:14--  https://storage.googleapis.com/openimages/2018_04/class-descriptions-boxable.csv
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.30.80, 2800:3f0:4001:80a::2010
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.30.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11255 (11K) [text/csv]
Saving to: ‘/tmp/dataset/class-descriptions-boxable.csv’


2019-05-10 11:18:14 (1,11 MB/s) - ‘/tmp/dataset/class-descriptions-boxable.csv’ saved [11255/11255]

--2019-05-10 11:18:15--  https://storage.googleapis.com/openimages/2018_04/validation/validation-annotations-bbox.csv
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.30.80, 2800:3f0:4001:80a::2010
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.30.80|:443... connected.
HTTP request sent, awaiting re

### Check files

In [2]:
!ls /tmp/dataset/

class-descriptions-boxable.csv


### Download script to download the dataset

In [3]:
!mkdir -p ../scripts
!wget https://raw.githubusercontent.com/spmallick/learnopencv/master/downloadOpenImages/downloadOI.py -O ../scripts/downloadOI.py

--2019-05-08 14:11:30--  https://raw.githubusercontent.com/spmallick/learnopencv/master/downloadOpenImages/downloadOI.py
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.128.133, 151.101.192.133, 151.101.0.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.128.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3555 (3,5K) [text/plain]
Saving to: ‘../scripts/downloadOI.py’


2019-05-08 14:11:30 (20,7 MB/s) - ‘../scripts/downloadOI.py’ saved [3555/3555]



### Filtering the data

In [8]:
#!cd /tmp/dataset && python3 downloadOI.py --classes 'Human_face,Human_arm,Human_head,Human_body,Human_hand,Human_leg,Human_foot' --mode train --depiction=0
# Importing libraries
import pandas as pd
import os

In [3]:
class_labels = pd.read_csv('/tmp/dataset/class-descriptions-boxable.csv', header=None, names=['label', 'name'])

In [4]:
class_labels.head()

Unnamed: 0,label,name
0,/m/011k07,Tortoise
1,/m/011q46kg,Container
2,/m/012074,Magpie
3,/m/0120dh,Sea turtle
4,/m/01226z,Football


In [7]:
validation = pd.read_csv('/tmp/dataset/validation-annotations-bbox.csv')
validation.columns

Index(['ImageID', 'Source', 'LabelName', 'Confidence', 'XMin', 'XMax', 'YMin',
       'YMax', 'IsOccluded', 'IsTruncated', 'IsGroupOf', 'IsDepiction',
       'IsInside'],
      dtype='object')

In [5]:
class_labels = class_labels[ class_labels["name"].isin(['Human arm','Human head','Human body','Human hand','Human leg','Human foot']) ]
class_labels.to_csv('../data/class-descriptions-boxable.csv', index = False)
class_labels

Unnamed: 0,label,name
176,/m/02p0tk3,Human body
213,/m/031n1,Human foot
220,/m/035r7c,Human leg
291,/m/04hgtk,Human head
502,/m/0dzf4,Human arm
572,/m/0k65p,Human hand


### Creating new annotation boxes

In [8]:
chunksize = 1000

with open('/tmp/dataset/train-annotations-bbox.csv') as file:
    for chunk in pd.read_csv(file, chunksize=chunksize):
        new_dataset = chunk[ chunk['LabelName'].isin(list(class_labels.label)) ]
        if not os.path.isfile('/tmp/dataset/new-train-annotations-bbox.csv'): # if file does not exist write header 
            new_dataset.to_csv('/tmp/dataset/new-train-annotations-bbox.csv', header=chunk.columns, index = False)
        else: # else it exists so append without writing the header
            new_dataset.to_csv('/tmp/dataset/new-train-annotations-bbox.csv', mode='a', header=False, index = False)

In [9]:
with open('/tmp/dataset/test-annotations-bbox.csv') as file:
    for chunk in pd.read_csv(file, chunksize=chunksize):
        new_dataset = chunk[ chunk['LabelName'].isin(list(class_labels.label)) ]
        if not os.path.isfile('/tmp/dataset/new-test-annotations-bbox.csv'): # if file does not exist write header 
            new_dataset.to_csv('/tmp/dataset/new-test-annotations-bbox.csv', header=chunk.columns, index = False)
        else: # else it exists so append without writing the header
            new_dataset.to_csv('/tmp/dataset/new-test-annotations-bbox.csv', mode='a', header=False, index = False)

In [10]:
with open('/tmp/dataset/validation-annotations-bbox.csv') as file:
    for chunk in pd.read_csv(file, chunksize=chunksize):
        new_dataset = chunk[ chunk['LabelName'].isin(list(class_labels.label)) ]
        if not os.path.isfile('/tmp/dataset/new-validation-annotations-bbox.csv'): # if file does not exist write header 
            new_dataset.to_csv('/tmp/dataset/new-validation-annotations-bbox.csv', header=chunk.columns, index = False)
        else: # else it exists so append without writing the header
            new_dataset.to_csv('/tmp/dataset/new-validation-annotations-bbox.csv', mode='a', header=False, index = False)

In [11]:
class_labels.to_csv('/tmp/dataset/new-class-descriptions-boxable.csv', header=False, index = False)

### Remove unnecessary files

In [12]:
!rm /tmp/dataset/train-annotations-bbox.csv
!rm /tmp/dataset/test-annotations-bbox.csv
!rm /tmp/dataset/validation-annotations-bbox.csv
!rm /tmp/dataset/class-descriptions-boxable.csv

!ls /tmp/dataset/

new-class-descriptions-boxable.csv  new-train-annotations-bbox.csv
new-test-annotations-bbox.csv	    new-validation-annotations-bbox.csv


In [13]:
#train_annotations = pd.read_csv('/tmp/dataset/new-train-annotations-bbox.csv')
train_annotations = pd.read_csv('../data/train-annotations-bbox.csv')
train_annotations[train_annotations.ImageID == 'd0913b76984c8223']

Unnamed: 0,ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside
2713,d0913b76984c8223,xclick,/m/02p0tk3,1,0.00375,0.999375,0.371667,0.571667,1,1,1,0,0
3826,d0913b76984c8223,xclick,/m/02p0tk3,1,0.25125,0.846875,0.7575,0.999167,1,1,1,0,0
8853,d0913b76984c8223,xclick,/m/04hgtk,1,0.43375,0.486875,0.814167,0.904167,1,0,0,0,0
9562,d0913b76984c8223,xclick,/m/04hgtk,1,0.39375,0.44125,0.760833,0.83,1,0,0,0,0


In [83]:
print("Current size:", len(train_annotations))

train_annotations = train_annotations[ train_annotations.IsDepiction == 0 ]
train_annotations = train_annotations[ train_annotations.IsInside == 0 ]

print("New size:", len(train_annotations))

Current size: 734882
New size: 678042


In [84]:
train_annotations.head()

Unnamed: 0,ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside
0,000004f4400f6ec5,xclick,/m/04hgtk,1,0.300625,0.508125,0.22409,0.608777,1,0,0,0,0
1,000004f4400f6ec5,xclick,/m/04hgtk,1,0.544375,0.6925,0.179272,0.469655,0,0,0,0,0
2,000004f4400f6ec5,xclick,/m/04hgtk,1,0.768125,0.999375,0.648926,0.999066,0,1,0,0,0
3,000004f4400f6ec5,xclick,/m/0dzf4,1,0.143125,0.615625,0.597572,0.999066,1,1,0,0,0
4,000004f4400f6ec5,xclick,/m/0dzf4,1,0.40375,0.71625,0.478992,0.892624,1,0,0,0,0


### Occluded

In [16]:
len(train_annotations[ train_annotations.IsOccluded == 1 ])

472020

### Truncated

In [17]:
len(train_annotations[ train_annotations.IsTruncated == 1 ])

126968

### GroupOf

In [18]:
len(train_annotations[ train_annotations.IsGroupOf == 1 ])

8325

### Nothing

In [19]:
len(train_annotations) - len(train_annotations[ train_annotations.IsOccluded == 1 ]) - len(train_annotations[ train_annotations.IsTruncated == 1 ]) - len(train_annotations[ train_annotations.IsGroupOf == 1 ])

70729

In [66]:
class_labels_dict = { row['label']:row['name'] for index, row in class_labels.iterrows()}
class_labels_dict
labels_class = { row['name']:row['label'] for index, row in class_labels.iterrows()}
labels_class

{'Human body': '/m/02p0tk3',
 'Human foot': '/m/031n1',
 'Human leg': '/m/035r7c',
 'Human head': '/m/04hgtk',
 'Human arm': '/m/0dzf4',
 'Human hand': '/m/0k65p'}

In [67]:
train_annotations.replace({'LabelName' : class_labels_dict}).groupby('LabelName').sum()

Unnamed: 0_level_0,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside
LabelName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Human arm,194495,85229.359767,108833.839525,87665.511768,127390.685102,181885,31816,425,0,0
Human body,166472,64403.111384,102168.013673,58483.159714,126433.771994,86955,47508,5549,0,0
Human foot,2189,903.532892,1317.681044,1099.623627,1620.064162,1345,317,5,0,0
Human hand,71356,31955.878986,38965.590223,37422.364734,45600.325481,49172,8137,14,0,0
Human head,173875,76105.369504,96991.36093,58662.986931,87589.093671,101886,23044,1949,0,0
Human leg,69655,30647.222386,39081.318953,38303.03202,54633.141161,50777,16146,383,0,0


In [68]:
df_foot = train_annotations[train_annotations.LabelName == labels_class['Human foot']]
df_body = train_annotations[train_annotations.LabelName == labels_class['Human body']]
df_arm = train_annotations[train_annotations.LabelName == labels_class['Human arm']]
df_hand = train_annotations[train_annotations.LabelName == labels_class['Human hand']]
df_head = train_annotations[train_annotations.LabelName == labels_class['Human head']]
df_leg = train_annotations[train_annotations.LabelName == labels_class['Human leg']]
df_foot.head()

Unnamed: 0,ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside
42,00007902a7b96e8c,xclick,/m/031n1,1,0.243542,0.434502,0.805204,0.90225,0,0,0,0,0
83,0000e2205e460318,xclick,/m/031n1,1,0.0575,0.089375,0.621951,0.651032,1,0,0,0,0
84,0000e2205e460318,xclick,/m/031n1,1,0.07,0.11125,0.63227,0.654784,1,0,0,0,0
85,0000e2205e460318,xclick,/m/031n1,1,0.144375,0.188125,0.645403,0.678236,1,0,0,0,0
86,0000e2205e460318,xclick,/m/031n1,1,0.16625,0.208125,0.652908,0.681989,1,0,0,0,0


In [71]:
# Filtering datasets
df_body = df_body.sample(len(df_foot))
df_arm = df_arm.sample(len(df_foot))
df_hand = df_hand.sample(len(df_foot))
df_head = df_head.sample(len(df_foot))
df_leg = df_leg.sample(len(df_foot))

In [75]:
print('df_body:', df_body.sum() )
print('df_arm:', df_arm.sum() )
print('df_hand:', df_hand.sum() )
print('df_head:', df_head.sum() )
print('df_leg:', df_leg.sum() )

df_body: ImageID        1951b374029fec0417b4123731480ad3b236b0d066f176...
Source         activemilxclickxclickxclickxclickxclickxclickx...
LabelName      /m/02p0tk3/m/02p0tk3/m/02p0tk3/m/02p0tk3/m/02p...
Confidence                                                  2189
XMin                                                     849.776
XMax                                                     1356.46
YMin                                                     764.221
YMax                                                     1671.94
IsOccluded                                                  1122
IsTruncated                                                  618
IsGroupOf                                                     90
IsDepiction                                                    0
IsInside                                                       0
dtype: object
df_arm: ImageID        dcf1b01245d05fdc2492e48cc08ca4d8c3335db02abd2d...
Source         xclickxclickxclickxclickxclickxclickxclickxc

In [76]:
final_dataset = df_foot
final_dataset = final_dataset.append([df_body,df_arm,df_hand,df_head,df_leg])
final_dataset.sum()

ImageID        00007902a7b96e8c0000e2205e4603180000e2205e4603...
Source         xclickxclickxclickxclickxclickxclickxclickxcli...
LabelName      /m/031n1/m/031n1/m/031n1/m/031n1/m/031n1/m/031...
Confidence                                                 13134
XMin                                                     5629.88
XMax                                                     7558.22
YMin                                                     5955.13
YMax                                                     8938.35
IsOccluded                                                  8890
IsTruncated                                                 2355
IsGroupOf                                                    145
IsDepiction                                                    0
IsInside                                                       0
dtype: object

In [78]:
final_dataset.head()

Unnamed: 0,ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside
42,00007902a7b96e8c,xclick,/m/031n1,1,0.243542,0.434502,0.805204,0.90225,0,0,0,0,0
83,0000e2205e460318,xclick,/m/031n1,1,0.0575,0.089375,0.621951,0.651032,1,0,0,0,0
84,0000e2205e460318,xclick,/m/031n1,1,0.07,0.11125,0.63227,0.654784,1,0,0,0,0
85,0000e2205e460318,xclick,/m/031n1,1,0.144375,0.188125,0.645403,0.678236,1,0,0,0,0
86,0000e2205e460318,xclick,/m/031n1,1,0.16625,0.208125,0.652908,0.681989,1,0,0,0,0


In [85]:
final_dataset.to_csv('../data/train-annotations-bbox.csv', index = False)

## Test

In [86]:
test_annotations = pd.read_csv('/tmp/dataset/new-test-annotations-bbox.csv')
test_annotations.head()

Unnamed: 0,ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside
0,000132c20b84269b,freeform,/m/04hgtk,1,0.135425,0.229449,0.270051,0.416875,0,0,0,0,0
1,000132c20b84269b,freeform,/m/04hgtk,1,0.440367,0.518296,0.163887,0.301675,0,0,0,0,0
2,000132c20b84269b,freeform,/m/04hgtk,1,0.581826,0.654673,0.174051,0.319746,0,0,0,0,0
3,000132c20b84269b,freeform,/m/04hgtk,1,0.694485,0.780885,0.263275,0.420263,0,0,0,0,0
4,000132c20b84269b,freeform,/m/0dzf4,1,0.039826,0.092433,0.535888,0.807865,0,0,0,0,0
5,000132c20b84269b,freeform,/m/0dzf4,1,0.193322,0.273312,0.542616,0.692539,0,0,0,0,0
6,000132c20b84269b,freeform,/m/0dzf4,1,0.323036,0.380687,0.568564,0.620461,1,0,0,0,0
7,000132c20b84269b,freeform,/m/0dzf4,1,0.562287,0.571656,0.629110,0.682929,1,0,0,0,0
8,000132c20b84269b,freeform,/m/0dzf4,1,0.787126,0.867837,0.601240,0.852073,0,0,0,0,0
9,0002ab0af02e4a77,freeform,/m/04hgtk,1,0.534108,0.745821,0.127420,0.462402,0,0,0,1,0


In [87]:
print("Current size:", len(test_annotations))

test_annotations = test_annotations[ test_annotations.IsDepiction == 0 ]
test_annotations = test_annotations[ test_annotations.IsInside == 0 ]

print("New size:", len(test_annotations))

Current size: 96794
New size: 91015


In [88]:
test_annotations.replace({'LabelName' : class_labels_dict}).groupby('LabelName').sum()

Unnamed: 0_level_0,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside
LabelName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Human arm,23982,9830.037214,14316.439348,8724.63029,14972.187053,12012,4000,742,0,0
Human body,18759,4747.111303,14004.813223,3400.410022,15459.62257,4883,8581,2428,0,0
Human foot,464,187.512993,285.942129,211.005108,336.04347,258,72,8,0,0
Human hand,12185,5105.770612,6944.86938,5590.154191,7595.553158,4237,1933,7,0,0
Human head,22612,9441.85069,13156.408365,5967.694717,10547.389951,6065,1839,294,0,0
Human leg,13013,5263.005334,7696.760201,6118.012918,9950.94788,5129,3183,781,0,0


In [94]:
df_foot = test_annotations[test_annotations.LabelName == labels_class['Human foot']]
df_body = test_annotations[test_annotations.LabelName == labels_class['Human body']]
df_arm = test_annotations[test_annotations.LabelName == labels_class['Human arm']]
df_hand = test_annotations[test_annotations.LabelName == labels_class['Human hand']]
df_head = test_annotations[test_annotations.LabelName == labels_class['Human head']]
df_leg = test_annotations[test_annotations.LabelName == labels_class['Human leg']]
df_foot.head()

Unnamed: 0,ImageID,Source,LabelName,Confidence,XMin,XMax,YMin,YMax,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside
315,008af1d34142a5bb,freeform,/m/031n1,1,0.008083,0.251214,0.608219,0.999058,0,1,0,0,0
316,008af1d34142a5bb,freeform,/m/031n1,1,0.234549,0.424278,0.467278,0.837,0,0,0,0,0
549,00f8135662b9e6e6,freeform,/m/031n1,1,0.039265,0.487601,0.589024,0.973561,0,0,0,0,0
550,00f8135662b9e6e6,freeform,/m/031n1,1,0.190668,0.598542,0.581275,0.859265,1,0,0,0,0
551,00f8135662b9e6e6,freeform,/m/031n1,1,0.389058,0.655971,0.523159,0.679104,0,0,0,0,0


In [95]:
# Filtering datasets
df_body = df_body.sample(len(df_foot))
df_arm = df_arm.sample(len(df_foot))
df_hand = df_hand.sample(len(df_foot))
df_head = df_head.sample(len(df_foot))
df_leg = df_leg.sample(len(df_foot))

In [96]:
final_dataset_test = df_foot
final_dataset_test = final_dataset_test.append([df_body,df_arm,df_hand,df_head,df_leg])
final_dataset_test.to_csv('../data/test-annotations-bbox.csv', index = False)
final_dataset_test.sum()

ImageID        008af1d34142a5bb008af1d34142a5bb00f8135662b9e6...
Source         freeformfreeformfreeformfreeformfreeformfreefo...
LabelName      /m/031n1/m/031n1/m/031n1/m/031n1/m/031n1/m/031...
Confidence                                                  2784
XMin                                                     1076.28
XMax                                                     1734.13
YMin                                                     1016.96
YMax                                                     1874.37
IsOccluded                                                  1046
IsTruncated                                                  582
IsGroupOf                                                    104
IsDepiction                                                    0
IsInside                                                       0
dtype: object