# Creating Object Detection Dataset 


> Creating an object detection dataset from labelme-annotated JSON files  


__The notebook writes its output files in the following layout:__ 

```
    <output folder>
        /img
        valid.csv
        train.csv
```
__Format of the train/valid dataframes__

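|image_id|width|height|x|y|w|h|label|area|
|-|-|-|-|-|-|-|-|-|
|stem of image in /img folder|image width|image height|smallest x of the box|smallest y of the box|box width|box height|annotation label|box area (w*h)|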

In [1]:
import sys  
import os
from os import listdir
from os.path import isfile, join
from pathlib import Path
import json
import io
import base64
import numpy as np
import PIL
from PIL import Image
from shapely.geometry import Polygon
import rasterio.features
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
from tqdm import tqdm
import pandas as pd

In [2]:
def img_b64_to_arr(img_b64):
    """Decode a base64-encoded image into a NumPy array.

    Args:
        img_b64: base64 payload holding the bytes of an encoded image file.

    Returns:
        The ``numpy.ndarray`` built from the image that PIL opens from the
        decoded bytes.
    """
    raw_bytes = base64.b64decode(img_b64)
    buffer = io.BytesIO(raw_bytes)
    return np.array(Image.open(buffer))

In [3]:
# Input folder (annotation JSON files) and output folder for the generated
# dataset. Both are resolved to absolute paths against the current working
# directory.
datasetLocation = Path('../dataset/df-bl-annotated-v2/').resolve()
outputLocation = Path('../dataset/df-bl-annotated-v2-obj/').resolve()

In [4]:
# Make sure <outputLocation>/img exists before any image is written to it.
img_dir = outputLocation/'img'
if img_dir.exists():
    print('Output Location exists, continuing')
else:
    print('Output location does not exist, creating')
    # parents=True also creates outputLocation itself when it is missing;
    # a plain os.mkdir would raise FileNotFoundError in that case.
    img_dir.mkdir(parents=True, exist_ok=True)

Output Location exists, continuing


In [5]:
# Regular files directly inside datasetLocation. os.listdir returns entries in
# arbitrary order, so sort to make the processing order reproducible.
onlyFiles = sorted(f for f in listdir(datasetLocation) if isfile(join(datasetLocation, f)))
# Match on the file suffix: a substring test would also pick up names such as
# "x.json.bak" or "x.jsonl".
jsonFiles = [f for f in onlyFiles if f.endswith('.json')]
print(f'Number of json files found: {len(jsonFiles)}')

Number of json files found: 26


In [6]:
def _rect_to_xywh(points):
    """Convert a rectangle given by two opposite corners into an integer box.

    Args:
        points: sequence whose first two items are ``[x, y]`` corner
            coordinates, in either order.

    Returns:
        Tuple ``(x, y, w, h)``: the smallest x and y of the two corners,
        followed by the box width and height, each truncated with ``int``.
    """
    x0, y0 = points[0][0], points[0][1]
    x1, y1 = points[1][0], points[1][1]
    x = int(min(x0, x1))
    y = int(min(y0, y1))
    w = int(max(x0, x1) - min(x0, x1))
    h = int(max(y0, y1) - min(y0, y1))
    return x, y, w, h


# One record (dict) per rectangle annotation, accumulated over all JSON files.
FINAL = []

for json_file in tqdm(jsonFiles):
    jsonLoc = datasetLocation/json_file
    # Use the path stem for both the CSV image_id and the saved image name so
    # the two always agree, even when the file name contains extra dots.
    image_id = jsonLoc.stem

    # Context manager closes the file handle as soon as the JSON is parsed.
    with open(jsonLoc, "r") as fh:
        annotation = json.load(fh)

    # The image is embedded in the JSON as base64; decode it and write it out.
    im = Image.fromarray(img_b64_to_arr(annotation['imageData']))
    width, height = im.size
    # JPEG cannot store an alpha channel; saving e.g. an RGBA image raises
    # OSError, so convert anything that is not plain greyscale/RGB first.
    if im.mode not in ('L', 'RGB'):
        im = im.convert('RGB')
    im.save(str(outputLocation/'img'/image_id) + '.jpg')

    for bbox in annotation['shapes']:
        # Only rectangle shapes are exported; every other shape type is skipped.
        if bbox['shape_type'] != 'rectangle':
            continue

        x, y, w, h = _rect_to_xywh(bbox['points'])
        FINAL.append({
            'image_id': image_id,
            'width': int(width),
            'height': int(height),
            'x': x,
            'y': y,
            'w': w,
            'h': h,
            'label': bbox['label'],
            'area': w*h,
        })
    
    

100%|███████████████████████████████████████████████████████████████████████████████████| 26/26 [00:03<00:00,  8.43it/s]


In [7]:
# Build one table from the accumulated records: one row per record, one
# column per dict key.
df = pd.DataFrame(data=FINAL)

In [8]:
# Shuffle all rows before splitting. A fixed random_state makes the shuffle,
# and therefore the train/valid split, reproducible across fresh runs.
# NOTE: re-running this cell without re-creating df shuffles the already
# shuffled frame again.
df = df.sample(frac=1, random_state=42)

In [9]:
# Preview only: the result is displayed but not assigned, so df itself keeps
# its shuffled index. The previous index shows up as an "index" column.
df.reset_index(drop=False)

Unnamed: 0,index,image_id,width,height,x,y,w,h,label,area
0,473,bl_ecf251d4_0_189,1920,1080,895,365,21,41,2,861
1,398,bl_ecf251d4_0_175,1920,1080,1330,403,33,48,2,1584
2,453,bl_ecf251d4_0_185,1920,1080,638,416,10,7,4,70
3,419,bl_ecf251d4_0_177,1920,1080,522,311,21,52,2,1092
4,380,bl_ecf251d4_0_165,1920,1080,130,399,37,55,2,2035
...,...,...,...,...,...,...,...,...,...,...
526,364,bl_ecf251d4_0_165,1920,1080,729,340,19,46,2,874
527,527,bl_ecf251d4_0_196,1920,1080,394,589,33,61,2,2013
528,212,bl_3c993bd2_0_88,1920,1080,559,406,29,60,2,1740
529,337,bl_ecf251d4_0_160,1920,1080,753,279,35,46,2,1610


In [10]:
# 90/10 split by row position: the first 90% of the rows (rounded down) form
# the training set, the remaining rows form the validation set.
n_train = int(len(df)*0.9)
df_train = df[:n_train]
df_valid = df[n_train:]

In [11]:
# Give both splits a fresh 0..n-1 index; the previous index is kept as an
# "index" column in each frame.
df_train, df_valid = (part.reset_index() for part in (df_train, df_valid))

In [12]:
# Write each split to its own CSV in the output folder.
df_train.to_csv(outputLocation/'train.csv')
# valid.csv must come from df_valid; writing df_train here would make the
# validation set a copy of the training set.
df_valid.to_csv(outputLocation/'valid.csv')