# Convert JSON to CSV

- Train: imageId, labelId, imagePath
- Validation: imageId, labelId, imagePath
- Test: imageId, imagePath

In [1]:
import json, glob
import os

import numpy as np
import pandas as pd

## Train

In [2]:
# Parse train.json and missing.json.
# The "with" blocks close each file handle as soon as its JSON has been read,
# instead of leaving the handles open for the rest of the session.
with open('data/train.json') as train_file:
    train_json = json.load(train_file)
with open('data/missing.json') as missing_file:
    missing_json = json.load(missing_file)

In [3]:
# Count the image entries in train.json and in missing.json, then show both.
train_images = train_json['images']
missing_images = missing_json['images']
num_images, num_missing = len(train_images), len(missing_images)
print(num_images, ' ', num_missing)

1014544   12753


In [4]:
# Get image ids and labels.
# Each annotation contributes its integer imageId and its labelId list
# flattened into one space-separated string.
image_id = []
label_id = []
# Walk the annotations themselves rather than indexing them by the image count.
for annotation in train_json['annotations']:
    image_id.append(int(annotation['imageId']))
    # str.join builds the string in one pass; no leading space to trim afterwards.
    label_id.append(' '.join(annotation['labelId']))

In [5]:
# Get train image paths
# Folder names run from train_images_0 through train_images_10, in numeric order.
train_image_folders = ['train_images_' + str(folder_no) for folder_no in range(11)]
image_path = []

def get_index(path):
    """Return the integer image index encoded in an image file name.

    Both backslash and forward-slash separators are accepted, so the function
    works as a sort key for glob results on Windows and on POSIX systems.

    Args:
        path: Image path such as 'data_224/train_images_0/12.jpeg'.

    Returns:
        The part of the file name before its first '.', converted to int.

    Raises:
        ValueError: If that part of the file name is not an integer.
    """
    # Normalise separators first: splitting on the backslash alone leaves the
    # directory prefix attached whenever the path uses '/'.
    file_name = path.replace('\\', '/').rsplit('/', 1)[-1]
    return int(file_name.split('.')[0])

# Gather the jpeg paths folder by folder; inside each folder the files are
# ordered by get_index, and folders keep the order of train_image_folders.
collected_paths = []
for folder_nm in train_image_folders:
    jpeg_pattern = 'data_224/' + folder_nm + '/*.jpeg'
    collected_paths.extend(sorted(glob.glob(jpeg_pattern), key=get_index))

image_path = collected_paths

In [6]:
# Build the train table; the three lists are paired row by row, by position.
# NOTE(review): this assumes the annotation order matches the sorted file order - confirm.
train_columns = {'imageId': image_id, 'labelId': label_id, 'imagePath': image_path}
train_csv = pd.DataFrame(train_columns)

In [7]:
# Write the full train table without the index column.
# A forward slash is a valid separator on Windows and POSIX alike, and matches
# the 'data/...' paths used when reading the JSON files.
train_csv.to_csv('data/train_with_missing.csv', index=False)

In [8]:
# Get missing imageIds as integers
missing_id = [int(entry['imageId']) for entry in missing_json['images']]

In [9]:
# Keep only the rows whose imageId is not listed in missing_id.
is_missing = train_csv['imageId'].isin(missing_id)
train_without_missing = train_csv[~is_missing]

In [10]:
# Display (rows, columns) of the filtered train table.
train_without_missing.shape

(1001791, 3)

In [11]:
# Write the filtered train table without the index column.
# A forward slash is a valid separator on Windows and POSIX alike, and matches
# the 'data/...' paths used when reading the JSON files.
train_without_missing.to_csv('data/train_without_missing.csv', index=False)

## Validation

In [12]:
# Parse validation.json; the "with" block closes the file handle once the
# JSON has been read instead of leaving it open.
with open('data/validation.json') as validation_file:
    validation_json = json.load(validation_file)

In [13]:
# Count the image entries listed in validation.json and show the total.
validation_images = validation_json['images']
num_images = len(validation_images)
print(num_images)

9897


In [14]:
# Get image ids and labels.
# Each annotation contributes its integer imageId and its labelId list
# flattened into one space-separated string.
image_id = []
label_id = []
# Walk the annotations themselves rather than indexing them by the image count.
for annotation in validation_json['annotations']:
    image_id.append(int(annotation['imageId']))
    # str.join builds the string in one pass; no leading space to trim afterwards.
    label_id.append(' '.join(annotation['labelId']))

In [15]:
# Get validation image paths, sorted with get_index as the key.
# os.path.join builds the pattern with the platform's own separator: on
# Windows it yields the same backslash pattern as before, and elsewhere the
# glob now actually matches files.
validation_path_format = os.path.join('data_224', 'validation_images', '*.jpeg')
image_path = sorted(glob.glob(validation_path_format), key=get_index)

In [16]:
# Build the validation table; the three lists are paired row by row, by position.
validation_columns = {'imageId': image_id, 'labelId': label_id, 'imagePath': image_path}
validation_csv = pd.DataFrame(validation_columns)

In [17]:
# Preview the first rows of the validation table.
validation_csv.head()

Unnamed: 0,imageId,imagePath,labelId
0,1,data_224\validation_images\1.jpeg,62 17 66 214 105 137 85
1,2,data_224\validation_images\2.jpeg,95 17 66 214 164 137 20 204 184
2,3,data_224\validation_images\3.jpeg,122 19 66 186 180 44 154 20
3,4,data_224\validation_images\4.jpeg,190 222 66 153 164 226 53 184
4,5,data_224\validation_images\5.jpeg,62 66 153 171 111 137 70 204 184


In [18]:
# Write the validation table without the index column.
# A forward slash is a valid separator on Windows and POSIX alike, and matches
# the 'data/...' paths used when reading the JSON files.
validation_csv.to_csv('data/validation.csv', index=False)

## Test

In [19]:
# Parse test.json; the "with" block closes the file handle once the JSON has
# been read instead of leaving it open.
with open('data/test.json') as test_file:
    test_json = json.load(test_file)

In [20]:
# Count the image entries listed in test.json and show the total.
test_images = test_json['images']
num_images = len(test_images)
print(num_images)

39706


In [21]:
# Get image ids as integers
image_id = [int(entry['imageId']) for entry in test_json['images']]

In [22]:
# Get test image paths, sorted with get_index as the key.
# os.path.join builds the pattern with the platform's own separator: on
# Windows it yields the same backslash pattern as before, and elsewhere the
# glob now actually matches files.
test_path_format = os.path.join('data_224', 'test_images', '*.jpeg')
image_path = sorted(glob.glob(test_path_format), key=get_index)

In [23]:
# Build the test table; ids and paths are paired row by row, by position.
test_columns = {'imageId': image_id, 'imagePath': image_path}
test_csv = pd.DataFrame(test_columns)

In [24]:
# Preview the first rows of the test table.
test_csv.head()

Unnamed: 0,imageId,imagePath
0,1,data_224\test_images\1.jpeg
1,2,data_224\test_images\2.jpeg
2,3,data_224\test_images\3.jpeg
3,4,data_224\test_images\4.jpeg
4,5,data_224\test_images\5.jpeg


In [25]:
# Write the test table without the index column.
# A forward slash is a valid separator on Windows and POSIX alike, and matches
# the 'data/...' paths used when reading the JSON files.
test_csv.to_csv('data/test.csv', index=False)