In [1]:
import os, gc, sys
import h5py, json, math
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
from os import listdir
from matplotlib import pyplot as plt
from collections import defaultdict

  from ._conv import register_converters as _register_converters


In [6]:
# Helper functions
def memo_obj(obj):
    print(sys.getsizeof(obj)/ 1024**2," MB")

def get_metadata(path):  
    raw_json = json.loads(open(path).read())
    num_items = raw_json['EXPECTED_QUANTITY'] 
    return num_items

def get_image_and_labels(img_path, label_path):
    image_names = filter( lambda x: x.endswith(".jpg"), os.listdir(img_path))
    image_names = [os.path.splitext(x)[0] for x in image_names]
    label_names = filter( lambda x: x.endswith(".json"), os.listdir(label_path))
    label_names = [os.path.splitext(x)[0] for x in label_names]
    assert(len(label_names) == len(image_names))
    return image_names, label_names

def intersection(lst1, lst2):
    return list(set(lst1) & set(lst2))

# Create a dataframe for train and val set


In [16]:
base_path = "..\\..\\..\\train_data"
csv_path = os.path.join(base_path,"file_labels")
label_path = os.path.join(base_path,"metadata")
img_path = os.path.join(base_path,"bin-images")

In [12]:
# Load in the image and labels
image_names, label_names = get_image_and_labels(img_path, label_path)

536434

In [13]:
# Get the intersection of the two sets
with_label_image_names = intersection(image_names, label_names)
print("Total number of with_label_image_names:", len(with_label_image_names))

Total number of with_label_image_names: 536432


In [17]:
all_files = defaultdict(list)
for name in with_label_image_names:
    all_files['file_name'].append(str(name))
    all_files['label'].append(get_metadata(label_path + '/' + name + '.json'))
all_files = pd.DataFrame(all_files)
all_files.head()

KeyboardInterrupt: 

In [None]:
# Calculating the sharpness
sharpness_list = []
for i in range(len(df)):
    filename = img_path + '\\'+ all_files['file_name'][i] +'.jpg'
    im = Image.open(filename).convert('L') # to grayscale
    array = np.asarray(im, dtype=np.int32)
    gy, gx = np.gradient(array)
    gnorm = np.sqrt(gx**2 + gy**2)
    sharpness = np.average(gnorm)
    sharpness_list.append(sharpness)
    
print(len(sharpness_list))

In [None]:
# Save to folder
all_files['sharpness'] = pd.Series(sharpness_list, dtype = np.float32)
print(all_files.dtypes)
all_files.to_csv(csv_path + '\\all_file_label_sharp.csv', index = None)

In [19]:
all_files = pd.read_csv(csv_path + '\\all_file_label_sharp.csv', dtype = {'file_name':str, 'lable':np.int8, 'sharpness':np.float32})
all_files.dtypes

file_name     object
label          int64
sharpness    float32
dtype: object

In [23]:
# Select 10 percent samples for val and test each
test_size = len(all_files) // 10
df_test = all_files.loc[:test_size, :]
df_val = all_files.loc[test_size+1:2*test_size+1, :]
df_train = all_files.loc[2*test_size+2:, :]
print("all_files.shape:", all_files.shape)
print("df_test.shape:", df_test.shape)
print("df_val.shape:", df_val.shape)
print("df_train.shape:", df_train.shape)

df_train.to_csv(csv_path + '\\hard_train.csv', index = None)
df_val.to_csv(csv_path + '\\hard_val.csv', index = None)
df_test.to_csv(csv_path + '\\hard_test.csv', index = None)

all_files.shape: (536432, 3)
df_test.shape: (53644, 3)
df_val.shape: (53644, 3)
df_train.shape: (429144, 3)


In [25]:
moderate = all_files[all_files['label'] <= 5]
moderate.index = range(len(moderate))
moderate.shape

(361967, 3)

In [28]:
# Only select 15000 samples each for val and test
test_size = len(moderate) // 10
df_test = moderate.loc[:test_size, :]
df_val = moderate.loc[test_size+1:2*test_size+1, :]
df_train = moderate.loc[2*test_size+2:, :]
print("df_test.shape:", df_test.shape)
print("df_val.shape:", df_val.shape)
print("df_train.shape:", df_train.shape)

df_train.to_csv(csv_path + '\\moderate_train.csv', index = None)
df_val.to_csv(csv_path + '\\moderate_val.csv', index = None)
df_test.to_csv(csv_path + '\\moderate_test.csv', index = None)

df_test.shape: (36197, 3)
df_val.shape: (36197, 3)
df_train.shape: (289573, 3)
