# Preparing the Data to Train the Bounding Box Model

## Initial Setup

Automatically reload modules from the code base whenever they change

In [1]:
# Load IPython's autoreload extension and switch it to mode 2 so that edits to
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Import libraries

In [11]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

import os
import shutil
import zipfile
import glob
from tqdm import tqdm
import skimage
from PIL import Image
from PIL.ImageDraw import Draw
from torchvision import transforms

from src.data import pil_loader
from src.image_preprocessing import draw_dots, draw_rectangle

## Sample data to be used for training and validating the bounding box model

In [3]:
# Read the competition metadata tables: one (train, val) pair per product category.
df_beauty_train, df_beauty_val = (
    pd.read_csv('data/raw/beauty_data_info_train_competition.csv'),
    pd.read_csv('data/raw/beauty_data_info_val_competition.csv'),
)

df_fashion_train, df_fashion_val = (
    pd.read_csv('data/raw/fashion_data_info_train_competition.csv'),
    pd.read_csv('data/raw/fashion_data_info_val_competition.csv'),
)

df_mobile_train, df_mobile_val = (
    pd.read_csv('data/raw/mobile_data_info_train_competition.csv'),
    pd.read_csv('data/raw/mobile_data_info_val_competition.csv'),
)

In [4]:
# Draw a reproducible 500-row sample (fixed seed 2019) from each category's
# training table and renumber the rows from zero.
df_beauty_sample, df_fashion_sample, df_mobile_sample = (
    train_df.sample(n=500, random_state=2019).reset_index(drop=True)
    for train_df in (df_beauty_train, df_fashion_train, df_mobile_train)
)

In [5]:
# NOTE(review): everything in this cell is disabled (commented-out) code.
# It refers to `*_train_sample` / `*_val_sample` frames that are not defined in
# the cells visible in this notebook, so it would not run as written if it were
# re-enabled. TODO confirm whether this cell can simply be deleted.

# df_beauty_bb_train = pd.DataFrame({'image_path': df_beauty_train_sample['image_path']}).reset_index(drop=True)
# df_beauty_bb_val = pd.DataFrame({'image_path': df_beauty_val_sample['image_path']}).reset_index(drop=True)

# df_fashion_bb_train = pd.DataFrame({'image_path': df_fashion_train_sample['image_path']}).reset_index(drop=True)
# df_fashion_bb_val = pd.DataFrame({'image_path': df_fashion_val_sample['image_path']}).reset_index(drop=True)

# df_mobile_bb_train = pd.DataFrame({'image_path': df_mobile_train_sample['image_path']}).reset_index(drop=True)
# df_mobile_bb_val = pd.DataFrame({'image_path': df_mobile_val_sample['image_path']}).reset_index(drop=True)

# df_beauty_bb_train.to_csv('data/derived/beauty_image_path_train.csv', index=False)
# df_beauty_bb_val.to_csv('data/derived/beauty_image_path_val.csv', index=False)

# df_fashion_bb_train.to_csv('data/derived/fashion_image_path_train.csv', index=False)
# df_fashion_bb_val.to_csv('data/derived/fashion_image_path_val.csv', index=False)

# df_mobile_bb_train.to_csv('data/derived/mobile_image_path_train.csv', index=False)
# df_mobile_bb_val.to_csv('data/derived/mobile_image_path_val.csv', index=False)

## Output sampled images

Beauty

In [6]:
# Copy every sampled beauty image from the raw data folder into the
# bounding-box working folder, keeping the relative path from the CSV.
# Roots are relative to the project directory, like every other path in this
# notebook, instead of being tied to one user's home directory.
bb_src_root = os.path.join('data', 'raw')
bb_dest_root = os.path.join('data', 'derived', 'bb_images')

for img_path_short in tqdm(df_beauty_sample['image_path']):
    # Append the '.jpg' extension when the listed path lacks it.
    if not img_path_short.endswith('.jpg'):
        img_path_short += '.jpg'
    img_path_full = os.path.join(bb_src_root, img_path_short)
    dest_path_full = os.path.join(bb_dest_root, img_path_short)
    # shutil.copy does not create missing parent folders, so make sure the
    # destination sub-directory exists before copying.
    os.makedirs(os.path.dirname(dest_path_full), exist_ok=True)
    shutil.copy(img_path_full, dest_path_full)

100%|██████████| 500/500 [00:16<00:00, 29.89it/s]


Fashion

In [7]:
# Copy every sampled fashion image from the raw data folder into the
# bounding-box working folder, keeping the relative path from the CSV.
# Roots are relative to the project directory, like every other path in this
# notebook, instead of being tied to one user's home directory.
bb_src_root = os.path.join('data', 'raw')
bb_dest_root = os.path.join('data', 'derived', 'bb_images')

for img_path_short in tqdm(df_fashion_sample['image_path']):
    # Append the '.jpg' extension when the listed path lacks it.
    if not img_path_short.endswith('.jpg'):
        img_path_short += '.jpg'
    img_path_full = os.path.join(bb_src_root, img_path_short)
    dest_path_full = os.path.join(bb_dest_root, img_path_short)
    # shutil.copy does not create missing parent folders, so make sure the
    # destination sub-directory exists before copying.
    os.makedirs(os.path.dirname(dest_path_full), exist_ok=True)
    shutil.copy(img_path_full, dest_path_full)

100%|██████████| 500/500 [00:25<00:00, 15.51it/s]


Mobile

In [8]:
# Copy every sampled mobile image from the raw data folder into the
# bounding-box working folder, keeping the relative path from the CSV.
# Roots are relative to the project directory, like every other path in this
# notebook, instead of being tied to one user's home directory.
bb_src_root = os.path.join('data', 'raw')
bb_dest_root = os.path.join('data', 'derived', 'bb_images')

for img_path_short in tqdm(df_mobile_sample['image_path']):
    # Append the '.jpg' extension when the listed path lacks it.
    if not img_path_short.endswith('.jpg'):
        img_path_short += '.jpg'
    img_path_full = os.path.join(bb_src_root, img_path_short)
    dest_path_full = os.path.join(bb_dest_root, img_path_short)
    # shutil.copy does not create missing parent folders, so make sure the
    # destination sub-directory exists before copying.
    os.makedirs(os.path.dirname(dest_path_full), exist_ok=True)
    shutil.copy(img_path_full, dest_path_full)

100%|██████████| 500/500 [00:19<00:00, 26.18it/s]


In [13]:
# Show which beauty images ended up in the bounding-box folder.
beauty_jpg_pattern = 'data/derived/bb_images/beauty_image/*.jpg'
glob.glob(beauty_jpg_pattern)

['data/derived/bb_images/beauty_image/be190be3146d9cd88501be5c44ce67b9.jpg',
 'data/derived/bb_images/beauty_image/bbac977e8eb187479c5e88b68e340bb4.jpg',
 'data/derived/bb_images/beauty_image/aef8adf75eb343c217a5881016f860d4.jpg',
 'data/derived/bb_images/beauty_image/1d922cbcd8ac15ba535e10d302b0f11e.jpg',
 'data/derived/bb_images/beauty_image/3af85f5a138884604a7f6246d4f5e185.jpg',
 'data/derived/bb_images/beauty_image/0456864c4b19ee9ed5a37dbc9e833d1c.jpg',
 'data/derived/bb_images/beauty_image/113a5d19b28705729673e8325b6b2342.jpg',
 'data/derived/bb_images/beauty_image/95983185e4c84532092ab0ba55ea8c74.jpg',
 'data/derived/bb_images/beauty_image/374c2f8735f1ea7d5c522a4cc4745b01.jpg',
 'data/derived/bb_images/beauty_image/d54f7c51a9ee2639678cb43ceb6ef7ea.jpg',
 'data/derived/bb_images/beauty_image/5782138ebcc1d0e6d026a5d7da4daf2f.jpg',
 'data/derived/bb_images/beauty_image/e6d32e05b610da9409fb870ce75a317e.jpg',
 'data/derived/bb_images/beauty_image/954dc53c3b07c49c1235e4a20cada6ed.jpg',

In [15]:
# Bundle the copied sample images of every category into a single archive.
# Categories are processed in the order beauty, fashion, mobile.
with zipfile.ZipFile('bb_images.zip', 'w') as archive:
    for jpg_pattern in (
        'data/derived/bb_images/beauty_image/*.jpg',
        'data/derived/bb_images/fashion_image/*.jpg',
        'data/derived/bb_images/mobile_image/*.jpg',
    ):
        for jpg_path in glob.glob(jpg_pattern):
            archive.write(jpg_path)

Image paths of samples

In [20]:
# Save only the `image_path` column of each category's sample, one CSV per
# category, without writing the row index.
for sample_df, csv_path in (
    (df_beauty_sample, 'data/derived/beauty_image_path_bb.csv'),
    (df_fashion_sample, 'data/derived/fashion_image_path_bb.csv'),
    (df_mobile_sample, 'data/derived/mobile_image_path_bb.csv'),
):
    sample_df[['image_path']].to_csv(csv_path, index=False)