In [11]:
import pandas as pd
import os
import csv
import glob

## Download data from here: https://www.kaggle.com/c/dog-breed-identification

## Root folder of the extracted Kaggle dataset.
## Set the DOG_BREED_DATA_DIR environment variable to point somewhere else;
## the original author's local path is kept as the default.
BASE_PATH = os.environ.get(
    "DOG_BREED_DATA_DIR",
    "/Users/wctjerry/Documents/dataset/dog-breed-identification",
)
train_path = "train"          ## sub-folder of BASE_PATH holding the training images
map_csv_path = "labels.csv"   ## file in BASE_PATH mapping image id -> breed

## Collect the paths of all .jpg images directly inside the train folder.
## Sorted, because glob returns files in arbitrary (filesystem) order.
train_all_files_path = sorted(glob.glob(os.path.join(BASE_PATH, train_path, "*.jpg")))

In [12]:
## Build the id -> breed lookup table.
## The labels.csv file contains two columns: id and corresponding breed

id_breed_dict = {}

## newline="" is what the csv module requires for files handed to a reader,
## so that newlines embedded in quoted fields are parsed correctly.
with open(os.path.join(BASE_PATH, map_csv_path), newline="") as f:
    reader = csv.DictReader(f)
    for r in reader:
        id_breed_dict[r['id']] = r['breed']

In [13]:
def move_file(file_path, id_to_breed=None):
    """Move one image into a sub-folder named after its breed.

    The breed folder is created inside the directory that currently holds
    the image if it does not exist yet.

    INPUT:
        file_path: path of a single image file. The file name without its
            extension is the image id.
        id_to_breed: optional mapping from image id to breed name. When
            omitted, the notebook-level ``id_breed_dict`` is used.
    OUTPUT: None
    RAISES:
        KeyError: if the image id has no entry in the mapping.
    """
    if id_to_breed is None:
        id_to_breed = id_breed_dict

    file_name = os.path.basename(file_path)
    # splitext removes only the final extension, so an id that itself
    # contains a dot is not truncated at the first dot.
    image_id = os.path.splitext(file_name)[0]
    breed = id_to_breed[image_id]
    target_path = os.path.join(os.path.dirname(file_path), breed)

    # exist_ok=True replaces the separate "exists?" check, which could fail
    # if the folder appeared between the check and the mkdir call.
    os.makedirs(target_path, exist_ok=True)

    os.rename(str(file_path), os.path.join(target_path, file_name))

In [14]:
## Iterate to move images so as to meet the Input structure requirement of Create ML
##
## Final structure is like:
##   affenpinscher
##   -- 00ca18751837cd6a22813f8e221f7819.jpg
##   -- 1a012faf98b32039adc17ee708bcb360.jpg
##   -- 1aa98488ce65f9b57b468ccde4f1cad8.jpg
##   -- ...
##   afghan_hound
##   -- 0a4f1e17d720cdff35814651402b7cf4.jpg
##   -- 0d5a88f0ab2db8d34b533c69768135e8.jpg
##   -- ...
##   ...

## Paths that are no longer on disk are skipped: once this cell has run, the
## entries in train_all_files_path are stale (the images now live in their
## breed folders), and re-running the cell would otherwise make os.rename
## fail with FileNotFoundError.
for file_path in train_all_files_path:
    if os.path.isfile(file_path):
        move_file(file_path)

In [15]:
## Hold out 5% of the images of each breed folder and move them to the test path.
## The original dataset does ship test data, but it has no labels,
## so this labelled 5% split is created for testing and for the AI vs. human comparison.

import random

SPLIT_SEED = 42        ## fixed seed: a fresh run always holds out the same images
TEST_FRACTION = 0.05   ## share of each breed's images that is moved to the test path

validation = "test_generated"
validation_path = os.path.join(BASE_PATH, validation)
breeds = set(id_breed_dict.values())
id_breed_validation_dict = {}

## Private generator, so the global `random` state is left untouched.
split_rng = random.Random(SPLIT_SEED)

os.makedirs(validation_path, exist_ok=True)

## Iterate in sorted order: the iteration order of a set of strings can differ
## between interpreter runs, which would change what the seeded generator picks.
for breed in sorted(breeds):
    breed_test_path = os.path.join(validation_path, breed)
    os.makedirs(breed_test_path, exist_ok=True)

    already_held_out = sorted(glob.glob(os.path.join(breed_test_path, "*.jpg")))
    if already_held_out:
        ## This breed was split by an earlier run of this cell. Do not sample a
        ## second time (that would move another 5%); just record what is there,
        ## so the id file written below stays complete.
        for v in already_held_out:
            i = os.path.splitext(os.path.basename(v))[0]
            id_breed_validation_dict[i] = breed
        continue

    ## Sorted, because glob returns files in arbitrary (filesystem) order.
    all_file_path_by_breed = sorted(glob.glob(os.path.join(BASE_PATH, train_path, breed, "*.jpg")))
    validation_breed = split_rng.sample(all_file_path_by_breed, int(len(all_file_path_by_breed) * TEST_FRACTION))

    for v in validation_breed:
        i = os.path.splitext(os.path.basename(v))[0]
        id_breed_validation_dict[i] = id_breed_dict[i]
        os.rename(str(v), os.path.join(breed_test_path, os.path.basename(v)))

## `with` closes the file even if the write fails.
with open(os.path.join(validation_path, "test_generated_dict.txt"), "w") as f:
    f.write(str(id_breed_validation_dict))