# Sample the Image Dataset

*This notebook explores the images generated by the first notebook, down-samples the `no-kithara` images, and splits both image classes into train / test sets so that the dataset is small enough to train a locally built model.*

In [1]:
# import libraries
import pandas as pd
import numpy as np
import cv2
import glob
from sklearn.model_selection import train_test_split as tts
import os
import shutil
import csv

### Explore the Dataset
#### Check to see if both color and grayscale images are formatted correctly
*Note that InceptionV3 only accepts three-channel (RGB) images, not single-channel grayscale images, because of the dataset on which it was trained.*

In [2]:
# Load one color sample and one grayscale sample and compare their array shapes.
# NOTE: cv2.imread defaults to cv2.IMREAD_COLOR, which always decodes to a
# 3-channel BGR array, so this shows how the images load here rather than how
# they are stored on disk (pass cv2.IMREAD_UNCHANGED to see the stored channels).
color_path = "../data/images-cropped/kithara-full-resized/IMAG0009-kithara-full.jpg"
gray_path = "../data/images-cropped/kithara-full-resized/IMAG0029-kithara-full.jpg"

# read in a color image
color = cv2.imread(color_path)
# read in a grayscale image
gray = cv2.imread(gray_path)

# cv2.imread returns None instead of raising when a file is missing or unreadable,
# so fail here with the offending path rather than with an AttributeError on .shape
for checked_path, checked_img in ((color_path, color), (gray_path, gray)):
    if checked_img is None:
        raise FileNotFoundError("cv2.imread could not read image: {}".format(checked_path))

# check if both have three channels or not
print(color.shape)
print(gray.shape)

(558, 558, 3)
(558, 558, 3)


### Create train / test split of `kithara-full` and `no-kithara` images
#### 1. Use the image list `csv` to create a sample of `no-kithara` images and train / test split it

In [3]:
# load the filename list of the resized images that do not contain a kithara
no_kithara_csv = '../data/images-cropped/no-kithara-resized-filenames.csv'
no_kithara = pd.read_csv(no_kithara_csv)

In [4]:
# # create a new column that labels the images without a kithara as such
# no_kithara['kithara'] = 0

In [5]:
# preview the first five rows of the no-kithara filename table
no_kithara.head(n=5)

Unnamed: 0,filename
0,IMAG0009-1248-1248.jpg
1,IMAG0009-1248-1664.jpg
2,IMAG0009-1248-2080.jpg
3,IMAG0009-1248-2496.jpg
4,IMAG0009-1248-2912.jpg


In [6]:
# Draw a fixed-size random sample of the no-kithara images into 'nk';
# the rows that were not drawn end up in 'trash' and are not used afterwards.
# NOTE(review): the filenames suggest several crops per source image, so crops of
# one source image may land on both sides of the split below -- confirm this is intended.
sample_size = 2000
seed = 42
trash, nk = tts(no_kithara, test_size=sample_size, random_state=seed)

# hold out a quarter of the sampled rows for testing
train_nk, test_nk = tts(nk, test_size=0.25, random_state=seed)

In [7]:
# persist both halves of the no-kithara split as filename lists (row index omitted)
for frame, csv_path in (
    (train_nk, '../data/images-cropped/no-kithara-train-filenames.csv'),
    (test_nk, '../data/images-cropped/no-kithara-test-filenames.csv'),
):
    frame.to_csv(csv_path, index=False)

#### 2. Use the image list `csv` to train / test split the `kithara-full` images

In [8]:
# load the filename list of the resized images that contain a kithara
kithara_csv = '../data/images-cropped/kithara-full-resized-filenames.csv'
kithara = pd.read_csv(kithara_csv)

In [9]:
# # create a new column that labels the images with a kithara as such
# kithara['kithara'] = 1

In [10]:
# display the table of kithara image filenames as the cell output
kithara

Unnamed: 0,filename
0,IMAG0009-kithara-full.jpg
1,IMAG0029-kithara-full.jpg
2,IMAG0030-kithara-full.jpg
3,IMAG0034-kithara-full.jpg
4,IMAG0043-kithara-full.jpg
...,...
405,IMAG9949-kithara-full.jpg
406,IMAG9950a-kithara-full.jpg
407,IMAG9950b-kithara-full.jpg
408,IMAG9970-kithara-full.jpg


In [11]:
# hold out a quarter of the kithara images for testing; the seed fixes the shuffle
train_k, test_k = tts(kithara, random_state=42, test_size=0.25)

In [12]:
# persist both halves of the kithara split as filename lists (row index omitted)
for frame, csv_path in (
    (train_k, '../data/images-cropped/kithara-full-train-filenames.csv'),
    (test_k, '../data/images-cropped/kithara-full-test-filenames.csv'),
):
    frame.to_csv(csv_path, index=False)

#### Copy the selected, split, `resized` images to the `kithara-full-train`, `kithara-full-test`, `no-kithara-train`, and `no-kithara-test` folders

In [13]:
def copy_listed_images(list_csv, src_folder, dst_folder, label):
    """Copy every image named in a filename CSV from one folder to another.

    Parameters
    ----------
    list_csv : str
        Path to a CSV whose first row is a header and whose first column
        holds image filenames.
    src_folder : str
        Name of the folder, under ``<parent of cwd>/data/images-cropped``,
        that the images are copied from.
    dst_folder : str
        Name of the folder, under the same base directory, that the images
        are copied to. It is created if it does not exist.
    label : str
        Short description used in the progress and warning messages.

    Returns
    -------
    int
        Number of images that were copied.
    """
    # same base location as before, built portably instead of with '\\' literals
    base_dir = os.path.join(os.path.dirname(os.getcwd()), 'data', 'images-cropped')
    dir_src = os.path.join(base_dir, src_folder)
    dir_dst = os.path.join(base_dir, dst_folder)
    # shutil.copy does not create a missing destination folder
    os.makedirs(dir_dst, exist_ok=True)

    copied = 0
    failed = []
    with open(list_csv, 'r', newline='') as f:
        reader = csv.reader(f, delimiter=',')
        # skip the header row explicitly instead of relying on a failed copy
        next(reader, None)
        for row in reader:
            if not row:
                continue
            name = row[0]
            try:
                shutil.copy(os.path.join(dir_src, name), os.path.join(dir_dst, name))
            except OSError as err:
                # stay best-effort, but remember what went wrong so it can be reported
                failed.append((name, err))
            else:
                copied += 1

    if failed:
        print('WARNING: {} of {} {} images could not be copied, e.g. {}: {}'.format(
            len(failed), len(failed) + copied, label, failed[0][0], failed[0][1]))
    print('end of ' + label + ' file')
    return copied


# (filename csv, source folder, destination folder, label) for each class / split
copy_jobs = [
    # copy the training kithara images to a new folder
    ('../data/images-cropped/kithara-full-train-filenames.csv',
     'kithara-full-resized', 'kithara-full-train', 'kithara train'),
    # copy the testing kithara images to a new folder
    ('../data/images-cropped/kithara-full-test-filenames.csv',
     'kithara-full-resized', 'kithara-full-test', 'kithara test'),
    # copy the training no-kithara images to a new folder
    ('../data/images-cropped/no-kithara-train-filenames.csv',
     'no-kithara-resized', 'no-kithara-train', 'no-kithara train'),
    # copy the testing no-kithara images to a new folder
    ('../data/images-cropped/no-kithara-test-filenames.csv',
     'no-kithara-resized', 'no-kithara-test', 'no-kithara test'),
]

for list_csv, src_folder, dst_folder, label in copy_jobs:
    copy_listed_images(list_csv, src_folder, dst_folder, label)

end of kithara train file
end of kithara test file
end of no-kithara train file
end of no-kithara test file


#### Images are now standardized and structured for `keras` CNN modeling.