# Sample the Image Dataset

*This notebook explores the images generated by the first notebook and downsamples the image dataset, then applies a train/test split, so that the data is small enough to train a locally-built model.*

In [41]:
# import libraries
import pandas as pd
import numpy as np
import cv2
import glob
from sklearn.model_selection import train_test_split as tts
import os
import shutil

### Explore the Dataset
#### Check to see if both color and grayscale images are formatted correctly
*Note that InceptionV3 only accepts three-channel (RGB) images, not single-channel grayscale images, because of the dataset on which it was trained.*

In [9]:
# paths to one color image and one grayscale image
color_path = "../data/images-cropped/kithara-full-resized/IMAG0009-kithara-full.jpg"
gray_path = "../data/images-cropped/kithara-full-resized/IMAG0029-kithara-full.jpg"

# read in both images
color = cv2.imread(color_path)
gray = cv2.imread(gray_path)

# cv2.imread returns None (it does not raise) when a file is missing or unreadable,
# so fail with a clear message instead of an AttributeError on `.shape` below
for image_path, image in ((color_path, color), (gray_path, gray)):
    if image is None:
        raise FileNotFoundError("could not read image: " + image_path)

# check if both have three channels or not
# NOTE: with its default flag (cv2.IMREAD_COLOR) imread always decodes to three channels,
# so these shapes show how the images load, not how they are stored on disk;
# pass cv2.IMREAD_UNCHANGED as the second argument to inspect the stored channel count
print(color.shape)
print(gray.shape)

(558, 558, 3)
(558, 558, 3)


### Use the image list `csv`s to create a sample of `no-kithara` images

In [28]:
# load the CSV that lists the images without a kithara
no_kithara_csv = '../data/images-cropped/no-kithara-resized-filenames.csv'
no_kithara = pd.read_csv(no_kithara_csv)

In [29]:
# add a 'kithara' label column; 0 marks every row as an image without a kithara
no_kithara = no_kithara.assign(kithara=0)

In [30]:
# preview the first five rows to confirm the new label column
no_kithara.head(5)

Unnamed: 0,filename,kithara
0,IMAG0009-1248-1248.jpg,0
1,IMAG0009-1248-1664.jpg,0
2,IMAG0009-1248-2080.jpg,0
3,IMAG0009-1248-2496.jpg,0
4,IMAG0009-1248-2912.jpg,0


In [31]:
# number of no-kithara images to keep in the sample
nk_sample_size = 2000

# Randomly select `nk_sample_size` images and assign them to a new df named 'nk'.
# With an integer test_size, the second output holds exactly that many rows;
# the first output (the remaining rows) is not needed.
trash, nk = tts(no_kithara, test_size = nk_sample_size, random_state = 42)

# train/test split the sampled nk dataframe (not the full no_kithara dataframe,
# otherwise the sample above would be ignored)
train_nk, test_nk = tts(nk, test_size = .25, random_state = 42)

In [32]:
# load the CSV that lists the images with a kithara
kithara_csv = '../data/images-cropped/kithara-full-resized-filenames.csv'
kithara = pd.read_csv(kithara_csv)

In [33]:
# add a 'kithara' label column; 1 marks every row as an image with a kithara
kithara = kithara.assign(kithara=1)

In [34]:
# preview the first five rows to confirm the new label column
kithara.head(5)

Unnamed: 0,filename,kithara
0,IMAG0009-kithara-full.jpg,1
1,IMAG0029-kithara-full.jpg,1
2,IMAG0030-kithara-full.jpg,1
3,IMAG0034-kithara-full.jpg,1
4,IMAG0043-kithara-full.jpg,1


In [35]:
# split the kithara dataframe: 75% of rows for training, 25% held out for testing
kithara_split = tts(kithara, test_size=0.25, random_state=42)
train_k, test_k = kithara_split

In [58]:
# copy the kithara training images into a new folder
# NOTE(review): the filename list was read from 'kithara-full-resized-filenames.csv',
# but this cell reads images from 'kithara-full/' -- confirm this is the intended source folder
src_dir = '../data/images-cropped/kithara-full/'
dst_dir = '../data/images-cropped/kithara-full-train/'

# shutil.copy does not create missing directories, so make sure the target exists
os.makedirs(dst_dir, exist_ok=True)

# the filenames in train_k keep their '.jpg' extension, so compare against the full
# basename of each file; a set makes each membership check constant-time
train_names = set(train_k['filename'])

copied = 0
for path in glob.glob(os.path.join(src_dir, '*.jpg')):
    name = os.path.basename(path)
    if name in train_names:
        # copy from the full source path, not the bare filename
        shutil.copy(path, os.path.join(dst_dir, name))
        copied += 1

# report the result so that a wrong source folder or filename mismatch is visible
print('copied {} of {} training images'.format(copied, len(train_names)))