# Sample the Image Dataset

*This notebook explores the images generated in the first notebook and samples down the image dataset, using both a train/test split and a train/validation/test split, in order to optimize the data to run on a locally-built model.*

In [1]:
# import libraries
import pandas as pd
import numpy as np
import cv2
import glob
from sklearn.model_selection import train_test_split as tts
import os
import shutil
import csv

### Explore the Dataset
#### Check to see if both color and grayscale images are formatted correctly
*Note that Inceptionv3 only takes RGB images, not grayscale, due to the dataset on which it was trained.*

In [2]:
# paths to one image that looks like color and one that looks like grayscale
color_path = "../data/images-cropped/kithara-full-resized/IMAG0009-kithara-full.jpg"
gray_path = "../data/images-cropped/kithara-full-resized/IMAG0029-kithara-full.jpg"

# cv2.imread defaults to IMREAD_COLOR, which always returns a 3-channel array,
# so a default read cannot reveal a single-channel file; IMREAD_UNCHANGED keeps
# the channel layout that is stored on disk
color = cv2.imread(color_path, cv2.IMREAD_UNCHANGED)
gray = cv2.imread(gray_path, cv2.IMREAD_UNCHANGED)

# cv2.imread returns None (it does not raise) when a file cannot be read
for path, img in ((color_path, color), (gray_path, gray)):
    if img is None:
        raise FileNotFoundError("could not read image: {}".format(path))

# check if both have three channels or not:
# a 3-channel image has shape (height, width, 3), a single-channel one (height, width)
print(color.shape)
print(gray.shape)

(558, 558, 3)
(558, 558, 3)


### Create train / test *and* train / validation / test split of `kithara-full` and `no-kithara` images
#### 1. Use the image list `csv` to create a sample of `no-kithara` images and train / test *and* train / validation / test split it

In [3]:
# load the csv that lists the filenames of the resized images without a kithara
no_kithara_list_path = '../data/images-cropped/no-kithara-resized-filenames.csv'
no_kithara = pd.read_csv(no_kithara_list_path)

In [4]:
# preview the top five rows of the no-kithara filename list
no_kithara.head(n=5)

Unnamed: 0,filename
0,IMAG0009-1248-1248.jpg
1,IMAG0009-1248-1664.jpg
2,IMAG0009-1248-2080.jpg
3,IMAG0009-1248-2496.jpg
4,IMAG0009-1248-2912.jpg


##### Train / Test Split Only (75/25)

In [5]:
# draw a reproducible random sample of 5000 no-kithara images into 'nk';
# the rest of the list is bound to 'trash'
trash, nk = tts(
    no_kithara,
    random_state=42,
    test_size=5000,
)
# partition the sample 75/25 into training and testing sets
train_nk, test_nk = tts(
    nk,
    random_state=42,
    test_size=.25,
)

In [7]:
# persist the sampled no-kithara list and its train / test partitions as csv lists
for frame, out_path in (
    (nk, '../data/images-model/image-lists/no-kithara-filenames.csv'),
    (train_nk, '../data/images-model/image-lists/no-kithara-train-filenames.csv'),
    (test_nk, '../data/images-model/image-lists/no-kithara-test-filenames.csv'),
):
    frame.to_csv(out_path, index=False)

##### Train / Validation / Test Split (60/20/20)

In [17]:
# hold back 40% of the nk sample, which leaves 60% for training
train_nkv, test_val_nk = tts(
    nk,
    random_state=42,
    test_size=.4,
)
# halve the held-back 40% into the testing and validation sets (20% each)
test_nkv, val_nkv = tts(
    test_val_nk,
    random_state=42,
    test_size=.5,
)

In [18]:
# persist the no-kithara train / test / validation partitions as csv lists
for frame, out_path in (
    (train_nkv, '../data/images-model/image-lists/no-kithara-train60-filenames.csv'),
    (test_nkv, '../data/images-model/image-lists/no-kithara-test20-filenames.csv'),
    (val_nkv, '../data/images-model/image-lists/no-kithara-validation20-filenames.csv'),
):
    frame.to_csv(out_path, index=False)

#### 2. Use the image list `csv`s to train / test split the `kithara-full` images

In [9]:
# load the csv that lists the filenames of the resized images with a kithara
kithara_list_path = '../data/images-cropped/kithara-full-resized-filenames.csv'
kithara = pd.read_csv(kithara_list_path)

In [10]:
# display the kithara filename list read in the previous cell
kithara

Unnamed: 0,filename
0,IMAG0009-kithara-full.jpg
1,IMAG0029-kithara-full.jpg
2,IMAG0030-kithara-full.jpg
3,IMAG0034-kithara-full.jpg
4,IMAG0043-kithara-full.jpg
...,...
405,IMAG9949-kithara-full.jpg
406,IMAG9950a-kithara-full.jpg
407,IMAG9950b-kithara-full.jpg
408,IMAG9970-kithara-full.jpg


##### Train / Test Split Only (75/25)

In [11]:
# partition the kithara list 75/25 into training and testing sets
train_k, test_k = tts(
    kithara,
    random_state=42,
    test_size=.25,
)

In [12]:
# persist the kithara train / test partitions as csv lists
for frame, out_path in (
    (train_k, '../data/images-model/image-lists/kithara-full-train-filenames.csv'),
    (test_k, '../data/images-model/image-lists/kithara-full-test-filenames.csv'),
):
    frame.to_csv(out_path, index=False)

##### Train / Validation / Test Split (60/20/20)

In [11]:
# hold back 40% of the kithara list, which leaves 60% for training
train_kv, test_val_k = tts(
    kithara,
    random_state=42,
    test_size=.4,
)
# halve the held-back 40% into the testing and validation sets (20% each)
test_kv, val_kv = tts(
    test_val_k,
    random_state=42,
    test_size=.5,
)

In [14]:
# persist the kithara train / test / validation partitions as csv lists
for frame, out_path in (
    (train_kv, '../data/images-model/image-lists/kithara-train60-filenames.csv'),
    (test_kv, '../data/images-model/image-lists/kithara-test20-filenames.csv'),
    (val_kv, '../data/images-model/image-lists/kithara-validation20-filenames.csv'),
):
    frame.to_csv(out_path, index=False)

#### Copy the selected, 75/25 split, resized images to `train/kithara`, `test/kithara`, `train/no-kithara`, and `test/no-kithara` folders
*This method of generating the training / testing datasets leverages the splits created in this notebook.*

In [13]:
# Defined inside this cell so that the cell is self-contained.
def copy_listed_images(list_csv, src_dir, dst_dir, label):
    """Copy every image named in a filename-list csv from src_dir to dst_dir.

    Parameters
    ----------
    list_csv : str
        Path to a csv whose first column holds the image filenames and whose
        first row is the column header.
    src_dir : str
        Folder that holds the images to copy.
    dst_dir : str
        Folder to copy the images into; it is created when it does not exist.
    label : str
        Name of the list, used in the printed messages.

    Returns
    -------
    list of str
        The filenames that could not be copied.
    """
    os.makedirs(dst_dir, exist_ok=True)
    failed = []
    # newline='' is the file mode the csv module documents for its readers
    with open(list_csv, 'r', newline='') as f:
        rows = csv.reader(f, delimiter=',')
        # the first row is the column header, not an image name
        next(rows, None)
        for row in rows:
            if not row or not row[0]:
                continue  # ignore blank lines
            name = row[0]
            try:
                shutil.copy(os.path.join(src_dir, name), os.path.join(dst_dir, name))
            except OSError as err:
                # report the image that failed instead of hiding the error
                failed.append(name)
                print('could not copy {} ({}): {}'.format(name, label, err))
    # printed once the whole list has been processed
    print('end of {} file'.format(label))
    return failed


# the data folder sits next to the folder the notebook runs from; os.path.join
# keeps the paths valid on every operating system
project_dir = os.path.dirname(os.getcwd())
cropped_dir = os.path.join(project_dir, 'data', 'images-cropped')
model_dir = os.path.join(project_dir, 'data', 'images-model')

# (list csv, source folder, destination split folder, class folder, label)
copy_jobs_train_test = [
    ('../data/images-model/image-lists/kithara-full-train-filenames.csv',
     'kithara-full-resized', 'train', 'kithara', 'kithara train'),
    ('../data/images-model/image-lists/kithara-full-test-filenames.csv',
     'kithara-full-resized', 'test', 'kithara', 'kithara test'),
    ('../data/images-model/image-lists/no-kithara-train-filenames.csv',
     'no-kithara-resized', 'train', 'no-kithara', 'no-kithara train'),
    ('../data/images-model/image-lists/no-kithara-test-filenames.csv',
     'no-kithara-resized', 'test', 'no-kithara', 'no-kithara test'),
]

# copy the training and testing images of both classes to their new folders
for list_csv, src_folder, split, category, label in copy_jobs_train_test:
    copy_listed_images(
        list_csv,
        os.path.join(cropped_dir, src_folder),
        os.path.join(model_dir, split, category),
        label,
    )

end of kithara train file
end of kithara test file
end of no-kithara train file
end of no-kithara test file


#### Copy the selected, 60/20/20 split, resized images to `train/kithara`, `test/kithara`, `validate/kithara`, `train/no-kithara`, `test/no-kithara`, and `validate/no-kithara` folders
*This method of generating the training / testing / validation datasets leverages the splits created in this notebook.*

In [19]:
# Defined inside this cell so that the cell is self-contained.
def copy_listed_images(list_csv, src_dir, dst_dir, label):
    """Copy every image named in a filename-list csv from src_dir to dst_dir.

    Parameters
    ----------
    list_csv : str
        Path to a csv whose first column holds the image filenames and whose
        first row is the column header.
    src_dir : str
        Folder that holds the images to copy.
    dst_dir : str
        Folder to copy the images into; it is created when it does not exist.
    label : str
        Name of the list, used in the printed messages.

    Returns
    -------
    list of str
        The filenames that could not be copied.
    """
    os.makedirs(dst_dir, exist_ok=True)
    failed = []
    # newline='' is the file mode the csv module documents for its readers
    with open(list_csv, 'r', newline='') as f:
        rows = csv.reader(f, delimiter=',')
        # the first row is the column header, not an image name
        next(rows, None)
        for row in rows:
            if not row or not row[0]:
                continue  # ignore blank lines
            name = row[0]
            try:
                shutil.copy(os.path.join(src_dir, name), os.path.join(dst_dir, name))
            except OSError as err:
                # report the image that failed instead of hiding the error
                failed.append(name)
                print('could not copy {} ({}): {}'.format(name, label, err))
    # printed once the whole list has been processed
    print('end of {} file'.format(label))
    return failed


# the data folder sits next to the folder the notebook runs from; os.path.join
# keeps the paths valid on every operating system
project_dir = os.path.dirname(os.getcwd())
cropped_dir = os.path.join(project_dir, 'data', 'images-cropped')
model_dir = os.path.join(project_dir, 'data', 'images-model')

# (list csv, source folder, destination split folder, class folder, label)
copy_jobs_train_val_test = [
    ('../data/images-model/image-lists/kithara-train60-filenames.csv',
     'kithara-full-resized', 'train60', 'kithara', 'kithara train60'),
    ('../data/images-model/image-lists/kithara-test20-filenames.csv',
     'kithara-full-resized', 'test20', 'kithara', 'kithara test20'),
    ('../data/images-model/image-lists/kithara-validation20-filenames.csv',
     'kithara-full-resized', 'validate20', 'kithara', 'kithara validate20'),
    ('../data/images-model/image-lists/no-kithara-train60-filenames.csv',
     'no-kithara-resized', 'train60', 'no-kithara', 'no-kithara train60'),
    ('../data/images-model/image-lists/no-kithara-test20-filenames.csv',
     'no-kithara-resized', 'test20', 'no-kithara', 'no-kithara test20'),
    ('../data/images-model/image-lists/no-kithara-validation20-filenames.csv',
     'no-kithara-resized', 'validate20', 'no-kithara', 'no-kithara validate20'),
]

# copy the training, testing and validation images of both classes to their new folders
for list_csv, src_folder, split, category, label in copy_jobs_train_val_test:
    copy_listed_images(
        list_csv,
        os.path.join(cropped_dir, src_folder),
        os.path.join(model_dir, split, category),
        label,
    )

end of kithara train60 file
end of kithara test20 file
end of kithara validate20 file
end of no-kithara train60 file
end of no-kithara test20 file
end of no-kithara validate20 file


#### Copy the selected, resized images to `kithara` and `no-kithara` folders
*An alternate way to write images, with the train / test split happening in the data preparation later.*

In [15]:
# # copy all of the no-kithara images to a new folder
# with open('../data/images-model/image-lists/no-kithara-filenames.csv', 'r') as f:
#     for img in csv.reader(f, delimiter=','):
#         name = img[0]
#         dir_src = os.path.dirname(os.getcwd()) + '\\data\\images-cropped\\no-kithara-resized\\'
#         dir_dst = os.path.dirname(os.getcwd()) + '\\data\\images-model\\inceptionv3\\no-kithara\\'
#         try:
#             src_file = os.path.join(dir_src, name)
#             dst_file = os.path.join(dir_dst, name)
#             shutil.copy(src_file, dst_file)
#         except:
#             print('end of no-kithara file')

# # copy all of the kithara images to a new folder
# with open('../data/images-cropped/kithara-full-resized-filenames.csv', 'r') as f:
#     for img in csv.reader(f, delimiter=','):
#         name = img[0]
#         dir_src = os.path.dirname(os.getcwd()) + '\\data\\images-cropped\\kithara-full-resized\\'
#         dir_dst = os.path.dirname(os.getcwd()) + '\\data\\images-model\\inceptionv3\\kithara\\'
#         try:
#             src_file = os.path.join(dir_src, name)
#             dst_file = os.path.join(dir_dst, name)
#             shutil.copy(src_file, dst_file)
#         except:
#             print('end of kithara file')

end of no-kithara file
end of kithara file


#### Images are now standardized and structured for `keras` CNN modeling.