In [1]:
import pandas as pd
import numpy as np
import glob
import PIL
from PIL import Image

from sklearn.model_selection import train_test_split
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img

import shutil
import os

In [2]:
# Mount Google Drive (Colab only) so the project folders under
# /content/gdrive/MyDrive/ used by the cells below are reachable.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# --- Folders ---------------------------------------------------------------
# read_dir : source dataset, one subfolder per image class.
# write_dir: parent folder under which the Test/ and Train/ folders are (re)created.
# test_dir / train_dir: names of the output folders; the trailing '/' is required
# because paths are built by plain string concatenation.
read_dir = '/content/gdrive/MyDrive/MMAI 894 Team Project/data_final/'
write_dir = '/content/gdrive/MyDrive/MMAI 894 Team Project/'
test_dir = 'Test/'
train_dir = 'Train/'
# --- Data split ------------------------------------------------------------
# test_split: fraction of all images held out as the test set.
# aug_split : fraction of the training set used as sources for augmentation.
test_split = 0.2
aug_split = 0.1 
# Images generated per source image when its class has MORE than img_cutoff
# images in the augmentation subset.
small_num_gen = 1
# Images generated per source image when its class has img_cutoff images or
# FEWER in the augmentation subset.
big_num_gen = 3
# Per-class image count (counted within the augmentation subset, i.e. after
# aug_split is applied) that separates the two cases above.
img_cutoff = 20
    

In [4]:
def retrieve_data(rootdir):
    """List every entry found in each immediate subfolder of ``rootdir``.

    Parameters
    ----------
    rootdir : str
        Directory whose immediate subdirectories each hold the files of one
        class. Files sitting directly in ``rootdir`` are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per entry, with columns ``img_path`` (path of the entry) and
        ``folder`` (subfolder name followed by ``'/'``). The index runs from
        0 to n-1. When ``rootdir`` has no subfolders an empty frame with the
        same two columns is returned instead of raising.
    """
    # Use the entry name directly rather than stripping rootdir out of the
    # full path: that stays correct whether or not rootdir ends with a slash.
    # Sorting makes the row order independent of the filesystem listing order.
    with os.scandir(rootdir) as entries:
        folders = sorted(entry.name + '/' for entry in entries if entry.is_dir())

    frames = []
    for folder in folders:
        # Escape the directory part so glob metacharacters in folder names
        # (e.g. '[') are matched literally; only the trailing '*' is a pattern.
        pattern = glob.escape(os.path.join(rootdir, folder)) + '*'
        frame = pd.DataFrame({'img_path': sorted(glob.glob(pattern))})
        frame['folder'] = folder
        frames.append(frame)

    # pd.concat raises on an empty list, so handle "no subfolders" explicitly.
    if not frames:
        return pd.DataFrame(columns=['img_path', 'folder'])

    # Combine all classes into one frame with a fresh 0..n-1 index.
    return pd.concat(frames, ignore_index=True)

# Example usage: df = retrieve_data(read_dir)

In [5]:
def split_data(img_list, split_per, random_state=None):
    """Randomly split ``img_list`` into two parts.

    Parameters
    ----------
    img_list : pandas.DataFrame
        Rows to split (any input accepted by ``train_test_split`` works).
    split_per : float
        Passed to ``train_test_split`` as ``test_size``: the share of rows
        placed in the second returned part (e.g. 0.2 gives an 80/20 split,
        0.1 gives 90/10).
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. The default ``None`` keeps the
        previous behaviour (a different split on every call); pass an integer
        to make the split reproducible.

    Returns
    -------
    tuple
        ``(remaining_part, held_out_part)`` in that order.
    """
    remaining_part, held_out_part = train_test_split(
        img_list, test_size=split_per, random_state=random_state
    )
    return remaining_part, held_out_part

In [6]:
def setup_folders(rootdir, folders, fold_type):
    """(Re)create an empty output folder with one subfolder per class.

    Parameters
    ----------
    rootdir : str
        Parent directory, ending with a path separator (paths are built by
        string concatenation).
    folders : pandas.DataFrame
        Frame with a ``folder`` column; each unique value becomes a subfolder.
    fold_type : str
        Name of the output folder under ``rootdir``, e.g. ``'Train/'``.

    Notes
    -----
    Any existing ``rootdir + fold_type`` tree is deleted first, so the folder
    always starts empty.
    """
    path = rootdir + fold_type
    # Wipe a previous run's output. On the very first run the folder does not
    # exist yet, and an unconditional rmtree would raise FileNotFoundError.
    if os.path.isdir(path):
        shutil.rmtree(path)
    # makedirs also creates missing parents, unlike mkdir.
    os.makedirs(path)
    # One subfolder per distinct class.
    for subfolder in folders['folder'].unique():
        os.makedirs(path + subfolder, exist_ok=True)

In [7]:
#Put a copy of the images in each directory based on the split
def write_to_file(rootdir, img_list, fold_type):
    """Copy each listed image into its class subfolder of ``rootdir + fold_type``.

    Parameters
    ----------
    rootdir : str
        Parent directory, ending with a path separator.
    img_list : pandas.DataFrame
        Frame with ``img_path`` (source file) and ``folder`` (class subfolder,
        ending with ``'/'``) columns. Its index is ignored.
    fold_type : str
        Output folder name under ``rootdir``, e.g. ``'Train/'``.

    A progress line is printed after the 1st, 1001st, 2001st, ... copy.
    """
    target_root = rootdir + fold_type
    total = len(img_list)
    # Walk the two columns positionally so the frame's index labels are irrelevant.
    sources = img_list['img_path'].tolist()
    subfolders = img_list['folder'].tolist()

    for position, (source, subfolder) in enumerate(zip(sources, subfolders)):
        shutil.copy2(source, target_root + subfolder)
        if position % 1000 == 0:
            print(str(position) + " images copied of " + str(total) + " for " + fold_type)

In [8]:
#Generate images for the 10% of the training set
def generate_data(rootdir, img_set):
    """Write randomly augmented copies of the images in ``img_set`` to the training folders.

    Each source image is loaded and passed through an ``ImageDataGenerator``
    (shifts, brightness, zoom, horizontal/vertical flips). The generated
    images are saved as JPEG files, with names starting with ``generated_``,
    under ``rootdir + train_dir + <folder>``.

    The number of images generated per source image depends on how many
    images its class has *within* ``img_set``: classes with more than
    ``img_cutoff`` images get ``small_num_gen`` each, all others get
    ``big_num_gen`` each.

    Reads the notebook-level settings ``train_dir``, ``img_cutoff``,
    ``small_num_gen`` and ``big_num_gen``.

    Parameters
    ----------
    rootdir : str
        Parent directory of the training folder, ending with a path separator.
    img_set : pandas.DataFrame
        Frame with ``img_path`` and ``folder`` columns (``folder`` ends with
        ``'/'``). Its index is ignored and the caller's frame is not modified.
    """
    datagen = ImageDataGenerator(
            width_shift_range=0.1,
            height_shift_range=0.1,
            brightness_range=[0.9,1.1],
            zoom_range=0.1,
            horizontal_flip=True,
            vertical_flip=True,
            fill_mode='reflect')

    img_set = img_set.reset_index(drop=True)
    if img_set.empty:
        return

    # Number of images per class inside img_set, aligned row-by-row with
    # img_set. Computed once from the argument; the previous version
    # recomputed an unused groupby on a global for every image.
    class_counts = img_set.groupby('folder')['img_path'].transform('count')

    for img_path, folder, class_count in zip(img_set['img_path'], img_set['folder'], class_counts):
        # Well-populated classes get fewer generated images per source image.
        num_to_generate = small_num_gen if class_count > img_cutoff else big_num_gen
        if num_to_generate <= 0:
            continue

        # The context manager closes the file handle once the pixels are read.
        with Image.open(img_path) as img:
            x = img_to_array(np.asarray(img))
        # flow() expects a batch, so add a leading batch axis of size 1.
        x = x.reshape((1,) + x.shape)

        # Put the source file's stem in the prefix. flow() names saved files
        # from the prefix plus a short random suffix, so a prefix shared by
        # every source image lets outputs silently overwrite each other.
        stem = os.path.splitext(os.path.basename(img_path))[0]
        flow = datagen.flow(x, batch_size=1,
                            save_to_dir=rootdir + train_dir + folder,
                            save_prefix='generated_' + stem,
                            save_format='jpeg')

        # Each batch pulled from the iterator saves one transformed image.
        # The iterator never ends on its own, so stop after the wanted count.
        for generated, _ in enumerate(flow, start=1):
            if generated >= num_to_generate:
                break

# Example usage: generate_data(write_dir, Aug_set)

In [9]:
# 1. Index every image under read_dir (one row per file, tagged with its class folder).
df = retrieve_data(read_dir)
# 2. Hold out test_split of the images as the test set.
Train_set, Test_set = split_data(df, test_split)
# 3. Recreate the Train/ and Test/ folder trees under write_dir.
#    WARNING: setup_folders deletes whatever those folders contained before.
setup_folders(write_dir, Train_set, train_dir)
setup_folders(write_dir, Test_set, test_dir)
# 4. Copy the images into their class subfolders.
write_to_file(write_dir, Train_set, train_dir)
write_to_file(write_dir, Test_set, test_dir)
# 5. Pick aug_split of the training images as augmentation sources.
#    NOTE: this rebinds Train_set to the remaining (1 - aug_split) share, so
#    re-running only this cell's later lines will not reproduce the same split.
Train_set, Aug_set = split_data(Train_set, aug_split)
# 6. Write augmented copies of the selected images into the Train/ class folders.
generate_data(write_dir, Aug_set)

0 images copied of 8116 for Train_Images/
1000 images copied of 8116 for Train_Images/
2000 images copied of 8116 for Train_Images/
3000 images copied of 8116 for Train_Images/
4000 images copied of 8116 for Train_Images/
5000 images copied of 8116 for Train_Images/
6000 images copied of 8116 for Train_Images/
7000 images copied of 8116 for Train_Images/
8000 images copied of 8116 for Train_Images/
0 images copied of 2029 for Test_Images/
1000 images copied of 2029 for Test_Images/
2000 images copied of 2029 for Test_Images/
