In [None]:
#importing the necessary modules
import shutil
from imutils import paths
from random import shuffle
import os
from datetime import datetime
import numpy as np
import pathlib
import pandas as pd

## Sort train images into train and validation folders

In [None]:
# Load the training metadata and derive the helper columns used for the split.
train_data = pd.read_csv('../data/train_corrected.csv')

# Turn every image id into a file name: trim whitespace, append the extension.
train_data['image_id'] = train_data['image_id'].map(lambda raw_id: raw_id.strip() + ".jpg")

# Composite "<turtle_id>-<value>" keys, one per row.
for suffix_column in ('image_id', 'image_location'):
    train_data['turtle_id-' + suffix_column] = train_data['turtle_id'] + "-" + train_data[suffix_column]

In [None]:
# Display the frame to inspect the derived columns
train_data

In [None]:
# Collect the path of every picture found under image_dir, in sorted order
image_dir = '../images/'
imagePaths = list(paths.list_images(image_dir))
imagePaths.sort()

In [None]:
# Keep the turtle_id column of train_data (one entry per row of the frame)
# and display it
turtle_ids = train_data['turtle_id']
turtle_ids

## Sort images
The sorted images are written to a subfolder "sorted_images" of the main folder; the next cell creates it if it does not exist yet.

In [None]:
# Create the output folder tree: for every individual (turtle_id) one empty
# folder for the training dataset and one for the validation dataset.

# folder in which the training and validation datasets will be placed
root_dir="../sorted_images"
os.makedirs(root_dir, exist_ok=True)

# turtle_ids holds one entry per row of train_data, so ids can repeat;
# visit each distinct id once instead of re-checking it for every row
for turtle_id in turtle_ids.unique():
    train_dir = root_dir + "/train/" + turtle_id  # full path of the training folder
    val_dir = root_dir + "/val/" + turtle_id  # full path of the validation folder
    # exist_ok=True keeps the cell re-runnable without a separate existence check
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(val_dir, exist_ok=True)



In [None]:
# The lines below (currently commented out) create a folder called "new_turtle" in our train and validation sub-folders.
# This is necessary for our specific task. (Reminder: if the image most likely does not belong to any turtle_id, the model needs to output "new_turtle".)
# These folders will not contain any pictures.
#os.makedirs("../sorted_images/train/new_turtle")
#os.makedirs("../sorted_images/val/new_turtle")

In [None]:
# Number of rows in train_data, i.e. number of relevant pictures
len(train_data)

In [None]:
# Split the pictures into a validation list and a training list.
# Rule: going through train_data in row order, the first picture seen for a
# given turtle_id/image_location combination goes to validation; every later
# picture with the same combination goes to training.
# Nothing here prevents very similar pictures from landing in both datasets,
# which could result in overfitting the CNN.

# Reference sizes: 300 validation pictures (approx 14%) and 1822 training
# pictures (approx 86%). The selection loop below does not read these values;
# the following cells display the actual list lengths.
N_val_pics=300
N_train_pics=1822

# "turtle_id-image_id" keys of the pictures assigned to each dataset
training_pictures=[]
validation_pictures=[]

# turtle_id/image_location combinations that already have a validation picture
turtle_id_image_location=[]

for location_key, picture_key in zip(train_data['turtle_id-image_location'],
                                     train_data['turtle_id-image_id']):
    if location_key in turtle_id_image_location:
        # combination already covered by validation -> use for training
        training_pictures.append(picture_key)
        continue
    validation_pictures.append(picture_key)
    turtle_id_image_location.append(location_key)

In [None]:
# Check whether the number of validation pictures is indeed 300 (the value of N_val_pics)
len(validation_pictures)

In [None]:
# Number of pictures assigned to the training list
len(training_pictures)

In [None]:
def _copy_pictures(picture_keys, split_name):
    """Copy pictures from image_dir into root_dir/<split_name>/<turtle_id>/.

    Parameters
    ----------
    picture_keys : iterable of str
        Keys of the form "<turtle_id>-<image file name>".
    split_name : str
        Sub-folder of root_dir to copy into ("val" or "train").

    Returns
    -------
    list of str
        The image file names, in the order they were copied.

    Notes
    -----
    Reads the notebook-level variables image_dir and root_dir. The files are
    copied with shutil.copy, so the originals stay in image_dir.
    """
    copied_names = []
    for key in picture_keys:
        # Split at the first "-" only, so a file name that itself contains "-"
        # (e.g. a timestamped name such as "01103F7D5A_2018-11-26_07-56-03.jpg")
        # stays intact instead of being cut down to its last segment.
        # Assumes turtle_ids contain no "-" -- TODO confirm against train_data.
        turtle_id, _, image_name = key.partition('-')

        source_path = image_dir + image_name
        target_path = root_dir + "/" + split_name + "/" + turtle_id + "/" + image_name

        shutil.copy(source_path, target_path)
        copied_names.append(image_name)
    return copied_names


# copy the validation pictures, then the training pictures, keeping the file names
val_images = _copy_pictures(validation_pictures, "val")
train_images = _copy_pictures(training_pictures, "train")

## Sort test images
To use our Train_CNN pipeline appropriately, we need to copy our test pictures into subfolders as well.

As we don't know the turtle_id for these pictures, we will save them into subfolders containing their image_id

In [None]:
# Load the test metadata twice: test_data keeps the image ids as read from the
# CSV, test_data_jpg carries the same rows with the ids turned into file names
# (whitespace trimmed, ".jpg" appended).
test_csv_path = '../data/test_corrected.csv'
test_data = pd.read_csv(test_csv_path)
test_data_jpg = pd.read_csv(test_csv_path)
test_data_jpg['image_id'] = test_data_jpg['image_id'].map(lambda raw_id: raw_id.strip() + ".jpg")

In [None]:
# folder in which the sorted datasets are placed
root_dir="../sorted_images/"
# Trailing slash matters: other cells build file paths as image_dir + file name
image_dir="../images/"

# create one folder per test picture, named after its image_id
for test_image_id in test_data['image_id']:
    test_dir_folder = root_dir + "test/" + test_image_id  # full path of the test folder
    # exist_ok=True keeps the cell re-runnable without a separate existence check
    os.makedirs(test_dir_folder, exist_ok=True)


In [None]:
# Copy every test picture into the folder named after its image_id.

root_dir="../sorted_images/"
image_dir="../images/"

# test_data provides the folder name, test_data_jpg the matching file name
for folder_name, image_name in zip(test_data['image_id'], test_data_jpg['image_id']):
    test_file_name = image_dir + image_name
    output_name = root_dir + "test/" + folder_name + "/" + image_name
    # copy the file (the original stays in image_dir)
    shutil.copy(test_file_name, output_name)

## Create dataframes with new folder structure

In [None]:
import os
import pandas as pd

In [None]:
# List every file below train_dir whose name contains ".jpg".
# Note: the following cell rebuilds train and df_train (with a
# case-insensitive directory order), so this listing is only an intermediate.
train_dir="../sorted_images/train/"
train = []

for current_dir, sub_dirs, file_names in os.walk(train_dir):
    # sorting in place steers the order in which os.walk descends
    sub_dirs.sort()
    train.extend(
        (sub_dirs, os.path.join(current_dir, name))
        for name in sorted(file_names)
        if ".jpg" in name
    )

df_train = pd.DataFrame(train, columns=['folder', 'image_id'])

In [None]:
def list_jpg_files(root):
    """Walk `root` and list every file below it whose name contains ".jpg".

    Sub-directories are visited in case-insensitive alphabetical order and the
    files inside each directory in sorted order.

    Parameters
    ----------
    root : str
        Directory to walk.

    Returns
    -------
    list of (str, str)
        One (folder, path) tuple per file: the name of the directory that
        directly contains the file, and the file path built from `root`.
    """
    records = []
    for current_dir, sub_dirs, file_names in os.walk(root):
        # sorting in place steers the order in which os.walk descends
        sub_dirs.sort(key=str.lower)
        # normpath drops a trailing separator so basename never returns ""
        folder = os.path.basename(os.path.normpath(current_dir))
        for file_name in sorted(file_names):
            if ".jpg" in file_name:
                records.append((folder, os.path.join(current_dir, file_name)))
    return records


train_dir="../sorted_images/train/"
val_dir="../sorted_images/val/"
test_dir="../sorted_images/test/"

# one frame per dataset with the containing folder and the path of each picture
train = list_jpg_files(train_dir)
df_train = pd.DataFrame(train,columns=['folder','image_id'])

val = list_jpg_files(val_dir)
df_val = pd.DataFrame(val,columns=['folder','image_id'])

test = list_jpg_files(test_dir)
df_test = pd.DataFrame(test,columns=['folder','image_id'])

In [None]:
# From here on only the file paths are needed: replace each frame by its
# 'image_id' column.
df_train, df_val, df_test = df_train['image_id'], df_val['image_id'], df_test['image_id']

In [None]:
# Display the training image paths
df_train

In [None]:
# Reduce every stored path to its bare file name.
# os.path.basename returns the complete file name whatever its length; slicing
# off a fixed number of trailing characters only works when every name has
# exactly that length.
train = [os.path.basename(path) for path in df_train]

val = [os.path.basename(path) for path in df_val]

test = [os.path.basename(path) for path in df_test]

In [None]:
# Spot-check the last extracted entry
train[-1]

In [None]:
# Look up the image_location of every sorted training picture in train_data
# and save the pairing as a CSV.
image_location_train = []
for file_name in train:
    # plain substring containment (file_name inside image_id), evaluated for
    # the whole column at once
    is_match = train_data['image_id'].str.contains(file_name, regex=False, na=False)
    locations = train_data.loc[is_match, 'image_location'].drop_duplicates()
    # Exactly one location per picture keeps image_location_train aligned with
    # train; appending every hit (or none) would shift all later entries.
    if len(locations) != 1:
        raise ValueError(
            "expected exactly one image_location for %r in train_data, found %d"
            % (file_name, len(locations))
        )
    image_location_train.append(locations.iloc[0])

d_train = {'image_id':train,'image_location':image_location_train}
df_train = pd.DataFrame(d_train)
df_train.to_csv('../data/df_sorted_train.csv', index = False)

In [None]:
# Look up the image_location of every sorted validation picture in train_data
# and save the pairing as a CSV.
image_location_val = []
for file_name in val:
    # plain substring containment (file_name inside image_id), evaluated for
    # the whole column at once
    is_match = train_data['image_id'].str.contains(file_name, regex=False, na=False)
    locations = train_data.loc[is_match, 'image_location'].drop_duplicates()
    # Exactly one location per picture keeps image_location_val aligned with
    # val; appending every hit (or none) would shift all later entries.
    if len(locations) != 1:
        raise ValueError(
            "expected exactly one image_location for %r in train_data, found %d"
            % (file_name, len(locations))
        )
    image_location_val.append(locations.iloc[0])

d_val = {'image_id':val,'image_location':image_location_val}
df_val = pd.DataFrame(d_val)
df_val.to_csv('../data/df_sorted_val.csv', index = False)

In [None]:
# Look up the image_location of every sorted test picture in test_data_jpg
# and save the pairing as a CSV.
image_location_test = []
for file_name in test:
    # plain substring containment (file_name inside image_id), evaluated for
    # the whole column at once
    is_match = test_data_jpg['image_id'].str.contains(file_name, regex=False, na=False)
    locations = test_data_jpg.loc[is_match, 'image_location'].drop_duplicates()
    # Exactly one location per picture keeps image_location_test aligned with
    # test; appending every hit (or none) would shift all later entries.
    if len(locations) != 1:
        raise ValueError(
            "expected exactly one image_location for %r in test_data_jpg, found %d"
            % (file_name, len(locations))
        )
    image_location_test.append(locations.iloc[0])

d_test = {'image_id':test,'image_location':image_location_test}
df_test = pd.DataFrame(d_test)
df_test.to_csv('../data/df_sorted_test.csv', index = False)