# Sort images

This notebook creates the (sub)folder structure for the train and test images.
It sorts the training images into train and val folders such that, for each turtle, the validation subfolder named after its turtle id receives one image per image location.
The remaining training images are sorted into the train folder.
The goal is to optimize the train/validation split.

The test images are sorted via their image id.

In [1]:
#importing the necessary modules
import shutil
from imutils import paths
from random import shuffle
import os
from datetime import datetime
import numpy as np
import pathlib
import pandas as pd

## Sort train images into train and validation folders

In [18]:
# Load the corrected training labels and turn every image id into its file name.
train_data = pd.read_csv('../data/train_corrected.csv')
train_data['image_id'] = train_data['image_id'].apply(lambda raw_id: raw_id.strip()+".jpg")

# Derived string keys combining turtle id, image location and image file name.
turtle_and_location = train_data['turtle_id'] + "-" + train_data['image_location']
train_data['turtle_id-image_id'] = train_data['turtle_id'] + "-" + train_data['image_id']
train_data['turtle_id-image_location'] = turtle_and_location
train_data['turtle_id-image_location+image_id'] = turtle_and_location + "+" + train_data['image_id']

In [19]:
# Inspect the training table together with its derived key columns.
train_data

Unnamed: 0,image_id,image_location,turtle_id,turtle_id-image_id,turtle_id-image_location,turtle_id-image_location+image_id
0,ID_009TNNQ8.jpg,left,t_id_Kf73l69A,t_id_Kf73l69A-ID_009TNNQ8.jpg,t_id_Kf73l69A-left,t_id_Kf73l69A-left+ID_009TNNQ8.jpg
1,ID_010JDNBL.jpg,right,t_id_GrxmyS59,t_id_GrxmyS59-ID_010JDNBL.jpg,t_id_GrxmyS59-right,t_id_GrxmyS59-right+ID_010JDNBL.jpg
2,ID_01L54J3D.jpg,left,t_id_d6aYXtor,t_id_d6aYXtor-ID_01L54J3D.jpg,t_id_d6aYXtor-left,t_id_d6aYXtor-left+ID_01L54J3D.jpg
3,ID_02445SY2.jpg,top,t_id_GrxmyS59,t_id_GrxmyS59-ID_02445SY2.jpg,t_id_GrxmyS59-top,t_id_GrxmyS59-top+ID_02445SY2.jpg
4,ID_026RZIZ9.jpg,right,t_id_hRzOoJ2t,t_id_hRzOoJ2t-ID_026RZIZ9.jpg,t_id_hRzOoJ2t-right,t_id_hRzOoJ2t-right+ID_026RZIZ9.jpg
...,...,...,...,...,...,...
2117,ID_ZWHZBX92.jpg,top,t_id_Kc1tXDbJ,t_id_Kc1tXDbJ-ID_ZWHZBX92.jpg,t_id_Kc1tXDbJ-top,t_id_Kc1tXDbJ-top+ID_ZWHZBX92.jpg
2118,ID_ZXM8MLS2.jpg,top,t_id_2QmcRkNj,t_id_2QmcRkNj-ID_ZXM8MLS2.jpg,t_id_2QmcRkNj-top,t_id_2QmcRkNj-top+ID_ZXM8MLS2.jpg
2119,ID_ZY15TQYT.jpg,left,t_id_stWei2Uq,t_id_stWei2Uq-ID_ZY15TQYT.jpg,t_id_stWei2Uq-left,t_id_stWei2Uq-left+ID_ZY15TQYT.jpg
2120,ID_ZYTRP3VN.jpg,left,t_id_ip3jsrYo,t_id_ip3jsrYo-ID_ZYTRP3VN.jpg,t_id_ip3jsrYo-left,t_id_ip3jsrYo-left+ID_ZYTRP3VN.jpg


In [4]:
# Collect the paths of all image files found under image_dir, in sorted order.
image_dir = '../images/'
imagePaths = sorted(paths.list_images(image_dir))

In [5]:
# Keep the turtle_id column of train_data (one entry per training image).
turtle_ids = train_data.turtle_id
turtle_ids

0       t_id_Kf73l69A
1       t_id_GrxmyS59
2       t_id_d6aYXtor
3       t_id_GrxmyS59
4       t_id_hRzOoJ2t
            ...      
2117    t_id_Kc1tXDbJ
2118    t_id_2QmcRkNj
2119    t_id_stWei2Uq
2120    t_id_ip3jsrYo
2121    t_id_m2JvEcsg
Name: turtle_id, Length: 2122, dtype: object

## Sort images
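To sort the images, a subfolder for the sorted images is needed in the main folder. The cell below creates "sorted_images_locationwise" for the train/validation split if it does not exist yet.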

In [6]:
# Create two (initially empty) folders for each individual turtle:
# one for the training and another for the validation dataset.

# Folder where the training and validation datasets will be placed.
root_dir="../sorted_images_locationwise"
# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create pattern.
os.makedirs(root_dir, exist_ok=True)

# turtle_ids holds one entry per image, so iterate over the distinct ids only.
for turtle_id in turtle_ids.unique():
    for split in ("train", "val"):
        # Resulting layout: <root_dir>/<split>/<turtle_id>
        os.makedirs(os.path.join(root_dir, split, turtle_id), exist_ok=True)

In [7]:
# The (commented-out) lines below create a folder called "new_turtle" in our train and validation sub-folders.
#This is necessary for our specific task. (Reminder: if the image most likely does not belong to any turtle_id, the model needs to output "new_turtle")
#These folders will not contain any pictures
#os.makedirs("../sorted_images/train/new_turtle")
#os.makedirs("../sorted_images/val/new_turtle")

In [7]:
#Check length of train_data, i.e. number of relevant pictures
# (len() already returns an integer, so no rounding is needed)
len(train_data)

2122

In [17]:
# Display the training table once more before the split is built.
# NOTE(review): the saved output of this cell does not show the
# 'turtle_id-image_location+image_id' column, so it looks older than the
# current loading cell — re-run to refresh.
train_data

Unnamed: 0,image_id,image_location,turtle_id,turtle_id-image_id,turtle_id-image_location
0,ID_009TNNQ8.jpg,left,t_id_Kf73l69A,t_id_Kf73l69A-ID_009TNNQ8.jpg,t_id_Kf73l69A-left
1,ID_010JDNBL.jpg,right,t_id_GrxmyS59,t_id_GrxmyS59-ID_010JDNBL.jpg,t_id_GrxmyS59-right
2,ID_01L54J3D.jpg,left,t_id_d6aYXtor,t_id_d6aYXtor-ID_01L54J3D.jpg,t_id_d6aYXtor-left
3,ID_02445SY2.jpg,top,t_id_GrxmyS59,t_id_GrxmyS59-ID_02445SY2.jpg,t_id_GrxmyS59-top
4,ID_026RZIZ9.jpg,right,t_id_hRzOoJ2t,t_id_hRzOoJ2t-ID_026RZIZ9.jpg,t_id_hRzOoJ2t-right
...,...,...,...,...,...
2117,ID_ZWHZBX92.jpg,top,t_id_Kc1tXDbJ,t_id_Kc1tXDbJ-ID_ZWHZBX92.jpg,t_id_Kc1tXDbJ-top
2118,ID_ZXM8MLS2.jpg,top,t_id_2QmcRkNj,t_id_2QmcRkNj-ID_ZXM8MLS2.jpg,t_id_2QmcRkNj-top
2119,ID_ZY15TQYT.jpg,left,t_id_stWei2Uq,t_id_stWei2Uq-ID_ZY15TQYT.jpg,t_id_stWei2Uq-left
2120,ID_ZYTRP3VN.jpg,left,t_id_ip3jsrYo,t_id_ip3jsrYo-ID_ZYTRP3VN.jpg,t_id_ip3jsrYo-left


In [8]:
# Location-wise train/validation split.
# For every turtle_id/image_location combination, the first image encountered in
# train_data goes to validation; every further image of that combination goes to training.
# With the current data this yields 300 validation pictures (approx 14%) and
# 1822 training pictures (approx 86%).
# As of now nothing prevents very similar pictures from ending up in both datasets,
# which could result in overfitting the CNN.

#expected number of validation pictures and of training pictures
N_val_pics=300
N_train_pics=1822

#lists of the pictures ("<turtle_id>-<image file name>") that are going to be copied
#to the training and validation folders
training_pictures=[]
validation_pictures=[]

#combinations already covered by a validation picture, in order of first appearance
turtle_id_image_location=[]
#the same combinations as a set, so each membership test is a hash lookup instead of a list scan
seen_combinations=set()

#walk the rows in table order; zip avoids relying on the index labels being 0..n-1
for combination, picture in zip(train_data['turtle_id-image_location'],
                                train_data['turtle_id-image_id']):
    if combination not in seen_combinations:
        seen_combinations.add(combination)
        turtle_id_image_location.append(combination)
        validation_pictures.append(picture)
    else:
        training_pictures.append(picture)

In [None]:
# Rebuild the location-wise split from scratch.
# The lists are reset first: appending to lists that were already filled would
# classify every image (including the validation ones) as a training picture.
training_pictures = []
validation_pictures = []
turtle_id_image_location = []

# Set mirror of turtle_id_image_location for fast membership tests.
seen_combinations = set()

# The first image of each turtle_id/image_location combination goes to validation,
# all later ones go to training. Both lists store the 'turtle_id-image_id' key
# (train_data has no 'image_id-image_location' column).
for combination, picture in zip(train_data['turtle_id-image_location'],
                                train_data['turtle_id-image_id']):
    if combination not in seen_combinations:
        seen_combinations.add(combination)
        turtle_id_image_location.append(combination)
        validation_pictures.append(picture)
    else:
        training_pictures.append(picture)

In [10]:
# Sanity check: the number of validation pictures should be 300 (N_val_pics)
len(validation_pictures)

300

In [11]:
# Sanity check: number of pictures assigned to training (N_train_pics is set to 1822)
len(training_pictures)

1822

In [16]:
# Inspect the validation entries; each one has the form "<turtle_id>-<image file name>"
validation_pictures

['t_id_Kf73l69A-ID_009TNNQ8.jpg',
 't_id_GrxmyS59-ID_010JDNBL.jpg',
 't_id_d6aYXtor-ID_01L54J3D.jpg',
 't_id_GrxmyS59-ID_02445SY2.jpg',
 't_id_hRzOoJ2t-ID_026RZIZ9.jpg',
 't_id_dhdJMT1K-ID_04LIX78Y.jpg',
 't_id_fjHGjp1w-ID_05OAB9HN.jpg',
 't_id_m2JvEcsg-ID_08R685XU.jpg',
 't_id_Ts5LyVQz-ID_08VEIX80.jpg',
 't_id_fxTQ5vHC-ID_092X1NP4.jpg',
 't_id_AOWArhGb-ID_0B15SK84.jpg',
 't_id_uJXT7dGu-ID_0D2ZJPGL.jpg',
 't_id_stWei2Uq-ID_0DOWCQVL.jpg',
 't_id_2QmcRkNj-ID_0E36WAY8.jpg',
 't_id_8b8sprYe-ID_0F3ODH2R.jpg',
 't_id_Ts5LyVQz-ID_0FP7W8WI.jpg',
 't_id_n2FBHk6d-ID_0FX6ODC0.jpg',
 't_id_G5JLzvai-ID_0G8SI7HQ.jpg',
 't_id_n2FBHk6d-ID_0I5FAS7W.jpg',
 't_id_3K93fQBS-ID_0ILV12LP.jpg',
 't_id_dVQ4x3wz-ID_0INOHY6I.jpg',
 't_id_dVQ4x3wz-ID_0J1NXY6D.jpg',
 't_id_tjWepji1-ID_0JC13MBA.jpg',
 't_id_uIlC9Gfo-ID_0JDDE04I.jpg',
 't_id_MwnEYfqe-ID_0KMB0NN4.jpg',
 't_id_72SiiZCp-ID_0KQJ1TYW.jpg',
 't_id_g9Fz8PH7-ID_0LHYKDEU.jpg',
 't_id_GOIvCduN-ID_0MGFWAOV.jpg',
 't_id_ksTLswDT-ID_0NFH1PZ6.jpg',
 't_id_e9i3Lbq

In [15]:
# Spot check on the last entry: the second-to-last '-'-separated part is the
# turtle_id when the entry has the form "<turtle_id>-<image file name>"
validation_pictures[-1].split('-')[-2]

't_id_4XiPKIk7'

In [None]:
# Copy the selected pictures from the flat image folder into the per-turtle
# validation and training folders. The following cell repeats the same copy.
val_images = []
train_images = []

# validation pictures -> <root_dir>/val/<turtle_id>/<image file name>
for picture in validation_pictures:
    parts = picture.split('-')
    image_name = parts[-1]
    val_file_name = image_dir + image_name

    # target path: first part of the entry is the turtle_id folder
    output_name_val = root_dir + "/val/" + parts[0] + "/" + image_name

    # copy the file (the source image stays in image_dir)
    shutil.copy(val_file_name, output_name_val)

    # remember which file names ended up in validation
    val_images.append(image_name)

# training pictures -> <root_dir>/train/<turtle_id>/<image file name>
for picture in training_pictures:
    parts = picture.split('-')
    image_name = parts[-1]
    train_file_name = image_dir + image_name

    # target path: first part of the entry is the turtle_id folder
    output_name_train = root_dir + "/train/" + parts[0] + "/" + image_name

    # copy the file (the source image stays in image_dir)
    shutil.copy(train_file_name, output_name_train)

    # remember which file names ended up in training
    train_images.append(image_name)

In [12]:
def copy_pictures_to_split(pictures, split, image_dir, root_dir):
    """Copy pictures into ``<root_dir>/<split>/<turtle_id>/``.

    Parameters
    ----------
    pictures : iterable of str
        Entries of the form ``"<turtle_id>-<image file name>"``.
    split : str
        Name of the split folder below ``root_dir`` (here ``"val"`` or ``"train"``).
    image_dir : str
        Folder prefix of the source images; the file name is appended to it directly.
    root_dir : str
        Folder that contains the split folders.

    Returns
    -------
    list of str
        The file names of the copied images, in input order.
    """
    copied = []
    for picture in pictures:
        # Split at the FIRST '-' only, so file names that contain hyphens stay intact.
        turtle_id, _, image_name = picture.partition('-')
        source = image_dir + image_name
        target = root_dir + "/" + split + "/" + turtle_id + "/" + image_name
        # The file is copied, not moved: the source image stays in image_dir.
        shutil.copy(source, target)
        copied.append(image_name)
    return copied


# File names that were copied into the validation and the training folders.
val_images = copy_pictures_to_split(validation_pictures, "val", image_dir, root_dir)
train_images = copy_pictures_to_split(training_pictures, "train", image_dir, root_dir)

## Sort test images
To use our Train_CNN pipeline appropriately, we need to move our test pictures into subfolders as well.

As we don't know the turtle_id for these pictures, we will save them into subfolders containing their image_id

In [13]:
# Read the test table once: test_data keeps the raw image ids, while
# test_data_jpg is a copy whose image_id column holds the image file names.
test_data = pd.read_csv('../data/test_corrected.csv')
test_data_jpg = test_data.copy()
test_data_jpg['image_id'] = test_data_jpg['image_id'].apply(lambda x: x.strip()+".jpg")

In [14]:
#define the folder where the sorted test images will be placed and the folder holding the source images
root_dir="../sorted_images/"
image_dir="../images/"
#create one folder per test image, named after its image_id as stored in test_data
for image_id in test_data['image_id']:
    # exist_ok=True keeps the cell re-runnable; missing parent folders are created as well
    os.makedirs(root_dir + "test/" + image_id, exist_ok=True)


In [15]:
# Copy every test image into the folder named after its image_id.

root_dir = "../sorted_images/"
image_dir = "../images/"

# test_data holds the folder names, test_data_jpg the matching file names (same row order).
for image_id, image_name in zip(test_data['image_id'], test_data_jpg['image_id']):
    test_file_name = image_dir + image_name
    output_name = root_dir + "test/" + image_id + "/" + image_name
    # copy the file (the source image stays in image_dir)
    shutil.copy(test_file_name, output_name)

## Create dataframes with new folder structure

In [16]:
import os
import pandas as pd

In [17]:
# Collect every file below the sorted train folder whose name contains ".jpg".
train_dir = "../sorted_images/train/"
train = []

for current_dir, subdirs, files in os.walk(train_dir):
    # in-place sort, so os.walk also descends into the sub-folders in sorted order
    subdirs.sort()
    train.extend(
        (subdirs, os.path.join(current_dir, file_name))
        for file_name in sorted(files)
        if ".jpg" in file_name
    )

# 'folder' holds the sub-folder list of the walked directory, 'image_id' the file path
df_train = pd.DataFrame(train, columns=['folder', 'image_id'])

In [18]:
def list_jpg_files(directory):
    """Collect every ``.jpg`` file below ``directory`` in a deterministic order.

    Sub-directories are visited in sorted order and the files inside each
    directory are sorted as well.

    Returns a list of ``(subdirs, path)`` tuples: ``subdirs`` is the list of
    sub-directory names of the folder that contains the file, ``path`` is the
    file name joined onto the walked directory. A directory that does not
    exist yields an empty list.
    """
    found = []
    for current_dir, subdirs, files in os.walk(directory):
        # in-place sort, so os.walk also descends into the sub-folders in sorted order
        subdirs.sort()
        for file_name in sorted(files):
            # match the extension itself, not any name that merely contains ".jpg"
            if file_name.endswith(".jpg"):
                found.append((subdirs, os.path.join(current_dir, file_name)))
    return found


# NOTE(review): the train/val copy cells write below "../sorted_images_locationwise",
# while the folders read here live below "../sorted_images" — confirm which root is intended.
train_dir = "../sorted_images/train/"
train = list_jpg_files(train_dir)
df_train = pd.DataFrame(train, columns=['folder', 'image_id'])

val_dir = "../sorted_images/val/"
val = list_jpg_files(val_dir)
df_val = pd.DataFrame(val, columns=['folder', 'image_id'])

# Separate name for the test folder, so val_dir keeps pointing at the validation folder.
test_dir = "../sorted_images/test/"
test = list_jpg_files(test_dir)
df_test = pd.DataFrame(test, columns=['folder', 'image_id'])

In [19]:
# Keep only the file-path column of each table (each name now refers to a Series).
df_train, df_val, df_test = (
    frame['image_id'] for frame in (df_train, df_val, df_test)
)

In [20]:
# Inspect the collected training image paths
df_train

0       ../sorted_images/train/t_id_0DPPpRUz/ID_96LWSV...
1       ../sorted_images/train/t_id_0DPPpRUz/ID_HONVLG...
2       ../sorted_images/train/t_id_0DPPpRUz/ID_R7GK2Y...
3       ../sorted_images/train/t_id_0DPPpRUz/ID_SBK2IX...
4       ../sorted_images/train/t_id_0DPPpRUz/ID_UIR6EX...
                              ...                        
1817    ../sorted_images/train/t_id_utw0thCe/ID_TKDIA5...
1818    ../sorted_images/train/t_id_utw0thCe/ID_TN3XKG...
1819    ../sorted_images/train/t_id_utw0thCe/ID_VSXDS4...
1820    ../sorted_images/train/t_id_utw0thCe/ID_W69Q85...
1821    ../sorted_images/train/t_id_utw0thCe/ID_WKZM3K...
Name: image_id, Length: 1822, dtype: object

In [21]:
# Reduce every collected path to its file name.
# os.path.basename works for file names of any length, unlike a fixed
# "last 15 characters" slice.
train = [os.path.basename(path) for path in df_train]

val = [os.path.basename(path) for path in df_val]

test = [os.path.basename(path) for path in df_test]

In [22]:
# Spot check: the last entry should be a bare image file name
train[-1]

'ID_WKZM3KRR.jpg'

In [23]:
# Look up the image_location of every sorted training image by its file name.
# One dict lookup per image replaces the nested scan over train_data. A file name
# that is missing from train_data raises a KeyError here, instead of silently
# producing columns of unequal length.
location_by_image = dict(zip(train_data['image_id'], train_data['image_location']))
image_location_train = [location_by_image[image_name] for image_name in train]

# Save file name and image location of the sorted training images.
d_train = {'image_id':train,'image_location':image_location_train}
df_train = pd.DataFrame(d_train)
df_train.to_csv('../data/df_sorted_train.csv', index = False)

In [24]:
# Look up the image_location of every sorted validation image by its file name.
# One dict lookup per image replaces the nested scan over train_data. A file name
# that is missing from train_data raises a KeyError here, instead of silently
# producing columns of unequal length.
location_by_image = dict(zip(train_data['image_id'], train_data['image_location']))
image_location_val = [location_by_image[image_name] for image_name in val]

# Save file name and image location of the sorted validation images.
d_val = {'image_id':val,'image_location':image_location_val}
df_val = pd.DataFrame(d_val)
df_val.to_csv('../data/df_sorted_val.csv', index = False)

In [25]:
# Look up the image_location of every sorted test image by its file name.
# One dict lookup per image replaces the nested scan over test_data_jpg. A file name
# that is missing from test_data_jpg raises a KeyError here, instead of silently
# producing columns of unequal length.
location_by_image = dict(zip(test_data_jpg['image_id'], test_data_jpg['image_location']))
image_location_test = [location_by_image[image_name] for image_name in test]

# Save file name and image location of the sorted test images.
d_test = {'image_id':test,'image_location':image_location_test}
df_test = pd.DataFrame(d_test)
df_test.to_csv('../data/df_sorted_test.csv', index = False)