# Create Training & Validation Images

In [7]:
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import skimage.feature

# One label column per class; a patch row stores how many animals of each
# class were counted inside that patch.
classes = "adult_males subadult_males adult_females juveniles pups".split()

# Two independent, initially empty label tables: training and validation.
patch_df, val_patch_df = (pd.DataFrame(columns=classes) for _ in range(2))

## Set your data paths and patch parameters

In [8]:
# --- Input folders (Kaggle download) ---------------------------------------
train_path = 'Train/'              # plain training images
train_dot_path = 'TrainDotted/'    # dotted copies of the training images

# --- Output folders for the cropped patches --------------------------------
sea_lion_path = '300x300/sea_lion/'
background_path = '300x300/background/'

# --- Patch geometry ---------------------------------------------------------
width = 300   # patch side length, in pixels
r = 1         # scale-down factor applied before tiling

# --- Image ids to process ---------------------------------------------------
train_nb = 947
# Ids that are excluded from processing.
bad_train_ids = {
    3, 7, 9, 21, 30, 34, 71, 81, 89, 97,
    151, 184, 215, 234, 242, 268, 290, 311, 331, 344,
    380, 384, 406, 421, 469, 475, 490, 499, 507, 530,
    531, 605, 607, 614, 621, 638, 644, 687, 712, 721,
    767, 779, 781, 794, 800, 811, 839, 840, 869, 882,
    901, 903, 905, 909, 913, 927, 946,
}

# Every id in [0, train_nb) that is not excluded, in ascending order.
tids = sorted(set(range(train_nb)).difference(bad_train_ids))

# One '<id>.jpg' file name per remaining id.
filenames = ['{}.jpg'.format(tid) for tid in tids]

In [10]:
def _dot_class(b, g, R):
    """Map a blurred TrainDotted pixel colour (B, G, R channels) to a class index.

    Returns the column index into `classes` for the matched dot colour, or
    None when the colour matches none of the five dot colours.
    """
    if R > 225 and b < 25 and g < 25:            # RED
        return 0
    if R > 225 and b > 225 and g < 25:           # MAGENTA
        return 1
    if R < 75 and b < 50 and 150 < g < 200:      # GREEN
        return 4
    if R < 75 and 150 < b < 200 and g < 75:      # BLUE
        return 3
    if 60 < R < 120 and b < 50 and g < 75:       # BROWN
        return 2
    return None


# Cumulative quotas per category: the first *_VAL_COUNT recorded patches go to
# the validation table, later ones go to the training table until
# *_TOTAL_COUNT patches of that category have been recorded in total.
SEA_LION_VAL_COUNT = 703             # ~5% of the sea-lion patches
SEA_LION_TOTAL_COUNT = 14064
BACKGROUND_VAL_COUNT = 2110          # ~5% of the background patches
BACKGROUND_TOTAL_COUNT = 14064 * 3

sea_lion_0 = 0   # background patches recorded so far
sea_lion_1 = 0   # sea-lion patches recorded so far

# Rows are collected in plain lists and turned into DataFrames once at the
# end; growing a DataFrame one .loc row at a time copies it on every insert.
train_names, train_rows = [], []
val_names, val_rows = [], []

# cv2.imwrite returns False instead of raising when the target folder is
# missing, so make sure both output folders exist up front.
for out_dir in (sea_lion_path, background_path):
    os.makedirs(out_dir, exist_ok=True)

for filename in filenames:
    filename_number = filename[:-4]

    # Lightweight progress indicator: report every 20th image id.
    if int(filename_number) % 20 == 0:
        print(filename)

    # Read the Train and Train Dotted images.
    image_1 = cv2.imread(train_dot_path + filename)
    image_2 = cv2.imread(train_path + filename)
    if image_1 is None or image_2 is None:
        # cv2.imread signals a missing/unreadable file by returning None.
        raise IOError('could not read ' + filename + ' from ' +
                      train_dot_path + ' and/or ' + train_path)

    # Blurred dotted image: used only to sample the dot colour at blob centres.
    img1 = cv2.GaussianBlur(image_1, (5, 5), 0)

    # Absolute difference between Train and Train Dotted, kept only where the
    # dotted image is not (near) black.
    image_3 = cv2.absdiff(image_1, image_2)
    mask_1 = cv2.cvtColor(image_1, cv2.COLOR_BGR2GRAY)
    mask_1[mask_1 < 50] = 0
    mask_1[mask_1 > 0] = 255
    image_4 = cv2.bitwise_or(image_3, image_3, mask=mask_1)

    # Collapse to a single channel so skimage.feature.blob_log accepts it.
    image_6 = np.max(image_4, axis=2)

    # Detect blobs; each row is (y, x, sigma).
    blobs = skimage.feature.blob_log(image_6, min_sigma=3, max_sigma=7, num_sigma=1, threshold=0.05)

    h, w = image_2.shape[:2]

    # res[i, j, k]: number of class-k dots falling in patch column i, row j
    # of the scaled image.
    res = np.zeros((int((w * r) // width) + 1, int((h * r) // width) + 1, 5), dtype='int16')

    for y, x, _sigma in blobs:
        # Colour of the blurred Train Dotted pixel at the blob centre.
        b, g, R = img1[int(y), int(x)]
        label = _dot_class(b, g, R)
        if label is not None:
            res[int((x * r) // width), int((y * r) // width), label] += 1

    # Crop patches from the same scaled geometry the label grid uses, so the
    # labels and the pixels stay aligned for any scale factor r.
    # NOTE(review): patches come from the unmasked Train image; if TrainDotted
    # blacks out regions, those regions are still saved and labelled from the
    # dot counts alone -- confirm this is intended.
    if r == 1:
        scaled = image_2
    else:
        scaled = cv2.resize(image_2, (int(w * r), int(h * r)))
    h1, w1 = scaled.shape[:2]

    for i in range(int(w1 // width)):
        for j in range(int(h1 // width)):
            counts = res[i, j, :]
            has_sea_lion = counts.sum() > 0

            out_dir = sea_lion_path if has_sea_lion else background_path
            patch_name = out_dir + filename_number + '_x' + str(i) + '_y' + str(j) + '.jpg'

            # Record the label row while the category quota lasts. Strict '<'
            # keeps each split at exactly the documented size; a '<=' test on
            # a zero-based counter would admit one patch too many.
            if has_sea_lion:
                if sea_lion_1 < SEA_LION_VAL_COUNT:
                    val_names.append(patch_name)
                    val_rows.append(counts.tolist())
                    sea_lion_1 += 1
                elif sea_lion_1 < SEA_LION_TOTAL_COUNT:
                    train_names.append(patch_name)
                    train_rows.append(counts.tolist())
                    sea_lion_1 += 1
            else:
                if sea_lion_0 < BACKGROUND_VAL_COUNT:
                    val_names.append(patch_name)
                    val_rows.append(counts.tolist())
                    sea_lion_0 += 1
                elif sea_lion_0 < BACKGROUND_TOTAL_COUNT:
                    train_names.append(patch_name)
                    train_rows.append(counts.tolist())
                    sea_lion_0 += 1

            # Every patch is written to disk, including those beyond the
            # quotas that are not recorded in either label table.
            patch = scaled[j * width:(j + 1) * width, i * width:(i + 1) * width]
            if not cv2.imwrite(patch_name, patch):
                raise IOError('could not write patch ' + patch_name)

# Build both label tables in one shot, indexed by patch file path.
patch_df = pd.DataFrame(train_rows, index=train_names, columns=classes)
val_patch_df = pd.DataFrame(val_rows, index=val_names, columns=classes)

def rmse(predictions, targets):
    """Return the root-mean-square error between predictions and targets.

    Both arguments may be any array-like (list, tuple, numpy array, ...) of
    matching or broadcastable shape. They are converted to float64 before
    subtracting, so plain Python lists are accepted and unsigned/narrow
    integer arrays cannot wrap around in the difference or the square.

    The error is reduced over all elements and returned as a single float.
    """
    predictions = np.asarray(predictions, dtype=np.float64)
    targets = np.asarray(targets, dtype=np.float64)
    return np.sqrt(np.mean((predictions - targets) ** 2))

0.jpg
20.jpg
40.jpg
60.jpg
80.jpg
100.jpg
120.jpg
140.jpg
160.jpg
180.jpg
200.jpg
220.jpg
240.jpg
260.jpg
280.jpg
300.jpg
320.jpg
340.jpg
360.jpg
400.jpg
420.jpg
440.jpg
460.jpg
480.jpg
500.jpg
520.jpg
540.jpg
560.jpg
580.jpg
600.jpg
620.jpg
640.jpg
660.jpg
680.jpg
700.jpg
720.jpg
740.jpg
760.jpg
780.jpg
820.jpg
860.jpg
880.jpg
900.jpg
920.jpg
940.jpg


In [11]:
# Number of rows in the training table whose class counts add up to zero.
patch_df.sum(axis=1).eq(0).sum()

40082

In [12]:
# Persist the training label table (index = patch file path) as CSV.
patch_df.to_csv(path_or_buf='divide_image.csv')

In [13]:
# Persist the validation label table (index = patch file path) as CSV.
val_patch_df.to_csv(path_or_buf='val_image.csv')