In [None]:
import re, sys
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [242]:
# Read the training labels table (comma-separated) into a DataFrame.
train = pd.read_csv('train.csv', sep = ',')

In [339]:
# A lot of whale Ids show up only a handful of times in the training table.
# Count the rows per Id so the rare whales can be compared against the total.
whales = train.groupby(by = 'Id').size()

In [None]:
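'''
To get a better idea of what we're dealing with, let's look at the distribution of
images per whale.  Rather than a full histogram, we start with one number: the
fraction of individuals that have two images or fewer.
'''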


# Fraction of whale Ids with at most two rows; float() keeps true division on Python 2.
float((whales <= 2).sum()) / float(len(whales))

0.670929070929071

In [31]:
'''
so it's pretty damn impressive if we can even 
get 60% of these, seeing as we only have 1 for many of the individuals

If we're going to learn meaningful features about individuals with only 
one picture, we can only really learn differences in a meaningful way.  For
example, if we see the right side of a fluke, we won't be able to say 
much about the left side of the fluke of the same individual if, say, the whale
was partially eaten by a shark.  Although learning differences won't get us
around an example as pedantic as this, it will outperform a standard feature
learning network.

Koch's master's thesis (http://www.cs.utoronto.ca/~gkoch/files/msc-thesis.pdf)
describes training on MNIST data and applying affine transformations.  In 
theory it could be useful to apply a number of transformations to the whale images
to see what happens, but in the first round of training, for the sake of simplicity, we will 
design a pipeline that just grayscales the data and randomly pairs an image of a whale 
that has only one image with an image of a whale we have multiples of. 

I'm going to make a class that will let us determine which images to pair and yield 
the pairs as file-name strings that we can load into the image pipeline.

'''

"\nso it's pretty damn impressive if we can even \nget 60% of these, seeing as we only have 1 for many of the individuals\n"

In [361]:
import tensorflow as tf
import tensorflow.image as img
import keras as k
import numpy as np
import random as r

class Training_Matcher:
    '''
    We wanna be strategic about how we pair up our training data.  Obviously 
    having just one whale of a class will not do for pure feature recognition.
    
    So this class lazily generates pairs of jpeg file names: one image drawn from
    the "small" classes (few images per Id) and one drawn from the "big" classes.
    Each pair is a dict with the keys 'small class' and 'big class'.
    '''
    def __init__(self, train_filename, class_size_cutoff, rand_seed):
        '''
        train_filename: filename of a CSV with at least the columns 'Id' and 'Image' (str)
        class_size_cutoff: classes (Ids) with this many images or less are "small";
                            every other class is "big".
        rand_seed: included so we can randomly generate the same pairs for training
        '''
        # Kept for backward compatibility: the stdlib generator is still seeded.
        r.seed(rand_seed)
        # All draws below go through numpy, so the seed only has an effect if numpy's
        # generator is seeded too.  A private RandomState keeps the sequence
        # reproducible without touching numpy's global generator.
        self._rng = np.random.RandomState(rand_seed)
        
        self.train = pd.read_csv(train_filename)
        
        whale_num = self.train.groupby('Id').size()
        small_classes_names = whale_num[whale_num <= class_size_cutoff].index.values
        big_classes_names = whale_num[whale_num > class_size_cutoff].index.values
        train_by_id = self.train.set_index('Id')
        self.small_classes = train_by_id.loc[small_classes_names]
        self.big_classes = train_by_id.loc[big_classes_names]
        
        self.small_size = self.small_classes.shape[0]
        self.big_size = self.big_classes.shape[0]
        self.remaining_small_classes = self.small_classes['Image'][:]
        self.remaining_big_classes = self.big_classes['Image'][:] 
        self.current_img = self._draw(self.remaining_small_classes)
        
        # i counts finished small-class rounds, j counts pairs inside the current round.
        self.i = 0
        self.j = 0
    
    def _draw(self, images):
        '''
        returns one randomly chosen entry of images, or None when images is empty
        (choosing from an empty pool would raise a ValueError)
        '''
        pool = np.asarray(images)
        if pool.size == 0:
            return None
        return self._rng.choice(pool)
    
    def get_pair(self):
        '''
        generator that yields dicts {'small class': <jpeg name>, 'big class': <jpeg name>}
        
        we write the generator like this so that instead of generating a gigantic numpy 
        array and storing it in RAM, pairs are produced one at a time.
        
        One small-class image is held fixed for big_size consecutive pairs, then a new
        small-class image is drawn; this is repeated small_size times.  Progress is kept
        on the instance (self.i / self.j), so a later call to get_pair() resumes where 
        the previous generator stopped.
        
        NOTE: every draw is independent, so an image can come up more than once and 
        some images may never be drawn.
        '''
        if self.big_size == 0:
            # nothing to pair the small-class images with
            return
        # strictly less than: `<=` produced one extra round of big_size pairs
        while self.i < self.small_size:
            d = {'small class':self.current_img, 
                  'big class':self._draw(self.remaining_big_classes)}
            if self.j < self.big_size - 1:
                self.j += 1
            else:
                self.i += 1
                self.j = 0 
                self.current_img = self._draw(self.remaining_small_classes)
            yield d       

In [362]:
# Ids with at most 2 images count as "small"; the seed is fixed at 12.
y = Training_Matcher(train_filename = 'train.csv', class_size_cutoff = 2, rand_seed = 12)

In [359]:
# Peek at one pair.  The builtin next() works on Python 2.6+ and Python 3, whereas
# the generator method .next() only exists on Python 2.
next(y.get_pair())

0 9


{'large class': '9a2968eae.jpg', 'small class': '8a6c28487.jpg'}

In [366]:
import tensorflow as tf

class DataPipeline:
    '''
    imports the images using tensorflow, fed by the get_pair generator of the
    Training_Matcher class
    
    images are resized with zero padding to a common (height, width)
    '''
    def __init__(self, filename, cutoff, seed, height, width):
        '''
        filename, cutoff, seed: passed straight through to Training_Matcher
        height, width: target size every decoded image is resized / padded to
        '''
        self.next_pair = Training_Matcher(filename, cutoff, seed)
        self.height = height
        self.width = width
    
    def image_map(self, image_dict):
        '''
        read the two images named in image_dict and turn them into one tensor
        
        image_dict: dict of string tensors with the keys 'small class' and 'big class',
                    i.e. the same keys Training_Matcher.get_pair yields
        
        returns: a tensor stacking both images along a new first axis
        
        we're going to do some downsampling here to speed things up as tf.decode_jpeg does
        really fast downsampling
        '''
        file_1 = tf.io.read_file(image_dict['small class'])
        file_2 = tf.io.read_file(image_dict['big class'])
        # channels = 1 decodes to grayscale, so both images end up with the same
        # number of channels and can be stacked
        img_1 = tf.image.decode_jpeg(file_1, channels = 1, ratio = 2)
        img_2 = tf.image.decode_jpeg(file_2, channels = 1, ratio = 2)
        
        img_1 = tf.image.resize_image_with_pad(img_1, self.height, self.width)
        img_2 = tf.image.resize_image_with_pad(img_2, self.height, self.width)
        
        # a dataset map function has to return tensors; np.asarray on tensors does not
        return tf.stack([img_1, img_2])
        
    def build_dataset(self, batch_size, shuffle_buffer_size = 1000):
        '''
        batch_size: number of image pairs per batch
        shuffle_buffer_size: number of decoded pairs held in the shuffle buffer
        
        returns: a tf.data.Dataset of batched image pairs
        
        NOTE: the matcher keeps its position on the instance, so iterating the
        dataset a second time continues from where the first pass stopped.
        '''
        # from_generator needs a callable that returns an iterator, plus the dtype
        # of every element it yields
        pair_types = {'small class': tf.string, 'big class': tf.string}
        dataset = tf.data.Dataset.from_generator(self.next_pair.get_pair, 
                                                 output_types = pair_types)
        dataset = dataset.map(self.image_map, num_parallel_calls = 4)
        dataset = dataset.shuffle(shuffle_buffer_size)
        dataset = dataset.batch(batch_size)
        return dataset
        

In [381]:
import keras as k
from keras.activations import sigmoid, relu
from keras.initializers import RandomNormal
from keras.layers import Conv2D, MaxPooling2D, Dense, Subtract, Flatten, Lambda
from keras.models import Sequential, Model
from keras.regularizers import l2

def loss_function(prediction, distance, eps = 1e-7):
    '''
    loss function from Koch's masters thesis, without the regularisation term
    
    prediction: 0 or 1 (scalar or array)
    distance: value(s) in [0, 1], e.g. the output of distance_function
    eps: distance is clipped to [eps, 1 - eps] before taking logs
    
    returns: prediction * log(distance) + (1 - prediction) * log(1 - distance)
    
    NOTE(review): the sign is kept exactly as before.  A quantity that is meant to be
    minimised would normally be the negative of this expression - confirm before
    training with it.
    '''
    # Without the clip a distance of exactly 0 or 1 gives 0 * log(0) = 0 * -inf = nan.
    distance = np.clip(distance, eps, 1 - eps)
    loss = prediction * np.log(distance) + (1 - prediction) * np.log(1 - distance) 
    return loss
                 
def distance_function(component_weight, output_1, output_2):
    '''
    sigmoid of the weighted L1 distance between two outputs
    
    component_weight: one weight per output component
    output_1, output_2: arrays of equal shape; the last axis is the component axis
    
    returns: sigmoid(sum_j component_weight[j] * |output_1[j] - output_2[j]|) as a
             numpy value, so it can be passed straight to loss_function
    '''
    l1_components = np.abs(np.asarray(output_1) - np.asarray(output_2))
    weighted_distance = np.dot(l1_components, component_weight)
    # tanh form of the logistic sigmoid: same value as 1 / (1 + exp(-x)) but it
    # cannot overflow for large |x|
    return 0.5 * (1.0 + np.tanh(0.5 * weighted_distance))

def init_weights(shape, dtype = None):
    '''
    weight initialiser: normal distribution with mean 0 and standard deviation 1e-2
    
    shape: shape of the variable to create
    dtype: optional dtype, accepted so the function can be called with a dtype
           keyword argument; None lets the backend use its default
    
    returns: a Keras backend variable holding the sampled values
    '''
    # the sampler lives in np.random; there is no np.normal
    values = np.random.normal(loc = 0, scale = 1e-2, size = shape)
    # variable() is a backend function, not a top-level keras name
    return k.backend.variable(values, dtype = dtype)

def init_bias(shape, dtype = None):
    '''
    bias initialiser: normal distribution with mean 0.5 and standard deviation 1e-2
    
    shape: shape of the variable to create
    dtype: optional dtype, accepted so the function can be called with a dtype
           keyword argument; None lets the backend use its default
    
    returns: a Keras backend variable holding the sampled values
    '''
    # the sampler lives in np.random; there is no np.normal
    values = np.random.normal(loc = 0.5, scale = 1e-2, size = shape)
    # variable() is a backend function, not a top-level keras name
    return k.backend.variable(values, dtype = dtype)

class cnn_siamese_network:
    '''
    let's start by hard-coding in the CNN architecture from Koch
    
    all the hard coded numbers come from http://www.cs.utoronto.ca/~gkoch/files/msc-thesis.pdf 14-15
    
    self.model is the single convolutional twin; predict_input applies it to both
    inputs so the two branches share their weights.
    '''
    def __init__(self, input_shape):
        '''
        input_shape: shape of one input image, without the batch axis
        '''
        self.input_shape = input_shape
        # set by predict_input once the two branches have been wired together
        self.siamese_network = None
        
        # every trainable layer uses the same initialisers and regulariser:
        # weights ~ N(0, 1e-2), biases ~ N(0.5, 1e-2), L2 penalty 2e-4
        shared = dict(bias_initializer = RandomNormal(mean = 0.5, stddev = 1e-2),
                      kernel_initializer = RandomNormal(mean = 0.0, stddev = 1e-2),
                      kernel_regularizer = l2(2e-4))
        
        self.model = Sequential()
        # the first layer carries input_shape so the Sequential model can be built
        self.model.add(Conv2D(64, (10, 10), activation = 'relu', 
                              input_shape = input_shape, **shared))
        self.model.add(MaxPooling2D())
        self.model.add(Conv2D(128, (7, 7), activation = 'relu', **shared))
        self.model.add(MaxPooling2D())
        self.model.add(Conv2D(128, (4, 4), activation = 'relu', **shared))
        self.model.add(MaxPooling2D())
        self.model.add(Conv2D(256, (4, 4), activation = 'relu', **shared))
        self.model.add(Flatten())
        self.model.add(Dense(4096, activation = 'sigmoid', **shared))
    
    def predict_input(self, input_tensor_1, input_tensor_2):
        '''
        run both inputs through the shared twin and return the component-wise L_1
        distance |twin(input_1) - twin(input_2)| between the two feature vectors.
        The weighted sum + sigmoid that turns this into a similarity score is not 
        part of this method.
        
        input_tensor_1/2: tensor referring to one batch of inputs, of size input_shape.
                          They are used as the inputs of self.siamese_network, so they
                          have to be tensors created with keras.layers.Input.
        
        returns: tensor holding the component-wise absolute difference
        '''
        output_tensor_1 = self.model(input_tensor_1)
        output_tensor_2 = self.model(input_tensor_2)
        # Subtract is a layer: instantiate it, then call it on the list of tensors
        difference = Subtract()([output_tensor_1, output_tensor_2])
        # wrap the backend op in a layer so the result can be used as a Model output
        l1_distance = Lambda(k.backend.abs)(difference)
        self.siamese_network = Model(inputs = [input_tensor_1, input_tensor_2], 
                                     outputs = l1_distance)
        return l1_distance

In [382]:
'''
TODO:
- convert the images to black and white (grayscale)
- edit the generator to include a same/different class label
- edit the generator to pair same-class items from large classes
'''



'\n- B/w images\n- edit generator to include same/diff class label\n- edit generator to pair same class items from large classes\n'