In [27]:
import re, sys
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [28]:
train = pd.read_csv('train.csv', delimiter = ',')

In [86]:
#so there are a lot of whales that we have just one of
#let's see how many of these bad boys are compared to the total
whales = train.groupby('Id').size()

In [87]:
'''
to get a better idea of what we;re dealing with let's make a histrogram of the whales
'''


float(whales[whales <= 2].size)/float(whales.size)

0.670929070929071

In [31]:
'''
so it's pretty damn impressive if we can even 
get 60% of these, seeing as we only have 1 for many of the individuals

If we're going to learn meaningful features about individuals with only 
one picture, we can only really learn differences in a meaningful way.  For
example, if we see the right side of a fluke, we won't be able to say 
much about the left side of a fluke of the same individual if say, the whale
was partially eaten by a shark.  Although learning differences won;t get us
around an example as pedantic as this, it will outperform a standard feature
learning network 

http://www.cs.utoronto.ca/~gkoch/files/msc-thesis.pdf Koch's masters thesis 
describes training on MNIST data and applying affine transformations.  In 
theory it could be useful to apply a number of transformations to the whales
to see what could happen, but in the first round of training, for the sake, we will 
design a pipeline that just grayscales the data and randomly selects a pair of a whale 
with only one member and pairs it with a whale we have multiples of. 

I'm going to make a class that will let us determine which images to pair and store 
the pairs as strings we can load into 

'''

"\nso it's pretty damn impressive if we can even \nget 60% of these, seeing as we only have 1 for many of the individuals\n"

In [4]:
import tensorflow as tf
import tensorflow.image as img
import keras as k
import numpy as np
import random as r
import pandas as pd
#tf.enable_eager_execution()
'''
I'm going to start by creating a generator that pairs elements of small classes with 
elements of larger classes.  My intent is to train a siamese CNN pair that specifically
identifies known whales with only a few members well.
'''
class TrainingMatcher:
    '''
    We wanna be strategic about how we pair up our training data.  Obviously 
    having just one whale of a class will not get us any generally useful features
    for IDing the class.
    
    So this class will generate pairs of images in the form of a rank 2 tensor 
    with rows as pairs of jpeg file names.
    '''
    def __init__(self, train_filename, class_size_cutoff, rand_seed):
        '''
        train: filename of CSV (str)
        class_size_cutoff: classes with this many images or less will get paired with every
                            image.
        rand_seed: included so we can randomly generate the same pairs for training
        '''
        r.seed(rand_seed)
        
        self.train = pd.read_csv(train_filename)
        
        whale_num = self.train.groupby('Id').size()
        small_class_names = whale_num[whale_num <= class_size_cutoff].index.values
        all_class_names = whale_num.index.values
        self.small_class = self.train.set_index('Id').loc[small_class_names]
        self.all_class = self.train.set_index('Id').loc[all_class_names]
        
        #np.random.shuffle(self.small_classes['Image'])
        #np.random.shuffle(self.big_classes['Image'])
        self.small_size = self.small_class.shape[0]
        self.all_size = self.all_class.shape[0]
        self.remaining_small_class = self.small_class['Image'][:]
        self.remaining_all_class = self.all_class['Image'][:] 
        self.current_img = np.random.choice(self.remaining_small_class, replace = False)
        self.current_class = self.train[self.train['Image'] 
                                        == self.current_img]['Id'].values[0]
        self.i = 0
        self.j = 0
    
    def get_pair(self):
        '''
        returns a dict corresponding to the jpegs
        
        we write the generator like this so that instead of generating a gigantic numpy 
        array and storing it in RAM 
        '''
        while self.i <= self.small_size:
            other_class_img = np.random.choice(self.remaining_all_class, replace = False)
            other_class = self.train[self.train['Image'] == 
                                     other_class_img]['Id'].values[0]
            similarity = np.float(other_class == self.current_class)
            d = {'small class' : (self.current_img, self.current_class),
                  'other class' : (other_class_img, other_class),
                'similarity' : similarity}
            if self.j < self.all_size - 1:
                self.j += 1
            else:
                self.i += 1
                self.j = 0 
                self.remaining_all_class = self.all_class['Image'][:]
                self.current_img = np.random.choice(
                                    self.remaining_small_class, replace = False)
                self.current_class = self.train[self.train['Image'] 
                                                == self.current_img]['Id'].values[0]
            yield d       

In [2]:
y = TrainingMatcher('train.csv', 2, 12)
x = y.get_pair()
x.next()

{'other class': ('5e7beef6e.jpg', 'w_5f20f03'),
 'similarity': 0.0,
 'small class': ('564df4f6c.jpg', 'w_71ec6d4')}

In [12]:
import tensorflow as tf

class DataPipeline:
    '''
    imports the images using tensorflow fed by instances of the generator in
    the TrainingMatcher class
    
    alsos it will zero pad the outclasss
    '''
    def __init__(self, filename, cutoff, seed, height, width):
        self.matcher = TrainingMatcher(filename, cutoff, seed)
        self.next_pair = self.matcher.get_pair
        self.height = height
        self.width = width
    
    def image_map(self, image_dict):
        '''
        use the previous class to read in two images and turn them into 
        tensorflow images
        '''
        file_1 = tf.io.read_file('/Users/jbowen/projects/train/' 
                                 + image_dict['small class'][0])
        file_2 = tf.io.read_file('/Users/jbowen/projects/train/' 
                                 + image_dict['other class'][0])
        img_1 = tf.image.decode_jpeg(file_1,channels=3)
        img_2 = tf.image.decode_jpeg(file_2,channels=3)

        img_1 = tf.image.resize_image_with_pad(img_1, self.height, self.width)
        img_2 = tf.image.resize_image_with_pad(img_2, self.height, self.width)
        return tf.stack([img_1, img_2], axis = 0)
    
    def get_similarity(self, dataset_dict):
        '''
        functiom to map over dataset to build dataset with class informatio 
        '''
        return dataset_dict['similarity']
        
    def build_dataset(self, batch_size):
        '''
        batch_size : int, size to data to read before updating weights
        '''
        dataset = tf.data.Dataset.from_generator(self.next_pair, output_types = 
                                        {'other class': tf.string, 'small class': tf.string,
                                        'similarity': tf.float16})
        X_data = dataset.map(self.image_map, num_parallel_calls = 4)
        X_data = X_data.batch(batch_size)
        Y_data = dataset.map(self.get_similarity, num_parallel_calls = 4)
        Y_data = Y_data.batch(batch_size)
        return (X_data, Y_data)      

In [33]:
#executed with eager mode enabled
import matplotlib.pyplot as plt
x = DataPipeline('train.csv', 4, 1234, 1200, 800)
y = x.build_dataset(20)[0].make_one_shot_iterator()
i = y.get_next()
f = plt.figure()
f.add_subplot(1,2,1)
plt.imshow(i.numpy()[0])
f.add_subplot(1,2,2)
plt.imshow(i.numpy()[1])
plt.show()

AttributeError: 'Tensor' object has no attribute 'numpy'

In [41]:
import keras as k
from keras.models import Sequential, Model
from keras.activations import sigmoid, relu
from keras.layers import Conv2D, MaxPooling2D, Dense, Subtract, Flatten
from keras.initializers import RandomNormal
from keras.regularizers import l2
from keras.engine.input_layer import Input


def loss_function(prediction, distance):
    #loss function from Koch's masters thesis
    #prediction is 0 or 1 
    loss = prediction * np.log(distance) + (1 - prediction) * np.log(1 - distance) 
    return loss
                 
def distance_function(component_weight, output_1, output_2):
    return sigmoid(np.abs(output_1 - output_2).component_weight)

#following functions decprecated 1/22/19, subbing in builtin Keras initializers

#def init_weights(shape):
#    values = np.random.normal(loc = 0, scale = 1e-2, size = shape)
#    return k.variable(values)

#def init_bias(shape):
#    values = np.normal(loc = 0.5, scale = 1e-2, size = shape)
#    return k.variable(values)

class CnnSiameseNetwork:
    '''
    let's start by hard-coding in the CNN architecture from Koch
    
    all the hard coded numbers come from http://www.cs.utoronto.ca/~gkoch/files/msc-thesis.pdf 14-15
    '''
    def __init__(self, X_dataset, Y_dataset):
        '''
        Pipeline is an instance of the DataPipeline class from the earlier page
        '''
        
        X_dataset_0 =  X_dataset.map(lambda x: x[0], num_parallel_calls=4)
        X_dataset_1 =  X_dataset.map(lambda x: x[1], num_parallel_calls=4)
        
        self.Input_0 = Input(tensor = X_dataset_0.make_one_shot_iterator().get_next())
        self.Input_1 = Input(tensor = X_dataset_1.make_one_shot_iterator().get_next())
        
        self.twin_cnn = Sequential()
        input_sh = tuple(X_dataset.output_shapes.as_list()[2:4]) + (3,)
        print input_sh
        
        self.twin_cnn.add(Conv2D(64, (10, 10), activation = 'relu', 
                                 bias_initializer = RandomNormal(mean = 0.5, stddev = 1e-2),
                                 input_shape=input_sh,  data_format = 'channels_last',
                              kernel_initializer = RandomNormal(mean = 0.0, stddev = 1e-2),
                                 kernel_regularizer = l2(2e-4))
                         )
        self.twin_cnn.add(MaxPooling2D())
        self.twin_cnn.add(Conv2D(128, (7, 7), 
                                 bias_initializer = RandomNormal(mean = 0.5, stddev = 1e-2), 
                                 kernel_initializer = RandomNormal(mean = 0.0, stddev = 1e-2), 
                              kernel_regularizer = l2(2e-4), activation = 'relu')
                         )
        self.twin_cnn.add(MaxPooling2D())
        self.twin_cnn.add(Conv2D(128, (4, 4), activation = 'relu', 
                                 bias_initializer = RandomNormal(mean = 0.5, stddev = 1e-2),
                              kernel_initializer = RandomNormal(mean = 0.0, stddev = 1e-2),
                                 kernel_regularizer = l2(2e-4))
                         )
        self.twin_cnn.add(MaxPooling2D())
        self.twin_cnn.add(Conv2D(256, (4, 4), activation = 'relu', 
                                 bias_initializer = RandomNormal(mean = 0.5, stddev = 1e-2),
                              kernel_initializer = RandomNormal(mean = 0.0, stddev = 1e-2), 
                                 kernel_regularizer = l2(2e-4))
                         )
        self.twin_cnn.add(Flatten())
        self.twin_cnn.add(Dense(4096, activation = 'sigmoid', 
                                bias_initializer = RandomNormal(mean = 0.5, stddev = 1e-2),
                              kernel_initializer = RandomNormal(mean = 0.0, stddev = 1e-2), 
                                kernel_regularizer = l2(2e-4))
                         )
        
        self.twin0_output = self.twin_cnn(self.Input_0)
        self.twin1_output = self.twin_cnn(self.Input_1)
        
        self.diff = Subtract()([self.twin0_output,self.twin1_output])
        self.embedded_layer = Dense(1, activation = 'sigmoid', 
                                    bias_initializer = RandomNormal(mean = 0.5, stddev = 1e-2))(self.diff)
        
        self.model = Model(input = [self.Input_0, self.Input_1], output = self.embedded_layer)
        
    
    def compile_model(self, learning_rate, momentum, decay):
        '''
        learning rate: float64 to control rate at which weights are updated in the standard stochastic
            gradient descent algo
        momentum: normal float64 damping factor on parameter changes
        decay: normal float64 damping factor on changes to learning rate across epochs
        '''
        sgd = k.optimizers.SGD(lr = learning_rate, momentum = momentum, decay = decay)
        self.model.compile(loss = "binary_crossentropy", optimizer = sgd) 
        
    def train_model(self, batch, num_epochs):
        '''
        batch: size of batch to go through dataset with
        epochs: number of times to go over training data
        '''
        self.model.fit(batch_size = batch, epochs = num_epochs)

In [42]:
x = DataPipeline('train.csv', 12, 1234, 1200, 800)
l = x.build_dataset(30)

In [43]:
A = CnnSiameseNetwork(*l)

(1200, 800, 3)




In [44]:
A.compile_model(.05, .9, .001)