### Data Processing

If you are not using Google Colab, you need not run this. The purpose of this element is to navigate to the files on Google Drive.

In [2]:
from google.colab import drive
import os

# Mount my Google Drrive
drive.mount('/content/drive')

# Go to the directory with the 583 data
os.chdir('/content/drive/My Drive/data_583')
!pwd

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive
/content/drive/My Drive/data_583


In [0]:
import os
from PIL import Image
import numpy as np
import pandas as pd

from tqdm import tqdm

numeric_cols = [
    'Age',
    'Quantity', 'Fee', 'State',
    'VideoAmt', 'PhotoAmt', 'Type'
]

one_hot_cols = {
    #'Type': 2, 
    'Breed1': 307, 'Breed2': 307,
    'Gender': 3, 'Color1': 7, 'Color2': 7,
    'Color3': 7,
    'MaturitySize': 5,
    'FurLength': 4, 'Vaccinated': 3,
    'Dewormed': 3, 'Sterilized': 3,
    'Health': 4, 'State': 15
}

def one_hot_encode(df, col, num_class=None, labels=None, inplace=False):
    ''' Takes in dataframe df and replaces col with num_class columns
        For example, use as follows
        for col, num_class in data.one_hot_cols.items():
            one_hot_encode(train_df, col, num_class)
    '''
    # get the true values from data
    column_values = np.sort(df[col].dropna().unique())
    if num_class == None:
        num_class = len(column_values)
    if num_class == 2:
        # These can just be boolean
        if inplace:
            df[col] = (df[col] == column_values[0]).astype(int)
        else:
            return (df[col] == column_values[0]).astype(int)
    else:        
        if labels is not None:
            res = np.zeros((len(df), num_class))
            for i, label in enumerate(labels):
                if inplace:
                    df[col+'_'+str(label)] = (df[col] == label).astype(int)
                else:
                    one_hot = np.zeros(num_class)
                    one_hot[i] = 1
                    res[df[col] == label] = one_hot
        else:
            res = np.zeros((len(df), num_class))
            for i in range(num_class):
                if (i >= len(column_values)):
                    break # Index out of bounds
                cur_value = column_values[i]

                if inplace:
                    df[col+'_'+str(cur_value)] = (df[col] == cur_value).astype(int)
                else:
                    one_hot = np.zeros(num_class)
                    one_hot[i] = 1
                    res[df[col] == cur_value] = one_hot
    
        if inplace:
            # delete original column
            df.drop(col, axis=1, inplace=True)
        else:
            return res

def load_data(fname):
    return pd.read_csv(fname)

def load_pet_files(regdir):
    """ Extracts all of the files associated with each pet listed
    by the 'PetID' tag.

    regdir - The directory containing the files

    returns a dictionary containing keypairs (k, v) such that v
    matches the regex (k\-.*) where k is the key (a valid PetID).
    """
    fname = os.path.join(regdir, 'picked_pictures.npy')
    
    print(f"Images will be loaded from {fname}")
    return np.load(fname)

def load_pet_pics(regdir):
    """ Extracts all of the files associated with each pet listed
    by the 'PetID' tag.

    regdir - The directory containing the files

    returns a dictionary containing keypairs (k, v) such that v
    matches the regex (k\-.*) where k is the key (a valid PetID).
    """
    if os.path.isfile(os.path.join(regdir + 'picked_pictures.npy')):
        print("Images loaded from existing file")
        return np.load(os.path.join(regdir + 'picked_pictures.npy'))
    pfiles = {}

    # Extract the pet names
    for f in tqdm(os.listdir(regdir), desc='Loading Pet Files'):
        # Extract the name
        n = f[:f.index('-')]
        url = os.path.join(regdir, f)
        img = load_image(url)
        if n in pfiles:
            # Add to the entry
            pfiles[n].append(img)

        else:
            # Add a new entry
            pfiles[n] = [img]

    np.save(os.path.join(regdir + 'picked_pictures.npy'), pfiles)
    return pfiles

def load_train_data(is_train = True):
    # Get the annotations for each pet
    dta = load_data('data/train/train.csv' if is_train else 'data/test/test.csv')
     
    # Get the pet pictures
    petpics = load_pet_files('data/') if is_train else load_pet_files('data/test_images')

    # Get the state ids
    states = load_data('data/state_labels.csv')
    states = states['StateID'].tolist()

    # Load parsed sentiment
    sentiment = load_data('data/{}_sentiment_parsed.csv'.format('train' if is_train else 'test'))

    X_num = []
    X_pic = []

    Y = []
    
    # Build a single object to store the X values
    X = [X_num, X_pic]
    
    """
    
    for k in one_hot_cols:
        one_hot_encode(dta, k, inplace=True)
        print('One hot encoded', k)
        """
    
    for i, row in dta.iterrows():
        # Save the numeric values
        vals = row[numeric_cols]
        
        state = [x == row['State'] for x in states]
        #assert(sum(state) == 1)
        
        # Add all of the valid one-hot encodings
        #state = [row[k] for k in dta if any(q in k for q in one_hot_cols)]
            
        x = list(vals) + state
        
            
        # Join sentiment on PetID
        s = sentiment[sentiment['PetID'] == row['PetID']]
        
        for col in s:
            if col not in row:
                row[col] = s.iloc[0][col]
                x.append(row[col])
        
        # Save the pictures
        if row['PetID'] in petpics:
            X_pic.append(petpics[row['PetID']])
        else:
            X_pic.append([])
            
        # Save the data pair
        X_num.append(np.array(x))
        
        if is_train:
            Y.append(row['AdoptionSpeed'])
        else:
            Y.append(row['PetID'])

    # Laziness
    if len(X) == 1:
        X = np.array(X[0])
    else:
        X = list(map(np.array, X))

    Y = np.array(Y)

    return (X, Y)

def load_image(img_file, size=64):
    # print('img file:', img_file)
    img = Image.open(img_file)
    img = img.resize((size, size), Image.ANTIALIAS)
    return np.array(img)

In [6]:
from keras.utils import to_categorical

import numpy as np

# Get the data
X, Y = load_train_data()

Images will be loaded from data/picked_pictures.npy


In [7]:
print([x.shape for x in X])
print(Y.shape)
print(X[0][0])

[(14993, 25), (14993, 0)]
(14993,)
[3.0000e+00 1.0000e+00 1.0000e+02 4.1326e+04 0.0000e+00 1.0000e+00
 2.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
 0.0000e+00 0.0000e+00 1.0000e+00 0.0000e+00 2.4000e+00 3.0000e-01
 6.0000e+00]


In [0]:
def shuffle(X, Y):
    idxs = list(range(len(Y)))
    
    if isinstance(X, list):
        for i in range(len(X)):
            X[i] = X[i][idxs]
    else:
        X = X[idxs]

    Y = Y[idxs]
    
    return X, Y

def split(X, Y, s=0.2):
    s = int(s * len(Y))
    
    if isinstance(X, list):
        X_train = [x[:-s] for x in X]
        X_test = [x[-s:] for x in X]
    else:
        X_train = X[:-s]
        X_test = X[-s:]

    Y_train = Y[:-s]
    Y_test = Y[-s:]

    return (X_train, Y_train), (X_test, Y_test)


def convert_for_all(X, Y):
    # For single image training, make a datapoint for each image or the default zero image
    Xs = [[], []]
    Ys = []
    for i in range(len(X[1])):
        Xs[0].append(X[0][i])
        if len(X[1][i]) == 0:
            Xs[1].append(np.zeros((64, 64, 3)))
            Ys.append(Y[i])
        else:
            # Make a datapoint for all images. We assume equal relevance of images
            for img in X[1][i]:
                Xs[0].append(X[0][i])
                Xs[1].append(img)
                Ys.append(Y[i])

    X = list(map(np.array, Xs))
    Y = np.array(Ys)

    return X, Y

def convert_for_single_axis(X, Y, ax=0):
    return X[ax], Y


In [0]:
X, Y = shuffle(X, Y)

### Architectures

In [0]:
from keras.models import Sequential, Model
from keras.layers import *
from keras.callbacks import ModelCheckpoint

class KaggleModel:
    def __init__(self, model, train, test):
        model.summary()
        self.model = model
        self.train_data = train
        self.test_data = test

    def compile(self):
        """ Compiles the model. Should be defined by the user.
        """
        raise NotImplementedError()

    def train(self, epochs=1):
        """ Default training behavior. Simply does a Model.fit(X, Y).
        """
        # Get the training data
        X_train, Y_train = self.train_data

        checkpoint = ModelCheckpoint('model.h5')
        
        # Fit to the data
        self.model.fit(X_train, Y_train, epochs=epochs, validation_data=self.test_data, callbacks=[checkpoint])

    def predict(self, X):
        return self.model.predict(X)


def ResidualBlock(mdl):
    x = Input(shape=mdl.input_shape[1:])

    y = mdl(x)
    y = Add()([x, y])

    return Model(x, y)

class ImageFreeModel(KaggleModel):
    def __init__(self, train, test):
        kernel = Sequential(name='image_free_encoder')
        
        kernel.add(BatchNormalization(input_shape=(25,)))
        
        kernel.add(Dense(128))
        kernel.add(Activation('relu'))
        
        # Use a single dense residual block
        """
        blk = Sequential()
        
        blk.add(Dropout(0.5, input_shape=kernel.output_shape[1:]))
        blk.add(Dense(128))
        blk.add(Activation('relu'))

        blk.add(Dense(128))
        blk.add(Activation('relu'))

        kernel.add(ResidualBlock(blk))
        """

        model = Sequential(name='image_free')
        model.add(kernel)
        
        model.add(Dense(64, activation='relu'))

        # Labels are one of [0, 1, 2, 3, 4]
        model.add(Dense(5, activation='softmax'))

        # Build using the built model
        super().__init__(model, train, test)

    def compile(self):
        self.model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

class SingleImageModel(KaggleModel):
    def __init__(self, train, test):
        # The model takes in attributes and an image.
        kernel = Sequential(name='single_image_encoder')
        
        # Architecture for the images
        kernel.add(Conv2D(64, kernel_size=(7,7), strides=(1,1), padding='same', input_shape=(64, 64, 3)))
        kernel.add(BatchNormalization())

        while kernel.output_shape[1] > 4:
            for _ in range(2):
                # Old output is kept as residue
                x = z = Input(shape=(kernel.output_shape[1:]))

                z = Activation('relu')(z)
                z = Conv2D(64, kernel_size=(5,5), strides=(1,1), padding='same')(z)
                z = BatchNormalization()(z)

                z = Activation('relu')(z)
                z = Conv2D(64, kernel_size=(5,5), strides=(1,1), padding='same')(z)
                z = BatchNormalization()(z)
                
                # The sum of the residue and the new computation
                y = Add()([x, z])
                
                # Add the residual block
                blk = Model(x, y)
                kernel.add(blk)
            
            # Reduce dimension
            kernel.add(MaxPooling2D((2,2)))
        
        kernel.add(Flatten())

        kernel.add(Dense(128, activation='relu'))

        model = Sequential(name='single_image')

        model.add(kernel)
        model.add(Dense(5, activation='softmax'))

        super().__init__(model, train, test)

    def compile(self):
        self.model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

class UnionModel(KaggleModel):
    def __init__(self, models, train, test, freeze=True):
        
        xs = []
        ys = []
        for model in models:
            model = model.model
            x = Input(shape=model.input_shape[1:], name='{}_in'.format(model.name))
            # Get the first layer of the model. This is the encoder
            layer = model.get_layer(index=0)
            
            # It must not be trainable
            if freeze: layer.trainable = False
            # The output only utilizes the encoder component
            y = layer(x)
            
            # Output should be flat
            if len(y.shape) > 2:
                y = Flatten()(y)
                
            # Save values
            xs.append(x)
            ys.append(y)

        y = Concatenate()(ys)

        y = Dense(128)(y)
        y = BatchNormalization()(y)

        y = Dense(5, activation='softmax')(y)
        
        model = Model(xs, y)

        super().__init__(model, train, test)

    def compile(self):
        self.model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


### Training Procedure

In [0]:
def train_model(mdl, X, Y, epochs=32):
    # Shuffle the data
    shuffle(X, Y)

    # One hot encode the output
    Y = to_categorical(Y)

    # Validation split
    (X_train, Y_train), (X_valid, Y_valid) = split(X, Y)

    print('Training points:', len(Y_train))
    print('Validation points:', len(Y_valid))
    print('Total points:', len(Y))

    clf = mdl((X_train, Y_train), (X_valid, Y_valid))

    # Build the model
    clf.compile()

    # Fit to the data
    clf.train(epochs=epochs)

    return clf

### Model Training

Models are trained one by one.

In [13]:
# Attribute model data
X_attr, Y_attr = convert_for_single_axis(X, Y, ax=0)

# Image-free model
attr_clf = ImageFreeModel

# Train the model
attr_clf = train_model(attr_clf, X_attr, Y_attr, epochs=32)

Training points: 11995
Validation points: 2998
Total points: 14993
Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
image_free_encoder (Sequenti (None, 128)               3428      
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 325       
Total params: 12,009
Trainable params: 11,959
Non-trainable params: 50
_________________________________________________________________
Instructions for updating:
Use tf.cast instead.
Train on 11995 samples, validate on 2998 samples
Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 

In [0]:
# Create inputs for convolutional model
X_conv, Y_conv = convert_for_all(X, Y)
X_conv = X_conv[1]
print(X_conv.shape)

from keras.preprocessing.image import ImageDataGenerator

# Image generator for training
def make_generator(X):
    gen = ImageDataGenerator(
            rotation_range=40,
            zoom_range=0.2,
            shear_range=0.2,
            width_shift_range=0.2,
            height_shift_range=0.2,
            fill_mode='nearest',
            horizontal_flip=True,
    )
    gen.fit(X)
    return gen

Y_conv = to_categorical(Y_conv)

# Validation split
(X_train, Y_train), (X_valid, Y_valid) = split(X_conv, Y_conv)
print('Training points:', len(Y_train))
print('Validation points:', len(Y_valid))
print('Total points:', len(Y))
print(X_train.shape, X_valid.shape)

train_gen = make_generator(X_train).flow(X_train, Y_train, batch_size=10)

# Build a model
conv_clf = SingleImageModel((X_train, Y_train), (X_valid, Y_valid))

# Train the model
conv_clf.compile()
conv_clf.model.fit_generator(train_gen, steps_per_epoch = len(X_train) // 32, epochs=8, validation_data=(X_valid, Y_valid))


del X_conv
del Y_conv

In [0]:
attr_clf.model.predict(X_attr)

In [0]:
# Create inputs for convolutional model
X_conv, Y_conv = convert_for_all(X, Y)
print('Built data')
print([x.shape for x in X_conv])
print(Y_conv.shape)

# Build a model
union_clf = lambda tr, tst: UnionModel([attr_clf, conv_clf], tr, tst, freeze=True)

# Train the model
union_clf = train_model(union_clf, X_conv, Y_conv)

#### Evaluation

In [30]:
# Get the test data
X, ids = load_train_data(is_train=False)
X = X[0]

Images will be loaded from data/test_images/picked_pictures.npy


In [0]:
# Make predictions on the test data
Y = attr_clf.model.predict(X)
Y = np.argmax(Y, -1)

In [32]:
# Save the results to a file
df = pd.DataFrame({
    'PetID': ids,
    'AdoptionSpeed':Y
})

print(df)

df.to_csv('results.csv', index=False)

          PetID  AdoptionSpeed
0     378fcc4fc              4
1     73c10e136              4
2     72000c4c5              4
3     e147a4b9f              4
4     43fbba852              4
5     77a490ec9              4
6     28c4b1b13              4
7     d1eada628              4
8     d134dec34              4
9     bcd464bb8              4
10    4e21958c3              4
11    7b070aed6              4
12    ff8d0708f              4
13    f1e6c9bf3              4
14    248914c05              4
15    948002885              4
16    111e67cd2              4
17    4f4b2ede1              4
18    d77fca061              4
19    ac9fb74b9              4
20    47ef39e7a              4
21    c69ee9807              4
22    62dcf8ecb              4
23    4df1d19d6              4
24    e8ef8455e              4
25    cbd23bc17              2
26    4d1a91ccf              2
27    004ee5cf7              4
28    95fad0a75              4
29    64341b5db              4
...         ...            ...
3918  45