### Data Processing

In [1]:
import os
from PIL import Image
import numpy as np
import pandas as pd
from glob import glob
import json

from tqdm import tqdm

# Columns copied into the feature vector as plain numbers, in this order.
numeric_cols = [
    'Age',
    'Quantity',
    'Fee',
    'State',
    'VideoAmt',
    'PhotoAmt',
    'Type',
    "SentimentMagnitude",
    "SentimentScore",
    "NumSentences",
]

# Categorical columns mapped to the number of classes each may take.
# 'Type' (2 classes) is intentionally left out here; it is listed in
# numeric_cols instead.
one_hot_cols = dict(
    Breed1=307,
    Breed2=307,
    Gender=3,
    Color1=7,
    Color2=7,
    Color3=7,
    MaturitySize=5,
    FurLength=4,
    Vaccinated=3,
    Dewormed=3,
    Sterilized=3,
    Health=4,
    State=15,
)

def one_hot_encode(df, col, num_class=None, labels=None, inplace=False):
    ''' One-hot encodes column `col` of dataframe `df`.

        For example, use as follows
        for col, num_class in data.one_hot_cols.items():
            one_hot_encode(train_df, col, num_class)

        df        - dataframe holding the column
        col       - name of the column to encode
        num_class - width of the encoding. When omitted it is the number of
                    distinct non-null values in the column, widened to
                    len(labels) if `labels` is longer.
        labels    - optional explicit class values; output column i
                    corresponds to labels[i]. Without it, the sorted distinct
                    values found in the column are used.
        inplace   - if True, modify `df` and return None; otherwise leave
                    `df` untouched and return the encoding.

        When num_class is 2 the result is a single 0/1 indicator that is 1
        where the column equals its smallest observed value. Otherwise the
        result is a float array of shape (len(df), num_class); with
        inplace=True one '<col>_<value>' integer column is added per class
        and the original column is dropped.

        Passing an explicit num_class smaller than len(labels) with
        inplace=False raises IndexError.
    '''
    # Distinct non-null values actually present in the data, sorted
    column_values = np.sort(df[col].dropna().unique())
    if num_class is None:
        num_class = len(column_values)
        if labels is not None:
            # Explicit labels may name classes that are absent from this
            # dataframe; the encoding must still have a slot for each.
            num_class = max(num_class, len(labels))

    if num_class == 2:
        # These can just be boolean
        indicator = (df[col] == column_values[0]).astype(int)
        if inplace:
            df[col] = indicator
            return None
        return indicator

    # Class values in output-column order; observed values are capped at
    # num_class so the index never runs past the encoding width.
    categories = labels if labels is not None else column_values[:num_class]

    if inplace:
        for value in categories:
            df[col + '_' + str(value)] = (df[col] == value).astype(int)
        # delete original column
        df.drop(col, axis=1, inplace=True)
        return None

    res = np.zeros((len(df), num_class))
    for i, value in enumerate(categories):
        one_hot = np.zeros(num_class)
        one_hot[i] = 1
        # Overwrite the full row for every sample holding this value
        res[np.asarray(df[col] == value)] = one_hot
    return res

def load_data(fname):
    """ Reads the CSV at `fname` and returns it as a pandas DataFrame. """
    frame = pd.read_csv(fname)
    return frame

def get_sentiment(df, sentiment_location):
    ''' Parses the text sentiment metadata and adds a few additional
        metrics to the specified dataframe.

        df                 - dataframe with a "PetID" column; modified in place
        sentiment_location - directory holding one JSON file per pet, named
                             '<PetID>.<anything>'

        For every row whose PetID has a sentiment file, the columns
        "SentimentMagnitude", "SentimentScore" and "NumSentences" are filled
        from the file's 'documentSentiment' entry and the length of its
        'sentences' list. Rows without a file are left as they were (NaN if
        the column is newly created). If the directory has no files the
        dataframe is not touched. Returns None.
    '''
    sentiment_files = glob(os.path.join(sentiment_location, "*"))

    # Parse every file once up front; the dataframe is updated afterwards in
    # a single pass instead of building a full-column mask per file.
    parsed = {}
    for s_file in tqdm(sentiment_files, desc='Loading sentiment data'):
        # basename() copes with either path separator
        pet_id = os.path.basename(s_file).split('.')[0]
        with open(s_file) as json_file:
            data = json.load(json_file)
        parsed[pet_id] = (
            data['documentSentiment']['magnitude'],
            data['documentSentiment']['score'],
            len(data['sentences']),
        )

    if not parsed:
        return

    matched = df["PetID"].isin(list(parsed))
    matched_ids = df.loc[matched, "PetID"]
    columns = ("SentimentMagnitude", "SentimentScore", "NumSentences")
    for position, column in enumerate(columns):
        if column not in df.columns:
            df[column] = np.nan
        if matched.any():
            df.loc[matched, column] = [parsed[pet_id][position] for pet_id in matched_ids]

def load_pet_files(regdir):
    """ Extracts all of the files associated with each pet listed
    by the 'PetID' tag.

    regdir - The directory containing the files

    returns a dictionary mapping each PetID k to the list of loaded images
    whose file name starts with 'k-'. File names without a '-' are skipped.

    If 'picked_pictures.npy' exists inside regdir, its contents are returned
    via np.load instead of re-reading the images.
    """
    # Join as separate components so regdir works with or without a
    # trailing path separator.
    cache_path = os.path.join(regdir, 'picked_pictures.npy')
    if os.path.isfile(cache_path):
        print("Images loaded from existing file")
        # NOTE(review): np.load returns an ndarray, not a dict; if the cache
        # stores a pickled dict it would need allow_pickle=True and .item().
        # Confirm how the cache file is written.
        return np.load(cache_path)
    pfiles = {}

    # Extract the pet names
    for f in tqdm(os.listdir(regdir), desc='Loading Pet Files'):
        # Everything before the first '-' is the PetID
        n, dash, _ = f.partition('-')
        if not dash:
            # Not a '<PetID>-<suffix>' file
            continue
        url = os.path.join(regdir, f)
        pfiles.setdefault(n, []).append(load_image(url))

    return pfiles

def load_train_data(is_train = True):
    """ Loads one data split and converts it into model-ready arrays.

    is_train - True reads the training files, False reads the test files.

    Reads the split's CSV, '../input/state_labels.csv' and the split's
    sentiment directory, all relative to '../input'.

    returns (X, Y) where
      X - list [X_num, X_pic]. X_num has one row per pet: the numeric_cols
          values followed by a boolean indicator per StateID. X_pic has one
          entry per pet; picture loading is currently disabled, so every
          entry is an empty list.
      Y - AdoptionSpeed per pet when is_train is True, otherwise the PetID.
    """
    # Get the annotations for each pet
    dta = load_data('../input/train/train.csv' if is_train else '../input/test/test.csv')

    # Get the pet pictures
    # Disabled: load_pet_files('../input/train_images/' if is_train else '../input/test_images')
    petpics = {}

    # Get the state ids
    states = load_data('../input/state_labels.csv')['StateID'].tolist()

    # Load parsed sentiment
    get_sentiment(dta, '../input/{}_sentiment/'.format('train' if is_train else 'test'))

    # NaN will become zero
    dta = dta.fillna(0)

    X_num = []
    X_pic = []
    Y = []

    # Training rows yield the label, test rows yield the id for the submission
    target_col = 'AdoptionSpeed' if is_train else 'PetID'

    # total lets tqdm show a real progress bar for the row generator
    for _, row in tqdm(dta.iterrows(), total=len(dta), desc='Processing data rows'):
        # Only 'State' is one-hot encoded here; the other categorical
        # columns listed in one_hot_cols are not part of the feature vector.
        state = [x == row['State'] for x in states]

        # Numeric values first, then the state indicator
        X_num.append(np.array(list(row[numeric_cols]) + state))

        # Save the pictures (empty list when none were loaded for this pet)
        X_pic.append(petpics.get(row['PetID'], []))

        Y.append(row[target_col])

    X = [np.array(X_num), np.array(X_pic)]
    Y = np.array(Y)

    return (X, Y)

def load_image(img_file, size=64):
    """ Loads an image file and returns it resized to size x size pixels.

    img_file - path (or file object) accepted by PIL's Image.open
    size     - edge length in pixels of the square output

    returns the resized image as a numpy array. The image is not converted
    to a fixed colour mode, so the channel layout follows the source file.
    """
    # The context manager closes the underlying file handle once the pixel
    # data has been copied into the array.
    with Image.open(img_file) as img:
        # LANCZOS is the same filter as the ANTIALIAS alias, which was
        # removed in Pillow 10.
        resized = img.resize((size, size), Image.LANCZOS)
        return np.array(resized)

In [2]:
from keras.utils import to_categorical

import numpy as np

# Get the data
# X is the list [numeric features, pictures]; Y holds the AdoptionSpeed labels
X, Y = load_train_data()

Using TensorFlow backend.
Loading sentiment data: 100%|██████████| 14442/14442 [03:30<00:00, 68.47it/s]
Processing data rows: 14993it [00:18, 799.36it/s]


In [3]:
# Inspect what was loaded: shape of each input, shape of the labels, and the
# first sample's feature vector and label
print([x.shape for x in X])
print(Y.shape)
print(X[0][0])
print(Y[0])

# Replace any NaN in the numeric feature matrix, then confirm none remain
X[0] = np.nan_to_num(X[0])
print('nan:', np.count_nonzero(np.isnan(X[0])))

[(14993, 25), (14993, 0)]
(14993,)
[3.0000e+00 1.0000e+00 1.0000e+02 4.1326e+04 0.0000e+00 1.0000e+00
 2.0000e+00 2.4000e+00 3.0000e-01 6.0000e+00 0.0000e+00 0.0000e+00
 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00
 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 0.0000e+00 1.0000e+00
 0.0000e+00]
2
nan: 0


In [4]:
def shuffle(X, Y, seed=None):
    """ Returns X and Y reordered by the same random permutation.

    X    - an indexable array, or a list of such arrays that all share Y's
           first dimension
    Y    - array of targets
    seed - optional integer for a reproducible permutation; when omitted the
           global numpy random state is used

    returns (X, Y) as new objects; the inputs are not modified, so callers
    must use the returned pair.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    # One permutation shared by every array keeps samples aligned
    idxs = rng.permutation(len(Y))

    if isinstance(X, list):
        # Build a new list rather than writing into the caller's list
        X = [x[idxs] for x in X]
    else:
        X = X[idxs]

    Y = Y[idxs]

    return X, Y

def split(X, Y, s=0.2):
    """ Splits the data into a leading training part and a trailing test part.

    X - an array, or a list of arrays that share Y's first dimension
    Y - array of targets
    s - fraction of the samples (rounded down) placed in the test part

    returns ((X_train, Y_train), (X_test, Y_test)). The last int(s * len(Y))
    samples form the test part; when that count is zero the test part is
    empty and everything stays in the training part.
    """
    n_test = int(s * len(Y))
    # Slice from an explicit boundary: a negative-index slice such as
    # X[:-0] would be empty and X[-0:] would be the whole array.
    cut = len(Y) - n_test

    if isinstance(X, list):
        X_train = [x[:cut] for x in X]
        X_test = [x[cut:] for x in X]
    else:
        X_train = X[:cut]
        X_test = X[cut:]

    Y_train = Y[:cut]
    Y_test = Y[cut:]

    return (X_train, Y_train), (X_test, Y_test)


def convert_for_all(X, Y, img_shape=(64, 64, 3)):
    """ Expands the data so that every image becomes its own datapoint.

    X         - pair [attributes, pictures]; pictures[i] is the (possibly
                empty) collection of images for sample i
    Y         - targets, one per sample
    img_shape - shape of the all-zero placeholder image used for samples
                that have no pictures

    returns (X, Y) where X is [attributes, images] and Y the targets, all
    with one entry per emitted datapoint. A sample with k > 0 images yields
    k datapoints that repeat its attributes and target; a sample with no
    images yields one datapoint with the placeholder image.
    """
    # For single image training, make a datapoint for each image or the default zero image
    Xs = [[], []]
    Ys = []
    for i in range(len(X[1])):
        if len(X[1][i]) == 0:
            Xs[0].append(X[0][i])
            Xs[1].append(np.zeros(img_shape))
            Ys.append(Y[i])
        else:
            # Make a datapoint for all images. We assume equal relevance of images
            for img in X[1][i]:
                # Exactly one attribute row per image keeps the three
                # output sequences the same length.
                Xs[0].append(X[0][i])
                Xs[1].append(img)
                Ys.append(Y[i])

    X = list(map(np.array, Xs))
    Y = np.array(Ys)

    return X, Y

def convert_for_single_axis(X, Y, ax=0):
    """ Picks input number `ax` out of X and pairs it with the unchanged Y. """
    selected = X[ax]
    return selected, Y


In [5]:
# Pass inputs and labels through shuffle() together so they stay index-aligned
X, Y = shuffle(X, Y)

### Architectures

In [6]:
from keras.models import Sequential, Model
from keras.layers import *
from keras.callbacks import ModelCheckpoint

class KaggleModel:
    """ Thin wrapper pairing a model with its training and validation data.

    Subclasses build the network and implement compile().
    """

    def __init__(self, model, train, test):
        # Print the architecture as soon as the wrapper is created
        model.summary()
        self.model, self.train_data, self.test_data = model, train, test

    def compile(self):
        """ Compiles the wrapped model; subclasses must override this.
        """
        raise NotImplementedError

    def train(self, epochs=1):
        """ Fits the wrapped model on the stored training pair, validating
        against the stored test data and checkpointing to 'model.h5'.
        """
        features, targets = self.train_data
        saver = ModelCheckpoint('model.h5')

        self.model.fit(
            features,
            targets,
            epochs=epochs,
            validation_data=self.test_data,
            callbacks=[saver],
        )

    def predict(self, X):
        """ Returns the wrapped model's predictions for X. """
        predictions = self.model.predict(X)
        return predictions


def ResidualBlock(mdl):
    """ Wraps `mdl` in a new Model whose output is the input plus mdl(input).

    The input layer takes mdl's input shape without the batch dimension.
    """
    inputs = Input(shape=mdl.input_shape[1:])
    branch = mdl(inputs)
    merged = Add()([inputs, branch])
    return Model(inputs, merged)

class ImageFreeModel(KaggleModel):
    """ Dense classifier that uses only the numeric attribute vector.

    Architecture: batch normalisation -> Dense(128, relu) -> Dense(64, relu)
    -> Dense(num_classes, softmax).
    """

    def __init__(self, train, test, input_dim=25, num_classes=5):
        """ Builds the network and stores the data.

        train       - (X_train, Y_train) pair used for fitting
        test        - (X_valid, Y_valid) pair used for validation
        input_dim   - length of each attribute vector (default 25)
        num_classes - number of output classes (default 5)
        """
        kernel = Sequential(name='image_free_encoder')

        kernel.add(BatchNormalization(input_shape=(input_dim,)))

        kernel.add(Dense(128))
        kernel.add(Activation('relu'))

        model = Sequential(name='image_free')
        model.add(kernel)

        model.add(Dense(64, activation='relu'))

        # Labels are one of [0, 1, 2, 3, 4] with the default num_classes
        model.add(Dense(num_classes, activation='softmax'))

        # Build using the built model
        super().__init__(model, train, test)

    def compile(self):
        """ Compiles with Adam and categorical cross-entropy, so targets are
        expected to be one-hot encoded; accuracy is tracked as a metric.
        """
        self.model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


### Training Procedure

In [7]:
def train_model(mdl, X, Y, epochs=32):
    """ Shuffles, splits and trains a model on the given data.

    mdl    - a KaggleModel subclass (the class, not an instance); it is
             instantiated here with the train and validation pairs
    X      - model input(s)
    Y      - integer class labels; one-hot encoded before training
    epochs - number of training epochs

    returns the compiled and trained model wrapper.
    """
    # Shuffle the data. The returned pair must be kept: an ndarray X cannot
    # be reordered in place by the helper.
    X, Y = shuffle(X, Y)

    # One hot encode the output
    Y = to_categorical(Y)

    # Validation split
    (X_train, Y_train), (X_valid, Y_valid) = split(X, Y)

    print('Training points:', len(Y_train))
    print('Validation points:', len(Y_valid))
    print('Total points:', len(Y))

    clf = mdl((X_train, Y_train), (X_valid, Y_valid))

    # Build the model
    clf.compile()

    # Fit to the data
    clf.train(epochs=epochs)

    return clf

### Model Training

Train the model.

In [8]:
# Attribute model data
# ax=0 selects X[0], the numeric feature matrix; the labels pass through unchanged
X_attr, Y_attr = convert_for_single_axis(X, Y, ax=0)

# Image-free model
# The class itself is passed on; train_model instantiates it with the data split
attr_clf = ImageFreeModel

# Train the model
# attr_clf is rebound from the class to the trained instance train_model returns
attr_clf = train_model(attr_clf, X_attr, Y_attr, epochs=16)

Training points: 11995
Validation points: 2998
Total points: 14993
Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
image_free_encoder (Sequenti (None, 128)               3428      
_________________________________________________________________
dense_2 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 325       
Total params: 12,009
Trainable params: 11,959
Non-trainable params: 50
_________________________________________________________________
Instructions for updating:
Use tf.cast instead.
Train on 11995 samples, validate on 2998 samples
Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 

#### Evaluation

In [9]:
# Get the test data
# With is_train=False the second return value holds PetIDs rather than labels
X, ids = load_train_data(is_train=False)
# Keep only the numeric feature matrix, the input the attribute model was trained on
X = X[0]

Loading sentiment data: 100%|██████████| 3865/3865 [00:45<00:00, 84.59it/s]
Processing data rows: 3972it [00:04, 803.81it/s]


In [10]:
# Make predictions on the test data
# One row of class scores per sample, produced by the softmax output layer
Y = attr_clf.model.predict(X)
# Index of the highest score along the last axis is the predicted class
Y = np.argmax(Y, -1)

In [11]:
# Save the results to a file
# One row per test pet: its PetID and the predicted AdoptionSpeed class index
df = pd.DataFrame({
    'PetID': ids,
    'AdoptionSpeed':Y
})

print(df)

# index=False keeps the DataFrame's row index out of the written CSV
df.to_csv('submission.csv', index=False)

          PetID  AdoptionSpeed
0     e2dfc2935              4
1     f153b465f              4
2     3c90f3f54              2
3     e02abc8a3              4
4     09f0df7d1              4
5     0487529d4              2
6     bae7c4b1c              4
7     548bcf206              2
8     0f82cea1e              4
9     a3787f15e              4
10    0113cedff              4
11    0070b950a              4
12    cbe2df167              4
13    37a0b72a4              4
14    669695dd6              4
15    be85036be              2
16    c2bbbdde2              4
17    6a968e033              4
18    3107d0aa7              4
19    9e11f1974              4
20    14bc519cd              4
21    cee2cdc6b              4
22    12799a2af              1
23    34af29aab              4
24    5bd1a9042              4
25    04d1a03dc              4
26    cdac529a3              4
27    6b593bbe2              4
28    9992b9fce              3
29    8c03c9a3b              4
...         ...            ...
3942  9c