In [1]:
# import
import pandas as pd
import numpy as np
import operator
import re
import os

import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import xgboost as xgb
from sklearn import grid_search
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split

#from lasagne import layers
#from lasagne.nonlinearities import  softmax, rectify
#from lasagne.updates import nesterov_momentum
#from nolearn.lasagne import NeuralNet

# keras neural network model
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import LSTM
from keras.optimizers import SGD

Using TensorFlow backend.


In [2]:
# data 
train = pd.read_csv("data/train_clean_2.csv")
test = pd.read_csv("data/test_clean_2.csv")

In [3]:
train.head(5)

Unnamed: 0,OutcomeType,AgeuponOutcome,Male,Female,SexType_Intact,SexType_Neutered,SexType_Spayed,SexType_Unknown,crosses,mix,...,Color_Brown,Color_num_mix,animal_is_dog,Year,Month,Day,Weekday,Hour,Minute,has_name
0,Return_to_owner,365.0,1,0,0.0,1.0,0.0,0.0,2,1,...,True,2,True,2014,2,12,2,18,22,True
1,Euthanasia,365.0,0,1,0.0,0.0,1.0,0.0,2,1,...,False,1,False,2013,10,13,6,12,44,True
2,Adoption,730.0,1,0,0.0,1.0,0.0,0.0,2,1,...,False,2,True,2015,1,31,5,12,28,True
3,Transfer,21.0,1,0,1.0,0.0,0.0,0.0,2,1,...,False,1,False,2014,7,11,4,19,9,False
4,Transfer,730.0,1,0,0.0,1.0,0.0,0.0,2,0,...,False,1,True,2013,11,15,4,12,52,False


In [4]:
test.head(5)

Unnamed: 0,ID,AgeuponOutcome,Male,Female,SexType_Intact,SexType_Neutered,SexType_Spayed,SexType_Unknown,crosses,mix,...,Color_Brown,Color_num_mix,animal_is_dog,Year,Month,Day,Weekday,Hour,Minute,has_name
0,1,305.0,0,1,1.0,0.0,0.0,0.0,2,1,...,False,2,True,2015,10,12,0,12,15,True
1,2,730.0,0,1,0.0,0.0,1.0,0.0,2,0,...,False,2,True,2014,7,26,5,17,59,True
2,3,365.0,1,0,0.0,1.0,0.0,0.0,2,1,...,True,1,False,2016,1,13,2,12,20,True
3,4,122.0,1,0,1.0,0.0,0.0,0.0,2,1,...,False,1,True,2013,12,28,5,18,12,True
4,5,730.0,1,0,0.0,1.0,0.0,0.0,2,1,...,False,1,True,2015,9,24,3,17,59,True


In [5]:
# split
train_X = train.ix[:, train.columns != "OutcomeType"]
train_Y = train["OutcomeType"]

test_ID = test["ID"]
test_X = test.ix[:, test.columns != "ID"]

# if using the smote data set
test_X.columns = [x.replace(" ",".") for x in test_X.columns]

print(train_X.shape)
print(train_Y.shape)
print(test_X.shape)

train_X = train_X.astype(float)
test_X = test_X.astype(float)

(26728, 279)
(26728,)
(11456, 279)


In [6]:
# function to write submission output
def create_submission(pred):
    sub = pd.DataFrame(pred)
    sub["ID"] = test_ID
    cols = sub.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    sub = sub[cols]
    sub.columns = ["ID","Adoption","Died","Euthanasia","Return_to_owner","Transfer"]
    return sub

In [7]:
def multiclass_log_loss(y_true, y_pred, eps=1e-15):
    """Multi class version of Logarithmic Loss metric.
    https://www.kaggle.com/wiki/MultiClassLogLoss

    idea from this post:
    http://www.kaggle.com/c/emc-data-science/forums/t/2149/is-anyone-noticing-difference-betwen-validation-
    and-leaderboard-error/12209#post12209

    Parameters
    ----------
    y_true : array, shape = [n_samples]
    y_pred : array, shape = [n_samples, n_classes]

    Returns
    -------
    loss : float
    """
    predictions = np.clip(y_pred, eps, 1 - eps)

    # normalize row sums to 1
    predictions /= predictions.sum(axis=1)[:, np.newaxis]

    actual = np.zeros(y_pred.shape)
    rows = actual.shape[0]
    actual[np.arange(rows), y_true.astype(int)] = 1
    vsota = np.sum(actual * np.log(predictions))
    return -1.0 / rows * vsota

## random forest

In [232]:
model_rf = RandomForestClassifier(random_state=1, n_estimators=1000)
model_rf = model_rf.fit(train_X, train_Y)

In [233]:
model_rf_pred = model_rf.predict_proba(test_X)

In [234]:
model_rf_submission = create_submission(model_rf_pred)
model_rf_submission.head(5)

Unnamed: 0,ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer
0,1,0.083,0.001,0.053,0.19,0.673
1,2,0.661,0.0,0.016,0.239,0.084
2,3,0.418,0.002,0.014,0.184,0.382
3,4,0.194,0.004,0.048,0.268,0.486
4,5,0.376,0.001,0.008,0.497,0.118


In [44]:
model_rf_submission.to_csv("submission/model_rf_1.csv",index=False)

## extremely randomized trees

In [268]:
model_erf = ExtraTreesClassifier(n_jobs=-1, 
                                 random_state=123,
                                 n_estimators=1000)
model_erf.fit(train_X,train_Y)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
           oob_score=False, random_state=123, verbose=0, warm_start=False)

In [269]:
model_erf_pred = model_erf.predict_proba(test_X)

In [270]:
model_erf_submission = create_submission(model_erf_pred)
model_erf_submission.head(5)

Unnamed: 0,ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer
0,1,0.089,0.001,0.053,0.12,0.737
1,2,0.699,0.0,0.011,0.189,0.101
2,3,0.533,0.0,0.006,0.189,0.272
3,4,0.106,0.002,0.053,0.327,0.512
4,5,0.438,0.0,0.003,0.422,0.137


In [271]:
model_rf_submission.to_csv("submission/model_erf_1.csv",index=False)

## xgboost

In [8]:
mapping = {
    "Adoption": 0,
    "Died": 1,
    "Euthanasia": 2,
    "Return_to_owner": 3,
    "Transfer": 4
}
train_Y = train_Y.replace(mapping)

In [9]:
# compile datasets
dtrain = xgb.DMatrix(train_X, label=train_Y)
dtest = xgb.DMatrix(test_X)

In [10]:
# parameters
param = {
    'objective': 'multi:softprob',
    'bst:max_depth': 8, 
    'bst:eta': 0.1,
    'silent': 1, 
    'gamma': 0.01,
    # "min_child_weight": 3,
    'num_class': 5,
    'verbose': 1,
    'subsample': 0.8,
    'nthread': 4
}
num_rounds = 200

In [11]:
# train
model_xgb = xgb.train(param, dtrain, num_rounds)

In [12]:
# predict
model_xgb_pred = model_xgb.predict(dtest)

In [13]:
# submission
model_xgb_submission = create_submission(model_xgb_pred)
model_xgb_submission.head(5)

Unnamed: 0,ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer
0,1,0.015095,0.000489,0.032278,0.16061,0.791527
1,2,0.512545,0.00025,0.007197,0.442108,0.0379
2,3,0.446164,0.000734,0.005829,0.151879,0.395395
3,4,0.162331,0.000879,0.007747,0.379517,0.449525
4,5,0.384003,0.000582,0.002657,0.552297,0.060461


In [27]:
# write to csv
# model_xgb_submission.to_csv("submission/model_xgb.csv",index=False)

In [14]:
# feature importance
importance = model_xgb.get_fscore()
importance = sorted(importance.items(), key=operator.itemgetter(1))

In [15]:
df = pd.DataFrame(importance, columns=['feature', 'fscore'])
df['fscore'] = df['fscore'] / df['fscore'].sum()
df = df.sort_values("fscore",ascending=False)
df.head(5)

Unnamed: 0,feature,fscore
172,Minute,0.165567
171,Day,0.12139
170,AgeuponOutcome,0.107929
169,Hour,0.105464
168,Weekday,0.094436


## xgb error analysis

In [17]:
mapping = {
    "Adoption": 0,
    "Died": 1,
    "Euthanasia": 2,
    "Return_to_owner": 3,
    "Transfer": 4
}
train_Y = train_Y.replace(mapping)

In [18]:
# split train-validate
err_train_X, err_val_X, err_train_Y, err_val_Y = train_test_split(train_X, train_Y, test_size=0.20, random_state=123)

In [19]:
# compile datasets
err_dtrain = xgb.DMatrix(err_train_X, label=err_train_Y)
err_dvalidate = xgb.DMatrix(err_val_X)

In [20]:
# parameters
param = {
    'objective': 'multi:softprob',
    'bst:max_depth': 8, 
    'bst:eta': 0.1,
    'silent': 1, 
    'gamma': 0.01,
    'num_class': 5,
    'verbose': 1,
    'subsample': 0.8,
    'nthread': 4
}
num_rounds = 200

In [21]:
# train
model_xgb_err = xgb.train(param, err_dtrain, num_rounds)

In [22]:
# predict
model_xgb_err_pred = model_xgb_err.predict(err_dvalidate)

In [23]:
# prdicted class
model_xgb_err_pred_class = model_xgb_err_pred.argmax(axis=1)

In [24]:
wrong_pred_data = err_val_X[model_xgb_err_pred_class != err_val_Y]
wrong_pred_data.head(5)

Unnamed: 0,AgeuponOutcome,Male,Female,SexType_Intact,SexType_Neutered,SexType_Spayed,SexType_Unknown,crosses,mix,Breed_American_Pit_Bull_Terrier,...,Color_Brown,Color_num_mix,animal_is_dog,Year,Month,Day,Weekday,Hour,Minute,has_name
5887,730.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0,...,0.0,2.0,1.0,2013.0,12.0,12.0,3.0,16.0,51.0,1.0
16717,730.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0,...,0.0,1.0,1.0,2014.0,6.0,26.0,3.0,15.0,28.0,1.0
5391,730.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,...,1.0,2.0,1.0,2013.0,11.0,15.0,4.0,17.0,40.0,1.0
24775,91.5,1.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0,...,0.0,2.0,1.0,2014.0,6.0,26.0,3.0,8.0,57.0,1.0
25499,730.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0,...,1.0,2.0,0.0,2015.0,4.0,25.0,5.0,17.0,2.0,1.0


In [25]:
# check where errors are
pd.crosstab(model_xgb_err_pred_class, err_val_Y, margins=True)

OutcomeType,0,1,2,3,4,All
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1783,5,31,352,322,2493
1,0,3,0,0,0,3
2,6,5,97,11,26,145
3,222,0,65,468,156,911
4,133,24,146,113,1378,1794
All,2144,37,339,944,1882,5346


## xgb_grid_search and xgb_combined

In [42]:
def xgb_grid_search(train_X, train_Y, test_X):
    # parameters
    param = {
        'objective': 'multi:softprob', 
        'silent': 1, 
        'num_class': 5,
        'verbose': 1,
        'nthread': 4 
    }
    max_depths = [6, 8]
    etas = [0.1, 0.2]
    num_rounds = [100, 200]
    gammas = [0.01, 0.02]
    
    # split
    X_train, X_validate, y_train, y_validate = train_test_split(train_X, train_Y, test_size=0.20, random_state=123)
    train_set = xgb.DMatrix(X_train, label=y_train)
    validate_set = xgb.DMatrix(X_validate)
    
    # store result
    result = pd.DataFrame(columns=["max_depth", "eta", "gamma", "num_rounds", "loss"])
    
    # loop
    i = 0
    for i1 in max_depths:
        for i2 in etas:
            for i3 in gammas:
                for i4 in num_rounds:
                    param["bst:max_depths"] = i1
                    param["bst:eta"] = i2
                    param["gamma"] = i3

                    # train
                    temp_model = xgb.train(param, train_set, i4)
                    temp_model_pred = temp_model.predict(validate_set)

                    # calculate loss
                    temp_loss = multiclass_log_loss(y_validate, temp_model_pred)
                    print("%d, %.2f, %d, %d, %f" % (i1, i2, i3, i4, temp_loss))

                    # write
                    result.loc[i] = [i1, i2, i3, i4, temp_loss]
                    i += 1
    
    # return
    return result            

In [43]:
xgb_grid_search_result = xgb_grid_search(train_X, train_Y, test_X)
xgb_grid_search_result.sort_values("loss")[:5]

6, 0.100000.2, 0, 100, 0.751412
6, 0.100000.2, 0, 200, 0.750025
6, 0.100000.2, 0, 100, 0.751302
6, 0.100000.2, 0, 200, 0.749474
6, 0.200000.2, 0, 100, 0.751770
6, 0.200000.2, 0, 200, 0.756768
6, 0.200000.2, 0, 100, 0.751307
6, 0.200000.2, 0, 200, 0.757841
8, 0.100000.2, 0, 100, 0.751412
8, 0.100000.2, 0, 200, 0.750025
8, 0.100000.2, 0, 100, 0.751302
8, 0.100000.2, 0, 200, 0.749474
8, 0.200000.2, 0, 100, 0.751770
8, 0.200000.2, 0, 200, 0.756768
8, 0.200000.2, 0, 100, 0.751307
8, 0.200000.2, 0, 200, 0.757841


Unnamed: 0,max_depth,eta,gamma,num_rounds,loss
3,6.0,0.1,0.02,200.0,0.749474
11,8.0,0.1,0.02,200.0,0.749474
1,6.0,0.1,0.01,200.0,0.750025
9,8.0,0.1,0.01,200.0,0.750025
2,6.0,0.1,0.02,100.0,0.751302


In [74]:
def xgb_combined(train_X, train_Y, test_X):
    # parameters
    param = {
        'objective': 'multi:softprob', 
        'silent': 1, 
        'num_class': 5,
        'verbose': 1,
        'nthread': 4 
    }
    max_depths = [6, 8]
    etas = [0.1, 0.2, 0.3]
    num_rounds = [100, 200, 250]
    gammas = [0.005, 0.01, 0.02]
    
    # split
    #X_train, X_validate, y_train, y_validate = train_test_split(train_X, train_Y, test_size=0.20, random_state=123)
    #train_set = xgb.DMatrix(X_train, label=y_train)
    train_X = xgb.DMatrix(train_X,label=train_Y)
    test_X = xgb.DMatrix(test_X)
    #validate_set = xgb.DMatrix(X_validate)
    
    # store result
    #result = pd.DataFrame(columns=["max_depth", "eta", "gamma", "num_rounds", "loss"])
    probs = None
    
    # loop
    i = 0
    for i1 in max_depths:
        for i2 in etas:
            for i3 in gammas:
                for i4 in num_rounds:
                    param["bst:max_depths"] = i1
                    param["bst:eta"] = i2
                    param["gamma"] = i3

                    # train
                    temp_model = xgb.train(param, train_X, i4)
                    temp_model_pred = temp_model.predict(test_X)

                    # calculate loss
                    #temp_loss = multiclass_log_loss(y_validate, temp_model_pred)
                    #print("%d, %f, %d, %f, %f" % (i1, i2, i3, i4, temp_loss))

                    # write
                    #result.loc[i] = [i1, i2, i3, i4, temp_loss]
                    probs = temp_model_pred if probs is None else probs + temp_model_pred
                    i += 1
                    print(i)
    
    probs /= i
    
    # return
    return probs#, result            

In [None]:
xgb_combined_pred = xgb_combined(train_X, train_Y, test_X)

In [78]:
# submission
xgb_combined_submission = create_submission(xgb_combined_pred)
xgb_combined_submission.head(5)

Unnamed: 0,ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer
0,1,0.018466,0.000602,0.037585,0.194873,0.748474
1,2,0.704308,0.000319,0.006949,0.224233,0.06419
2,3,0.390029,0.001051,0.008756,0.18705,0.413114
3,4,0.230126,0.00062,0.012688,0.245976,0.51059
4,5,0.460555,0.000424,0.004292,0.44404,0.09069


In [79]:
model_xgb_submission.to_csv("submission/model_xgb_combined.csv",index=False)

## keras neural network

In [250]:
input_dim = train_X.shape[1]

In [259]:
model = Sequential()

model.add(Embedding(1000, 128, input_length=input_dim, dropout=0.2))
model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))
model.add(Dense(5))
model.add(Activation('sigmoid'))

In [260]:
sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(
    loss='categorical_crossentropy', 
    optimizer=sgd, 
    metrics=['accuracy']
)

In [262]:
train_Y_keras = to_categorical(train_Y)
model.fit(np.array(train_X), train_Y_keras, nb_epoch=1)

In [133]:
model_pred = model.predict(np.array(test_X))

In [134]:
model_pred_submission = create_submission(model_pred)
model_pred_submission.head(5)

Unnamed: 0,ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer
0,1,0.0,0.0,0.0,0.0,1.0
1,2,0.0,0.0,0.0,0.0,1.0
2,3,0.0,0.0,0.0,0.0,1.0
3,4,0.0,0.0,0.0,0.0,1.0
4,5,0.0,0.0,0.0,0.0,1.0


In [109]:
model_pred_submission.to_csv("submission/model_keras_nn_3.csv",index=False)

## emsemble submissions

In [51]:
all_subs = os.listdir("submission/")[1:]
all_subs = ["submission/"+x for x in all_subs]
print(all_subs)

['submission/combined_submission_1.csv', 'submission/model_xgb_combined.csv', 'submission/model_xgb_gamma_0.01.csv', 'submission/model_xgb_gamma_0.02.csv', 'submission/model_xgb_nround10.csv', 'submission/model_xgb_nround1000.csv', 'submission/model_xgb_nround125.csv', 'submission/model_xgb_r_mine.csv', 'submission/model_xgb_smote_data.csv', 'submission/model_xgb_smote_data_2.csv', 'submission/model_xgb_time_name_initial_nround200.csv', 'submission/model_xgb_time_name_initial_nround200_1.csv', 'submission/model_xgb_time_name_initial_nround200_2.csv', 'submission/model_xgb_time_name_nround200.csv', 'submission/model_xgb_time_nround150.csv', 'submission/model_xgb_train_clean2_1.csv']


In [57]:
def combine_subs(lst):
    num = len(lst)
    result = np.zeros((11456,5))
    for f in all_subs:
        if pd.read_csv(f).ix[0,-1] < 0.75:
            this_df = np.array(pd.read_csv(f).ix[:,1:6])
            result += this_df
        else:
            num -= 1
    result = result / float(num)
    print(result[:5])
    sub = create_submission(result)
    print(sub[:5])
    sub.to_csv("submission/combined_submission_2.csv",index=False)

combine_subs(all_subs)

[[ 0.03542814  0.01860565  0.05643968  0.17548523  0.67562354]
 [ 0.57270935  0.01731569  0.02354728  0.25421036  0.08703435]
 [ 0.3580849   0.01742549  0.02636353  0.16100393  0.36471219]
 [ 0.16082326  0.01646131  0.04220834  0.17344242  0.53159107]
 [ 0.41902036  0.01631312  0.02116244  0.36852462  0.10829876]]
   ID  Adoption      Died  Euthanasia  Return_to_owner  Transfer
0   1  0.035428  0.018606    0.056440         0.175485  0.675624
1   2  0.572709  0.017316    0.023547         0.254210  0.087034
2   3  0.358085  0.017425    0.026364         0.161004  0.364712
3   4  0.160823  0.016461    0.042208         0.173442  0.531591
4   5  0.419020  0.016313    0.021162         0.368525  0.108299
