# Submission 1
by Wilsion Lee

## Problem definition

Finding the adoption speed of pets

 ## Import Libraries

In [1]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# sklearn :: models
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

## Load Data

In [2]:
df_train = pd.read_csv("../../Data/Processed/train_cleaned.csv")
print(df_train.columns)
df_train.head()

Index(['Age', 'VideoAmt', 'PetID', 'PhotoAmt', 'AdoptionSpeed', 'Type_1',
       'Type_2', 'Probability', 'Breed_0', 'Breed_1',
       ...
       'State_41332', 'State_41335', 'State_41336', 'State_41342',
       'State_41345', 'State_41361', 'State_41367', 'State_41401',
       'State_41415', 'fee_per_pet'],
      dtype='object', length=241)


Unnamed: 0,Age,VideoAmt,PetID,PhotoAmt,AdoptionSpeed,Type_1,Type_2,Probability,Breed_0,Breed_1,...,State_41332,State_41335,State_41336,State_41342,State_41345,State_41361,State_41367,State_41401,State_41415,fee_per_pet
0,36,0,3f8824a3b,1.0,4,1,0,6.7e-05,0,0,...,0,0,0,0,0,0,0,0,0,0.0
1,3,0,9238eb7fc,1.0,2,0,1,0.001334,0,0,...,0,0,0,0,0,0,0,0,0,0.0
2,7,0,f0a1f2b90,2.0,4,0,1,6.7e-05,0,0,...,0,0,0,0,0,0,0,0,0,0.0
3,3,0,7d028bdea,4.0,2,1,0,0.088108,0,0,...,0,0,0,0,0,0,0,0,0,0.0
4,1,0,8377bfe97,0.0,2,0,1,6.7e-05,0,0,...,0,0,0,0,0,0,0,1,0,0.0


In [3]:
df_test = pd.read_csv("../../Data/Processed/test_cleaned.csv")
print(df_test.columns)
df_test.head()

Index(['Age', 'VideoAmt', 'PetID', 'PhotoAmt', 'Type_1', 'Type_2',
       'Probability', 'Breed_0', 'Breed_1', 'Breed_2',
       ...
       'State_41332', 'State_41335', 'State_41336', 'State_41342',
       'State_41345', 'State_41361', 'State_41367', 'State_41401',
       'State_41415', 'fee_per_pet'],
      dtype='object', length=240)


Unnamed: 0,Age,VideoAmt,PetID,PhotoAmt,Type_1,Type_2,Probability,Breed_0,Breed_1,Breed_2,...,State_41332,State_41335,State_41336,State_41342,State_41345,State_41361,State_41367,State_41401,State_41415,fee_per_pet
0,1,0,f42161740,10.0,0,1,6.7e-05,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
1,6,0,0118db3a8,2.0,1,0,0.001401,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
2,2,0,e5164d828,2.0,1,0,0.0006,0,0,0,...,0,0,0,0,0,0,0,1,0,0.0
3,10,0,5335bfb38,0.0,1,0,6.7e-05,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
4,12,0,ff2cf88a0,2.0,1,0,0.000133,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0


## Model Testing function

In [4]:
def model_training(model, X_train, y_train, X_test):
    model.fit(X_train, y_train.values)
    y_pred = model.predict(X_test)
    return(y_pred)

def model_evaluation(title, model, y_test, y_pred):
    # precision = precision_score(y_test, y_pred)
    # recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    
    print(title)
    print(confusion_matrix(y_test, y_pred))
    print('Precision', precision)
    print('Recall', recall)
    
    importance = []
    if hasattr(model, 'feature_importances_'):
        print('Feature Importance')
        importance = []
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.feature_importances_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head(10))
    elif hasattr(model, 'coef_'):
        print('Feature Importance')
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.coef_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head(10))
    print('')
    return (precision, recall)

def run_experiment(title, model, X_train, y_train, X_test, y_test):
    y_pred = model_training(model, X_train, y_train, X_test)
    precision, recall = model_evaluation(title, model, y_test, y_pred)
    return(precision, recall)


## Model Definition

In [5]:
models = [
    ('Naive Bayes', GaussianNB()),
    ('RandomForestClassifier10', RandomForestClassifier(n_estimators=10)),
    ('RandomForestClassifier100', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('KNeighborsClassifier', KNeighborsClassifier()),
    ('DecisionTreeClassifier', DecisionTreeClassifier())  
]

## Execution Function

In [6]:
def executetion(df_data, list_X_column, list_y_column):
    threshold = 0.8
    X = df_data[list_X_column]
    
    y = df_data[list_y_column]
#    for column in list_y_column:
#        y = pd.concat([y, pd.get_dummies(y[column], prefix=column)], axis=1)
#        y = y.drop(column, axis=1)

    print(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1.0-threshold, shuffle=True)

    print('X_train', X_train.shape)
    print('y_train', y_train.shape)
    print('X_test', X_test.shape)
    print('y_test', y_test.shape)

    results = []
    for m in models:
        precision, recall = run_experiment(m[0], m[1], X_train, y_train, X_test, y_test)
        results.append([m[0], precision, recall])
    
    return results


## Define Columns to Use

In [7]:
y_columns = ["AdoptionSpeed"]
X_column_exclude = ["PetID"]
X_columns = [ x for x in df_train.columns if (x not in X_column_exclude) & (x not in y_columns)]
print(X_columns)

['Age', 'VideoAmt', 'PhotoAmt', 'Type_1', 'Type_2', 'Probability', 'Breed_0', 'Breed_1', 'Breed_2', 'Breed_3', 'Breed_4', 'Breed_5', 'Breed_7', 'Breed_10', 'Breed_11', 'Breed_14', 'Breed_15', 'Breed_16', 'Breed_17', 'Breed_18', 'Breed_19', 'Breed_20', 'Breed_21', 'Breed_23', 'Breed_24', 'Breed_25', 'Breed_26', 'Breed_31', 'Breed_32', 'Breed_36', 'Breed_39', 'Breed_40', 'Breed_42', 'Breed_44', 'Breed_49', 'Breed_50', 'Breed_56', 'Breed_58', 'Breed_60', 'Breed_61', 'Breed_64', 'Breed_65', 'Breed_69', 'Breed_70', 'Breed_71', 'Breed_72', 'Breed_75', 'Breed_76', 'Breed_78', 'Breed_81', 'Breed_82', 'Breed_83', 'Breed_85', 'Breed_88', 'Breed_93', 'Breed_96', 'Breed_97', 'Breed_98', 'Breed_99', 'Breed_100', 'Breed_102', 'Breed_103', 'Breed_104', 'Breed_105', 'Breed_108', 'Breed_109', 'Breed_111', 'Breed_114', 'Breed_115', 'Breed_117', 'Breed_119', 'Breed_122', 'Breed_123', 'Breed_125', 'Breed_128', 'Breed_129', 'Breed_130', 'Breed_132', 'Breed_139', 'Breed_141', 'Breed_143', 'Breed_145', 'Bree

## Run Model

In [8]:
results = executetion(df_train, X_columns, y_columns)

      AdoptionSpeed
0                 4
1                 2
2                 4
3                 2
4                 2
5                 1
6                 4
7                 4
8                 3
9                 4
10                4
11                4
12                1
13                3
14                3
15                4
16                2
17                1
18                1
19                1
20                2
21                1
22                2
23                4
24                2
25                3
26                3
27                2
28                3
29                4
...             ...
9970              2
9971              1
9972              4
9973              4
9974              4
9975              2
9976              3
9977              2
9978              1
9979              2
9980              4
9981              3
9982              4
9983              2
9984              2
9985              2
9986              3
9987              4


  y = column_or_1d(y, warn=True)


Naive Bayes
[[ 33   4   0  17   2]
 [156  79   0 164  10]
 [162  70   0 299  19]
 [114  41   0 235  13]
 [176  57   0 311  38]]
Precision 0.1925
Recall 0.1925



  


RandomForestClassifier10
[[  2  20  13   6  15]
 [  7 135 112  75  80]
 [ 11 130 169 118 122]
 [  3  73 109 105 113]
 [  8  82 127  90 275]]
Precision 0.343
Recall 0.343
Feature Importance
               0         1
2       PhotoAmt  0.207488
0            Age  0.189010
5    Probability  0.182894
238  fee_per_pet  0.052353
210  FurLength_2  0.020967
209  FurLength_1  0.020434
195     Gender_2  0.020181
194     Gender_1  0.019701
226  State_41326  0.019199
236  State_41401  0.015922



  


RandomForestClassifier100
[[  3  17  12   9  15]
 [  4 119 112  71 103]
 [  6 108 167 119 150]
 [  6  52  99 116 130]
 [  7  71 101  74 329]]
Precision 0.367
Recall 0.367
Feature Importance
               0         1
2       PhotoAmt  0.203179
0            Age  0.189953
5    Probability  0.182220
238  fee_per_pet  0.053723
226  State_41326  0.020697
209  FurLength_1  0.019198
210  FurLength_2  0.018819
194     Gender_1  0.018549
236  State_41401  0.018399
195     Gender_2  0.018342



  


KNeighborsClassifier
[[  6  17  13  12   8]
 [  8 141 126  63  71]
 [  5 161 183 102  99]
 [  5  86 115 103  94]
 [  9  93 140 102 238]]
Precision 0.3355
Recall 0.3355

DecisionTreeClassifier
[[  4  19  16   9   8]
 [ 10 114 108  79  98]
 [ 20 141 162 116 111]
 [ 11  71 104 106 111]
 [ 19 108 118 102 235]]
Precision 0.3105
Recall 0.3105
Feature Importance
               0         1
5    Probability  0.180566
2       PhotoAmt  0.174307
0            Age  0.146756
238  fee_per_pet  0.050136
209  FurLength_1  0.033252
226  State_41326  0.030844
210  FurLength_2  0.029864
194     Gender_1  0.027151
236  State_41401  0.026936
195     Gender_2  0.026867



## Model performance

In [9]:
# sort the results and print as a table
df_results = pd.DataFrame(results)
df_results.columns = ['model', 'precision', 'recall']
df_results = df_results.sort_values(by='precision')
df_results

Unnamed: 0,model,precision,recall
0,Naive Bayes,0.1925,0.1925
4,DecisionTreeClassifier,0.3105,0.3105
3,KNeighborsClassifier,0.3355,0.3355
1,RandomForestClassifier10,0.343,0.343
2,RandomForestClassifier100,0.367,0.367


## Fit Test Data

In [10]:
print("Using Model {0}".format(models[2][0]))
X_test = df_test[X_columns]
y_pred = models[2][1].predict(X_test)

Using Model RandomForestClassifier100


## Prepare Data for Submission

In [12]:
df_test['AdoptionSpeed'] = y_pred
df_save = df_test[['PetID', 'AdoptionSpeed']]
df_save

Unnamed: 0,PetID,AdoptionSpeed
0,f42161740,2
1,0118db3a8,4
2,e5164d828,3
3,5335bfb38,4
4,ff2cf88a0,4
5,1d13441b9,3
6,7d835cf7c,3
7,577d15fea,3
8,91736f444,3
9,db194aec8,2


## Save Data

In [13]:
df_save.to_csv("../../Data/Modeling/submission_01.csv", index=False)