# Submission 2
by Wilsion Lee

## Problem definition

Finding the adoption speed of pets

 ## Import Libraries

In [1]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# sklearn :: models
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

## Load Data

In [2]:
df_train = pd.read_csv("../../Data/Processed/train_cleaned.csv")
print(df_train.columns)
df_train.head()

Index(['Type', 'Age', 'Gender', 'VideoAmt', 'PetID', 'PhotoAmt',
       'AdoptionSpeed', 'Name_phrase_name', 'Name_no_name', 'Name_baby',
       ...
       'State_41332', 'State_41335', 'State_41336', 'State_41342',
       'State_41345', 'State_41361', 'State_41367', 'State_41401',
       'State_41415', 'fee_per_pet'],
      dtype='object', length=1147)


Unnamed: 0,Type,Age,Gender,VideoAmt,PetID,PhotoAmt,AdoptionSpeed,Name_phrase_name,Name_no_name,Name_baby,...,State_41332,State_41335,State_41336,State_41342,State_41345,State_41361,State_41367,State_41401,State_41415,fee_per_pet
0,1,36,2,0,3f8824a3b,1.0,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
1,2,3,1,0,9238eb7fc,1.0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
2,1,3,1,0,7d028bdea,4.0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
3,2,1,1,0,8377bfe97,0.0,2,0,0,0,...,0,0,0,0,0,0,0,1,0,0.0
4,1,3,1,0,965b31ba7,2.0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0


In [3]:
df_test = pd.read_csv("../../Data/Processed/test_cleaned.csv")
print(df_test.columns)
df_test.head()

Index(['Type', 'Age', 'Gender', 'VideoAmt', 'PetID', 'PhotoAmt',
       'Name_phrase_name', 'Name_no_name', 'Name_baby', 'Name_puppy',
       ...
       'State_41332', 'State_41335', 'State_41336', 'State_41342',
       'State_41345', 'State_41361', 'State_41367', 'State_41401',
       'State_41415', 'fee_per_pet'],
      dtype='object', length=1146)


Unnamed: 0,Type,Age,Gender,VideoAmt,PetID,PhotoAmt,Name_phrase_name,Name_no_name,Name_baby,Name_puppy,...,State_41332,State_41335,State_41336,State_41342,State_41345,State_41361,State_41367,State_41401,State_41415,fee_per_pet
0,2,1,1,0,f42161740,10.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
1,1,6,2,0,0118db3a8,2.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
2,1,2,2,0,e5164d828,2.0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0.0
3,1,10,1,0,5335bfb38,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
4,1,12,1,0,ff2cf88a0,2.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0


## Model Testing function

In [4]:
def model_training(model, X_train, y_train, X_test):
    model.fit(X_train, y_train.values.ravel())
    y_pred = model.predict(X_test)
    return(y_pred)

def model_evaluation(title, model, y_test, y_pred):
    # precision = precision_score(y_test, y_pred)
    # recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    
    print(title)
    print(confusion_matrix(y_test, y_pred))
    print('Precision', precision)
    print('Recall', recall)
    
    importance = []
    if hasattr(model, 'feature_importances_'):
        print('Feature Importance')
        importance = []
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.feature_importances_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head(20))
    elif hasattr(model, 'coef_'):
        print('Feature Importance')
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.coef_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head(20))
    print('')
    return (precision, recall)

def run_experiment(title, model, X_train, y_train, X_test, y_test):
    y_pred = model_training(model, X_train, y_train, X_test)
    precision, recall = model_evaluation(title, model, y_test, y_pred)
    return(precision, recall)


## Model Definition

In [5]:
models = [
    ('Naive Bayes', GaussianNB()),
    ('RandomForestClassifier10', RandomForestClassifier(n_estimators=10, random_state=15)),
    ('RandomForestClassifier100', RandomForestClassifier(n_estimators=100, random_state=15)),
    ('KNeighborsClassifier', KNeighborsClassifier(n_neighbors=10)),
    ('DecisionTreeClassifier', DecisionTreeClassifier())  
]

## Execution Function

In [6]:
def executetion(df_data, list_X_column, list_y_column):
    threshold = 0.8
    X = df_data[list_X_column]
    
    y = df_data[list_y_column]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1.0-threshold, shuffle=True)

    print('X_train', X_train.shape)
    print('y_train', y_train.shape)
    print('X_test', X_test.shape)
    print('y_test', y_test.shape)

    results = []
    for m in models:
        precision, recall = run_experiment(m[0], m[1], X_train, y_train, X_test, y_test)
        results.append([m[0], precision, recall])
    
    return results


## Define Columns to Use

In [7]:
y_columns = ["AdoptionSpeed"]
X_column_exclude = ["PetID"]

for column in df_test.columns:
    if -1 != column.find("Name"):
        X_column_exclude.append(column)
    if -1 != column.find("Breed"):
        X_column_exclude.append(column)

X_columns = [ x for x in df_train.columns if (x not in X_column_exclude) & (x not in y_columns)]
print(X_columns)

['Type', 'Age', 'Gender', 'VideoAmt', 'PhotoAmt', 'Probability', 'Age_Y_3', 'Age_M_3', 'Age_M_7', 'Age_M_1', 'Age_M_2', 'Age_M_8', 'Age_M_6', 'Age_Y_1', 'Age_Y_2', 'Age_M_4', 'Age_M_5', 'Age_Y_4', 'Age_M_9', 'Age_Y_6', 'Age_Y_7', 'Age_Y_5', 'Age_Y_9', 'Age_M_10', 'Age_M_0', 'Age_M_11', 'Age_Y_8', 'Age_Y_12', 'Age_Y_10', 'Age_Y_11', 'Age_Y_17', 'Age_Y_21', 'Color_0', 'Color_1', 'Color_2', 'Color_3', 'Color_4', 'Color_5', 'Color_6', 'Color_7', 'MaturitySize_1', 'MaturitySize_2', 'MaturitySize_3', 'MaturitySize_4', 'FurLength_1', 'FurLength_2', 'FurLength_3', 'Vaccinated_1', 'Vaccinated_2', 'Vaccinated_3', 'Dewormed_1', 'Dewormed_2', 'Dewormed_3', 'Sterilized_1', 'Sterilized_2', 'Sterilized_3', 'Health_1', 'Health_2', 'Health_3', 'State_41324', 'State_41325', 'State_41326', 'State_41327', 'State_41330', 'State_41332', 'State_41335', 'State_41336', 'State_41342', 'State_41345', 'State_41361', 'State_41367', 'State_41401', 'State_41415', 'fee_per_pet']


## Run Model

In [8]:
results = executetion(df_train, X_columns, y_columns)

X_train (6190, 74)
y_train (6190, 1)
X_test (1548, 74)
y_test (1548, 1)
Naive Bayes
[[ 46   1   2   1   2]
 [287  13   6   4   8]
 [379  39  11   4   7]
 [319  20  11   4  11]
 [303  26  11   4  29]]
Precision 0.0665374677002584
Recall 0.0665374677002584

RandomForestClassifier10
[[  2  14  14   9  13]
 [  8 103 102  46  59]
 [  3 133 135  85  84]
 [  5  72 120  99  69]
 [  5  73  81  54 160]]
Precision 0.32235142118863047
Recall 0.32235142118863047
Feature Importance
                 0         1
1              Age  0.197152
5      Probability  0.189704
4         PhotoAmt  0.181153
73     fee_per_pet  0.055825
2           Gender  0.044393
0             Type  0.028703
61     State_41326  0.022314
41  MaturitySize_2  0.019870
71     State_41401  0.019055
45     FurLength_2  0.018914
44     FurLength_1  0.017534
40  MaturitySize_1  0.016148
54    Sterilized_2  0.015875
50      Dewormed_1  0.013538
3         VideoAmt  0.013530
47    Vaccinated_1  0.013138
48    Vaccinated_2  0.013121
53   

## Model performance

In [9]:
# sort the results and print as a table
sort_column = "precision"

df_results = pd.DataFrame(results)
df_results.columns = ['model', 'precision', 'recall']
df_results = df_results.sort_values(by=sort_column)
df_results

Unnamed: 0,model,precision,recall
0,Naive Bayes,0.066537,0.066537
4,DecisionTreeClassifier,0.308786,0.308786
1,RandomForestClassifier10,0.322351,0.322351
3,KNeighborsClassifier,0.329457,0.329457
2,RandomForestClassifier100,0.363695,0.363695


## Tunning Selected Model

In [10]:
##threshold = 0.5
##X = df_train[X_columns]
##y = df_train[y_columns]
##X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1.0-threshold, shuffle=True)
##
##model = models[model_index]
##print("Tunning Model {0}".format(model[0]))
##
##model[1].fit(X_train, y_train.values.ravel())
##y_pred_tune = model[1].predict_proba(X_test)
##print(list(y_pred_tune))
##print(len(y_pred_tune))
##
##for i in range(1,10):
##    print(i)
##    y_pred_tune = model[1].predict_proba(X_test)[:,1]
##    y_pred_tune = [1 if x > i/10.0 else 0 for x in y_pred_tune]
##    precision = precision_score(y_test, y_pred_tune, average='micro')
##    recall = recall_score(y_test, y_pred_tune, average='micro')
##    print(confusion_matrix(y_test, y_pred_tune))
##    print('Precision', precision)
##    print('Recall', recall)

## Fit Test Data

In [11]:
model_index = df_results[sort_column].idxmax()
print("Using Model {0}".format(models[model_index][0]))
X_test = df_test[X_columns]
y_pred = models[model_index][1].predict(X_test)
models[model_index][1].predict(X_test)


Using Model RandomForestClassifier100


array([2, 4, 2, ..., 4, 1, 3], dtype=int64)

## Prepare Data for Submission

In [12]:
df_test['AdoptionSpeed'] = y_pred
df_save = df_test[['PetID', 'AdoptionSpeed']]
df_save

Unnamed: 0,PetID,AdoptionSpeed
0,f42161740,2
1,0118db3a8,4
2,e5164d828,2
3,5335bfb38,4
4,ff2cf88a0,4
5,1d13441b9,3
6,7d835cf7c,3
7,577d15fea,3
8,91736f444,1
9,db194aec8,1


## Save Data

In [13]:
df_save.to_csv("../../Data/Modeling/submission_02.csv", index=False)