# Submission 4
by Wilsion Lee

## Problem definition

Finding the adoption speed of pets

 ## Import Libraries

In [1]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# sklearn :: models
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

## Load Data

In [2]:
df_train = pd.read_csv("../../Data/Processed/train_cleaned.csv")
print(df_train.columns)
df_train.head()

Index(['Type', 'Age', 'Gender', 'VideoAmt', 'PetID', 'PhotoAmt',
       'AdoptionSpeed', 'Name_phrase_name', 'Name_no_name', 'Name_baby',
       ...
       'State_41336', 'State_41342', 'State_41345', 'State_41361',
       'State_41367', 'State_41401', 'State_41415', 'fee_per_pet', 'has_fee',
       'more_than_1'],
      dtype='object', length=1150)


Unnamed: 0,Type,Age,Gender,VideoAmt,PetID,PhotoAmt,AdoptionSpeed,Name_phrase_name,Name_no_name,Name_baby,...,State_41336,State_41342,State_41345,State_41361,State_41367,State_41401,State_41415,fee_per_pet,has_fee,more_than_1
0,1,36,1,0,3f8824a3b,1.0,4,0,0,0,...,0,0,0,0,0,0,0,0.0,0,0
1,2,3,0,0,9238eb7fc,1.0,2,0,0,0,...,0,0,0,0,0,0,0,0.0,0,0
2,2,7,0,0,f0a1f2b90,2.0,4,0,0,0,...,0,0,0,0,0,0,0,0.0,0,1
3,1,3,0,0,7d028bdea,4.0,2,0,0,0,...,0,0,0,0,0,0,0,0.0,0,0
4,2,1,0,0,8377bfe97,0.0,2,0,0,0,...,0,0,0,0,0,1,0,0.0,0,0


In [3]:
df_test = pd.read_csv("../../Data/Processed/test_cleaned.csv")
print(df_test.columns)
df_test.head()

Index(['Type', 'Age', 'Gender', 'VideoAmt', 'PetID', 'PhotoAmt',
       'Name_phrase_name', 'Name_no_name', 'Name_baby', 'Name_puppy',
       ...
       'State_41336', 'State_41342', 'State_41345', 'State_41361',
       'State_41367', 'State_41401', 'State_41415', 'fee_per_pet', 'has_fee',
       'more_than_1'],
      dtype='object', length=1149)


Unnamed: 0,Type,Age,Gender,VideoAmt,PetID,PhotoAmt,Name_phrase_name,Name_no_name,Name_baby,Name_puppy,...,State_41336,State_41342,State_41345,State_41361,State_41367,State_41401,State_41415,fee_per_pet,has_fee,more_than_1
0,2,1,0,0,f42161740,10.0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0,0
1,1,6,1,0,0118db3a8,2.0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0,0
2,1,2,1,0,e5164d828,2.0,0,0,0,0,...,0,0,0,0,0,1,0,0.0,0,0
3,1,10,0,0,5335bfb38,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0,0
4,1,12,0,0,ff2cf88a0,2.0,0,0,0,0,...,0,0,0,0,0,0,0,0.0,0,0


## Model Testing function

In [4]:
def model_training(model, X_train, y_train, X_test):
    model.fit(X_train, y_train.values.ravel())
    y_pred = model.predict(X_test)
    return(y_pred)

def model_evaluation(title, model, y_test, y_pred):
    # precision = precision_score(y_test, y_pred)
    # recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    
    print(title)
    print(confusion_matrix(y_test, y_pred))
    print('Precision', precision)
    print('Recall', recall)
    
    importance = []
    if hasattr(model, 'feature_importances_'):
        print('Feature Importance')
        importance = []
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.feature_importances_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head(10))
    elif hasattr(model, 'coef_'):
        print('Feature Importance')
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.coef_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head(10))
    print('')
    return (precision, recall)

def run_experiment(title, model, X_train, y_train, X_test, y_test):
    y_pred = model_training(model, X_train, y_train, X_test)
    precision, recall = model_evaluation(title, model, y_test, y_pred)
    return(precision, recall)


## Model Definition

In [5]:
models = [
    ('Naive Bayes', GaussianNB()),
    ('RandomForestClassifier10', RandomForestClassifier(n_estimators=10, random_state=15)),
    ('RandomForestClassifier200', RandomForestClassifier(n_estimators=200, random_state=15)),
    ('KNeighborsClassifier', KNeighborsClassifier(n_neighbors=16)),
    ('DecisionTreeClassifier', DecisionTreeClassifier())  
]

## Execution Function

In [6]:
def executetion(df_data, list_X_column, list_y_column):
    threshold = 0.8
    X = df_data[list_X_column]
    
    y = df_data[list_y_column]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1.0-threshold, shuffle=True)

    print('X_train', X_train.shape)
    print('y_train', y_train.shape)
    print('X_test', X_test.shape)
    print('y_test', y_test.shape)

    results = []
    for m in models:
        precision, recall = run_experiment(m[0], m[1], X_train, y_train, X_test, y_test)
        results.append([m[0], precision, recall])
    
    return results


## Define Columns to Use

In [7]:
y_columns = ["AdoptionSpeed"]
X_column_exclude = ["PetID", "Probability"]

for column in df_test.columns:
    if -1 != column.find("Name_"):
        X_column_exclude.append(column)
        
X_columns = [ x for x in df_train.columns if (x not in X_column_exclude) & (x not in y_columns)]
print(X_columns)

['Type', 'Age', 'Gender', 'VideoAmt', 'PhotoAmt', 'has_a_name', 'Age_Y_3', 'Age_M_3', 'Age_M_7', 'Age_M_1', 'Age_M_2', 'Age_M_8', 'Age_M_6', 'Age_Y_1', 'Age_Y_2', 'Age_M_4', 'Age_M_5', 'Age_Y_4', 'Age_M_9', 'Age_Y_6', 'Age_Y_7', 'Age_Y_5', 'Age_Y_9', 'Age_M_10', 'Age_M_0', 'Age_M_11', 'Age_Y_8', 'Age_Y_12', 'Age_Y_10', 'Age_Y_11', 'Age_Y_17', 'Age_Y_21', 'Breed_0', 'Breed_1', 'Breed_2', 'Breed_3', 'Breed_4', 'Breed_5', 'Breed_7', 'Breed_10', 'Breed_11', 'Breed_14', 'Breed_15', 'Breed_16', 'Breed_17', 'Breed_18', 'Breed_19', 'Breed_20', 'Breed_21', 'Breed_23', 'Breed_24', 'Breed_25', 'Breed_26', 'Breed_31', 'Breed_32', 'Breed_36', 'Breed_39', 'Breed_40', 'Breed_42', 'Breed_44', 'Breed_49', 'Breed_50', 'Breed_56', 'Breed_58', 'Breed_60', 'Breed_61', 'Breed_64', 'Breed_65', 'Breed_69', 'Breed_70', 'Breed_71', 'Breed_72', 'Breed_75', 'Breed_76', 'Breed_78', 'Breed_81', 'Breed_82', 'Breed_83', 'Breed_85', 'Breed_88', 'Breed_93', 'Breed_96', 'Breed_97', 'Breed_98', 'Breed_99', 'Breed_100', '

## Run Model

In [8]:
results = executetion(df_train, X_columns, y_columns)

X_train (8000, 264)
y_train (8000, 1)
X_test (2000, 264)
y_test (2000, 1)
Naive Bayes
[[ 21  30   0   4   4]
 [ 88 279   0  23  10]
 [133 308   0  68  17]
 [ 92 247   0  77  15]
 [174 275   0  87  48]]
Precision 0.2125
Recall 0.2125

RandomForestClassifier10
[[  2  21  19   6  11]
 [  9 123 121  52  95]
 [ 12  88 189 112 125]
 [  5  81 128 115 102]
 [  8  80 138 105 253]]
Precision 0.341
Recall 0.341
Feature Importance
               0         1
4       PhotoAmt  0.248343
1            Age  0.245691
2         Gender  0.068481
261  fee_per_pet  0.042582
0           Type  0.025371
263  more_than_1  0.023256
5     has_a_name  0.022848
232  FurLength_1  0.021068
233  FurLength_2  0.020457
249  State_41326  0.019122

RandomForestClassifier200
[[  3  14  22   7  13]
 [  4 119 127  45 105]
 [  9  78 175 104 160]
 [  3  73 117 119 119]
 [  6  64 113  77 324]]
Precision 0.37
Recall 0.37
Feature Importance
               0         1
4       PhotoAmt  0.250374
1            Age  0.231166
2         

## Model performance

In [9]:
# sort the results and print as a table
sort_column = "precision"

df_results = pd.DataFrame(results)
df_results.columns = ['model', 'precision', 'recall']
df_results = df_results.sort_values(by=sort_column)
df_results

Unnamed: 0,model,precision,recall
0,Naive Bayes,0.2125,0.2125
4,DecisionTreeClassifier,0.322,0.322
1,RandomForestClassifier10,0.341,0.341
3,KNeighborsClassifier,0.347,0.347
2,RandomForestClassifier200,0.37,0.37


## Fit Test Data

In [10]:
model_index = df_results[sort_column].idxmax()
print("Using Model {0}".format(models[model_index][0]))
X_test = df_test[X_columns]
y_pred = models[model_index][1].predict(X_test)

Using Model RandomForestClassifier200


## Prepare Data for Submission

In [11]:
df_test['AdoptionSpeed'] = y_pred
df_save = df_test[['PetID', 'AdoptionSpeed']]
df_save

Unnamed: 0,PetID,AdoptionSpeed
0,f42161740,2
1,0118db3a8,4
2,e5164d828,2
3,5335bfb38,4
4,ff2cf88a0,4
5,1d13441b9,3
6,7d835cf7c,1
7,577d15fea,4
8,91736f444,4
9,db194aec8,2


## Save Data

In [12]:
df_save.to_csv("../../Data/Modeling/submission_04.csv", index=False)