# Submission 5
by Wilsion Lee

## Problem definition

Predict the adoption speed of each pet: a five-class classification of the `AdoptionSpeed` column.

## Import Libraries

In [1]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# sklearn :: models
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

## Load Data

In [2]:
# Read the cleaned training set, print its column index, and preview the first rows.
df_train = pd.read_csv("../../Data/Processed/train_cleaned.csv")
print(df_train.columns)
df_train.head()

Index(['Type', 'Age', 'Gender', 'VideoAmt', 'PetID', 'PhotoAmt',
       'AdoptionSpeed', 'Name_phrase_name', 'Name_no_name', 'Name_baby',
       ...
       'State_41345', 'State_41361', 'State_41367', 'State_41401',
       'State_41415', 'fee_per_pet', 'has_fee', 'more_than_1', 'RescuerCount',
       'description_wordcount'],
      dtype='object', length=1152)


Unnamed: 0,Type,Age,Gender,VideoAmt,PetID,PhotoAmt,AdoptionSpeed,Name_phrase_name,Name_no_name,Name_baby,...,State_41345,State_41361,State_41367,State_41401,State_41415,fee_per_pet,has_fee,more_than_1,RescuerCount,description_wordcount
0,1,36,1,0,3f8824a3b,1.0,4,0,0,0,...,0,0,0,0,0,0.0,0,0,17,41
1,2,3,0,0,9238eb7fc,1.0,2,0,0,0,...,0,0,0,0,0,0.0,0,0,4,36
2,2,7,0,0,f0a1f2b90,2.0,4,0,0,0,...,0,0,0,0,0,0.0,0,1,2,96
3,1,3,0,0,7d028bdea,4.0,2,0,0,0,...,0,0,0,0,0,0.0,0,0,9,68
4,2,1,0,0,8377bfe97,0.0,2,0,0,0,...,0,0,0,1,0,0.0,0,0,1,67


In [3]:
# Read the cleaned test set, print its column index, and preview the first rows.
df_test = pd.read_csv("../../Data/Processed/test_cleaned.csv")
print(df_test.columns)
df_test.head()

Index(['Type', 'Age', 'Gender', 'VideoAmt', 'PetID', 'PhotoAmt',
       'Name_phrase_name', 'Name_no_name', 'Name_baby', 'Name_lucky',
       ...
       'State_41345', 'State_41361', 'State_41367', 'State_41401',
       'State_41415', 'fee_per_pet', 'has_fee', 'more_than_1', 'RescuerCount',
       'description_wordcount'],
      dtype='object', length=1151)


Unnamed: 0,Type,Age,Gender,VideoAmt,PetID,PhotoAmt,Name_phrase_name,Name_no_name,Name_baby,Name_lucky,...,State_41345,State_41361,State_41367,State_41401,State_41415,fee_per_pet,has_fee,more_than_1,RescuerCount,description_wordcount
0,2,1,0,0,f42161740,10.0,0,0,0,0,...,0,0,0,0,0,0.0,0,0,6,26
1,1,6,1,0,0118db3a8,2.0,0,0,0,0,...,0,0,0,0,0,0.0,0,0,24,15
2,1,2,1,0,e5164d828,2.0,0,0,0,0,...,0,0,0,1,0,0.0,0,0,231,23
3,1,10,0,0,5335bfb38,0.0,0,0,0,0,...,0,0,0,0,0,0.0,0,0,1,34
4,1,12,0,0,ff2cf88a0,2.0,0,0,0,0,...,0,0,0,0,0,0.0,0,0,2,35


## Model Testing Functions

In [4]:
def model_training(model, X_train, y_train, X_test):
    """Fit ``model`` on the training split and predict the test split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train : training features, passed to ``fit`` unchanged.
    y_train : training targets; must expose ``.values`` (e.g. a pandas
        DataFrame or Series). The values are flattened to 1-D before fitting.
    X_test : features to predict on.

    Returns
    -------
    Whatever ``model.predict(X_test)`` returns.
    """
    flat_targets = y_train.values.ravel()
    model.fit(X_train, flat_targets)
    return model.predict(X_test)

def model_evaluation(title, model, y_test, y_pred, feature_names=None, average='micro'):
    """Print evaluation metrics for a fitted model and return (precision, recall).

    Prints the title, the confusion matrix, precision and recall, and, when the
    model exposes ``feature_importances_`` or ``coef_``, the ten highest-scoring
    features.

    Parameters
    ----------
    title : label printed above the metrics.
    model : fitted estimator; only inspected for ``feature_importances_`` /
        ``coef_``.
    y_test : true labels.
    y_pred : predicted labels.
    feature_names : optional sequence of feature names, in the column order the
        model was fitted on. Defaults to the notebook-level ``X_columns``.
    average : averaging mode forwarded to ``precision_score`` and
        ``recall_score``. The default ``'micro'`` keeps the original behaviour;
        note that for single-label multiclass targets micro-averaged precision
        and recall are the same number (both equal accuracy).

    Returns
    -------
    tuple
        ``(precision, recall)``.
    """
    precision = precision_score(y_test, y_pred, average=average)
    recall = recall_score(y_test, y_pred, average=average)

    print(title)
    print(confusion_matrix(y_test, y_pred))
    print('Precision', precision)
    print('Recall', recall)

    # One score per feature, or None when the model exposes neither attribute.
    scores = None
    if hasattr(model, 'feature_importances_'):
        scores = np.asarray(model.feature_importances_)
    elif hasattr(model, 'coef_'):
        coef = np.asarray(model.coef_)
        # A 2-D coef_ has one row per class, so indexing it by feature position
        # is wrong. Collapse the class axis to a single magnitude per feature.
        scores = coef if coef.ndim == 1 else np.abs(coef).mean(axis=0)

    if scores is not None:
        names = list(X_columns if feature_names is None else feature_names)
        print('Feature Importance')
        if len(names) != len(scores):
            # Pairing mismatched lengths would silently mislabel the scores.
            print('Skipped: {0} feature names for {1} scores'.format(len(names), len(scores)))
        else:
            ranking = pd.DataFrame(list(zip(names, scores)))
            print(ranking.sort_values(by=1, ascending=False).head(10))
    print('')
    return (precision, recall)

def run_experiment(title, model, X_train, y_train, X_test, y_test):
    """Train ``model``, evaluate it on the held-out split, and return its scores.

    Delegates fitting and prediction to ``model_training`` and metric
    printing to ``model_evaluation``.

    Returns
    -------
    tuple
        ``(precision, recall)`` as produced by ``model_evaluation``.
    """
    predictions = model_training(model, X_train, y_train, X_test)
    return model_evaluation(title, model, y_test, predictions)


## Model Definition

In [5]:
# Candidate classifiers as (display name, estimator) pairs.
# Every estimator that accepts a seed gets the same random_state, so repeated
# runs score the same fitted models.
models = [
    ('Naive Bayes', GaussianNB()),
    ('RandomForestClassifier10', RandomForestClassifier(n_estimators=10, random_state=15)),
    ('RandomForestClassifier100', RandomForestClassifier(n_estimators=100, random_state=15)),
    ('KNeighborsClassifier', KNeighborsClassifier(n_neighbors=16)),
    ('DecisionTreeClassifier', DecisionTreeClassifier(random_state=15)),
]

## Execution Function

In [6]:
def executetion(df_data, list_X_column, list_y_column, random_state=15, threshold=0.8):
    """Split the data, then train and evaluate every entry of ``models``.

    Parameters
    ----------
    df_data : DataFrame holding both the feature and the target columns.
    list_X_column : list of feature column names.
    list_y_column : list of target column names.
    random_state : seed forwarded to ``train_test_split`` so the split, and
        therefore the reported scores, are reproducible across runs. Pass
        ``None`` for a different random split on every call.
    threshold : fraction of rows used for training; the remainder is held out.

    Returns
    -------
    list
        One ``[model name, precision, recall]`` entry per model, in the order
        of the notebook-level ``models`` list.
    """
    X = df_data[list_X_column]
    y = df_data[list_y_column]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=1.0 - threshold, shuffle=True, random_state=random_state
    )

    print('X_train', X_train.shape)
    print('y_train', y_train.shape)
    print('X_test', X_test.shape)
    print('y_test', y_test.shape)

    results = []
    for name, model in models:
        precision, recall = run_experiment(name, model, X_train, y_train, X_test, y_test)
        results.append([name, precision, recall])

    return results


## Define Columns to Use

In [7]:
# Target column, and the columns that are excluded from the feature set by name.
y_columns = ["AdoptionSpeed"]
X_column_exclude = ["PetID", "Probability","fee_per_pet"]

# Additionally exclude every test-set column whose name contains one of these
# substrings. A column is appended once per matching substring.
excluded_markers = ("Name_", "Age_", "Breed_", "Color_")
X_column_exclude.extend(
    column
    for column in df_test.columns
    for marker in excluded_markers
    if marker in column
)

# Features are the training columns that are neither excluded nor the target.
X_columns = [
    x
    for x in df_train.columns
    if x not in X_column_exclude and x not in y_columns
]
print(X_columns)

['Type', 'Age', 'Gender', 'VideoAmt', 'PhotoAmt', 'has_a_name', 'MaturitySize_1', 'MaturitySize_2', 'MaturitySize_3', 'MaturitySize_4', 'FurLength_1', 'FurLength_2', 'FurLength_3', 'Vaccinated_1', 'Vaccinated_2', 'Vaccinated_3', 'Dewormed_1', 'Dewormed_2', 'Dewormed_3', 'Sterilized_1', 'Sterilized_2', 'Sterilized_3', 'Health_1', 'Health_2', 'Health_3', 'State_41324', 'State_41325', 'State_41326', 'State_41327', 'State_41330', 'State_41332', 'State_41335', 'State_41336', 'State_41342', 'State_41345', 'State_41361', 'State_41367', 'State_41401', 'State_41415', 'has_fee', 'more_than_1', 'RescuerCount', 'description_wordcount']


## Run Model

In [8]:
# Train and score every candidate model on a split of the training data.
results = executetion(
    df_data=df_train,
    list_X_column=X_columns,
    list_y_column=y_columns,
)

X_train (8000, 43)
y_train (8000, 1)
X_test (2000, 43)
y_test (2000, 1)
Naive Bayes
[[  5  37   0   4   5]
 [ 49 298   0  29  11]
 [ 51 389   4  78  21]
 [ 38 273   2 100  19]
 [ 72 383   2  79  51]]
Precision 0.229
Recall 0.229

RandomForestClassifier10
[[  2  19  10   6  14]
 [  6 131 128  58  64]
 [  9 141 198  96  99]
 [  2 100 128 115  87]
 [  4  92 107  78 306]]
Precision 0.376
Recall 0.376
Feature Importance
                        0         1
42  description_wordcount  0.188356
41           RescuerCount  0.133021
4                PhotoAmt  0.131462
1                     Age  0.118193
2                  Gender  0.042212
10            FurLength_1  0.023415
11            FurLength_2  0.023337
39                has_fee  0.022957
27            State_41326  0.022754
0                    Type  0.022536

RandomForestClassifier100
[[  2  15  11   5  18]
 [  3 122 120  45  97]
 [  1 118 194 101 129]
 [  1  70 120 124 117]
 [  1  65  75  50 396]]
Precision 0.419
Recall 0.419
Feature Impor

## Model Performance

In [9]:
# sort the results and print as a table
sort_column = "precision"

df_results = pd.DataFrame(results)
df_results.columns = ['model', 'precision', 'recall']
df_results = df_results.sort_values(by=sort_column)
df_results

Unnamed: 0,model,precision,recall
0,Naive Bayes,0.229,0.229
4,DecisionTreeClassifier,0.35,0.35
3,KNeighborsClassifier,0.365,0.365
1,RandomForestClassifier10,0.376,0.376
2,RandomForestClassifier100,0.419,0.419


## Predict on Test Data

In [10]:
# idxmax() returns the index label of the best row; df_results kept its original
# labels when it was sorted, so that label is the model's position in `models`.
model_index = df_results[sort_column].idxmax()
best_name, best_model = models[model_index]
print("Using Model {0}".format(best_name))

# Predict with the already-fitted estimator on the same feature columns.
X_test = df_test[X_columns]
y_pred = best_model.predict(X_test)

Using Model RandomForestClassifier100


## Prepare Data for Submission

In [11]:
# Build the submission frame as a new object instead of adding a column to
# df_test, so df_test keeps exactly the columns it was loaded with.
df_save = df_test[['PetID']].assign(AdoptionSpeed=y_pred)
# Preview only the first rows rather than displaying the whole frame.
df_save.head(10)

Unnamed: 0,PetID,AdoptionSpeed
0,f42161740,2
1,0118db3a8,4
2,e5164d828,2
3,5335bfb38,4
4,ff2cf88a0,4
5,1d13441b9,3
6,7d835cf7c,2
7,577d15fea,2
8,91736f444,4
9,db194aec8,2


## Save Data

In [12]:
# Write the submission file; index=False keeps the row index out of the CSV.
df_save.to_csv("../../Data/Modeling/submission_05.csv", index=False)