# Submission 1
by Wilsion Lee

## Problem definition

Finding the adoption speed of pets

 ## Import Libraries

In [1]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# sklearn :: models
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

## Load Data

In [2]:
df_train = pd.read_csv("../../Data/Processed/train_cleaned.csv")
print(df_train.columns)
df_train.head()

Index(['Age', 'VideoAmt', 'PetID', 'PhotoAmt', 'AdoptionSpeed', 'Type_1',
       'Type_2', 'Name_phrase_name', 'Name_no_name', 'Name_baby',
       ...
       'State_41332', 'State_41335', 'State_41336', 'State_41342',
       'State_41345', 'State_41361', 'State_41367', 'State_41401',
       'State_41415', 'fee_per_pet'],
      dtype='object', length=1124)


Unnamed: 0,Age,VideoAmt,PetID,PhotoAmt,AdoptionSpeed,Type_1,Type_2,Name_phrase_name,Name_no_name,Name_baby,...,State_41332,State_41335,State_41336,State_41342,State_41345,State_41361,State_41367,State_41401,State_41415,fee_per_pet
0,36,0,3f8824a3b,1.0,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
1,3,0,9238eb7fc,1.0,2,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
2,3,0,7d028bdea,4.0,2,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
3,1,0,8377bfe97,0.0,2,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0.0
4,3,0,965b31ba7,2.0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0


In [3]:
df_test = pd.read_csv("../../Data/Processed/test_cleaned.csv")
print(df_test.columns)
df_test.head()

Index(['Age', 'VideoAmt', 'PetID', 'PhotoAmt', 'Type_1', 'Type_2',
       'Name_phrase_name', 'Name_no_name', 'Name_baby', 'Name_puppy',
       ...
       'State_41332', 'State_41335', 'State_41336', 'State_41342',
       'State_41345', 'State_41361', 'State_41367', 'State_41401',
       'State_41415', 'fee_per_pet'],
      dtype='object', length=1123)


Unnamed: 0,Age,VideoAmt,PetID,PhotoAmt,Type_1,Type_2,Name_phrase_name,Name_no_name,Name_baby,Name_puppy,...,State_41332,State_41335,State_41336,State_41342,State_41345,State_41361,State_41367,State_41401,State_41415,fee_per_pet
0,1,0,f42161740,10.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
1,6,0,0118db3a8,2.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
2,2,0,e5164d828,2.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0.0
3,10,0,5335bfb38,0.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
4,12,0,ff2cf88a0,2.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0


## Model Testing function

In [4]:
def model_training(model, X_train, y_train, X_test):
    model.fit(X_train, y_train.values.ravel())
    y_pred = model.predict(X_test)
    return(y_pred)

def model_evaluation(title, model, y_test, y_pred):
    # precision = precision_score(y_test, y_pred)
    # recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    
    print(title)
    print(confusion_matrix(y_test, y_pred))
    print('Precision', precision)
    print('Recall', recall)
    
    importance = []
    if hasattr(model, 'feature_importances_'):
        print('Feature Importance')
        importance = []
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.feature_importances_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head(10))
    elif hasattr(model, 'coef_'):
        print('Feature Importance')
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.coef_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head(10))
    print('')
    return (precision, recall)

def run_experiment(title, model, X_train, y_train, X_test, y_test):
    y_pred = model_training(model, X_train, y_train, X_test)
    precision, recall = model_evaluation(title, model, y_test, y_pred)
    return(precision, recall)


## Model Definition

In [5]:
models = [
    ('Naive Bayes', GaussianNB()),
    ('RandomForestClassifier10', RandomForestClassifier(n_estimators=10, random_state=15)),
    ('RandomForestClassifier100', RandomForestClassifier(n_estimators=100, random_state=15)),
    ('KNeighborsClassifier', KNeighborsClassifier()),
    ('DecisionTreeClassifier', DecisionTreeClassifier())  
]

## Execution Function

In [6]:
def executetion(df_data, list_X_column, list_y_column):
    threshold = 0.8
    X = df_data[list_X_column]
    
    y = df_data[list_y_column]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1.0-threshold, shuffle=True)

    print('X_train', X_train.shape)
    print('y_train', y_train.shape)
    print('X_test', X_test.shape)
    print('y_test', y_test.shape)

    results = []
    for m in models:
        precision, recall = run_experiment(m[0], m[1], X_train, y_train, X_test, y_test)
        results.append([m[0], precision, recall])
    
    return results


## Define Columns to Use

In [7]:
y_columns = ["AdoptionSpeed"]
X_column_exclude = ["PetID"]

X_columns = [ x for x in df_train.columns if (x not in X_column_exclude) & (x not in y_columns)]
print(X_columns)

['Age', 'VideoAmt', 'PhotoAmt', 'Type_1', 'Type_2', 'Name_phrase_name', 'Name_no_name', 'Name_baby', 'Name_puppy', 'Name_lucky', 'Name_kitty', 'Name_mimi', 'Name_puppies', 'Name_kittens', 'Name_brownie', 'Name_blackie', 'Name_kitten', 'Name_coco', 'Name_max', 'Name_oreo', 'Name_milo', 'Name_tiger', 'Name_angel', 'Name_kiki', 'Name_lucy', 'Name_snowy', 'Name_ginger', 'Name_bobby', 'Name_tom', 'Name_lily', 'Name_happy', 'Name_daisy', 'Name_molly', 'Name_fluffy', 'Name_simba', 'Name_prince', 'Name_momo', 'Name_bella', 'Name_rocky', 'Name_lola', 'Name_panda', 'Name_luna', 'Name_buddy', 'Name_princess', 'Name_lulu', 'Name_unknown', 'Name_bobo', 'Name_snow', 'Name_tommy', 'Name_jojo', 'Name_teddy', 'Name_cookie', 'Name_cutie', 'Name_peanut', 'Name_mickey', 'Name_leo', 'Name_toby', 'Name_doggie r', 'Name_shadow', 'Name_b', 'Name_bailey', 'Name_blacky', 'Name_boy', 'Name_a', 'Name_coffee', 'Name_minnie', 'Name_oyen', 'Name_kitten ra', 'Name_f', 'Name_putih', 'Name_pepper', 'Name_milky', 'Name_

## Run Model

In [8]:
results = executetion(df_train, X_columns, y_columns)

X_train (6190, 1122)
y_train (6190, 1)
X_test (1548, 1122)
y_test (1548, 1)
Naive Bayes
[[ 11  28   2   1   4]
 [ 85 210  21   4  19]
 [ 84 258  32   9  33]
 [ 54 201  23  23  35]
 [ 95 178  31  11  96]]
Precision 0.24031007751937986
Recall 0.24031007751937986

RandomForestClassifier10
[[  3  16  12   6   9]
 [  9  97 111  61  61]
 [  7  89 142  83  95]
 [  5  60 109 102  60]
 [  2  70  96  81 162]]
Precision 0.3268733850129199
Recall 0.3268733850129199
Feature Importance
                0         1
0             Age  0.199526
888   Probability  0.193579
2        PhotoAmt  0.182280
1121  fee_per_pet  0.057748
1092  FurLength_1  0.021722
1119  State_41401  0.021522
1093  FurLength_2  0.021339
1109  State_41326  0.019630
1078     Gender_2  0.018723
4          Type_2  0.018450

RandomForestClassifier100
[[  3  11  12   8  12]
 [  5  93 111  52  78]
 [  5  81 140  85 105]
 [  3  49 100  99  85]
 [  1  44  79  70 217]]
Precision 0.35658914728682173
Recall 0.35658914728682173
Feature Importa

## Model performance

In [9]:
# sort the results and print as a table
sort_column = "precision"

df_results = pd.DataFrame(results)
df_results.columns = ['model', 'precision', 'recall']
df_results = df_results.sort_values(by=sort_column)
df_results

Unnamed: 0,model,precision,recall
0,Naive Bayes,0.24031,0.24031
4,DecisionTreeClassifier,0.306848,0.306848
1,RandomForestClassifier10,0.326873,0.326873
3,KNeighborsClassifier,0.343023,0.343023
2,RandomForestClassifier100,0.356589,0.356589


## Fit Test Data

In [10]:
model_index = df_results[sort_column].idxmax()
print("Using Model {0}".format(models[model_index][0]))
X_test = df_test[X_columns]
y_pred = models[model_index][1].predict(X_test)

Using Model RandomForestClassifier100


## Prepare Data for Submission

In [11]:
df_test['AdoptionSpeed'] = y_pred
df_save = df_test[['PetID', 'AdoptionSpeed']]
df_save

Unnamed: 0,PetID,AdoptionSpeed
0,f42161740,2
1,0118db3a8,4
2,e5164d828,2
3,5335bfb38,3
4,ff2cf88a0,4
5,1d13441b9,3
6,7d835cf7c,3
7,577d15fea,2
8,91736f444,1
9,db194aec8,1


## Save Data

In [12]:
df_save.to_csv("../../Data/Modeling/submission_01.csv", index=False)