# Submission 1
by Wilsion Lee

## Problem definition

Finding the adoption speed of pets

 ## Import Libraries

In [1]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn :: utils
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# sklearn :: models
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

## Load Data

In [2]:
df_train = pd.read_csv("../../Data/Processed/train_cleaned.csv")
print(df_train.columns)
df_train.head()

Index(['Age', 'VideoAmt', 'PetID', 'PhotoAmt', 'AdoptionSpeed', 'Type_1',
       'Type_2', 'Probability', 'a', 'a g',
       ...
       'State_41332', 'State_41335', 'State_41336', 'State_41342',
       'State_41345', 'State_41361', 'State_41367', 'State_41401',
       'State_41415', 'fee_per_pet'],
      dtype='object', length=4494)


Unnamed: 0,Age,VideoAmt,PetID,PhotoAmt,AdoptionSpeed,Type_1,Type_2,Probability,a,a g,...,State_41332,State_41335,State_41336,State_41342,State_41345,State_41361,State_41367,State_41401,State_41415,fee_per_pet
0,36,0,3f8824a3b,1.0,4,1,0,16.541052,0,0,...,0,0,0,0,0,0,0,0,0,0.0
1,3,0,9238eb7fc,1.0,2,0,1,0.133396,0,0,...,0,0,0,0,0,0,0,0,0,0.0
2,7,0,f0a1f2b90,2.0,4,0,1,16.541052,0,0,...,0,0,0,0,0,0,0,0,0,0.0
3,3,0,7d028bdea,4.0,2,1,0,9.36437,0,0,...,0,0,0,0,0,0,0,0,0,0.0
4,1,0,8377bfe97,0.0,2,0,1,0.00667,0,0,...,0,0,0,0,0,0,0,1,0,0.0


In [3]:
df_test = pd.read_csv("../../Data/Processed/test_cleaned.csv")
print(df_test.columns)
df_test.head()

Index(['Age', 'VideoAmt', 'PetID', 'PhotoAmt', 'Type_1', 'Type_2',
       'Probability', ' ', 'a', 'a ki',
       ...
       'State_41332', 'State_41335', 'State_41336', 'State_41342',
       'State_41345', 'State_41361', 'State_41367', 'State_41401',
       'State_41415', 'fee_per_pet'],
      dtype='object', length=2666)


Unnamed: 0,Age,VideoAmt,PetID,PhotoAmt,Type_1,Type_2,Probability,Unnamed: 8,a,a ki,...,State_41332,State_41335,State_41336,State_41342,State_41345,State_41361,State_41367,State_41401,State_41415,fee_per_pet
0,1,0,f42161740,10.0,0,1,0.00667,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
1,6,0,0118db3a8,2.0,1,0,16.541052,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
2,2,0,e5164d828,2.0,1,0,0.053358,0,0,0,...,0,0,0,0,0,0,0,1,0,0.0
3,10,0,5335bfb38,0.0,1,0,0.00667,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
4,12,0,ff2cf88a0,2.0,1,0,0.01334,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0


## Model Testing function

In [4]:
def model_training(model, X_train, y_train, X_test):
    model.fit(X_train, y_train.values.ravel())
    y_pred = model.predict(X_test)
    return(y_pred)

def model_evaluation(title, model, y_test, y_pred):
    # precision = precision_score(y_test, y_pred)
    # recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='micro')
    recall = recall_score(y_test, y_pred, average='micro')
    
    print(title)
    print(confusion_matrix(y_test, y_pred))
    print('Precision', precision)
    print('Recall', recall)
    
    importance = []
    if hasattr(model, 'feature_importances_'):
        print('Feature Importance')
        importance = []
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.feature_importances_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head(10))
    elif hasattr(model, 'coef_'):
        print('Feature Importance')
        for i in range(len(X_columns)):
            importance.append([X_columns[i], model.coef_[i]])
        print(pd.DataFrame(importance).sort_values(by=1, ascending=False).head(10))
    print('')
    return (precision, recall)

def run_experiment(title, model, X_train, y_train, X_test, y_test):
    y_pred = model_training(model, X_train, y_train, X_test)
    precision, recall = model_evaluation(title, model, y_test, y_pred)
    return(precision, recall)


## Model Definition

In [5]:
models = [
    ('Naive Bayes', GaussianNB()),
    ('RandomForestClassifier10', RandomForestClassifier(n_estimators=10, random_state=15)),
    ('RandomForestClassifier100', RandomForestClassifier(n_estimators=100, random_state=15)),
    ('KNeighborsClassifier', KNeighborsClassifier()),
    ('DecisionTreeClassifier', DecisionTreeClassifier())  
]

## Execution Function

In [6]:
def executetion(df_data, list_X_column, list_y_column):
    threshold = 0.8
    X = df_data[list_X_column]
    
    y = df_data[list_y_column]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1.0-threshold, shuffle=True)

    print('X_train', X_train.shape)
    print('y_train', y_train.shape)
    print('X_test', X_test.shape)
    print('y_test', y_test.shape)

    results = []
    for m in models:
        precision, recall = run_experiment(m[0], m[1], X_train, y_train, X_test, y_test)
        results.append([m[0], precision, recall])
    
    return results


## Define Columns to Use

In [7]:
y_columns = ["AdoptionSpeed"]
X_column_exclude = ["PetID"]
X_columns = [ x for x in df_train.columns if (x not in X_column_exclude) & (x not in y_columns)]
print(X_columns)

['Age', 'VideoAmt', 'PhotoAmt', 'Type_1', 'Type_2', 'Probability', 'a', 'a g', 'aaron ayumi', 'ab', 'ababa', 'abam', 'abandon kitten', 'abandoned kittens', 'abandoned kitty', 'abandoned newborn', 'abandoned pregnant', 'abandoned puppies', 'abang', 'abang junior', 'abba', 'abberboy', 'abbie', 'abbot', 'abby', 'abby mojo', 'abcd', 'abcd kittens', 'abcde', 'abdul', 'abe', 'abe amell', 'abg long', 'abon', 'aboo', 'abott', 'aboy', 'abree', 'abu', 'abu apu', 'abu atan', 'abu nawas', 'acang ulit', 'ace', 'aci', 'ackie', 'activator', 'acu', 'adam eve', 'addie', 'addison', 'adelina', 'adelle', 'adik', 'admes', 'adoption drive', 'adoption urgent', 'adorable beagle', 'adorable kitten', 'adorable kittens', 'adorable melody', 'adorable puppies', 'adorable puppy', 'adorable pups', 'adpoted', 'adrian', 'adult cat', 'aemy', 'aerin', 'afu', 'agent button', 'aggie', 'agnes', 'agos', 'ah bee', 'ah boo', 'ah boy', 'ah girl', 'ah gray', 'ah kao', 'ah ki', 'ah man', 'ah mao', 'ah miao', 'ah pong', 'ah pui',

## Run Model

In [8]:
results = executetion(df_train, X_columns, y_columns)

X_train (8000, 4492)
y_train (8000, 1)
X_test (2000, 4492)
y_test (2000, 1)


AttributeError: 'DataFrame' object has no attribute 'ravel'

## Model performance

In [None]:
# sort the results and print as a table
sort_column = "precision"

df_results = pd.DataFrame(results)
df_results.columns = ['model', 'precision', 'recall']
df_results = df_results.sort_values(by=sort_column)
df_results

## Fit Test Data

In [None]:
print("Using Model {0}".format(models[df_results[sort_column].idxmax()][0]))
X_test = df_test[X_columns]
y_pred = models[2][1].predict(X_test)

## Prepare Data for Submission

In [None]:
df_test['AdoptionSpeed'] = y_pred
df_save = df_test[['PetID', 'AdoptionSpeed']]
df_save

## Save Data

In [None]:
df_save.to_csv("../../Data/Modeling/submission_01.csv", index=False)