In [112]:
import os
import subprocess

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_graphviz 


from sklearn import tree
from pandas import read_table, DataFrame
from os import system

In [113]:
# Load the training data; 'train.csv' is resolved relative to the current working directory.
df = pd.read_csv('train.csv')

In [114]:
# Preview the first and last rows, then list the column names.
print("* df.head()\n{}\n".format(df.head()))
print("* df.tail()\n{}\n".format(df.tail()))
list(df)

* df.head()
  AnimalID     Name          DateTime      OutcomeType OutcomeSubtype  \
0  A671945  Hambone   2/12/2014 18:22  Return_to_owner            NaN   
1  A656520    Emily  10/13/2013 12:44       Euthanasia      Suffering   
2  A686464   Pearce   1/31/2015 12:28         Adoption         Foster   
3  A683430      NaN   7/11/2014 19:09         Transfer        Partner   
4  A667013      NaN  11/15/2013 12:52         Transfer        Partner   

  AnimalType SexuponOutcome AgeuponOutcome                        Breed  \
0        Dog  Neutered Male         1 year        Shetland Sheepdog Mix   
1        Cat  Spayed Female         1 year       Domestic Shorthair Mix   
2        Dog  Neutered Male        2 years                 Pit Bull Mix   
3        Cat    Intact Male        3 weeks       Domestic Shorthair Mix   
4        Dog  Neutered Male        2 years  Lhasa Apso/Miniature Poodle   

         Color  
0  Brown/White  
1  Cream Tabby  
2   Blue/White  
3   Blue Cream  
4          Ta

['AnimalID',
 'Name',
 'DateTime',
 'OutcomeType',
 'OutcomeSubtype',
 'AnimalType',
 'SexuponOutcome',
 'AgeuponOutcome',
 'Breed',
 'Color']

In [115]:
# Goal: predict the outcome (adoption, return to owner, euthanasia, transfer, died) given the features SexuponOutcome, AgeuponOutcome, Breed and Color.
# We can use pandas to show the outcome types:
print("* outcome types:", df["OutcomeType"].unique())

* outcome types: ['Return_to_owner' 'Euthanasia' 'Adoption' 'Transfer' 'Died']


In [47]:
def encode_target(df, target_column):
    """Add an integer-coded ``Target`` column for a categorical column.

    Args
    ----
    df -- pandas DataFrame containing the target column (left unmodified).
    target_column -- name of the column to encode.

    Returns
    -------
    (df_mod, targets) -- a copy of ``df`` with an extra ``Target`` column, and
    the array of unique values of ``target_column`` in order of first
    appearance. ``targets[i]`` is the label encoded as integer ``i``.
    """
    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    # Use ``map`` rather than ``replace``: every value is a key of the dict, so
    # the result is a clean integer column without relying on ``replace``'s
    # implicit object -> int downcasting.
    df_mod["Target"] = df_mod[target_column].map(map_to_int)

    return (df_mod, targets)

In [57]:
def get_sex(x):
    """Return 'male', 'female' or 'unknown' from a SexuponOutcome-style value.

    The value is converted with ``str`` first, so non-string input (e.g. NaN)
    falls through to 'unknown'. Matching is case-sensitive.
    """
    label = str(x)
    for marker, sex in (('Male', 'male'), ('Female', 'female')):
        if marker in label:
            return sex
    return 'unknown'
def get_neutered(x):
    """Return 'neutered', 'intact' or 'unknown' from a SexuponOutcome-style value.

    Both 'Spayed' and 'Neutered' map to 'neutered'. The value is converted with
    ``str`` first and matching is case-sensitive.
    """
    label = str(x)
    if 'Spayed' in label or 'Neutered' in label:
        return 'neutered'
    return 'intact' if 'Intact' in label else 'unknown'

In [59]:
# Split the combined SexuponOutcome text into two separate categorical columns.
df['Sex'] = df['SexuponOutcome'].apply(get_sex)
df['Neutered'] = df['SexuponOutcome'].apply(get_neutered)

In [60]:
def get_mix(x):
    """Return 'mix' when the breed description contains 'Mix', else 'not'.

    Based on the CSV data: a breed such as "American Pit Bull Terrier Mix" is
    assumed not to be purebred. Matching is case-sensitive and the value is
    converted with ``str`` first.
    """
    return 'mix' if 'Mix' in str(x) else 'not'

In [66]:
def calc_age_in_years(x):
    """Convert an age description such as "3 weeks" into a number of years.

    The text must start with an integer count followed by a unit containing
    'year', 'month', 'week' or 'day' (plural forms match as well). A year is
    taken as 12 months, 52 weeks or 365 days.

    Returns
    -------
    The age in years as a float, or 0 when the value is missing (NaN), does
    not start with an integer, or has no recognised unit.
    """
    text = str(x)
    if text == 'nan':
        return 0
    try:
        age = int(text.split()[0])
    except (IndexError, ValueError):
        # Empty/whitespace-only text or a non-numeric first token: treat it
        # like any other unparseable value instead of raising.
        return 0
    # Units are checked in the same order as before: year, month, week, day.
    for unit, per_year in (('year', 1.), ('month', 12.),
                           ('week', 52.), ('day', 365.)):
        if unit in text:
            return age / per_year
    return 0

In [67]:
# Numeric age in years, parsed from the free-text AgeuponOutcome column.
df['AgeInYears'] = df['AgeuponOutcome'].apply(calc_age_in_years)

In [68]:
def calc_age_category(x):
    """Bucket an age in years into 'young', 'young adult', 'adult' or 'old'.

    Upper bounds are exclusive: below 3 is 'young', below 5 is 'young adult',
    below 10 is 'adult', and everything else is 'old'.
    """
    for upper_bound, label in ((3, 'young'), (5, 'young adult'), (10, 'adult')):
        if x < upper_bound:
            return label
    return 'old'

In [69]:
# Derive the mixed-breed flag and the age bucket used as model features.
df['Mix'] = df['Breed'].apply(get_mix)
df['AgeCategory'] = df['AgeInYears'].apply(calc_age_category)

In [70]:
# Column names after adding the derived Sex, Neutered, Mix, AgeInYears and AgeCategory columns.
list(df)

['AnimalID',
 'Name',
 'DateTime',
 'OutcomeType',
 'OutcomeSubtype',
 'AnimalType',
 'SexuponOutcome',
 'AgeuponOutcome',
 'Breed',
 'Color',
 'Sex',
 'Neutered',
 'Mix',
 'AgeInYears',
 'AgeCategory']

In [78]:
# One-hot encode each categorical column into its own frame.
# NOTE(review): neutered_dummies is built from the raw 'SexuponOutcome' column,
# not from the derived 'Neutered' column -- confirm that this is intended.
sex_dummies, age_group_dummies, mix_dummies, neutered_dummies = (
    pd.get_dummies(df[column])
    for column in ('Sex', 'AgeCategory', 'Mix', 'SexuponOutcome')
)

In [83]:
# Place the four one-hot frames side by side (rows are aligned on the index).
df3 = pd.concat(
    [sex_dummies, age_group_dummies, mix_dummies, neutered_dummies],
    axis="columns",
)

In [84]:
# Preview the combined one-hot feature frame.
df3.head()

Unnamed: 0,female,male,unknown,adult,old,young,young adult,mix,not,Intact Female,Intact Male,Neutered Male,Spayed Female,Unknown
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [72]:
# Column names of df again; the dummy-variable cells build separate frames and do not add columns to df.
list(df)

['AnimalID',
 'Name',
 'DateTime',
 'OutcomeType',
 'OutcomeSubtype',
 'AnimalType',
 'SexuponOutcome',
 'AgeuponOutcome',
 'Breed',
 'Color',
 'Sex',
 'Neutered',
 'Mix',
 'AgeInYears',
 'AgeCategory']

In [85]:
# Integer-encode OutcomeType into a new 'Target' column; `targets` holds the class labels in code order.
df2, targets = encode_target(df, "OutcomeType")

In [86]:
# Check each integer code against its original label for the first rows.
df2[["Target", "OutcomeType"]].head()

Unnamed: 0,Target,OutcomeType
0,0,Return_to_owner
1,1,Euthanasia
2,2,Adoption
3,3,Transfer
4,3,Transfer


In [87]:
# Preview the encoded frame, including the derived columns and 'Target'.
df2.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Sex,Neutered,Mix,AgeInYears,AgeCategory,Target
0,A671945,Hambone,2/12/2014 18:22,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,male,neutered,mix,1.0,young,0
1,A656520,Emily,10/13/2013 12:44,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,female,neutered,mix,1.0,young,1
2,A686464,Pearce,1/31/2015 12:28,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,male,neutered,mix,2.0,young,2
3,A683430,,7/11/2014 19:09,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream,male,intact,mix,0.057692,young,3
4,A667013,,11/15/2013 12:52,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan,male,neutered,not,2.0,young,3


In [52]:
# Distinct integer codes present in the encoded target.
df2["Target"].unique()

array([0, 1, 2, 3, 4], dtype=int64)

In [19]:
#return to owner = 0, euthanasia =1, adoption =2, transfer =3, died = 4

In [90]:
# Attach the one-hot columns (df3) to the encoded frame (df2); join aligns rows on the index.
df_all = df2.join(df3)

In [91]:
# Column names of the joined frame: original and derived columns, 'Target', then the one-hot columns.
list(df_all)

['AnimalID',
 'Name',
 'DateTime',
 'OutcomeType',
 'OutcomeSubtype',
 'AnimalType',
 'SexuponOutcome',
 'AgeuponOutcome',
 'Breed',
 'Color',
 'Sex',
 'Neutered',
 'Mix',
 'AgeInYears',
 'AgeCategory',
 'Target',
 'female',
 'male',
 'unknown',
 'adult',
 'old',
 'young',
 'young adult',
 'mix',
 'not',
 'Intact Female',
 'Intact Male',
 'Neutered Male',
 'Spayed Female',
 'Unknown']

In [97]:
# Model features are exactly the one-hot columns. Taking the names from df3
# avoids hard-coding positional offsets into df_all, which would silently pick
# the wrong columns if any column were added or removed upstream.
features = list(df3.columns)
features

['female',
 'male',
 'unknown',
 'adult',
 'old',
 'young',
 'young adult',
 'mix',
 'not',
 'Intact Female',
 'Intact Male',
 'Neutered Male',
 'Spayed Female',
 'Unknown']

In [98]:
# Fit a decision tree on the one-hot features; the whole frame is used for
# training (no held-out split is made in this cell).
X = df_all[features]
y = df_all["Target"]
dt = DecisionTreeClassifier(min_samples_split=20, random_state=99).fit(X, y)
dt

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=20, min_weight_fraction_leaf=0.0,
            presort=False, random_state=99, splitter='best')

In [116]:
def visualize_tree(tree, feature_names):
    """Create tree png using graphviz.

    Writes the tree in Graphviz format to ``dt.dot`` in the current working
    directory, then runs the ``dot`` command-line tool to render ``dt.png``.

    Args
    ----
    tree -- scikit-learn DecisionTree.
    feature_names -- list of feature names.

    Raises
    ------
    SystemExit -- if ``dot`` cannot be started or exits with a non-zero status.
    """
    with open("dt.dot", 'w') as f:
        export_graphviz(tree, out_file=f,
                        feature_names=feature_names)

    command = ["dot", "-Tpng", "dt.dot", "-o", "dt.png"]
    try:
        subprocess.check_call(command)
    except (OSError, subprocess.CalledProcessError) as err:
        # Catch only launch failures and non-zero exits, so KeyboardInterrupt
        # and unrelated errors still propagate. SystemExit is raised explicitly
        # because the ``exit`` helper is injected by ``site`` and is replaced
        # by IPython inside notebooks.
        raise SystemExit("Could not run dot, ie graphviz, to "
                         "produce visualization") from err

In [117]:
# Render the fitted tree to dt.dot / dt.png via graphviz.
# NOTE(review): visualize_tree only writes files, so the matplotlib magic does not appear to be needed for this call.
%matplotlib inline
visualize_tree(dt, features)

In [118]:
# Display the fitted classifier and its parameters.
dt

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=20, min_weight_fraction_leaf=0.0,
            presort=False, random_state=99, splitter='best')

In [110]:
# Print the fitted tree's feature importances, largest first.
print(
    DataFrame(dt.feature_importances_, index=X.columns, columns=["Imp"])
    .sort_values(by="Imp", ascending=False)
)

                    Imp
Intact Male    0.306957
Intact Female  0.257944
young          0.210222
Unknown        0.200231
old            0.006327
young adult    0.006193
mix            0.003844
female         0.003783
not            0.002366
adult          0.001271
male           0.000576
Spayed Female  0.000194
Neutered Male  0.000092
unknown        0.000000


In [103]:
def get_code(tree, feature_names, target_names,
             spacer_base="    "):
    """Print pseudo-code for a fitted decision tree.

    Walks the tree from the root and prints nested ``if ( feature <= threshold )
    { ... } else { ... }`` blocks, with one ``return <class> ( <n> examples )``
    line per non-zero class entry at each leaf.

    Args
    ----
    tree -- scikit-learn DecisionTree (must expose ``tree_``).
    feature_names -- list of feature names.
    target_names -- list of target (class) names.
    spacer_base -- used for spacing code (default: "    ").

    Notes
    -----
    based on http://stackoverflow.com/a/30104792.

    NOTE(review): the leaf counts are ``int(tree_.value[...])``; newer
    scikit-learn releases may store class fractions rather than counts in
    ``tree_.value`` -- confirm against the installed version.
    """
    tree_ = tree.tree_
    left = tree_.children_left
    right = tree_.children_right
    threshold = tree_.threshold
    feature = tree_.feature
    value = tree_.value

    def recurse(node, depth):
        spacer = spacer_base * depth
        # A node is a leaf when it has no children (child id -1). Testing the
        # children instead of ``threshold != -2`` keeps a genuine split at
        # exactly -2.0 from being mistaken for a leaf.
        if left[node] != -1:
            # Look the name up only for split nodes: leaves carry a negative
            # placeholder feature index that must not be used as a list index.
            name = feature_names[feature[node]]
            print(spacer + "if ( " + name + " <= " +
                  str(threshold[node]) + " ) {")
            recurse(left[node], depth + 1)
            print(spacer + "}\n" + spacer + "else {")
            recurse(right[node], depth + 1)
            print(spacer + "}")
        else:
            target = value[node]
            nonzero = np.nonzero(target)
            # nonzero[1] holds the class (column) index of each non-zero entry.
            for i, v in zip(nonzero[1], target[nonzero]):
                target_name = target_names[i]
                target_count = int(v)
                print(spacer + "return " + str(target_name) +
                      " ( " + str(target_count) + " examples )")

    recurse(0, 0)

In [111]:
#get_code(dt, features, targets)
#this runs but doesn't look good/isn't very readable