In [794]:
import pandas as pd
import numpy as np
import re

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_curve,auc
from sklearn import metrics
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn import cross_validation

In [795]:
#%install_ext https://raw.githubusercontent.com/rasbt/watermark/master/watermark.py
%load_ext watermark
%watermark -g -m -v -p pandas,scikit-learn

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
CPython 2.7.10
IPython 3.1.0

pandas 0.16.2
scikit-learn 0.16.0

compiler   : GCC 4.4.7 20120313 (Red Hat 4.4.7-1)
system     : Linux
release    : 3.16.0-67-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 8
interpreter: 64bit
Git hash   : 948874e37b6f8bebff81ce56569be23254686efd


In [796]:
df = pd.read_csv("train.csv")
df.shape

(26729, 10)

In [797]:
format = '%Y-%m-%d %H:%M:%S'
df['DateTime'] = df['DateTime'].apply(lambda x: pd.to_datetime(x, format=format))
df['Hour'] = df['DateTime'].apply(lambda x: x.hour).astype(np.int16)
df['Day_of_week'] = df['DateTime'].apply(lambda x: x.dayofweek).astype(np.int16)

# Age

In [798]:
obj = re.compile("(?P<num>\d+)\s*(?P<word>\w+)")
def convert_age(string):
    match = obj.search(string)
    if match:
        return match.groups()
    else:
        None, None

In [799]:
%%bash
cat train.csv | cut -d, -f 8 | cut -d' ' -f 2 | sort | uniq -c

     18 
      1 AgeuponOutcome
     66 day
    332 days
   1281 month
   8339 months
    146 week
   1704 weeks
   3969 year
  10874 years


In [800]:
convert_dict = {None:0, 
                'day':1, 
                'days':1, 
                'month':30, 
                'months':30, 
                'year':365, 
                'years':365, 
                'week':7, 
                'weeks':7
               }

In [801]:
df['AgeuponOutcome_inDays'] = np.zeros((df.shape[0], 1), dtype=np.int16)

for i in range(df.shape[0]):
    try:
        string = df.loc[i, 'AgeuponOutcome']
        num, word = convert_age(string)
        num_days = int(num) * convert_dict[word]
        df.loc[i, 'AgeuponOutcome_inDays'] = num_days
    
    except:
        continue

In [802]:
df.head(n=3)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Hour,Day_of_week,AgeuponOutcome_inDays
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,18,2,365
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,12,6,365
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,12,5,730


# Name

In [803]:
df['has_name_bool'] = df['Name'].apply(lambda x: 1 if x is not None else 0)
df['has_name_bool'] = df['has_name_bool'].astype(np.int16)

# Color

In [804]:
color = df.groupby('Color').count()['AnimalID'].sort(inplace=False)

In [805]:
color.sample(n=10).head()

Color
Cream/Brown          2
Blue Tick           18
Seal Point/White    19
Fawn/Black          19
White/White         12
Name: AnimalID, dtype: int64

In [806]:
obj = re.compile('(\w+)\s*(\w+)*/*(\w+)*')
def break_up_colors(string):
    match = obj.search(string)
    if match:
        colors = match.groups()
        return colors
    else:
        return None

In [807]:
colors = []
for c in color.index:
    colors +=break_up_colors(c)

colors = np.unique(colors)
print (colors)

[None 'Agouti' 'Apricot' 'Black' 'Blue' 'Brindle' 'Brown' 'Buff' 'Calico'
 'Chocolate' 'Cream' 'Fawn' 'Flame' 'Gold' 'Gray' 'Lilac' 'Liver' 'Lynx'
 'Merle' 'Orange' 'Pink' 'Point' 'Red' 'Ruddy' 'Sable' 'Seal' 'Silver'
 'Smoke' 'Tabby' 'Tan' 'Tick' 'Tiger' 'Torbie' 'Tortie' 'Tricolor' 'White'
 'Yellow']


In [808]:
df_colors = pd.DataFrame(columns=colors, data=np.zeros((df.shape[0], colors.shape[0]), np.int16))

In [809]:
for i in range(df.shape[0]):
    color_string = df.loc[i, 'Color']
    color_string_tuple = break_up_colors(color_string)
    df_colors.loc[i, color_string_tuple] = 1

# Breed

In [939]:
breed = df.groupby('Breed').count()['AnimalID']

breeds = []
for b in breed.index:
    breeds +=break_up_colors(b)

breeds = np.unique(breeds)
print breeds[0::4]

[None 'Airedale' 'Anatol' 'Australian' 'Bay' 'Bedlington' 'Bichon'
 'Bluetick' 'Border' 'Boykin' 'Bruss' 'Burmese' 'Cane' 'Cattle' 'Chin'
 'Coated' 'Coonhound' 'Cur' 'Dane' 'Dog' 'Dutch' 'Eskimo' 'Finnish'
 'Foxhound' 'Giant' 'Greater' 'Hairless' 'Heeler' 'Hovawart' 'Irish'
 'Japanese' 'Kelpie' 'Landseer' 'Lowchen' 'Maltese' 'Medium' 'Mountain'
 'Newfoundland' 'Nova' 'Otterhound' 'Pbgv' 'Persian' 'Pinscher' 'Podengo'
 'Poodle' 'Pyrenees' 'Redbone' 'Ridgeback' 'Russian' 'Schnauzer' 'Setter'
 'Shetland' 'Siamese' 'Smooth' 'Spaniel' 'Spitz' 'Standard' 'Tennesse'
 'Tibetan' 'Turkish' 'Van' 'Weimaraner' 'Wire' 'Yorkshire']


In [940]:
df_breed = pd.DataFrame(columns=breeds, data=np.zeros((df.shape[0], breeds.shape[0]), np.int8))

In [941]:
for i in range(df.shape[0]):
    breed_string = df.loc[i, 'Breed']
    breed_string_tuple = break_up_colors(breed_string)
    df_breed.loc[i, breed_string_tuple] = 1

In [944]:
df_breed.drop(['Mix', 'Unknown', 'Dog'], axis=1, inplace=True)

# Breed Outcomesubtype

In [971]:
grp = df.groupby(['OutcomeSubtype'])
breed_outcome = grp.count()['AnimalID']
ind = breed_outcome >= 200
print breed_outcome[ind]

OutcomeSubtype
Aggressive     320
Foster        1800
Partner       7816
SCRP          1599
Suffering     1002
Name: AnimalID, dtype: int64


In [958]:
df_sub = pd.get_dummies(df['OutcomeSubtype'])
df_sub = df_sub[['Aggressive', 'Foster', 'Partner', 'SCRP', 'Suffering']]
df_sub['Breed'] = df['Breed']

count = df_sub.groupby("Breed").sum()
print count.sample(n=3)

                                 Aggressive  Foster  Partner  SCRP  Suffering
Breed                                                                        
Maine Coon Mix                            0      10        7     4          0
Great Dane/Pit Bull                       0       0        0     0          0
English Springer Spaniel/Beagle           0       0        0     0          0


In [961]:
zeros = np.zeros((len(breeds)), dtype=np.int16)
breed_sub_df = pd.DataFrame({'Breed': breeds,
                             'Aggressive': zeros,
                             'Foster': zeros,
                             'Partner': zeros,
                             'SCRP': zeros,
                             'Suffering': zeros,
                            })

for b in count.index:
    tuple_ = break_up_colors(b)
    for i in range(breed_sub_df.shape[0]):
        if breed_sub_df.loc[i, 'Breed'] in tuple_:
            breed_sub_df.loc[i, 'Aggressive'] = 1 
            breed_sub_df.loc[i, 'Foster'] = 1 
            breed_sub_df.loc[i, 'Partner'] = 1 
            breed_sub_df.loc[i, 'SCRP'] = 1 
            breed_sub_df.loc[i, 'Suffering'] = 1 

In [980]:
breed_sub_df = breed_sub_df.ix[1:, :] # drop the 'None' row
breed_sub_df.reset_index(inplace=True)
breed_sub_df.sample(n=5)

Unnamed: 0,Aggressive,Breed,Foster,Partner,SCRP,Suffering
45,1,Cairn,1,1,1,1
153,1,Ocicat,1,1,1,1
192,1,Russian,1,1,1,1
97,1,Glen,1,1,1,1
77,1,Dogo,1,1,1,1


In [979]:
zeros = np.zeros(df.shape[0], dtype=np.int16)

df['Aggressive'] = zeros
df['Foster'] = zeros
df['Partner'] = zeros
df['SCRP']= zeros
df['Suffering'] = zeros
                       
for i in range(df.shape[0]):
    string = df.loc[i, 'Breed']
    tuple_ = break_up_colors(string)
    for j in range(1, breed_sub_df.shape[0]):
        word = breed_sub_df.loc[j, 'Breed']
        if word in tuple_ and not np.all(df.loc[i, ['Aggressive', 'Foster', 'Partner', 'SCRP', 'Suffering']]):
            df.loc[i, 'Aggressive'] += breed_sub_df.loc[j, 'Aggressive']
            df.loc[i, 'Foster'] += breed_sub_df.loc[j, 'Foster']
            df.loc[i, 'Partner'] += breed_sub_df.loc[j, 'Partner']
            df.loc[i, 'SCRP'] += breed_sub_df.loc[j, 'SCRP']
            df.loc[i, 'Suffering'] += breed_sub_df.loc[j, 'Suffering']

## Mix

In [983]:
obj = re.compile('(?i)mix')
df['mix'] = df['Breed'].apply(lambda x: 1 if obj.search(x) else 0)

## Put it all together

In [984]:
df_final = pd.get_dummies(df[['OutcomeType', 'AnimalType', 'SexuponOutcome']])
df_final = pd.concat([df_final, 
                      df_colors, 
                      df_breed, 
                      df['AgeuponOutcome_inDays'],
                      df['has_name_bool'], 
                      df['Hour'], 
                      df['Day_of_week'],
                      df['mix'],
                      df['Aggressive'],
                      df['Foster'],
                      df['Partner'],
                      df['SCRP'],
                      df['Suffering']
                     ], axis=1)

In [985]:
indices = np.where(df_final.columns == None)[0]
print indices
df_final = pd.concat([df_final.ix[:, 0:12], df_final.ix[:, 13:49],df_final.ix[:, 50:]], axis=1)
print df_final.shape

[12 49]
(26729, 307)


In [986]:
print np.where(df_final.columns == None)[0]

for col in df_final:
    df_final[col] = df_final[col].astype(np.int16)

[]


In [987]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26729 entries, 0 to 26728
Columns: 307 entries, OutcomeType_Adoption to Suffering
dtypes: int16(307)
memory usage: 15.9 MB


In [988]:
print df_colors.shape
print df_breed.shape
print df_final.shape

(26729, 37)
(26729, 250)
(26729, 307)


In [989]:
Y = df_final[['OutcomeType_Adoption', 
              'OutcomeType_Died', 
              'OutcomeType_Euthanasia', 
              'OutcomeType_Return_to_owner', 
              'OutcomeType_Transfer']]

X = df_final.drop(Y.columns, axis=1)

In [990]:
X.shape

(26729, 302)

In [991]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, Y, train_size=0.3, random_state=0)

In [992]:
clf = RandomForestClassifier(random_state=0, 
                             n_jobs=4, 
                             n_estimators=df_final.shape[1]*2, 
                             max_features='sqrt',
                             min_samples_split=1
                            )
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=1,
            min_weight_fraction_leaf=0.0, n_estimators=614, n_jobs=4,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [993]:
y_pred = clf.predict(X_test)
error = metrics.classification_report(y_test, y_pred)
print error

             precision    recall  f1-score   support

          0       0.72      0.69      0.71      7617
          1       0.36      0.03      0.06       125
          2       0.21      0.02      0.03      1090
          3       0.46      0.20      0.28      3291
          4       0.77      0.61      0.68      6588

avg / total       0.66      0.53      0.58     18711



In [994]:
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]

for f in range(df_final.columns.shape[0]):
    print("(%d)'%s' - %f" % (f + 1, df_final.columns[indices[f]], 
          importances[indices[f]]))

(1)'Wire' - 0.207449
(2)'Wirehaired' - 0.151398
(3)'Wolfhound' - 0.106470
(4)'AnimalType_Cat' - 0.038842
(5)'OutcomeType_Euthanasia' - 0.034575
(6)'OutcomeType_Transfer' - 0.033893
(7)'OutcomeType_Return_to_owner' - 0.033314
(8)'Tick' - 0.026938
(9)'SexuponOutcome_Neutered Male' - 0.020544
(10)'Agouti' - 0.017814
(11)'Yorkshire' - 0.015282
(12)'AnimalType_Dog' - 0.014183
(13)'Sable' - 0.012298
(14)'Sheepdog' - 0.011520
(15)'OutcomeType_Adoption' - 0.009975
(16)'SexuponOutcome_Spayed Female' - 0.009688
(17)'OutcomeType_Died' - 0.008847
(18)'De' - 0.008583
(19)'Ruddy' - 0.006673
(20)'Lynx' - 0.006162
(21)'SexuponOutcome_Unknown' - 0.006005
(22)'Carolina' - 0.005690
(23)'Persian' - 0.005644
(24)'Javanese' - 0.005461
(25)'Cornish' - 0.005313
(26)'Boykin' - 0.005299
(27)'Tan' - 0.004956
(28)'Pyrenees' - 0.004716
(29)'Blue' - 0.004694
(30)'Labrador' - 0.004669
(31)'Swedish' - 0.004448
(32)'American' - 0.004231
(33)'Scottish' - 0.004162
(34)'Chocolate' - 0.004084
(35)'Forest' - 0.003971
(36)'

IndexError: index 302 is out of bounds for axis 0 with size 302

In [762]:
cv = cross_validation.KFold(X_test.shape[0], 10, shuffle=True, random_state=0)
cross_validation.cross_val_score(clf, X_test, y_test, cv=cv, n_jobs=7)

array([ 0.84081197,  0.8364511 ,  0.84500267,  0.84339925,  0.818279  ,
        0.82469268,  0.8444682 ,  0.84607162,  0.8343132 ,  0.82576162])

In [300]:
#params = {'min_samples_split': np.arange(1, 15)}
#gs = Grid(clf, cv=cv, n_jobs=7)
#gs.fit(X, Y)
#best_parameters, score, _ = max(gs.grid_scores_, key=lambda x: x[1])
#for param_name in sorted(params.keys()):
#    print("%s: %r" % (param_name, best_parameters[param_name]))

# Test Data Set

In [763]:
df_new = pd.read_csv("test.csv")
df_new.shape

(11456, 8)

In [764]:
format = '%Y-%m-%d %H:%M:%S'
df_new['DateTime'] = df_new['DateTime'].apply(lambda x: pd.to_datetime(x, format=format)) 
df_new['Hour'] = df_new['DateTime'].apply(lambda x: x.hour).astype(np.int16)
df_new['Day_of_week'] = df_new['DateTime'].apply(lambda x: x.dayofweek).astype(np.int16)

In [765]:
df_new['AgeuponOutcome_inDays'] = np.zeros((df_new.shape[0], 1), dtype=np.int16)
for i in range(df_new.shape[0]):
    try:
        string = df_new.loc[i, 'AgeuponOutcome']
        num, word = convert_age(string)
        num_days = int(num) * convert_dict[word]
        df_new.loc[i, 'AgeuponOutcome_inDays'] = num_days
    
    except:
        continue

In [766]:
df_new['has_name_bool'] = df_new['Name'].apply(lambda x: 1 if x is not None else 0)
df_new['has_name_bool'] = df_new['has_name_bool'].astype(np.int16)

In [767]:
df_colors = pd.DataFrame(columns=colors, data=np.zeros((df_new.shape[0], colors.shape[0]), np.int16))
for i in range(df_new.shape[0]):
    color_string = df_new.loc[i, 'Color']
    color_string_tuple = break_up_colors(color_string)
    df_colors.loc[i, color_string_tuple] = 1

In [768]:
df_breed = pd.DataFrame(columns=breeds, data=np.zeros((df_new.shape[0], breeds.shape[0]), np.int8))
for i in range(df_new.shape[0]):
    breed_string = df_new.loc[i, 'Breed']
    breed_string_tuple = break_up_colors(breed_string)
    
    try: 
        df_breed.loc[i, breed_string_tuple] = 1
    except:   
        continue 

In [769]:
obj = re.compile('(?i)mix')
df_new['mix'] = df_new['Breed'].apply(lambda x: 1 if obj.search(x) else 0)

In [770]:
print df_colors.shape
print df_breed.shape

(11456, 37)
(11456, 253)


In [588]:
df_new_final = pd.get_dummies(df_new[['AnimalType', 'SexuponOutcome']])
df_new_final = pd.concat([df_new_final, 
                      df_colors, 
                      df_breed, 
                      df_new['AgeuponOutcome_inDays'],
                      df_new['has_name_bool'], 
                      df_new['Hour'], 
                      df_new['Day_of_week'],
                      df_new['mix']    
                     ], axis=1)

In [589]:
df_new_final.shape

(11456, 301)

In [590]:
indices = np.where(df_new_final.columns == None)[0]
print indices
df_new_final = pd.concat([df_new_final.ix[:, 0:7], df_new_final.ix[:, 8:44], df_new_final.ix[:, 45:]], axis=1)
print df_new_final.shape

[ 7 44]
(11456, 299)


In [591]:
print np.where(df_new_final.columns == None)[0]
for col in df_final:
    df_final[col] = df_final[col].astype(np.int16)

[]


In [592]:
x = set(X.columns)
y = set(df_new_final.columns)
x-y

set()

In [593]:
clf = RandomForestClassifier(random_state=0, 
                             n_jobs=4, 
                             n_estimators=df_final.shape[1]*2, 
                             max_features='sqrt',
                             min_samples_split=1
                            )
clf.fit(X, Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=1,
            min_weight_fraction_leaf=0.0, n_estimators=608, n_jobs=4,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [594]:
final_pred = clf.predict(df_new_final)