In [540]:
import pandas as pd
import numpy as np
import re

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve,auc
from sklearn import metrics
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn import cross_validation

In [541]:
#%install_ext https://raw.githubusercontent.com/rasbt/watermark/master/watermark.py
%load_ext watermark
%watermark -g -m -v -p pandas,scikit-learn

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
CPython 2.7.10
IPython 3.1.0

pandas 0.16.2
scikit-learn 0.16.0

compiler   : GCC 4.4.7 20120313 (Red Hat 4.4.7-1)
system     : Linux
release    : 3.16.0-67-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 8
interpreter: 64bit
Git hash   : c00d664fabd1c1ebed87c7b392c327a0f0eab982


In [542]:
df = pd.read_csv("train.csv")
df.shape

(26729, 10)

In [543]:
format = '%Y-%m-%d %H:%M:%S'
df['DateTime'] = df['DateTime'].apply(lambda x: pd.to_datetime(x, format=format))
df['Hour'] = df['DateTime'].apply(lambda x: x.hour).astype(np.int16)
df['Day_of_week'] = df['DateTime'].apply(lambda x: x.dayofweek).astype(np.int16)

In [544]:
obj = re.compile("(?P<num>\d+)\s*(?P<word>\w+)")
def convert_age(string):
    match = obj.search(string)
    if match:
        return match.groups()
    else:
        None, None

In [545]:
%%bash
cat train.csv | cut -d, -f 8 | cut -d' ' -f 2 | sort | uniq -c

     18 
      1 AgeuponOutcome
     66 day
    332 days
   1281 month
   8339 months
    146 week
   1704 weeks
   3969 year
  10874 years


In [546]:
convert_dict = {None:0, 
                'day':1, 
                'days':1, 
                'month':30, 
                'months':30, 
                'year':365, 
                'years':365, 
                'week':7, 
                'weeks':7
               }

In [547]:
df['AgeuponOutcome_inDays'] = np.zeros((df.shape[0], 1), dtype=np.int16)

for i in range(df.shape[0]):
    try:
        string = df.loc[i, 'AgeuponOutcome']
        num, word = convert_age(string)
        num_days = int(num) * convert_dict[word]
        df.loc[i, 'AgeuponOutcome_inDays'] = num_days
    
    except:
        continue

In [548]:
df.head(n=3)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Hour,Day_of_week,AgeuponOutcome_inDays
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,18,2,365
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,12,6,365
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,12,5,730


In [549]:
df['has_name_bool'] = df['Name'].apply(lambda x: 1 if x is not None else 0)
df['has_name_bool'] = df['has_name_bool'].astype(np.int16)

In [550]:
color = df.groupby('Color').count()['AnimalID'].sort(inplace=False)

In [551]:
color.sample(n=10).head()

Color
Tortie/Brown     1
Tan/Apricot      1
Gold/Tan         1
Black/Red       17
Red/Tricolor     2
Name: AnimalID, dtype: int64

In [552]:
obj = re.compile('(\w+)\s*(\w+)*/*(\w+)*')
def break_up_colors(string):
    match = obj.search(string)
    if match:
        colors = match.groups()
        return colors
    else:
        return None

In [553]:
colors = []
for c in color.index:
    colors +=break_up_colors(c)

colors = np.unique(colors)
print (colors)

[None 'Agouti' 'Apricot' 'Black' 'Blue' 'Brindle' 'Brown' 'Buff' 'Calico'
 'Chocolate' 'Cream' 'Fawn' 'Flame' 'Gold' 'Gray' 'Lilac' 'Liver' 'Lynx'
 'Merle' 'Orange' 'Pink' 'Point' 'Red' 'Ruddy' 'Sable' 'Seal' 'Silver'
 'Smoke' 'Tabby' 'Tan' 'Tick' 'Tiger' 'Torbie' 'Tortie' 'Tricolor' 'White'
 'Yellow']


In [554]:
df_colors = pd.DataFrame(columns=colors, data=np.zeros((df.shape[0], colors.shape[0]), np.int16))

In [555]:
for i in range(df.shape[0]):
    color_string = df.loc[i, 'Color']
    color_string_tuple = break_up_colors(color_string)
    df_colors.loc[i, color_string_tuple] = 1

In [556]:
breed = df.groupby('Breed').count()['AnimalID']

breeds = []
for b in breed.index:
    breeds +=break_up_colors(b)

breeds = np.unique(breeds)

In [557]:
df_breed = pd.DataFrame(columns=breeds, data=np.zeros((df.shape[0], breeds.shape[0]), np.int8))

In [558]:
for i in range(df.shape[0]):
    breed_string = df.loc[i, 'Breed']
    breed_string_tuple = break_up_colors(breed_string)
    df_breed.loc[i, breed_string_tuple] = 1

In [566]:
df_final = pd.get_dummies(df[['OutcomeType', 'AnimalType', 'SexuponOutcome']])
df_final = pd.concat([df_final, 
                      df_colors, 
                      df_breed, 
                      df['AgeuponOutcome_inDays'],
                      df['has_name_bool'], 
                      df['Hour'], 
                      df['Day_of_week']
                     ], axis=1)

Name, OutcomeType, OutcomeSubtype, AnimalType, SexuponOutcom, Breed, (colors dummies), has_name_bool
hour, Day_of_week, AgeuponOutcome_inDays, breeds

In [567]:
indices = np.where(df_final.columns == None)[0]
print indices
df_final = pd.concat([df_final.ix[:, 0:12], df_final.ix[:, 13:49],df_final.ix[:, 50:]], axis=1)
print df_final.shape

[12 49]
(26729, 304)


In [568]:
print np.where(df_final.columns == None)[0]

for col in df_final:
    df_final[col] = df_final[col].astype(np.int16)

[]


In [569]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26729 entries, 0 to 26728
Columns: 304 entries, OutcomeType_Adoption to Day_of_week
dtypes: int16(304)
memory usage: 15.7 MB


In [570]:
print df_colors.shape
print df_breed.shape
print df_final.shape

(26729, 37)
(26729, 253)
(26729, 304)


In [571]:
Y = df_final[['OutcomeType_Adoption', 
              'OutcomeType_Died', 
              'OutcomeType_Euthanasia', 
              'OutcomeType_Return_to_owner', 
              'OutcomeType_Transfer']]

X = df_final.drop(Y.columns, axis=1)

In [572]:
X.shape

(26729, 299)

In [573]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, Y, train_size=0.3, random_state=0)

In [574]:
clf = RandomForestClassifier(random_state=0, 
                             n_jobs=4, 
                             n_estimators=df_final.shape[1]*2, 
                             max_features='sqrt',
                             min_samples_split=1
                            )
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=1,
            min_weight_fraction_leaf=0.0, n_estimators=608, n_jobs=4,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [575]:
y_pred = clf.predict(X_test)
error = metrics.classification_report(y_test, y_pred)
print error

             precision    recall  f1-score   support

          0       0.72      0.69      0.71      7617
          1       0.33      0.03      0.06       125
          2       0.23      0.02      0.04      1090
          3       0.45      0.21      0.28      3291
          4       0.76      0.60      0.67      6588

avg / total       0.66      0.53      0.58     18711



In [480]:
cv = cross_validation.KFold(X_test.shape[0], 10, shuffle=True, random_state=0)
cross_validation.cross_val_score(clf, X_test, y_test, cv=cv, n_jobs=7)

array([ 0.86111111,  0.85301978,  0.85676109,  0.85515767,  0.83110636,
        0.84019241,  0.86531267,  0.85355425,  0.85355425,  0.8423303 ])

In [300]:
#params = {'min_samples_split': np.arange(1, 15)}
#gs = Grid(clf, cv=cv, n_jobs=7)
#gs.fit(X, Y)
#best_parameters, score, _ = max(gs.grid_scores_, key=lambda x: x[1])
#for param_name in sorted(params.keys()):
#    print("%s: %r" % (param_name, best_parameters[param_name]))

# Test Data Set

In [581]:
df_new = pd.read_csv("test.csv")
df_new.shape

(11456, 8)

In [582]:
format = '%Y-%m-%d %H:%M:%S'
df_new['DateTime'] = df_new['DateTime'].apply(lambda x: pd.to_datetime(x, format=format)) 
df_new['Hour'] = df_new['DateTime'].apply(lambda x: x.hour).astype(np.int16)
df_new['Day_of_week'] = df_new['DateTime'].apply(lambda x: x.dayofweek).astype(np.int16)

In [583]:
df_new['AgeuponOutcome_inDays'] = np.zeros((df_new.shape[0], 1), dtype=np.int16)
for i in range(df_new.shape[0]):
    try:
        string = df_new.loc[i, 'AgeuponOutcome']
        num, word = convert_age(string)
        num_days = int(num) * convert_dict[word]
        df_new.loc[i, 'AgeuponOutcome_inDays'] = num_days
    
    except:
        continue

In [584]:
df_new['has_name_bool'] = df_new['Name'].apply(lambda x: 1 if x is not None else 0)
df_new['has_name_bool'] = df_new['has_name_bool'].astype(np.int16)

In [585]:
df_colors = pd.DataFrame(columns=colors, data=np.zeros((df_new.shape[0], colors.shape[0]), np.int16))
for i in range(df_new.shape[0]):
    color_string = df_new.loc[i, 'Color']
    color_string_tuple = break_up_colors(color_string)
    df_colors.loc[i, color_string_tuple] = 1

In [586]:
df_breed = pd.DataFrame(columns=breeds, data=np.zeros((df_new.shape[0], breeds.shape[0]), np.int8))
for i in range(df_new.shape[0]):
    breed_string = df_new.loc[i, 'Breed']
    breed_string_tuple = break_up_colors(breed_string)
    
    try: 
        df_breed.loc[i, breed_string_tuple] = 1
    except:   
        continue 

In [587]:
print df_colors.shape
print df_breed.shape

(11456, 37)
(11456, 253)


In [588]:
df_new_final = pd.get_dummies(df_new[['AnimalType', 'SexuponOutcome']])
df_new_final = pd.concat([df_new_final, 
                      df_colors, 
                      df_breed, 
                      df_new['AgeuponOutcome_inDays'],
                      df_new['has_name_bool'], 
                      df_new['Hour'], 
                      df_new['Day_of_week']
                     ], axis=1)

In [589]:
df_new_final.shape

(11456, 301)

In [590]:
indices = np.where(df_new_final.columns == None)[0]
print indices
df_new_final = pd.concat([df_new_final.ix[:, 0:7], df_new_final.ix[:, 8:44], df_new_final.ix[:, 45:]], axis=1)
print df_new_final.shape

[ 7 44]
(11456, 299)


In [591]:
print np.where(df_new_final.columns == None)[0]
for col in df_final:
    df_final[col] = df_final[col].astype(np.int16)

[]


In [592]:
x = set(X.columns)
y = set(df_new_final.columns)
x-y

set()

In [593]:
clf = RandomForestClassifier(random_state=0, 
                             n_jobs=4, 
                             n_estimators=df_final.shape[1]*2, 
                             max_features='sqrt',
                             min_samples_split=1
                            )
clf.fit(X, Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='sqrt', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=1,
            min_weight_fraction_leaf=0.0, n_estimators=608, n_jobs=4,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [594]:
final_pred = clf.predict(df_new_final)