[Shelter Animal Outcomes](https://www.kaggle.com/c/shelter-animal-outcomes)

In [1]:
import pandas as pd
import numpy as np
import datetime

In [2]:
animals_test = pd.read_csv('data/test.csv')

In [3]:
animals_test.head(2)

Unnamed: 0,ID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,1,Summer,2015-10-12 12:15:00,Dog,Intact Female,10 months,Labrador Retriever Mix,Red/White
1,2,Cheyenne,2014-07-26 17:59:00,Dog,Spayed Female,2 years,German Shepherd/Siberian Husky,Black/Tan


In [4]:
animals_test.shape

(11456, 8)

In [5]:
animals = pd.read_csv('data/train.csv')

In [6]:
animals.head(2)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby


In [7]:
animals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26729 entries, 0 to 26728
Data columns (total 10 columns):
AnimalID          26729 non-null object
Name              19038 non-null object
DateTime          26729 non-null object
OutcomeType       26729 non-null object
OutcomeSubtype    13117 non-null object
AnimalType        26729 non-null object
SexuponOutcome    26728 non-null object
AgeuponOutcome    26711 non-null object
Breed             26729 non-null object
Color             26729 non-null object
dtypes: object(10)
memory usage: 2.0+ MB


TO DO:

### Clean up
- missing names = unknown
- datetime: days of week, months (seasonality)
- impute missing values from outcomesubtype via logistic regression (or decision tree)
    - would you actually model a logreg 1st to see if var is actually significant (but with smaller sample size)
- impute age? age => categorical?
- outcome = > class imbalance in predict - stratified?
- kfolds? 
- test/train split?
- break down breeds - size? larger sub-categories (ie hounds, etc)
- color... ?

- stats things... plot different distributions... MLE? binomial, etc

In [8]:
animals.columns

Index(['AnimalID', 'Name', 'DateTime', 'OutcomeType', 'OutcomeSubtype',
       'AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'Breed', 'Color'],
      dtype='object')

In [9]:
# rename columns
animals.columns = [x.lower() for x in animals.columns]

In [10]:
animals.columns

Index(['animalid', 'name', 'datetime', 'outcometype', 'outcomesubtype',
       'animaltype', 'sexuponoutcome', 'ageuponoutcome', 'breed', 'color'],
      dtype='object')

In [11]:
animals.outcomesubtype.value_counts(dropna=False)

NaN                    13612
Partner                 7816
Foster                  1800
SCRP                    1599
Suffering               1002
Aggressive               320
Offsite                  165
In Kennel                114
Behavior                  86
Rabies Risk               74
Medical                   66
In Foster                 52
Enroute                    8
Court/Investigation        6
At Vet                     4
In Surgery                 3
Barn                       2
Name: outcomesubtype, dtype: int64

In [12]:
animals = animals.rename(index=str, 
               columns={
                   "animalid": "id",
                   "datetime": "date",
                   "outcometype": "outcome",
                   "outcomesubtype": "outcome_info",
                   "animaltype": "animal",
                   "sexuponoutcome": "sex",
                   "ageuponoutcome": "age"})

In [13]:
animals.columns

Index(['id', 'name', 'date', 'outcome', 'outcome_info', 'animal', 'sex', 'age',
       'breed', 'color'],
      dtype='object')

In [14]:
animals.name.value_counts(dropna=False).head()

NaN        7691
Max         136
Bella       135
Charlie     107
Daisy       106
Name: name, dtype: int64

In [15]:
animals.name.replace(np.nan, 'unknown', inplace=True)

In [16]:
animals.name.value_counts(dropna=False).head()

unknown    7691
Max         136
Bella       135
Charlie     107
Daisy       106
Name: name, dtype: int64

In [17]:
type(animals.date[0])

str

In [18]:
animals['datetime'] = pd.to_datetime(animals['date'], infer_datetime_format=True)

In [19]:
animals.head(2)

Unnamed: 0,id,name,date,outcome,outcome_info,animal,sex,age,breed,color,datetime
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,2014-02-12 18:22:00
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,2013-10-13 12:44:00


In [20]:
type(animals.datetime[0])

pandas.tslib.Timestamp

In [21]:
animals.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26729 entries, 0 to 26728
Data columns (total 11 columns):
id              26729 non-null object
name            26729 non-null object
date            26729 non-null object
outcome         26729 non-null object
outcome_info    13117 non-null object
animal          26729 non-null object
sex             26728 non-null object
age             26711 non-null object
breed           26729 non-null object
color           26729 non-null object
datetime        26729 non-null datetime64[ns]
dtypes: datetime64[ns](1), object(10)
memory usage: 2.4+ MB


In [22]:
# Monday is 0, Sunday is 6
animals['day'] = animals['datetime'].map(lambda row: row.weekday())

In [23]:
animals.day.value_counts()

5    4348
6    4317
1    3896
0    3696
4    3586
2    3510
3    3376
Name: day, dtype: int64

In [24]:
animals['month'] = animals['datetime'].map(lambda row: row.month)

In [25]:
animals.month.value_counts()

10    2881
12    2691
11    2668
7     2506
1     2334
6     2319
8     2172
5     2094
9     2004
2     1873
4     1689
3     1498
Name: month, dtype: int64

In [26]:
animals['year'] = animals['datetime'].map(lambda row: row.year)

In [27]:
animals.year.value_counts()

2015    11481
2014    11179
2013     2702
2016     1367
Name: year, dtype: int64

In [28]:
animals.columns

Index(['id', 'name', 'date', 'outcome', 'outcome_info', 'animal', 'sex', 'age',
       'breed', 'color', 'datetime', 'day', 'month', 'year'],
      dtype='object')

In [29]:
animals.datetime.describe()

count                   26729
unique                  22918
top       2015-08-11 00:00:00
freq                       19
first     2013-10-01 09:31:00
last      2016-02-21 19:17:00
Name: datetime, dtype: object

In [30]:
animals.datetime.value_counts().head(50)

2015-08-11 00:00:00    19
2015-11-17 00:00:00    17
2015-07-02 00:00:00    13
2015-04-02 00:00:00    11
2014-08-31 09:00:00    10
2015-11-23 00:00:00     9
2014-08-26 09:00:00     9
2016-02-15 00:00:00     9
2015-09-11 00:00:00     9
2015-11-08 00:00:00     8
2015-07-26 09:00:00     8
2014-08-17 09:00:00     8
2015-08-03 00:00:00     8
2015-06-19 00:00:00     8
2015-09-01 00:00:00     8
2014-07-16 09:00:00     8
2014-09-06 09:00:00     8
2014-06-21 09:00:00     8
2014-04-26 15:31:00     7
2014-10-20 09:00:00     7
2016-02-07 00:00:00     7
2014-12-27 11:09:00     7
2014-08-16 09:00:00     7
2015-05-31 00:00:00     7
2015-07-03 09:00:00     7
2015-04-20 14:08:00     7
2015-08-02 09:00:00     7
2014-05-03 16:15:00     7
2015-04-25 09:00:00     7
2015-10-19 09:00:00     7
2015-11-05 00:00:00     7
2014-11-05 09:00:00     7
2015-11-30 00:00:00     7
2015-07-30 09:00:00     6
2014-03-13 14:58:00     6
2014-04-17 14:30:00     6
2015-09-21 15:57:00     6
2015-05-30 18:42:00     6
2014-10-23 0

In [31]:
# morning, afternoon, evening

animals['hour'] = animals['datetime'].map(lambda row: row.hour)

In [32]:
animals.hour.value_counts().sort_index()

0      378
5        2
6       10
7      110
8      302
9     1278
10     405
11    2042
12    2513
13    2468
14    2800
15    2682
16    2690
17    4162
18    3684
19    1083
20      77
21      18
22       5
23      20
Name: hour, dtype: int64

In [33]:
animals['minutes'] = animals['datetime'].map(lambda row: row.minute)

In [34]:
animals.minutes.value_counts()

0     1895
17     485
6      466
58     462
15     461
9      460
20     455
4      453
16     451
38     448
8      448
59     447
37     443
52     443
26     441
5      440
55     440
42     437
32     436
34     436
25     436
2      436
10     433
31     431
22     429
39     426
50     426
27     426
56     425
7      422
30     422
1      418
18     417
53     416
14     413
13     413
47     409
54     409
57     409
23     409
33     408
46     407
21     407
29     407
51     406
43     404
41     402
19     402
24     400
40     397
3      396
28     396
49     396
44     388
11     385
36     381
45     377
12     369
48     366
35     363
Name: minutes, dtype: int64

In [35]:
animals[(animals['hour']==0) & (animals['minutes']==0)]

Unnamed: 0,id,name,date,outcome,outcome_info,animal,sex,age,breed,color,datetime,day,month,year,hour,minutes
38,A702655,Mica,2015-08-11 00:00:00,Transfer,Partner,Cat,Neutered Male,4 months,Domestic Shorthair Mix,Black/White,2015-08-11,1,8,2015,0,0
117,A713392,Basil,2015-11-05 00:00:00,Transfer,Partner,Dog,Intact Male,3 months,Chihuahua Shorthair Mix,Black/White,2015-11-05,3,11,2015,0,0
175,A710071,Herb,2015-09-01 00:00:00,Transfer,Partner,Cat,Neutered Male,2 months,Domestic Shorthair Mix,White/Black,2015-09-01,1,9,2015,0,0
181,A716575,Scarlet,2015-12-22 00:00:00,Transfer,Partner,Dog,Spayed Female,2 years,Australian Cattle Dog Mix,Tan/White,2015-12-22,1,12,2015,0,0
195,A683666,unknown,2015-09-01 00:00:00,Transfer,Partner,Cat,Neutered Male,1 year,Domestic Shorthair Mix,White/Cream Tabby,2015-09-01,1,9,2015,0,0
248,A709067,Earlene,2015-08-05 00:00:00,Return_to_owner,,Dog,Intact Female,5 months,Boxer Mix,Black/White,2015-08-05,2,8,2015,0,0
251,A720190,Little Missy,2016-02-07 00:00:00,Transfer,Partner,Dog,Spayed Female,7 years,Airedale Terrier Mix,Tricolor,2016-02-07,6,2,2016,0,0
289,A714820,Lizzie,2015-11-05 00:00:00,Transfer,Partner,Dog,Spayed Female,6 months,Plott Hound Mix,Brown Brindle,2015-11-05,3,11,2015,0,0
306,A716488,unknown,2015-11-23 00:00:00,Transfer,Partner,Dog,Neutered Male,2 years,Pit Bull Mix,Red,2015-11-23,0,11,2015,0,0
409,A716400,Calliope,2015-11-24 00:00:00,Transfer,Partner,Dog,Intact Female,1 year,Yorkshire Terrier Mix,Black/Tan,2015-11-24,1,11,2015,0,0


In [36]:
animals[(animals['hour']==0) & (animals['minutes']==0)].shape

(375, 16)

In [37]:
animals.animal.value_counts(dropna=False)

Dog    15595
Cat    11134
Name: animal, dtype: int64

In [38]:
animals.outcome.value_counts(dropna=False)

Adoption           10769
Transfer            9422
Return_to_owner     4786
Euthanasia          1555
Died                 197
Name: outcome, dtype: int64

In [39]:
animals.outcome_info.value_counts(dropna=False)

NaN                    13612
Partner                 7816
Foster                  1800
SCRP                    1599
Suffering               1002
Aggressive               320
Offsite                  165
In Kennel                114
Behavior                  86
Rabies Risk               74
Medical                   66
In Foster                 52
Enroute                    8
Court/Investigation        6
At Vet                     4
In Surgery                 3
Barn                       2
Name: outcome_info, dtype: int64

In [51]:
animals['id'].groupby([animals['outcome'], animals['outcome_info']]).count()

outcome     outcome_info       
Adoption    Barn                      1
            Foster                 1800
            Offsite                 165
Died        Enroute                   8
            Foster                   52
            In Kennel               114
            Medical                   7
Euthanasia  Behavior                406
            Court/Investigation       6
            Medical                  66
            Rabies Risk              74
            Suffering              1002
Transfer    Barn                      1
            Partner                7816
            SCRP                   1599
Name: id, dtype: int64

In [40]:
def outcome_info_cleanup(orginal='original', new='new'):
    animals.outcome_info = animals.outcome_info.str.replace(orginal, new)
    return len(animals[animals['outcome_info'] == new])

In [42]:
outcome_info_cleanup(orginal='Aggressive', new='Behavior')

406

In [43]:
outcome_info_cleanup(orginal='In Foster', new='Foster')

1852

In [44]:
outcome_info_cleanup(orginal='At Vet', new='Medical')

70

In [45]:
outcome_info_cleanup(orginal='In Surgery', new='Medical')

73

In [47]:
animals.outcome_info.value_counts(dropna=False)

NaN                    13612
Partner                 7816
Foster                  1852
SCRP                    1599
Suffering               1002
Behavior                 406
Offsite                  165
In Kennel                114
Rabies Risk               74
Medical                   73
Enroute                    8
Court/Investigation        6
Barn                       2
Name: outcome_info, dtype: int64

In [45]:
animals.columns

Index(['id', 'name', 'date', 'outcome', 'outcome_info', 'animal', 'sex', 'age',
       'breed', 'color', 'datetime', 'day', 'month', 'year', 'hour',
       'minutes'],
      dtype='object')

In [46]:
animals.sex.value_counts(dropna=True)

Neutered Male    9779
Spayed Female    8820
Intact Male      3525
Intact Female    3511
Unknown          1093
Name: sex, dtype: int64

In [47]:
len(animals.breed.value_counts(dropna=True))

1380

In [48]:
len(animals.color.value_counts(dropna=True))

366

In [49]:
animals.name.value_counts().head(10)

unknown     7691
Max          136
Bella        135
Charlie      107
Daisy        106
Lucy          94
Buddy         87
Princess      86
Rocky         85
Luna          68
Name: name, dtype: int64

In [50]:
animals.shape

(26729, 16)

In [51]:
import plotly.plotly as py
import plotly.graph_objs as go

In [52]:
import cufflinks as cf
print (cf.__version__)

0.8.2


In [53]:
animals.columns

Index(['id', 'name', 'date', 'outcome', 'outcome_info', 'animal', 'sex', 'age',
       'breed', 'color', 'datetime', 'day', 'month', 'year', 'hour',
       'minutes'],
      dtype='object')

In [54]:
animals.sex.value_counts()

Neutered Male    9779
Spayed Female    8820
Intact Male      3525
Intact Female    3511
Unknown          1093
Name: sex, dtype: int64

## TO DO: explore other plot types & colors...
- ie boxplots?
- regplot, lmplot (logistic regression plot from seaborn?)

In [56]:
animals['sex'].value_counts().iplot(kind='bar', yTitle='Sex', title='Animal sex',)

In [55]:
animals['name'].value_counts()[:20].iplot(kind='bar', yTitle='Name Frequency', title='Animal names',)

In [57]:
animals['outcome'].value_counts().iplot(kind='bar', yTitle='Outcomes', title='Animal outcomes',)

In [59]:
# subset/group by dog vs cat
animals['breed'].value_counts().iplot(kind='bar', yTitle='Breeds', title='Animal Breeds')

In [60]:
# subset/group by dog vs cat
animals['color'].value_counts().iplot(kind='bar', yTitle='Color of coat', title='Animal coat colors')