[Shelter Animal Outcomes](https://www.kaggle.com/c/shelter-animal-outcomes)

In [1]:
reset -fs

In [71]:
import pandas as pd
import numpy as np
from collections import Counter
# import datetime

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.plotly as py
import plotly.graph_objs as go
import cufflinks as cf

sns.set_style("dark")
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [3]:
animals_test = pd.read_csv('data/test.csv')
print('shape:',animals_test.shape)
animals_test.head(2)

shape: (11456, 8)


Unnamed: 0,ID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,1,Summer,2015-10-12 12:15:00,Dog,Intact Female,10 months,Labrador Retriever Mix,Red/White
1,2,Cheyenne,2014-07-26 17:59:00,Dog,Spayed Female,2 years,German Shepherd/Siberian Husky,Black/Tan


In [4]:
animals = pd.read_csv('data/train.csv')
animals.head(2)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby


In [5]:
animals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26729 entries, 0 to 26728
Data columns (total 10 columns):
AnimalID          26729 non-null object
Name              19038 non-null object
DateTime          26729 non-null object
OutcomeType       26729 non-null object
OutcomeSubtype    13117 non-null object
AnimalType        26729 non-null object
SexuponOutcome    26728 non-null object
AgeuponOutcome    26711 non-null object
Breed             26729 non-null object
Color             26729 non-null object
dtypes: object(10)
memory usage: 2.0+ MB


## Data cleaning

In [6]:
# rename columns
# Lower-case the raw column names first so the mapping below needs one spelling.
animals.columns = [col.lower() for col in animals.columns]
# Only `columns=` is passed to rename(): the former `index=str` argument also
# applied str() to every row label, turning the integer index into strings.
animals = animals.rename(
    columns={
        "animalid": "id",
        "datetime": "date_outcome",
        "outcometype": "outcome",
        "outcomesubtype": "outcome_info",
        "animaltype": "animal",
        "sexuponoutcome": "sex_outcome",
        "ageuponoutcome": "age",
    })
animals.columns

Index(['id', 'name', 'date_outcome', 'outcome', 'outcome_info', 'animal',
       'sex_outcome', 'age', 'breed', 'color'],
      dtype='object')

In [7]:
len(animals.name.unique())

6375

In [8]:
animals.name.value_counts(dropna=False).head(11)

NaN         7691
Max          136
Bella        135
Charlie      107
Daisy        106
Lucy          94
Buddy         87
Princess      86
Rocky         85
Luna          68
Jack          66
Name: name, dtype: int64

In [9]:
# replace NaN with "unknown"
# Assign the result back to the column: calling an in-place method on the
# `animals.name` accessor is a chained assignment, which pandas warns about and
# which does not modify `animals` once Copy-on-Write is enabled.
animals['name'] = animals['name'].fillna('unknown')
animals.name.value_counts(dropna=False).head(10)

unknown     7691
Max          136
Bella        135
Charlie      107
Daisy        106
Lucy          94
Buddy         87
Princess      86
Rocky         85
Luna          68
Name: name, dtype: int64

In [10]:
# create has_name column - 0 no name, 1 name
# Compare for equality with the 'unknown' placeholder; a substring test would
# also flag any real name that merely contains the text "unknown".
animals['has_name'] = (animals['name'] != 'unknown').astype(int)

In [11]:
animals[['name','has_name']].head()

Unnamed: 0,name,has_name
0,Hambone,1
1,Emily,1
2,Pearce,1
3,unknown,0
4,unknown,0


In [12]:
# Ten most frequent real names. The 'unknown' placeholder is removed *before*
# taking the top 10, so the list always holds ten names regardless of where
# (or whether) the placeholder ranks among the most common values.
top10 = (animals.name.value_counts()
         .drop('unknown', errors='ignore')
         .head(10)
         .index.tolist())

def popular_name(row):
    """Return 1 if the name `row` is one of the `top10` names, otherwise 0."""
    return 1 if row in top10 else 0

assert popular_name('Jack') == 1

In [13]:
animals['pop_name'] = animals.name.apply(popular_name)

In [14]:
animals.pop_name.value_counts()

0    25759
1      970
Name: pop_name, dtype: int64

In [15]:
sum(animals.name.value_counts().head(11)) - len(animals[animals.name == 'unknown'])

970

In [16]:
animals.head(10)

Unnamed: 0,id,name,date_outcome,outcome,outcome_info,animal,sex_outcome,age,breed,color,has_name,pop_name
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,1,0
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,1,0
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,1,0
3,A683430,unknown,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream,0,0
4,A667013,unknown,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan,0,0
5,A677334,Elsa,2014-04-25 13:04:00,Transfer,Partner,Dog,Intact Female,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan,1,0
6,A699218,Jimmy,2015-03-28 13:11:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Tabby,1,0
7,A701489,unknown,2015-04-30 17:02:00,Transfer,Partner,Cat,Unknown,3 weeks,Domestic Shorthair Mix,Brown Tabby,0,0
8,A671784,Lucy,2014-02-04 17:17:00,Adoption,,Dog,Spayed Female,5 months,American Pit Bull Terrier Mix,Red/White,1,1
9,A677747,unknown,2014-05-03 07:48:00,Adoption,Offsite,Dog,Spayed Female,1 year,Cairn Terrier,White,0,0


### Sex and spayed/neutered
Make new columns for sex and spayed/neutered status; drop rows where the sex is unknown.

In [17]:
animals.sex_outcome.value_counts(dropna=False)

Neutered Male    9779
Spayed Female    8820
Intact Male      3525
Intact Female    3511
Unknown          1093
NaN                 1
Name: sex_outcome, dtype: int64

In [18]:
# make "Unknown" NaN for dropping
# Assigned back to the column rather than replaced in place through the
# attribute accessor, which is a chained assignment and is not guaranteed to
# update `animals` (it does not under pandas Copy-on-Write).
animals['sex_outcome'] = animals['sex_outcome'].replace('Unknown', np.nan)

In [19]:
animals.shape

(26729, 12)

In [20]:
animals.dropna(subset=['sex_outcome'],inplace=True)
animals.reset_index(drop=True, inplace=True)

In [21]:
animals.shape

(25635, 12)

In [22]:
# Binary flags derived from the combined sex_outcome text:
#   sex           -> 1 when the text contains 'Female', otherwise 0
#   spay_neutered -> 0 when the text contains 'Intact', otherwise 1
animals['sex'] = animals.sex_outcome.map(lambda label: int('Female' in label))
animals['spay_neutered'] = animals.sex_outcome.map(lambda label: int('Intact' not in label))

In [23]:
animals.sex.value_counts(dropna=False)

0    13304
1    12331
Name: sex, dtype: int64

In [24]:
animals.spay_neutered.value_counts(dropna=False)

1    18599
0     7036
Name: spay_neutered, dtype: int64

### Date of outcome
Convert the string date to datetime and create columns for day of week, month, year, hour and minute.

In [25]:
# Parse the outcome timestamp with an explicit format (the data looks like
# '2014-02-12 18:22:00'). `infer_datetime_format` is deprecated in recent
# pandas, and an explicit format fails loudly on any row that does not match.
animals['datetime'] = pd.to_datetime(animals['date_outcome'], format='%Y-%m-%d %H:%M:%S')

In [26]:
# Split the parsed timestamp into its components with the vectorised `.dt`
# accessor instead of calling a Python lambda once per row.
# Monday is 0, Sunday is 6
animals['day'] = animals['datetime'].dt.weekday
# 1-12
animals['month'] = animals['datetime'].dt.month
# 2013-2016
animals['year'] = animals['datetime'].dt.year
# used later to split into morning, afternoon, evening
animals['hour'] = animals['datetime'].dt.hour
# to investigate when hour = 0
animals['minutes'] = animals['datetime'].dt.minute

In [27]:
animals.hour.value_counts(dropna=False).sort_index()

0      364
5        1
6        9
7      109
8      292
9     1057
10     375
11    1997
12    2442
13    2373
14    2670
15    2573
16    2612
17    4058
18    3562
19    1024
20      74
21      18
22       5
23      20
Name: hour, dtype: int64

In [28]:
# 00:00:00 is ambiguous: it could be midnight, first thing in the morning,
# or essentially a missing value
animals.datetime.value_counts().head(10)

2015-08-11 00:00:00    19
2015-11-17 00:00:00    17
2015-07-02 00:00:00    13
2015-04-02 00:00:00    11
2014-08-31 09:00:00    10
2015-11-23 00:00:00     9
2016-02-15 00:00:00     9
2015-09-11 00:00:00     9
2015-09-01 00:00:00     8
2015-11-08 00:00:00     8
Name: datetime, dtype: int64

In [29]:
animals[(animals['hour']==0) & (animals['minutes']==0)]

Unnamed: 0,id,name,date_outcome,outcome,outcome_info,animal,sex_outcome,age,breed,color,has_name,pop_name,sex,spay_neutered,datetime,day,month,year,hour,minutes
36,A702655,Mica,2015-08-11 00:00:00,Transfer,Partner,Cat,Neutered Male,4 months,Domestic Shorthair Mix,Black/White,1,0,0,1,2015-08-11,1,8,2015,0,0
110,A713392,Basil,2015-11-05 00:00:00,Transfer,Partner,Dog,Intact Male,3 months,Chihuahua Shorthair Mix,Black/White,1,0,0,0,2015-11-05,3,11,2015,0,0
167,A710071,Herb,2015-09-01 00:00:00,Transfer,Partner,Cat,Neutered Male,2 months,Domestic Shorthair Mix,White/Black,1,0,0,1,2015-09-01,1,9,2015,0,0
173,A716575,Scarlet,2015-12-22 00:00:00,Transfer,Partner,Dog,Spayed Female,2 years,Australian Cattle Dog Mix,Tan/White,1,0,1,1,2015-12-22,1,12,2015,0,0
187,A683666,unknown,2015-09-01 00:00:00,Transfer,Partner,Cat,Neutered Male,1 year,Domestic Shorthair Mix,White/Cream Tabby,0,0,0,1,2015-09-01,1,9,2015,0,0
239,A709067,Earlene,2015-08-05 00:00:00,Return_to_owner,,Dog,Intact Female,5 months,Boxer Mix,Black/White,1,0,1,0,2015-08-05,2,8,2015,0,0
242,A720190,Little Missy,2016-02-07 00:00:00,Transfer,Partner,Dog,Spayed Female,7 years,Airedale Terrier Mix,Tricolor,1,0,1,1,2016-02-07,6,2,2016,0,0
278,A714820,Lizzie,2015-11-05 00:00:00,Transfer,Partner,Dog,Spayed Female,6 months,Plott Hound Mix,Brown Brindle,1,0,1,1,2015-11-05,3,11,2015,0,0
295,A716488,unknown,2015-11-23 00:00:00,Transfer,Partner,Dog,Neutered Male,2 years,Pit Bull Mix,Red,0,0,0,1,2015-11-23,0,11,2015,0,0
395,A716400,Calliope,2015-11-24 00:00:00,Transfer,Partner,Dog,Intact Female,1 year,Yorkshire Terrier Mix,Black/Tan,1,0,1,0,2015-11-24,1,11,2015,0,0


In [30]:
# 361 rows are exactly 00:00, and only 3 rows have hour = 0 with minutes != 0,
# so I'll drop every row where hour = 0
print(animals[(animals['hour']==0) & (animals['minutes']==0)].shape)
print(animals[(animals['hour']==0) & (animals['minutes']!=0)].shape)

(361, 20)
(3, 20)


In [31]:
# Mark hour 0 as NaN so those rows can be dropped: almost all of them are
# 00:00:00 placeholders rather than real midnight outcomes.
# Assigned back to the column instead of replacing in place through the
# attribute accessor, which is a chained assignment and does not update
# `animals` under pandas Copy-on-Write.
animals['hour'] = animals['hour'].replace(0, np.nan)

In [32]:
animals.dropna(subset=['hour'],inplace=True)

In [33]:
def season(row):
    """Map a month number to a season name.

    3-5 -> 'spring', 6-8 -> 'summer', 9-11 -> 'fall'; every other value
    (including 12, 1 and 2) -> 'winter'.
    """
    if row in (3, 4, 5):
        return 'spring'
    if row in (6, 7, 8):
        return 'summer'
    if row in (9, 10, 11):
        return 'fall'
    return 'winter'

assert season(12) == 'winter'

In [34]:
animals['season'] = animals.month.apply(season)

In [35]:
animals.hour.value_counts(dropna=False).sort_index(ascending=True)

5.0        1
6.0        9
7.0      109
8.0      292
9.0     1057
10.0     375
11.0    1997
12.0    2442
13.0    2373
14.0    2670
15.0    2573
16.0    2612
17.0    4058
18.0    3562
19.0    1024
20.0      74
21.0      18
22.0       5
23.0      20
Name: hour, dtype: int64

In [36]:
def time_of_day(row):
    """Bucket an hour of the day (24-hour clock) into a part of the day.

    5 <= hour < 12  -> 'morning'
    12 <= hour < 17 -> 'afternoon'
    anything else   -> 'evening' (late-night hours 0-4 included)

    The afternoon upper bound is 17 (5 pm). The previous bound of 5 made the
    condition `row >= 12 and row < 5` impossible, so no hour was ever labelled
    'afternoon'.
    """
    if 5 <= row < 12:
        return 'morning'
    if 12 <= row < 17:
        return 'afternoon'
    return 'evening'

assert time_of_day(5) == 'morning'

In [37]:
# time_of_day() buckets an hour of the day, so feed it the `hour` column
# (it was previously applied to `month`, which only ranges over 1-12).
animals['time_of_day'] = animals.hour.apply(time_of_day)

### Outcome info

After examining this variable, I've concluded that it isn't useful for modeling: it describes the details of the outcome after the fact, so I won't include it in any model.

In [38]:
# good number of NaNs
animals.outcome_info.value_counts(dropna=False)

NaN                    13540
Partner                 6866
Foster                  1798
SCRP                    1292
Suffering                910
Aggressive               318
Offsite                  164
In Kennel                 98
Behavior                  86
Rabies Risk               69
Medical                   63
In Foster                 48
Court/Investigation        6
Enroute                    4
At Vet                     4
In Surgery                 3
Barn                       2
Name: outcome_info, dtype: int64

In [39]:
animals[['outcome', 'outcome_info']]

Unnamed: 0,outcome,outcome_info
0,Return_to_owner,
1,Euthanasia,Suffering
2,Adoption,Foster
3,Transfer,Partner
4,Transfer,Partner
5,Transfer,Partner
6,Transfer,Partner
7,Adoption,
8,Adoption,Offsite
9,Adoption,


In [40]:
animals[['id']].groupby([animals['outcome'], animals['outcome_info']]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,id
outcome,outcome_info,Unnamed: 2_level_1
Adoption,Barn,1
Adoption,Foster,1798
Adoption,Offsite,164
Died,At Vet,4
Died,Enroute,4
Died,In Foster,48
Died,In Kennel,98
Died,In Surgery,3
Euthanasia,Aggressive,318
Euthanasia,Behavior,86


In [41]:
animals[['id']].groupby([animals['outcome'], animals['outcome_info']]).count().unstack()

Unnamed: 0_level_0,id,id,id,id,id,id,id,id,id,id,id,id,id,id,id,id
outcome_info,Aggressive,At Vet,Barn,Behavior,Court/Investigation,Enroute,Foster,In Foster,In Kennel,In Surgery,Medical,Offsite,Partner,Rabies Risk,SCRP,Suffering
outcome,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Adoption,,,1.0,,,,1798.0,,,,,164.0,,,,
Died,,4.0,,,,4.0,,48.0,98.0,3.0,,,,,,
Euthanasia,318.0,,,86.0,6.0,,,,,,63.0,,,69.0,,910.0
Transfer,,,1.0,,,,,,,,,,6866.0,,1292.0,


In [42]:
animals[['id']].groupby([animals['outcome'], animals['outcome_info'].isnull()]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,id
outcome,outcome_info,Unnamed: 2_level_1
Adoption,False,1963
Adoption,True,8769
Died,False,157
Died,True,11
Euthanasia,False,1452
Euthanasia,True,1
Return_to_owner,True,4753
Transfer,False,8159
Transfer,True,6


In [43]:
animals[['outcome', 'outcome_info']][animals['outcome_info'].isnull()].head(10)

Unnamed: 0,outcome,outcome_info
0,Return_to_owner,
7,Adoption,
9,Adoption,
11,Return_to_owner,
12,Return_to_owner,
17,Return_to_owner,
20,Adoption,
21,Adoption,
22,Return_to_owner,
24,Adoption,


### Age

In [44]:
len(animals.age.unique())

45

In [45]:
# drop NaN and 0 years
animals.age.value_counts(dropna=False)

1 year       3788
2 years      3599
2 months     3331
3 years      1788
3 months     1231
1 month      1230
4 years      1047
5 years       972
4 months      851
6 years       656
5 months      632
6 months      563
8 years       529
3 weeks       528
7 years       521
10 years      442
10 months     442
8 months      371
2 weeks       359
9 years       283
7 months      281
4 weeks       280
12 years      232
9 months      218
11 months     161
13 years      141
11 years      125
1 weeks       114
14 years       96
15 years       85
1 week         66
2 days         60
3 days         55
4 days         38
16 years       34
1 day          24
6 days         21
17 years       17
NaN            14
5 weeks        11
5 days         11
18 years       10
0 years         9
19 years        3
20 years        2
Name: age, dtype: int64

In [46]:
# Treat '0 years' as missing so it is dropped together with the NaN ages.
# Assigned back to the column instead of replacing in place through the
# attribute accessor, which is a chained assignment and does not update
# `animals` under pandas Copy-on-Write.
animals['age'] = animals['age'].replace('0 years', np.nan)

In [47]:
animals.shape

(25271, 22)

In [48]:
animals.dropna(subset=['age'],inplace=True)
animals.reset_index(drop=True, inplace=True)

In [49]:
animals.shape

(25248, 22)

In [50]:
def age_in_weeks(row):
    """Convert an age string such as '3 weeks' or '2 years' to a number of weeks.

    The leading integer is scaled by the unit found in the text: days are
    divided by 7, weeks are kept as-is, months count as 4 weeks and years as
    52 weeks. Units are checked in the order day, week, month, year, and a unit
    only counts when it appears after the first character. If no unit is
    recognised the string 'unknown' is returned.
    """
    def mentions(unit):
        return row.find(unit) > 0

    def leading_number():
        return int(row.split(' ')[0])

    if mentions('day'):
        return leading_number() / 7
    for unit, weeks_per_unit in (('week', 1), ('month', 4), ('year', 52)):
        if mentions(unit):
            return leading_number() * weeks_per_unit
    return 'unknown'

assert age_in_weeks('1 day') == 1/7
assert age_in_weeks('6 days') == 6/7

In [51]:
animals['age_in_weeks'] = animals.age.apply(age_in_weeks)

In [52]:
# verify
animals[['age','age_in_weeks']].head(50)

Unnamed: 0,age,age_in_weeks
0,1 year,52.0
1,1 year,52.0
2,2 years,104.0
3,3 weeks,3.0
4,2 years,104.0
5,1 month,4.0
6,3 weeks,3.0
7,5 months,20.0
8,1 year,52.0
9,2 years,104.0


In [53]:
assert age_in_weeks(animals.age[10]) == 208

In [54]:
animals.age_in_weeks.value_counts(dropna=False)

52.000000      3788
104.000000     3599
8.000000       3331
156.000000     1788
4.000000       1510
12.000000      1231
208.000000     1047
260.000000      972
16.000000       851
312.000000      656
20.000000       632
24.000000       563
416.000000      529
3.000000        528
364.000000      521
520.000000      442
40.000000       442
32.000000       371
2.000000        359
468.000000      283
28.000000       281
624.000000      232
36.000000       218
1.000000        180
44.000000       161
676.000000      141
572.000000      125
728.000000       96
780.000000       85
0.285714         60
0.428571         55
0.571429         38
832.000000       34
0.142857         24
0.857143         21
884.000000       17
5.000000         11
0.714286         11
936.000000       10
988.000000        3
1040.000000       2
Name: age_in_weeks, dtype: int64

### Color

In [55]:
animals.columns

Index(['id', 'name', 'date_outcome', 'outcome', 'outcome_info', 'animal',
       'sex_outcome', 'age', 'breed', 'color', 'has_name', 'pop_name', 'sex',
       'spay_neutered', 'datetime', 'day', 'month', 'year', 'hour', 'minutes',
       'season', 'time_of_day', 'age_in_weeks'],
      dtype='object')

In [56]:
len(animals.color.value_counts(dropna=True))

358

In [58]:
value_counts = animals.color.value_counts()
print(len(value_counts[value_counts > 500]))
value_counts[value_counts > 500]

16


Black/White          2658
Black                2053
Brown Tabby          1428
White                 879
Brown/White           877
Brown Tabby/White     829
Tan/White             757
Orange Tabby          750
Tricolor              740
Blue/White            663
Black/Tan             663
Brown                 617
Tan                   609
White/Black           598
White/Brown           554
Tortie                509
Name: color, dtype: int64

In [59]:
# Split each colour description on '/' into a list of its colour phrases.
animals['color_list'] = [entry.split('/') for entry in animals.color]

In [60]:
animals['color_list'].head(10)

0    [Brown, White]
1     [Cream Tabby]
2     [Blue, White]
3      [Blue Cream]
4             [Tan]
5      [Black, Tan]
6      [Blue Tabby]
7      [Red, White]
8           [White]
9          [Silver]
Name: color_list, dtype: object

In [61]:
type(animals['color_list'][0])

list

In [62]:
colors = [color for color_list in animals.color_list for color in color_list]
unique_colors = list(set(colors))
print(len(unique_colors))
unique_colors

57


['Tortie Point',
 'Buff',
 'Tortie',
 'Lilac Point',
 'Black Brindle',
 'Yellow Brindle',
 'Black',
 'Blue Tiger',
 'Cream Tabby',
 'Fawn',
 'Blue Tabby',
 'Orange',
 'Gray',
 'Lynx Point',
 'Brown Merle',
 'Blue Cream',
 'Yellow',
 'Brown Tabby',
 'Tan',
 'Red Tick',
 'Apricot',
 'Torbie',
 'Orange Tiger',
 'Blue Tick',
 'Blue Smoke',
 'Silver Lynx Point',
 'Blue',
 'Ruddy',
 'Cream',
 'Red',
 'Brown Brindle',
 'Seal Point',
 'Flame Point',
 'Pink',
 'Blue Point',
 'Sable',
 'Liver',
 'Gray Tabby',
 'Calico Point',
 'Calico',
 'Silver',
 'Orange Tabby',
 'Blue Merle',
 'Brown Tiger',
 'Chocolate Point',
 'Black Tiger',
 'Chocolate',
 'Tricolor',
 'Red Merle',
 'Gold',
 'Black Smoke',
 'Brown',
 'Silver Tabby',
 'Agouti',
 'Liver Tick',
 'Black Tabby',
 'White']

In [82]:
only_colors = [colors for phrases in unique_colors for colors in phrases.split()]
len(set((only_colors)))

36

In [79]:
only_colors = Counter(only_colors)
only_colors.most_common()

[('Point', 9),
 ('Blue', 8),
 ('Tabby', 7),
 ('Black', 5),
 ('Brown', 5),
 ('Tiger', 4),
 ('Brindle', 3),
 ('Cream', 3),
 ('Orange', 3),
 ('Merle', 3),
 ('Red', 3),
 ('Tick', 3),
 ('Silver', 3),
 ('Tortie', 2),
 ('Yellow', 2),
 ('Gray', 2),
 ('Lynx', 2),
 ('Smoke', 2),
 ('Liver', 2),
 ('Calico', 2),
 ('Chocolate', 2),
 ('Buff', 1),
 ('Lilac', 1),
 ('Fawn', 1),
 ('Tan', 1),
 ('Apricot', 1),
 ('Torbie', 1),
 ('Ruddy', 1),
 ('Seal', 1),
 ('Flame', 1),
 ('Pink', 1),
 ('Sable', 1),
 ('Tricolor', 1),
 ('Gold', 1),
 ('Agouti', 1),
 ('White', 1)]

In [83]:
only_colors = list(set(only_colors))
len(only_colors)

36

In [85]:
only_colors

['Buff',
 'Tortie',
 'Tiger',
 'Black',
 'Tick',
 'Fawn',
 'Lilac',
 'Brindle',
 'Merle',
 'Orange',
 'Lynx',
 'Gray',
 'Flame',
 'Point',
 'Yellow',
 'Apricot',
 'Tan',
 'Torbie',
 'Blue',
 'Ruddy',
 'Cream',
 'Red',
 'Pink',
 'Sable',
 'Liver',
 'Seal',
 'Silver',
 'Calico',
 'Tabby',
 'Chocolate',
 'Tricolor',
 'Gold',
 'Agouti',
 'Smoke',
 'Brown',
 'White']

In [86]:
animal_colors = pd.DataFrame()

In [87]:
# One 0/1 indicator column per distinct colour phrase: 1 when that phrase is in
# the animal's color_list, 0 otherwise. Columns follow the order of unique_colors.
animal_colors = pd.DataFrame({
    shade: animals.color_list.map(lambda parts, shade=shade: 1 if shade in parts else 0)
    for shade in unique_colors
})

In [88]:
animal_colors.columns

Index(['Tortie Point', 'Buff', 'Tortie', 'Lilac Point', 'Black Brindle',
       'Yellow Brindle', 'Black', 'Blue Tiger', 'Cream Tabby', 'Fawn',
       'Blue Tabby', 'Orange', 'Gray', 'Lynx Point', 'Brown Merle',
       'Blue Cream', 'Yellow', 'Brown Tabby', 'Tan', 'Red Tick', 'Apricot',
       'Torbie', 'Orange Tiger', 'Blue Tick', 'Blue Smoke',
       'Silver Lynx Point', 'Blue', 'Ruddy', 'Cream', 'Red', 'Brown Brindle',
       'Seal Point', 'Flame Point', 'Pink', 'Blue Point', 'Sable', 'Liver',
       'Gray Tabby', 'Calico Point', 'Calico', 'Silver', 'Orange Tabby',
       'Blue Merle', 'Brown Tiger', 'Chocolate Point', 'Black Tiger',
       'Chocolate', 'Tricolor', 'Red Merle', 'Gold', 'Black Smoke', 'Brown',
       'Silver Tabby', 'Agouti', 'Liver Tick', 'Black Tabby', 'White'],
      dtype='object')

In [89]:
animal_colors.head()

Unnamed: 0,Tortie Point,Buff,Tortie,Lilac Point,Black Brindle,Yellow Brindle,Black,Blue Tiger,Cream Tabby,Fawn,...,Tricolor,Red Merle,Gold,Black Smoke,Brown,Silver Tabby,Agouti,Liver Tick,Black Tabby,White
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [90]:
animal_colors.describe()

Unnamed: 0,Tortie Point,Buff,Tortie,Lilac Point,Black Brindle,Yellow Brindle,Black,Blue Tiger,Cream Tabby,Fawn,...,Tricolor,Red Merle,Gold,Black Smoke,Brown,Silver Tabby,Agouti,Liver Tick,Black Tabby,White
count,25248.0,25248.0,25248.0,25248.0,25248.0,25248.0,25248.0,25248.0,25248.0,25248.0,...,25248.0,25248.0,25248.0,25248.0,25248.0,25248.0,25248.0,25248.0,25248.0,25248.0
mean,0.001228,0.012714,0.02222,0.001386,0.004476,0.001545,0.289251,0.000356,0.010417,0.008199,...,0.035528,0.002337,0.003208,0.002733,0.118227,0.001584,7.9e-05,0.000158,0.002297,0.459522
std,0.035019,0.112039,0.1474,0.037207,0.066751,0.039273,0.453423,0.018877,0.101531,0.090176,...,0.185113,0.048285,0.056551,0.052207,0.322883,0.039772,0.0089,0.012586,0.047875,0.498369
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [91]:
animal_colors['White'].mean()

0.45952154626108999

In [92]:
# Share of animals carrying each colour flag, listed from most to least common.
means = [animal_colors[name].mean() for name in animal_colors.columns]
columns = list(animal_colors.columns)
sorted(zip(means, columns), reverse=True)

[(0.45952154626108999, 'White'),
 (0.28925063371356147, 'Black'),
 (0.11822718631178707, 'Brown'),
 (0.11739543726235742, 'Tan'),
 (0.09367078580481622, 'Brown Tabby'),
 (0.049786121673003801, 'Blue'),
 (0.048637515842839034, 'Orange Tabby'),
 (0.035527566539923956, 'Tricolor'),
 (0.034220532319391636, 'Red'),
 (0.03382446134347275, 'Brown Brindle'),
 (0.025110899873257288, 'Blue Tabby'),
 (0.022219581749049429, 'Tortie'),
 (0.021229404309252218, 'Calico'),
 (0.019288656527249683, 'Chocolate'),
 (0.014892268694550063, 'Torbie'),
 (0.013307984790874524, 'Gray'),
 (0.012713878326996197, 'Buff'),
 (0.012674271229404309, 'Sable'),
 (0.01136723700887199, 'Cream'),
 (0.010416666666666666, 'Cream Tabby'),
 (0.0093076679340937888, 'Yellow'),
 (0.0081986692015209132, 'Fawn'),
 (0.0070104562737642586, 'Lynx Point'),
 (0.0069708491761723704, 'Blue Merle'),
 (0.0057430291508238276, 'Seal Point'),
 (0.0044756020278833965, 'Black Brindle'),
 (0.0032873891001267428, 'Flame Point'),
 (0.00320817490494

In [204]:
# 1 when the coat description lists two or more colour phrases, otherwise 0.
animals['multicolor'] = animals.color_list.map(lambda parts: int(len(parts) >= 2))

In [205]:
animals[['multicolor', 'color_list']]

Unnamed: 0,multicolor,color_list
0,1,"[Brown, White]"
1,0,[Cream Tabby]
2,1,"[Blue, White]"
3,0,[Blue Cream]
4,0,[Tan]
5,1,"[Black, Tan]"
6,0,[Blue Tabby]
7,1,"[Red, White]"
8,0,[White]
9,0,[Silver]


### Breed

In [187]:
print(len(animals.breed.value_counts()))
animals.breed.value_counts().head(60)

1371


Domestic Shorthair Mix                7782
Pit Bull Mix                          1876
Chihuahua Shorthair Mix               1726
Labrador Retriever Mix                1322
Domestic Medium Hair Mix               786
German Shepherd Mix                    570
Domestic Longhair Mix                  482
Siamese Mix                            368
Australian Cattle Dog Mix              362
Dachshund Mix                          309
Boxer Mix                              233
Miniature Poodle Mix                   231
Border Collie Mix                      214
Australian Shepherd Mix                158
Catahoula Mix                          155
Rat Terrier Mix                        153
Jack Russell Terrier Mix               144
Yorkshire Terrier Mix                  141
Chihuahua Longhair Mix                 138
Siberian Husky Mix                     135
Miniature Schnauzer Mix                133
Domestic Shorthair                     133
Beagle Mix                             123
Rottweiler 

In [231]:
# 1 when the breed text contains a '/' or the text 'mix' (case-insensitive),
# otherwise 0.
animals['mixed_breed'] = animals.breed.map(
    lambda label: int('/' in label or 'mix' in label.lower()))

In [232]:
animals[['breed','mixed_breed']]

Unnamed: 0,breed,mixed_breed
0,Shetland Sheepdog Mix,1
1,Domestic Shorthair Mix,1
2,Pit Bull Mix,1
3,Domestic Shorthair Mix,1
4,Lhasa Apso/Miniature Poodle,1
5,Cairn Terrier/Chihuahua Shorthair,1
6,Domestic Shorthair Mix,1
7,American Pit Bull Terrier Mix,1
8,Cairn Terrier,0
9,Miniature Schnauzer Mix,1


## TO DO: explore other plot types & colors...
- e.g. boxplots?
- regplot, lmplot (logistic regression plot from seaborn?)

In [56]:
animals['sex'].value_counts().iplot(kind='bar', yTitle='Sex', title='Animal sex',)

In [55]:
animals['name'].value_counts()[:20].iplot(kind='bar', yTitle='Name Frequency', title='Animal names',)

In [57]:
animals['outcome'].value_counts().iplot(kind='bar', yTitle='Outcomes', title='Animal outcomes',)

In [59]:
# subset/group by dog vs cat
animals['breed'].value_counts().iplot(kind='bar', yTitle='Breeds', title='Animal Breeds')

In [60]:
# subset/group by dog vs cat
animals['color'].value_counts().iplot(kind='bar', yTitle='Color of coat', title='Animal coat colors')