[Shelter Animal Outcomes](https://www.kaggle.com/c/shelter-animal-outcomes)

In [1]:
reset -fs

## Imports

In [2]:
import pandas as pd
import numpy as np
from collections import Counter
# import datetime

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.plotly as py
import plotly.graph_objs as go
import cufflinks as cf

sns.set_style("dark")
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [3]:
animals_test = pd.read_csv('data/test.csv')
print('shape:',animals_test.shape)
animals_test.head(2)

shape: (11456, 8)


Unnamed: 0,ID,Name,DateTime,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,1,Summer,2015-10-12 12:15:00,Dog,Intact Female,10 months,Labrador Retriever Mix,Red/White
1,2,Cheyenne,2014-07-26 17:59:00,Dog,Spayed Female,2 years,German Shepherd/Siberian Husky,Black/Tan


In [4]:
animals = pd.read_csv('data/train.csv')
animals.head(2)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby


In [5]:
animals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26729 entries, 0 to 26728
Data columns (total 10 columns):
AnimalID          26729 non-null object
Name              19038 non-null object
DateTime          26729 non-null object
OutcomeType       26729 non-null object
OutcomeSubtype    13117 non-null object
AnimalType        26729 non-null object
SexuponOutcome    26728 non-null object
AgeuponOutcome    26711 non-null object
Breed             26729 non-null object
Color             26729 non-null object
dtypes: object(10)
memory usage: 2.0+ MB


## Data cleaning

In [6]:
# rename columns
animals.columns = [x.lower() for x in animals.columns]
animals = animals.rename(index=str, 
               columns={
                   "animalid": "id",
                   "datetime": "date_outcome",
                   "outcometype": "outcome",
                   "outcomesubtype": "outcome_info",
                   "animaltype": "animal",
                   "sexuponoutcome": "sex_outcome",
                   "ageuponoutcome": "age"})
animals.columns

Index(['id', 'name', 'date_outcome', 'outcome', 'outcome_info', 'animal',
       'sex_outcome', 'age', 'breed', 'color'],
      dtype='object')

### Name

Some animals come into the shelter with no name 😥 — made a new column for this as well as top 10 most popular names.

In [7]:
len(animals.name.unique())

6375

In [8]:
animals.name.value_counts(dropna=False).head(11)

NaN         7691
Max          136
Bella        135
Charlie      107
Daisy        106
Lucy          94
Buddy         87
Princess      86
Rocky         85
Luna          68
Jack          66
Name: name, dtype: int64

In [9]:
# replace NaN with "unknown"
animals.name.replace(np.nan, 'unknown', inplace=True)
animals.name.value_counts(dropna=False).head(10)

unknown     7691
Max          136
Bella        135
Charlie      107
Daisy        106
Lucy          94
Buddy         87
Princess      86
Rocky         85
Luna          68
Name: name, dtype: int64

In [10]:
# create has_name column - 0 no name, 1 name
animals['has_name'] = animals.name.apply(lambda row: 0 if 'unknown' in row else 1)

In [11]:
animals[['name','has_name']].head()

Unnamed: 0,name,has_name
0,Hambone,1
1,Emily,1
2,Pearce,1
3,unknown,0
4,unknown,0


In [12]:
top10 = [name for name in animals.name.value_counts().head(11).index.tolist() if name != 'unknown']

def popular_name(row):
    if row in top10:
        return 1
    else:
        return 0

assert popular_name('Jack') == 1

In [13]:
animals['pop_name'] = animals.name.apply(popular_name)

In [14]:
animals.pop_name.value_counts()

0    25759
1      970
Name: pop_name, dtype: int64

In [15]:
sum(animals.name.value_counts().head(11)) - len(animals[animals.name == 'unknown'])

970

### Sex and spayed/neutered
Made new columns for sex, spayed/neutered... drop unknowns

In [16]:
animals.sex_outcome.value_counts(dropna=False)

Neutered Male    9779
Spayed Female    8820
Intact Male      3525
Intact Female    3511
Unknown          1093
NaN                 1
Name: sex_outcome, dtype: int64

In [17]:
# make "Unknown" NaN for dropping
animals.sex_outcome.replace('Unknown', np.nan, inplace=True)

In [18]:
animals.shape

(26729, 12)

In [19]:
animals.dropna(subset=['sex_outcome'],inplace=True)
animals.reset_index(drop=True, inplace=True)

In [20]:
animals.shape

(25635, 12)

In [21]:
animals['sex'] = animals.sex_outcome.apply(lambda row: 1 if 'Female' in row else 0)
animals['spay_neutered'] = animals.sex_outcome.apply(lambda row: 0 if 'Intact' in row else 1)

In [22]:
animals.sex.value_counts(dropna=False)

0    13304
1    12331
Name: sex, dtype: int64

In [23]:
animals.spay_neutered.value_counts(dropna=False)

1    18599
0     7036
Name: spay_neutered, dtype: int64

### Date of outcome
Convert string date to datetime and create columns for day, month, year

In [24]:
animals['datetime'] = pd.to_datetime(animals['date_outcome'], infer_datetime_format=True)

In [25]:
# Monday is 0, Sunday is 6
animals['day'] = animals['datetime'].apply(lambda row: row.weekday())
# 1-12
animals['month'] = animals['datetime'].apply(lambda row: row.month)
#2013-2016
animals['year'] = animals['datetime'].apply(lambda row: row.year)
# split into morning, afternoon, evening
animals['hour'] = animals['datetime'].apply(lambda row: row.hour)
# to investigate when hour = 0
animals['minutes'] = animals['datetime'].apply(lambda row: row.minute)

In [26]:
animals.hour.value_counts(dropna=False).sort_index()

0      364
5        1
6        9
7      109
8      292
9     1057
10     375
11    1997
12    2442
13    2373
14    2670
15    2573
16    2612
17    4058
18    3562
19    1024
20      74
21      18
22       5
23      20
Name: hour, dtype: int64

In [27]:
# 00:00:00 is ambiguous and could midnight or first thing in the morning 
# or essentially be a missing value
animals.datetime.value_counts().head(10)

2015-08-11 00:00:00    19
2015-11-17 00:00:00    17
2015-07-02 00:00:00    13
2015-04-02 00:00:00    11
2014-08-31 09:00:00    10
2015-11-23 00:00:00     9
2016-02-15 00:00:00     9
2015-09-11 00:00:00     9
2015-09-01 00:00:00     8
2015-11-08 00:00:00     8
Name: datetime, dtype: int64

In [28]:
animals[(animals['hour']==0) & (animals['minutes']==0)]

Unnamed: 0,id,name,date_outcome,outcome,outcome_info,animal,sex_outcome,age,breed,color,has_name,pop_name,sex,spay_neutered,datetime,day,month,year,hour,minutes
36,A702655,Mica,2015-08-11 00:00:00,Transfer,Partner,Cat,Neutered Male,4 months,Domestic Shorthair Mix,Black/White,1,0,0,1,2015-08-11,1,8,2015,0,0
110,A713392,Basil,2015-11-05 00:00:00,Transfer,Partner,Dog,Intact Male,3 months,Chihuahua Shorthair Mix,Black/White,1,0,0,0,2015-11-05,3,11,2015,0,0
167,A710071,Herb,2015-09-01 00:00:00,Transfer,Partner,Cat,Neutered Male,2 months,Domestic Shorthair Mix,White/Black,1,0,0,1,2015-09-01,1,9,2015,0,0
173,A716575,Scarlet,2015-12-22 00:00:00,Transfer,Partner,Dog,Spayed Female,2 years,Australian Cattle Dog Mix,Tan/White,1,0,1,1,2015-12-22,1,12,2015,0,0
187,A683666,unknown,2015-09-01 00:00:00,Transfer,Partner,Cat,Neutered Male,1 year,Domestic Shorthair Mix,White/Cream Tabby,0,0,0,1,2015-09-01,1,9,2015,0,0
239,A709067,Earlene,2015-08-05 00:00:00,Return_to_owner,,Dog,Intact Female,5 months,Boxer Mix,Black/White,1,0,1,0,2015-08-05,2,8,2015,0,0
242,A720190,Little Missy,2016-02-07 00:00:00,Transfer,Partner,Dog,Spayed Female,7 years,Airedale Terrier Mix,Tricolor,1,0,1,1,2016-02-07,6,2,2016,0,0
278,A714820,Lizzie,2015-11-05 00:00:00,Transfer,Partner,Dog,Spayed Female,6 months,Plott Hound Mix,Brown Brindle,1,0,1,1,2015-11-05,3,11,2015,0,0
295,A716488,unknown,2015-11-23 00:00:00,Transfer,Partner,Dog,Neutered Male,2 years,Pit Bull Mix,Red,0,0,0,1,2015-11-23,0,11,2015,0,0
395,A716400,Calliope,2015-11-24 00:00:00,Transfer,Partner,Dog,Intact Female,1 year,Yorkshire Terrier Mix,Black/Tan,1,0,1,0,2015-11-24,1,11,2015,0,0


In [29]:
# so only 3 rows are times where hour = 0 and minutes !=0
# I'll drop these rows
print(animals[(animals['hour']==0) & (animals['minutes']==0)].shape)
print(animals[(animals['hour']==0) & (animals['minutes']!=0)].shape)

(361, 20)
(3, 20)


In [30]:
# make "Unknown" NaN for dropping
animals.hour.replace(0, np.nan, inplace=True)

In [31]:
animals.dropna(subset=['hour'],inplace=True)

In [32]:
def season(row):
    if row == 3 or row == 4 or row == 5:
        season = 'spring'
    elif row == 6 or row == 7 or row == 8:
        season = 'summer'
    elif row == 9 or row == 10 or row == 11:
        season = 'fall'
    else:
        season = 'winter'
    return season

assert season(12) == 'winter'

In [33]:
animals['season'] = animals.month.apply(season)

In [34]:
animals.hour.value_counts(dropna=False).sort_index(ascending=True)

5.0        1
6.0        9
7.0      109
8.0      292
9.0     1057
10.0     375
11.0    1997
12.0    2442
13.0    2373
14.0    2670
15.0    2573
16.0    2612
17.0    4058
18.0    3562
19.0    1024
20.0      74
21.0      18
22.0       5
23.0      20
Name: hour, dtype: int64

In [35]:
def time_of_day(row):
    if row >= 5 and row < 12:
        time = 'morning'
    elif row >= 12 and row < 5:
        time = 'afternoon'
    else:
        time = 'evening'
    return time

assert time_of_day(5) == 'morning'

In [36]:
animals['time_of_day'] = animals.month.apply(time_of_day)

### Outcome
To keep things simple make this purely binary 0 - not adopted, 1 - adopted

In [94]:
animals.outcome.value_counts(dropna=False)

Adoption           10732
Transfer            8145
Return_to_owner     4753
Euthanasia          1451
Died                 167
Name: outcome, dtype: int64

In [96]:
animals['adopted'] = animals.outcome.apply(lambda row: 1 if row == 'Adoption' else 0)

In [97]:
animals.adopted.value_counts(dropna=False)

0    14516
1    10732
Name: adopted, dtype: int64

In [104]:
sum([i for i in animals.outcome.value_counts().tolist() if i != \
    animals.outcome.value_counts().tolist()[0]])

14516

### Outcome info

After examining this variable I've come to the conclusion it's not useful as it's information explaining the details of the Outcome after the fact so I won't include it in any modeling.

In [37]:
# good number of NaNs
animals.outcome_info.value_counts(dropna=False)

NaN                    13540
Partner                 6866
Foster                  1798
SCRP                    1292
Suffering                910
Aggressive               318
Offsite                  164
In Kennel                 98
Behavior                  86
Rabies Risk               69
Medical                   63
In Foster                 48
Court/Investigation        6
At Vet                     4
Enroute                    4
In Surgery                 3
Barn                       2
Name: outcome_info, dtype: int64

In [38]:
animals[['outcome', 'outcome_info']]

Unnamed: 0,outcome,outcome_info
0,Return_to_owner,
1,Euthanasia,Suffering
2,Adoption,Foster
3,Transfer,Partner
4,Transfer,Partner
5,Transfer,Partner
6,Transfer,Partner
7,Adoption,
8,Adoption,Offsite
9,Adoption,


In [39]:
animals[['id']].groupby([animals['outcome'], animals['outcome_info']]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,id
outcome,outcome_info,Unnamed: 2_level_1
Adoption,Barn,1
Adoption,Foster,1798
Adoption,Offsite,164
Died,At Vet,4
Died,Enroute,4
Died,In Foster,48
Died,In Kennel,98
Died,In Surgery,3
Euthanasia,Aggressive,318
Euthanasia,Behavior,86


In [40]:
animals[['id']].groupby([animals['outcome'], animals['outcome_info']]).count().unstack()

Unnamed: 0_level_0,id,id,id,id,id,id,id,id,id,id,id,id,id,id,id,id
outcome_info,Aggressive,At Vet,Barn,Behavior,Court/Investigation,Enroute,Foster,In Foster,In Kennel,In Surgery,Medical,Offsite,Partner,Rabies Risk,SCRP,Suffering
outcome,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Adoption,,,1.0,,,,1798.0,,,,,164.0,,,,
Died,,4.0,,,,4.0,,48.0,98.0,3.0,,,,,,
Euthanasia,318.0,,,86.0,6.0,,,,,,63.0,,,69.0,,910.0
Transfer,,,1.0,,,,,,,,,,6866.0,,1292.0,


In [41]:
animals[['id']].groupby([animals['outcome'], animals['outcome_info'].isnull()]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,id
outcome,outcome_info,Unnamed: 2_level_1
Adoption,False,1963
Adoption,True,8769
Died,False,157
Died,True,11
Euthanasia,False,1452
Euthanasia,True,1
Return_to_owner,True,4753
Transfer,False,8159
Transfer,True,6


In [42]:
animals[['outcome', 'outcome_info']][animals['outcome_info'].isnull()].head(10)

Unnamed: 0,outcome,outcome_info
0,Return_to_owner,
7,Adoption,
9,Adoption,
11,Return_to_owner,
12,Return_to_owner,
17,Return_to_owner,
20,Adoption,
21,Adoption,
22,Return_to_owner,
24,Adoption,


### Age

In [43]:
len(animals.age.unique())

45

In [44]:
# drop NaN and 0 years
animals.age.value_counts(dropna=False)

1 year       3788
2 years      3599
2 months     3331
3 years      1788
3 months     1231
1 month      1230
4 years      1047
5 years       972
4 months      851
6 years       656
5 months      632
6 months      563
8 years       529
3 weeks       528
7 years       521
10 years      442
10 months     442
8 months      371
2 weeks       359
9 years       283
7 months      281
4 weeks       280
12 years      232
9 months      218
11 months     161
13 years      141
11 years      125
1 weeks       114
14 years       96
15 years       85
1 week         66
2 days         60
3 days         55
4 days         38
16 years       34
1 day          24
6 days         21
17 years       17
NaN            14
5 weeks        11
5 days         11
18 years       10
0 years         9
19 years        3
20 years        2
Name: age, dtype: int64

In [45]:
animals.age.replace('0 years', np.nan, inplace=True)

In [46]:
animals.shape

(25271, 22)

In [47]:
animals.dropna(subset=['age'],inplace=True)
animals.reset_index(drop=True, inplace=True)

In [48]:
animals.shape

(25248, 22)

In [49]:
def age_in_weeks(row): 
    if row.find('day') > 0:
        num = int(row.split(' ')[0]) / 7
    elif row.find('week') > 0:
        num = int(row.split(' ')[0])
    elif row.find('month') > 0:
        num = int(row.split(' ')[0]) * 4
    elif row.find('year') > 0:
        num = int(row.split(' ')[0]) * 52
    else:
        num = 'unknown'
    return num

assert age_in_weeks('1 day') == 1/7
assert age_in_weeks('6 days') == 6/7

In [50]:
animals['age_in_weeks'] = animals.age.apply(age_in_weeks)

In [51]:
# verify
animals[['age','age_in_weeks']].head(50)

Unnamed: 0,age,age_in_weeks
0,1 year,52.0
1,1 year,52.0
2,2 years,104.0
3,3 weeks,3.0
4,2 years,104.0
5,1 month,4.0
6,3 weeks,3.0
7,5 months,20.0
8,1 year,52.0
9,2 years,104.0


In [52]:
assert age_in_weeks(animals.age[10]) == 208

In [53]:
animals.age_in_weeks.value_counts(dropna=False)

52.000000      3788
104.000000     3599
8.000000       3331
156.000000     1788
4.000000       1510
12.000000      1231
208.000000     1047
260.000000      972
16.000000       851
312.000000      656
20.000000       632
24.000000       563
416.000000      529
3.000000        528
364.000000      521
520.000000      442
40.000000       442
32.000000       371
2.000000        359
468.000000      283
28.000000       281
624.000000      232
36.000000       218
1.000000        180
44.000000       161
676.000000      141
572.000000      125
728.000000       96
780.000000       85
0.285714         60
0.428571         55
0.571429         38
832.000000       34
0.142857         24
0.857143         21
884.000000       17
5.000000         11
0.714286         11
936.000000       10
988.000000        3
1040.000000       2
Name: age_in_weeks, dtype: int64

### Color

In [54]:
animals.columns

Index(['id', 'name', 'date_outcome', 'outcome', 'outcome_info', 'animal',
       'sex_outcome', 'age', 'breed', 'color', 'has_name', 'pop_name', 'sex',
       'spay_neutered', 'datetime', 'day', 'month', 'year', 'hour', 'minutes',
       'season', 'time_of_day', 'age_in_weeks'],
      dtype='object')

In [55]:
len(animals.color.value_counts(dropna=True))

358

In [56]:
value_counts = animals.color.value_counts()
print(len(value_counts[value_counts > 500]))
value_counts[value_counts > 500]

16


Black/White          2658
Black                2053
Brown Tabby          1428
White                 879
Brown/White           877
Brown Tabby/White     829
Tan/White             757
Orange Tabby          750
Tricolor              740
Blue/White            663
Black/Tan             663
Brown                 617
Tan                   609
White/Black           598
White/Brown           554
Tortie                509
Name: color, dtype: int64

In [57]:
animals['color_list'] = animals.color.apply(lambda row: row.split('/'))

In [58]:
animals['color_list'].head(10)

0    [Brown, White]
1     [Cream Tabby]
2     [Blue, White]
3      [Blue Cream]
4             [Tan]
5      [Black, Tan]
6      [Blue Tabby]
7      [Red, White]
8           [White]
9          [Silver]
Name: color_list, dtype: object

In [59]:
colors = [color for color_list in animals.color_list for color in color_list]
unique_colors = list(set(colors))
print(len(unique_colors))
unique_colors

57


['Liver Tick',
 'Blue Merle',
 'Chocolate Point',
 'Black',
 'Black Tiger',
 'Orange Tiger',
 'Brown Tiger',
 'Lynx Point',
 'Torbie',
 'Yellow Brindle',
 'Silver Tabby',
 'Gray',
 'Cream',
 'Red',
 'Red Merle',
 'Gold',
 'Calico',
 'Orange',
 'Blue Smoke',
 'Ruddy',
 'Silver',
 'Blue',
 'White',
 'Tan',
 'Tortie',
 'Black Tabby',
 'Agouti',
 'Cream Tabby',
 'Blue Cream',
 'Blue Point',
 'Apricot',
 'Sable',
 'Silver Lynx Point',
 'Liver',
 'Calico Point',
 'Orange Tabby',
 'Blue Tiger',
 'Seal Point',
 'Chocolate',
 'Pink',
 'Fawn',
 'Buff',
 'Brown Merle',
 'Gray Tabby',
 'Flame Point',
 'Brown Brindle',
 'Yellow',
 'Blue Tick',
 'Tortie Point',
 'Tricolor',
 'Black Brindle',
 'Brown Tabby',
 'Black Smoke',
 'Red Tick',
 'Lilac Point',
 'Brown',
 'Blue Tabby']

In [60]:
simple_colors = [color for phrases in unique_colors for color in phrases.split()]
len(set((simple_colors)))

36

In [61]:
simple_colors = Counter(simple_colors)
simple_colors.most_common()

[('Point', 9),
 ('Blue', 8),
 ('Tabby', 7),
 ('Black', 5),
 ('Brown', 5),
 ('Tiger', 4),
 ('Tick', 3),
 ('Merle', 3),
 ('Orange', 3),
 ('Brindle', 3),
 ('Silver', 3),
 ('Cream', 3),
 ('Red', 3),
 ('Liver', 2),
 ('Chocolate', 2),
 ('Lynx', 2),
 ('Yellow', 2),
 ('Gray', 2),
 ('Calico', 2),
 ('Smoke', 2),
 ('Tortie', 2),
 ('Torbie', 1),
 ('Gold', 1),
 ('Ruddy', 1),
 ('White', 1),
 ('Tan', 1),
 ('Agouti', 1),
 ('Apricot', 1),
 ('Sable', 1),
 ('Seal', 1),
 ('Pink', 1),
 ('Fawn', 1),
 ('Buff', 1),
 ('Flame', 1),
 ('Tricolor', 1),
 ('Lilac', 1)]

In [62]:
simple_colors = list(set(simple_colors))
len(simple_colors)

36

In [63]:
simple_colors

['Lynx',
 'Black',
 'Flame',
 'Tiger',
 'Torbie',
 'Gray',
 'Lilac',
 'Cream',
 'Red',
 'Orange',
 'Gold',
 'Calico',
 'Ruddy',
 'Silver',
 'Blue',
 'White',
 'Tan',
 'Tortie',
 'Seal',
 'Agouti',
 'Tick',
 'Apricot',
 'Point',
 'Sable',
 'Liver',
 'Brindle',
 'Chocolate',
 'Smoke',
 'Pink',
 'Fawn',
 'Buff',
 'Tabby',
 'Yellow',
 'Merle',
 'Tricolor',
 'Brown']

In [64]:
simple_colors.remove('Point')
simple_colors.remove('Tick')

In [65]:
for color in simple_colors:
    animals[color] = animals.color_list.map(lambda row: 1 if color in row else 0)

In [66]:
animals.head(3)

Unnamed: 0,id,name,date_outcome,outcome,outcome_info,animal,sex_outcome,age,breed,color,...,Chocolate,Smoke,Pink,Fawn,Buff,Tabby,Yellow,Merle,Tricolor,Brown
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,...,0,0,0,0,0,0,0,0,0,1
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,...,0,0,0,0,0,0,0,0,0,0
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,...,0,0,0,0,0,0,0,0,0,0


In [67]:
animals.shape

(25248, 58)

### Breed

In [77]:
print(len(animals.breed.value_counts()))
value_counts = animals.breed.value_counts()
print(len(value_counts[value_counts > 100]))
value_counts[value_counts > 100]

1371
25


Domestic Shorthair Mix       7782
Pit Bull Mix                 1876
Chihuahua Shorthair Mix      1726
Labrador Retriever Mix       1322
Domestic Medium Hair Mix      786
German Shepherd Mix           570
Domestic Longhair Mix         482
Siamese Mix                   368
Australian Cattle Dog Mix     362
Dachshund Mix                 309
Boxer Mix                     233
Miniature Poodle Mix          231
Border Collie Mix             214
Australian Shepherd Mix       158
Catahoula Mix                 155
Rat Terrier Mix               153
Jack Russell Terrier Mix      144
Yorkshire Terrier Mix         141
Chihuahua Longhair Mix        138
Siberian Husky Mix            135
Miniature Schnauzer Mix       133
Domestic Shorthair            133
Beagle Mix                    123
Rottweiler Mix                110
American Bulldog Mix          109
Name: breed, dtype: int64

In [78]:
animals['mixed_breed'] = animals.breed.apply(lambda row: \
                                             1 if '/' in row or 'mix' in row.lower() else 0)

In [79]:
animals[['breed','mixed_breed']].head(10)

Unnamed: 0,breed,mixed_breed
0,Shetland Sheepdog Mix,1
1,Domestic Shorthair Mix,1
2,Pit Bull Mix,1
3,Domestic Shorthair Mix,1
4,Lhasa Apso/Miniature Poodle,1
5,Cairn Terrier/Chihuahua Shorthair,1
6,Domestic Shorthair Mix,1
7,American Pit Bull Terrier Mix,1
8,Cairn Terrier,0
9,Miniature Schnauzer Mix,1


In [80]:
animals['breed_list'] = animals.breed.apply(lambda row: row.split('/'))

In [81]:
animals.columns

Index(['id', 'name', 'date_outcome', 'outcome', 'outcome_info', 'animal',
       'sex_outcome', 'age', 'breed', 'color', 'has_name', 'pop_name', 'sex',
       'spay_neutered', 'datetime', 'day', 'month', 'year', 'hour', 'minutes',
       'season', 'time_of_day', 'age_in_weeks', 'color_list', 'Lynx', 'Black',
       'Flame', 'Tiger', 'Torbie', 'Gray', 'Lilac', 'Cream', 'Red', 'Orange',
       'Gold', 'Calico', 'Ruddy', 'Silver', 'Blue', 'White', 'Tan', 'Tortie',
       'Seal', 'Agouti', 'Apricot', 'Sable', 'Liver', 'Brindle', 'Chocolate',
       'Smoke', 'Pink', 'Fawn', 'Buff', 'Tabby', 'Yellow', 'Merle', 'Tricolor',
       'Brown', 'multicolor', 'mixed_breed', 'breed_list'],
      dtype='object')

In [82]:
animals['breed_list'].head()

0           [Shetland Sheepdog Mix]
1          [Domestic Shorthair Mix]
2                    [Pit Bull Mix]
3          [Domestic Shorthair Mix]
4    [Lhasa Apso, Miniature Poodle]
Name: breed_list, dtype: object

In [83]:
breeds = [breed for breed_list in animals.breed_list for breed in breed_list]
unique_breeds = list(set(breeds))
print(len(unique_breeds))
unique_breeds

397


['German Shepherd Mix',
 'Manchester Terrier',
 'Jack Russell Terrier Mix',
 'Podengo Pequeno Mix',
 'Scottish Terrier Mix',
 'French Bulldog',
 'Pembroke Welsh Corgi Mix',
 'Swedish Vallhund',
 'Silky Terrier',
 'Greyhound Mix',
 'Neapolitan Mastiff Mix',
 'Pbgv Mix',
 'St. Bernard Smooth Coat',
 'Siamese Mix',
 'Miniature Pinscher Mix',
 'Greyhound',
 'Cardigan Welsh Corgi Mix',
 'Domestic Shorthair Mix',
 'Ragdoll Mix',
 'Bullmastiff Mix',
 'St. Bernard Smooth Coat Mix',
 'Irish Terrier Mix',
 'Chinese Sharpei Mix',
 'Welsh Terrier',
 'Dalmatian Mix',
 'Norwich Terrier',
 'Manx Mix',
 'Picardy Sheepdog Mix',
 'Vizsla Mix',
 'Exotic Shorthair Mix',
 'American Foxhound',
 'Flat Coat Retriever',
 'Great Dane',
 'Maine Coon',
 'Japanese Chin Mix',
 'Yorkshire Terrier Mix',
 'Rhod Ridgeback Mix',
 'St. Bernard Rough Coat',
 'Snowshoe',
 'Siberian Husky',
 'Golden Retriever',
 'Jack Russell Terrier',
 'Shih Tzu Mix',
 'Keeshond',
 'Dogo Argentino',
 'Plott Hound',
 'Skye Terrier Mix',
 'T

In [84]:
simple_breeds = [breed for phrases in unique_breeds for breed in phrases.split()]
len(set((simple_breeds)))

262

In [86]:
simple_breeds = Counter(simple_breeds)
simple_breeds.most_common()

[('Mix', 213),
 ('Terrier', 51),
 ('English', 18),
 ('Hound', 16),
 ('Shorthair', 11),
 ('American', 11),
 ('Spaniel', 11),
 ('Dog', 11),
 ('Shepherd', 10),
 ('Bulldog', 9),
 ('German', 8),
 ('Welsh', 8),
 ('Miniature', 8),
 ('Retriever', 8),
 ('Pointer', 8),
 ('Bull', 8),
 ('Australian', 7),
 ('Collie', 7),
 ('Longhair', 7),
 ('Smooth', 6),
 ('Coat', 6),
 ('Domestic', 6),
 ('Sheepdog', 6),
 ('Fox', 6),
 ('Poodle', 6),
 ('Belgian', 6),
 ('Dachshund', 6),
 ('Schnauzer', 6),
 ('Mastiff', 5),
 ('Irish', 5),
 ('Russell', 4),
 ('Corgi', 4),
 ('Greyhound', 4),
 ('St.', 4),
 ('Bernard', 4),
 ('Pinscher', 4),
 ('Chinese', 4),
 ('Foxhound', 4),
 ('Great', 4),
 ('Japanese', 4),
 ('Rough', 4),
 ('Husky', 4),
 ('Treeing', 4),
 ('Cocker', 4),
 ('Hair', 4),
 ('Staffordshire', 4),
 ('Blue', 4),
 ('Border', 4),
 ('Rex', 4),
 ('Coonhound', 4),
 ('Terr', 4),
 ('Span', 4),
 ('Alaskan', 4),
 ('Toy', 4),
 ('Chihuahua', 4),
 ('Springer', 4),
 ('Pit', 4),
 ('Tibetan', 4),
 ('Chow', 4),
 ('Standard', 4),
 ('Y

In [87]:
simple_breeds = list(set(simple_breeds))
len(simple_breeds)

262

In [88]:
animals.shape

(25248, 61)

Below I essentially try to find the most important breed in a pretty hacky way...

In [156]:
breed_imp = animals[['adopted', 'breed']]
breed_imp

Unnamed: 0,adopted,breed
0,0,Shetland Sheepdog Mix
1,0,Domestic Shorthair Mix
2,1,Pit Bull Mix
3,0,Domestic Shorthair Mix
4,0,Lhasa Apso/Miniature Poodle
5,0,Cairn Terrier/Chihuahua Shorthair
6,0,Domestic Shorthair Mix
7,1,American Pit Bull Terrier Mix
8,1,Cairn Terrier
9,1,Miniature Schnauzer Mix


In [157]:
for breed in simple_breeds:
    breed_imp[breed] = animals.breed.map(lambda row: 1 if breed in row else 0)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [158]:
breed_imp.columns

Index(['adopted', 'breed', 'Havanese', 'French', 'Vallhund', 'Rex', 'De',
       'Bobtail', 'Borzoi', 'Bull',
       ...
       'Silky', 'Himalayan', 'St.', 'Boxer', 'Mastiff', 'Whippet', 'Chinese',
       'Tennesse', 'Retriever', 'Labrador'],
      dtype='object', length=264)

In [159]:
breed_imp.head()

Unnamed: 0,adopted,breed,Havanese,French,Vallhund,Rex,De,Bobtail,Borzoi,Bull,...,Silky,Himalayan,St.,Boxer,Mastiff,Whippet,Chinese,Tennesse,Retriever,Labrador
0,0,Shetland Sheepdog Mix,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,Domestic Shorthair Mix,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,Pit Bull Mix,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,Domestic Shorthair Mix,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,Lhasa Apso/Miniature Poodle,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [160]:
breed_imp.describe()

Unnamed: 0,adopted,Havanese,French,Vallhund,Rex,De,Bobtail,Borzoi,Bull,Glen,...,Silky,Himalayan,St.,Boxer,Mastiff,Whippet,Chinese,Tennesse,Retriever,Labrador
count,25248.0,25248.0,25248.0,25248.0,25248.0,25248.0,25248.0,25248.0,25248.0,25248.0,...,25248.0,25248.0,25248.0,25248.0,25248.0,25248.0,25248.0,25248.0,25248.0,25248.0
mean,0.425063,0.000634,0.000554,0.000277,0.000158,0.000317,0.000317,4e-05,0.10381,0.000158,...,0.000317,0.000673,0.000911,0.016714,0.002297,0.001862,0.003802,4e-05,0.094542,0.087928
std,0.494362,0.025166,0.023542,0.016649,0.012586,0.017798,0.017798,0.006293,0.30502,0.012586,...,0.017798,0.02594,0.030169,0.128201,0.047875,0.043106,0.061547,0.006293,0.292587,0.283195
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [164]:
cols = [col for col in breed_imp.columns if col not in ['adopted','breed']]
means = []
for col in cols:
    means.append(breed_imp[col].mean())

means
columns = [col for col in breed_imp.columns]
sorted(list(zip(means, columns)), reverse=True)

[(0.82782794676806082, 'Russell'),
 (0.40830956907477822, 'Cymric'),
 (0.36715779467680609, 'Shetland'),
 (0.10381020278833968, 'Bobtail'),
 (0.10000792141951838, 'Beauceron'),
 (0.095057034220532313, 'Poodle'),
 (0.094542141951837771, 'Chinese'),
 (0.092997465145754118, 'Pekingese'),
 (0.087927756653992398, 'Tennesse'),
 (0.063648605830164767, 'Havana'),
 (0.063252534854245882, 'Tolling'),
 (0.054024081115335867, 'English'),
 (0.040478453738910014, 'Unknown'),
 (0.039052598225602031, 'Tonkinese'),
 (0.035369138149556403, 'Bay'),
 (0.03382446134347275, 'Shepherd'),
 (0.032873891001267426, 'Cattle'),
 (0.030537072243346008, 'Angora'),
 (0.029784537389100127, 'Van'),
 (0.029507287705956908, 'Boykin'),
 (0.028517110266159697, 'Highland'),
 (0.024992078580481622, 'Coated'),
 (0.020080798479087454, 'Queensland'),
 (0.019605513307984792, 'Medium'),
 (0.019367870722433459, 'Treeing'),
 (0.016714195183776932, 'Himalayan'),
 (0.016040874524714827, 'Brindle'),
 (0.015050697084917616, 'Corgi'),
 

In [165]:
breed_imp.corr()

Unnamed: 0,adopted,Havanese,French,Vallhund,Rex,De,Bobtail,Borzoi,Bull,Glen,...,Silky,Himalayan,St.,Boxer,Mastiff,Whippet,Chinese,Tennesse,Retriever,Labrador
adopted,1.000000,0.000634,0.000167,0.014555,-0.010824,-0.010806,0.002699,0.007319,-0.074623,0.008274,...,-0.006305,-0.000698,-0.015341,0.006639,0.000580,0.009334,0.001554,0.007319,0.023105,0.024432
Havanese,0.000634,1.000000,-0.000593,-0.000419,-0.000317,-0.000448,-0.000448,-0.000158,-0.008570,-0.000317,...,-0.000448,-0.000654,-0.000760,-0.003283,-0.001208,-0.001087,-0.001556,-0.000158,-0.008137,-0.007819
French,0.000167,-0.000593,1.000000,-0.000392,-0.000296,-0.000419,-0.000419,-0.000148,0.069207,-0.000296,...,-0.000419,-0.000611,-0.000711,-0.003071,-0.001130,-0.001017,-0.001455,-0.000148,-0.007611,-0.007313
Vallhund,0.014555,-0.000419,-0.000392,1.000000,-0.000210,-0.000296,-0.000296,-0.000105,-0.005668,-0.000210,...,-0.000296,-0.000432,-0.000503,-0.002171,-0.000799,-0.000719,-0.001029,-0.000105,-0.005381,-0.005171
Rex,-0.010824,-0.000317,-0.000296,-0.000210,1.000000,0.353413,-0.000224,-0.000079,-0.004284,-0.000158,...,-0.000224,-0.000327,-0.000380,-0.001641,-0.000604,-0.000544,-0.000778,-0.000079,-0.004068,-0.003908
De,-0.010806,-0.000448,-0.000419,-0.000296,0.353413,1.000000,-0.000317,-0.000112,0.008533,-0.000224,...,-0.000317,-0.000462,-0.000538,-0.002321,-0.000854,-0.000769,-0.001100,-0.000112,-0.005753,-0.005528
Bobtail,0.002699,-0.000448,-0.000419,-0.000296,-0.000224,-0.000317,1.000000,-0.000112,-0.006059,-0.000224,...,-0.000317,-0.000462,-0.000538,-0.002321,-0.000854,-0.000769,-0.001100,-0.000112,-0.005753,-0.005528
Borzoi,0.007319,-0.000158,-0.000148,-0.000105,-0.000079,-0.000112,-0.000112,1.000000,-0.002142,-0.000079,...,-0.000112,-0.000163,-0.000190,-0.000821,-0.000302,0.145732,-0.000389,-0.000040,-0.002034,-0.001954
Bull,-0.074623,-0.008570,0.069207,-0.005668,-0.004284,0.008533,-0.006059,-0.002142,1.000000,-0.004284,...,-0.006059,-0.008834,-0.005973,0.004246,-0.000057,-0.014698,0.010622,-0.002142,-0.055830,-0.052025
Glen,0.008274,-0.000317,-0.000296,-0.000210,-0.000158,-0.000224,-0.000224,-0.000079,-0.004284,1.000000,...,-0.000224,-0.000327,-0.000380,-0.001641,-0.000604,-0.000544,-0.000778,-0.000079,-0.004068,-0.003908


## TO DO: explore other plot types & colors...
- ie boxplots?
- regplot, lmplot (logistic regression plot from seaborn?)

In [56]:
animals['sex'].value_counts().iplot(kind='bar', yTitle='Sex', title='Animal sex',)

In [55]:
animals['name'].value_counts()[:20].iplot(kind='bar', yTitle='Name Frequency', title='Animal names',)

In [57]:
animals['outcome'].value_counts().iplot(kind='bar', yTitle='Outcomes', title='Animal outcomes',)

In [59]:
# subset/group by dog vs cat
animals['breed'].value_counts().iplot(kind='bar', yTitle='Breeds', title='Animal Breeds')

In [60]:
# subset/group by dog vs cat
animals['color'].value_counts().iplot(kind='bar', yTitle='Color of coat', title='Animal coat colors')