# DS-SF-23 | Exploratory Data Analysis on Kaggle Dataset | Shelter Animal Outcomes | Ivan's Notebook

(https://www.kaggle.com/c/shelter-animal-outcomes)

## CODE TO GET YOU STARTED

In [1]:
import os
import numpy as np
import pandas as pd
import csv
import re

pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

In [2]:
df = pd.read_csv(os.path.join('..', 'datasets', 'shelter-animal-outcomes.csv.gz'), index_col = 'AnimalID')

In [3]:
df

Unnamed: 0_level_0,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
AnimalID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan
...,...,...,...,...,...,...,...,...,...
A702446,,2015-05-14 11:56:00,Transfer,Partner,Cat,Intact Male,1 month,Domestic Shorthair Mix,Brown Tabby/White
A718934,,2016-01-20 18:59:00,Transfer,SCRP,Cat,Spayed Female,3 months,Domestic Shorthair Mix,Brown Tabby
A698128,Zeus,2015-03-09 13:33:00,Adoption,,Dog,Neutered Male,4 years,Old English Bulldog Mix,White/Tan
A677478,,2014-04-27 12:22:00,Transfer,Partner,Cat,Intact Male,4 weeks,Domestic Shorthair Mix,Black


In [4]:
df.columns

Index([u'Name', u'DateTime', u'OutcomeType', u'OutcomeSubtype', u'AnimalType',
       u'SexuponOutcome', u'AgeuponOutcome', u'Breed', u'Color'],
      dtype='object')

## Name

In [5]:
df.Name.unique()

array(['Hambone', 'Emily', 'Pearce', ..., 'Mowmow', 'Sonja', 'Green Bean'], dtype=object)

In [6]:
df.Name.value_counts()

Max            136
Bella          135
Charlie        107
Daisy          106
Lucy            94
              ... 
Sally Jessy      1
Nelda            1
Hobie Cat        1
Madi             1
Sumi             1
Name: Name, dtype: int64

## DateTime

In [7]:
df.DateTime

AnimalID
A671945    2014-02-12 18:22:00
A656520    2013-10-13 12:44:00
A686464    2015-01-31 12:28:00
A683430    2014-07-11 19:09:00
A667013    2013-11-15 12:52:00
                  ...         
A702446    2015-05-14 11:56:00
A718934    2016-01-20 18:59:00
A698128    2015-03-09 13:33:00
A677478    2014-04-27 12:22:00
A706629    2015-07-02 09:00:00
Name: DateTime, dtype: object

In [8]:
df.DateTime[-1]

'2015-07-02 09:00:00'

In [9]:
type(df.DateTime[-1])

str

In [10]:
datetime_as_str = '2015-07-02 09:00:00'

In [11]:
datetime_as_timestamp = pd.to_datetime(datetime_as_str)

In [12]:
datetime_as_timestamp

Timestamp('2015-07-02 09:00:00')

In [13]:
type(datetime_as_timestamp)

pandas.tslib.Timestamp

In [14]:
datetime_as_timestamp.year

2015

In [15]:
datetime_as_timestamp.month

7

In [16]:
datetime_as_timestamp.day

2

In [17]:
datetime_as_timestamp.hour

9

In [18]:
datetime_as_timestamp.minute

0

In [19]:
datetime_as_timestamp.second

0

In [20]:
pd.to_datetime(df.DateTime)

AnimalID
A671945   2014-02-12 18:22:00
A656520   2013-10-13 12:44:00
A686464   2015-01-31 12:28:00
A683430   2014-07-11 19:09:00
A667013   2013-11-15 12:52:00
                  ...        
A702446   2015-05-14 11:56:00
A718934   2016-01-20 18:59:00
A698128   2015-03-09 13:33:00
A677478   2014-04-27 12:22:00
A706629   2015-07-02 09:00:00
Name: DateTime, dtype: datetime64[ns]

In [21]:
df.DateTime = pd.to_datetime(df.DateTime)

In [22]:
df.DateTime[-1]

Timestamp('2015-07-02 09:00:00')

In [23]:
type(df.DateTime[-1])

pandas.tslib.Timestamp

## OutcomeType and OutcomeSubtype

### OutcomeType

In [24]:
df.OutcomeType

AnimalID
A671945    Return_to_owner
A656520         Euthanasia
A686464           Adoption
A683430           Transfer
A667013           Transfer
                ...       
A702446           Transfer
A718934           Transfer
A698128           Adoption
A677478           Transfer
A706629           Transfer
Name: OutcomeType, dtype: object

In [25]:
df.OutcomeType.unique()

array(['Return_to_owner', 'Euthanasia', 'Adoption', 'Transfer', 'Died'], dtype=object)

In [26]:
df.OutcomeType.value_counts()

Adoption           10769
Transfer            9422
Return_to_owner     4786
Euthanasia          1555
Died                 197
Name: OutcomeType, dtype: int64

In [27]:
df.OutcomeType = df.OutcomeType.\
    apply(lambda outcome_type: 'ReturnToOwner' if outcome_type == 'Return_to_owner' else outcome_type)

In [28]:
df.OutcomeType.unique()

array(['ReturnToOwner', 'Euthanasia', 'Adoption', 'Transfer', 'Died'], dtype=object)

In [29]:
df.OutcomeType.value_counts()

Adoption         10769
Transfer          9422
ReturnToOwner     4786
Euthanasia        1555
Died               197
Name: OutcomeType, dtype: int64

### OutcomeSubtype

In [30]:
df.OutcomeSubtype.unique()

array([nan, 'Suffering', 'Foster', 'Partner', 'Offsite', 'SCRP',
       'Aggressive', 'Behavior', 'Rabies Risk', 'Medical', 'In Kennel',
       'In Foster', 'Barn', 'Court/Investigation', 'Enroute', 'At Vet',
       'In Surgery'], dtype=object)

In [31]:
df.OutcomeSubtype.value_counts()

Partner                7816
Foster                 1800
SCRP                   1599
Suffering              1002
Aggressive              320
                       ... 
Enroute                   8
Court/Investigation       6
At Vet                    4
In Surgery                3
Barn                      2
Name: OutcomeSubtype, dtype: int64

In [32]:
'In Kennel'.translate(None, ' ')

'InKennel'

In [33]:
'Court/Investigation'.translate(None, ' ')

'Court/Investigation'

In [34]:
'Court/Investigation'.translate(None, '/')

'CourtInvestigation'

In [35]:
'Court/Investigation'.translate(None, '/ ')

'CourtInvestigation'

In [36]:
'In Kennel'.translate(None, '/ ')

'InKennel'

In [37]:
def process_outcome_subtype(outcome_subtype):
    if pd.isnull(outcome_subtype):
        return np.nan
    return outcome_subtype.translate(None, '/ ')
    
df.OutcomeSubtype = df.OutcomeSubtype.apply(process_outcome_subtype)

In [38]:
df.OutcomeSubtype.unique()

array([nan, 'Suffering', 'Foster', 'Partner', 'Offsite', 'SCRP',
       'Aggressive', 'Behavior', 'RabiesRisk', 'Medical', 'InKennel',
       'InFoster', 'Barn', 'CourtInvestigation', 'Enroute', 'AtVet',
       'InSurgery'], dtype=object)

## AnimalType

In [39]:
df.AnimalType.unique()

array(['Dog', 'Cat'], dtype=object)

In [40]:
df.AnimalType.value_counts()

Dog    15595
Cat    11134
Name: AnimalType, dtype: int64

## SexuponOutcome

In [41]:
df.SexuponOutcome.unique()

array(['Neutered Male', 'Spayed Female', 'Intact Male', 'Intact Female',
       'Unknown', nan], dtype=object)

In [42]:
df.SexuponOutcome.value_counts()

Neutered Male    9779
Spayed Female    8820
Intact Male      3525
Intact Female    3511
Unknown          1093
Name: SexuponOutcome, dtype: int64

In [43]:
df.SexuponOutcome = df.SexuponOutcome.\
    apply(lambda sex_upon_outcome: 'Unknown' if pd.isnull(sex_upon_outcome) else sex_upon_outcome)

In [44]:
df.SexuponOutcome.unique()

array(['Neutered Male', 'Spayed Female', 'Intact Male', 'Intact Female',
       'Unknown'], dtype=object)

In [45]:
df.SexuponOutcome.value_counts()

Neutered Male    9779
Spayed Female    8820
Intact Male      3525
Intact Female    3511
Unknown          1094
Name: SexuponOutcome, dtype: int64

In [46]:
'Intact' in 'Intact Male'

True

In [47]:
'Male' in 'Intact Male'

True

In [48]:
'Male ' in 'Intact Male'

False

In [49]:
is_male = 'Male' in df.SexuponOutcome

In [50]:
is_male

False

In [51]:
is_male = df.SexuponOutcome.apply(lambda sex_upon_outcome: 'Male' in sex_upon_outcome)

In [52]:
is_male

AnimalID
A671945     True
A656520    False
A686464     True
A683430     True
A667013     True
           ...  
A702446     True
A718934    False
A698128     True
A677478     True
A706629     True
Name: SexuponOutcome, dtype: bool

In [53]:
is_female = df.SexuponOutcome.apply(lambda sex_upon_outcome: 'Female' in sex_upon_outcome)

In [54]:
df['Sex'] = 'Unknown'

In [55]:
df.Sex.value_counts()

Unknown    26729
Name: Sex, dtype: int64

In [56]:
df.loc[is_male, 'Sex'] = 'Male'

In [57]:
df.Sex.value_counts()

Unknown    13425
Male       13304
Name: Sex, dtype: int64

In [58]:
df.loc[is_female, 'Sex'] = 'Female'

In [59]:
df.Sex.value_counts()

Male       13304
Female     12331
Unknown     1094
Name: Sex, dtype: int64

In [60]:
is_neutered = df.SexuponOutcome.apply(lambda sex_upon_outcome: 'Neutered' in sex_upon_outcome)
is_spayed = df.SexuponOutcome.apply(lambda sex_upon_outcome: 'Spayed' in sex_upon_outcome)

df['Sterilization'] = 'Intact'
df.loc[is_neutered, 'Sterilization'] = 'Neutered'
df.loc[is_spayed, 'Sterilization'] = 'Spayed'

In [61]:
df.Sterilization.value_counts()

Neutered    9779
Spayed      8820
Intact      8130
Name: Sterilization, dtype: int64

In [62]:
df.SexuponOutcome.value_counts()

Neutered Male    9779
Spayed Female    8820
Intact Male      3525
Intact Female    3511
Unknown          1094
Name: SexuponOutcome, dtype: int64

In [63]:
((df.Sex == 'Male') & (df.Sterilization == 'Neutered')).sum()

9779

In [64]:
((df.Sex == 'Female') & (df.Sterilization == 'Spayed')).sum()

8820

In [65]:
((df.Sex == 'Male') & (df.Sterilization == 'Intact')).sum()

3525

In [66]:
((df.Sex == 'Female') & (df.Sterilization == 'Intact')).sum()

3511

In [67]:
(df.Sex == 'Unknown').sum()

1094

In [68]:
df = df.drop('SexuponOutcome', axis = 1)

## AgeuponOutcome

In [69]:
df.AgeuponOutcome.unique()

array(['1 year', '2 years', '3 weeks', '1 month', '5 months', '4 years',
       '3 months', '2 weeks', '2 months', '10 months', '6 months',
       '5 years', '7 years', '3 years', '4 months', '12 years', '9 years',
       '6 years', '1 weeks', '11 years', '4 weeks', '7 months', '8 years',
       '11 months', '4 days', '9 months', '8 months', '15 years',
       '10 years', '1 week', '0 years', '14 years', '3 days', '6 days',
       '5 days', '5 weeks', '2 days', '16 years', '1 day', '13 years', nan,
       '17 years', '18 years', '19 years', '20 years'], dtype=object)

In [70]:
age_upon_outcome = '1 year'

In [71]:
match = re.search(r'(\d) year', age_upon_outcome)

In [72]:
match.group(1)

'1'

In [73]:
def print_age_upcon_outcome(age_upon_outcome):
    match = re.search(r'(\d) year', age_upon_outcome)
    if match:
        value_in_years = match.group(1)
        print 'We have a match in years:', value_in_years
    else:
        print 'We have no match for "{}"'.format(age_upon_outcome)

In [74]:
print_age_upcon_outcome('1 year')

We have a match in years: 1


In [75]:
print_age_upcon_outcome('2 years')

We have a match in years: 2


In [76]:
print_age_upcon_outcome('12 years')

We have a match in years: 2


In [77]:
def print_age_upcon_outcome(age_upon_outcome):
    match = re.search(r'(\d+) year', age_upon_outcome)
    if match:
        value_in_years = match.group(1)
        print 'We have a match in years:', value_in_years
    else:
        print 'We have no match for "{}"'.format(age_upon_outcome)

In [78]:
print_age_upcon_outcome('1 year')
print_age_upcon_outcome('2 years')
print_age_upcon_outcome('12 years')

We have a match in years: 1
We have a match in years: 2
We have a match in years: 12


In [79]:
print_age_upcon_outcome('1 month')
print_age_upcon_outcome('3 weeks')

We have no match for "1 month"
We have no match for "3 weeks"


In [80]:
def print_age_upcon_outcome(age_upon_outcome):
    match = re.search(r'(\d+) (\S+)', age_upon_outcome)
    if match:
        value = match.group(1)
        unit = match.group(2)
        print 'We have a match in {}: {}'.format(unit, value)
    else:
        print 'We have no match for "{}"'.format(age_upon_outcome)

In [81]:
print_age_upcon_outcome('1 year')
print_age_upcon_outcome('2 years')
print_age_upcon_outcome('12 years')
print_age_upcon_outcome('1 month')
print_age_upcon_outcome('3 weeks')

We have a match in year: 1
We have a match in years: 2
We have a match in years: 12
We have a match in month: 1
We have a match in weeks: 3


In [82]:
def print_age_upcon_outcome(age_upon_outcome):
    match = re.search(r'(\d+) (\S+[^s])', age_upon_outcome)
    if match:
        value = match.group(1)
        unit = match.group(2)
        print 'We have a match in {}: {}'.format(unit, value)
    else:
        print 'We have no match for "{}"'.format(age_upon_outcome)

In [83]:
print_age_upcon_outcome('1 year')
print_age_upcon_outcome('2 years')
print_age_upcon_outcome('12 years')
print_age_upcon_outcome('1 month')
print_age_upcon_outcome('3 weeks')

We have a match in year: 1
We have a match in year: 2
We have a match in year: 12
We have a match in month: 1
We have a match in week: 3


In [84]:
def print_age_upcon_outcome(age_upon_outcome):
    match = re.search(r'^(\d+) (\S+[^s])s?$', age_upon_outcome)
    if match:
        value = match.group(1)
        unit = match.group(2)
        print 'We have a match in {}: {}'.format(unit, value)
    else:
        raise Exception('No match for "{}"'.format(age_upon_outcome))

In [85]:
print_age_upcon_outcome('1 year')
print_age_upcon_outcome('2 years')
print_age_upcon_outcome('12 years')
print_age_upcon_outcome('1 month')
print_age_upcon_outcome('3 weeks')

We have a match in year: 1
We have a match in year: 2
We have a match in year: 12
We have a match in month: 1
We have a match in week: 3


In [86]:
def age_upcon_outcome(age_upon_outcome):
    if pd.isnull(age_upon_outcome):
        return np.nan

    match = re.search(r'^(\d+) (\S+[^s])s?$', age_upon_outcome)
    if not match:
        raise Exception('No match for "{}"'.format(age_upon_outcome))
        
    value = float(match.group(1))
    unit = match.group(2)
    
    if unit == 'day':
        return value
    elif unit == 'week':
        return value * 7
    elif unit == 'month':
        return value * 30.5
    elif unit == 'year':
        return value * 365.25
    else:
        raise Exception('No match for {}'.format(unit))

In [87]:
df.AgeuponOutcome = df.AgeuponOutcome.apply(age_upcon_outcome)

## Breed

In [88]:
df.Breed.unique()

array(['Shetland Sheepdog Mix', 'Domestic Shorthair Mix', 'Pit Bull Mix',
       ..., 'Vizsla/Boxer', 'German Shepherd/Australian Kelpie',
       'Boxer/German Shepherd'], dtype=object)

In [89]:
df.Breed.value_counts()

Domestic Shorthair Mix                   8810
Pit Bull Mix                             1906
Chihuahua Shorthair Mix                  1766
Labrador Retriever Mix                   1363
Domestic Medium Hair Mix                  839
                                         ... 
Norfolk Terrier/Dachshund                   1
Golden Retriever/Whippet                    1
Chihuahua Shorthair/Shetland Sheepdog       1
Vizsla/Boxer                                1
Dachshund/Cavalier Span                     1
Name: Breed, dtype: int64

In [90]:
len(df.Breed.unique())

1380

That's a lot of breeds to consider.  What would do you next?

## Color

In [91]:
df.Color.unique()

array(['Brown/White', 'Cream Tabby', 'Blue/White', 'Blue Cream', 'Tan',
       'Black/Tan', 'Blue Tabby', 'Brown Tabby', 'Red/White', 'White',
       'Black', 'Silver', 'Brown', 'Black/Red', 'White/Cream',
       'Orange Tabby/White', 'Black/White', 'Brown Brindle/White',
       'Black/Brown', 'Orange Tabby', 'Chocolate/White', 'White/Tan',
       'Cream Tabby/White', 'Blue', 'Calico', 'Torbie', 'Brown/Black',
       'Yellow', 'Tricolor', 'White/Black', 'Tortie', 'Blue Tabby/White',
       'Gray/White', 'Tan/Black', 'Tan/White', 'Buff', 'Brown Tabby/White',
       'Red', 'Blue/Tan', 'Seal Point', 'Brown Brindle', 'White/Brown',
       'Gray', 'Yellow Brindle/White', 'Fawn/White', 'Flame Point',
       'Black Tabby/White', 'Tortie/Calico', 'Black/Brown Brindle',
       'White/Gray', 'Tan/Silver', 'Red Tick/Black', 'White/Cream Tabby',
       'Blue Merle', 'Chocolate/Tan', 'Sable', 'Brown Merle/White',
       'Brown Tiger/White', 'Liver/White', 'White/Blue Tabby',
       'Black/Gray', 'C

In [92]:
len(df.Color.unique())

366

There's also a lot of colors to consider.  How could you handle them?