# DAT-NYC-37 | Exploratory Data Analysis on Shelter Animal Outcomes

Dataset: Kaggle "Shelter Animal Outcomes" competition (https://www.kaggle.com/c/shelter-animal-outcomes)

## CODE TO GET YOU STARTED

In [None]:
import os
import numpy as np
import pandas as pd
import csv
import re

# Keep notebook output compact: cap displayed frames at 10 rows and
# 10 columns, and render them with the HTML repr.
pd.set_option('display.max_rows', 10)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

In [None]:
# Load the shelter outcomes file from ../datasets, using the AnimalID
# column as the row index.
df = pd.read_csv(os.path.join('..', 'datasets', 'shelter-animal-outcomes.csv.gz'), index_col = 'AnimalID')

In [None]:
# Display the loaded frame.
df

In [None]:
# List the column names.
df.columns

## Name

In [None]:
# Distinct values found in the Name column.
df.Name.unique()

In [None]:
# How many rows carry each name.
df.Name.value_counts()

## DateTime

In [None]:
df.DateTime

In [None]:
df.DateTime[-1]

In [None]:
type(df.DateTime[-1])

In [None]:
# A sample date/time written as text, used to demonstrate pd.to_datetime.
datetime_as_str = '2015-07-02 09:00:00'

In [None]:
# Parse the text with pandas.
datetime_as_timestamp = pd.to_datetime(datetime_as_str)

In [None]:
datetime_as_timestamp

In [None]:
# Inspect the type of the parsed value.
type(datetime_as_timestamp)

In [None]:
# The parsed value exposes its individual components as attributes.
datetime_as_timestamp.year

In [None]:
datetime_as_timestamp.month

In [None]:
datetime_as_timestamp.day

In [None]:
datetime_as_timestamp.hour

In [None]:
datetime_as_timestamp.minute

In [None]:
datetime_as_timestamp.second

In [None]:
pd.to_datetime(df.DateTime)

In [None]:
df.DateTime = pd.to_datetime(df.DateTime)

In [None]:
df.DateTime[-1]

In [None]:
type(df.DateTime[-1])

## OutcomeType and OutcomeSubtype

### OutcomeType

In [None]:
df.OutcomeType

In [None]:
df.OutcomeType.unique()

In [None]:
df.OutcomeType.value_counts()

In [None]:
df.OutcomeType = df.OutcomeType.\
    apply(lambda outcome_type: 'ReturnToOwner' if outcome_type == 'Return_to_owner' else outcome_type)

In [None]:
df.OutcomeType.unique()

In [None]:
df.OutcomeType.value_counts()

### OutcomeSubtype

In [None]:
# Distinct outcome subtypes present in the data.
df.OutcomeSubtype.unique()

In [None]:
# How many rows carry each subtype.
df.OutcomeSubtype.value_counts()

In [None]:
'In Kennel'.translate(None, ' ')

In [None]:
'Court/Investigation'.translate(None, ' ')

In [None]:
'Court/Investigation'.translate(None, '/')

In [None]:
'Court/Investigation'.translate(None, '/ ')

In [None]:
'In Kennel'.translate(None, '/ ')

In [None]:
def process_outcome_subtype(outcome_subtype):
    """Return *outcome_subtype* with every '/' and ' ' character removed.

    Parameters
    ----------
    outcome_subtype : str or missing value
        A single outcome-subtype label.

    Returns
    -------
    str or float
        The label without slashes and spaces, or ``np.nan`` when
        ``pd.isnull`` reports the input as missing.
    """
    if pd.isnull(outcome_subtype):
        return np.nan
    # The two-argument str.translate(None, chars) form only exists on
    # Python 2 byte strings; chained replace gives the same result on
    # Python 2 str/unicode and on Python 3 str.
    return outcome_subtype.replace('/', '').replace(' ', '')
    
# Clean every subtype label in the column with process_outcome_subtype.
df.OutcomeSubtype = df.OutcomeSubtype.apply(process_outcome_subtype)

In [None]:
# Re-inspect the distinct subtypes after cleaning.
df.OutcomeSubtype.unique()

## AnimalType

In [None]:
# Distinct animal types present in the data.
df.AnimalType.unique()

In [None]:
# How many rows carry each animal type.
df.AnimalType.value_counts()

## SexuponOutcome

In [None]:
df.SexuponOutcome.unique()

In [None]:
df.SexuponOutcome.value_counts()

In [None]:
df.SexuponOutcome = df.SexuponOutcome.\
    apply(lambda sex_upon_outcome: 'Unknown' if pd.isnull(sex_upon_outcome) else sex_upon_outcome)

In [None]:
df.SexuponOutcome.unique()

In [None]:
df.SexuponOutcome.value_counts()

In [None]:
# Plain substring tests on a single label.
'Intact' in 'Intact Male'

In [None]:
'Male' in 'Intact Male'

In [None]:
# False: the label has no space after 'Male'.
'Male ' in 'Intact Male'

In [None]:
# Pitfall: `in` applied to a Series tests membership among its index labels
# (the AnimalID values here), not its values, and yields one single bool.
is_male = 'Male' in df.SexuponOutcome

In [None]:
is_male

In [None]:
# Run the substring test on each label instead, giving one bool per row.
# The test is case-sensitive, so 'Female' labels do not count as 'Male'.
is_male = df.SexuponOutcome.apply(lambda sex_upon_outcome: 'Male' in sex_upon_outcome)

In [None]:
is_male

In [None]:
# One bool per row: does the label contain 'Female'?
is_female = df.SexuponOutcome.apply(lambda sex_upon_outcome: 'Female' in sex_upon_outcome)

In [None]:
# New Sex column, starting as 'Unknown' for every row.
df['Sex'] = 'Unknown'

In [None]:
df.Sex.value_counts()

In [None]:
# Overwrite the rows flagged by is_male.
df.loc[is_male, 'Sex'] = 'Male'

In [None]:
df.Sex.value_counts()

In [None]:
# Overwrite the rows flagged by is_female; rows flagged by neither mask
# keep 'Unknown'.
df.loc[is_female, 'Sex'] = 'Female'

In [None]:
df.Sex.value_counts()

In [None]:
# Derive a Sterilization column from the words in the SexuponOutcome label.
is_intact = df.SexuponOutcome.apply(lambda sex_upon_outcome: 'Intact' in sex_upon_outcome)
is_neutered = df.SexuponOutcome.apply(lambda sex_upon_outcome: 'Neutered' in sex_upon_outcome)
is_spayed = df.SexuponOutcome.apply(lambda sex_upon_outcome: 'Spayed' in sex_upon_outcome)

# Start from 'Unknown' rather than 'Intact': a label that names no status at
# all (for example 'Unknown') must not be counted as an intact animal.
df['Sterilization'] = 'Unknown'
df.loc[is_intact, 'Sterilization'] = 'Intact'
df.loc[is_neutered, 'Sterilization'] = 'Neutered'
df.loc[is_spayed, 'Sterilization'] = 'Spayed'

In [None]:
# Row counts per derived sterilization status.
df.Sterilization.value_counts()

In [None]:
# Row counts per original label, for comparison with the derived columns.
df.SexuponOutcome.value_counts()

In [None]:
# Count the rows for each Sex / Sterilization combination below.
((df.Sex == 'Male') & (df.Sterilization == 'Neutered')).sum()

In [None]:
((df.Sex == 'Female') & (df.Sterilization == 'Spayed')).sum()

In [None]:
((df.Sex == 'Male') & (df.Sterilization == 'Intact')).sum()

In [None]:
((df.Sex == 'Female') & (df.Sterilization == 'Intact')).sum()

In [None]:
# Rows whose sex could not be derived from the label.
(df.Sex == 'Unknown').sum()

In [None]:
# Drop the original column now that Sex and Sterilization exist.
df = df.drop('SexuponOutcome', axis = 1)

## AgeuponOutcome

In [None]:
# Distinct age labels present in the data.
df.AgeuponOutcome.unique()

In [None]:
# A sample label to experiment with.
age_upon_outcome = '1 year'

In [None]:
# Capture a single digit that is followed by ' year'.
match = re.search(r'(\d) year', age_upon_outcome)

In [None]:
# Group 1 holds the captured digit, as text.
match.group(1)

In [None]:
def print_age_upcon_outcome(age_upon_outcome):
    """Print the year count found in *age_upon_outcome*, or a no-match notice.

    The pattern captures exactly one digit directly before ' year', so a
    multi-digit age such as '12 years' is reported by its last digit only.

    Parameters
    ----------
    age_upon_outcome : str
        An age label such as '1 year'.

    Returns
    -------
    None
    """
    match = re.search(r'(\d) year', age_upon_outcome)
    if match:
        value_in_years = match.group(1)
        # print(...) with one pre-formatted string emits the same text on
        # Python 2 and Python 3; the bare print statement is Python 2-only.
        print('We have a match in years: {}'.format(value_in_years))
    else:
        print('We have no match for "{}"'.format(age_upon_outcome))

In [None]:
# Try the single-digit pattern on a few labels.
print_age_upcon_outcome('1 year')

In [None]:
print_age_upcon_outcome('2 years')

In [None]:
# With a one-digit capture, only the '2' directly before ' year' is picked up.
print_age_upcon_outcome('12 years')

In [None]:
def print_age_upcon_outcome(age_upon_outcome):
    """Print the year count found in *age_upon_outcome*, or a no-match notice.

    The pattern captures one or more digits directly before ' year', so
    multi-digit ages are reported in full.  Labels in other units do not
    match.

    Parameters
    ----------
    age_upon_outcome : str
        An age label such as '12 years'.

    Returns
    -------
    None
    """
    match = re.search(r'(\d+) year', age_upon_outcome)
    if match:
        value_in_years = match.group(1)
        # print(...) with one pre-formatted string emits the same text on
        # Python 2 and Python 3; the bare print statement is Python 2-only.
        print('We have a match in years: {}'.format(value_in_years))
    else:
        print('We have no match for "{}"'.format(age_upon_outcome))

In [None]:
# The multi-digit pattern now reports '12 years' in full.
print_age_upcon_outcome('1 year')
print_age_upcon_outcome('2 years')
print_age_upcon_outcome('12 years')

In [None]:
# Labels in months or weeks contain no ' year', so these fall through to
# the no-match message.
print_age_upcon_outcome('1 month')
print_age_upcon_outcome('3 weeks')

In [None]:
def print_age_upcon_outcome(age_upon_outcome):
    """Print the number and unit found in *age_upon_outcome*, or a no-match notice.

    The pattern captures one or more digits, a space, and then the following
    run of non-whitespace characters as the unit, so a plural unit keeps its
    trailing 's'.

    Parameters
    ----------
    age_upon_outcome : str
        An age label such as '3 weeks'.

    Returns
    -------
    None
    """
    match = re.search(r'(\d+) (\S+)', age_upon_outcome)
    if match:
        value = match.group(1)
        unit = match.group(2)
        # print(...) with one pre-formatted string emits the same text on
        # Python 2 and Python 3; the bare print statement is Python 2-only.
        print('We have a match in {}: {}'.format(unit, value))
    else:
        print('We have no match for "{}"'.format(age_upon_outcome))

In [None]:
# Every unit now matches, but plural labels report a plural unit
# ('years', 'weeks') while singular ones report 'year' / 'month'.
print_age_upcon_outcome('1 year')
print_age_upcon_outcome('2 years')
print_age_upcon_outcome('12 years')
print_age_upcon_outcome('1 month')
print_age_upcon_outcome('3 weeks')

In [None]:
def print_age_upcon_outcome(age_upon_outcome):
    """Print the number and singular unit found in *age_upon_outcome*.

    The unit capture must end in a character other than 's', so the regex
    backs off a trailing plural 's' and 'years' is reported as 'year'.
    Prints a no-match notice when the pattern is not found.

    Parameters
    ----------
    age_upon_outcome : str
        An age label such as '3 weeks'.

    Returns
    -------
    None
    """
    match = re.search(r'(\d+) (\S+[^s])', age_upon_outcome)
    if match:
        value = match.group(1)
        unit = match.group(2)
        # print(...) with one pre-formatted string emits the same text on
        # Python 2 and Python 3; the bare print statement is Python 2-only.
        print('We have a match in {}: {}'.format(unit, value))
    else:
        print('We have no match for "{}"'.format(age_upon_outcome))

In [None]:
# Units are now reported in singular form for both singular and plural
# labels.
print_age_upcon_outcome('1 year')
print_age_upcon_outcome('2 years')
print_age_upcon_outcome('12 years')
print_age_upcon_outcome('1 month')
print_age_upcon_outcome('3 weeks')

In [None]:
def print_age_upcon_outcome(age_upon_outcome):
    """Print the number and singular unit of *age_upon_outcome*, or raise.

    The pattern is anchored at both ends, so the whole label must consist of
    digits, one space, and a unit with an optional trailing 's'.

    Parameters
    ----------
    age_upon_outcome : str
        An age label such as '3 weeks'.

    Returns
    -------
    None

    Raises
    ------
    Exception
        If the label does not match the pattern.
    """
    match = re.search(r'^(\d+) (\S+[^s])s?$', age_upon_outcome)
    if match:
        value = match.group(1)
        unit = match.group(2)
        # print(...) with one pre-formatted string emits the same text on
        # Python 2 and Python 3; the bare print statement is Python 2-only.
        print('We have a match in {}: {}'.format(unit, value))
    else:
        raise Exception('No match for "{}"'.format(age_upon_outcome))

In [None]:
# All five labels satisfy the anchored pattern, so none of these calls
# raises.
print_age_upcon_outcome('1 year')
print_age_upcon_outcome('2 years')
print_age_upcon_outcome('12 years')
print_age_upcon_outcome('1 month')
print_age_upcon_outcome('3 weeks')

In [None]:
def age_upcon_outcome(age_upon_outcome):
    """Convert an age label such as '3 weeks' into a number of days.

    Parameters
    ----------
    age_upon_outcome : str or missing value
        A label made of digits, one space, and a unit (day, week, month or
        year) with an optional trailing 's'.

    Returns
    -------
    float
        The age in days, counting 7 days per week, 30.5 per month and
        365.25 per year; ``np.nan`` when ``pd.isnull`` reports the input as
        missing.

    Raises
    ------
    Exception
        If the label does not match the expected pattern, or if its unit is
        not one of day, week, month or year.
    """
    if pd.isnull(age_upon_outcome):
        return np.nan

    parsed = re.search(r'^(\d+) (\S+[^s])s?$', age_upon_outcome)
    if parsed is None:
        raise Exception('No match for "{}"'.format(age_upon_outcome))

    count, unit = float(parsed.group(1)), parsed.group(2)

    # Conversion factors from each supported unit to days.
    days_per_unit = {'day': 1, 'week': 7, 'month': 30.5, 'year': 365.25}
    if unit not in days_per_unit:
        raise Exception('No match for {}'.format(unit))
    return count * days_per_unit[unit]

In [None]:
# Replace every age label in the column with its value in days; missing
# labels stay missing.
df.AgeuponOutcome = df.AgeuponOutcome.apply(age_upcon_outcome)

## Breed

In [None]:
# Distinct breed labels present in the data.
df.Breed.unique()

In [None]:
# How many rows carry each breed label.
df.Breed.value_counts()

In [None]:
# Number of distinct breed labels.
len(df.Breed.unique())

That's a lot of breeds to consider.  What would you do next?

## Color

In [None]:
# Distinct color labels present in the data.
df.Color.unique()

In [None]:
# Number of distinct color labels.
len(df.Color.unique())

There are also a lot of colors to consider.  How could you handle them?