# DS-SF-27 | Exploratory Data Analysis on Kaggle's Shelter Animal Outcomes Dataset | Ivan's Notebook

(dataset from https://www.kaggle.com/c/shelter-animal-outcomes)

In [1]:
import os

import numpy as np

import pandas as pd
pd.set_option('display.max_rows', 25)
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_columns', 10)

import re

In [2]:
df = pd.read_csv(os.path.join('..', 'datasets', 'shelter-animal-outcomes.csv.gz'), index_col = 'AnimalID')

In [3]:
df

Unnamed: 0_level_0,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
AnimalID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan
A677334,Elsa,2014-04-25 13:04:00,Transfer,Partner,Dog,Intact Female,1 month,Cairn Terrier/Chihuahua Shorthair,Black/Tan
A699218,Jimmy,2015-03-28 13:11:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Tabby
A701489,,2015-04-30 17:02:00,Transfer,Partner,Cat,Unknown,3 weeks,Domestic Shorthair Mix,Brown Tabby
A671784,Lucy,2014-02-04 17:17:00,Adoption,,Dog,Spayed Female,5 months,American Pit Bull Terrier Mix,Red/White
A677747,,2014-05-03 07:48:00,Adoption,Offsite,Dog,Spayed Female,1 year,Cairn Terrier,White


In [4]:
df.shape

(26729, 9)

In [5]:
df.columns

Index([u'Name', u'DateTime', u'OutcomeType', u'OutcomeSubtype', u'AnimalType',
       u'SexuponOutcome', u'AgeuponOutcome', u'Breed', u'Color'],
      dtype='object')

## `Name`

In [6]:
df.Name.unique()

array(['Hambone', 'Emily', 'Pearce', ..., 'Mowmow', 'Sonja', 'Green Bean'], dtype=object)

In [7]:
df.Name.value_counts()

Max            136
Bella          135
Charlie        107
Daisy          106
Lucy            94
Buddy           87
Princess        86
Rocky           85
Luna            68
Jack            66
Lola            61
Sadie           58
              ... 
Guru Ramdas      1
Mr.Cookie        1
Tabs             1
Eedo             1
Marlyin          1
Meerkat          1
Tugga            1
Sally Jessy      1
Nelda            1
Hobie Cat        1
Madi             1
Sumi             1
Name: Name, dtype: int64

> By default, `value_counts()` doesn't include counts of `NaN`.

In [8]:
df.Name.value_counts(dropna = False)

NaN            7691
Max             136
Bella           135
Charlie         107
Daisy           106
Lucy             94
Buddy            87
Princess         86
Rocky            85
Luna             68
Jack             66
Lola             61
               ... 
Mr.Cookie         1
Tabs              1
Eedo              1
Marlyin           1
Meerkat           1
Tugga             1
Sally Jessy       1
Nelda             1
Hobie Cat         1
Madi              1
Moscato           1
Sumi              1
Name: Name, dtype: int64

## `DateTime`

In [9]:
df.DateTime

AnimalID
A671945    2014-02-12 18:22:00
A656520    2013-10-13 12:44:00
A686464    2015-01-31 12:28:00
A683430    2014-07-11 19:09:00
A667013    2013-11-15 12:52:00
A677334    2014-04-25 13:04:00
A699218    2015-03-28 13:11:00
A701489    2015-04-30 17:02:00
A671784    2014-02-04 17:17:00
A677747    2014-05-03 07:48:00
A668402    2013-12-05 15:50:00
A666320    2013-11-04 14:48:00
                  ...         
A519949    2014-06-30 17:34:00
A701044    2015-04-28 14:26:00
A707871    2015-07-20 09:00:00
A704405    2015-07-18 14:08:00
A662019    2014-07-17 09:43:00
A687056    2014-08-31 09:00:00
A716997    2016-01-29 18:52:00
A702446    2015-05-14 11:56:00
A718934    2016-01-20 18:59:00
A698128    2015-03-09 13:33:00
A677478    2014-04-27 12:22:00
A706629    2015-07-02 09:00:00
Name: DateTime, dtype: object

In [10]:
# The index holds AnimalID labels, so "last row" must be asked for by position via .iloc
# (plain [-1] relies on a positional fallback that newer pandas deprecates).
df.DateTime.iloc[-1]

'2015-07-02 09:00:00'

In [11]:
# Positional access through .iloc: the index holds AnimalID labels, so plain [-1]
# depends on a positional fallback that newer pandas deprecates.
type(df.DateTime.iloc[-1])

str

> `DateTime` is composed of strings.

In [12]:
datetime_as_string = '2015-07-02 09:00:00'

> Let's convert these strings into a Date/Time format we can easily query.

In [13]:
datetime_as_timestamp = pd.to_datetime(datetime_as_string)

In [14]:
datetime_as_timestamp

Timestamp('2015-07-02 09:00:00')

In [15]:
type(datetime_as_timestamp)

pandas.tslib.Timestamp

In [16]:
datetime_as_timestamp.year

2015

In [17]:
datetime_as_timestamp.month

7

In [18]:
datetime_as_timestamp.day

2

In [19]:
datetime_as_timestamp.hour

9

In [20]:
datetime_as_timestamp.minute

0

In [21]:
datetime_as_timestamp.second

0

In [22]:
pd.to_datetime(df.DateTime)

AnimalID
A671945   2014-02-12 18:22:00
A656520   2013-10-13 12:44:00
A686464   2015-01-31 12:28:00
A683430   2014-07-11 19:09:00
A667013   2013-11-15 12:52:00
A677334   2014-04-25 13:04:00
A699218   2015-03-28 13:11:00
A701489   2015-04-30 17:02:00
A671784   2014-02-04 17:17:00
A677747   2014-05-03 07:48:00
A668402   2013-12-05 15:50:00
A666320   2013-11-04 14:48:00
                  ...        
A519949   2014-06-30 17:34:00
A701044   2015-04-28 14:26:00
A707871   2015-07-20 09:00:00
A704405   2015-07-18 14:08:00
A662019   2014-07-17 09:43:00
A687056   2014-08-31 09:00:00
A716997   2016-01-29 18:52:00
A702446   2015-05-14 11:56:00
A718934   2016-01-20 18:59:00
A698128   2015-03-09 13:33:00
A677478   2014-04-27 12:22:00
A706629   2015-07-02 09:00:00
Name: DateTime, dtype: datetime64[ns]

In [23]:
df.DateTime = pd.to_datetime(df.DateTime)

In [24]:
# The index holds AnimalID labels, so "last row" must be asked for by position via .iloc
# (plain [-1] relies on a positional fallback that newer pandas deprecates).
df.DateTime.iloc[-1]

Timestamp('2015-07-02 09:00:00')

In [25]:
# Positional access through .iloc: the index holds AnimalID labels, so plain [-1]
# depends on a positional fallback that newer pandas deprecates.
type(df.DateTime.iloc[-1])

pandas.tslib.Timestamp

## `OutcomeType` and `OutcomeSubtype`

### `OutcomeType`

In [26]:
df.OutcomeType

AnimalID
A671945    Return_to_owner
A656520         Euthanasia
A686464           Adoption
A683430           Transfer
A667013           Transfer
A677334           Transfer
A699218           Transfer
A701489           Transfer
A671784           Adoption
A677747           Adoption
A668402           Transfer
A666320           Adoption
                ...       
A519949    Return_to_owner
A701044           Adoption
A707871           Transfer
A704405           Adoption
A662019           Adoption
A687056           Transfer
A716997           Adoption
A702446           Transfer
A718934           Transfer
A698128           Adoption
A677478           Transfer
A706629           Transfer
Name: OutcomeType, dtype: object

In [27]:
df.OutcomeType.unique()

array(['Return_to_owner', 'Euthanasia', 'Adoption', 'Transfer', 'Died'], dtype=object)

In [28]:
df.OutcomeType.value_counts()

Adoption           10769
Transfer            9422
Return_to_owner     4786
Euthanasia          1555
Died                 197
Name: OutcomeType, dtype: int64

> To illustrate the use of `apply`, let's convert every occurrence of 'Return_to_owner' to 'ReturnToOwner'.

In [29]:
df.OutcomeType = df.OutcomeType.\
    apply(lambda outcome_type: 'ReturnToOwner' if outcome_type == 'Return_to_owner' else outcome_type)

In [30]:
df.OutcomeType.value_counts()

Adoption         10769
Transfer          9422
ReturnToOwner     4786
Euthanasia        1555
Died               197
Name: OutcomeType, dtype: int64

> Instead of using a `lambda` function (for short expressions), we could have called an already defined function.

In [31]:
def process_outcome_type(outcome_type):
    """Rename the raw 'Return_to_owner' label to 'ReturnToOwner'.

    Any other value is handed back unchanged.
    """
    return 'ReturnToOwner' if outcome_type == 'Return_to_owner' else outcome_type

df.OutcomeType = df.OutcomeType.\
    apply(process_outcome_type)

### OutcomeSubtype

In [32]:
df.OutcomeSubtype.unique()

array([nan, 'Suffering', 'Foster', 'Partner', 'Offsite', 'SCRP',
       'Aggressive', 'Behavior', 'Rabies Risk', 'Medical', 'In Kennel',
       'In Foster', 'Barn', 'Court/Investigation', 'Enroute', 'At Vet',
       'In Surgery'], dtype=object)

> What if we want to remove the spaces and other special characters in the different values?

In [33]:
# str.translate(None, chars) only exists on Python 2 byte strings; re.sub with a
# character class deletes the same characters on Python 2 and Python 3.
re.sub('[ ]', '', 'In Kennel')

'InKennel'

In [34]:
# str.translate(None, chars) only exists on Python 2 byte strings; re.sub with a
# character class deletes the same characters on Python 2 and Python 3.
re.sub('[ ]', '', 'Court/Investigation')

'Court/Investigation'

In [35]:
# str.translate(None, chars) only exists on Python 2 byte strings; re.sub with a
# character class deletes the same characters on Python 2 and Python 3.
re.sub('[/]', '', 'Court/Investigation')

'CourtInvestigation'

In [36]:
# str.translate(None, chars) only exists on Python 2 byte strings; re.sub with a
# character class deletes the same characters on Python 2 and Python 3.
re.sub('[/ ]', '', 'Court/Investigation')

'CourtInvestigation'

In [37]:
# str.translate(None, chars) only exists on Python 2 byte strings; re.sub with a
# character class deletes the same characters on Python 2 and Python 3.
re.sub('[/ ]', '', 'In Kennel')

'InKennel'

In [38]:
def process_outcome_subtype(outcome_subtype):
    """Remove spaces and slashes from an outcome subtype label.

    Parameters
    ----------
    outcome_subtype : str or missing value
        A raw label such as 'In Kennel' or 'Court/Investigation'.

    Returns
    -------
    str or float
        The label with every ' ' and '/' character deleted, or `np.nan` when
        `pd.isnull` reports the input as missing.
    """
    if pd.isnull(outcome_subtype):
        return np.nan

    # The two-argument `str.translate(None, '/ ')` form only works on Python 2
    # byte strings (TypeError on unicode and on Python 3); `re.sub` with a
    # character class deletes the same characters everywhere.
    return re.sub('[/ ]', '', outcome_subtype)

df.OutcomeSubtype = df.OutcomeSubtype.apply(process_outcome_subtype)

In [39]:
df.OutcomeSubtype.unique()

array([nan, 'Suffering', 'Foster', 'Partner', 'Offsite', 'SCRP',
       'Aggressive', 'Behavior', 'RabiesRisk', 'Medical', 'InKennel',
       'InFoster', 'Barn', 'CourtInvestigation', 'Enroute', 'AtVet',
       'InSurgery'], dtype=object)

### OutcomeType and OutcomeSubtype

In [40]:
pd.crosstab(df.OutcomeSubtype, df.OutcomeType, margins = True)

OutcomeType,Adoption,Died,Euthanasia,Transfer,All
OutcomeSubtype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Aggressive,0,0,320,0,320
AtVet,0,4,0,0,4
Barn,1,0,0,1,2
Behavior,0,0,86,0,86
CourtInvestigation,0,0,6,0,6
Enroute,0,8,0,0,8
Foster,1800,0,0,0,1800
InFoster,0,52,0,0,52
InKennel,0,114,0,0,114
InSurgery,0,3,0,0,3


> By default, `crosstab()` doesn't include counts of `NaN`.

In [41]:
pd.crosstab(df.OutcomeSubtype, df.OutcomeType, dropna = False, margins = True)

OutcomeType,Adoption,Died,Euthanasia,ReturnToOwner,Transfer,All
OutcomeSubtype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aggressive,0,0,320,0,0,320
AtVet,0,4,0,0,0,4
Barn,1,0,0,0,1,2
Behavior,0,0,86,0,0,86
CourtInvestigation,0,0,6,0,0,6
Enroute,0,8,0,0,0,8
Foster,1800,0,0,0,0,1800
InFoster,0,52,0,0,0,52
InKennel,0,114,0,0,0,114
InSurgery,0,3,0,0,0,3


> The totals are correct, but the `NaN` values don't have their own row.

## `AnimalType`

In [42]:
df.AnimalType.value_counts(dropna = False)

Dog    15595
Cat    11134
Name: AnimalType, dtype: int64

## `SexuponOutcome`

In [43]:
df.SexuponOutcome.unique()

array(['Neutered Male', 'Spayed Female', 'Intact Male', 'Intact Female',
       'Unknown', nan], dtype=object)

> Let's convert the 'Unknown' to `NaN`.

In [44]:
df.SexuponOutcome = df.SexuponOutcome.\
    apply(lambda sex_upon_outcome: np.nan if sex_upon_outcome == 'Unknown' else sex_upon_outcome)

In [45]:
df.SexuponOutcome.unique()

array(['Neutered Male', 'Spayed Female', 'Intact Male', 'Intact Female',
       nan], dtype=object)

> How to find substrings:

In [46]:
'Intact' in 'Intact Male'

True

In [47]:
'Male' in 'Intact Male'

True

In [48]:
'Male ' in 'Intact Male'

False

In [49]:
is_male = 'Male' in df.SexuponOutcome

In [50]:
is_male

False

In [51]:
df.SexuponOutcome

AnimalID
A671945    Neutered Male
A656520    Spayed Female
A686464    Neutered Male
A683430      Intact Male
A667013    Neutered Male
A677334    Intact Female
A699218      Intact Male
A701489              NaN
A671784    Spayed Female
A677747    Spayed Female
A668402              NaN
A666320    Spayed Female
               ...      
A519949    Spayed Female
A701044    Neutered Male
A707871              NaN
A704405    Spayed Female
A662019    Spayed Female
A687056      Intact Male
A716997    Neutered Male
A702446      Intact Male
A718934    Spayed Female
A698128    Neutered Male
A677478      Intact Male
A706629      Intact Male
Name: SexuponOutcome, dtype: object

In [52]:
is_male = df.SexuponOutcome.apply(lambda sex_upon_outcome: pd.notnull(sex_upon_outcome) and ('Male' in sex_upon_outcome))

In [53]:
is_male

AnimalID
A671945     True
A656520    False
A686464     True
A683430     True
A667013     True
A677334    False
A699218     True
A701489    False
A671784    False
A677747    False
A668402    False
A666320    False
           ...  
A519949    False
A701044     True
A707871    False
A704405    False
A662019    False
A687056     True
A716997     True
A702446     True
A718934    False
A698128     True
A677478     True
A706629     True
Name: SexuponOutcome, dtype: bool

In [54]:
is_female = df.SexuponOutcome.apply(lambda sex_upon_outcome: pd.notnull(sex_upon_outcome) and ('Female' in sex_upon_outcome))

In [55]:
df['Sex'] = np.nan

In [56]:
df.Sex.value_counts(dropna = False)

NaN    26729
Name: Sex, dtype: int64

In [57]:
df.loc[is_male, 'Sex'] = 'Male'

In [58]:
df.Sex.value_counts(dropna = False)

NaN     13425
Male    13304
Name: Sex, dtype: int64

In [59]:
df.loc[is_female, 'Sex'] = 'Female'

In [60]:
df.Sex.value_counts(dropna = False)

Male      13304
Female    12331
NaN        1094
Name: Sex, dtype: int64

In [61]:
is_neutered = df.SexuponOutcome.apply(lambda sex_upon_outcome: pd.notnull(sex_upon_outcome) and ('Neutered' in sex_upon_outcome))
is_spayed = df.SexuponOutcome.apply(lambda sex_upon_outcome: pd.notnull(sex_upon_outcome) and ('Spayed' in sex_upon_outcome))

df['Sterilized'] = np.nan

df.loc[is_male, 'Sterilized'] = False
df.loc[is_female, 'Sterilized'] = False

df.loc[is_neutered, 'Sterilized'] = True
df.loc[is_spayed, 'Sterilized'] = True

In [62]:
df.Sterilized.value_counts(dropna = False)

True     18599
False     7036
NaN       1094
Name: Sterilized, dtype: int64

In [63]:
df.SexuponOutcome.value_counts()

Neutered Male    9779
Spayed Female    8820
Intact Male      3525
Intact Female    3511
Name: SexuponOutcome, dtype: int64

In [64]:
((df.Sex == 'Male') & df.Sterilized).sum()

9779

In [65]:
((df.Sex == 'Female') & df.Sterilized).sum()

8820

In [66]:
((df.Sex == 'Male') & (df.Sterilized == False)).sum()

3525

In [67]:
((df.Sex == 'Female') & (df.Sterilized == False)).sum()

3511

In [68]:
df.Sex.isnull().sum()

1094

In [69]:
df.Sterilized.isnull().sum()

1094

In [70]:
df.drop('SexuponOutcome', axis = 1, inplace = True)

## `AgeuponOutcome`

In [71]:
df.AgeuponOutcome.unique()

array(['1 year', '2 years', '3 weeks', '1 month', '5 months', '4 years',
       '3 months', '2 weeks', '2 months', '10 months', '6 months',
       '5 years', '7 years', '3 years', '4 months', '12 years', '9 years',
       '6 years', '1 weeks', '11 years', '4 weeks', '7 months', '8 years',
       '11 months', '4 days', '9 months', '8 months', '15 years',
       '10 years', '1 week', '0 years', '14 years', '3 days', '6 days',
       '5 days', '5 weeks', '2 days', '16 years', '1 day', '13 years', nan,
       '17 years', '18 years', '19 years', '20 years'], dtype=object)

### One way to do it...

In [72]:
age_upon_outcome = '1 year'

In [73]:
(value, unit) = age_upon_outcome.split()

In [74]:
# Single-argument, parenthesised print produces the same text on Python 2 and Python 3
# (the bare print statement is a SyntaxError on Python 3).
print('value = {}'.format(value))
print('unit  = {}'.format(unit))

value = 1
unit  = year


In [75]:
def process_age_upcon_outcome(age_upon_outcome):
    """Convert an age label such as '3 weeks' into a number of days.

    The label is split on whitespace into a numeric amount and a unit. The
    unit is recognised by substring ('day', 'week', 'month', 'year'), so both
    singular and plural spellings work. A month counts as 30.5 days and a
    year as 365.25 days.

    Returns `np.nan` for missing input and raises `Exception` when the unit
    contains none of the known keywords.
    """
    if pd.isnull(age_upon_outcome):
        return np.nan

    amount, unit_label = age_upon_outcome.split()
    amount = float(amount)

    # Tried in this order; the first keyword found inside the unit wins.
    days_per_unit = (
        ('day', 1),
        ('week', 7),
        ('month', 30.5),
        ('year', 365.25),
    )
    for keyword, days in days_per_unit:
        if keyword in unit_label:
            return amount * days

    raise Exception('No match for {}'.format(unit_label))

df['AgeuponOutcome_1'] = df.AgeuponOutcome.apply(process_age_upcon_outcome)

### Another way to do it (a bit more complex and overkill here but more powerful and more flexible)...

In [76]:
age_upon_outcome = '1 year'

In [77]:
match = re.search(r'(\d) year', age_upon_outcome)

In [78]:
match.group(1)

'1'

In [79]:
def print_age_upcon_outcome(age_upon_outcome):
    """Print the number of years found in an age label, if any.

    The pattern captures exactly one digit immediately before ' year', so a
    multi-digit age such as '12 years' reports only its last digit.
    """
    match = re.search(r'(\d) year', age_upon_outcome)
    # Single-argument, parenthesised print gives the same output on Python 2
    # and Python 3 (the bare print statement is a SyntaxError on Python 3).
    if match:
        value_in_years = match.group(1)
        print('We have a match in years: {}'.format(value_in_years))
    else:
        print('We have no match for "{}"'.format(age_upon_outcome))

In [80]:
print_age_upcon_outcome('1 year')

We have a match in years: 1


In [81]:
print_age_upcon_outcome('2 years')

We have a match in years: 2


In [82]:
print_age_upcon_outcome('12 years')

We have a match in years: 2


In [83]:
def print_age_upcon_outcome(age_upon_outcome):
    """Print the number of years found in an age label, if any.

    The pattern captures one or more digits before ' year', so multi-digit
    ages are reported in full. Labels in other units produce the
    "no match" message.
    """
    match = re.search(r'(\d+) year', age_upon_outcome)
    # Single-argument, parenthesised print gives the same output on Python 2
    # and Python 3 (the bare print statement is a SyntaxError on Python 3).
    if match:
        value_in_years = match.group(1)
        print('We have a match in years: {}'.format(value_in_years))
    else:
        print('We have no match for "{}"'.format(age_upon_outcome))

In [84]:
print_age_upcon_outcome('1 year')
print_age_upcon_outcome('2 years')
print_age_upcon_outcome('12 years')

We have a match in years: 1
We have a match in years: 2
We have a match in years: 12


In [85]:
print_age_upcon_outcome('1 month')
print_age_upcon_outcome('3 weeks')

We have no match for "1 month"
We have no match for "3 weeks"


In [86]:
def print_age_upcon_outcome(age_upon_outcome):
    """Print the value and unit found in an age label, if any.

    The pattern captures one or more digits followed by a space and a run of
    non-whitespace characters; the unit is printed exactly as written, so
    plural units keep their trailing 's'.
    """
    match = re.search(r'(\d+) (\S+)', age_upon_outcome)
    # Single-argument, parenthesised print gives the same output on Python 2
    # and Python 3 (the bare print statement is a SyntaxError on Python 3).
    if match:
        value = match.group(1)
        unit = match.group(2)
        print('We have a match in {}: {}'.format(unit, value))
    else:
        print('We have no match for "{}"'.format(age_upon_outcome))

In [87]:
print_age_upcon_outcome('1 year')
print_age_upcon_outcome('2 years')
print_age_upcon_outcome('12 years')
print_age_upcon_outcome('1 month')
print_age_upcon_outcome('3 weeks')

We have a match in year: 1
We have a match in years: 2
We have a match in years: 12
We have a match in month: 1
We have a match in weeks: 3


In [88]:
def print_age_upcon_outcome(age_upon_outcome):
    """Print the value and singular unit found in an age label, if any.

    The unit group must end on a character other than 's', so the match stops
    before the plural 's' of units such as 'years' or 'weeks'.
    """
    match = re.search(r'(\d+) (\S+[^s])', age_upon_outcome)
    # Single-argument, parenthesised print gives the same output on Python 2
    # and Python 3 (the bare print statement is a SyntaxError on Python 3).
    if match:
        value = match.group(1)
        unit = match.group(2)
        print('We have a match in {}: {}'.format(unit, value))
    else:
        print('We have no match for "{}"'.format(age_upon_outcome))

In [89]:
print_age_upcon_outcome('1 year')
print_age_upcon_outcome('2 years')
print_age_upcon_outcome('12 years')
print_age_upcon_outcome('1 month')
print_age_upcon_outcome('3 weeks')

We have a match in year: 1
We have a match in year: 2
We have a match in year: 12
We have a match in month: 1
We have a match in week: 3


In [90]:
def print_age_upcon_outcome(age_upon_outcome):
    """Print the value and singular unit of an age label.

    The pattern is anchored at both ends: digits, one space, a unit that ends
    on a character other than 's', then an optional plural 's'.

    Raises
    ------
    Exception
        If the label does not fit that pattern.
    """
    match = re.search(r'^(\d+) (\S+[^s])s?$', age_upon_outcome)
    if match:
        value = match.group(1)
        unit = match.group(2)
        # Single-argument, parenthesised print gives the same output on
        # Python 2 and Python 3 (the bare print statement is a SyntaxError
        # on Python 3).
        print('We have a match in {}: {}'.format(unit, value))
    else:
        raise Exception('No match for "{}"'.format(age_upon_outcome))

In [91]:
print_age_upcon_outcome('1 year')
print_age_upcon_outcome('2 years')
print_age_upcon_outcome('12 years')
print_age_upcon_outcome('1 month')
print_age_upcon_outcome('3 weeks')

We have a match in year: 1
We have a match in year: 2
We have a match in year: 12
We have a match in month: 1
We have a match in week: 3


In [92]:
def process_age_upcon_outcome(age_upon_outcome):
    """Convert an age label such as '3 weeks' into a number of days.

    The label must consist of digits, one space and a unit with an optional
    plural 's'. Known units are 'day', 'week', 'month' (30.5 days) and
    'year' (365.25 days).

    Returns `np.nan` for missing input. Raises `Exception` when the label does
    not fit the pattern or the unit is not one of the known ones.
    """
    if pd.isnull(age_upon_outcome):
        return np.nan

    parsed = re.search(r'^(\d+) (\S+[^s])s?$', age_upon_outcome)
    if parsed is None:
        raise Exception('No match for "{}"'.format(age_upon_outcome))

    count, unit_name = parsed.groups()

    # Days contributed by one of each (singular) unit.
    days_per_unit = {'day': 1, 'week': 7, 'month': 30.5, 'year': 365.25}
    if unit_name not in days_per_unit:
        raise Exception('No match for {}'.format(unit_name))

    return float(count) * days_per_unit[unit_name]

df['AgeuponOutcome_2'] = df.AgeuponOutcome.apply(process_age_upcon_outcome)

## `Breed`

In [93]:
df.Breed.unique().shape

(1380L,)

In [94]:
df.Breed.value_counts(dropna = False)

Domestic Shorthair Mix                   8810
Pit Bull Mix                             1906
Chihuahua Shorthair Mix                  1766
Labrador Retriever Mix                   1363
Domestic Medium Hair Mix                  839
German Shepherd Mix                       575
Domestic Longhair Mix                     520
Siamese Mix                               389
Australian Cattle Dog Mix                 367
Dachshund Mix                             318
Boxer Mix                                 245
Miniature Poodle Mix                      233
                                         ... 
Boston Terrier/Pit Bull                     1
German Shepherd/Great Dane                  1
Bluetick Hound                              1
Greyhound/Labrador Retriever                1
Collie Smooth/Siberian Husky                1
Dogue De Bordeaux/American Bulldog          1
Golden Retriever/Great Pyrenees             1
Norfolk Terrier/Dachshund                   1
Golden Retriever/Whippet          

In [95]:
df.Breed.unique().shape

(1380L,)

That's a lot of breeds to consider.  What would you do next?

## `Color`

In [96]:
df.Color.unique()

array(['Brown/White', 'Cream Tabby', 'Blue/White', 'Blue Cream', 'Tan',
       'Black/Tan', 'Blue Tabby', 'Brown Tabby', 'Red/White', 'White',
       'Black', 'Silver', 'Brown', 'Black/Red', 'White/Cream',
       'Orange Tabby/White', 'Black/White', 'Brown Brindle/White',
       'Black/Brown', 'Orange Tabby', 'Chocolate/White', 'White/Tan',
       'Cream Tabby/White', 'Blue', 'Calico', 'Torbie', 'Brown/Black',
       'Yellow', 'Tricolor', 'White/Black', 'Tortie', 'Blue Tabby/White',
       'Gray/White', 'Tan/Black', 'Tan/White', 'Buff', 'Brown Tabby/White',
       'Red', 'Blue/Tan', 'Seal Point', 'Brown Brindle', 'White/Brown',
       'Gray', 'Yellow Brindle/White', 'Fawn/White', 'Flame Point',
       'Black Tabby/White', 'Tortie/Calico', 'Black/Brown Brindle',
       'White/Gray', 'Tan/Silver', 'Red Tick/Black', 'White/Cream Tabby',
       'Blue Merle', 'Chocolate/Tan', 'Sable', 'Brown Merle/White',
       'Brown Tiger/White', 'Liver/White', 'White/Blue Tabby',
       'Black/Gray', 'C

In [97]:
df.Color.unique().shape

(366L,)

There are also a lot of colors to consider.  How could you handle them?