In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
df = pd.read_csv("train.csv")
df.head(n=3)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White


In [5]:
format = '%Y-%m-%d  %I:%M:S'
df['DateTime'] = df['DateTime'].apply(lambda x: pd.to_datetime(x, infer_datetime_format=True))

In [6]:
df['Hour'] = df['DateTime'].apply(lambda x: x.hour)

In [7]:
df['Day_of_week'] = df['DateTime'].apply(lambda x: x.dayofweek)

In [8]:
obj = re.compile("(?P<num>\d+)\s*(?P<word>\w+)")
def convert_age(string):
    match = obj.search(string)
    if match:
        return match.groups()
    else:
        None, None

In [9]:
%%bash
cat train.csv | cut -d, -f 8 | cut -d' ' -f 2 | sort | uniq -c

     18 
      1 AgeuponOutcome
     66 day
    332 days
   1281 month
   8339 months
    146 week
   1704 weeks
   3969 year
  10874 years


In [10]:
convert_dict = {None:0, 
                'day':1, 
                'days':1, 
                'month':30, 
                'months':30, 
                'year':365, 
                'years':365, 
                'week':7, 
                'weeks':7
               }

In [15]:
for i in range(df.shape[0]):
    try:
        string = df.loc[i, 'AgeuponOutcome']
        num, word = convert_age(string)
        num_days = int(num) * convert_dict[word]
        df.loc[i, 'AgeuponOutcome_inDays'] = num_days
    except:
        df.loc[i, 'AgeuponOutcome_inDays'] = 0
        continue

In [17]:
df.head(n=3)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Hour,Day_of_week,AgeuponOutcome_inDays
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,18,2,365
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,12,6,365
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,12,5,730


In [21]:
df['has_name_bool'] = df['Name'].apply(lambda x: 1 if x is not None else 0)

In [39]:
color = df.groupby('Color').count()['AnimalID'].sort(inplace=False)

In [42]:
color.sample(n=10).head()

Color
Cream/Brown            2
Apricot/Brown          2
Blue Merle/Red         1
Lynx Point/White      13
Silver Tabby/White    12
Name: AnimalID, dtype: int64

In [67]:
obj = re.compile('(\w+)\s*(\w+)*/*(\w+)*')
def break_up_colors(string):
    match = obj.search(string)
    if match:
        colors = match.groups()
        return colors
    else:
        return None

In [91]:
colors = []
for c in color.index:
    colors +=break_up_colors(c)

colors = np.unique(colors)
print (colors)

[None 'Agouti' 'Apricot' 'Black' 'Blue' 'Brindle' 'Brown' 'Buff' 'Calico'
 'Chocolate' 'Cream' 'Fawn' 'Flame' 'Gold' 'Gray' 'Lilac' 'Liver' 'Lynx'
 'Merle' 'Orange' 'Pink' 'Point' 'Red' 'Ruddy' 'Sable' 'Seal' 'Silver'
 'Smoke' 'Tabby' 'Tan' 'Tick' 'Tiger' 'Torbie' 'Tortie' 'Tricolor' 'White'
 'Yellow']


In [101]:
dff = pd.DataFrame(columns=colors, data=np.zeros((df.shape[0], dff.shape[1]), np.int8))

In [103]:
for i in range(df.shape[0]):
    color_string = df.loc[i, 'Color']
    color_string_tuple = break_up_colors(color_string)
    dff.loc[i, color_string_tuple] = 1

In [123]:
df_final = pd.get_dummies(df[['OutcomeType', 'OutcomeSubtype', 'AnimalType', 'SexuponOutcome', 
                              'Breed', 'Hour', 'Day_of_week']])
df_final = pd.concat([df_final, dff, df['AgeuponOutcome_inDays']], axis=1)

Name, OutcomeType, OutcomeSubtype, AnimalType, SexuponOutcom, Breed, (colors dummies), has_name_bool
hour, Day_of_week, AgeuponOutcome_inDays

In [124]:
df_final.columns


Index([u'Hour', u'Day_of_week', u'OutcomeType_Adoption', u'OutcomeType_Died',
       u'OutcomeType_Euthanasia', u'OutcomeType_Return_to_owner',
       u'OutcomeType_Transfer', u'OutcomeSubtype_Aggressive',
       u'OutcomeSubtype_At Vet', u'OutcomeSubtype_Barn', 
       ...
       u'Tabby', u'Tan', u'Tick', u'Tiger', u'Torbie', u'Tortie', u'Tricolor',
       u'White', u'Yellow', u'AgeuponOutcome_inDays'],
      dtype='object', length=1447)

In [125]:
df_final.head()

Unnamed: 0,Hour,Day_of_week,OutcomeType_Adoption,OutcomeType_Died,OutcomeType_Euthanasia,OutcomeType_Return_to_owner,OutcomeType_Transfer,OutcomeSubtype_Aggressive,OutcomeSubtype_At Vet,OutcomeSubtype_Barn,...,Tabby,Tan,Tick,Tiger,Torbie,Tortie,Tricolor,White,Yellow,AgeuponOutcome_inDays
0,18,2,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,365
1,12,6,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,365
2,12,5,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,730
3,19,4,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,21
4,12,4,0,0,0,0,1,0,0,0,...,0,1,0,0,0,0,0,0,0,730
