In [69]:
# import 
import pandas as pd
import numpy as np
import datetime
from dateutil.parser import parse
import re

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

In [70]:
# data
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

In [71]:
print(train.shape)
train.head(5).T

(26729, 10)


Unnamed: 0,0,1,2,3,4
AnimalID,A671945,A656520,A686464,A683430,A667013
Name,Hambone,Emily,Pearce,,
DateTime,2014-02-12 18:22:00,2013-10-13 12:44:00,2015-01-31 12:28:00,2014-07-11 19:09:00,2013-11-15 12:52:00
OutcomeType,Return_to_owner,Euthanasia,Adoption,Transfer,Transfer
OutcomeSubtype,,Suffering,Foster,Partner,Partner
AnimalType,Dog,Cat,Dog,Cat,Dog
SexuponOutcome,Neutered Male,Spayed Female,Neutered Male,Intact Male,Neutered Male
AgeuponOutcome,1 year,1 year,2 years,3 weeks,2 years
Breed,Shetland Sheepdog Mix,Domestic Shorthair Mix,Pit Bull Mix,Domestic Shorthair Mix,Lhasa Apso/Miniature Poodle
Color,Brown/White,Cream Tabby,Blue/White,Blue Cream,Tan


In [72]:
print(test.shape)
test.head(5).T

(11456, 8)


Unnamed: 0,0,1,2,3,4
ID,1,2,3,4,5
Name,Summer,Cheyenne,Gus,Pongo,Skooter
DateTime,2015-10-12 12:15:00,2014-07-26 17:59:00,2016-01-13 12:20:00,2013-12-28 18:12:00,2015-09-24 17:59:00
AnimalType,Dog,Dog,Cat,Dog,Dog
SexuponOutcome,Intact Female,Spayed Female,Neutered Male,Intact Male,Neutered Male
AgeuponOutcome,10 months,2 years,1 year,4 months,2 years
Breed,Labrador Retriever Mix,German Shepherd/Siberian Husky,Domestic Shorthair Mix,Collie Smooth Mix,Miniature Poodle Mix
Color,Red/White,Black/Tan,Brown Tabby,Tricolor,White


## clean sexuponoutcome

In [73]:
def update_sex(df):
    unique_sex = ['Neutered Male', 'Spayed Female', 'Intact Male', 'Intact Female', 'Unknown']
    df = df[df["SexuponOutcome"].isin(unique_sex)]
    df["Male"] = df["SexuponOutcome"].apply(lambda x: 1 if x.endswith("Male") else 0)
    df["Female"] = df["SexuponOutcome"].apply(lambda x: 1 if x.endswith("Female") else 0)
    df["SexType"] = df["SexuponOutcome"].apply(lambda x: x.split(" ")[0])
    df = pd.get_dummies(df,columns=["SexType"], prefix="SexType")
    # df = df.drop("SexuponOutcome",1)
    return df

In [74]:
train = update_sex(train)
test = update_sex(test)

In [75]:
train.head(5)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,Male,Female,SexType_Intact,SexType_Neutered,SexType_Spayed,SexType_Unknown
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White,1,0,0.0,1.0,0.0,0.0
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby,0,1,0.0,0.0,1.0,0.0
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White,1,0,0.0,1.0,0.0,0.0
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream,1,0,1.0,0.0,0.0,0.0
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan,1,0,0.0,1.0,0.0,0.0


## clean ageuponoutcome

In [76]:
def map_age(s):
    if type(s) == float:
        return -1
    if "year" in s:
        return int(s.split(" ")[0])*365
    elif "month" in s:
        return int(s.split(" ")[0])*30.5
    elif "week" in s:
        return int(s.split(" ")[0])*7
    elif "day" in s:
        return int(s.split(" ")[0])
    
def update_age(df):
    df["AgeuponOutcome"] = df["AgeuponOutcome"].apply(map_age)
    train_age_means_by_outcome = dict(train.groupby("OutcomeType")["AgeuponOutcome"].mean())
    # set value of 0 to outcome group mean
    for i in train[train.AgeuponOutcome == -1].index:
        df.ix[i,"AgeuponOutcome"] = train_age_means_by_outcome[df.ix[i,"OutcomeType"]]
    return df

In [77]:
train = update_age(train)
test = update_age(test)

## clean breed

In [78]:
# get unique_breed
all_breeds = list(train["Breed"].apply(lambda x: x.split("/")))

unique_breed = []

for l in all_breeds:
    for s in l:
        if "," in l:
            sp = l.split(",")
            for i in l: 
                if i not in unique_breed:
                    unique_breed.append(i)
        else:
            if s not in unique_breed:
                unique_breed.append(s)
                
# get rid of the "_Mix"
unique_breed = [x[:-4] if "Mix" in x else x for x in unique_breed]
unique_breed = list(set(unique_breed))

print("Number of unique breeds: %d (not considering mix)" % len(unique_breed))

Number of unique breeds: 225 (not considering mix)


In [79]:
def map_breed(df):
    
    # number of "/" or "," in Breed
    df["crosses"] = df["Breed"].apply(lambda x: len(re.split(",|/",'Shetland Sheepdog Mix/Dog')))
    # check if it is a mix
    df["mix"] = df["Breed"].apply(lambda x: 1 if "Mix" in x else 0)
    # get rid of the "_Mix" at the end if there is one
    df["Breed"] = df["Breed"].apply(lambda x: x[:-4] if "Mix" in x else x)
    
    # dummy mapping for all unique breeds
    for b in unique_breed:
        df["Breed_"+b] = df["Breed"].apply(lambda x: b in x)
    # replace the spaces in column names with "_"
    df.columns = [x.replace(" ","_") for x in df.columns]
    
    '''
    # get all breed cols in train
    train_breed_only = train[[x for x in train.columns if x.startswith("Breed_")]]
    # get top 100 names
    train_top_100_names = train_breed_only.sum(axis=0).sort_values(ascending=False)[:100].index
    # get the rest column names
    train_else_names = train_breed_only.sum(axis=0).sort_values(ascending=False)[100:].index
    # get other cols of df
    df_else_cols = df[[x for x in df.columns if not x.startswith("Breed_")]]
    # combine other columns with columns of top 100 breeds
    df_new = pd.concat([df_else_cols, df[train_top_100_names]], axis=1)
    # add a new column for other breeds (binary)
    df_new["Breed_Other"] = df[train_else_names].sum(axis=1).apply(lambda x: 1 if x>0 else 0)
    '''
    
    # df_new = df_new.drop("Breed",1) 
    # return df_new
    return df

In [80]:
train_after_breed = map_breed(train)
test_after_breed = map_breed(test)

train = train_after_breed
test = test_after_breed

In [81]:
train.head(5)

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,...,Breed_Devon_Rex,Breed_Nova_Scotia_Duck_Tolling_Retriever,Breed_Feist,Breed_Bull_Terrier_Miniature,Breed_Cocker_Spaniel,Breed_Borzoi,Breed_Old_English_Bulldog,Breed_Cymric,Breed_Swedish_Vallhund,Breed_Balinese
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,365.0,Shetland Sheepdog,Brown/White,...,False,False,False,False,False,False,False,False,False,False
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,365.0,Domestic Shorthair,Cream Tabby,...,False,False,False,False,False,False,False,False,False,False
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,730.0,Pit Bull,Blue/White,...,False,False,False,False,False,False,False,False,False,False
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,21.0,Domestic Shorthair,Blue Cream,...,False,False,False,False,False,False,False,False,False,False
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,730.0,Lhasa Apso/Miniature Poodle,Tan,...,False,False,False,False,False,False,False,False,False,False


## clean color

In [82]:
# get unique color
all_color = list(train["Color"].apply(lambda x: x.split("/")))

unique_color = []
unique_description = []

for l in all_color:
    for s in l:
        if len(s.split(" ")) > 1:
            if s.split(" ")[1] not in unique_description:
                unique_description.append(s.split(" ")[1])
        if s not in unique_color:
            unique_color.append(s)

unique_color = list(set([x.split(" ")[0] for x in unique_color] + unique_description))
print(unique_color)

['Smoke', 'Lynx', 'Pink', 'Black', 'Calico', 'Ruddy', 'Blue', 'Merle', 'Tan', 'Seal', 'Tortie', 'Gray', 'Orange', 'Sable', 'Tiger', 'Yellow', 'Torbie', 'Tricolor', 'Silver', 'Agouti', 'Gold', 'Red', 'Lilac', 'Apricot', 'Tick', 'Point', 'Liver', 'Brindle', 'White', 'Buff', 'Flame', 'Chocolate', 'Cream', 'Tabby', 'Fawn', 'Brown']


In [83]:
def map_color(df):
    for c in unique_color:
        df["Color_"+c] = df["Color"].apply(lambda x: c in x)
    # number of mix colors
    df["Color_num_mix"] = df["Color"].apply(lambda x: len(re.split(",|/",x)))
    # df = df.drop("Color",1)
    return df

In [84]:
train = map_color(train)
test = map_color(test)

## clean animaltype

In [85]:
def map_animal(df):
    df["animal_is_dog"] = df["AnimalType"].apply(lambda x: "Dog" in x)
    # df = df.drop("AnimalType",1)
    return df

In [86]:
train = map_animal(train)
test = map_animal(test)

## clean time

In [87]:
def clean_time(df):
    df["Year"] = df["DateTime"].apply(lambda x: x.split(" ")[0].split("-")[0])
    df["Month"] = df["DateTime"].apply(lambda x: x.split(" ")[0].split("-")[1])
    df["Day"] = df["DateTime"].apply(lambda x: x.split(" ")[0].split("-")[2])
    df["Weekday"] = df["DateTime"].apply(lambda x: parse(x.split(" ")[0]).weekday())
    df["Hour"] = df["DateTime"].apply(lambda x: x.split(" ")[1].split(":")[0])
    df["Minute"] = df["DateTime"].apply(lambda x: x.split(" ")[1].split(":")[1])
    # df = df.drop("DateTime",1)
    return df

In [88]:
train = clean_time(train)
test = clean_time(test)

## clean name
1. bool for if there is a name to the animal
2. calculate the length of the name
3. initials?
4. name popularity?

In [116]:
train_name_freq = train["Name"].value_counts() / train.shape[0]
popular_names = train_name_freq[:100].index
popular_names

Index(['Max', 'Bella', 'Charlie', 'Daisy', 'Lucy', 'Buddy', 'Princess',
       'Rocky', 'Luna', 'Jack', 'Lola', 'Sadie', 'Molly', 'Shadow', 'Coco',
       'Maggie', 'Chico', 'Oreo', 'Lily', 'Blue', 'Lucky', 'Toby', 'Ginger',
       'Cookie', 'Lady', 'Sam', 'Duke', 'Sophie', 'Oliver', 'Milo', 'Rosie',
       'Leo', 'Bear', 'Buster', 'Riley', 'Ruby', 'Sasha', 'Marley', 'Peanut',
       'Diamond', 'Jake', 'Brownie', 'Rex', 'Chloe', 'Cooper', 'Zeus', 'Penny',
       'Roxy', 'Minnie', 'Pepper', 'Jasper', 'Smokey', 'Bailey', 'Jasmine',
       'Tiger', 'Gracie', 'Jackson', 'Zoey', 'Mia', 'Oso', 'Nala', 'Bruno',
       'Stella', 'Rusty', 'Abby', 'Sugar', 'Emma', 'Hank', 'Fiona', 'Oscar',
       'Lilly', 'Lulu', 'Louie', 'Luke', 'Simba', 'Harley', 'Angel', 'Sandy',
       'Annie', 'Dexter', 'Sammy', 'Petey', 'Precious', 'Roscoe', 'Loki',
       'Scout', 'George', 'Betty', 'Beau', 'Bonnie', 'Baby', 'Mickey', 'Romeo',
       'Ellie', 'Star', 'Honey', 'Jax', 'X', 'Bandit', 'Willie'],
      dtype='

In [117]:
train[train.Name.isin(popular_names)]["OutcomeType"].value_counts() / train[train.Name.isin(popular_names)].shape[0]

Adoption           0.464484
Return_to_owner    0.292560
Transfer           0.193836
Euthanasia         0.046713
Died               0.002408
Name: OutcomeType, dtype: float64

In [118]:
train[~train.Name.isin(popular_names)]["OutcomeType"].value_counts() / train[~train.Name.isin(popular_names)].shape[0]

Adoption           0.391584
Transfer           0.381705
Return_to_owner    0.158140
Euthanasia         0.060288
Died               0.008283
Name: OutcomeType, dtype: float64

In [64]:
def clean_name(df,df_type):
    
    # if there is a name
    df["has_name"] = df["Name"].apply(lambda x: type(x) != float)
    
    # name length
    # df["name_length"] = df["Name"].apply(lambda x: len(x) if type(x) != float else 0)
    
    # initial and map to int
    '''
    df["initial"] = df["Name"].apply(lambda x: x[0] if type(x) != float else "!")
    if df_type == "train":
        df["initial"][df["initial"].str.contains("'")] = "S"
        df["initial"][df["initial"].str.contains("0")] = "J"
        df["initial"][df["initial"].str.contains("3")] = "B"
        df["initial"][df["initial"].str.contains(" ")] = "J"
        df["initial"][df["initial"].str.contains(" ")] = pd.Series(["J","M"])
    else:
        df["initial"][df["initial"].str.contains("0")] = "!"
        df["initial"][df["initial"].str.contains("3")] = "!"
        df["initial"][df["initial"].str.contains("6")] = "!"
    D = {i:j for i,j in zip(list("!ABCDEFGHIJKLMNOPQRSTUVWXYZ"),list(range(27)))}
    df["initial"] = df["initial"].replace(D)
    '''
    
    # df = df.drop("Name",1)
    return df

In [65]:
train = clean_name(train,"train")
test = clean_name(test,"test")

## clean currently not going to be used variables

In [66]:
# delete columns transformed
train = train.drop("SexuponOutcome",1)
train = train.drop("Breed",1)
train = train.drop("Color",1)
train = train.drop("DateTime",1)
train = train.drop("Name",1)
train = train.drop("AnimalType",1)

test = test.drop("SexuponOutcome",1)
test = test.drop("Breed",1)
test = test.drop("Color",1)
test = test.drop("DateTime",1)
test = test.drop("Name",1)
test = test.drop("AnimalType",1)

In [67]:
train = train.drop("OutcomeSubtype",1)
train = train.drop("AnimalID",1)

In [38]:
print(train.shape)
print(test.shape)

(26728, 280)
(11456, 280)


In [39]:
train.to_csv("data/train_clean_2.csv",index=False)
test.to_csv("data/test_clean_2.csv",index=False)

## oversample
to deal with unbalanced labels in the training data
- https://github.com/stavskal/ADASYN
- https://github.com/fmfn/UnbalancedDataset (binary?)

done in R using SMOTE

In [190]:
train.OutcomeType.value_counts()

Adoption           10769
Transfer            9422
Return_to_owner     4785
Euthanasia          1555
Died                 197
Name: OutcomeType, dtype: int64