In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Import Austin Animal Center intakes and outcomes, keeping only the columns
# this notebook works with.
intake_columns = [
    'animal_id_intake',
    'animal_type',
    'breed',
    'intake_month',
    'intake_weekday',
    'color',
    'age_upon_intake_(years)',
    'intake_type',
    'intake_condition',
    'sex_upon_intake',
    'sex_upon_outcome',
    'time_in_shelter_days',  # goal
]

# `usecols` stops pandas from parsing the columns we would discard anyway.
# It ignores the order of the list, so re-index with the same list to keep
# the column order exactly as declared above.
aac_df = (
    pd.read_csv("aac_intakes_outcomes.csv", usecols=intake_columns)[intake_columns]
    .rename(columns={"animal_id_intake": "id"})
)

aac_df.head(10)

Unnamed: 0,id,animal_type,breed,intake_month,intake_weekday,color,age_upon_intake_(years),intake_type,intake_condition,sex_upon_intake,sex_upon_outcome,time_in_shelter_days
0,A006100,Dog,Spinone Italiano Mix,12,Thursday,Yellow/White,10.0,Stray,Normal,Neutered Male,Neutered Male,0.588194
1,A006100,Dog,Spinone Italiano Mix,12,Friday,Yellow/White,7.0,Public Assist,Normal,Neutered Male,Neutered Male,1.259722
2,A006100,Dog,Spinone Italiano Mix,3,Friday,Yellow/White,6.0,Public Assist,Normal,Neutered Male,Neutered Male,1.113889
3,A047759,Dog,Dachshund,4,Wednesday,Tricolor,10.0,Owner Surrender,Normal,Neutered Male,Neutered Male,4.970139
4,A134067,Dog,Shetland Sheepdog,11,Saturday,Brown/White,16.0,Public Assist,Injured,Neutered Male,Neutered Male,0.119444
5,A141142,Dog,Labrador Retriever/Pit Bull,11,Saturday,Black/White,15.0,Stray,Aged,Spayed Female,Spayed Female,0.870833
6,A163459,Dog,Miniature Schnauzer Mix,11,Friday,Black/Gray,15.0,Stray,Normal,Intact Female,Intact Female,0.178472
7,A165752,Dog,Lhasa Apso Mix,9,Monday,Brown/White,15.0,Stray,Normal,Neutered Male,Neutered Male,0.213194
8,A178569,Dog,Shetland Sheepdog Mix,3,Monday,White/Black,15.0,Public Assist,Normal,Neutered Male,Neutered Male,6.258333
9,A189592,Dog,Shetland Sheepdog Mix,9,Friday,Brown/White,18.0,Stray,Normal,Spayed Female,Spayed Female,0.054167


In [3]:
# Number of rows currently held in aac_df
f'Samples {len(aac_df)}'

'Samples 79672'

In [4]:
# Filter only dogs and cats
# - na=False: a missing animal_type counts as "no match" instead of putting NaN
#   into the boolean mask (which pandas refuses to index with).
# - .copy(): make the result an independent frame, so the cells further down
#   that add/overwrite columns on aac_df do not raise SettingWithCopyWarning.
aac_df = aac_df[aac_df['animal_type'].str.contains('Dog|Cat', na=False)].copy()

f'Samples {aac_df.shape[0]}'

'Samples 74905'

In [5]:
# Convert sex_upon_intake/sex_upon_outcome

# Testing features: sanity checks on the two columns before they are split up.

# Rows whose intake and outcome strings differ.
# NOTE: NaN != NaN evaluates to True, so rows with a missing value are counted here too.
n_samples_sex_diff = aac_df[aac_df['sex_upon_intake'] != aac_df['sex_upon_outcome']].shape[0]
print(f'Number of animals with reproductive state changed: {n_samples_sex_diff}')

# "Unknown" on both sides.
n_samples_sex_unknown_io = aac_df.query('sex_upon_intake == "Unknown" & sex_upon_outcome == "Unknown"').shape[0]
print(f'Number of animals with reproductive state unknown (both intake and outcome): {n_samples_sex_unknown_io}')

# "Unknown" at intake but something else at outcome.
n_samples_sex_unknown_i_no_o = aac_df.query('sex_upon_intake == "Unknown" & sex_upon_outcome != "Unknown"').shape[0]
print(f'Number of animals with intake reproductive state unknown and diff outcome reproductive state: {n_samples_sex_unknown_i_no_o}')

# "Unknown" at outcome but something else at intake.
# (Fixed: this line previously printed the intake-unknown counter from the check above.)
n_samples_sex_unknown_no_i_o = aac_df.query('sex_upon_intake != "Unknown" & sex_upon_outcome == "Unknown"').shape[0]
print(f'Number of animals with outcome reproductive state unknown and diff intake reproductive state: {n_samples_sex_unknown_no_i_o}')

print("If sex_upon_intake is unknown so will be sex_upon_outcome")

# Male <-> Female flips between intake and outcome.
# The match is case-sensitive, so 'Male' does not match inside 'Female'.
intake_is_male = aac_df['sex_upon_intake'].str.contains('Male', na=False)
intake_is_female = aac_df['sex_upon_intake'].str.contains('Female', na=False)
outcome_is_male = aac_df['sex_upon_outcome'].str.contains('Male', na=False)
outcome_is_female = aac_df['sex_upon_outcome'].str.contains('Female', na=False)

n_samples_only_sex_diff = aac_df[
    (intake_is_male & outcome_is_female) | (intake_is_female & outcome_is_male)
].shape[0]

print(f'Number of animals with different sex (Male/Female) after io: {n_samples_only_sex_diff}')

print("If sex_upon_intake is Male/Female so will be sex_upon_outcome => all the changes are due to spaying/neutering")


Number of animals with reproductive state changed: 29883
Number of animals with reproductive state unknown (both intake and outcome): 2819
Number of animals with intake reproductive state unknown and diff outcome reproductive state: 0
Number of animals with outcome reproductive state unknown and diff intake reproductive state: 0
If sex_upon_intake is unknown so will be sex_upon_outcome
Number of animals with different sex (Male/Female) after io: 0
If sex_upon_intake is Male/Female so will be sex_upon_outcome => all the changes are due to spaying/neutering


In [6]:
# Break the combined "<state> <sex>" strings into separate features.
intake_parts = aac_df['sex_upon_intake'].str.split(expand=True)
outcome_parts = aac_df['sex_upon_outcome'].str.split(expand=True)

# First token -> reproductive state, second token -> sex.
aac_df[['reproductive_state_intake', 'sex']] = intake_parts
# Single-token values (e.g. "Unknown") leave the second token empty.
aac_df['sex'] = aac_df['sex'].fillna('Unknown')
aac_df['reproductive_state_outcome'] = outcome_parts[0]

# Neutered and spayed are both recorded as "Sterile".
sterile_aliases = {'Neutered': 'Sterile', 'Spayed': 'Sterile'}
state_cols = ['reproductive_state_intake', 'reproductive_state_outcome']
aac_df[state_cols] = aac_df[state_cols].replace(sterile_aliases)

# The original combined columns are no longer needed.
aac_df = aac_df.drop(columns=['sex_upon_intake', 'sex_upon_outcome'])

aac_df.head(20)

Unnamed: 0,id,animal_type,breed,intake_month,intake_weekday,color,age_upon_intake_(years),intake_type,intake_condition,time_in_shelter_days,reproductive_state_intake,sex,reproductive_state_outcome
0,A006100,Dog,Spinone Italiano Mix,12,Thursday,Yellow/White,10.0,Stray,Normal,0.588194,Sterile,Male,Sterile
1,A006100,Dog,Spinone Italiano Mix,12,Friday,Yellow/White,7.0,Public Assist,Normal,1.259722,Sterile,Male,Sterile
2,A006100,Dog,Spinone Italiano Mix,3,Friday,Yellow/White,6.0,Public Assist,Normal,1.113889,Sterile,Male,Sterile
3,A047759,Dog,Dachshund,4,Wednesday,Tricolor,10.0,Owner Surrender,Normal,4.970139,Sterile,Male,Sterile
4,A134067,Dog,Shetland Sheepdog,11,Saturday,Brown/White,16.0,Public Assist,Injured,0.119444,Sterile,Male,Sterile
5,A141142,Dog,Labrador Retriever/Pit Bull,11,Saturday,Black/White,15.0,Stray,Aged,0.870833,Sterile,Female,Sterile
6,A163459,Dog,Miniature Schnauzer Mix,11,Friday,Black/Gray,15.0,Stray,Normal,0.178472,Intact,Female,Intact
7,A165752,Dog,Lhasa Apso Mix,9,Monday,Brown/White,15.0,Stray,Normal,0.213194,Sterile,Male,Sterile
8,A178569,Dog,Shetland Sheepdog Mix,3,Monday,White/Black,15.0,Public Assist,Normal,6.258333,Sterile,Male,Sterile
9,A189592,Dog,Shetland Sheepdog Mix,9,Friday,Brown/White,18.0,Stray,Normal,0.054167,Sterile,Female,Sterile


In [7]:
# Lowercase every text column in one pass
str_cols = [
    'animal_type',
    'breed',
    'color',
    'intake_type',
    'intake_weekday',
    'intake_condition',
    'sex',
    'reproductive_state_intake',
    'reproductive_state_outcome',
]

aac_df[str_cols] = aac_df[str_cols].apply(lambda column: column.str.lower())

# intake_type values contain spaces; swap them for underscores
aac_df['intake_type'] = aac_df['intake_type'].str.replace(' ', '_')

aac_df.head()

Unnamed: 0,id,animal_type,breed,intake_month,intake_weekday,color,age_upon_intake_(years),intake_type,intake_condition,time_in_shelter_days,reproductive_state_intake,sex,reproductive_state_outcome
0,A006100,dog,spinone italiano mix,12,thursday,yellow/white,10.0,stray,normal,0.588194,sterile,male,sterile
1,A006100,dog,spinone italiano mix,12,friday,yellow/white,7.0,public_assist,normal,1.259722,sterile,male,sterile
2,A006100,dog,spinone italiano mix,3,friday,yellow/white,6.0,public_assist,normal,1.113889,sterile,male,sterile
3,A047759,dog,dachshund,4,wednesday,tricolor,10.0,owner_surrender,normal,4.970139,sterile,male,sterile
4,A134067,dog,shetland sheepdog,11,saturday,brown/white,16.0,public_assist,injured,0.119444,sterile,male,sterile


In [8]:
# One hot encoding categorical data - breed and color are left as raw strings,
# and animal_type is kept as-is because the dog/cat split at the end needs it.

one_hot_cols = [
    'intake_type',
    'intake_weekday',
    'intake_condition',
    'sex',
    'reproductive_state_intake',
    'reproductive_state_outcome'
]

# Pin the indicator dtype: pandas 2.x defaults to bool (True/False), while older
# versions produced uint8 0/1 columns. An explicit dtype keeps the 0/1 encoding
# regardless of the installed pandas version.
aac_df_oh = pd.get_dummies(aac_df, columns=one_hot_cols, dtype='uint8')
aac_df_oh.head()

Unnamed: 0,id,animal_type,breed,intake_month,color,age_upon_intake_(years),time_in_shelter_days,intake_type_euthanasia_request,intake_type_owner_surrender,intake_type_public_assist,...,intake_condition_sick,sex_female,sex_male,sex_unknown,reproductive_state_intake_intact,reproductive_state_intake_sterile,reproductive_state_intake_unknown,reproductive_state_outcome_intact,reproductive_state_outcome_sterile,reproductive_state_outcome_unknown
0,A006100,dog,spinone italiano mix,12,yellow/white,10.0,0.588194,0,0,0,...,0,0,1,0,0,1,0,0,1,0
1,A006100,dog,spinone italiano mix,12,yellow/white,7.0,1.259722,0,0,1,...,0,0,1,0,0,1,0,0,1,0
2,A006100,dog,spinone italiano mix,3,yellow/white,6.0,1.113889,0,0,1,...,0,0,1,0,0,1,0,0,1,0
3,A047759,dog,dachshund,4,tricolor,10.0,4.970139,0,1,0,...,0,0,1,0,0,1,0,0,1,0
4,A134067,dog,shetland sheepdog,11,brown/white,16.0,0.119444,0,0,1,...,0,0,1,0,0,1,0,0,1,0


In [9]:
# (rows, columns) of the one-hot encoded frame
aac_df_oh.shape

(74905, 35)

In [10]:
# Save datasets
# Split the encoded frame by species; animal_type is constant within each
# split, so it is dropped.
# NOTE(review): the `breed` column is still present in both frames despite the
# "no-breed-info" file names - confirm this is intended.
dogs_df = aac_df_oh[aac_df_oh['animal_type'] == 'dog'].drop(columns=['animal_type'])
cats_df = aac_df_oh[aac_df_oh['animal_type'] == 'cat'].drop(columns=['animal_type'])

# to_csv does not create missing directories, so make sure the target exists.
Path('data').mkdir(parents=True, exist_ok=True)

# The DataFrame index is written as the first (unnamed) column (to_csv default).
dogs_df.to_csv('data/dogs-no-breed-info.csv')
cats_df.to_csv('data/cats-no-breed-info.csv')