In [1]:
import pandas as pd

# Importing data and extracting dogs only

In [2]:
# Dataframe for animal intakes and outcomes
dataframe_intakes_outcomes = pd.read_csv("aac_intakes_outcomes.csv")

df_dogs = dataframe_intakes_outcomes[dataframe_intakes_outcomes["animal_type"].str.contains("Dog")==True]

df_dogs = df_dogs[['animal_id_intake', 'intake_number','breed']]

df_dogs

Unnamed: 0,animal_id_intake,intake_number,breed
0,A006100,1.0,Spinone Italiano Mix
1,A006100,2.0,Spinone Italiano Mix
2,A006100,3.0,Spinone Italiano Mix
3,A047759,1.0,Dachshund
4,A134067,1.0,Shetland Sheepdog
...,...,...,...
79656,A769042,1.0,Miniature Poodle/Maltese
79657,A769043,1.0,Beagle/Australian Cattle Dog
79660,A769047,1.0,Border Collie Mix
79670,A769066,1.0,Labrador Retriever Mix


### Add a column for mixed or pure breed

In [3]:
df_dogs['mixed_breed'] = (df_dogs['breed'].str.contains('Mix') | df_dogs['breed'].str.contains('/'))

### Merge with dogs_cleaned dataset

In [4]:
dogs_cleaned = pd.read_csv("dogs_cleaned.csv")
dogs_cleaned = dogs_cleaned.rename(columns={
    'Breed Name': 'breed'})

# Merge original dataset with dog breed size

# Transform breed to ignore mixed races
df_dogs['breed'] = df_dogs['breed'].map(lambda i: i.split('/')[0])
df_dogs['breed'] = df_dogs['breed'].map(lambda i: i.split(' Mix')[0])

# Correcting breed names accross data
df_dogs.loc[df_dogs["breed"] == 'German Shepherd', 'breed'] = 'German Shepherd Dog'
df_dogs.loc[df_dogs["breed"] == 'Pit Bull', 'breed'] = 'American Pit Bull Terrier'
df_dogs.loc[df_dogs["breed"] == 'Miniature Poodle', 'breed'] = 'Pomeranian'
df_dogs.loc[df_dogs["breed"] == 'Chihuahua Shorthair', 'breed'] = 'Chihuahua'
df_dogs.loc[df_dogs["breed"] == 'Alaskan Husky', 'breed'] = 'Siberian Husky'
df_dogs.loc[df_dogs["breed"] == 'Chesa Bay Retr', 'breed'] = 'Chesapeake Bay Retriever'
df_dogs.loc[df_dogs["breed"] == 'Catahoula', 'breed'] = 'Catahoula Bulldog'
df_dogs.loc[df_dogs["breed"] == 'Wire Hair Fox Terrier', 'breed'] = 'Fox Terrier'
df_dogs.loc[df_dogs["breed"] == 'West Highland', 'breed'] = 'West Highland White Terrier'
df_dogs.loc[df_dogs["breed"] == 'Treeing Tennesse Brindle', 'breed'] = 'Treeing Tennessee Brindle'
df_dogs.loc[df_dogs["breed"] == 'Staffordshire', 'breed'] = 'Staffordshire Bull Terrier'
df_dogs.loc[df_dogs["breed"] == 'Chihuahua Longhair', 'breed'] = 'Chihuahua'
df_dogs.loc[df_dogs["breed"] == 'Dachshund Wirehair', 'breed'] = 'Dachshund'
df_dogs.loc[df_dogs["breed"] == 'Dachshund Longhair', 'breed'] = 'Dachshund'
df_dogs.loc[df_dogs["breed"] == 'Chinese Sharpei', 'breed'] = 'Chinese Shar-Pei'
df_dogs.loc[df_dogs["breed"] == 'Anatol Shepherd', 'breed'] = 'Anatolian Shepherd Dog'
df_dogs.loc[df_dogs["breed"] == 'Plott Hound', 'breed'] = 'Plott'
df_dogs.loc[df_dogs["breed"] == 'Doberman Pinsch', 'breed'] = 'Doberman Pinscher'
df_dogs.loc[df_dogs["breed"] == 'Chinese Sharpei', 'breed'] = 'Chinese Shar-Pei'

# Merge
df_dogs = pd.merge(df_dogs, dogs_cleaned, on='breed', how='left')


In [5]:
# helper dataframe for refinement
df_no_breed_info = df_dogs[df_dogs['Detailed Description Link'].isnull()] 
df_no_breed_info['Frequency'] = df_no_breed_info.groupby('breed')['breed'].transform('count')
df_no_breed_info.sort_values('Frequency', ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_no_breed_info['Frequency'] = df_no_breed_info.groupby('breed')['breed'].transform('count')


Unnamed: 0,animal_id_intake,intake_number,breed,mixed_breed,Detailed Description Link,Dog Size,Dog Breed Group,Height,"Avg. Height, cm",Weight,...,Potential For Mouthiness,Prey Drive,Tendency To Bark Or Howl,Wanderlust Potential,Physical Needs,Energy Level,Intensity,Exercise Needs,Potential For Playfulness,Frequency
21,A230482,1.0,Queensland Heeler,False,,,,,,,...,,,,,,,,,,183
19377,A700890,1.0,Queensland Heeler,True,,,,,,,...,,,,,,,,,,183
22033,A707873,1.0,Queensland Heeler,False,,,,,,,...,,,,,,,,,,183
20701,A704303,1.0,Queensland Heeler,True,,,,,,,...,,,,,,,,,,183
20084,A702593,1.0,Queensland Heeler,True,,,,,,,...,,,,,,,,,,183
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4986,A667463,1.0,Eng Toy Spaniel,True,,,,,,,...,,,,,,,,,,1
16117,A694112,1.0,Sealyham Terr,True,,,,,,,...,,,,,,,,,,1
6832,A671428,1.0,Swiss Hound,True,,,,,,,...,,,,,,,,,,1
34828,A740412,1.0,Grand Basset Griffon Vendeen,True,,,,,,,...,,,,,,,,,,1


### Drop dogs with unknown breed characteristics

In [6]:
df_dogs = df_dogs[df_dogs["Detailed Description Link"].isna()==False]
df_dogs
# About 3k are dropped, 43k remain

Unnamed: 0,animal_id_intake,intake_number,breed,mixed_breed,Detailed Description Link,Dog Size,Dog Breed Group,Height,"Avg. Height, cm",Weight,...,Intelligence,Potential For Mouthiness,Prey Drive,Tendency To Bark Or Howl,Wanderlust Potential,Physical Needs,Energy Level,Intensity,Exercise Needs,Potential For Playfulness
0,A006100,1.0,Spinone Italiano,True,https://dogtime.com/dog-breeds/spinone-italiano,Very Large,Sporting Dogs,22 to 28 inches,63.50,61 to 86 pounds,...,4.0,2.0,4.0,2.0,2.0,3.33,4.0,2.0,4.0,4.0
1,A006100,2.0,Spinone Italiano,True,https://dogtime.com/dog-breeds/spinone-italiano,Very Large,Sporting Dogs,22 to 28 inches,63.50,61 to 86 pounds,...,4.0,2.0,4.0,2.0,2.0,3.33,4.0,2.0,4.0,4.0
2,A006100,3.0,Spinone Italiano,True,https://dogtime.com/dog-breeds/spinone-italiano,Very Large,Sporting Dogs,22 to 28 inches,63.50,61 to 86 pounds,...,4.0,2.0,4.0,2.0,2.0,3.33,4.0,2.0,4.0,4.0
3,A047759,1.0,Dachshund,False,https://dogtime.com/dog-breeds/dachshund,Medium,Hound Dogs,8 inches to 9 inches tall at the shoulder,21.59,16 to 32 pounds,...,4.0,4.0,5.0,5.0,5.0,3.00,3.0,3.0,3.0,4.0
4,A134067,1.0,Shetland Sheepdog,False,https://dogtime.com/dog-breeds/shetland-sheepdog,Medium,Herding Dogs,13 to 16 inches tall at the shoulder,36.83,Starts at 20 pounds,...,5.0,1.0,3.0,4.0,2.0,3.33,4.0,2.0,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45361,A769042,1.0,Pomeranian,True,https://dogtime.com/dog-breeds/pomeranian,Very Small,Companion Dogs,7 to 12 inches tall at the shoulder,24.13,3 to 7 pounds,...,4.0,2.0,2.0,5.0,1.0,2.33,3.0,2.0,2.0,3.0
45362,A769043,1.0,Beagle,True,https://dogtime.com/dog-breeds/beagle,Medium,Hound Dogs,13 to 15 inches tall at the shoulder,35.56,18 to 30 pounds,...,4.0,3.0,5.0,5.0,5.0,4.33,4.0,5.0,4.0,5.0
45363,A769047,1.0,Border Collie,True,https://dogtime.com/dog-breeds/border-collie,Large,Herding Dogs,18 to 22 inches tall at the shoulder,50.80,30 to 45 pounds,...,5.0,3.0,3.0,2.0,3.0,4.33,5.0,3.0,5.0,5.0
45364,A769066,1.0,Labrador Retriever,True,https://dogtime.com/dog-breeds/labrador-retriever,Very Large,Sporting Dogs,21 to 24 inches at the shoulder,57.15,55 to 80 pounds,...,5.0,5.0,2.0,4.0,3.0,5.00,5.0,5.0,5.0,5.0


In [7]:
df_dogs.columns

Index(['animal_id_intake', 'intake_number', 'breed', 'mixed_breed',
       'Detailed Description Link', 'Dog Size', 'Dog Breed Group', 'Height',
       'Avg. Height, cm', 'Weight', 'Avg. Weight, kg', 'Life Span',
       'Avg. Life Span, years', 'Adaptability',
       'Adapts Well To Apartment Living', 'Good For Novice Owners',
       'Sensitivity Level', 'Tolerates Being Alone', 'Tolerates Cold Weather',
       'Tolerates Hot Weather', 'All Around Friendliness',
       'Affectionate With Family', 'Kid-Friendly', 'Dog Friendly',
       'Friendly Toward Strangers', 'Health And Grooming Needs',
       'Amount Of Shedding', 'Drooling Potential', 'Easy To Groom',
       'General Health', 'Potential For Weight Gain', 'Size', 'Trainability',
       'Easy To Train', 'Intelligence', 'Potential For Mouthiness',
       'Prey Drive', 'Tendency To Bark Or Howl', 'Wanderlust Potential',
       'Physical Needs', 'Energy Level', 'Intensity', 'Exercise Needs',
       'Potential For Playfulness'],
     

In [8]:
# Reorder columns
# df_dogs = df_dogs[[
#     'animal_id_intake',
#     'intake_number',
#     'breed',
#     'mixed_breed',
#     'Dog Size',
#     'Avg. Height, cm',
#     'Avg. Weight, kg',
#     'Avg. Life Span, years',
#     'Adaptability',
#     'All Around Friendliness',
#     'Health And Grooming Needs',
#     'Trainability',
#     'Physical Needs'  
# ]]

df_dogs = df_dogs[[
    'animal_id_intake',
    'intake_number',
    'breed',
    'mixed_breed',
    'Dog Size',
    'Avg. Height, cm',
    'Avg. Weight, kg',
    'Avg. Life Span, years',
    'Adapts Well To Apartment Living', 'Good For Novice Owners',
   'Sensitivity Level', 'Tolerates Being Alone', 'Tolerates Cold Weather',
   'Tolerates Hot Weather', 'Affectionate With Family', 'Kid-Friendly',
   'Dog Friendly', 'Friendly Toward Strangers', 'Amount Of Shedding',
   'Drooling Potential', 'Easy To Groom', 'General Health',
   'Potential For Weight Gain', 'Size', 'Easy To Train', 'Intelligence',
   'Potential For Mouthiness', 'Prey Drive', 'Tendency To Bark Or Howl',
   'Wanderlust Potential', 'Energy Level', 'Intensity', 'Exercise Needs',
   'Potential For Playfulness'
]]


# Rename columns
df_dogs = df_dogs.rename(columns={
    'Dog Size': 'dog_size'})
df_dogs = df_dogs.rename(columns={
    'Avg. Height, cm': 'avg_height'})
df_dogs = df_dogs.rename(columns={
    'Avg. Weight, kg': 'avg_weight'})
df_dogs = df_dogs.rename(columns={
    'Avg. Life Span, years': 'avg_lifespan'})

# Switch enum size to Integer
df_dogs.loc[df_dogs["dog_size"] == 'Very Small', 'dog_size'] = '1'
df_dogs.loc[df_dogs["dog_size"] == 'Small', 'dog_size'] = '2'
df_dogs.loc[df_dogs["dog_size"] == 'Medium', 'dog_size'] = '3'
df_dogs.loc[df_dogs["dog_size"] == 'Large', 'dog_size'] = '4'
df_dogs.loc[df_dogs["dog_size"] == 'Very Large', 'dog_size'] = '5'

In [9]:
df_dogs

Unnamed: 0,animal_id_intake,intake_number,breed,mixed_breed,dog_size,avg_height,avg_weight,avg_lifespan,Adapts Well To Apartment Living,Good For Novice Owners,...,Easy To Train,Intelligence,Potential For Mouthiness,Prey Drive,Tendency To Bark Or Howl,Wanderlust Potential,Energy Level,Intensity,Exercise Needs,Potential For Playfulness
0,A006100,1.0,Spinone Italiano,True,5,63.50,33.08,12.0,4.0,4.0,...,4.0,4.0,2.0,4.0,2.0,2.0,4.0,2.0,4.0,4.0
1,A006100,2.0,Spinone Italiano,True,5,63.50,33.08,12.0,4.0,4.0,...,4.0,4.0,2.0,4.0,2.0,2.0,4.0,2.0,4.0,4.0
2,A006100,3.0,Spinone Italiano,True,5,63.50,33.08,12.0,4.0,4.0,...,4.0,4.0,2.0,4.0,2.0,2.0,4.0,2.0,4.0,4.0
3,A047759,1.0,Dachshund,False,3,21.59,10.80,14.0,5.0,4.0,...,2.0,4.0,4.0,5.0,5.0,5.0,3.0,3.0,3.0,4.0
4,A134067,1.0,Shetland Sheepdog,False,3,36.83,9.00,14.0,2.0,3.0,...,5.0,5.0,1.0,3.0,4.0,2.0,4.0,2.0,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45361,A769042,1.0,Pomeranian,True,1,24.13,2.25,14.0,4.0,4.0,...,4.0,4.0,2.0,2.0,5.0,1.0,3.0,2.0,2.0,3.0
45362,A769043,1.0,Beagle,True,3,35.56,10.80,12.0,4.0,3.0,...,1.0,4.0,3.0,5.0,5.0,5.0,4.0,5.0,4.0,5.0
45363,A769047,1.0,Border Collie,True,4,50.80,16.88,14.0,2.0,2.0,...,5.0,5.0,3.0,3.0,2.0,3.0,5.0,3.0,5.0,5.0
45364,A769066,1.0,Labrador Retriever,True,5,57.15,30.38,11.0,1.0,3.0,...,5.0,5.0,5.0,2.0,4.0,3.0,5.0,5.0,5.0,5.0


In [10]:
# Export to CSV
# df_dogs.to_csv("data/processed_dogs_cleaned.csv", sep='\t')
df_dogs.to_csv("data/processed_dogs_cleaned_sep.csv", sep='\t')