# Data Science Final Project - Shelter Animal Adoptions

## Questions:
## 1.  Are animal type, sex, and age significant predictors of shelter dog and cat adoption?
## 2.  What time of year are shelter dog and cat adoptions most popular?

## Import Packages

In [1]:
import pandas as pd   # for reading data, data wrangling, visualization, and analysis
import numpy as np   # for mathematical operations on arrays (in modeling)
from scipy import stats   # for independent chi-square (statistical tests)
import statsmodels as sm   # for mcnemar chi-square (statistical tests and models)
from statsmodels.stats.contingency_tables import mcnemar   # for mcnemar chi-square

## Import Data

In [2]:
adoptions = pd.read_csv("Austin_Animal_Center_Outcomes.csv")

In [3]:
adoptions.head()

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
0,A794011,Chunk,5/8/2019 18:20,19-May,5/2/2017,Rto-Adopt,,Cat,Neutered Male,2 years,Domestic Shorthair Mix,Brown Tabby/White
1,A776359,Gizmo,7/18/2018 16:02,18-Jul,7/12/2017,Adoption,,Dog,Neutered Male,1 year,Chihuahua Shorthair Mix,White/Brown
2,A821648,,8/16/2020 11:38,20-Aug,8/16/2019,Euthanasia,,Other,Unknown,1 year,Raccoon,Gray
3,A720371,Moose,2/13/2016 17:59,16-Feb,10/8/2015,Adoption,,Dog,Neutered Male,4 months,Anatol Shepherd/Labrador Retriever,Buff
4,A674754,,3/18/2014 11:47,14-Mar,3/12/2014,Transfer,Partner,Cat,Intact Male,6 days,Domestic Shorthair Mix,Orange Tabby


## Data Wrangling

### Keep only relevant columns

In [4]:
adoptions1 = adoptions[["MonthYear", "Outcome Type", "Animal Type", "Sex upon Outcome", "Age upon Outcome"]]

In [5]:
adoptions1.head()

Unnamed: 0,MonthYear,Outcome Type,Animal Type,Sex upon Outcome,Age upon Outcome
0,19-May,Rto-Adopt,Cat,Neutered Male,2 years
1,18-Jul,Adoption,Dog,Neutered Male,1 year
2,20-Aug,Euthanasia,Other,Unknown,1 year
3,16-Feb,Adoption,Dog,Neutered Male,4 months
4,14-Mar,Transfer,Cat,Intact Male,6 days


### Split MonthYear into 2 columns

In [6]:
adoptions2 = adoptions1["MonthYear"].str.split("-", expand=True)

In [7]:
adoptions2.head

<bound method NDFrame.head of          0    1
0       19  May
1       18  Jul
2       20  Aug
3       16  Feb
4       14  Mar
...     ..  ...
136751  22  Feb
136752  22  Feb
136753  22  Feb
136754  22  Feb
136755  22  Feb

[136756 rows x 2 columns]>

### Add new columns back to dataframe

In [8]:
adoptions3 = pd.concat([adoptions1, adoptions2], axis=1)

In [9]:
adoptions3.head()

Unnamed: 0,MonthYear,Outcome Type,Animal Type,Sex upon Outcome,Age upon Outcome,0,1
0,19-May,Rto-Adopt,Cat,Neutered Male,2 years,19,May
1,18-Jul,Adoption,Dog,Neutered Male,1 year,18,Jul
2,20-Aug,Euthanasia,Other,Unknown,1 year,20,Aug
3,16-Feb,Adoption,Dog,Neutered Male,4 months,16,Feb
4,14-Mar,Transfer,Cat,Intact Male,6 days,14,Mar


### Rename new columns and drop original date column

In [10]:
adoptions3.rename(columns={0: "Year", 1: "Month"}, inplace=True)
adoptions3.drop(["MonthYear"], axis=1, inplace=True)

In [11]:
adoptions3.head()

Unnamed: 0,Outcome Type,Animal Type,Sex upon Outcome,Age upon Outcome,Year,Month
0,Rto-Adopt,Cat,Neutered Male,2 years,19,May
1,Adoption,Dog,Neutered Male,1 year,18,Jul
2,Euthanasia,Other,Unknown,1 year,20,Aug
3,Adoption,Dog,Neutered Male,4 months,16,Feb
4,Transfer,Cat,Intact Male,6 days,14,Mar


### Recode and Group Sex upon Outcome column

In [12]:
adoptions3["Sex upon Outcome"].value_counts()

Neutered Male    48172
Spayed Female    43461
Intact Male      17371
Intact Female    16872
Unknown          10879
Name: Sex upon Outcome, dtype: int64

In [13]:
def sex (series): 
    if series == "Neutered Male" : 
        return "Male"
    if series == "Spayed Female": 
        return "Female"
    if series == "Intact Male" : 
        return "Male"
    if series == "Intact Female" : 
        return "Female"
    if series =="Unknown":
        return "Unknown"
    
adoptions3["Sex"] = adoptions3["Sex upon Outcome"].apply(sex)

In [14]:
adoptions3.head(10)

Unnamed: 0,Outcome Type,Animal Type,Sex upon Outcome,Age upon Outcome,Year,Month,Sex
0,Rto-Adopt,Cat,Neutered Male,2 years,19,May,Male
1,Adoption,Dog,Neutered Male,1 year,18,Jul,Male
2,Euthanasia,Other,Unknown,1 year,20,Aug,Unknown
3,Adoption,Dog,Neutered Male,4 months,16,Feb,Male
4,Transfer,Cat,Intact Male,6 days,14,Mar,Male
5,Adoption,Dog,Spayed Female,7 years,20,Oct,Female
6,Adoption,Dog,Neutered Male,2 years,20,May,Male
7,Adoption,Cat,Neutered Male,2 months,14,Oct,Male
8,Adoption,Cat,Neutered Male,2 months,14,Aug,Male
9,Adoption,Dog,Spayed Female,2 years,21,Aug,Female


### Drop Sex upon Outcome column

In [15]:
adoptions3.drop(["Sex upon Outcome"], axis=1, inplace=True)

In [16]:
adoptions3.head()

Unnamed: 0,Outcome Type,Animal Type,Age upon Outcome,Year,Month,Sex
0,Rto-Adopt,Cat,2 years,19,May,Male
1,Adoption,Dog,1 year,18,Jul,Male
2,Euthanasia,Other,1 year,20,Aug,Unknown
3,Adoption,Dog,4 months,16,Feb,Male
4,Transfer,Cat,6 days,14,Mar,Male


### Filter Animal Type to only include Dog and Cat

In [17]:
adoptions4 = adoptions3[(adoptions3["Animal Type"].isin(["Dog", "Cat"]))]

In [18]:
adoptions4.head(10)

Unnamed: 0,Outcome Type,Animal Type,Age upon Outcome,Year,Month,Sex
0,Rto-Adopt,Cat,2 years,19,May,Male
1,Adoption,Dog,1 year,18,Jul,Male
3,Adoption,Dog,4 months,16,Feb,Male
4,Transfer,Cat,6 days,14,Mar,Male
5,Adoption,Dog,7 years,20,Oct,Female
6,Adoption,Dog,2 years,20,May,Male
7,Adoption,Cat,2 months,14,Oct,Male
8,Adoption,Cat,2 months,14,Aug,Male
9,Adoption,Dog,2 years,21,Aug,Female
10,Transfer,Cat,2 years,14,Jul,Female


### Filter out partial years from Year column

In [19]:
adoptions4["Year"].value_counts().sort_index(ascending=True)

13     4265
14    17556
15    17606
16    16572
17    16518
18    15791
19    18861
20     9046
21    11267
22     1377
Name: Year, dtype: int64

In [20]:
adoptions5 = adoptions4[(adoptions4["Year"].isin(["14", "15", "16", "17", "18", "19", "20", "21"]))]

In [21]:
adoptions5["Year"].unique()

array(['19', '18', '16', '14', '20', '21', '17', '15'], dtype=object)

In [22]:
adoptions5.shape

(123217, 6)

In [23]:
adoptions5["Year"].value_counts().sort_index(ascending=True)

14    17556
15    17606
16    16572
17    16518
18    15791
19    18861
20     9046
21    11267
Name: Year, dtype: int64

In [24]:
adoptions5.head()

Unnamed: 0,Outcome Type,Animal Type,Age upon Outcome,Year,Month,Sex
0,Rto-Adopt,Cat,2 years,19,May,Male
1,Adoption,Dog,1 year,18,Jul,Male
3,Adoption,Dog,4 months,16,Feb,Male
4,Transfer,Cat,6 days,14,Mar,Male
5,Adoption,Dog,7 years,20,Oct,Female


### Drop outlier rows from Age upon Outcome column

In [25]:
adoptions5["Age upon Outcome"].unique()

array(['2 years', '1 year', '4 months', '6 days', '7 years', '2 months',
       '2 days', '3 weeks', '9 months', '4 weeks', '2 weeks', '3 months',
       '9 years', '10 years', '6 months', '8 years', '3 years',
       '7 months', '6 years', '4 years', '1 month', '12 years', '5 years',
       '1 weeks', '5 months', '5 days', '15 years', '11 months',
       '10 months', '4 days', '16 years', '1 day', '8 months', '11 years',
       '13 years', '1 week', '14 years', '3 days', '0 years', '5 weeks',
       '17 years', '20 years', '18 years', '22 years', '-2 years',
       '19 years', '23 years', '24 years', '-1 years', '-3 years'],
      dtype=object)

In [26]:
adoptions5["Age upon Outcome"].value_counts()

1 year       19328
2 years      18392
2 months     16181
3 years       7430
3 months      6251
1 month       5639
4 years       4436
4 months      4170
5 years       4166
5 months      2994
6 years       2770
6 months      2674
8 years       2430
7 years       2396
3 weeks       2269
2 weeks       2185
8 months      1992
10 years      1980
10 months     1843
7 months      1634
4 weeks       1535
9 years       1352
9 months      1303
12 years       947
1 weeks        935
11 months      809
11 years       774
1 week         701
13 years       624
14 years       419
3 days         384
2 days         354
15 years       345
1 day          285
6 days         249
4 days         228
0 years        167
16 years       156
5 weeks        147
5 days         143
17 years        88
18 years        51
19 years        26
20 years        19
-1 years         6
22 years         6
23 years         1
24 years         1
-2 years         1
-3 years         1
Name: Age upon Outcome, dtype: int64

In [27]:
indexNames = adoptions5[adoptions5["Age upon Outcome"] == "22 years"].index

In [28]:
adoptions5.drop(indexNames, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [29]:
adoptions5["Age upon Outcome"].value_counts()

1 year       19328
2 years      18392
2 months     16181
3 years       7430
3 months      6251
1 month       5639
4 years       4436
4 months      4170
5 years       4166
5 months      2994
6 years       2770
6 months      2674
8 years       2430
7 years       2396
3 weeks       2269
2 weeks       2185
8 months      1992
10 years      1980
10 months     1843
7 months      1634
4 weeks       1535
9 years       1352
9 months      1303
12 years       947
1 weeks        935
11 months      809
11 years       774
1 week         701
13 years       624
14 years       419
3 days         384
2 days         354
15 years       345
1 day          285
6 days         249
4 days         228
0 years        167
16 years       156
5 weeks        147
5 days         143
17 years        88
18 years        51
19 years        26
20 years        19
-1 years         6
23 years         1
24 years         1
-2 years         1
-3 years         1
Name: Age upon Outcome, dtype: int64

In [30]:
indexNames1 = adoptions5[adoptions5["Age upon Outcome"] == "-1 years"].index
indexNames2 = adoptions5[adoptions5["Age upon Outcome"] == "23 years"].index
indexNames3 = adoptions5[adoptions5["Age upon Outcome"] == "-3 years"].index
indexNames4 = adoptions5[adoptions5["Age upon Outcome"] == "24 years"].index
indexNames5 = adoptions5[adoptions5["Age upon Outcome"] == "-2 years"].index

In [31]:
adoptions5.drop(indexNames1, inplace=True)
adoptions5.drop(indexNames2, inplace=True)
adoptions5.drop(indexNames3, inplace=True)
adoptions5.drop(indexNames4, inplace=True)
adoptions5.drop(indexNames5, inplace=True)

In [32]:
adoptions5["Age upon Outcome"].value_counts()

1 year       19328
2 years      18392
2 months     16181
3 years       7430
3 months      6251
1 month       5639
4 years       4436
4 months      4170
5 years       4166
5 months      2994
6 years       2770
6 months      2674
8 years       2430
7 years       2396
3 weeks       2269
2 weeks       2185
8 months      1992
10 years      1980
10 months     1843
7 months      1634
4 weeks       1535
9 years       1352
9 months      1303
12 years       947
1 weeks        935
11 months      809
11 years       774
1 week         701
13 years       624
14 years       419
3 days         384
2 days         354
15 years       345
1 day          285
6 days         249
4 days         228
0 years        167
16 years       156
5 weeks        147
5 days         143
17 years        88
18 years        51
19 years        26
20 years        19
Name: Age upon Outcome, dtype: int64

### Recode and Group Age upon Outcome column

In [33]:
def age (series): 
    if series == "1 year" : 
        return "0-1 years"
    if series == "2 years": 
        return "2-5 years"
    if series == "2 months" : 
        return "0-1 years"
    if series == "3 years" : 
        return "2-5 years"
    if series =="3 months":
        return "0-1 years"
    if series =="1 month":
        return "0-1 years"
    if series =="4 years":
        return "2-5 years"
    if series =="4 months":
        return "0-1 years"
    if series =="5 years":
        return "2-5 years"
    if series =="5 months":
        return "0-1 years"
    if series =="6 years":
        return "6-9 years"
    if series =="6 months":
        return "0-1 years"
    if series =="8 years":
        return "6-9 years"
    if series =="7 years":
        return "6-9 years"
    if series =="3 weeks":
        return "0-1 years"
    if series =="2 weeks":
        return "0-1 years"
    if series =="8 months":
        return "0-1 years"
    if series =="10 years":
        return "10-15 years"
    if series =="10 months":
        return "0-1 years"
    if series =="7 months":
        return "0-1 years"
    if series =="4 weeks":
        return "0-1 years"
    if series =="9 years":
        return "6-9 years"
    if series =="9 months":
        return "0-1 years"
    if series =="12 years":
        return "10-15 years"
    if series =="1 weeks":
        return "0-1 years"
    if series =="11 months":
        return "0-1 years"
    if series =="11 years":
        return "10-15 years"
    if series =="13 years":
        return "10-15 years"
    if series =="14 years":
        return "10-15 years"
    if series =="3 days":
        return "0-1 years"
    if series =="2 days":
        return "0-1 years"
    if series =="15 years":
        return "10-15 years"
    if series =="1 day":
        return "0-1 years"
    if series =="6 days":
        return "0-1 years"
    if series =="4 days":
        return "0-1 years"
    if series =="16 years":
        return "16+ years"
    if series =="5 weeks":
        return "0-1 years"
    if series =="5 days":
        return "0-1 years"
    if series =="17 years":
        return "16+ years"
    if series =="18 years":
        return "16+ years"
    if series =="19 years":
        return "16+ years"
    if series =="20 years":
        return "16+ years"
     
adoptions5["Age"] = adoptions5["Age upon Outcome"].apply(age)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adoptions5["Age"] = adoptions5["Age upon Outcome"].apply(age)


In [35]:
adoptions5["Age"].value_counts()

0-1 years      73532
2-5 years      34424
6-9 years       8948
10-15 years     5089
16+ years        340
Name: Age, dtype: int64

### Drop Age upon Outcome column

In [37]:
adoptions5.drop(["Age upon Outcome"], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [38]:
adoptions5.head()

Unnamed: 0,Outcome Type,Animal Type,Year,Month,Sex,Age
0,Rto-Adopt,Cat,19,May,Male,2-5 years
1,Adoption,Dog,18,Jul,Male,0-1 years
3,Adoption,Dog,16,Feb,Male,0-1 years
4,Transfer,Cat,14,Mar,Male,0-1 years
5,Adoption,Dog,20,Oct,Female,6-9 years


### Recode Outcome Type to Adoption or Not Adoption

In [40]:
adoptions5["Outcome Type"].value_counts()

Adoption           58910
Transfer           36797
Return to Owner    21731
Euthanasia          3687
Died                1033
Rto-Adopt            819
Disposal             137
Missing               65
Relocate               4
Name: Outcome Type, dtype: int64

In [41]:
def adopt (series): 
    if series == "Adoption" : 
        return 1
    else:
        return 0
    
adoptions5["AdoptionYN"] = adoptions5["Outcome Type"].apply(adopt)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adoptions5["AdoptionYN"] = adoptions5["Outcome Type"].apply(adopt)


In [42]:
adoptions5.head()

Unnamed: 0,Outcome Type,Animal Type,Year,Month,Sex,Age,AdoptionYN
0,Rto-Adopt,Cat,19,May,Male,2-5 years,0
1,Adoption,Dog,18,Jul,Male,0-1 years,1
3,Adoption,Dog,16,Feb,Male,0-1 years,1
4,Transfer,Cat,14,Mar,Male,0-1 years,0
5,Adoption,Dog,20,Oct,Female,6-9 years,1


In [43]:
adoptions5["AdoptionYN"].value_counts()

0    64291
1    58910
Name: AdoptionYN, dtype: int64

### Drop Outcome Type column

In [45]:
adoptions5.drop(["Outcome Type"], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [46]:
adoptions5.head()

Unnamed: 0,Animal Type,Year,Month,Sex,Age,AdoptionYN
0,Cat,19,May,Male,2-5 years,0
1,Dog,18,Jul,Male,0-1 years,1
3,Dog,16,Feb,Male,0-1 years,1
4,Cat,14,Mar,Male,0-1 years,0
5,Dog,20,Oct,Female,6-9 years,1


### Recode Animal Type to a numeric variable

In [48]:
adoptions5.dtypes

Animal Type    object
Year           object
Month          object
Sex            object
Age            object
AdoptionYN      int64
dtype: object

In [49]:
def type_recode (series):
    if series == "Cat":
        return 0
    if series == "Dog":
        return 1
    
adoptions5['typeR'] = adoptions5['Animal Type'].apply(type_recode)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adoptions5['typeR'] = adoptions5['Animal Type'].apply(type_recode)


In [50]:
adoptions5.head()

Unnamed: 0,Animal Type,Year,Month,Sex,Age,AdoptionYN,typeR
0,Cat,19,May,Male,2-5 years,0,0
1,Dog,18,Jul,Male,0-1 years,1,1
3,Dog,16,Feb,Male,0-1 years,1,1
4,Cat,14,Mar,Male,0-1 years,0,0
5,Dog,20,Oct,Female,6-9 years,1,1


### Convert data type of Year to integer

In [52]:
adoptions5['Year'] = adoptions5['Year'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adoptions5['Year'] = adoptions5['Year'].astype(int)


In [53]:
adoptions5["Year"].dtypes

dtype('int32')

### Drop "unknown" rows from Sex column

In [55]:
adoptions5["Sex"].value_counts()

Male       61827
Female     56912
Unknown     4462
Name: Sex, dtype: int64

In [56]:
indexNames = adoptions5[adoptions5["Sex"] == "Unknown"].index

In [57]:
adoptions5.drop(indexNames, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [58]:
adoptions5["Sex"].value_counts()

Male      61827
Female    56912
Name: Sex, dtype: int64

### Recode Sex to a numeric variable

In [60]:
def sex_recode (series):
    if series == "Male":
        return 0
    if series == "Female":
        return 1
    
adoptions5['sexR'] = adoptions5['Sex'].apply(sex_recode)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adoptions5['sexR'] = adoptions5['Sex'].apply(sex_recode)


In [61]:
adoptions5.head()

Unnamed: 0,Animal Type,Year,Month,Sex,Age,AdoptionYN,typeR,sexR
0,Cat,19,May,Male,2-5 years,0,0,0
1,Dog,18,Jul,Male,0-1 years,1,1,0
3,Dog,16,Feb,Male,0-1 years,1,1,0
4,Cat,14,Mar,Male,0-1 years,0,0,0
5,Dog,20,Oct,Female,6-9 years,1,1,1


### Recode Age to a numeric variable

In [63]:
adoptions5['Age'].value_counts()

0-1 years      70101
2-5 years      33931
6-9 years       8910
10-15 years     5054
16+ years        337
Name: Age, dtype: int64

In [64]:
adoptions5['Age'].dtypes

dtype('O')

In [65]:
def age_recode (series):
    if series == "0-1 years":
        return 0
    if series == "2-5 years":
        return 1
    if series == "6-9 years":
        return 2
    if series == "10-15 years":
        return 3
    if series == "16+ years":
        return 4
    
adoptions5['ageR'] = adoptions5['Age'].apply(age_recode)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adoptions5['ageR'] = adoptions5['Age'].apply(age_recode)


In [66]:
adoptions5.head()

Unnamed: 0,Animal Type,Year,Month,Sex,Age,AdoptionYN,typeR,sexR,ageR
0,Cat,19,May,Male,2-5 years,0,0,0,1.0
1,Dog,18,Jul,Male,0-1 years,1,1,0,0.0
3,Dog,16,Feb,Male,0-1 years,1,1,0,0.0
4,Cat,14,Mar,Male,0-1 years,0,0,0,0.0
5,Dog,20,Oct,Female,6-9 years,1,1,1,2.0


In [67]:
adoptions5['ageR'].value_counts()

0.0    70101
1.0    33931
2.0     8910
3.0     5054
4.0      337
Name: ageR, dtype: int64

In [68]:
adoptions5.dtypes

Animal Type     object
Year             int32
Month           object
Sex             object
Age             object
AdoptionYN       int64
typeR            int64
sexR             int64
ageR           float64
dtype: object