# Data Science Final Project - Shelter Animal Adoptions

## Analyses in Python

## Q1. Are animal type, sex, age, and color significant predictors of shelter dog and cat adoption?

### Import packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder

### Load Data

In [2]:
adoptions7 = pd.read_csv("C:/Users/bueno/OneDrive/Documents/GitHub/FinalProject/data/adoptions5.csv")

In [3]:
adoptions7.head()

Unnamed: 0.1,Unnamed: 0,Animal Type,Color,Year,Month,Sex,Age,AdoptionYN,typeR,sexR,ageR,colorR
0,0,Cat,Brown,19,May,Male,2-5 years,0,0,0,1.0,4
1,1,Dog,White,18,Jul,Male,0-1 years,1,1,0,0.0,25
2,3,Dog,Buff,16,Feb,Male,0-1 years,1,1,0,0.0,5
3,4,Cat,Orange,14,Mar,Male,0-1 years,0,0,0,0.0,16
4,5,Dog,Brown,20,Oct,Female,6-9 years,1,1,1,2.0,4


## Data Wrangling

### Drop "Unnamed:0" column

In [4]:
adoptions7.drop('Unnamed: 0', axis=1, inplace=True)

In [5]:
adoptions7.head()

Unnamed: 0,Animal Type,Color,Year,Month,Sex,Age,AdoptionYN,typeR,sexR,ageR,colorR
0,Cat,Brown,19,May,Male,2-5 years,0,0,0,1.0,4
1,Dog,White,18,Jul,Male,0-1 years,1,1,0,0.0,25
2,Dog,Buff,16,Feb,Male,0-1 years,1,1,0,0.0,5
3,Cat,Orange,14,Mar,Male,0-1 years,0,0,0,0.0,16
4,Dog,Brown,20,Oct,Female,6-9 years,1,1,1,2.0,4


### Drop null or infinite values

In [6]:
adoptions7.replace([np.inf, -np.inf], np.nan, inplace=True)

In [7]:
adoptions7 = adoptions7.reset_index()

In [8]:
adoptions7 = adoptions7.dropna()

In [9]:
adoptions7.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118333 entries, 0 to 118738
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   index        118333 non-null  int64  
 1   Animal Type  118333 non-null  object 
 2   Color        118333 non-null  object 
 3   Year         118333 non-null  int64  
 4   Month        118333 non-null  object 
 5   Sex          118333 non-null  object 
 6   Age          118333 non-null  object 
 7   AdoptionYN   118333 non-null  int64  
 8   typeR        118333 non-null  int64  
 9   sexR         118333 non-null  int64  
 10  ageR         118333 non-null  float64
 11  colorR       118333 non-null  int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 11.7+ MB


### Convert float to int

In [10]:
adoptions7['ageR'] = adoptions7['ageR'].astype(int)

In [11]:
adoptions7.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 118333 entries, 0 to 118738
Data columns (total 12 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   index        118333 non-null  int64 
 1   Animal Type  118333 non-null  object
 2   Color        118333 non-null  object
 3   Year         118333 non-null  int64 
 4   Month        118333 non-null  object
 5   Sex          118333 non-null  object
 6   Age          118333 non-null  object
 7   AdoptionYN   118333 non-null  int64 
 8   typeR        118333 non-null  int64 
 9   sexR         118333 non-null  int64 
 10  ageR         118333 non-null  int32 
 11  colorR       118333 non-null  int64 
dtypes: int32(1), int64(6), object(5)
memory usage: 11.3+ MB


## Python Analysis 1

## Create a Random Forest to classify predictor variables

### Subset data into x and y variables

In [12]:
x = adoptions7.drop(['Animal Type', 'Color', 'Year', 'Month', 'Sex', 'Age', 'AdoptionYN'], axis=1)
y = adoptions7['AdoptionYN']

### Train Test Split

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3)

### Random Forest Model

In [14]:
forest = RandomForestClassifier(n_estimators=500)
forest.fit(x_train, y_train)

RandomForestClassifier(n_estimators=500)

### Create predictions set and print confusion matrix and classification report

In [15]:
forestPredictions = forest.predict(x_test)
print(confusion_matrix(y_test, forestPredictions))
print(classification_report(y_test, forestPredictions))

[[9576 8227]
 [8453 9244]]
              precision    recall  f1-score   support

           0       0.53      0.54      0.53     17803
           1       0.53      0.52      0.53     17697

    accuracy                           0.53     35500
   macro avg       0.53      0.53      0.53     35500
weighted avg       0.53      0.53      0.53     35500



### Discovery:  Since the predictor variables don't influence each other, a random forest model is not an effective predictive model.