In [1]:
# Import dependencies

import pandas as pd
import psycopg2
import numpy as np
import pickle
import json
from sqlalchemy import create_engine
from config import my_password
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Read dog adoptions DataFrame from PostgreSQL
db_string = f"postgresql://postgres:{my_password}@127.0.0.1:5432/PetFindingDB"
engine = create_engine(db_string)
dog_adoptions = pd.read_sql("select * from \"learning_table\"", con=engine)
dog_adoptions.head()

Unnamed: 0,age,gender,size,status_changed_at,published_at,breeds_primary,breeds_mixed,breeds_unknown,spayed_neutered,house_trained,special_needs,shots_current,location,population
0,Young,Male,Medium,2021-09-17T16:17:43+0000,2021-09-02T15:02:30+0000,Border Collie,True,False,False,True,False,True,"Abilene, KS",6201.0
1,Baby,Female,Large,2018-07-21T14:54:35+0000,2018-07-16T20:20:09+0000,Anatolian Shepherd,True,False,False,True,False,True,"Abilene, KS",6201.0
2,Young,Female,Medium,2018-10-12T16:08:01+0000,2018-10-10T19:56:20+0000,Collie,True,False,False,False,False,True,"Abilene, KS",6201.0
3,Adult,Female,Large,2018-10-12T16:08:14+0000,2018-10-10T20:37:36+0000,Labrador Retriever,True,False,False,True,False,True,"Abilene, KS",6201.0
4,Adult,Female,Medium,2018-10-18T12:44:50+0000,2018-10-10T20:52:55+0000,Labrador Retriever,True,False,False,True,False,True,"Abilene, KS",6201.0


In [3]:
# See NaN values

dog_adoptions.isna().sum(axis=0)

age                     0
gender                  0
size                    0
status_changed_at       0
published_at            0
breeds_primary          0
breeds_mixed            0
breeds_unknown          0
spayed_neutered         0
house_trained           0
special_needs           0
shots_current           0
location                0
population           8882
dtype: int64

In [4]:
# Drop any rows with NaN values and confirm they have been dropped
dog_adoptions=dog_adoptions.dropna(how='any')
dog_adoptions.isna().sum(axis=0)

age                  0
gender               0
size                 0
status_changed_at    0
published_at         0
breeds_primary       0
breeds_mixed         0
breeds_unknown       0
spayed_neutered      0
house_trained        0
special_needs        0
shots_current        0
location             0
population           0
dtype: int64

In [5]:
# Get length of clean_dogs DataFrame

len(dog_adoptions.index)

141818

In [6]:
# Change status changed at (to adopted) and published at to datetime and subtract to calculate length of stay and drop columns

dog_adoptions[['status_changed_at', 'published_at']] = dog_adoptions[['status_changed_at', 'published_at']].apply(pd.to_datetime)
dog_adoptions['duration'] = (dog_adoptions['status_changed_at'] - dog_adoptions['published_at']).dt.days
dog_adoptions = dog_adoptions.drop(['status_changed_at', 'published_at', 'location'], axis=1)
dog_adoptions.head()

Unnamed: 0,age,gender,size,breeds_primary,breeds_mixed,breeds_unknown,spayed_neutered,house_trained,special_needs,shots_current,population,duration
0,Young,Male,Medium,Border Collie,True,False,False,True,False,True,6201.0,15
1,Baby,Female,Large,Anatolian Shepherd,True,False,False,True,False,True,6201.0,4
2,Young,Female,Medium,Collie,True,False,False,False,False,True,6201.0,1
3,Adult,Female,Large,Labrador Retriever,True,False,False,True,False,True,6201.0,1
4,Adult,Female,Medium,Labrador Retriever,True,False,False,True,False,True,6201.0,7


In [7]:
# Drop all rows with a length of stay of 0
dog_adoptions = dog_adoptions[dog_adoptions['duration'] > 0]
dog_adoptions.head()

Unnamed: 0,age,gender,size,breeds_primary,breeds_mixed,breeds_unknown,spayed_neutered,house_trained,special_needs,shots_current,population,duration
0,Young,Male,Medium,Border Collie,True,False,False,True,False,True,6201.0,15
1,Baby,Female,Large,Anatolian Shepherd,True,False,False,True,False,True,6201.0,4
2,Young,Female,Medium,Collie,True,False,False,False,False,True,6201.0,1
3,Adult,Female,Large,Labrador Retriever,True,False,False,True,False,True,6201.0,1
4,Adult,Female,Medium,Labrador Retriever,True,False,False,True,False,True,6201.0,7


In [8]:
# Get length of clean_dogs DataFrame

len(dog_adoptions.index)

132150

In [9]:
# Make copy of adoptions_df and encode boolean values
encoded_df = dog_adoptions.copy()
dog_adoptions[['breeds_mixed', 'breeds_unknown', 'spayed_neutered','house_trained', 'special_needs', 'shots_current']] = encoded_df[['breeds_mixed', 'breeds_unknown', 'spayed_neutered','house_trained', 'special_needs', 'shots_current']].astype(int)
encoded_df.head()

Unnamed: 0,age,gender,size,breeds_primary,breeds_mixed,breeds_unknown,spayed_neutered,house_trained,special_needs,shots_current,population,duration
0,Young,Male,Medium,Border Collie,True,False,False,True,False,True,6201.0,15
1,Baby,Female,Large,Anatolian Shepherd,True,False,False,True,False,True,6201.0,4
2,Young,Female,Medium,Collie,True,False,False,False,False,True,6201.0,1
3,Adult,Female,Large,Labrador Retriever,True,False,False,True,False,True,6201.0,1
4,Adult,Female,Medium,Labrador Retriever,True,False,False,True,False,True,6201.0,7


In [10]:
# Encode gender column
encoded_df['gender'] = encoded_df['gender'].replace(['Female', 'Male'], [0,1])
encoded_df.head()

Unnamed: 0,age,gender,size,breeds_primary,breeds_mixed,breeds_unknown,spayed_neutered,house_trained,special_needs,shots_current,population,duration
0,Young,1,Medium,Border Collie,True,False,False,True,False,True,6201.0,15
1,Baby,0,Large,Anatolian Shepherd,True,False,False,True,False,True,6201.0,4
2,Young,0,Medium,Collie,True,False,False,False,False,True,6201.0,1
3,Adult,0,Large,Labrador Retriever,True,False,False,True,False,True,6201.0,1
4,Adult,0,Medium,Labrador Retriever,True,False,False,True,False,True,6201.0,7


In [11]:
# Encode age and size columns

encoded_df = pd.get_dummies(encoded_df, columns=['age', 'size'])
encoded_df.head()

Unnamed: 0,gender,breeds_primary,breeds_mixed,breeds_unknown,spayed_neutered,house_trained,special_needs,shots_current,population,duration,age_Adult,age_Baby,age_Senior,age_Young,size_Extra Large,size_Large,size_Medium,size_Small
0,1,Border Collie,True,False,False,True,False,True,6201.0,15,0,0,0,1,0,0,1,0
1,0,Anatolian Shepherd,True,False,False,True,False,True,6201.0,4,0,1,0,0,0,1,0,0
2,0,Collie,True,False,False,False,False,True,6201.0,1,0,0,0,1,0,0,1,0
3,0,Labrador Retriever,True,False,False,True,False,True,6201.0,1,1,0,0,0,0,1,0,0
4,0,Labrador Retriever,True,False,False,True,False,True,6201.0,7,1,0,0,0,0,0,1,0


In [12]:
encoded_df['breed_pitbull'] = np.where(encoded_df['breeds_primary'] == 'Pit Bull Terrier', 1, 0)
encoded_df = encoded_df.drop(columns=["breeds_primary"])
encoded_df.head()

Unnamed: 0,gender,breeds_mixed,breeds_unknown,spayed_neutered,house_trained,special_needs,shots_current,population,duration,age_Adult,age_Baby,age_Senior,age_Young,size_Extra Large,size_Large,size_Medium,size_Small,breed_pitbull
0,1,True,False,False,True,False,True,6201.0,15,0,0,0,1,0,0,1,0,0
1,0,True,False,False,True,False,True,6201.0,4,0,1,0,0,0,1,0,0,0
2,0,True,False,False,False,False,True,6201.0,1,0,0,0,1,0,0,1,0,0
3,0,True,False,False,True,False,True,6201.0,1,1,0,0,0,0,1,0,0,0
4,0,True,False,False,True,False,True,6201.0,7,1,0,0,0,0,0,1,0,0


In [13]:
def bucketPopulation(row):
    if row['population'] > 0 and row['population'] <= 10000:
        return '0 to 10,000'
    elif row['population'] > 10000 and row['population'] <= 50000:
        return '10,000 to 50,000'
    elif row['population'] > 50000 and row['population'] <= 100000:
        return '50,000 to 100,000'
    return 'greater than 100,000'

In [14]:
encoded_df['bucketed_population'] = encoded_df.apply(lambda row: bucketPopulation(row), axis=1)
encoded_df.drop('population', axis=1, inplace=True)
encoded_df.head()

Unnamed: 0,gender,breeds_mixed,breeds_unknown,spayed_neutered,house_trained,special_needs,shots_current,duration,age_Adult,age_Baby,age_Senior,age_Young,size_Extra Large,size_Large,size_Medium,size_Small,breed_pitbull,bucketed_population
0,1,True,False,False,True,False,True,15,0,0,0,1,0,0,1,0,0,"0 to 10,000"
1,0,True,False,False,True,False,True,4,0,1,0,0,0,1,0,0,0,"0 to 10,000"
2,0,True,False,False,False,False,True,1,0,0,0,1,0,0,1,0,0,"0 to 10,000"
3,0,True,False,False,True,False,True,1,1,0,0,0,0,1,0,0,0,"0 to 10,000"
4,0,True,False,False,True,False,True,7,1,0,0,0,0,0,1,0,0,"0 to 10,000"


In [15]:
# Encode age and size columns

encoded_df = pd.get_dummies(encoded_df, columns=['bucketed_population'])
encoded_df.head()

Unnamed: 0,gender,breeds_mixed,breeds_unknown,spayed_neutered,house_trained,special_needs,shots_current,duration,age_Adult,age_Baby,...,age_Young,size_Extra Large,size_Large,size_Medium,size_Small,breed_pitbull,"bucketed_population_0 to 10,000","bucketed_population_10,000 to 50,000","bucketed_population_50,000 to 100,000","bucketed_population_greater than 100,000"
0,1,True,False,False,True,False,True,15,0,0,...,1,0,0,1,0,0,1,0,0,0
1,0,True,False,False,True,False,True,4,0,1,...,0,0,1,0,0,0,1,0,0,0
2,0,True,False,False,False,False,True,1,0,0,...,1,0,0,1,0,0,1,0,0,0
3,0,True,False,False,True,False,True,1,1,0,...,0,0,1,0,0,0,1,0,0,0
4,0,True,False,False,True,False,True,7,1,0,...,0,0,0,1,0,0,1,0,0,0


In [16]:
encoded_df['duration'] = np.where(encoded_df['duration'] <= 7, 1, 0)
encoded_df.head()

Unnamed: 0,gender,breeds_mixed,breeds_unknown,spayed_neutered,house_trained,special_needs,shots_current,duration,age_Adult,age_Baby,...,age_Young,size_Extra Large,size_Large,size_Medium,size_Small,breed_pitbull,"bucketed_population_0 to 10,000","bucketed_population_10,000 to 50,000","bucketed_population_50,000 to 100,000","bucketed_population_greater than 100,000"
0,1,True,False,False,True,False,True,0,0,0,...,1,0,0,1,0,0,1,0,0,0
1,0,True,False,False,True,False,True,1,0,1,...,0,0,1,0,0,0,1,0,0,0
2,0,True,False,False,False,False,True,1,0,0,...,1,0,0,1,0,0,1,0,0,0
3,0,True,False,False,True,False,True,1,1,0,...,0,0,1,0,0,0,1,0,0,0
4,0,True,False,False,True,False,True,1,1,0,...,0,0,0,1,0,0,1,0,0,0


In [80]:
standard_format = encoded_df.head()

In [81]:
# Read clean_dog_adoptions DataFrame into SQL
db_string = f"postgresql://postgres:{my_password}@127.0.0.1:5432/PetFindingDB"
engine = create_engine(db_string)
standard_format.to_sql(name='standard_format', con=engine, if_exists='replace')

In [66]:
# Convert duration (number of days) to number of weeks (rounded up) (e.g., 10 days =<2 weeks)
# encoded_df['duration'] = encoded_df['duration'] / 7
# encoded_df['duration'] = np.ceil(encoded_df.duration).astype(int)
# encoded_df.head()

In [17]:
# Create features

X = encoded_df.drop(columns=["duration"])
y = encoded_df["duration"]

In [18]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [19]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [20]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

In [21]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [22]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)
predictions

array([0, 0, 1, ..., 0, 0, 0])

In [23]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,20572,1478
Actual 1,9008,1980


In [24]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [25]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,20572,1478
Actual 1,9008,1980


Accuracy Score : 0.6826079060475816
Classification Report
              precision    recall  f1-score   support

           0       0.70      0.93      0.80     22050
           1       0.57      0.18      0.27     10988

    accuracy                           0.68     33038
   macro avg       0.63      0.56      0.54     33038
weighted avg       0.65      0.68      0.62     33038



In [76]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.06260998, 0.07813851, 0.        , 0.08908448, 0.10001941,
       0.0329886 , 0.08719876, 0.03240088, 0.08511259, 0.01536514,
       0.02351087, 0.00996303, 0.02456113, 0.02250212, 0.04232117,
       0.05212063, 0.03331717, 0.03103498, 0.12736532, 0.05038522])

In [77]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.12736532159758862, 'bucketed_population_50,000 to 100,000'),
 (0.10001941202190631, 'house_trained'),
 (0.08908447852457804, 'spayed_neutered'),
 (0.08719876324344114, 'shots_current'),
 (0.0851125948942018, 'age_Baby'),
 (0.07813850597129583, 'breeds_mixed'),
 (0.06260998210204914, 'gender'),
 (0.05212063486479021, 'breed_pitbull'),
 (0.050385223027505976, 'bucketed_population_greater than 100,000'),
 (0.04232116982146199, 'size_Small'),
 (0.03331717169534308, 'bucketed_population_0 to 10,000'),
 (0.032988598330875654, 'special_needs'),
 (0.03240087710831055, 'age_Adult'),
 (0.031034980217060384, 'bucketed_population_10,000 to 50,000'),
 (0.024561127630457008, 'size_Large'),
 (0.02351086879470996, 'age_Young'),
 (0.0225021219709776, 'size_Medium'),
 (0.015365137935495106, 'age_Senior'),
 (0.009963030247951601, 'size_Extra Large'),
 (0.0, 'breeds_unknown')]

In [79]:
# Save model
filename = 'finalized_model.sav'
pickle.dump(rf_model, open(filename, 'wb'))