# Decision Tree Regressor

In [1]:
# Import dependencies
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error
import pickle

In [2]:
# Read the dataset CSV into a DataFrame and preview its first rows
csv_path = '../Data_Preprocessing/Raw_Data/dataset_na_dropped.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,year,state,population_million,education_million,welfare_million,crime_rate,unemployment_rate,divorce_rate_per_1000_people,homeownership_rate,minimum_wage_effective,CPI_Average,inflation_rate,avg_wage_index,poverty_rate
0,1984,National,235.164,191925.0,143711.9,539.2,7.637412,4.9778,64.5,3.360196,103.9,4.3,16135.07,16.04
1,1985,National,237.369,209227.0,165441.6,556.6,7.304109,4.9222,63.9,3.360196,107.6,3.5,16822.51,15.72
2,1986,National,239.595,227705.0,159050.4,620.1,7.096872,4.8667,63.8,3.360196,109.6,1.9,17321.82,15.2
3,1987,National,241.842,242694.0,167313.2,609.7,6.297189,4.8111,64.0,3.360196,113.6,3.7,18426.51,14.48
4,1988,National,244.11,260024.0,176861.6,637.2,5.589161,4.7556,63.8,3.409804,118.3,4.1,19334.04,14.81


In [3]:
# Drop every column that must not be used as a model input:
#   - 'state' and 'year' are identifiers, not predictors
#   - 'Unnamed: 0' is the row index that was written into the CSV; left in
#     place it would end up in X and be learned as a feature
# Only columns still present are dropped, so re-running this cell is harmless
# (an in-place drop would raise KeyError on the second run).
non_feature_columns = ['Unnamed: 0', 'state', 'year']
df = df.drop(columns=[col for col in non_feature_columns if col in df.columns])

In [4]:
# Separate the target column from the features, then split both into
# training and testing subsets (fixed seed so the split is repeatable)
target_column = 'poverty_rate'
y = df[target_column]
X = df.drop(columns=target_column)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [5]:
# Build the decision tree with a fixed seed and train it on the training split;
# the fitted estimator is the cell's last expression so it is still displayed
regressor = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
regressor

DecisionTreeRegressor(random_state=0)

In [6]:
# Predict the target for the held-out test features (X_test); y_pred is
# compared against y_test in the evaluation cells below
y_pred = regressor.predict(X_test)

In [None]:
# Coefficient of determination (R^2) of the predictions on the test split
r_squared = r2_score(y_true=y_test, y_pred=y_pred)
print(f'R-Squared value: {r_squared}')

In [None]:
# Test-split error: mean squared error and its square root (RMSE)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
error_report = f'Mean Squared Error: {mse} \n' f'Root Mean Squared Error: {rmse}'
print(error_report)

In [None]:
# Put the actual and predicted poverty rates for the test rows side by side
actual_vs_predicted = {'Actual': y_test, 'Predicted': y_pred}
pred_df = pd.DataFrame(actual_vs_predicted)
pred_df.head()

In [None]:
# Pair each importance score with its column name, sort the pairs from the
# highest score to the lowest, and print one "name: (score)" line per feature
importances = regressor.feature_importances_
ranked_importance = sorted(zip(importances, X.columns), reverse=True)
for score, feature_name in ranked_importance:
    print(f'{feature_name}: ({score})')

In [None]:
# Defining function to pull data from a row and split it into features and target
def feat_targ_vals(row, data=None):
    """Split one row of a DataFrame into feature values and a target value.

    Parameters
    ----------
    row : int
        Positional (0-based) index of the row to read.
    data : pandas.DataFrame, optional
        Frame to read the row from. Defaults to the notebook-level ``df``.
        The last column is treated as the target; every other column is
        treated as a feature.

    Returns
    -------
    tuple
        ``(feature_vals, target_val)``: ``feature_vals`` is a list holding the
        row's values in column order without the last column, and
        ``target_val`` is the value from the last column.

    Raises
    ------
    IndexError
        If ``row`` is outside the frame (raised by ``.iloc``).
    """
    if data is None:
        data = df
    # Read the row once and work on a plain list. Indexing the row Series with
    # an integer key (``series[i]``) relies on a positional fallback that
    # pandas has deprecated for label-indexed Series, and re-reading the row
    # for every column is wasted work.
    row_values = data.iloc[row].tolist()
    # The target is the last column; popping it leaves only the features.
    target_val = row_values.pop()
    return row_values, target_val

In [None]:
# Sanity check on an existing row (75), which the author notes was in the test
# set: predict from its feature values and compare with its recorded target
row_features, actual = feat_targ_vals(75)
prediction = regressor.predict([row_features])
print(f'Predicted Value: {prediction[0]} \nActual Value: {actual}')

In [None]:
# Display the last 15 rows of the feature frame as a reference for the column
# names and typical values to use when entering custom inputs below
X.tail(15)

In [None]:
# Creating function to take custom inputs
def predict_pov_rate(feature_names=None):
    """Prompt for one value per model feature and return them as floats.

    The prompts are generated from the feature column names instead of a
    hand-maintained list, so the returned values always match the number and
    order of the columns the regressor was fitted on. A hard-coded list can
    silently drift from the training columns, which makes
    ``regressor.predict`` fail on a feature-count mismatch.

    Parameters
    ----------
    feature_names : iterable of str, optional
        Names of the features to ask for, in the order the model expects.
        Defaults to ``X.columns`` (the notebook-level feature frame).

    Returns
    -------
    list of float
        One value per feature, in the same order as ``feature_names``.

    Raises
    ------
    ValueError
        If an entered value cannot be converted to ``float``.
    """
    if feature_names is None:
        feature_names = X.columns
    custom_features = []
    for feature_name in feature_names:
        # input() returns text; the model needs numeric values
        raw_value = input(f'Enter {feature_name}: ')
        custom_features.append(float(raw_value))
    return custom_features

In [None]:
# Collect custom feature values interactively, then predict a poverty rate
# from that single row of inputs
custom_row = predict_pov_rate()
predicted_poverty_rate = regressor.predict([custom_row])
print(f'The predicted poverty rate is: {predicted_poverty_rate}')

In [9]:
# Save the trained regressor to disk with pickle.
# The `with` block closes the file handle as soon as the dump finishes (or
# fails), so the bytes are flushed and no open handle is left behind.
# NOTE: open() does not create missing directories; 'Saved_DTR_Models' must
# already exist relative to the notebook's working directory.
filename = 'Saved_DTR_Models/entireRegressor.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(regressor, model_file)