# Training and saving a Decision Tree Regressor (DTR) for each state and for the entire dataset

In [1]:
# Import dependencies
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error
import pickle
from sqlalchemy import inspect, create_engine
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
import config as creds
import pandas as pd

In [2]:
# Build the PostgreSQL engine from the credentials held in the local config module
engine = create_engine(
    f'postgresql://{creds.PGUSER}:{creds.PGPASSWORD}'
    f'@{creds.PGHOST}:5432/{creds.PGDATABASE}'
)

In [3]:
# Create our session (link) from Python to the DB.
# Bind to the engine itself rather than to `engine.connect()`: an explicitly
# opened connection would stay checked out for the life of the kernel because
# nothing ever closes it, whereas an engine-bound session borrows a pooled
# connection only while it is actually running a statement.
session = Session(bind=engine)

In [4]:
# reflect an existing database into a new model
Base = automap_base()
# reflect the tables
# NOTE(review): the `reflect=True` flag is deprecated in SQLAlchemy 1.4 and
# removed in 2.0 in favour of `Base.prepare(autoload_with=engine)` — confirm
# the installed SQLAlchemy version before changing this call.
Base.prepare(engine, reflect=True)

In [5]:
# Show the name of every table the database exposes
db_inspector = inspect(engine)
db_inspector.get_table_names()

['economic_features_full',
 'features_table',
 'target_table',
 'economic_features',
 'high_low_poverty']

In [6]:
# Show the column names of the 'economic_features' table
column_info = inspect(engine).get_columns('economic_features')
[info['name'] for info in column_info]

['year',
 'state',
 'population_million',
 'education_million',
 'welfare_million',
 'crime_rate',
 'unemployment_rate',
 'divorce_rate_per_1000_people',
 'homeownership_rate',
 'minimum_wage_effective',
 'cpi_average',
 'inflation_rate',
 'avg_wage_index',
 'poverty_rate']

In [7]:
# Helper: table name in, DataFrame out
def db_reader(tablename):
    """Load a whole database table into a pandas DataFrame.

    Parameters
    ----------
    tablename
        Name of the table to read; it is formatted into a string before
        being handed to pandas.

    Returns
    -------
    pandas.DataFrame
        The table contents, read through the notebook-level ``engine``.
    """
    return pd.read_sql_table(f'{tablename}', engine)

In [8]:
# Reading a database table into a dataframe
# ('economic_features' is the table whose columns were listed above)
econ_features = db_reader('economic_features')
# Preview the first rows to sanity-check the load
econ_features.head()

Unnamed: 0,year,state,population_million,education_million,welfare_million,crime_rate,unemployment_rate,divorce_rate_per_1000_people,homeownership_rate,minimum_wage_effective,cpi_average,inflation_rate,avg_wage_index,poverty_rate
0,1976,National,216.945,107290.0,72155.2,467.8,7.786562,5.0,7.79,2.226667,56.9,5.7,9226.48,14.1
1,1977,National,219.307,115893.0,83417.8,475.9,7.132505,5.1,7.13,2.226667,60.6,6.5,9779.44,13.92
2,1978,National,221.694,128541.0,81037.9,497.8,6.134198,5.2,6.13,2.226667,65.2,7.6,10556.03,13.43
3,1979,National,224.107,140169.0,83691.3,548.9,5.923827,5.3,5.92,2.91,72.6,11.3,11479.46,13.33
4,1980,National,226.546,153686.0,108251.7,596.6,7.255717,5.2,7.26,3.110196,82.4,13.5,12513.46,14.65


In [9]:
# Drop unwanted columns
# 'state' and 'year' are left out of the modelling frame; econ_features keeps
# them, and its 'state' column is used later to select each state's rows.
features_df = econ_features.drop(columns = ['state', 'year'])

In [10]:
# Calculate additional features
# The frame is rebuilt from econ_features in a single chain so this cell is
# idempotent: the previous version mutated features_df in place and then
# dropped its own input columns, so running the cell a second time raised a
# KeyError on 'education_million'.
# Spending totals are replaced by per-capita values (total / population).
columns_to_drop = ['education_million', 'welfare_million']
features_df = (
    econ_features
    .drop(columns=['state', 'year'])
    .assign(
        education_per_capita=lambda df: df['education_million'] / df['population_million'],
        welfare_per_capita=lambda df: df['welfare_million'] / df['population_million'],
    )
    # Drop the non-beneficial columns (the raw totals the ratios were built from)
    .drop(columns=columns_to_drop)
)

In [11]:
# Training a DTR for each state
# state_list names every saved model (one per state plus the whole-dataset
# model) so the saved files can be iterated over; data collects one
# {'state', 'score'} record per state for the score table.
unique_states = econ_features['state'].unique()
# 'Entire' (capitalised) matches both the EntireRegressor.sav file name and the
# 'Entire' row label used for the whole-dataset model; the previous lowercase
# 'entire' matched neither.
state_list = [*unique_states, 'Entire']
data = []

for state in unique_states:

    # Split data for training and testing.
    # features_df is derived from econ_features, so the boolean mask built on
    # econ_features aligns with it by index.
    temp_df = features_df.loc[econ_features['state'] == state]
    X = temp_df.drop(columns='poverty_rate')
    y = temp_df['poverty_rate']
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=1)

    # Create regressor and fit it with the data
    regressor = DecisionTreeRegressor(random_state=0)
    regressor.fit(X_train, y_train)

    # Score the model on the held-out rows and record it
    score = regressor.score(X_test, y_test)
    data.append({'state': state, 'score': score})

    # Save trained model for current state; the context manager guarantees the
    # file is flushed and closed even if pickling fails.
    filename = f'Saved_DTR_Models/{state}Regressor.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(regressor, model_file)

# Build the score table once, after the loop, instead of on every iteration.
# Naming the columns keeps the frame well-formed even if no state was trained.
state_scores = pd.DataFrame(data, columns=['state', 'score'])

In [12]:
# Training for entire dataset
# Split data for training and testing
X = features_df.drop(columns='poverty_rate')
y = features_df['poverty_rate']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=1)

# Create regressor and fit it with the data
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)

# Predicting the values for the X_test
y_pred = regressor.predict(X_test)

# Score model and add to state_scores dataframe.
# Any existing 'Entire' row is replaced rather than appended to, so re-running
# this cell cannot create duplicate rows (a duplicate would make the
# single-value lookup in display_metrics fail).
score = regressor.score(X_test, y_test)
entire_row = pd.DataFrame([{'state': 'Entire', 'score': score}])
state_scores = pd.concat(
    [state_scores.loc[state_scores['state'] != 'Entire'], entire_row],
    ignore_index=True,
)

# Exporting saved model; the context manager closes the file handle
filename = 'Saved_DTR_Models/EntireRegressor.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(regressor, model_file)

In [13]:
# Create function to read r2 (score), and feature importance for a chosen state
def display_metrics(state):
    """Print the stored R-squared score and ranked feature importances of a saved model.

    Parameters
    ----------
    state : str
        Label used both in the model file name
        (``Saved_DTR_Models/{state}Regressor.sav``) and in the ``state``
        column of the notebook-level ``state_scores`` frame.

    Raises
    ------
    FileNotFoundError
        If no saved model file exists for ``state``.
    ValueError
        If ``state_scores`` does not hold exactly one row for ``state``.
    """
    # Read in the saved state model.
    # NOTE: unpickling can execute arbitrary code; only load model files that
    # were written by this notebook.
    filename = f'Saved_DTR_Models/{state}Regressor.sav'
    with open(filename, 'rb') as model_file:
        loaded_model = pickle.load(model_file)

    # Compute metrics
    score = state_scores.loc[state_scores['state'] == state, 'score'].item()
    importances = loaded_model.feature_importances_
    # Prefer the feature names stored on the fitted model so the labels cannot
    # drift from whatever the global X currently holds; fall back to X.columns
    # when the model does not carry them.
    feature_names = getattr(loaded_model, 'feature_names_in_', None)
    if feature_names is None:
        feature_names = X.columns
    ranked_importance = sorted(zip(importances, feature_names), reverse=True)

    # Printing metrics
    print(f'R-Squared value: {score}\n')
    print('Feature importance:')
    for importance, feature in ranked_importance:
        print(f'{feature}: ({importance})')

In [14]:
# Print the stored score and feature-importance ranking for the 'National' model
display_metrics('National')

R-Squared value: 0.26844269289142

Feature importance:
homeownership_rate: (0.6006142572438061)
unemployment_rate: (0.19186949851280144)
cpi_average: (0.11253319473235324)
inflation_rate: (0.05028313621675418)
divorce_rate_per_1000_people: (0.014923262491323478)
avg_wage_index: (0.011980671465709743)
minimum_wage_effective: (0.005429111093638382)
crime_rate: (0.00524255167312922)
welfare_per_capita: (0.0051145778752377975)
population_million: (0.0020047681955903175)
education_per_capita: (4.970499656106646e-06)


In [15]:
# Rank the saved models from best to worst score and display the table
state_scores = state_scores.sort_values(by='score', ascending=False)
state_scores

Unnamed: 0,state,score
44,Utah,0.987452
25,Missouri,0.981439
18,Louisiana,0.975836
26,Montana,0.972191
23,Minnesota,0.970457
4,Arkansas,0.970198
3,Arizona,0.96246
13,Illinois,0.958556
24,Mississippi,0.953919
28,Nevada,0.953766
