# Training and saving a Decision Tree Regressor (DTR) for each state and for the entire dataset

In [1]:
# Import dependencies
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error
import pickle
from sqlalchemy import inspect, create_engine
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
import config as creds
import pandas as pd

In [2]:
# Assemble the PostgreSQL connection URL from the values in the config module,
# then create the engine every later database read goes through
connection_url = f'postgresql://{creds.PGUSER}:{creds.PGPASSWORD}@{creds.PGHOST}:5432/{creds.PGDATABASE}'
engine = create_engine(connection_url)

In [3]:
# Create our session (link) from Python to the DB.
# The session is bound to the engine itself rather than to an eagerly opened
# connection: the session then checks connections out of the engine's pool only
# when it needs one and returns them afterwards, instead of holding one
# connection open for the whole lifetime of the kernel.
session = Session(bind=engine)

In [4]:
# Create the automap base class that the reflected table classes are generated from
Base = automap_base()
# Reflect the tables of the database behind `engine` into the base.
# NOTE(review): `reflect=True` looks like the legacy calling form; newer SQLAlchemy
# releases document `Base.prepare(autoload_with=engine)` - confirm against the
# installed SQLAlchemy version before changing it.
Base.prepare(engine, reflect=True)

In [5]:
# Show the name of every table found in the database
db_inspector = inspect(engine)
db_inspector.get_table_names()

['high_low_poverty',
 'ave_wage_indexing',
 'welfare_education',
 'economic_features_full',
 'economic_features',
 'lowest_poverty_rates',
 'highest_poverty_rates',
 'cpi_inflation_rate',
 'crime_rate',
 'divorce_rate',
 'homeownership_rate',
 'min_wage_effective',
 'poverty_rates',
 'unemployment_rate']

In [6]:
# Show the column names of the 'economic_features' table
economic_columns = inspect(engine).get_columns('economic_features')
[column_info['name'] for column_info in economic_columns]

['year',
 'state',
 'population_million',
 'education_million',
 'welfare_million',
 'crime_rate',
 'unemployment_rate',
 'divorce_rate_per_1000_people',
 'homeownership_rate',
 'minimum_wage_effective',
 'cpi_average',
 'inflation_rate',
 'avg_wage_index',
 'poverty_rate']

In [7]:
# Helper that loads one database table into a dataframe
def db_reader(tablename):
    """Read the table called ``tablename`` through the notebook's ``engine``.

    Parameters
    ----------
    tablename : str
        Name of the database table to load.

    Returns
    -------
    pandas.DataFrame
        The contents of the table as returned by ``pd.read_sql_table``.
    """
    return pd.read_sql_table(f'{tablename}', engine)

In [8]:
# Load the 'economic_features' table into a dataframe
econ_features = db_reader(tablename='economic_features')

In [9]:
# Keep every column except 'state' and 'year'
features_df = econ_features.drop(['state', 'year'], axis='columns')

In [10]:
# Raw spending totals that are replaced by their per-capita versions
columns_to_drop = ['education_million', 'welfare_million']

# Add education and welfare spending per capita (spending divided by population),
# then drop the raw totals they were computed from
features_df = (
    features_df
    .assign(
        education_per_capita=lambda frame: frame['education_million'] / frame['population_million'],
        welfare_per_capita=lambda frame: frame['welfare_million'] / frame['population_million'],
    )
    .drop(columns=columns_to_drop)
)

In [11]:
# Training a DTR for each state: fit, score and save one model per state.

# Labels of every saved model: each state plus the model trained on the whole
# dataset. 'Entire' matches the 'EntireRegressor.sav' file name and the 'Entire'
# label used in state_scores, so the list can be used to look up saved models.
state_list = list(econ_features['state'].unique())
state_list.append('Entire')

# One {'state': ..., 'score': ...} record per trained state model
data = []

for state in econ_features['state'].unique():

    # Select this state's rows. The boolean mask comes from econ_features, which
    # works because features_df was derived from it and shares its index.
    temp_df = features_df.loc[econ_features['state'] == state]

    # Split data for training and testing
    X = temp_df.drop(columns='poverty_rate')
    y = temp_df['poverty_rate']
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=1)

    # Create regressor and fit it with the data
    regressor = DecisionTreeRegressor(random_state=0)
    regressor.fit(X_train, y_train)

    # R-squared of the model on the held-out rows
    score = regressor.score(X_test, y_test)
    data.append({'state': state, 'score': score})

    # Save trained model for current state; the context manager closes the file
    # even if pickling fails
    filename = f'Saved_DTR_Models/{state}Regressor.sav'
    with open(filename, 'wb') as model_file:
        pickle.dump(regressor, model_file)

# Build the score table once, after all states are processed. Naming the columns
# explicitly keeps the table well-formed even when no state was found.
state_scores = pd.DataFrame(data, columns=['state', 'score'])

In [12]:
# Training for entire dataset
# Split data for training and testing
X = features_df.drop(columns='poverty_rate')
y = features_df['poverty_rate']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8, random_state=1)

# Create regressor and fit it with the data
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)

# Predicting the values for the X_test (kept for inspection; the score below
# is computed by the regressor itself)
y_pred = regressor.predict(X_test)

# Score model and add to state_scores dataframe.
# Any 'Entire' row left over from an earlier run of this cell is removed first,
# so re-running the cell replaces the score instead of adding a duplicate row
# (a duplicate would make the single-value lookup in display_metrics fail).
# Resetting the index guarantees that the label len(state_scores) is unused.
score = regressor.score(X_test, y_test)
state_scores = state_scores.loc[state_scores['state'] != 'Entire'].reset_index(drop=True)
state_scores.loc[len(state_scores)] = ['Entire', score]

# Exporting saved model; the context manager closes the file even if pickling fails
filename = 'Saved_DTR_Models/EntireRegressor.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(regressor, model_file)

In [13]:
# Create function to read r2 (score), and feature importance for a chosen state
def display_metrics(state):
    """Print the stored R-squared score and ranked feature importances of a saved model.

    Parameters
    ----------
    state : str
        Label of the saved model: a state name as stored in ``state_scores``,
        or 'Entire' for the model trained on the whole dataset.

    Raises
    ------
    FileNotFoundError
        If no saved model file exists for ``state``.
    ValueError
        If ``state_scores`` does not hold exactly one score for ``state``.
    """
    # Read in the saved state model.
    # NOTE: unpickling can execute arbitrary code - only load model files that
    # were written by this notebook.
    filename = f'Saved_DTR_Models/{state}Regressor.sav'
    with open(filename, 'rb') as model_file:
        loaded_model = pickle.load(model_file)

    # Look up the score recorded when the model was trained
    matching_scores = state_scores.loc[state_scores['state'] == state, 'score']
    if len(matching_scores) != 1:
        raise ValueError(
            f'Expected exactly one score for {state!r} in state_scores, '
            f'found {len(matching_scores)}'
        )
    score = matching_scores.item()

    # Prefer the feature names stored on the model itself, so the ranking does
    # not depend on whichever cell last assigned the notebook-level X. Fall back
    # to X.columns for models that do not carry feature names.
    importances = loaded_model.feature_importances_
    feature_names = getattr(loaded_model, 'feature_names_in_', None)
    if feature_names is None:
        feature_names = X.columns
    ranked_importance = sorted(zip(importances, feature_names), reverse=True)

    # Printing metrics
    print(f'R-Squared value: {score}\n')
    print('Feature importance:')
    for importance, feature in ranked_importance:
        print(f'{feature}: ({importance})')

In [14]:
# Print the stored score and the ranked feature importances of the saved Alabama model
display_metrics('Alabama')

R-Squared value: 0.9856549839735169

Feature importance:
divorce_rate_per_1000_people: (0.8429836540485565)
avg_wage_index: (0.0826312539481652)
minimum_wage_effective: (0.06064633565440465)
education_per_capita: (0.009216443777442225)
unemployment_rate: (0.002603443637972416)
inflation_rate: (0.0018227669853089952)
homeownership_rate: (8.108601875208859e-05)
crime_rate: (1.5015929397854186e-05)
welfare_per_capita: (0.0)
population_million: (0.0)
cpi_average: (0.0)


In [15]:
# Order the score table from best to worst score and display it
state_scores = state_scores.sort_values(by='score', ascending=False)
state_scores

Unnamed: 0,state,score
1,Alabama,0.985655
43,Tennessee,0.983854
36,Oklahoma,0.981607
23,Minnesota,0.978842
44,Utah,0.977537
2,Alaska,0.977188
6,Colorado,0.976462
40,South Carolina,0.972948
18,Louisiana,0.972566
46,Virginia,0.966804
