# Training and saving a DTR for each state

In [1]:
# Import dependencies
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error
import pickle

In [13]:
# Load the data
df = pd.read_csv('../Data_Preprocessing/Raw_Data/dataset_na_dropped.csv')
df.head()

Unnamed: 0,year,state,population_million,education_million,welfare_million,crime_rate,unemployment_rate,divorce_rate_per_1000_people,homeownership_rate,minimum_wage_effective,CPI_Average,inflation_rate,avg_wage_index,poverty_rate
0,1984,National,235.164,191925.0,143711.9,539.2,7.637412,4.9778,64.5,3.360196,103.9,4.3,16135.07,16.04
1,1985,National,237.369,209227.0,165441.6,556.6,7.304109,4.9222,63.9,3.360196,107.6,3.5,16822.51,15.72
2,1986,National,239.595,227705.0,159050.4,620.1,7.096872,4.8667,63.8,3.360196,109.6,1.9,17321.82,15.2
3,1987,National,241.842,242694.0,167313.2,609.7,6.297189,4.8111,64.0,3.360196,113.6,3.7,18426.51,14.48
4,1988,National,244.11,260024.0,176861.6,637.2,5.589161,4.7556,63.8,3.409804,118.3,4.1,19334.04,14.81


In [14]:
# Drop unwanted columns
features_df = df.drop(columns = ['state', 'year'])
features_df.head()

Unnamed: 0,population_million,education_million,welfare_million,crime_rate,unemployment_rate,divorce_rate_per_1000_people,homeownership_rate,minimum_wage_effective,CPI_Average,inflation_rate,avg_wage_index,poverty_rate
0,235.164,191925.0,143711.9,539.2,7.637412,4.9778,64.5,3.360196,103.9,4.3,16135.07,16.04
1,237.369,209227.0,165441.6,556.6,7.304109,4.9222,63.9,3.360196,107.6,3.5,16822.51,15.72
2,239.595,227705.0,159050.4,620.1,7.096872,4.8667,63.8,3.360196,109.6,1.9,17321.82,15.2
3,241.842,242694.0,167313.2,609.7,6.297189,4.8111,64.0,3.360196,113.6,3.7,18426.51,14.48
4,244.11,260024.0,176861.6,637.2,5.589161,4.7556,63.8,3.409804,118.3,4.1,19334.04,14.81


In [43]:
# Training a DTR for each state

for state in df['state'].unique():
    
    # Split data for training and testing
    temp_df = features_df.loc[df['state'] == state]
    X = temp_df.drop(columns = 'poverty_rate')
    y = temp_df['poverty_rate']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    
    # Create regressor and fit it with the data
    regressor = DecisionTreeRegressor(random_state = 0)
    regressor.fit(X_train, y_train)
    
    # Predicting the values for the X_test
    y_pred = regressor.predict(X_test)

    # Save trained model for current state
    filename = f'Saved_DTR_Models/{state}Regressor.sav'
    pickle.dump(regressor, open(filename, 'wb'))

In [50]:
# Create function to read r2 (score), root mean squared error, and feature importance
# for a chosen state
def display_metrics(state):
    # Read in the saved state model
    filename = f'Saved_DTR_Models/{state}Regressor.sav'
    loaded_model = pickle.load(open(filename, 'rb'))
    
    # Compute metrics
    score = loaded_model.score(X_test, y_test)
    importances = loaded_model.feature_importances_
    ranked_importance = sorted(zip(importances, X.columns), reverse=True)
    
    # Printing metrics
    print(f'R-Squared value: {score}\n')
    print('Feature importance:')
    for i in range(len(ranked_importance)):
        print(f'{ranked_importance[i][1]}: ({ranked_importance[i][0]})')

In [52]:
display_metrics('entire')

R-Squared value: 0.9969507030323564

Feature importance:
minimum_wage_effective: (0.24778890566589265)
unemployment_rate: (0.21719723333428204)
homeownership_rate: (0.1679875032582579)
population_million: (0.11342979921740065)
welfare_million: (0.0893358451428584)
crime_rate: (0.08571149151953791)
education_million: (0.0398311476879875)
CPI_Average: (0.014712735694257044)
divorce_rate_per_1000_people: (0.009038707100842596)
inflation_rate: (0.008729742328226648)
avg_wage_index: (0.006236889050456556)


In [68]:
state_list =[]
for state in df['state'].unique():
    temp_list.append(state)
state_list.append('entire')
data = []

# Creating function to return R2 values for each model
for state in temp_list:
    filename = f'Saved_DTR_Models/{state}Regressor.sav'
    loaded_model = pickle.load(open(filename, 'rb'))
    
    # Compute scores
    score = loaded_model.score(X_test, y_test)
    data.append({'state':state, 'score':score})
# Add score to dataframe
state_scores= pd.DataFrame(data)

In [72]:
state_scores.sort_values(by = ['score'], inplace = True, ascending = False)
state_scores

Unnamed: 0,state,score
51,entire,0.996951
50,Wyoming,0.95155
45,Vermont,0.740132
44,Utah,0.560562
15,Iowa,0.548365
47,Washington,0.264103
7,Connecticut,0.138065
16,Kansas,-0.027613
20,Maryland,-0.188548
2,Alaska,-0.265797
