### Reverse Engineering, Feature Selection, and Classification

In [158]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import from sklearn.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import statsmodels.api as sm


# Let pandas render up to 500 rows before truncating displayed frames/series.
pd.options.display.max_rows = 500

In [247]:
# Read the census extract, then discard the 'Unnamed: 0' column
# (presumably a leftover index column from an earlier export -- confirm).
census = pd.read_csv("data/census.csv")
census = census.drop(columns=['Unnamed: 0'])
# Display (rows, columns) as a quick sanity check on the load.
census.shape

(72864, 149)

In [248]:
# Compare tracts by the food-desert flag: mean of each demographic column per
# fd_flag value, transposed so each variable is a row, rounded to 5 decimals.
profile_cols = [
    'POP2010', 'OHU2010', 'Urban',
    'TractSNAP', 'PovertyRate', 'TractHUNV',
    'TractKids', 'TractSeniors',
    'TractBlack', 'TractHispanic',
    'TractWhite', 'TractAsian',
]
census.groupby(by='fd_flag').agg(dict.fromkeys(profile_cols, 'mean')).T.round(5)

fd_flag,0,1
POP2010,4285.77977,4126.07991
OHU2010,1633.39954,1529.46154
Urban,0.70823,0.86947
TractSNAP,150.91082,335.69344
PovertyRate,11.81266,27.46918
TractHUNV,129.16386,182.48762
TractKids,1010.61329,1035.20609
TractSeniors,580.17644,489.51424
TractBlack,376.57537,895.89795
TractHispanic,571.51989,970.79535


### Interpretation of Baseline Model
- Differences observed in Urban Flag, Household SNAP recipients, Poverty Rates, Vehicle Access, and Racial Demographics

In [160]:
# Feature selection: correlate every census column with the low-income flag
# and show the five largest correlations (the flag itself ranks first at 1.0).
corr = census.corrwith(census.loc[:, 'LowIncomeTracts'])
corr.sort_values(ascending=False).iloc[:5]

LowIncomeTracts         1.000000
fd_flag                 0.770212
LILATracts_halfAnd10    0.731157
PovertyRate             0.716020
TractSNAP               0.532868
dtype: float64

In [161]:
# Feature selection: correlate every census column with the low-access flag
# and show the five largest correlations.
corr = census.corrwith(other=census['la_flag'])
ranked = corr.sort_values(ascending=False)
ranked.head()

la_flag          1.000000
LAhalfand10      0.914019
LATracts_half    0.828551
LAPOP05_10       0.605017
LALOWI05_10      0.504786
dtype: float64

In [162]:
# Feature selection: correlate every census column with the food-desert flag
# and show the five largest correlations.
corr = census.corrwith(census.loc[:, 'fd_flag'])
corr.sort_values(ascending=False).head(5)

fd_flag                 1.000000
LILATracts_halfAnd10    0.949293
LowIncomeTracts         0.770212
LILATracts_Vehicle      0.634062
LILATracts_1And10       0.577265
dtype: float64

### Low Income Predicting Model - Logistic Regression

In [174]:
# Demographic predictors drawn from the census frame.
features = [
    'POP2010', 'Urban', 'OHU2010',
    'TractSNAP', 'PovertyRate',
    'TractKids', 'TractSeniors',
    'TractBlack', 'TractHispanic',
    'TractWhite', 'TractAsian',
]

# Feature matrix, and the low-income tract flag as the target vector.
X = census.loc[:, features]
y = census.loc[:, 'LowIncomeTracts']

In [175]:
# Baseline: relative frequency of each target class. Always predicting the
# most frequent class would score its proportion, so the models fitted on
# this target should be judged against that figure.
y.value_counts(normalize = True)

0    0.576334
1    0.423666
Name: LowIncomeTracts, dtype: float64

In [176]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [179]:
# Fit a logistic regression (liblinear solver) on the training split.
lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Mean accuracy on each split, rounded to five decimals.
train_acc = round(lr.score(X_train, y_train), 5)
test_acc = round(lr.score(X_test, y_test), 5)
print(f'Training Accuracy: {train_acc}.')
print(f'Testing Accuracy: {test_acc}.')

Training Accuracy: 0.89259.
Testing Accuracy: 0.89542.


In [180]:
# Interpretation: one row per feature holding its fitted coefficient,
# listed from most positive to most negative.
# NOTE(review): the predictors are not standardized before fitting, so these
# coefficients sit on different scales and their magnitudes are not directly
# comparable across features.
features_df = pd.DataFrame(
    data=lr.coef_.T,
    index=X_train.columns,
    columns=['Importance'],
)
features_df.sort_values(by='Importance', ascending=False)

Unnamed: 0,Importance
PovertyRate,0.243638
TractSNAP,0.006807
POP2010,0.000619
TractSeniors,0.000134
TractHispanic,0.000104
OHU2010,-0.000332
TractKids,-0.00056
TractBlack,-0.00073
TractAsian,-0.000804
TractWhite,-0.001077


### Findings - Low Income Logistic Regression
- Poverty Rate and Tract SNAP benefits have the strongest effect on an area being designated low-income

### Low Income Predicting Model - Random Forest

In [181]:
# Instantiate and fit a 20-tree Random Forest Classifier.
# random_state pins the bootstrap samples and per-split feature draws so the
# accuracy scores and feature importances are reproducible across kernel
# restarts (42 matches the seed already used for train_test_split).
rfc = RandomForestClassifier(n_estimators=20, random_state=42)
rfc.fit(X_train, y_train)
# Print Training and Testing Accuracy Scores; compare the two to gauge
# how much the forest overfits the training split.
print(f'Training Accuracy: {round(rfc.score(X_train, y_train),5)}.')
print(f'Testing Accuracy: {round(rfc.score(X_test, y_test),5)}.')

Training Accuracy: 0.99722.
Testing Accuracy: 0.90201.


In [183]:
# Interpretation: label the forest's feature importances with the training
# column names and list them from largest to smallest.
features_df = pd.Series(
    rfc.feature_importances_,
    index=X_train.columns,
    name='Importance',
).to_frame()
features_df.sort_values(by='Importance', ascending=False)

Unnamed: 0,Importance
PovertyRate,0.453575
TractSNAP,0.182395
TractWhite,0.077786
TractBlack,0.053044
TractHispanic,0.052186
TractSeniors,0.043076
OHU2010,0.038634
TractKids,0.033472
POP2010,0.032244
TractAsian,0.029873


### Findings - Low Income Random Forest
- PovertyRate is the most significant factor used in the Random Forest to classify Low Income

### Low Access Predicting Model - Logistic Regression

In [184]:
# Predictors for the low-access model (adds TractHUNV to the demographic set).
features = [
    'POP2010', 'Urban', 'OHU2010',
    'TractSNAP', 'PovertyRate', 'TractHUNV',
    'TractKids', 'TractSeniors',
    'TractBlack', 'TractHispanic',
    'TractWhite', 'TractAsian',
]

# Feature matrix, and the constructed low-access flag as the target vector.
X = census.loc[:, features]
y = census.loc[:, 'la_flag']

In [185]:
# Hold out 33% of the rows as a test set; random_state fixes the shuffle.
# NOTE(review): the other splits in this notebook use test_size = 0.2 --
# confirm that 0.33 is intentional here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [186]:
# Fit a logistic regression (liblinear solver) on the training split.
lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Mean accuracy on the training and testing sets, rounded to five decimals.
train_score = round(lr.score(X_train, y_train), 5)
test_score = round(lr.score(X_test, y_test), 5)
print(f'Training Score: {train_score}.')
print(f'Testing Score: {test_score}.')

Training Score: 0.82306.
Testing Score: 0.82047.


In [187]:
# Interpretation: one row per feature holding its fitted coefficient,
# listed from most positive to most negative.
# NOTE(review): the predictors are not standardized before fitting, so these
# coefficients sit on different scales and their magnitudes are not directly
# comparable across features.
features_df = pd.DataFrame(
    data=lr.coef_.T,
    index=X_train.columns,
    columns=['Importance'],
)
features_df.sort_values(by='Importance', ascending=False)

Unnamed: 0,Importance
Urban,3.596765
PovertyRate,0.008103
TractSNAP,0.002453
TractSeniors,0.00163
TractKids,0.000446
POP2010,0.000187
OHU2010,6.7e-05
TractBlack,3e-06
TractWhite,-8.6e-05
TractHispanic,-0.000455


### Findings - Low Access Logistic Regression
- The Urban flag has the strongest effect on classifying Low-Access areas

### Low Access Model  - Random Forest

In [188]:
# Instantiate and fit a 20-tree Random Forest Classifier.
# random_state pins the bootstrap samples and per-split feature draws so the
# accuracy scores and feature importances are reproducible across kernel
# restarts (42 matches the seed already used for train_test_split).
rfc = RandomForestClassifier(n_estimators=20, random_state=42)
rfc.fit(X_train, y_train)
# Print Training and Testing Accuracy Scores; compare the two to gauge
# how much the forest overfits the training split.
print(f'Training Accuracy: {round(rfc.score(X_train, y_train),5)}.')
print(f'Testing Accuracy: {round(rfc.score(X_test, y_test),5)}.')

Training Accuracy: 0.99803.
Testing Accuracy: 0.8668.


In [189]:
# Feature importances: label the forest's importances with the training
# column names and list them from largest to smallest.
features_df = pd.DataFrame(
    data=rfc.feature_importances_,
    index=X_train.columns,
    columns=['Importance'],
)
features_df.sort_values(by='Importance', ascending=False)

Unnamed: 0,Importance
Urban,0.222331
TractHUNV,0.184226
TractAsian,0.067132
TractWhite,0.065797
TractBlack,0.060373
TractHispanic,0.058967
PovertyRate,0.058538
TractSeniors,0.058078
OHU2010,0.057641
TractSNAP,0.057458


### Findings - Low Access Random Forest
- The Urban Flag and Availability of Vehicles are the most significant factors used by the model to determine low access

### Low-Income + Low-Access Model Logistic Regression

In [211]:
# Predictors for the combined low-income + low-access (food desert) model.
features = [
    'POP2010', 'Urban', 'OHU2010',
    'TractSNAP', 'PovertyRate', 'TractHUNV',
    'TractKids', 'TractSeniors',
    'TractBlack', 'TractHispanic',
    'TractWhite', 'TractAsian',
]

# Feature matrix, and the constructed food-desert flag as the target vector.
X = census.loc[:, features]
y = census.loc[:, 'fd_flag']

In [212]:
# Baseline: relative frequency of each fd_flag class. Always predicting the
# most frequent class would score its proportion, so the models fitted on
# this target should be judged against that figure.
y.value_counts(normalize=True)

0    0.696338
1    0.303662
Name: fd_flag, dtype: float64

In [213]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [214]:
# Fit a logistic regression (liblinear solver) on the training split.
lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Mean accuracy on the training and testing sets, rounded to five decimals.
train_score = round(lr.score(X_train, y_train), 5)
test_score = round(lr.score(X_test, y_test), 5)
print(f'Training Score: {train_score}.')
print(f'Testing Score: {test_score}.')

Training Score: 0.82773.
Testing Score: 0.83565.


In [215]:
# Interpretation: pull the fitted coefficients into a DataFrame, one row per
# feature, listed from most positive to most negative.
# NOTE(review): the predictors are not standardized before fitting, so these
# coefficients sit on different scales and their magnitudes are not directly
# comparable across features.
features_df = pd.DataFrame(
    data=lr.coef_.T,
    index=X_train.columns,
    columns=['Importance'],
)
features_df.sort_values(by='Importance', ascending=False)

Unnamed: 0,Importance
Urban,1.01504
PovertyRate,0.087418
TractSNAP,0.006106
POP2010,0.000566
OHU2010,0.00053
TractSeniors,9e-05
TractHispanic,-3.9e-05
TractBlack,-0.000445
TractWhite,-0.000755
TractKids,-0.001001


### Findings - Low-Income + Low-Access Model Logistic Regression
- The Urban Flag and Poverty Rate have the most significant effects on our classification

### Low-Income + Low-Access Model - Random Forest

In [218]:
# Instantiate and fit a 20-tree Random Forest Classifier.
# random_state pins the bootstrap samples and per-split feature draws so the
# accuracy scores and feature importances are reproducible across kernel
# restarts (42 matches the seed already used for train_test_split).
rfc = RandomForestClassifier(n_estimators=20, random_state=42)
rfc.fit(X_train, y_train)
# Print Training and Testing Accuracy Scores; compare the two to gauge
# how much the forest overfits the training split.
print(f'Training Accuracy: {round(rfc.score(X_train, y_train),5)}.')
print(f'Testing Accuracy: {round(rfc.score(X_test, y_test),5)}.')

Training Accuracy: 0.99679.
Testing Accuracy: 0.87106.


In [219]:
# Interpretation: label the forest's feature importances with the training
# column names and list them from largest to smallest.
features_df = pd.Series(
    rfc.feature_importances_,
    index=X_train.columns,
    name='Importance',
).to_frame()
features_df.sort_values(by='Importance', ascending=False)

Unnamed: 0,Importance
PovertyRate,0.284971
TractSNAP,0.140784
TractHUNV,0.118125
TractWhite,0.063125
TractBlack,0.058794
TractHispanic,0.056559
TractSeniors,0.056487
TractAsian,0.051553
OHU2010,0.049243
TractKids,0.046949


### Findings - Low-Income + Low-Access Model Random Forest
- Consistent with previous models, Poverty Rate, SNAP participation, and Availability of Vehicles are the most important factors for classifying a region as a Food Desert