### Reverse Engineering, Feature Selection, and Classification

In [363]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans


### Load Data and Create Geographic Clusters

In [268]:
# Read the census extract; the first CSV column is used as the row index.
census = pd.read_csv(
    "data/census.csv",
    index_col=0,
)
# Display (rows, columns) as a quick check that the load worked.
census.shape

(72864, 149)

In [270]:
# Read the tract reference table; the first CSV column is used as the row index.
us_tracts = pd.read_csv(
    './data/us_tracts.csv',
    index_col=0,
)
# Join tract attributes onto the census rows by tract identifier.
# The default join type is used, so rows without a match on both sides are dropped.
# NOTE: this rebinds `census`, so running the cell again merges a second time.
census = census.merge(
    right=us_tracts,
    left_on='CensusTract',
    right_on='tract',
)

In [273]:
# Cluster the tracts geographically from their latitude/longitude columns.
# NOTE(review): distances are computed on raw degrees — confirm that is acceptable.
location_data = census[['lat', 'lon']]
# Create 20 clusters with a fixed seed.
# n_init is pinned to 10 (the long-standing default) because scikit-learn 1.4
# changed the default to 'auto'; leaving it implicit makes the cluster labels,
# and every model that uses them, depend on the installed library version.
km = KMeans(n_clusters=20, n_init=10, random_state=1776)
km.fit(location_data)
# Assign each tract to its nearest centroid and store the label on the census frame.
clusters = km.predict(location_data)
census.loc[:, 'clusters'] = clusters

### 1. Baseline Model

In [318]:
# Baseline comparison: mean of each tract attribute, split by the fd_flag value.
baseline_columns = [
    'POP2010', 'OHU2010', 'Urban', 'TractSNAP',
    'PovertyRate', 'TractHUNV', 'TractKids', 'TractSeniors',
    'TractBlack', 'TractHispanic', 'TractWhite', 'TractAsian',
]
# Build the {column: 'mean'} mapping from the list (insertion order is preserved).
baseline_means = census.groupby('fd_flag').agg(dict.fromkeys(baseline_columns, 'mean'))
# Transpose so attributes are rows and the two flag values are columns.
baseline_means.T.round(3)

fd_flag,0,1
POP2010,4285.78,4126.08
OHU2010,1633.4,1529.462
Urban,0.708,0.869
TractSNAP,150.911,335.693
PovertyRate,11.813,27.469
TractHUNV,129.164,182.488
TractKids,1010.613,1035.206
TractSeniors,580.176,489.514
TractBlack,376.575,895.898
TractHispanic,571.52,970.795


### 1. Interpretation of Baseline Model
- Differences observed in Urban Flag, Household SNAP recipients, Poverty Rates, Vehicle Access, and Racial Demographics

In [160]:
# Feature selection: correlate every numeric column with the low-income flag.
# Numeric/bool columns are selected explicitly: older pandas skipped other
# dtypes silently inside corrwith, while pandas 2.x raises on them instead.
corr = (
    census
    .select_dtypes(include=['number', 'bool'])
    .corrwith(census['LowIncomeTracts'])
)
# Show the five largest positive correlations (the target itself ranks first).
corr.sort_values(ascending = False).head()

LowIncomeTracts         1.000000
fd_flag                 0.770212
LILATracts_halfAnd10    0.731157
PovertyRate             0.716020
TractSNAP               0.532868
dtype: float64

In [161]:
# Feature selection: correlate every numeric column with the low-access flag.
# Numeric/bool columns are selected explicitly: older pandas skipped other
# dtypes silently inside corrwith, while pandas 2.x raises on them instead.
corr = (
    census
    .select_dtypes(include=['number', 'bool'])
    .corrwith(census['la_flag'])
)
# Show the five largest positive correlations (the target itself ranks first).
corr.sort_values(ascending = False).head()

la_flag          1.000000
LAhalfand10      0.914019
LATracts_half    0.828551
LAPOP05_10       0.605017
LALOWI05_10      0.504786
dtype: float64

In [162]:
# Feature selection: correlate every numeric column with the food-desert flag.
# Numeric/bool columns are selected explicitly: older pandas skipped other
# dtypes silently inside corrwith, while pandas 2.x raises on them instead.
corr = (
    census
    .select_dtypes(include=['number', 'bool'])
    .corrwith(census['fd_flag'])
)
# Show the five largest positive correlations (the target itself ranks first).
corr.sort_values(ascending = False).head()

fd_flag                 1.000000
LILATracts_halfAnd10    0.949293
LowIncomeTracts         0.770212
LILATracts_Vehicle      0.634062
LILATracts_1And10       0.577265
dtype: float64

### 2.1 Low Income Predicting Model - Logistic Regression

In [286]:
# Target vector: the low-income tract flag.
y = census['LowIncomeTracts']
# Demographic predictor columns taken from the census frame.
features = [
    'POP2010', 'Urban', 'OHU2010',
    'TractSNAP', 'PovertyRate',
    'TractKids', 'TractSeniors',
    'TractBlack', 'TractHispanic',
    'TractWhite', 'TractAsian',
]

# Feature matrix restricted to the selected columns.
X = census.loc[:, features]

In [287]:
# Baseline to beat: the share of each class in the target
# (always predicting the larger class scores that share as accuracy).
class_shares = y.value_counts(normalize=True)
class_shares

0    0.576334
1    0.423666
Name: LowIncomeTracts, dtype: float64

In [288]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [289]:
# Build the liblinear logistic regression and fit it on the training split
# (fit returns the estimator itself, so the two steps are chained).
lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
# Report mean accuracy on the training and held-out rows, rounded to 5 places.
print(f'Training Accuracy: {round(lr.score(X_train, y_train),5)}.')
print(f'Testing Accuracy: {round(lr.score(X_test, y_test),5)}.')

Training Accuracy: 0.89259.
Testing Accuracy: 0.89542.


In [290]:
# Interpretation: one row per feature holding its fitted coefficient.
# NOTE(review): the features are passed to the model unscaled, so coefficient
# sizes depend on each column's units and are not directly comparable.
features_df = pd.DataFrame(
    data=lr.coef_.T,
    index=X_train.columns,
    columns=['Importance'],
)
# Display from the most positive to the most negative coefficient.
features_df.sort_values(by='Importance', ascending=False)

Unnamed: 0,Importance
PovertyRate,0.243638
TractSNAP,0.006807
POP2010,0.000619
TractSeniors,0.000134
TractHispanic,0.000104
OHU2010,-0.000332
TractKids,-0.00056
TractBlack,-0.00073
TractAsian,-0.000804
TractWhite,-0.001077


### 2.1 Interpretation - Low Income Logistic Regression
- Poverty Rate and Tract SNAP benefits have the strongest effect on an area being designated low-income

### 2.2 Low Income Predicting Model - Random Forest

In [291]:
# Instantiate and fit a 20-tree Random Forest Classifier.
# random_state is fixed because the forest is built from random draws; without
# a seed the scores and feature importances change on every run.
rfc = RandomForestClassifier(n_estimators=20, random_state=42)
rfc.fit(X_train, y_train)
# Print Training and Testing Accuracy Scores (mean accuracy, rounded to 5 places)
print(f'Training Accuracy: {round(rfc.score(X_train, y_train),5)}.')
print(f'Testing Accuracy: {round(rfc.score(X_test, y_test),5)}.')

Training Accuracy: 0.99762.
Testing Accuracy: 0.90201.


In [292]:
# Interpretation: one row per feature holding the forest's importance score.
features_df = pd.DataFrame(
    {'Importance': rfc.feature_importances_},
    index=X_train.columns,
)
# Display from the most to the least important feature.
features_df.sort_values(by='Importance', ascending=False)

Unnamed: 0,Importance
PovertyRate,0.431693
TractSNAP,0.18826
TractWhite,0.089193
TractHispanic,0.053833
TractBlack,0.046691
TractSeniors,0.040306
OHU2010,0.039298
TractAsian,0.035926
TractKids,0.035112
POP2010,0.034454


### 2.2 Interpretations - Low Income Random Forest
- PovertyRate is the most significant factor used by the Random Forest to classify Low Income

### 3.1 Low Access Predicting Model - Logistic Regression

In [293]:
# Target vector: the constructed low-access flag.
y = census['la_flag']

# Predictor columns (this list also includes TractHUNV).
features = [
    'POP2010', 'Urban', 'OHU2010',
    'TractSNAP', 'PovertyRate', 'TractHUNV',
    'TractKids', 'TractSeniors',
    'TractBlack', 'TractHispanic',
    'TractWhite', 'TractAsian',
]

# Feature matrix restricted to the selected columns.
X = census.loc[:, features]

In [294]:
# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): the other sections hold out 20% — confirm 33% is intended here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [295]:
# Build the liblinear logistic regression and fit it on the training split
# (fit returns the estimator itself, so the two steps are chained).
lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
# Report mean accuracy on the training and held-out rows, rounded to 5 places.
print(f'Training Score: {round(lr.score(X_train, y_train),5)}.')
print(f'Testing Score: {round(lr.score(X_test, y_test),5)}.')

Training Score: 0.82306.
Testing Score: 0.82047.


In [296]:
# Interpretation: one row per feature holding its fitted coefficient.
# NOTE(review): the features are passed to the model unscaled, so coefficient
# sizes depend on each column's units and are not directly comparable.
features_df = pd.DataFrame(
    data=lr.coef_.T,
    index=X_train.columns,
    columns=['Importance'],
)
# Display from the most positive to the most negative coefficient.
features_df.sort_values(by='Importance', ascending=False)

Unnamed: 0,Importance
Urban,3.596765
PovertyRate,0.008103
TractSNAP,0.002453
TractSeniors,0.00163
TractKids,0.000446
POP2010,0.000187
OHU2010,6.7e-05
TractBlack,3e-06
TractWhite,-8.6e-05
TractHispanic,-0.000455


### 3.1 Interpretations - Low Access Logistic Regression
- Urban flag has strongest effect on classifying Low-Access areas

### 3.2 Low Access Model  - Random Forest

In [297]:
# Instantiate and fit a 20-tree Random Forest Classifier.
# random_state is fixed because the forest is built from random draws; without
# a seed the scores and feature importances change on every run.
rfc = RandomForestClassifier(n_estimators=20, random_state=42)
rfc.fit(X_train, y_train)
# Print Training and Testing Accuracy Scores (mean accuracy, rounded to 5 places)
print(f'Training Accuracy: {round(rfc.score(X_train, y_train),5)}.')
print(f'Testing Accuracy: {round(rfc.score(X_test, y_test),5)}.')

Training Accuracy: 0.99766.
Testing Accuracy: 0.86775.


In [298]:
# Feature importances: one row per feature holding the forest's importance score.
features_df = pd.DataFrame(
    {'Importance': rfc.feature_importances_},
    index=X_train.columns,
)
# Display from the most to the least important feature.
features_df.sort_values(by='Importance', ascending=False)

Unnamed: 0,Importance
Urban,0.230937
TractHUNV,0.185107
TractWhite,0.067698
TractBlack,0.060698
TractHispanic,0.060135
TractAsian,0.060117
OHU2010,0.058226
PovertyRate,0.056648
TractSeniors,0.056276
TractSNAP,0.056173


### 3.2 Interpretations - Low Access Random Forest
- Urban Flag and Availability of Vehicles are the most significant factors used by the model to determine low access

### 4.1 Low-Income + Low-Access Model Logistic Regression

In [390]:
# Use constructed Food Desert Flag as Target Vector
y = census['fd_flag']

# Predictor columns for the combined low-income + low-access model.
features = ['POP2010',  'Urban', 'OHU2010',
            'TractSNAP', 'PovertyRate', 'TractHUNV',
            'TractKids', 'TractSeniors',
           'TractBlack', 'TractHispanic',
           'TractWhite', 'TractAsian']

# Feature matrix restricted to the selected columns.
X = census[features]

In [391]:
# Baseline to beat: the share of each class in the target
# (always predicting the larger class scores that share as accuracy).
class_shares = y.value_counts(normalize=True)
class_shares

0    0.696338
1    0.303662
Name: fd_flag, dtype: float64

In [392]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [393]:
# Build the liblinear logistic regression and fit it on the training split
# (fit returns the estimator itself, so the two steps are chained).
lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
# Report mean accuracy on the training and held-out rows, rounded to 5 places.
print(f'Training Score: {round(lr.score(X_train, y_train),5)}.')
print(f'Testing Score: {round(lr.score(X_test, y_test),5)}.')

Training Score: 0.82773.
Testing Score: 0.83565.


In [394]:
# Interpretation: one row per feature holding its fitted coefficient.
# NOTE(review): the features are passed to the model unscaled, so coefficient
# sizes depend on each column's units and are not directly comparable.
features_df = pd.DataFrame(
    data=lr.coef_.T,
    index=X_train.columns,
    columns=['Importance'],
)
# Display from the most positive to the most negative coefficient.
features_df.sort_values(by='Importance', ascending=False)

Unnamed: 0,Importance
Urban,1.01504
PovertyRate,0.087418
TractSNAP,0.006106
POP2010,0.000566
OHU2010,0.00053
TractSeniors,9e-05
TractHispanic,-3.9e-05
TractBlack,-0.000445
TractWhite,-0.000755
TractKids,-0.001001


### 4.1 Interpretations - Low-Income + Low-Access Model  Logistic Regression
- Urban Flag and Poverty Rate have most significant effects on our classification

### 4.2 Low-Income + Low-Access Model - Random Forest

In [361]:
# Instantiate and fit a 20-tree Random Forest Classifier.
# random_state is fixed because the forest is built from random draws; without
# a seed the scores and feature importances change on every run.
rfc = RandomForestClassifier(n_estimators=20, random_state=42)
rfc.fit(X_train, y_train)
# Print Training and Testing Accuracy Scores (mean accuracy, rounded to 5 places)
print(f'Training Accuracy: {round(rfc.score(X_train, y_train),5)}.')
print(f'Testing Accuracy: {round(rfc.score(X_test, y_test),5)}.')

Training Accuracy: 0.99669.
Testing Accuracy: 0.87147.


In [367]:
# Interpretation: one row per feature holding the forest's importance score.
features_df = pd.DataFrame(
    {'Importance': rfc.feature_importances_},
    index=X_train.columns,
)
# Display from the most to the least important feature.
features_df.sort_values(by='Importance', ascending=False)

Unnamed: 0,Importance
PovertyRate,0.268948
TractSNAP,0.156057
TractHUNV,0.119038
TractWhite,0.067863
TractBlack,0.059465
TractHispanic,0.057034
TractSeniors,0.054988
TractAsian,0.050992
OHU2010,0.049592
TractKids,0.047884


### 4.2 Findings - Low-Income + Low-Access Model Random Forest
- Poverty Rate, SNAP units, and Vehicle Units are most significant features used by the model to determine Food Desert Status

### 5.1 Combined Model + Geographic Cluster Logistic Regression

In [372]:
# Target vector: the constructed food-desert flag.
y = census['fd_flag']

# Same predictors as the combined model, plus the geographic cluster label.
features = [
    'POP2010', 'Urban', 'OHU2010',
    'TractSNAP', 'PovertyRate', 'TractHUNV',
    'TractKids', 'TractSeniors',
    'TractBlack', 'TractHispanic',
    'TractWhite', 'TractAsian',
    'clusters',
]

# Expand the cluster label into one indicator column per cluster
# (clusters_<label>); the remaining columns pass through unchanged.
X = pd.get_dummies(census[features], columns=['clusters'])

In [373]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [374]:
# Build the liblinear logistic regression and fit it on the training split
# (fit returns the estimator itself, so the two steps are chained).
lr = LogisticRegression(solver='liblinear').fit(X_train, y_train)
# Report mean accuracy on the training and held-out rows, rounded to 5 places.
print(f'Training Score: {round(lr.score(X_train, y_train),5)}.')
print(f'Testing Score: {round(lr.score(X_test, y_test),5)}.')

Training Score: 0.82759.
Testing Score: 0.83435.


In [380]:
# Interpretation - pull coefficients into DataFrame (one row per feature)
# NOTE(review): the features are passed to the model unscaled, so coefficient
# sizes depend on each column's units and are not directly comparable.
features_df = pd.DataFrame(lr.coef_.T, X_train.columns, columns=['Importance'])
# Rank by magnitude so that strong negative coefficients are not pushed to the
# bottom of the table; the signed value stays in 'Importance'.
features_df['Absolute Val.'] = np.absolute(features_df['Importance'])
features_df.sort_values(by='Absolute Val.', ascending = False)


Unnamed: 0,Importance,Absolute Val.
clusters_3,-0.817444,0.817444
Urban,0.690838,0.690838
clusters_15,-0.320676,0.320676
clusters_4,-0.31973,0.31973
clusters_14,-0.299597,0.299597
clusters_13,-0.298752,0.298752
clusters_1,-0.268471,0.268471
clusters_5,-0.215713,0.215713
clusters_9,-0.142472,0.142472
clusters_18,-0.10838,0.10838


### 5.1 Interpretations - Combined + Cluster Logistic Regression
- Clusters 3, 15, 4, 14, 13, 1, 5 are negatively associated with Food Desert status
- Urban flag and Poverty Rate are still most significant features positively predicting Food Desert status
- Clusters 7, 16 are most significant positive predictors of Food Desert status

### 5.2 Combined + Clustered Random Forest

In [376]:
# Instantiate and fit a 20-tree Random Forest Classifier.
# random_state is fixed because the forest is built from random draws; without
# a seed the scores and feature importances change on every run.
rfc = RandomForestClassifier(n_estimators=20, random_state=42)
rfc.fit(X_train, y_train)
# Print Training and Testing Accuracy Scores (mean accuracy, rounded to 5 places)
print(f'Training Accuracy: {round(rfc.score(X_train, y_train),5)}.')
print(f'Testing Accuracy: {round(rfc.score(X_test, y_test),5)}.')

Training Accuracy: 0.9972.
Testing Accuracy: 0.87298.


In [379]:
# Interpretation - pull feature importances into DataFrame (one row per feature)
features_df = pd.DataFrame(rfc.feature_importances_, X_train.columns, columns=['Importance'])
# The 'Absolute Val.' column is kept so the table has the same layout as the
# logistic-regression table; forest importances are non-negative, so it simply
# repeats 'Importance'.
features_df['Absolute Val.'] = np.absolute(features_df['Importance'])
# Sort from most to least important, then transpose so features run across columns.
features_df.sort_values(by='Absolute Val.', ascending = False).T


Unnamed: 0,PovertyRate,TractSNAP,TractHUNV,TractBlack,TractWhite,TractHispanic,TractAsian,TractSeniors,TractKids,OHU2010,...,clusters_0,clusters_2,clusters_5,clusters_18,clusters_7,clusters_17,clusters_8,clusters_19,clusters_11,clusters_6
Importance,0.247035,0.142597,0.110475,0.058638,0.058229,0.054265,0.05395,0.050022,0.047393,0.046061,...,0.002376,0.002257,0.002101,0.001962,0.001953,0.001777,0.00145,0.001226,0.000386,0.000323
Absolute Val.,0.247035,0.142597,0.110475,0.058638,0.058229,0.054265,0.05395,0.050022,0.047393,0.046061,...,0.002376,0.002257,0.002101,0.001962,0.001953,0.001777,0.00145,0.001226,0.000386,0.000323


### 5.2 Interpretations Combined + Clustered Random Forest
- Geographic Clusters do not have as much significance as in 5.1 Logistic Regression
- Features are consistent with 4.2 Random Forest
- With respect to the cluster features considered, 5.2 similarly identifies clusters 3, 4, 13, 1, and 16 as significant, in addition to 12 and 14