### FOOD ATLAS - PCA

In [9]:
# Import our libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import from sklearn.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import statsmodels.api as sm


pd.set_option('display.max_rows', 500)

In [3]:
# Load in census data
census = pd.read_csv("data/census.csv").drop(columns='Unnamed: 0')
census.shape

(72864, 149)

In [75]:
# Feature selection, look at correlations
corr = census.corrwith(census['la_flag'])
corr.sort_values(ascending = False).head()

la_flag          1.000000
LAhalfand10      0.914019
LATracts_half    0.828551
LAPOP05_10       0.605017
LALOWI05_10      0.504786
dtype: float64

In [76]:
corr = census.corrwith(census['fd_flag'])
corr.sort_values(ascending = False).head()

fd_flag                 1.000000
LILATracts_halfAnd10    0.949293
LowIncomeTracts         0.770212
LILATracts_Vehicle      0.634062
LILATracts_1And10       0.577265
dtype: float64

### Low Income Predicting Model - Logistic Regression

In [45]:
y = census['LowIncomeTracts']

features = ['POP2010',  'Urban', 'OHU2010',
            'TractSNAP', 'PovertyRate',
            'TractKids', 'TractSeniors',
           'TractBlack', 'TractHispanic',
           'TractWhite', 'TractAsian']

X = census[features]

#X = pd.concat([X, dummy], axis = 1)

In [46]:
y.value_counts(normalize = True)

0    0.576334
1    0.423666
Name: LowIncomeTracts, dtype: float64

In [47]:
# Train/test split our data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [48]:
# Set up Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
# Score on training and testing sets
print(f'Training Score: {round(lr.score(X_train, y_train),4)}.')

# Score on testing set.
print(f'Testing Score: {round(lr.score(X_test, y_test),4)}.')



Training Score: 0.8926.
Testing Score: 0.8954.


In [49]:
#turn it into a data frame
features_df = pd.DataFrame(lr.coef_.T, X_train.columns, columns=['Importance'])
features_df.sort_values('Importance', ascending = False)

Unnamed: 0,Importance
PovertyRate,0.243638
TractSNAP,0.006807
POP2010,0.000619
TractSeniors,0.000134
TractHispanic,0.000104
OHU2010,-0.000332
TractKids,-0.00056
TractBlack,-0.00073
TractAsian,-0.000804
TractWhite,-0.001077


### Findings - Low income LogReg
- The Logistic Regression Identifies Urban flag and Poverty Rate as most significant predictors of Low Income status

### Low Income Predicting Model - Random Forest

In [50]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [51]:
rfc.score(X_train, y_train)

0.9908733766790757

In [52]:
rfc.score(X_test, y_test)

0.8983737047965416

In [53]:
#turn it into a data frame
features_df = pd.DataFrame(rfc.feature_importances_, X_train.columns, columns=['Importance'])
features_df.sort_values('Importance', ascending = False)

Unnamed: 0,Importance
PovertyRate,0.499867
TractSNAP,0.162961
TractWhite,0.073271
TractBlack,0.046679
TractHispanic,0.043064
TractSeniors,0.0408
TractKids,0.034437
POP2010,0.03442
OHU2010,0.031842
TractAsian,0.029374


### Findings - Low income Random Forest
- We see that Poverty Rate is, unsurpsingly, the most significant predictor of whether a census tract is classified as low-income, followed by the amount of housing units that receive SNAP benefits

### Low Access Predicting Model - Logistic Regression

In [54]:
y = census['la_flag']

features = ['POP2010',  'Urban', 'OHU2010',
            'TractSNAP', 'PovertyRate', 'TractHUNV',
            'TractKids', 'TractSeniors',
           'TractBlack', 'TractHispanic',
           'TractWhite', 'TractAsian']

X = census[features]

In [55]:
# Train/test split our data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [56]:
# Set up Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
# Score on training set. (We'll use R^2 for the score today.)
print(f'Training Score: {round(lr.score(X_train, y_train),4)}.')

# Score on testing set.
print(f'Testing Score: {round(lr.score(X_test, y_test),4)}.')



Training Score: 0.8231.
Testing Score: 0.8205.


In [57]:
#turn it into a data frame
features_df = pd.DataFrame(lr.coef_.T, X_train.columns, columns=['Importance'])
features_df.sort_values('Importance', ascending = False)

Unnamed: 0,Importance
Urban,3.596765
PovertyRate,0.008103
TractSNAP,0.002453
TractSeniors,0.00163
TractKids,0.000446
POP2010,0.000187
OHU2010,6.7e-05
TractBlack,3e-06
TractWhite,-8.6e-05
TractHispanic,-0.000455


### Findings - Low Income LogReg
- Urban flag is most significant predictor of whether a region is low access

### Low Access Model  - Random Forest

In [58]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [59]:
rfc.score(X_train, y_train)

0.9924208283829735

In [60]:
rfc.score(X_test, y_test)

0.8573151459702237

In [61]:
#turn it into a data frame
features_df = pd.DataFrame(rfc.feature_importances_, X_train.columns, columns=['Importance'])
features_df.sort_values('Importance', ascending = False)

Unnamed: 0,Importance
Urban,0.223032
TractHUNV,0.173362
TractWhite,0.075726
TractSNAP,0.065672
TractAsian,0.064862
TractHispanic,0.06204
TractKids,0.059445
TractBlack,0.057979
PovertyRate,0.056341
TractSeniors,0.05517


### Findings - Low Income RF
- The availability of vehicles is most significant, intuitive result, followed by Urban Flag

### Combined Model - Low Income, Low Access Logistic Regression

In [62]:
y = census['fd_flag']

# take features from the other two models
features = ['POP2010',  'Urban', 'OHU2010', 'TractSNAP', 'PovertyRate', 'TractHUNV']

features = ['POP2010',  'Urban', 'OHU2010',
            'TractSNAP', 'PovertyRate', 'TractHUNV',
            'TractKids', 'TractSeniors',
           'TractBlack', 'TractHispanic',
           'TractWhite', 'TractAsian']


X = census[features]
#X = pd.concat([X, dummy], axis = 1)

In [63]:
# Baseline Model
y.value_counts(normalize=True)


0    0.696338
1    0.303662
Name: fd_flag, dtype: float64

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [65]:
# Set up Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [66]:
# Score on training set. (We'll use R^2 for the score today.)
print(f'Training Score: {round(lr.score(X_train, y_train),4)}.')

# Score on testing set.
print(f'Testing Score: {round(lr.score(X_test, y_test),4)}.')

Training Score: 0.8277.
Testing Score: 0.8357.


In [67]:
#turn it into a data frame
features_df = pd.DataFrame(lr.coef_.T, X_train.columns, columns=['Importance'])
features_df.sort_values('Importance', ascending = False)

Unnamed: 0,Importance
Urban,1.01504
PovertyRate,0.087418
TractSNAP,0.006106
POP2010,0.000566
OHU2010,0.00053
TractSeniors,9e-05
TractHispanic,-3.9e-05
TractBlack,-0.000445
TractWhite,-0.000755
TractKids,-0.001001


### FIndings - Combined Model  LogReg
- consistent with previous two logistic regressions, Urban flag has highest coefficient when classifying a census tract as Food Desert

### Combined Model - Random Forest

In [68]:
rfc = RandomForestClassifier()

In [69]:
rfc.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [70]:
rfc.score(X_train, y_train)

0.990032766636359

In [71]:
rfc.score(X_test, y_test)

0.8675633019968435

In [72]:
#turn it into a data frame
features_df = pd.DataFrame(rfc.feature_importances_, X_train.columns, columns=['Importance'])
features_df.sort_values('Importance', ascending = False)



Unnamed: 0,Importance
PovertyRate,0.301512
TractSNAP,0.153962
TractHUNV,0.102376
TractWhite,0.065293
TractBlack,0.056
TractHispanic,0.053094
TractSeniors,0.052664
TractAsian,0.051446
OHU2010,0.046903
TractKids,0.046025


### Findings - Combined Model RF
- Consistent with previous Models, Poverty Rate and Availabiliy of Vehicles are most important factors for classfying a region as Food Desert