### FOOD ATLAS - PCA

In [36]:
# Import our libraries.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import from sklearn.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.decomposition import PCA

import statsmodels.api as sm

In [4]:
# Load in census data

census = pd.read_csv("data/census.csv").drop(columns='Unnamed: 0')
census.shape

(72864, 149)

### SET UP LOGISTIC REGRESSION MODEL

In [5]:
# Make Locational Dummies


dummy = pd.get_dummies(census[['State', 'County']])
dummy.head()


Unnamed: 0,State_Alabama,State_Alaska,State_Arizona,State_Arkansas,State_California,State_Colorado,State_Connecticut,State_Delaware,State_District of Columbia,State_Florida,...,County_Yoakum,County_Yolo,County_York,County_Young,County_Yuba,County_Yukon-Koyukuk,County_Yuma,County_Zapata,County_Zavala,County_Ziebach
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Low Income Predicting Model - Logistic Regression

In [6]:
y = census['LowIncomeTracts']

features = ['POP2010',  'Urban', 'OHU2010', 'TractSNAP', 'PovertyRate']

X = census[features]

#X = pd.concat([X, dummy], axis = 1)

In [7]:
y.value_counts(normalize = True)

0    0.576334
1    0.423666
Name: LowIncomeTracts, dtype: float64

In [8]:
# Train/test split our data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [9]:
# Set up Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
# Score on training and testing sets
print(f'Training Score: {round(lr.score(X_train, y_train),4)}.')

# Score on testing set.
print(f'Testing Score: {round(lr.score(X_test, y_test),4)}.')



Training Score: 0.89.
Testing Score: 0.8912.


In [10]:
#turn it into a data frame
features_df = pd.DataFrame(lr.coef_.T, X_train.columns, columns=['Importance'])
features_df.sort_values('Importance', ascending = False)

Unnamed: 0,Importance
Urban,0.38928
PovertyRate,0.263059
TractSNAP,0.006206
POP2010,-8.7e-05
OHU2010,-0.001064


### Findings - Low income LogReg
- The Logistic Regression Identifies Urban flag and Poverty Rate as most significant predictors of Low Income status

### Low Income Predicting Model - Random Forest

In [11]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [12]:
rfc.score(X_train, y_train)

0.989878368873411

In [13]:
rfc.score(X_test, y_test)

0.8882179372812736

In [14]:
#turn it into a data frame
features_df = pd.DataFrame(rfc.feature_importances_, X_train.columns, columns=['Importance'])
features_df.sort_values('Importance', ascending = False)

Unnamed: 0,Importance
PovertyRate,0.581046
TractSNAP,0.214144
OHU2010,0.103298
POP2010,0.09155
Urban,0.009962


### Findings - Low income Random Forest
- We see that Poverty Rate is, unsurpsingly, the most significant predictor of whether a census tract is classified as low-income, followed by the amount of housing units that receive SNAP benefits

### Low Access Predicting Model - Logistic Regression

In [15]:
y = census['la_flag']

features = ['Urban','POP2010', 'OHU2010', 'TractHUNV']

X = census[features]

In [16]:
# Train/test split our data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42)

In [17]:
# Set up Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
# Score on training set. (We'll use R^2 for the score today.)
print(f'Training Score: {round(lr.score(X_train, y_train),4)}.')

# Score on testing set.
print(f'Testing Score: {round(lr.score(X_test, y_test),4)}.')



Training Score: 0.8092.
Testing Score: 0.8058.


In [18]:
#turn it into a data frame
features_df = pd.DataFrame(lr.coef_.T, X_train.columns, columns=['Importance'])
features_df.sort_values('Importance', ascending = False)

Unnamed: 0,Importance
Urban,2.917787
OHU2010,0.001036
POP2010,-1.8e-05
TractHUNV,-0.004351


### Findings - Low Income LogReg
- Urban flag is most significant predictor of whether a region is low access

### Low Access Model  - Random Forest

In [19]:
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [20]:
rfc.score(X_train, y_train)

0.9884059158507108

In [21]:
rfc.score(X_test, y_test)

0.8316975796390252

In [22]:
#turn it into a data frame
features_df = pd.DataFrame(rfc.feature_importances_, X_train.columns, columns=['Importance'])
features_df.sort_values('Importance', ascending = False)

Unnamed: 0,Importance
TractHUNV,0.31668
Urban,0.244841
POP2010,0.223263
OHU2010,0.215215


### Findings - Low Income RF
- The availability of vehicles is most significant, intuitive result, followed by Urban Flag

### Combined Model - Low Income, Low Access Logistic Regression

In [23]:
y = census['fd_flag']

# take features from the other two models
features = ['POP2010',  'Urban', 'OHU2010', 'TractSNAP', 'PovertyRate', 'TractHUNV']

features = ['POP2010', 'OHU2010', 'Urban', 'PovertyRate', 'TractHUNV', 'TractSNAP', 'TractLOWI']

X = census[features]
#X = pd.concat([X, dummy], axis = 1)

In [25]:
# Baseline Model
y.value_counts(normalize=True)


0    0.696338
1    0.303662
Name: fd_flag, dtype: float64

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [27]:
# Set up Logistic Regression
lr = LogisticRegression()
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [28]:
# Score on training set. (We'll use R^2 for the score today.)
print(f'Training Score: {round(lr.score(X_train, y_train),4)}.')

# Score on testing set.
print(f'Testing Score: {round(lr.score(X_test, y_test),4)}.')

Training Score: 0.8323.
Testing Score: 0.8379.


In [30]:
#turn it into a data frame
features_df = pd.DataFrame(lr.coef_.T, X_train.columns, columns=['Importance'])
features_df.sort_values('Importance', ascending = False)

Unnamed: 0,Importance
Urban,1.086596
PovertyRate,0.063027
TractSNAP,0.003562
TractLOWI,0.001225
OHU2010,0.000674
POP2010,-0.000756
TractHUNV,-0.004404


### FIndings - Combined Model  LogReg
- consistent with previous two logistic regressions, Urban flag has highest coefficient when classifying a census tract as Food Desert

### Combined Model - Random Forest

In [31]:
rfc = RandomForestClassifier()

In [32]:
rfc.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [33]:
rfc.score(X_train, y_train)

0.9896896604964746

In [34]:
rfc.score(X_test, y_test)

0.8619364578329788

In [35]:
#turn it into a data frame
features_df = pd.DataFrame(rfc.feature_importances_, X_train.columns, columns=['Importance'])
features_df.sort_values('Importance', ascending = False)



Unnamed: 0,Importance
PovertyRate,0.291859
TractSNAP,0.190528
TractHUNV,0.158558
TractLOWI,0.125582
OHU2010,0.101028
POP2010,0.100188
Urban,0.032257


### Findings - Combined Model RF
- Consistent with previous Models, Poverty Rate and Availabiliy of Vehicles are most important factors for classfying a region as Food Desert