In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import plotly.express as px
import plotly.io as pio

from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


In [2]:
df = pd.read_pickle(r'./../data/AtlasCleaned.pkl')
df.head()

Unnamed: 0,FIPS,State,County,LACCESS_POP10,LACCESS_POP15,PCH_LACCESS_POP_10_15,PCT_LACCESS_POP10,PCT_LACCESS_POP15,LACCESS_LOWI10,LACCESS_LOWI15,...,PCT_HISP10,PCT_NHASIAN10,PCT_NHNA10,PCT_NHPI10,PCT_65OLDER10,PCT_18YOUNGER10,MEDHHINC15,POVRATE15,CHILDPOVRATE15,METRO13
0,1001,AL,Autauga,18428.439685,17496.693038,-5.056026,33.769657,32.062255,5344.427472,6543.676824,...,2.400542,0.855766,0.397647,0.040314,11.995382,26.777959,56580.0,12.7,18.8,1
1,1003,AL,Baldwin,35210.814078,30561.26443,-13.204891,19.318473,16.767489,9952.144027,9886.831137,...,4.384824,0.735193,0.628755,0.043343,16.771185,22.987408,52387.0,12.9,19.6,1
2,1005,AL,Barbour,5722.305602,6069.523628,6.067799,20.840972,22.10556,3135.676086,2948.790251,...,5.051535,0.3897,0.218524,0.087409,14.236807,21.906982,31433.0,32.0,45.2,0
3,1007,AL,Bibb,1044.867327,969.378841,-7.224696,4.559753,4.230324,491.449066,596.162829,...,1.771765,0.096007,0.279293,0.030548,12.68165,22.696923,40767.0,22.2,29.3,1
4,1009,AL,Blount,1548.175559,3724.428242,140.568857,2.70084,6.49738,609.027708,1650.959482,...,8.0702,0.200621,0.497191,0.031402,14.722096,24.608353,50487.0,14.7,22.2,1


In [3]:
df.columns

Index(['FIPS', 'State', 'County', 'LACCESS_POP10', 'LACCESS_POP15',
       'PCH_LACCESS_POP_10_15', 'PCT_LACCESS_POP10', 'PCT_LACCESS_POP15',
       'LACCESS_LOWI10', 'LACCESS_LOWI15', 'PCH_LACCESS_LOWI_10_15',
       'PCT_LACCESS_LOWI10', 'PCT_LACCESS_LOWI15', 'LACCESS_HHNV10',
       'LACCESS_HHNV15', 'PCH_LACCESS_HHNV_10_15', 'PCT_LACCESS_HHNV10',
       'PCT_LACCESS_HHNV15', 'LACCESS_SNAP15', 'PCT_LACCESS_SNAP15',
       'LACCESS_CHILD10', 'LACCESS_CHILD15', 'LACCESS_CHILD_10_15',
       'PCT_LACCESS_CHILD10', 'PCT_LACCESS_CHILD15', 'LACCESS_SENIORS10',
       'LACCESS_SENIORS15', 'PCH_LACCESS_SENIORS_10_15',
       'PCT_LACCESS_SENIORS10', 'PCT_LACCESS_SENIORS15', 'LACCESS_WHITE15',
       'PCT_LACCESS_WHITE15', 'LACCESS_BLACK15', 'PCT_LACCESS_BLACK15',
       'LACCESS_HISP15', 'PCT_LACCESS_HISP15', 'LACCESS_NHASIAN15',
       'PCT_LACCESS_NHASIAN15', 'LACCESS_NHNA15', 'PCT_LACCESS_NHNA15',
       'LACCESS_NHPI15', 'PCT_LACCESS_NHPI15', 'LACCESS_MULTIR15',
       'PCT_LACCESS_MULT

In [4]:
df.drop(labels=['LACCESS_POP10', 'LACCESS_POP15', 'PCH_LACCESS_POP_10_15', 'LACCESS_LOWI10', 'LACCESS_LOWI15',
                    'PCH_LACCESS_LOWI_10_15', 'LACCESS_HHNV10', 'LACCESS_HHNV15', 'PCH_LACCESS_HHNV_10_15',
                    'LACCESS_SNAP15', 'LACCESS_CHILD10', 'LACCESS_CHILD15', 'LACCESS_CHILD_10_15', 
                    'LACCESS_SENIORS10', 'LACCESS_SENIORS15', 'PCH_LACCESS_SENIORS_10_15', 'LACCESS_WHITE15', 
                    'LACCESS_BLACK15', 'LACCESS_HISP15', 'LACCESS_NHASIAN15', 'LACCESS_NHNA15', 
                    'LACCESS_NHPI15', 'LACCESS_MULTIR15'], axis=1, inplace=True)

In [5]:
df.shape

(3142, 77)

In [6]:
df.columns

Index(['FIPS', 'State', 'County', 'PCT_LACCESS_POP10', 'PCT_LACCESS_POP15',
       'PCT_LACCESS_LOWI10', 'PCT_LACCESS_LOWI15', 'PCT_LACCESS_HHNV10',
       'PCT_LACCESS_HHNV15', 'PCT_LACCESS_SNAP15', 'PCT_LACCESS_CHILD10',
       'PCT_LACCESS_CHILD15', 'PCT_LACCESS_SENIORS10', 'PCT_LACCESS_SENIORS15',
       'PCT_LACCESS_WHITE15', 'PCT_LACCESS_BLACK15', 'PCT_LACCESS_HISP15',
       'PCT_LACCESS_NHASIAN15', 'PCT_LACCESS_NHNA15', 'PCT_LACCESS_NHPI15',
       'PCT_LACCESS_MULTIR15', 'GROCPTH11', 'GROCPTH16', 'SUPERCPTH11',
       'SUPERCPTH16', 'CONVSPTH11', 'CONVSPTH16', 'SPECSPTH11', 'SPECSPTH16',
       'SNAPSPTH12', 'SNAPSPTH17', 'FFRPTH11', 'FFRPTH16', 'FSRPTH11',
       'FSRPTH16', 'PC_FFRSALES07', 'PC_FFRSALES12', 'PC_FSRSALES07',
       'PC_FSRSALES12', 'PCT_SNAP12', 'PCT_SNAP17', 'SNAP_PART_RATE11',
       'SNAP_PART_RATE16', 'PCT_NSLP12', 'PCT_NSLP17', 'PCT_FREE_LUNCH10',
       'PCT_FREE_LUNCH15', 'PCT_REDUCED_LUNCH10', 'PCT_REDUCED_LUNCH15',
       'PCT_SBP12', 'PCT_SBP17', 'P

In [7]:
df10_cols = ['PCT_LACCESS_POP10', 'PCT_LACCESS_LOWI10', 'PCT_LACCESS_HHNV10',
             'PCT_LACCESS_CHILD10','PCT_LACCESS_SENIORS10', 'PCT_FREE_LUNCH10', 'PCT_REDUCED_LUNCH10',
             'PCT_NHWHITE10', 'PCT_NHBLACK10', 'PCT_HISP10', 'PCT_NHASIAN10', 'PCT_NHNA10', 'PCT_NHPI10',
             'PCT_65OLDER10', 'PCT_18YOUNGER10','GROCPTH11', 'SUPERCPTH11', 'CONVSPTH11', 'SPECSPTH11', 
             'SNAPSPTH12', 'FFRPTH11', 'FSRPTH11', 'PC_FFRSALES07','PC_FSRSALES07', 'PCT_SNAP12', 
             'SNAP_PART_RATE11', 'PCT_NSLP12','PCT_SBP12', 'PCT_SFSP12',
             'FDPIR12','FOODINSEC_12_14','VLFOODSEC_12_14','DIRSALES_FARMS07', 'FMRKTPTH13']


df15_cols = ['PCT_LACCESS_POP15', 'PCT_LACCESS_LOWI15', 'PCT_LACCESS_HHNV15', 
             'PCT_LACCESS_SNAP15', 'PCT_LACCESS_CHILD15', 'PCT_LACCESS_SENIORS15', 'PCT_LACCESS_WHITE15',
             'PCT_LACCESS_BLACK15','PCT_LACCESS_HISP15', 'PCT_LACCESS_NHASIAN15', 'PCT_LACCESS_NHNA15', 
             'PCT_LACCESS_NHPI15', 'PCT_LACCESS_MULTIR15', 'PCT_FREE_LUNCH15', 'PCT_REDUCED_LUNCH15', 
             'FDPIR15','GROCPTH16', 'SUPERCPTH16', 'CONVSPTH16', 'SPECSPTH16', 'SNAPSPTH17', 'FFRPTH16', 'FSRPTH16',
             'PC_FFRSALES12', 'PC_FSRSALES12', 'PCT_SNAP17','SNAP_PART_RATE16', 'PCT_NSLP17', 'PCT_SBP17', 'PCT_SFSP17',
             'FDPIR15', 'FOODINSEC_15_17','VLFOODSEC_15_17', 'DIRSALES_FARMS12', 'FMRKTPTH18']

# Foodhub, metro 13, medhincome 15 'POVRATE15' are not here

In [8]:
df10 = pd.DataFrame(df, columns = df10_cols)
df15 = pd.DataFrame(df, columns = df15_cols)

In [9]:
scaler = StandardScaler()
df10 = pd.DataFrame(scaler.fit_transform(df10), columns=df10.columns)


X = df10.drop(columns=['PCT_LACCESS_CHILD10','PCT_LACCESS_SENIORS10','PCT_NHWHITE10', 'PCT_NHBLACK10',
                       'PCT_HISP10', 'PCT_NHASIAN10', 'PCT_NHNA10', 'PCT_NHPI10',
             'PCT_65OLDER10', 'PCT_18YOUNGER10','FOODINSEC_12_14'], axis=1)
y = df10['FOODINSEC_12_14']

# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ridge Regression
ridge = Ridge(alpha=0.5)  # alpha is the regularization parameter
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)
ridge_mse = mean_squared_error(y_test, ridge_pred)
ridge_r2 = ridge.score(X_test, y_test)
print('Ridge Regression Mean Squared Error:', ridge_mse)
print('Ridge Regression R2 score: ',  ridge_r2) 
print()

# Lasso Regression
lasso = Lasso(alpha=0.5)  # alpha is the regularization parameter
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)
lasso_mse = mean_squared_error(y_test, lasso_pred)
lasso_r2 = lasso.score(X_test, y_test)
print('Lasso Regression Mean Squared Error:', lasso_mse)
print('Lasso Regression R2 score: ',  lasso_r2) 
print()

# Elastic Net Regression
enet = ElasticNet(alpha=0.5, l1_ratio=0.5)  # alpha is the regularization parameter, l1_ratio controls the balance between L1 and L2 regularization
enet.fit(X_train, y_train)
enet_pred = enet.predict(X_test)
enet_mse = mean_squared_error(y_test, enet_pred)
enet_r2 = enet.score(X_test, y_test)
print('Elastic Net Regression Mean Squared Error:', enet_mse)
print('Elastic Net Regression R2 score: ',  enet_r2) 

Ridge Regression Mean Squared Error: 0.20002664708581427
Ridge Regression R2 score:  0.7871833838134672

Lasso Regression Mean Squared Error: 0.4747891754552357
Lasso Regression R2 score:  0.4948521749753252

Elastic Net Regression Mean Squared Error: 0.37419281667031273
Elastic Net Regression R2 score:  0.6018807983572345


In [18]:
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)
# dt_mse = mean_squared_error(y_test, dt_pred)
dt_score = dt_model.score(X_test, y_test)
print("Decision Tree score:", dt_score)
# dt_model.get_params(deep=True)

# from sklearn import tree
# tree.plot_tree(dt_model)


Decision Tree score: 1.0


#### Assistance programs

In [11]:
df_assist10 = pd.DataFrame(df, columns=['PCT_LACCESS_POP10','PCT_SNAP12','SNAP_PART_RATE11',
       'PCT_NSLP12', 'PCT_FREE_LUNCH10', 'PCT_REDUCED_LUNCH10', 'PCT_SBP12', 'PCT_SFSP12', 'FDPIR12'])

df_assist15 = pd.DataFrame(df, columns=['PCT_LACCESS_POP15', 'PCT_SNAP17','SNAP_PART_RATE16', 
        'PCT_NSLP17', 'PCT_FREE_LUNCH15', 'PCT_REDUCED_LUNCH15', 'PCT_SBP17', 'PCT_SFSP17', 'FDPIR15'])

In [12]:
print(df_assist10.shape)
print(df_assist15.shape)

(3142, 9)
(3142, 9)


In [13]:
pd.reset_option('display.max_rows', 10)
pd.reset_option('display.max_columns', None)

In [14]:
df_assist10

Unnamed: 0,PCT_LACCESS_POP10,PCT_SNAP12,SNAP_PART_RATE11,PCT_NSLP12,PCT_FREE_LUNCH10,PCT_REDUCED_LUNCH10,PCT_SBP12,PCT_SFSP12,FDPIR12
0,33.769657,18.908476,84.020,68.226043,36.116721,6.886107,27.206328,3.160320,0
1,19.318473,18.908476,84.020,68.226043,36.761939,5.542340,27.206328,3.160320,0
2,20.840972,18.908476,84.020,68.226043,66.388961,4.582140,27.206328,3.160320,0
3,4.559753,18.908476,84.020,68.226043,53.502278,9.253986,27.206328,3.160320,0
4,2.700840,18.908476,84.020,68.226043,43.931360,8.097947,27.206328,3.160320,0
...,...,...,...,...,...,...,...,...,...
3138,30.570505,5.956719,58.381,59.171454,25.301359,7.989228,16.422670,3.220139,0
3139,29.174527,5.956719,58.381,59.171454,14.495114,3.786645,16.422670,3.220139,0
3140,20.220414,5.956719,58.381,59.171454,27.702089,12.148047,16.422670,3.220139,0
3141,10.915407,5.956719,58.381,59.171454,32.758621,16.246684,16.422670,3.220139,0


Do I need to look at age wise distribution as well?