In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import statsmodels.api as sm

# Read the CSV and Perform Basic Data Cleaning

In [2]:
# Load the data
file = 'FoodAccess.csv'
df = pd.read_csv(file)
# check data type
df.dtypes 

CensusTract             int64
state_county           object
Urban                   int64
Pop2010                 int64
OHU2010                 int64
PovertyRate           float64
MedianFamilyIncome    float64
TractLOWI             float64
LA1and10                int64
dtype: object

In [3]:
# Find null values
for column in df.columns :
    print(f"Column {column} has {df[column].isnull().sum()} null values")

Column CensusTract has 0 null values
Column state_county has 0 null values
Column Urban has 0 null values
Column Pop2010 has 0 null values
Column OHU2010 has 0 null values
Column PovertyRate has 3 null values
Column MedianFamilyIncome has 748 null values
Column TractLOWI has 4 null values
Column LA1and10 has 0 null values


In [4]:
# Remove null rows
#--Median Family Income has 748 null values which is 1.04%. This figure is insignificant compared to the importance of the
#feature towards prediction. So we deciide to drop null values than remove the column.
df=df.dropna()

In [5]:
#Find duplicate entries
print(f"Duplicate entries: {df.duplicated().sum()}")

Duplicate entries: 0


# Define Model

In [6]:
# Create features
X = df.drop(columns=['CensusTract','state_county',"LA1and10"]) 

# Create target
y = df["LA1and10"]

# Split model
X_train, X_test, y_train, y_test = train_test_split(X,
    y, random_state=1)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


### Logistic Regression

In [7]:
# Define model-sklearn
model = LogisticRegression(solver='lbfgs', random_state=1)
# fit model

# Fit model
model.fit(X_train, y_train)
# make predictions on the entire training dataset
predictions = model.predict(X_test)

# Evaluate the model
y_pred = model.predict(X_test)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.646


In [8]:
cm_logreg = confusion_matrix(y_test,y_pred)
cm_logreg

array([[10141,  1006],
       [ 5343,  1456]])

In [9]:
#Run Logistic Regression on scaled data
# Fit model
model.fit(X_train_scaled, y_train)
# make predictions on the entire training dataset
predictions_scaled = model.predict(X_test_scaled)

# Evaluate the model
y_pred_scaled = model.predict(X_test_scaled)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred_scaled):.3f}")

 Logistic regression model accuracy: 0.687


In [10]:
confusion_matrix(y_test,y_pred_scaled)

array([[9760, 1387],
       [4223, 2576]])

In [11]:
pd=pd.DataFrame({"Prediction": predictions, "Scaled Prediction":predictions_scaled, "Actual": y_test})
pd.head()

Unnamed: 0,Prediction,Scaled Prediction,Actual
11117,0,1,1
55918,0,1,1
49258,0,0,0
58139,1,1,1
41461,0,1,1


In [12]:
# Run Logistic Regression on scaled data using stats model
# Fit model
smodel = sm.Logit(y_train, X_train_scaled).fit()
# print summary table
print(smodel.summary())

Optimization terminated successfully.
         Current function value: 0.640126
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:               LA1and10   No. Observations:                53836
Model:                          Logit   Df Residuals:                    53830
Method:                           MLE   Df Model:                            5
Date:                Thu, 18 Nov 2021   Pseudo R-squ.:                 0.03638
Time:                        20:09:10   Log-Likelihood:                -34462.
converged:                       True   LL-Null:                       -35763.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1             0.5188      0.010     53.192      0.000       0.500       0.538
x2             0.7032      0.

In [13]:
# Evaluate the model
sy_pred = smodel.predict(X_test_scaled)
sprediction = list(map(round, sy_pred))
print(f" Logistic regression model accuracy: {accuracy_score(y_test,sprediction):.3f}")

 Logistic regression model accuracy: 0.627


In [14]:
confusion_matrix(y_test,sprediction)

array([[6506, 4641],
       [2054, 4745]])

In [15]:
feature_importances=smodel.conf_int()[1]
feature_importances

x1    0.537887
x2    0.755263
x3   -0.040950
x4   -0.111059
x5   -0.180324
x6   -0.394490
Name: 1, dtype: float64

### Undersampling

In [16]:
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

In [17]:
Counter(y)

Counter({1: 27282, 0: 44500})

In [18]:
rus = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 20483, 1: 20483})

In [19]:
# Retry Logistic Regression with undersampled data
model.fit(X_resampled, y_resampled)
y_pred_resamp = model.predict(X_test)
print(f" Resampled logistic regression model accuracy: {accuracy_score(y_test,y_pred_resamp):.3f}")
confusion_matrix(y_test, y_pred_resamp)

 Resampled logistic regression model accuracy: 0.597


array([[6772, 4375],
       [2860, 3939]])

### SMOTEEN

In [20]:
# Use the SMOTEENN technique to perform combination sampling on the data
# Count the resampled classes
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({0: 11753, 1: 17491})

In [21]:
st_model = LogisticRegression(solver='lbfgs', random_state=1)
st_model.fit(X_resampled, y_resampled)
st_y_pred = st_model.predict(X_test)

In [22]:
print(f"SMOTEEN logistic regression model accuracy: {accuracy_score(y_test,y_pred_resamp):.3f}")
confusion_matrix(y_test, st_y_pred)

 SMOTEEN logistic regression model accuracy: 0.597


array([[4894, 6253],
       [1939, 4860]])

### SVM

In [32]:
# Create the SVM model
svm = SVC(kernel='linear')

# Train the model
svm.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred_svm = svm.predict(X_test_scaled)
print(f" SVM model accuracy: {accuracy_score(y_test,y_pred_svm):.3f}")

confusion_matrix(y_test, y_pred_svm)

 SVM model accuracy: 0.675


array([[10324,   823],
       [ 5012,  1787]])

In [33]:
# Create the SVM model
svm_rbf = SVC()

# Train the model
svm_rbf.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred_svmrbf = svm_rbf.predict(X_test_scaled)
print(f" SVM model accuracy: {accuracy_score(y_test,y_pred_svmrbf):.3f}")

confusion_matrix(y_test, y_pred_svmrbf)

 SVM model accuracy: 0.698


array([[9684, 1463],
       [3952, 2847]])

### Random Forest

In [40]:
# Create Random Forest Classifier model
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [41]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)
# Making predictions using the testing data.
rf_y_pred = rf_model.predict(X_test_scaled)
# Evaluate the model
print(f"Random Forest model accuracy: {accuracy_score(y_test,rf_y_pred):.3f}")

Random Forest model accuracy: 0.691


In [27]:
confusion_matrix(y_test,rf_y_pred)

array([[9207, 1940],
       [3620, 3179]])

In [28]:
rf_model.feature_importances_


array([0.06572456, 0.20359398, 0.19739847, 0.17111248, 0.18675653,
       0.17541399])

### Neural Network

In [29]:
import tensorflow as tf

In [30]:
# Define neural network model
nn_model = tf.keras.models.Sequential()
nn_model.add(tf.keras.layers.Dense(units=15, activation="relu", input_dim=6))
nn_model.add(tf.keras.layers.Dense(units=6, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
561/561 - 1s - loss: 0.5699 - accuracy: 0.7038
Loss: 0.5699369311332703, Accuracy: 0.7038336992263794


In [42]:
print(f'Logistic Regression: {accuracy_score(y_test,y_pred):.3f}')
print(f'Scaled Logistic Regression: {accuracy_score(y_test,y_pred_scaled):.3f}')
print(f"Stats model Logistic regression model accuracy: {accuracy_score(y_test,sprediction):.3f}")
print(f'Undersampled Logistic Regression: {accuracy_score(y_test,y_pred_resamp):.3f}')
print(f"SMOTEEN logistic regression model accuracy: {accuracy_score(y_test,y_pred_resamp):.3f}")
print(f'SVM Accuracy: {accuracy_score(y_test,y_pred_svm):.3f}')
print(f"SVM-rbf model accuracy: {accuracy_score(y_test,y_pred_svmrbf):.3f}")
print(f"Random Forest model accuracy: {accuracy_score(y_test,rf_y_pred):.3f}")
print(f"Neural Net Loss: {model_loss}, Accuracy: {model_accuracy}")

Logistic Regression: 0.646
Scaled Logistic Regression: 0.687
Stats model Logistic regression model accuracy: 0.627
Undersampled Logistic Regression: 0.597
SMOTEEN logistic regression model accuracy: 0.597
SVM Accuracy: 0.675
SVM-rbf model accuracy: 0.698
Random Forest model accuracy: 0.691
Neural Net Loss: 0.5699369311332703, Accuracy: 0.7038336992263794
