In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import statsmodels.api as sm

# Read the CSV and Perform Basic Data Cleaning

In [2]:
# Load the data
file = 'FoodAccessDemographics.csv'
df = pd.read_csv(file)
# check data type
df.dtypes 

CensusTract             int64
Pop2010                 int64
MedianFamilyIncome    float64
PovertyRate           float64
PctWhite              float64
PctBlack              float64
PctAsian              float64
PctHispanic           float64
PctOtherMinority      float64
LA1and10                int64
dtype: object

In [3]:
# Find null values
for column in df.columns :
    print(f"Column {column} has {df[column].isnull().sum()} null values")

Column CensusTract has 0 null values
Column Pop2010 has 0 null values
Column MedianFamilyIncome has 748 null values
Column PovertyRate has 3 null values
Column PctWhite has 4 null values
Column PctBlack has 4 null values
Column PctAsian has 4 null values
Column PctHispanic has 4 null values
Column PctOtherMinority has 4 null values
Column LA1and10 has 0 null values


In [4]:
# Remove null rows
#--Median Family Income has 748 null values which is 1.04%. This figure is insignificant compared to the importance of the
#feature towards prediction. So we deciide to drop null values than remove the column.
df=df.dropna()

In [5]:
#Find duplicate entries
print(f"Duplicate entries: {df.duplicated().sum()}")

Duplicate entries: 0


# Define Model

In [6]:
# Create features
X = df.drop(columns=['CensusTract',"LA1and10"]) 

# Create target
y = df["LA1and10"]

# Split model
X_train, X_test, y_train, y_test = train_test_split(X,
    y, random_state=1)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)


### Logistic Regression

In [7]:
# Define model-sklearn
model = LogisticRegression(solver='lbfgs', random_state=1)
# fit model

# Fit model
model.fit(X_train, y_train)
# make predictions on the entire training dataset
predictions = model.predict(X_test)

# Evaluate the model
y_pred = model.predict(X_test)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.640


In [8]:
cm_logreg = confusion_matrix(y_test,y_pred)
cm_logreg

array([[10363,   784],
       [ 5681,  1118]])

In [9]:
#Run Logistic Regression on scaled data
# Fit model
model.fit(X_train_scaled, y_train)
# make predictions on the entire training dataset
predictions_scaled = model.predict(X_test_scaled)

# Evaluate the model
y_pred_scaled = model.predict(X_test_scaled)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred_scaled):.3f}")

 Logistic regression model accuracy: 0.652


In [10]:
confusion_matrix(y_test,y_pred_scaled)

array([[9919, 1228],
       [5015, 1784]])

In [11]:
pd=pd.DataFrame({"Prediction": predictions, "Scaled Prediction":predictions_scaled, "Actual": y_test})
pd.head()

Unnamed: 0,Prediction,Scaled Prediction,Actual
11117,0,0,1
55918,0,1,1
49258,0,0,0
58139,1,1,1
41461,0,0,1


In [12]:
# Run Logistic Regression on scaled data using stats model
# Fit model
smodel = sm.Logit(y_train, X_train_scaled).fit()
# print summary table
print(smodel.summary())

         Current function value: 0.657651
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:               LA1and10   No. Observations:                53836
Model:                          Logit   Df Residuals:                    53828
Method:                           MLE   Df Model:                            7
Date:                Thu, 18 Nov 2021   Pseudo R-squ.:                 0.01000
Time:                        19:14:14   Log-Likelihood:                -35405.
converged:                      False   LL-Null:                       -35763.
Covariance Type:            nonrobust   LLR p-value:                3.458e-150
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1             0.4207      0.010     41.590      0.000       0.401       0.441
x2             0.1101      0.013      8.795      0.000       0.08



In [13]:
# Evaluate the model
sy_pred = smodel.predict(X_test_scaled)
sprediction = list(map(round, sy_pred))
print(f" Logistic regression model accuracy: {accuracy_score(y_test,sprediction):.3f}")

 Logistic regression model accuracy: 0.606


In [14]:
confusion_matrix(y_test,sprediction)

array([[6492, 4655],
       [2422, 4377]])

In [15]:
feature_importances=smodel.conf_int()[1]
feature_importances

x1         0.440564
x2         0.134688
x3        -0.056024
x4    777189.739651
x5    672246.343908
x6    255190.276934
x7        -0.222458
x8    345693.676833
Name: 1, dtype: float64

### Undersampling

In [16]:
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler

In [17]:
Counter(y)

Counter({1: 27282, 0: 44500})

In [18]:
rus = RandomUnderSampler(random_state=1)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)
Counter(y_resampled)

Counter({0: 20483, 1: 20483})

In [19]:
# Retry Logistic Regression with undersampled data
model.fit(X_resampled, y_resampled)
y_pred_resamp = model.predict(X_test)
print(f" Resampled logistic regression model accuracy: {accuracy_score(y_test,y_pred_resamp):.3f}")
confusion_matrix(y_test, y_pred_resamp)

 Resampled logistic regression model accuracy: 0.576


array([[5986, 5161],
       [2444, 4355]])

### SMOTEEN

In [20]:
# Use the SMOTEENN technique to perform combination sampling on the data
# Count the resampled classes
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({0: 11404, 1: 14126})

In [21]:
st_model = LogisticRegression(solver='lbfgs', random_state=1)
st_model.fit(X_resampled, y_resampled)
st_y_pred = st_model.predict(X_test)

In [22]:
print(f" SMOTEEN logistic regression model accuracy: {accuracy_score(y_test,y_pred_resamp):.3f}")
confusion_matrix(y_test, st_y_pred)

 SMOTEEN logistic regression model accuracy: 0.576


array([[5112, 6035],
       [2004, 4795]])

### SVM

In [29]:
# Create the SVM model
svm = SVC(kernel='linear')

# Train the model
svm.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred_svm = svm.predict(X_test_scaled)
print(f" SVM model accuracy: {accuracy_score(y_test,y_pred_svm):.3f}")

confusion_matrix(y_test, y_pred_svm)

 SVM model accuracy: 0.621


array([[11147,     0],
       [ 6799,     0]])

In [30]:
# Create the SVM model
svm_rbf = SVC()

# Train the model
svm_rbf.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred_svmrbf = svm_rbf.predict(X_test_scaled)
print(f" SVM model accuracy: {accuracy_score(y_test,y_pred_svmrbf):.3f}")

confusion_matrix(y_test, y_pred_svmrbf)

 SVM model accuracy: 0.667


array([[10257,   890],
       [ 5089,  1710]])

### Random Forest

In [23]:
# Create Random Forest Classifier model
rf_model = RandomForestClassifier(n_estimators=100, random_state=78) 

In [24]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)
# Making predictions using the testing data.
rf_y_pred = rf_model.predict(X_test_scaled)
# Evaluate the model
print(f"Random Forest model accuracy: {accuracy_score(y_test,rf_y_pred):.3f}")

Random Forest model accuracy: 0.671


In [25]:
confusion_matrix(y_test,rf_y_pred)

array([[9378, 1769],
       [4129, 2670]])

In [26]:
rf_model.feature_importances_


array([0.14987131, 0.11764574, 0.11644124, 0.12105239, 0.11580487,
       0.13002688, 0.1258561 , 0.12330147])

### Neural Network

In [27]:
import tensorflow as tf

In [31]:
# Define neural network model
nn_model = tf.keras.models.Sequential()
nn_model.add(tf.keras.layers.Dense(units=14, activation="relu", input_dim=8))
#nn_model.add(tf.keras.layers.Dense(units=6, activation="relu"))
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
561/561 - 1s - loss: 0.5995 - accuracy: 0.6689
Loss: 0.5995128750801086, Accuracy: 0.6688956022262573


In [32]:
print(f'Logistic Regression: {accuracy_score(y_test,y_pred):.3f}')
print(f'Scaled Logistic Regression: {accuracy_score(y_test,y_pred_scaled):.3f}')
print(f'Undersampled Logistic Regression: {accuracy_score(y_test,y_pred_resamp):.3f}')
print(f'SVM Accuracy: {accuracy_score(y_test,y_pred_svm):.3f}')

Logistic Regression: 0.640
Scaled Logistic Regression: 0.652
Undersampled Logistic Regression: 0.576
SVM Accuracy: 0.621
