In [29]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report 
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Read the CSV and Perform Basic Data Cleaning

In [30]:
# Name of the CSV file holding the food-access data (relative path)
file = "FoodAccess.csv"
# Read the CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=file)
# Display the dtype pandas inferred for every column
df.dtypes

CensusTract             int64
state_county           object
Urban                   int64
Pop2010                 int64
OHU2010                 int64
PovertyRate           float64
MedianFamilyIncome    float64
TractLOWI             float64
LA1and10                int64
dtype: object

In [31]:
# Count the missing values in every column once, then report them one per line
null_counts = df.isnull().sum()
for column, n_missing in null_counts.items():
    print(f"Column {column} has {n_missing} null values")

Column CensusTract has 0 null values
Column state_county has 0 null values
Column Urban has 0 null values
Column Pop2010 has 0 null values
Column OHU2010 has 0 null values
Column PovertyRate has 3 null values
Column MedianFamilyIncome has 748 null values
Column TractLOWI has 4 null values
Column LA1and10 has 0 null values


In [32]:
# Remove rows containing nulls
# -- MedianFamilyIncome has 748 null values (about 1.04% of the rows). That loss is
# insignificant compared to the importance of the feature for prediction, so we
# decided to drop the rows with null values rather than remove the column.
# Note: a row is dropped if ANY of its columns is null, not only MedianFamilyIncome.
df = df.dropna(axis=0, how="any")

# Define Logistic Regression Model

In [33]:
# Columns used as model inputs
feature_columns = [
    "PovertyRate",
    "MedianFamilyIncome",
    "TractLOWI",
    "Urban",
    "Pop2010",
]

# Feature matrix
X = df[feature_columns]

# Target vector: the LA1and10 column
y = df["LA1and10"]


In [34]:
# Split into train and test sets (default proportions), stratified on the target
# so both splits keep the class balance of y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=y
)

# Logistic regression classifier using the L-BFGS solver with a fixed seed
# NOTE(review): the features are passed in unscaled here - confirm that lbfgs
# converges within its default iteration limit on this data.
model = LogisticRegression(solver="lbfgs", random_state=1)

# Train on the training split only
model.fit(X_train, y_train)

# Predict labels for the held-out test split (not the training data)
predictions = model.predict(X_test)


In [35]:
# Put the predicted and true labels side by side; the rows keep y_test's index
pred = pd.DataFrame(
    {
        "Prediction": predictions,
        "Actual": y_test,
    }
)
# Preview the first five rows
pred.head(n=5)

Unnamed: 0,Prediction,Actual
36858,0,1
39902,0,0
45495,0,0
8231,0,0
64321,0,0


In [36]:
# Confusion matrix on the test split: rows are true labels, columns are predicted labels
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[10199,   926],
       [ 5430,  1391]], dtype=int64)

In [37]:
# Per-class precision / recall / F1 on the test split
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

           0       0.65      0.92      0.76     11125
           1       0.60      0.20      0.30      6821

    accuracy                           0.65     17946
   macro avg       0.63      0.56      0.53     17946
weighted avg       0.63      0.65      0.59     17946



# Define Random Forest model

In [38]:
# Standardise the features. The scaler is fitted on the training split only,
# and the same fitted scaler is then applied to both splits.
# NOTE(review): tree ensembles are generally insensitive to feature scaling, so
# this step is likely optional for the random forest - confirm before removing.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the train and test features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [39]:
# Random forest classifier: 128 trees, fixed seed for reproducible results
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=78,
)


In [40]:
# Train the random forest on the scaled training features
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)


In [41]:
# Predict labels for the scaled test features.
# Note: this rebinds `predictions`, replacing the logistic-regression predictions.
predictions = rf_model.predict(X=X_test_scaled)


In [42]:
# Confusion matrix for the random forest, wrapped in a labelled DataFrame
row_labels = ["Actual 0", "Actual 1"]
column_labels = ["Predicted 0", "Predicted 1"]

cm = confusion_matrix(y_true=y_test, y_pred=predictions)
cm_df = pd.DataFrame(data=cm, index=row_labels, columns=column_labels)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,9106,2019
Actual 1,3841,2980


In [26]:
# Fraction of test samples whose predicted label matches the true label
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)


In [44]:
# Displaying results
# Recompute the accuracy in this cell instead of relying on a value left in the
# kernel by a separately executed cell, so the printed score always matches the
# `predictions` used for the classification report below.
acc_score = accuracy_score(y_test, predictions)

print("Confusion Matrix")
print(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))


Confusion Matrix
          Predicted 0  Predicted 1
Actual 0         9106         2019
Actual 1         3841         2980
Accuracy Score : 0.6734648389613285
Classification Report
              precision    recall  f1-score   support

           0       0.70      0.82      0.76     11125
           1       0.60      0.44      0.50      6821

    accuracy                           0.67     17946
   macro avg       0.65      0.63      0.63     17946
weighted avg       0.66      0.67      0.66     17946

