***Import Libraries***

In [None]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import datetime as dt
import statsmodels.api as sm


pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.



***Data Cleaning***


> In this step, I downloaded the datasets from my GitHub repositories and extracted the columns that I wanted to analyze. The datasets describe median household income, walk score, healthy food availability index, and life expectancy for Baltimore Community Statistical Areas, along with the cluster assigned to each area by a cluster analysis performed in Excel.


In [None]:
#locations of the raw CSV files in the author's GitHub repositories
url_med = 'https://raw.githubusercontent.com/vchen19/baltimore_walk_score_and_income/main/Median_Household_Income.csv'
url_walk = 'https://raw.githubusercontent.com/vchen19/baltimore_walk_score_and_income/main/Walk_Score.csv'
url_food = 'https://raw.githubusercontent.com/vchen19/baltimore_food_deserts/main/Average_Healthy_Food_Availability_Index.csv'
url_life = 'https://raw.githubusercontent.com/vchen19/baltimore_food_deserts/main/Life_Expectancy.csv'
url_clusters = 'https://raw.githubusercontent.com/vchen19/baltimore-food-insecurity/main/Analysis%20Files/Clusters%201.csv'

#load each table, keep only the columns of interest, and renumber rows from 0
#note: reindex(columns=...) fills a requested column that is absent from the
#CSV with NaN instead of raising an error
#household income
df_med = pd.read_csv(url_med).reindex(columns=['mhhi11']).reset_index(drop=True)
#walk score (also carries the identifier columns for each area)
df_walk = (
    pd.read_csv(url_walk)
    .reindex(columns=['OBJECTID', 'CSA2010', 'wlksc11'])
    .reset_index(drop=True)
)
#healthy food availability index
df_food = pd.read_csv(url_food).reindex(columns=['hfai12']).reset_index(drop=True)
#life expectancy
df_life = pd.read_csv(url_life).reindex(columns=['lifexp11']).reset_index(drop=True)
#cluster assignment
df_clusters = pd.read_csv(url_clusters).reindex(columns=['Cluster']).reset_index(drop=True)

#place the tables side by side in one dataset; rows are matched by position,
#not by an area key
#NOTE(review): this assumes every CSV lists the areas in the same order - confirm
df = pd.concat(
    [df_walk, df_med, df_food, df_life, df_clusters],
    axis=1,
).reset_index(drop=True)

**Plotting using Plotly Express**


> In this step, I plotted walk score against life expectancy with the size corresponding to income. Then, I plotted walk score against life expectancy with the size corresponding to healthy food availability index. Then, I plotted income against life expectancy with the size corresponding to healthy food availability index. The color corresponded to cluster number from cluster analysis performed in Excel for all of these plots. This allowed me to visually analyze whether there were any interesting trends between all of these variables.

In [None]:
#display text for every dataframe column that appears in a plot
COLUMN_LABELS = {
  "wlksc11": "Walk Score 2011",
  "lifexp11": "Life Expectancy 2011",
  "Cluster": "Cluster",
  "mhhi11": "Median Household Income 2011",
  "hfai12": "Healthy Food Availability Index 2012",
}


def cluster_bubble_chart(data, x, y, size, title):
  """Build a scatter plot coloured by the "Cluster" column.

  Parameters:
    data: dataframe holding the columns named below plus "Cluster".
    x: name of the column drawn on the horizontal axis.
    y: name of the column drawn on the vertical axis.
    size: name of the column that controls the marker size.
    title: text shown above the plot.

  Returns:
    The figure created by px.scatter.

  Raises:
    KeyError: if x, y or size has no entry in COLUMN_LABELS.
  """
  #only hand plotly the labels for the columns this particular plot uses
  plotted = (x, y, "Cluster", size)
  return px.scatter(
    data,
    x=x,
    y=y,
    color="Cluster",
    size=size,
    labels={column: COLUMN_LABELS[column] for column in plotted},
    title=title,
  )


#walk score vs. life expectancy, marker size = income
fig = cluster_bubble_chart(
  df, "wlksc11", "lifexp11", "mhhi11",
  "2011 Walk Score, Median Household Income, and Life Expectancy",
)
fig.show()

#walk score vs. life expectancy, marker size = healthy food availability
fig = cluster_bubble_chart(
  df, "wlksc11", "lifexp11", "hfai12",
  "2011/2012 Walk Score, Healthy Food Availability Index, and Life Expectancy",
)
fig.show()

#income vs. life expectancy, marker size = healthy food availability
fig = cluster_bubble_chart(
  df, "mhhi11", "lifexp11", "hfai12",
  "2011/2012 Median Household Income, Healthy Food Availability Index, and Life Expectancy",
)
fig.show()


**Cluster Analysis**


> Count the number of neighborhoods in each cluster.



In [None]:
#cluster numbers to tally (the printed results cover clusters 1 through 5)
cluster_ids = (1, 2, 3, 4, 5)

#count the rows belonging to each cluster by comparing the whole column at once;
#unlike a df.loc[i, ...] loop this does not depend on the index being 0..n-1
cluster_counts = {
  cluster_id: int((df["Cluster"] == cluster_id).sum())
  for cluster_id in cluster_ids
}

#keep the individual totals available under their original variable names
one, two, three, four, five = (cluster_counts[cluster_id] for cluster_id in cluster_ids)

#print results
for cluster_id in cluster_ids:
  print("Cluster %s: %s" % (cluster_id, cluster_counts[cluster_id]))

Cluster 1: 10
Cluster 2: 21
Cluster 3: 4
Cluster 4: 16
Cluster 5: 4
