In [1]:
import folium
import requests
import pandas as pd
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

### Create a dataframe containing income data by ZIP code
The data was downloaded from the IRS (tax year 2016, Texas). [Link](https://www.irs.gov/statistics/soi-tax-stats-individual-income-tax-statistics-2016-zip-code-data-soi)

In [2]:
# Workbook downloaded from the IRS website (see the link in the section text).
fn_irs="2016_zip_code_income_TX.xls"

# Read the sheet into a dataframe; the fourth row (header=3, zero-based)
# supplies the column labels.
df_irs = pd.read_excel(fn_irs,header=3)

# Keep only the rows that carry total income data for each zip code:
# rows with both a zip code and a 'Total income' value, and with no
# 'Size of adjusted gross income' entry.
# Reassigning (instead of inplace=True) keeps the cell safe to re-run.
df_irs = df_irs.dropna(subset=["ZIP\ncode [1]","Total income"],axis=0)
df_irs = df_irs[df_irs['Size of adjusted gross income'].isnull()]

# Keep only the zip code column and the two columns relabelled below.
# The explicit .copy() makes this an independent frame, so the column
# assignment further down cannot raise pandas' SettingWithCopyWarning.
df_irs = df_irs.loc[:,['ZIP\ncode [1]','Total income','Unnamed: 18']].copy()

# Rename the columns positionally. 'Total income' becomes number_of_returns
# and 'Unnamed: 18' becomes total_amount.
# NOTE(review): presumably the workbook has a two-level header where
# "Number of returns" and "Amount" sit under "Total income" -- verify against
# the spreadsheet.
# The zip code is stored as text here; it becomes the index only after the
# merge with the coordinates table.
df_irs.columns = ["zip_code","number_of_returns","total_amount"]
df_irs["zip_code"] = df_irs["zip_code"].astype('str')

df_irs.head()

Unnamed: 0,zip_code,number_of_returns,total_amount
10,75001,9030,846328
18,75002,29990,2764087
26,75006,23940,1267845
34,75007,26050,1812445
42,75009,5940,659029


In [3]:
# Average income per return: total amount divided by the number of returns,
# scaled by 1000 to express the result in dollars.
# NOTE(review): the factor assumes total_amount is reported in thousands of
# dollars -- confirm against the IRS file documentation.
df_irs['avg_income'] = 1000 * (df_irs['total_amount'] / df_irs['number_of_returns'])

df_irs.head()

Unnamed: 0,zip_code,number_of_returns,total_amount,avg_income
10,75001,9030,846328,93724.0
18,75002,29990,2764087,92167.0
26,75006,23940,1267845,52959.3
34,75007,26050,1812445,69575.6
42,75009,5940,659029,110948.0


### Create a dataframe containing coordinate data by ZIP code
Data was downloaded from GitHub. [Link](https://gist.github.com/erichurst/7882666/) 

In [4]:
# Text file with one row per zip code and its coordinates.
fn_cord = "zip_lat_lng.txt"

# Parse the comma-separated file into a dataframe; the 'ZIP' column is read
# as object (text) rather than letting pandas infer a numeric type.
df_cord = pd.read_csv(fn_cord, dtype={"ZIP": object})

# Relabel the three columns, in file order, so the zip code column name
# matches the one used in the income table.
df_cord.columns = [
    "zip_code",
    "latitude",
    "longitude",
]

df_cord.head()

Unnamed: 0,zip_code,latitude,longitude
0,601,18.180555,-66.749961
1,602,18.361945,-67.175597
2,603,18.455183,-67.119887
3,606,18.158345,-66.932911
4,610,18.295366,-67.125135


### Merge both dataframes and use zip code as index

In [5]:
# Inner join on zip_code: only zip codes present in both the income table
# and the coordinates table survive. The zip code then becomes the index.
df_merge = pd.merge(
    df_irs,
    df_cord,
    on=['zip_code'],
    how='inner',
).set_index("zip_code")

df_merge.head()

Unnamed: 0_level_0,number_of_returns,total_amount,avg_income,latitude,longitude
zip_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
75001,9030,846328,93724.0,32.960047,-96.838522
75002,29990,2764087,92167.0,33.089854,-96.6086
75006,23940,1267845,52959.3,32.962141,-96.898585
75007,26050,1812445,69575.6,33.005262,-96.896742
75009,5940,659029,110948.0,33.338899,-96.752977


In [6]:
# Place name to resolve into map-centre coordinates.
address = 'Houston, TX'

# Look the address up through the Nominatim geocoding service.
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; stop here with a
# clear message instead of failing below with an opaque AttributeError.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Houston are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Houston are 29.7589382, -95.3676974.


In [7]:
# Base map of Houston, centred on the geocoded latitude/longitude.
map_Houston = folium.Map(
    location=[latitude, longitude],
    zoom_start=12,
)

In [8]:
# Place one circle marker on the map for every row of the merged table;
# the row's index value (the zip code) is shown in the marker's popup.
for lat, lng, label in zip(df_merge['latitude'], df_merge['longitude'], df_merge.index):
    # Wrap the zip code in a popup shown when the marker is clicked.
    label = folium.Popup(label, parse_html=True)
    map_Houston.add_child(
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            parse_html=False,
        )
    )

# Last expression of the cell: render the map.
map_Houston