In [1]:
import folium
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cm

from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

In [2]:
# Spreadsheet of per-ZIP-code income figures downloaded from the IRS website
fn_irs="2016_zip_code_income_TX.xls"

# Column labels as they appear once spreadsheet row 3 is used as the header row
zip_col = "ZIP\ncode [1]"
returns_col = "Total income"
amount_col = "Unnamed: 18"
bracket_col = 'Size of adjusted gross income'

# Load the sheet and reduce it in a single pass:
#   1. drop rows that have no ZIP code or no "Total income" value,
#   2. keep only rows whose income-bracket cell is empty
#      (presumably the per-ZIP total rows; verify against the spreadsheet),
#   3. keep just the three columns needed downstream and give them short names,
#   4. store the ZIP code as a string.
# NOTE(review): the rename treats "Total income" as the return count and
# "Unnamed: 18" as the income amount; confirm against the spreadsheet layout.
# The original row labels are kept as the index here (no index is set).
df_irs = (
    pd.read_excel(fn_irs, header=3)
    .dropna(subset=[zip_col, returns_col], axis=0)
    .loc[lambda frame: frame[bracket_col].isnull(), [zip_col, returns_col, amount_col]]
    .rename(columns={
        zip_col: "zip_code",
        returns_col: "number_of_returns",
        amount_col: "total_amount",
    })
    .astype({"zip_code": 'str'})
)

# Preview the first rows of the cleaned table
df_irs.head()

Unnamed: 0,zip_code,number_of_returns,total_amount
10,75001,9030,846328
18,75002,29990,2764087
26,75006,23940,1267845
34,75007,26050,1812445
42,75009,5940,659029


In [3]:
# Average income per return for each ZIP code: total amount / number of returns.
# No unit conversion is applied, so the result is in the same unit as
# total_amount (the map legend further down labels it as thousands of dollars).
# The index is then replaced with a fresh 0..n-1 range, discarding the
# spreadsheet row labels.
df_irs = (
    df_irs
    .assign(avg_income=df_irs['total_amount'].div(df_irs['number_of_returns']))
    .reset_index(drop=True)
)


In [4]:
# Re-cast the ZIP code column to float.
# NOTE(review): presumably this makes the values comparable with a numeric
# ZIP_CODE property in the GeoJSON used for the map; confirm against that file.
df_irs = df_irs.assign(zip_code=lambda frame: frame['zip_code'].astype(float))

In [5]:
# Nominatim is already imported in the first cell; the import is repeated here
# so this cell can also be run on its own.
from geopy.geocoders import Nominatim

# Place whose coordinates are used as the centre of the map
address = 'Houston, TX'

# Look the address up through the Nominatim (OpenStreetMap) geocoding service
geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError(f"Could not geocode address: {address!r}")

latitude = location.latitude
longitude = location.longitude

In [11]:
# Bin edges for the choropleth: six integer values spaced evenly between the
# smallest and the largest average income, converted to a plain Python list.
income_lo = df_irs['avg_income'].min()
income_hi = df_irs['avg_income'].max()
threshold_scale = np.linspace(income_lo, income_hi, num=6, dtype=int).tolist()

# The integer cast can leave the last edge at or below the true maximum, so
# bump it by one to make sure the top bin still contains the largest income.
threshold_scale[-1] += 1


In [15]:
# Choropleth of average income per ZIP code (no markers)

# GeoJSON file with the ZIP-code polygons, given relative to the notebook
hou_polygon_geo_data = 'hou_zip_code.geojson'

# Base map centred on the geocoded Houston coordinates
hou_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# Shade each polygon by avg_income. Rows of df_irs are matched to polygons by
# comparing the zip_code column with each feature's properties.ZIP_CODE.
# The bin edges are passed through `bins`; `threshold_scale` is the deprecated
# spelling of the same option and triggers a DeprecationWarning.
folium.Choropleth(
    geo_data=hou_polygon_geo_data,
    data=df_irs,
    columns=['zip_code','avg_income'],
    key_on='feature.properties.ZIP_CODE',
    bins=threshold_scale,
    fill_color='YlOrRd',
    fill_opacity=0.5,
    line_opacity=0.2,
    legend_name = 'Individual Income, Tax Year 2016 (in thousands of dollars)'
).add_to(hou_map)

# Display the map as the cell output
hou_map
