In [1]:
import folium
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cm

from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans

### Create a dataframe contains income data by zip code 
Data was downloaded from IRS (year 2016, Texas) [Link](https://www.irs.gov/statistics/soi-tax-stats-individual-income-tax-statistics-2016-zip-code-data-soi)

In [2]:
# file downloaded from IRS website
fn_irs="2016_zip_code_income_TX.xls"

# read excel file into dataframe
df_irs = pd.read_excel(fn_irs,header=3)

# keep only rows that contain total income data for each zip code
df_irs.dropna(subset=["ZIP\ncode [1]","Total income"],axis=0,inplace=True)
df_irs = df_irs[df_irs['Size of adjusted gross income'].isnull()]

# keep only columns that include zip code, number of returns, total amount of income
df_irs = df_irs.loc[:,['ZIP\ncode [1]','Total income','Unnamed: 18']]

# rename columns and set zip code as index
df_irs.columns = ["zip_code","number_of_returns","total_amount"]
df_irs["zip_code"] = df_irs["zip_code"].astype('str')

df_irs.head()

Unnamed: 0,zip_code,number_of_returns,total_amount
10,75001,9030,846328
18,75002,29990,2764087
26,75006,23940,1267845
34,75007,26050,1812445
42,75009,5940,659029


In [3]:
# Calculate the average income per return, convert to dollar
df_irs['avg_income'] = df_irs['total_amount']/df_irs['number_of_returns']*1000

df_irs.head()

Unnamed: 0,zip_code,number_of_returns,total_amount,avg_income
10,75001,9030,846328,93724.0
18,75002,29990,2764087,92167.0
26,75006,23940,1267845,52959.3
34,75007,26050,1812445,69575.6
42,75009,5940,659029,110948.0


### Create a dataframe contains coordinate data by zip code in Houston
Zip code coordinate data was downloaded from GitHub. [Link](https://gist.github.com/erichurst/7882666/)  
Zip code city data was downloaded from mongabay.com. [Link](https://data.mongabay.com/igapo/zip_codes/TX.htm)

In [4]:
# file contain zip code and coordinate
fn_cord = "zip_lat_lng.txt"

# read excel file into dataframe
df_cord = pd.read_csv(fn_cord, dtype={'ZIP': object})
df_cord.columns = ["zip_code","latitude","longitude"]


# file contain zip code and city
fn_hou = "TX_zip.xls"

# read excel file into dataframe
df_hou = pd.read_excel(fn_hou)

# get the list of zipcode for Houston
houston_zipcode = df_hou[df_hou['City'].isin(['Houston'])]['ZIP Code'].tolist()

# Extract dataframe for Houston based on Houston Zipcodes
df_cord = df_cord[df_cord['zip_code'].isin(list(map(str,houston_zipcode)))]
df_cord.shape

(97, 3)

### Merge both dataframes and use zip code as index

In [5]:
df_hou = pd.merge(df_irs,df_cord,how='inner',on=['zip_code']).reset_index(drop=True)

df_hou.head()

Unnamed: 0,zip_code,number_of_returns,total_amount,avg_income,latitude,longitude
0,77002,4630,737038,159187.0,29.756845,-95.365652
1,77003,5300,334463,63106.2,29.749778,-95.345885
2,77004,12620,890669,70576.0,29.724893,-95.363752
3,77005,10210,3480851,340926.0,29.718435,-95.423555
4,77006,11780,1516374,128724.0,29.74097,-95.391301


### Lable Houston Zip code, colored by average income

In [6]:
address = 'Houston, TX'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Houston are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Houston are 29.7589382, -95.3676974.


In [7]:
# create map of Houston using latitude and longitude values
map_Houston = folium.Map(location=[latitude, longitude], zoom_start=11)

# color by average income
df_hou['normalized_income'] = (df_hou['avg_income']-df_hou['avg_income'].min())/(df_hou['avg_income'].max()-df_hou['avg_income'].min())

cmap_colors = plt.get_cmap('YlOrRd')

for lat, lng, label, c, avg in zip(df_hou['latitude'], df_hou['longitude'], df_hou['zip_code'], df_hou['normalized_income'], df_hou['avg_income']):
    label = folium.Popup(label +"\n"+ str(int(avg)), parse_html=True)

    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        fill=True,
        color = colors.to_hex(cmap_colors(c)),
        fill_color=colors.to_hex(cmap_colors(c)),
        fill_opacity=0.7).add_to(map_Houston)  
    
map_Houston


### Define Foursquare Credentials and Version

In [8]:
# @hidden_cell
CLIENT_ID = 'CRDS4XWDD4KPADKBVJP1JE5NW3AYATHVMUCCUC4ZFNIWNJZ5' # your Foursquare ID
CLIENT_SECRET = 'ZBKR4ETVK14UV2SMZRJ1OCPSLAZFF2K2RPBEWFEKWHFVFHTE' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [9]:
def getNearbyVenues(names, latitudes, longitudes, radius=200, LIMIT=200):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['zip_code', 
                  'zip_code Latitude', 
                  'zip_code Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [10]:
hou_venues = getNearbyVenues(df_hou['zip_code'],df_hou['latitude'],df_hou['longitude'])
hou_venues.shape

KeyError: 'groups'

In [None]:
# one hot encoding
hou_onehot = pd.get_dummies(hou_venues[['Venue Category']], prefix="", prefix_sep="")

# add zip code column back to dataframe
hou_onehot['zip_code'] = hou_venues['zip_code'] 
# move Zip code column to the first column
fixed_columns = [hou_onehot.columns[-1]] + list(hou_onehot.columns[:-1])
hou_onehot = hou_onehot[fixed_columns]

hou_onehot.head()

In [None]:
hou_grouped = hou_onehot.groupby('zip_code').mean().reset_index()
hou_grouped.head()

In [None]:
num_top_venues = 5

for hood in hou_grouped['zip_code']:
    print("----"+hood+"----")
    temp = hou_grouped[hou_grouped['zip_code'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

### Run k-means to cluster the Zip code

In [None]:
# set number of clusters
kclusters = 10

hou_grouped_clustering = hou_grouped.drop('zip_code', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(hou_grouped_clustering)

# add clustering labels
hou_grouped.insert(0, 'Cluster Labels', kmeans.labels_)

In [None]:
hou_grouped = hou_grouped.join(df_hou.set_index('zip_code'), on='zip_code')
hou_grouped.head()

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# color by average income
cmap_colors = plt.get_cmap('YlOrRd')



# add markers to the map
markers_colors = []
for lat, lon, poi, cluster, avg, c in zip(hou_grouped['latitude'], hou_grouped['longitude'], hou_grouped['zip_code'], hou_grouped['Cluster Labels'], hou_grouped['avg_income'], hou_groped['normalized_income']):
    label = folium.Popup(str(poi) + '\n cluster' + str(cluster)+'\n'+str(int(avg)), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=colors.to_hex(cmap_colors(c)),
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [None]:
hou_grouped_mean = hou_grouped.groupby('Cluster Labels').mean()
hou_grouped_mean

In [None]:
hou_grouped_mean = hou_grouped_mean.drop(columns = ['latitude','longitude'])

x=pd.DataFrame(hou_grouped_mean).T
rslt = pd.DataFrame(np.zeros((0,5)), columns=['top1','top2','top3','top4','top5'])
for i in x.columns:
    df1row = pd.DataFrame(x.nlargest(5, i).index.tolist(), index=['top1','top2','top3','top4','top5']).T
    rslt = pd.concat([rslt, df1row], axis=0)

rslt.index=hou_grouped_mean.index
rslt