## 1. Import dependencies

In [1]:
import os

import requests
import pandas as pd
import numpy as np
import scipy.spatial
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


## 2. Read data from sources

**1) US zip codes and their corresponding coordinates.**

In [2]:
# US zip codes with coordinates (file name: "US Zip Codes from 2013 Government Data").
# ZIP is read as text so it can be matched against the scraped zip-code strings later on.
zipcodes_url = (
    "https://gist.githubusercontent.com/erichurst/7882666/raw/"
    "5bdc46db47d9515269ab12ed6fb2850377fd869e/"
    "US%2520Zip%2520Codes%2520from%25202013%2520Government%2520Data"
)
df_zipcodes = pd.read_csv(zipcodes_url, dtype={'ZIP': object})
df_zipcodes.head()

Unnamed: 0,ZIP,LAT,LNG
0,601,18.180555,-66.749961
1,602,18.361945,-67.175597
2,603,18.455183,-67.119887
3,606,18.158345,-66.932911
4,610,18.295366,-67.125135


In [3]:
# check the size of the newly loaded zip code dataframe: (rows, columns)
df_zipcodes.shape

(33144, 3)

In [4]:
# rename columns by position (ZIP, LAT, LNG) so that 'zip code' matches
# the column name used for the Houston neighborhoods, which is the later join key
df_zipcodes.columns = ['zip code', 'latitude', 'longitude']
df_zipcodes.head()

Unnamed: 0,zip code,latitude,longitude
0,601,18.180555,-66.749961
1,602,18.361945,-67.175597
2,603,18.455183,-67.119887
3,606,18.158345,-66.932911
4,610,18.295366,-67.125135


**2) Zip codes for neighborhoods in Houston**

In [5]:
# Scrape useful data from HTML web page by BeautifulSoup4
url = "https://www.houstoniamag.com/articles/2017/3/24/neighborhoods-by-the-numbers-real-estate-data-2017"
# Bound the wait and stop on an HTTP error page instead of parsing it as if it were the article.
page = requests.get(url, timeout=30)
page.raise_for_status()
# `result` holds the parsed document only (it is no longer reused for the raw text).
result = BeautifulSoup(page.text, 'html.parser')

In [6]:
# Walk the body of the first table on the page: one <tr> per neighborhood.
tag = result.table.tbody
rows = []
for row in tag.find_all('tr'):
    # the row text is newline-separated; only the first two fields are kept
    cells = row.text.strip().split("\n")
    rows.append(cells[:2])

# make dataframe with the scraped data (first field -> neighborhood, second -> zip code)
df_houston_neighborhoods = pd.DataFrame(rows)
df_houston_neighborhoods.columns = ['neighborhood', 'zip code']
df_houston_neighborhoods.head()

Unnamed: 0,neighborhood,zip code
0,1960/Cypress,77065
1,Aldine Area,77039
2,Alief,77072
3,Alvin North,77511
4,Alvin South,77511


In [7]:
# check the dataframe size: (rows, columns)
df_houston_neighborhoods.shape

(147, 2)

## 3. Data processing

**1) Modify data of Houston neighborhoods with zip code**

In [8]:
# combine neighborhoods with the same zip code, separated by ','
# (coordinates are joined per zip code later, so neighborhoods sharing a zip code
# are treated as a single area from here on); the result is a Series indexed by zip code
df_houston_grouped = df_houston_neighborhoods.groupby(['zip code'])['neighborhood'].apply(",".join) 
df_houston_grouped

zip code
77003                                 East End Revitalized
77004                            Midtown-Houston,Riverside
77005    Rice/Museum District,West University/Southside...
77006                                             Montrose
77007    Cottage Grove,Memorial Park,Rice Military/Wash...
77008        Heights/Greater Heights,Timbergrove/Lazybrook
77015                            Garden Oaks,North Channel
77018        Oak Forest East Area,Shepherd Park Plaza Area
77019                                      River Oaks Area
77020                                        Denver Harbor
77021                                      University Area
77024                  Memorial Close In,Memorial Villages
77025              Braeswood Place,Knollwood/Woodside Area
77027      Highland Village/Midlane,Royden Oaks/Afton Oaks
77028                                    Northeast Houston
77030                                  Medical Center Area
77031                                          

In [9]:
# make a new dataframe with grouped data:
# reset_index() turns the 'zip code' index of the grouped Series into a regular column,
# giving the columns ['zip code', 'neighborhood'] without hand-built intermediate lists
grouped_dataframe = df_houston_grouped.reset_index()
grouped_dataframe.head()

Unnamed: 0,neighborhood,zip code
0,East End Revitalized,77003
1,"Midtown-Houston,Riverside",77004
2,"Rice/Museum District,West University/Southside...",77005
3,Montrose,77006
4,"Cottage Grove,Memorial Park,Rice Military/Wash...",77007


In [10]:
# rearrange the columns so that 'zip code' comes first
grouped_dataframe = grouped_dataframe[['zip code', 'neighborhood']]
grouped_dataframe.head()

Unnamed: 0,zip code,neighborhood
0,77003,East End Revitalized
1,77004,"Midtown-Houston,Riverside"
2,77005,"Rice/Museum District,West University/Southside..."
3,77006,Montrose
4,77007,"Cottage Grove,Memorial Park,Rice Military/Wash..."


**2) Merge the two dataframes into a new one of Houston neighborhoods with their coordinates**

In [11]:
# inner join the two dataframes on their shared 'zip code' column;
# neighborhoods whose zip code is absent from the coordinates table are dropped
coordinated_neighborhoods = pd.merge(grouped_dataframe, df_zipcodes, how='inner', on='zip code')
coordinated_neighborhoods.head()

Unnamed: 0,zip code,neighborhood,latitude,longitude
0,77003,East End Revitalized,29.749778,-95.345885
1,77004,"Midtown-Houston,Riverside",29.724893,-95.363752
2,77005,"Rice/Museum District,West University/Southside...",29.718435,-95.423555
3,77006,Montrose,29.74097,-95.391301
4,77007,"Cottage Grove,Memorial Park,Rice Military/Wash...",29.771545,-95.411083


## 4. Create data of venues in neighborhoods by Foursquare

**1) Create dataframe of venues in Houston**

In [12]:
# parameters for making API request
# Credentials come from the environment so they are never saved with the notebook.
# NOTE: the key pair that used to be hard-coded in this cell was published and should be rotated.
client_id = os.environ.get('FOURSQUARE_CLIENT_ID')
client_secret = os.environ.get('FOURSQUARE_CLIENT_SECRET')
if not client_id or not client_secret:
    raise RuntimeError(
        'Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET environment '
        'variables before running this notebook.'
    )
VERSION = '20181029'  # sent to the API as the `v` query parameter

radius = 1000  # sent to the API as the `radius` query parameter
limit = 100  # sent to the API as the `limit` query parameter

In [13]:
# function to fetch venues data
def getNearbyVenues(names, latitudes, longitudes):
    """Query the Foursquare ``venues/explore`` endpoint around each given point.

    Uses the notebook-level settings ``client_id``, ``client_secret``,
    ``VERSION``, ``radius`` and ``limit``.

    Parameters
    ----------
    names : iterable
        Label stored in the 'Neighborhood' column for each point (also printed
        as a progress indicator).
    latitudes, longitudes : iterable
        Coordinates of each point, paired with ``names`` via ``zip``.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty, but still has these columns, when nothing is returned.
        'Venue Category' is None for a venue that lists no category.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            client_id,
            client_secret,
            VERSION,
            lat,
            lng,
            radius,
            limit)

        # make the GET request; bound the wait and surface HTTP errors explicitly
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            # a venue may come back with an empty category list; record None
            # rather than failing on categories[0]
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                category))

    # passing the column names here keeps the schema even when no venue was found
    return pd.DataFrame(venue_rows, columns=columns)

In [14]:
# fetch the venues around the coordinates of every Houston zip-code area
houston_venues = getNearbyVenues(
    names=coordinated_neighborhoods['neighborhood'],
    latitudes=coordinated_neighborhoods['latitude'],
    longitudes=coordinated_neighborhoods['longitude'],
)

East End Revitalized
Midtown-Houston,Riverside
Rice/Museum District,West University/Southside Area
Montrose
Cottage Grove,Memorial Park,Rice Military/Washington Corridor,Washington East/Sabine
Heights/Greater Heights,Timbergrove/Lazybrook
Garden Oaks,North Channel
Oak Forest East Area,Shepherd Park Plaza Area
River Oaks Area
Denver Harbor
University Area
Memorial Close In,Memorial Villages
Braeswood Place,Knollwood/Woodside Area
Highland Village/Midlane,Royden Oaks/Afton Oaks
Northeast Houston
Medical Center Area
Brays Oaks
Southbelt/Ellington
Willow Meadows Area
Sharpstown Area
Aldine Area
Jersey Village
Eldridge North
Briargrove Park/Walnut Bend,Rivercrest,Westchase Area
Summerwood/Lakeshore
Five Corners
Greenway Plaza
Medical Center South
Spring Branch
Galleria,Tanglewood Area
Briargrove
Hobby Area
Clear Lake Area
Briarmeadow/Tanglewilde,Charnwood/Briarbend
Willowbrook
1960/Cypress
Champions Area
Alief
Energy Corridor
Memorial West
Gulfton
Mission Bend Area
Northwest Houston
Northsi

In [15]:
# check the size of the venues dataframe (one row per venue) and preview it
print(houston_venues.shape)
houston_venues.head()

(2430, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,East End Revitalized,29.749778,-95.345885,Sigma Brewing Company,29.749057,-95.343389,Brewery
1,East End Revitalized,29.749778,-95.345885,Champ Burger,29.749796,-95.34035,American Restaurant
2,East End Revitalized,29.749778,-95.345885,BBVA Compass Stadium,29.752666,-95.352545,Soccer Stadium
3,East End Revitalized,29.749778,-95.345885,Moon Tower Inn,29.754298,-95.341045,Beer Garden
4,East End Revitalized,29.749778,-95.345885,Texian Army #TAilgate,29.754767,-95.350559,Soccer Stadium


**2) Ask user to enter the zip code as a template to search**

In [16]:
# For example, the zip code in Manhattan, New York City: 10030
# Surrounding whitespace is stripped so the exact-match lookup in the zip code table does not miss.
template_zipcode = input("Please enter the zip code of your template neighborhood.").strip()

Please enter the zip code of your template neighborhood.10030


In [17]:
# Make venues dataframe of the input zip code as we did for Houston
# find the coordinates of input zip code from zip codes dataframe
template_coordinates = df_zipcodes.loc[df_zipcodes['zip code'] == str(template_zipcode)]
# fail with a clear message here rather than with an obscure error further down
if template_coordinates.empty:
    raise ValueError(
        'Zip code {!r} was not found in the US zip code table.'.format(template_zipcode)
    )
template_lati = template_coordinates['latitude'].values
template_long = template_coordinates['longitude'].values

# the zip code itself is used as the "neighborhood" label of the template
template_venues = getNearbyVenues([str(template_zipcode)], template_lati, template_long)

10030


In [18]:
# take a look at the input dataframe we just made (size, then the first rows)
print(template_venues.shape)
template_venues.head()

(100, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,10030,40.818267,-73.942856,Ponty Bistro Harlem,40.817886,-73.941522,African Restaurant
1,10030,40.818267,-73.942856,Harlem Cycle,40.817201,-73.942592,Cycle Studio
2,10030,40.818267,-73.942856,The Edge Harlem,40.819692,-73.946073,American Restaurant
3,10030,40.818267,-73.942856,Harlem Nights,40.817198,-73.942146,Bar
4,10030,40.818267,-73.942856,Belle,40.81689,-73.94268,American Restaurant


**3) Combine the input dataframe with Houston dataframe for the convenience of later use**

In [19]:
# Stack the template rows under the Houston rows and renumber the index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
total_venues = pd.concat([houston_venues, template_venues], ignore_index=True)
print(total_venues.shape)

(2530, 7)


## 5. Compare the similarity of input neighborhood with Houston's by venues information

**1) Normalize data**

In [20]:
# function to one-hot encode the venue categories
def onehot(df):
    """Return one indicator column per venue category, led by 'Neighborhood'.

    ``df`` must provide the columns 'Venue Category' and 'Neighborhood'.
    The result has one row per input row: 'Neighborhood' first, followed by
    the category indicator columns in the order ``pd.get_dummies`` emits them.

    Note: a category literally named 'Neighborhood' would be overwritten by
    the neighborhood labels, since both would use the same column name.
    """
    dummies = pd.get_dummies(df[['Venue Category']], prefix="", prefix_sep="")
    dummies['Neighborhood'] = df['Neighborhood']
    category_columns = [column for column in dummies.columns if column != 'Neighborhood']
    return dummies[['Neighborhood'] + category_columns]

In [21]:
# create onehot dataframe of venue category (one row per venue, Houston and template together)
total_onehot = onehot(total_venues)
total_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Advertising Agency,African Restaurant,Airport,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Arcade,...,Volleyball Court,Warehouse Store,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo Exhibit
0,East End Revitalized,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,East End Revitalized,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,East End Revitalized,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,East End Revitalized,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,East End Revitalized,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# mean of every category indicator per neighborhood, i.e. the share of that
# neighborhood's venues falling in each category.
# sort=False keeps the groups in order of first appearance, so the template,
# whose rows were added last, ends up as the final row.
total_grouped = total_onehot.groupby('Neighborhood', as_index=False, sort=False).mean()
total_grouped.head()

Unnamed: 0,Neighborhood,Accessories Store,Advertising Agency,African Restaurant,Airport,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Arcade,...,Volleyball Court,Warehouse Store,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo Exhibit
0,East End Revitalized,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.018868,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Midtown-Houston,Riverside",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Rice/Museum District,West University/Southside...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010753,0.0,...,0.0,0.0,0.0,0.0,0.010753,0.0,0.0,0.0,0.010753,0.0
3,Montrose,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,...,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.0
4,"Cottage Grove,Memorial Park,Rice Military/Wash...",0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0


**2) Split dataframe into two parts as Houston and the input one**

The template venues were combined with the Houston venues so that both share the same one-hot columns (venue categories); this makes their rows directly comparable in the distance calculation below.

In [23]:
# the last row is the template zip code; every row before it is a Houston neighborhood
df_houston = total_grouped.head(-1)
df_template = total_grouped.tail(1).reset_index(drop=True)
print(df_houston.shape)

(102, 301)


**3) Calculate the distance between the input neighborhood and each Houston neighborhood**

In [24]:
# drop the leading 'Neighborhood' label column so only the category shares are compared
houston_features = df_houston.iloc[:, 1:]
template_features = df_template.iloc[:, 1:]
# one Euclidean distance per Houston row against the single template row
distances = scipy.spatial.distance.cdist(houston_features, template_features, metric='euclidean')

# lay the distances out as a single row with one column per Houston neighborhood
df_distances = pd.DataFrame(distances).transpose()
distances_columns = df_houston['Neighborhood']
df_distances.columns = distances_columns
df_distances

Neighborhood,East End Revitalized,"Midtown-Houston,Riverside","Rice/Museum District,West University/Southside Area",Montrose,"Cottage Grove,Memorial Park,Rice Military/Washington Corridor,Washington East/Sabine","Heights/Greater Heights,Timbergrove/Lazybrook","Garden Oaks,North Channel","Oak Forest East Area,Shepherd Park Plaza Area",River Oaks Area,Denver Harbor,...,"East End-Galveston,Midtown-Galveston","Near West End-Galveston,Tiki Island,West End-Galveston",La Marque,La Porte/Shoreacres,League City,Pearland,South Houston,Texas City,Webster,Crystal Beach
0,0.222002,0.253405,0.157003,0.182209,0.179444,0.205108,0.301393,0.201264,0.166423,0.298998,...,0.205408,1.011138,0.404537,0.722772,0.243721,0.260175,0.279489,0.303459,0.190581,0.531029


**4) Sort the values to get three neighborhoods with the lowest distance (highest similarity)**

In [25]:
# order the columns by the value in row 0 (the only row), ascending: most similar first.
# Despite its name, lowest_three still holds every neighborhood; the first three
# columns are picked from it in the following cell.
lowest_three = df_distances.sort_values(by=0, axis=1)
lowest_three

Neighborhood,Greenway Plaza,"Rice/Museum District,West University/Southside Area",Upper Kirby,Briargrove,River Oaks Area,"Highland Village/Midlane,Royden Oaks/Afton Oaks","Galleria,Tanglewood Area","Cottage Grove,Memorial Park,Rice Military/Washington Corridor,Washington East/Sabine","Briarmeadow/Tanglewilde,Charnwood/Briarbend",Montrose,...,Crosby Area,Porter/New Caney East,La Porte/Shoreacres,Hockley,Lake Conroe Area,Plantersville Area,"Near West End-Galveston,Tiki Island,West End-Galveston",Spring Northeast,Cypress South,Katy-Old Towne
0,0.152315,0.157003,0.158114,0.161521,0.166423,0.173205,0.177328,0.179444,0.18102,0.182209,...,0.722772,0.722772,0.722772,1.001199,1.011138,1.011138,1.011138,1.011138,1.011138,1.011138


In [26]:
# find out three neighborhoods with the lowest distance
final_results = lowest_three.columns[:3]
labels = ['No.1 ', 'No.2 ', 'No.3 ']
print("The top three most similar neighborhoods as {} in Houston are: ".format(template_zipcode))
# The loop variable is deliberately not called `result`: that name holds the parsed
# web page from the scraping cell and must survive a re-run of the earlier cells.
for rank_label, neighborhood_name in zip(labels, final_results):
    print(rank_label + neighborhood_name)

The top three most similar neighborhoods as 10030 in Houston are: 
No.1 Greenway Plaza
No.2 Rice/Museum District,West University/Southside Area
No.3 Upper Kirby


## 6. Visualize the results on map

**1) Geography of Houston**

In [27]:
address = 'Houston, Texas USA'

# Nominatim needs an identifying user agent; geopy 2.x refuses to run with the default one.
geolocator = Nominatim(user_agent='houston_neighborhood_similarity')
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; stop here with a clear message
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Houston is {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Houston is 29.7589382, -95.3676974.


**2) Retrieve the geography data of the result neighborhoods**

In [28]:
df_results = coordinated_neighborhoods[coordinated_neighborhoods['neighborhood'].isin(final_results)]
# Put the rows in ranking order, then name the label column 'Neighborhood' explicitly.
# The map cell reads that column, so its name must not depend on whether .loc
# carries over the name of the `final_results` Index.
ret_results = (
    df_results
    .set_index('neighborhood')
    .loc[list(final_results)]
    .rename_axis('Neighborhood')
    .reset_index()
)
ret_results

Unnamed: 0,Neighborhood,zip code,latitude,longitude
0,Greenway Plaza,77046,29.733777,-95.433346
1,"Rice/Museum District,West University/Southside...",77005,29.718435,-95.423555
2,Upper Kirby,77098,29.734813,-95.416098


**3) Add the result neighborhoods onto the map of Houston**

In [29]:
# create map of Houston using latitude and longitude values
map_houston = folium.Map(location=[latitude, longitude], zoom_start=12)
# Named marker_colors so that the `matplotlib.colors` module imported as `colors`
# in the first cell is not overwritten by a list.
marker_colors = ['blue', 'green', 'cyan']

# add one marker per result, pairing each with its rank label and outline color
marker_rows = zip(
    labels,
    marker_colors,
    ret_results['latitude'],
    ret_results['longitude'],
    ret_results['Neighborhood'],
)
for rank_label, marker_color, lat, lng, neighborhood in marker_rows:
    # parse_html=True makes folium escape the popup text
    popup = folium.Popup(rank_label + neighborhood, parse_html=True)
    # parse_html is a Popup option, so it is no longer passed to CircleMarker
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color=marker_color,
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_houston)

map_houston

## Thank you for reviewing my analysis!