# Coursera Capstone Project - Identify the neighbourhood with the least variety of Restaurant types in Perth City


In [2]:
import random # library for random number generation
import numpy as np # library for vectorized computation
import pandas as pd # library to process data as dataframes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from bs4 import BeautifulSoup
import requests
import re

Extract Data from the Wikipedia page.

In [2]:
# Site URL
url = "https://en.wikipedia.org/wiki/List_of_Perth_suburbs"

# Make a GET request to fetch the raw HTML content.
# - The timeout keeps the cell from hanging forever on a stalled connection.
# - raise_for_status() stops here on an HTTP error, so an error page is never
#   parsed as if it were the suburb table.
# - Wikimedia asks automated clients to send a descriptive User-Agent.
response = requests.get(
    url,
    headers={"User-Agent": "Perth_explorer/1.0 (Coursera capstone notebook)"},
    timeout=30,
)
response.raise_for_status()
html_content = response.text

# Parse HTML code for the entire site
soup = BeautifulSoup(html_content, "lxml")

Extract table headings from the dataset.
- iterating through the head HTML code and making list of clean headings

In [3]:
# Every table on the page carrying the "wikitable" class; the first one is used
data = soup.find_all("table", attrs={"class": "wikitable"})
perth = data[0]

# All rows of that table: the first row holds the headers, the rest the contents
table = perth.find_all("tr")
column_headers, table_rows = table[0], table[1:]

# Text of each th element with trailing newlines removed
headings = [th.text.rstrip("\n") for th in column_headers.find_all("th")]
print(headings)


['Suburb', 'Local government area']


Extract content from webpage table
- loop through all row entries
- row_item.text removes the tags from the entries
- the following regex removes \xa0, \n and commas from row_item.text
- \xa0 is a non-breaking space, \n is the newline, and the comma separates thousands in numbers

In [4]:
# Walk the remaining rows: one inner list per table row, one cleaned string per td cell

# Strips non-breaking spaces, newlines and commas from a cell's text
cell_cleaner = re.compile("(\xa0)|(\n)|,")

all_rows = [
    [cell_cleaner.sub("", cell.text) for cell in table_row.find_all("td")]
    for table_row in table_rows
]

In [5]:
# Tabulate the scraped rows under the scraped headings
df = pd.DataFrame(all_rows, columns=headings)

# Copy with a fresh 0..n-1 index, used only for the preview below
perth = df.reset_index(drop=True)
perth.head(12)

Unnamed: 0,Suburb,Local government area
0,Alexander Heights,Wanneroo
1,Alfred Cove,Melville
2,Alkimos,Wanneroo
3,Anketell,Kwinana
4,Applecross,Melville
5,Ardross,Melville
6,Armadale,Armadale
7,Ascot,Belmont
8,Ashby,Wanneroo
9,Ashendon,Armadale


In [7]:
# (number of rows, number of columns) of the scraped suburb table
df.shape

(355, 2)

Merge the Suburban Data with the geospatial coordinate data

In [7]:
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values
from geopy.extra.rate_limiter import RateLimiter # spaces out successive geocoding calls

geolocator = Nominatim(user_agent="Perth_explorer")

# The public Nominatim service expects at most one request per second.
# RateLimiter enforces the delay, retries transient failures and, by default,
# returns None for a lookup that keeps failing instead of raising.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# One lookup per suburb. geocode() yields None when a name cannot be resolved,
# so unresolved suburbs get NaN coordinates rather than crashing the cell with
# an AttributeError on `.latitude`.
locations = df['Suburb'].apply(geocode)
df['Latitude'] = locations.apply(lambda loc: loc.latitude if loc is not None else np.nan)
df['Longitude'] = locations.apply(lambda loc: loc.longitude if loc is not None else np.nan)

# Surface failed lookups so they are not silently carried into later steps
unresolved = df.loc[df['Latitude'].isna(), 'Suburb'].tolist()
if unresolved:
    print('Could not geocode: {}'.format(unresolved))

# Persist the result so later cells can reload it without geocoding again
df.to_csv('perth_locations.csv')

In [8]:
# Remove pandas' display limits: None means every column and every row of a
# DataFrame is rendered when it is displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
# NOTE(review): this import path was deprecated in pandas 1.0 in favour of
# `pandas.json_normalize`; it may fail on recent pandas versions — confirm
# against the installed version.
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
# import k-means from clustering stage
from sklearn.cluster import KMeans
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


Focus of the Study is on the Perth City Centre and Surrounding Suburbs where most tourists would stay.

In [9]:

# Reload the geocoded suburbs saved earlier (first CSV column is the index)
suburbs = pd.read_csv('perth_locations.csv', index_col=0)

# Keep suburbs whose name mentions "Perth" (case-insensitive) ...
mentions_perth = suburbs['Suburb'].str.contains('Perth', case = False)
suburbs = suburbs.loc[mentions_perth].reset_index(drop=True)

# ... and, of those, drop any whose name mentions "Airport"
mentions_airport = suburbs['Suburb'].str.contains('Airport', case = False)
df = suburbs.loc[~mentions_airport].reset_index(drop=True)

# Non-null entries per column for each remaining suburb
df.groupby('Suburb').count()


Unnamed: 0_level_0,Local government area,Latitude,Longitude
Suburb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
East Perth,1,1,1
North Perth,1,1,1
Perth City,1,1,1
South Perth,1,1,1
West Perth,1,1,1


#### Perth coordinates

In [10]:
# Latitude / longitude pair used as the centre of the maps drawn below
latitude, longitude = -31.953512, 115.857048

#### Creating a Map of Perth with coordinates superimposed

In [11]:
# Base map centred on the Perth coordinates
map_perth = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per suburb, with a "suburb, local government area" popup
marker_fields = df[['Latitude', 'Longitude', 'Local government area', 'Suburb']]
for lat, lng, borough, neighborhood in marker_fields.itertuples(index=False, name=None):
    popup = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False)
    marker.add_to(map_perth)

map_perth

#### Using Foursquare to identify venues within this area

In [42]:
# @hidden_cell
import os
import warnings

# Credentials are read from the environment so they are never stored in (or
# shared with) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): keys that were previously committed in this cell should be
# treated as compromised and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'Foursquare requests will be rejected.'
    )

VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value
radius = 500

In [43]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Collect the venues Foursquare reports around each named location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables of equal length
        One entry per location; they are consumed in parallel.
    radius : int, default 500
        Value passed as the ``radius`` parameter of the explore request.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty (but still has these columns) when no venue
        is returned for any location.

    Raises
    ------
    requests.HTTPError
        If a request comes back with an HTTP error status.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    Venues that carry no category are skipped, because the rest of the
    notebook works on the category name.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string instead of
        # formatting the URL by hand.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        # Fail with the HTTP status rather than a KeyError further down
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0].get('items', []) if groups else []

        # keep only the relevant information for each nearby venue
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            if not categories:
                # no category to analyse; indexing [0] would raise IndexError
                continue
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name']))

    # Passing the columns to the constructor also works for zero rows,
    # where assigning `.columns` afterwards would raise a length mismatch.
    nearby_venues = pd.DataFrame(venue_rows, columns=columns)

    return(nearby_venues)

In [64]:
# One Foursquare lookup per suburb kept in df, using the function's default radius
perth_venues = getNearbyVenues(df['Suburb'], df['Latitude'], df['Longitude'])

East Perth
North Perth
Perth City
South Perth
West Perth


#### Hospitality venue types within the designated area

In [65]:
# Size of the venue table before filtering
print(perth_venues.shape)

# Case-insensitive substrings that mark a category as a hospitality venue
text = ['Restaurant','bar','Caf','Coff','pizza','break']
pattern = '|'.join(text)

# Keep only venues whose category contains one of the substrings
is_hospitality = perth_venues['Venue Category'].str.contains(pattern,case=False)
perth_venues = perth_venues.loc[is_hospitality].reset_index(drop=True)

# How many venues fall into each remaining category
perth_venues['Venue Category'].value_counts()

(155, 7)


Café                         26
Coffee Shop                  12
Bar                           6
Italian Restaurant            4
Restaurant                    3
Pizza Place                   3
Indian Restaurant             3
Australian Restaurant         3
Wine Bar                      3
Asian Restaurant              3
Thai Restaurant               3
Vietnamese Restaurant         2
Middle Eastern Restaurant     2
Sushi Restaurant              2
Korean Restaurant             2
Cocktail Bar                  1
Korean BBQ Restaurant         1
Beer Bar                      1
Breakfast Spot                1
Mediterranean Restaurant      1
Mexican Restaurant            1
Portuguese Restaurant         1
Ramen Restaurant              1
Name: Venue Category, dtype: int64

#### The number of venues for each area

In [66]:
# Non-null entries per column for each suburb (one row per suburb)
venues_per_suburb = perth_venues.groupby('Neighborhood')
venues_per_suburb.count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
East Perth,14,14,14,14,14,14
North Perth,11,11,11,11,11,11
Perth City,39,39,39,39,39,39
South Perth,5,5,5,5,5,5
West Perth,16,16,16,16,16,16


#### Number of unique categories.

In [67]:
# Distinct venue categories remaining after the hospitality filter
unique_categories = perth_venues['Venue Category'].unique()
print('There are {} uniques categories.'.format(len(unique_categories)))

There are 23 uniques categories.


#### Top 10 most common venue types by suburb

In [68]:
# one hot encoding: one indicator column per venue category
perth_onehot = pd.get_dummies(perth_venues[['Venue Category']], prefix="", prefix_sep="")
perth_onehot['Neighborhood'] = perth_venues['Neighborhood']

# Mean of the indicators per suburb = share of each category among that suburb's venues
perth_grouped = perth_onehot.groupby('Neighborhood').mean().reset_index()

num_top_venues = 10

# Iterate the suburbs once, each with its row of category shares. Building a
# fresh two-column frame per suburb avoids assigning into an `iloc` slice
# (which triggers SettingWithCopyWarning) and avoids re-filtering and
# transposing the whole table for every suburb.
for hood, shares in perth_grouped.set_index('Neighborhood').iterrows():
    print("----"+hood+"----")
    temp = pd.DataFrame({'venue': shares.index, 'freq': shares.to_numpy(dtype=float)})
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----East Perth----
                       venue  freq
0                       Café  0.43
1         Italian Restaurant  0.14
2           Asian Restaurant  0.07
3             Breakfast Spot  0.07
4                Coffee Shop  0.07
5                 Restaurant  0.07
6      Korean BBQ Restaurant  0.07
7                Pizza Place  0.07
8  Middle Eastern Restaurant  0.00
9      Vietnamese Restaurant  0.00


----North Perth----
                       venue  freq
0                       Café  0.36
1                Coffee Shop  0.18
2  Middle Eastern Restaurant  0.09
3                        Bar  0.09
4            Thai Restaurant  0.09
5          Indian Restaurant  0.09
6                Pizza Place  0.09
7           Asian Restaurant  0.00
8      Vietnamese Restaurant  0.00
9           Sushi Restaurant  0.00


----Perth City----
                   venue  freq
0            Coffee Shop  0.15
1                    Bar  0.10
2                   Café  0.10
3               Wine Bar  0.08
4  Australian

In [69]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in `row`, largest first.

    The first entry of `row` is skipped; the remaining entries are sorted
    in descending order and the index labels of the first
    `num_top_venues` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

num_top_venues = 10


def _ordinal(n):
    """Return n followed by its English ordinal suffix (1 -> '1st', 11 -> '11th')."""
    if 10 <= n % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
    return '{}{}'.format(n, suffix)


# Column names: the suburb, then one column per rank. The suffix is chosen
# explicitly rather than by catching an IndexError with a bare `except`,
# which would also have hidden any unrelated error.
columns = ['Neighborhood'] + [
    '{} Most Common Venue'.format(_ordinal(rank))
    for rank in range(1, num_top_venues + 1)
]

# Build every row first and create the frame once, instead of filling an
# empty frame row by row through `iloc`.
records = []
for _, grouped_row in perth_grouped.iterrows():
    top_venues = list(return_most_common_venues(grouped_row, num_top_venues))
    # pad when there are fewer categories than requested ranks
    top_venues += [None] * (num_top_venues - len(top_venues))
    records.append([grouped_row['Neighborhood']] + top_venues)

neighborhoods_venues_sorted = pd.DataFrame(records, columns=columns)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,East Perth,Café,Italian Restaurant,Asian Restaurant,Restaurant,Breakfast Spot,Pizza Place,Coffee Shop,Korean BBQ Restaurant,Australian Restaurant,Bar
1,North Perth,Café,Coffee Shop,Thai Restaurant,Bar,Pizza Place,Middle Eastern Restaurant,Indian Restaurant,Wine Bar,Italian Restaurant,Australian Restaurant
2,Perth City,Coffee Shop,Bar,Café,Wine Bar,Vietnamese Restaurant,Australian Restaurant,Indian Restaurant,Korean Restaurant,Restaurant,Sushi Restaurant
3,South Perth,Café,Coffee Shop,Wine Bar,Korean BBQ Restaurant,Australian Restaurant,Bar,Beer Bar,Breakfast Spot,Cocktail Bar,Indian Restaurant
4,West Perth,Café,Coffee Shop,Italian Restaurant,Asian Restaurant,Thai Restaurant,Australian Restaurant,Bar,Korean BBQ Restaurant,Beer Bar,Breakfast Spot


#### Cluster the neighborhoods with K-means into 3 clusters representing the level of hospitality venue types by suburb

In [72]:
# set number of clusters
kclusters = 3

# Feature matrix: the category shares only. `columns=` is used because the
# positional `axis` argument of DataFrame.drop was removed in pandas 2.0.
perth_grouped_clustering = perth_grouped.drop(columns='Neighborhood')

# run k-means clustering; n_init is set explicitly so the result does not
# depend on the default of the installed scikit-learn version
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(perth_grouped_clustering)

# add clustering label (labels_ follows the row order of perth_grouped, which
# neighborhoods_venues_sorted shares)
neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_

# Attach the ranked venues and cluster label to each suburb's location data
perth_merged = df.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Suburb')
perth_merged.head()

Unnamed: 0,Suburb,Local government area,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels
0,East Perth,Perth; Vincent,-31.954341,115.877889,Café,Italian Restaurant,Asian Restaurant,Restaurant,Breakfast Spot,Pizza Place,Coffee Shop,Korean BBQ Restaurant,Australian Restaurant,Bar,0
1,North Perth,Vincent,-31.930863,115.856704,Café,Coffee Shop,Thai Restaurant,Bar,Pizza Place,Middle Eastern Restaurant,Indian Restaurant,Wine Bar,Italian Restaurant,Australian Restaurant,0
2,Perth City,Perth; Vincent,-31.952712,115.86048,Coffee Shop,Bar,Café,Wine Bar,Vietnamese Restaurant,Australian Restaurant,Indian Restaurant,Korean Restaurant,Restaurant,Sushi Restaurant,1
3,South Perth,South Perth,-31.980966,115.863943,Café,Coffee Shop,Wine Bar,Korean BBQ Restaurant,Australian Restaurant,Bar,Beer Bar,Breakfast Spot,Cocktail Bar,Indian Restaurant,2
4,West Perth,Perth; Vincent,-31.948659,115.841571,Café,Coffee Shop,Italian Restaurant,Asian Restaurant,Thai Restaurant,Australian Restaurant,Bar,Korean BBQ Restaurant,Beer Bar,Breakfast Spot,0


#### Visualisation of the Clusters

In [77]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one hex colour per cluster label,
# evenly spaced along the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(perth_merged['Latitude'], perth_merged['Longitude'], perth_merged['Suburb'], perth_merged['Cluster Labels']):
    # A suburb that found no match in the join has a NaN label: nothing to colour
    if pd.isna(cluster):
        continue
    # The join can turn the labels into floats; list indexing needs an int
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run 0..kclusters-1, so they index the colour list directly
    # (no reliance on label 0 wrapping around to the last colour via -1).
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

#### Cluster 1  - Moderate variety of restaurant types

In [74]:
# Suburbs labelled 0: first column plus every column from position 4 onwards
shown_columns = perth_merged.columns[[0] + list(range(4, perth_merged.shape[1]))]
perth_merged.loc[perth_merged['Cluster Labels'] == 0, shown_columns]

Unnamed: 0,Suburb,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels
0,East Perth,Café,Italian Restaurant,Asian Restaurant,Restaurant,Breakfast Spot,Pizza Place,Coffee Shop,Korean BBQ Restaurant,Australian Restaurant,Bar,0
1,North Perth,Café,Coffee Shop,Thai Restaurant,Bar,Pizza Place,Middle Eastern Restaurant,Indian Restaurant,Wine Bar,Italian Restaurant,Australian Restaurant,0
4,West Perth,Café,Coffee Shop,Italian Restaurant,Asian Restaurant,Thai Restaurant,Australian Restaurant,Bar,Korean BBQ Restaurant,Beer Bar,Breakfast Spot,0


#### Cluster 2: Highest variety of restaurant types

In [75]:
# Suburbs labelled 1: first column plus every column from position 4 onwards
shown_columns = perth_merged.columns[[0] + list(range(4, perth_merged.shape[1]))]
perth_merged.loc[perth_merged['Cluster Labels'] == 1, shown_columns]

Unnamed: 0,Suburb,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels
2,Perth City,Coffee Shop,Bar,Café,Wine Bar,Vietnamese Restaurant,Australian Restaurant,Indian Restaurant,Korean Restaurant,Restaurant,Sushi Restaurant,1


#### Cluster 3: Lowest variety of restaurant types

In [76]:
# Suburbs labelled 2: first column plus every column from position 4 onwards
shown_columns = perth_merged.columns[[0] + list(range(4, perth_merged.shape[1]))]
perth_merged.loc[perth_merged['Cluster Labels'] == 2, shown_columns]

Unnamed: 0,Suburb,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels
3,South Perth,Café,Coffee Shop,Wine Bar,Korean BBQ Restaurant,Australian Restaurant,Bar,Beer Bar,Breakfast Spot,Cocktail Bar,Indian Restaurant,2


Based on the above analysis, South Perth should be the focus area for investing in new Hospitality venue types.