In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests

In [3]:
import json
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

ModuleNotFoundError: No module named 'folium'

Import URL content using _BeautifulSoup_

In [None]:
# Download the Wikipedia page and parse it with BeautifulSoup's lxml parser.
# The context manager closes the HTTP response as soon as parsing is done.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'lxml')

In [None]:
# Peek at the first 700 characters of the pretty-printed document.
html_preview = soup.prettify()[:700]
print(html_preview)

Find all rows in the table and save to the variable rows

In [None]:
# Collect every <tr> element in the document, then show the first five.
rows = soup.find_all(name="tr")
rows[:5]

Save the column names (header) to the list variable col

In [None]:
# Header names come from the <th> cells of the first table row,
# with surrounding newline characters removed.
col = [header.string.strip('\n') for header in rows[0].find_all("th")]
print(col)

Save the table contents in "rows" to the list variable data

In [None]:
# One list of cell texts per <tr>; rows without <td> cells (e.g. the header
# row, which only has <th>) yield an empty list.
# get_text() is used instead of .string because .string is None for empty
# cells or cells with mixed children, which would break the later .strip().
data = [[cell.get_text() for cell in tr.find_all("td")] for tr in rows]

In [None]:
# Trim `data` in place: drop its last four entries, then its first entry,
# and print what remains.
# NOTE(review): presumably the first entry is the header row (no <td> cells,
# so an empty list) and the last four come from non-data tables at the bottom
# of the page — confirm against the scraped page.
# Caution: these deletions mutate `data`, so re-running this cell removes
# additional rows.
del data[-4:]
del data[0]
print(data)

Remove "\n" from the table contents and save to the new list variable ndata

In [None]:
# Same table contents as `data`, with leading/trailing newline characters
# removed from every cell.
ndata = [[cell.strip('\n') for cell in record] for record in data]
print(ndata)

Create a dataframe with contents in ndata and column headers in col

In [None]:
# Build the table: one row per entry of ndata, labelled with the scraped headers.
df = pd.DataFrame(data=ndata, columns=col)
df.shape

In [None]:
# Display the dataframe built from the scraped table.
df

Create a new dataframe toronto containing only the rows needed for the project, selected by the specified postal codes

In [None]:
# Postal codes of interest; the resulting rows follow this order.
pc = ['M5G','M2H','M4B','M1J','M4G','M4M','M1R','M9V','M9L','M5V','M1B','M5A']

# Select the matching rows for each code and concatenate them once.
# (DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies it on every iteration.)
toronto = pd.concat([df.loc[df['Postal Code'] == code, :] for code in pc])
toronto

In [None]:
# Renumber the rows 0..n-1, discarding the index values inherited from df.
toronto = toronto.reset_index(drop=True)
toronto

Import geospatial data that contains latitudes and longitudes of each neighborhood

In [None]:
# Read the CSV of coordinates straight from its URL.
geospatial_url = 'https://cocl.us/Geospatial_data'
geospatial_data = pd.read_csv(geospatial_url)
geospatial_data

Create a dataframe coordinates that contains only the coordinates of the specified neighborhoods

In [None]:
# Pick the coordinate rows for each postal code in `pc`, keeping pc's order,
# and concatenate them once (DataFrame.append was removed in pandas 2.0).
coordinates = pd.concat(
    [geospatial_data.loc[geospatial_data['Postal Code'] == code, :] for code in pc]
)
coordinates

In [None]:
# Renumber the rows 0..n-1, discarding the index values inherited from geospatial_data.
coordinates = coordinates.reset_index(drop=True)
coordinates

Add latitude and longitude columns to the dataframe toronto

In [None]:
# Attach the coordinate columns; values are matched to toronto's rows by index label.
toronto = toronto.assign(
    Latitude=coordinates['Latitude'],
    Longitude=coordinates['Longitude'],
)
toronto

Get the geographical coordinates of Toronto

In [None]:
# Geocode the city so the maps below can be centred on it.
address = 'Toronto, ON'
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() yields None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

Visualize Toronto and the neighborhoods in it

In [None]:
# Base map centred on the geocoded coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# One blue circle marker per neighborhood, with the neighborhood name as popup.
neighborhood_points = zip(toronto['Latitude'], toronto['Longitude'], toronto['Neighborhood'])
for lat, lng, neighborhood_name in neighborhood_points:
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(neighborhood_name, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

Define Foursquare Credentials and Version

__THE GOAL IS TO ANALYZE THE TYPES OF RESTAURANTS IN EACH NEIGHBORHOOD, CLUSTER THE NEIGHBORHOODS BY FREQUENCY OF VARIOUS TYPES OF RESTAURANTS SO TRAVELERS TO THE AREAS MARKED ABOVE CAN MAKE A BETTER DECISION ON WHICH NEIGHBORHOOD TO TRAVEL TO DEPENDING ON WHAT CUISINE THEY CRAVE.__

In [None]:
# The code was removed by Watson Studio for sharing.

Create a function that generates the request URLs, makes GET requests, keeps only the relevant information for each venue, and returns a dataframe

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare "venues/explore" endpoint around each location.

    Reads the module-level names CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT
    at call time, so they must be defined before the function is called.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Iterated in parallel with zip(); one API request is made per triple.
    radius : int, default 500
        Sent as the `radius` query parameter
        (presumably metres — confirm against the Foursquare API docs).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame has these columns even when no venue was found.

    Raises
    ------
    requests.HTTPError
        If the API answers with a non-success HTTP status.
    """
    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and encode the query string rather than
        # formatting credentials into the URL by hand.
        response = requests.get(
            endpoint,
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,  # do not hang the notebook on a stalled connection
        )
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups', [])
        results = groups[0]['items'] if groups else []

        for item in results:
            venue = item['venue']
            # A venue may come back without any category; record None
            # rather than failing on categories[0].
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None,
            ))

    # Passing the names to the constructor keeps the schema intact even when
    # venue_rows is empty (assigning .columns afterwards would raise).
    return pd.DataFrame(venue_rows, columns=column_names)

Run the getNearbyVenues function on each neighborhood and save to the dataframe toronto_venues

In [None]:
# LIMIT is read as a module-level name inside getNearbyVenues.
LIMIT = 100

# Fetch the venues around every neighborhood in `toronto`.
toronto_venues = getNearbyVenues(
    toronto['Neighborhood'],
    toronto['Latitude'],
    toronto['Longitude'],
)

In [None]:
# Print the (rows, columns) shape, then show the first rows of the venue table.
print(toronto_venues.shape)
toronto_venues.head()

Venues count for each neighborhood

In [None]:
# Per neighborhood, count the non-null entries in every other column.
toronto_venues.groupby('Neighborhood').count()

Number of unique venue categories

In [None]:
# Number of distinct venue categories (a missing value would count as one).
toronto_venues['Venue Category'].nunique(dropna=False)

Create a new dataframe toronto_onehot with dummies for each venue category

In [None]:
# One indicator column per venue category; the empty prefix and separator
# keep the plain category name as the column label.
category_frame = toronto_venues[['Venue Category']]
toronto_onehot = pd.get_dummies(category_frame, prefix="", prefix_sep="")

toronto_onehot.head()

Drop all venue categories that are not restaurant

In [None]:
# Keep only the columns whose label contains 'Restaurant'.
is_restaurant_column = toronto_onehot.columns.str.contains('Restaurant')
toronto_onehot = toronto_onehot.loc[:, is_restaurant_column]
toronto_onehot

In [None]:
# Put the neighborhood name in the first column.
# drop(..., errors='ignore') returns a fresh frame, so this cell can be
# re-run safely: a previous 'Neighborhood' column is replaced rather than the
# columns being rotated again, and we never assign into a filtered slice.
toronto_onehot = toronto_onehot.drop(columns='Neighborhood', errors='ignore')
toronto_onehot.insert(0, 'Neighborhood', toronto_venues['Neighborhood'])
toronto_onehot

In [None]:
# (rows, columns) of the one-hot table.
toronto_onehot.shape

Make a new dataframe toronto_grouped with grouped rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [None]:
# Average the indicator columns within each neighborhood; as_index=False keeps
# 'Neighborhood' as an ordinary first column.
toronto_grouped = toronto_onehot.groupby('Neighborhood', as_index=False).mean()
toronto_grouped

In [None]:
# (rows, columns) of the grouped table.
toronto_grouped.shape

Write a function that sorts a row's restaurant frequencies in descending order and returns the names of the most common categories

In [None]:
def return_most_common_restaurants(row, num_top_restaurants):
    """Return the labels of the highest-valued entries of a row.

    Parameters
    ----------
    row : pandas.Series
        The first element is skipped (callers pass rows whose first entry is
        the neighborhood name); the remaining values are ranked.
    num_top_restaurants : int
        Maximum number of labels to return.

    Returns
    -------
    numpy.ndarray
        Index labels of the ranked values, largest first. Fewer than
        `num_top_restaurants` labels are returned if the row is shorter.
    """
    row_categories = row.iloc[1:]
    # A stable sort keeps tied values (e.g. the many categories with frequency
    # 0) in their original column order, so the ranking is reproducible.
    row_categories_sorted = row_categories.sort_values(ascending=False, kind='mergesort')

    return row_categories_sorted.index.values[0:num_top_restaurants]

Create a dataframe neighborhoods_restaurants_sorted with the top 10 restaurant categories for each neighborhood

In [None]:
num_top_restaurants = 10
indicators = ['st', 'nd', 'rd']

# Column labels: '1st', '2nd', '3rd', then 'Nth' for every later rank.
# The suffix is chosen explicitly instead of relying on a bare `except:` to
# catch the IndexError (which would also hide unrelated errors).
columns = ['Neighborhood']
for ind in range(num_top_restaurants):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Restaurant'.format(ind + 1, suffix))

# Rank the restaurant categories of every neighborhood row, then build the
# frame in one step instead of filling an empty frame cell by cell.
top_restaurants = [
    return_most_common_restaurants(toronto_grouped.iloc[ind, :], num_top_restaurants)
    for ind in range(toronto_grouped.shape[0])
]
neighborhoods_restaurants_sorted = pd.DataFrame(
    top_restaurants, index=toronto_grouped.index, columns=columns[1:]
)
neighborhoods_restaurants_sorted.insert(0, 'Neighborhood', toronto_grouped['Neighborhood'])

neighborhoods_restaurants_sorted

Run k-means clustering and cluster the neighborhoods into 5 clusters by restaurant types

In [None]:
kclusters = 5

# Cluster on the numeric columns only. The `columns=` keyword replaces the
# positional axis argument, which pandas 2.0 no longer accepts.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# n_init is pinned so results do not change with scikit-learn's default.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(toronto_grouped_clustering)

kmeans.labels_[0:10]

Create a new dataframe by merging toronto and neighborhoods_restaurants_sorted and dropping the postal code column, which leaves only the boroughs, neighborhoods, coordinates for each neighborhood, newly added cluster labels, and the most common restaurant categories

In [None]:
# Attach the cluster label as the first column. insert() raises if the column
# already exists, so overwrite it instead when this cell is re-run.
if 'Cluster Labels' in neighborhoods_restaurants_sorted.columns:
    neighborhoods_restaurants_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_restaurants_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Left-join the ranked restaurants onto the neighborhood table by name and
# drop the postal code; `toronto` itself is left unmodified.
toronto_merged = (
    toronto
    .join(neighborhoods_restaurants_sorted.set_index('Neighborhood'), on='Neighborhood')
    .drop(columns='Postal Code')
)
toronto_merged

Create a map that visualizes the resulting clusters

In [None]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# One evenly spaced rainbow colour per cluster label, indexed by the label.
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# Neighborhoods without a cluster label (NaN after the left join) cannot be
# coloured, so they are left off the map.
clustered = toronto_merged.dropna(subset=['Cluster Labels'])

for lat, lon, poi, cluster in zip(clustered['Latitude'], clustered['Longitude'],
                                  clustered['Neighborhood'], clustered['Cluster Labels']):
    cluster = int(cluster)  # the column is float when the join produced NaN
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

__THE FOLLOWING IS THE EXAMINATION OF EACH CLUSTER, SHOWING WHICH TYPES OF RESTAURANTS ARE FREQUENTLY POPULATED IN LISTED NEIGHBORHOODS. FOR THOSE WHO PLAN TO TRAVEL TO THE LISTED AREAS, THE FOLLOWING ANALYSIS SHOULD COME IN HANDY WHEN DECIDING WHICH NEIGHBORHOOD TO TRAVEL TO DEPENDING ON THE TYPE OF CUISINE YOU ARE CRAVING.__

__Cluster 1__: Mainly comprised of __Fast Food__, __Vegetarian/Vegan__, __Korean__, __Asian__, __Comfort Food Restaurants__, etc.

In [None]:
# Rows labelled 0, showing column 1 plus every column from position 5 onward
# (presumably the neighborhood name and the ranked restaurant columns — confirm
# against toronto_merged's column order).
in_cluster = toronto_merged['Cluster Labels'] == 0
shown_positions = [1, *range(5, toronto_merged.shape[1])]
toronto_merged[in_cluster].iloc[:, shown_positions]

__Cluster 2__: Mainly comprised of __Italian__, __Vegetarian/Vegan__, __Asian__, __Japanese__, __Korean__, __Comfort Food Restaurants__, etc.

In [None]:
# Rows labelled 1, showing column 1 plus every column from position 5 onward
# (presumably the neighborhood name and the ranked restaurant columns — confirm
# against toronto_merged's column order).
in_cluster = toronto_merged['Cluster Labels'] == 1
shown_positions = [1, *range(5, toronto_merged.shape[1])]
toronto_merged[in_cluster].iloc[:, shown_positions]

__Cluster 3__: Mainly comprised of __Fast Food__, __Vegetarian/Vegan__, __Korean__, __Asian Restaurants__, etc.

In [None]:
# Rows labelled 2, showing column 1 plus every column from position 5 onward
# (presumably the neighborhood name and the ranked restaurant columns — confirm
# against toronto_merged's column order).
in_cluster = toronto_merged['Cluster Labels'] == 2
shown_positions = [1, *range(5, toronto_merged.shape[1])]
toronto_merged[in_cluster].iloc[:, shown_positions]

__Cluster 4__: Mainly comprised of __Middle Eastern__, __Vegetarian/Vegan__, __Korean__, __Asian__, __Comfort Food Restaurants__, etc.

In [None]:
# Rows labelled 3, showing column 1 plus every column from position 5 onward
# (presumably the neighborhood name and the ranked restaurant columns — confirm
# against toronto_merged's column order).
in_cluster = toronto_merged['Cluster Labels'] == 3
shown_positions = [1, *range(5, toronto_merged.shape[1])]
toronto_merged[in_cluster].iloc[:, shown_positions]

__Cluster 5__: Mainly comprised of __Mediterranean__, __Vegetarian/Vegan__, __Korean__, __Asian__, __Comfort Food Restaurants__, etc.

In [None]:
# Rows labelled 4, showing column 1 plus every column from position 5 onward
# (presumably the neighborhood name and the ranked restaurant columns — confirm
# against toronto_merged's column order).
in_cluster = toronto_merged['Cluster Labels'] == 4
shown_positions = [1, *range(5, toronto_merged.shape[1])]
toronto_merged[in_cluster].iloc[:, shown_positions]