In [None]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
import geocoder# convert an address into latitude and longitude values
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# import beautifulsoup for web scraping
from urllib.request import urlopen
from bs4 import BeautifulSoup

%matplotlib inline
print('Libraries imported.')

In [None]:
# 1. Scrape The Data Into DataFrame

In [286]:
# URL of the Wikipedia page listing the postal codes that start with "M"
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Fetch and parse inside a `with` block so the HTTP connection is closed
# as soon as BeautifulSoup has read the response body.
with urlopen(url) as html:
    # Scrape with BeautifulSoup (uses the lxml parser)
    soup = BeautifulSoup(html, 'lxml')

# Preview only the beginning of the document; printing the whole page floods
# the notebook output and bloats the saved file.
print(soup.prettify()[:2000])

In [287]:
# Locate the postcode table through its class="wikitable sortable" attribute
table = soup.find('table', attrs={'class': 'wikitable sortable'})

# Every <tr> element of that table
table_rows = table.find_all('tr')

# One list of cell texts per table row; a row without <td> cells
# (e.g. a header row) contributes an empty list
cell_texts = [[cell.text for cell in tr.find_all('td')] for tr in table_rows]

# To dataframe
columns_name = ['PostCode', 'Borough', 'Neighborhood']
Toronto_df = pd.DataFrame(cell_texts, columns=columns_name)

Toronto_df.head()

Unnamed: 0,PostCode,Borough,Neighborhood
0,,,
1,M1A,Not assigned,Not assigned\n
2,M2A,Not assigned,Not assigned\n
3,M3A,North York,Parkwoods\n
4,M4A,North York,Victoria Village\n


In [288]:
# Quickly examine the dataframe: distinct Borough values and number of rows
borough_count = len(Toronto_df['Borough'].unique())
row_count = Toronto_df.shape[0]
print('The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, row_count))

The dataframe has 13 boroughs and 289 neighborhoods.


In [None]:
# 2. Clean The Table

In [289]:
# Remove the first row of the scraped table.
# The index is NOT renumbered here; that happens in the next cleaning step.
Toronto_df = Toronto_df.iloc[1:, :]


In [290]:
# Keep only the rows whose borough is assigned, then renumber the index from 0
has_borough = Toronto_df['Borough'] != 'Not assigned'
Toronto_df = Toronto_df.loc[has_borough].reset_index(drop=True)


In [291]:
# Remove the '\n' at the end of Neighborhood.
# rstrip('\n') only removes trailing newline characters, so re-running this cell
# leaves already-clean names untouched (slicing off the last character would
# keep shortening the names on every run).
Toronto_df['Neighborhood'] = Toronto_df['Neighborhood'].str.rstrip('\n')


In [292]:
# Collapse rows that share a (PostCode, Borough) pair into a single row whose
# Neighborhood is the comma-separated list of the original names
neighborhoods_by_postcode = Toronto_df.groupby(['PostCode', 'Borough'])['Neighborhood']
Toronto_df = neighborhoods_by_postcode.apply(', '.join).reset_index()


In [293]:
# A row with a borough but a 'Not assigned' neighborhood takes the borough name
# as its neighborhood.
unnamed = Toronto_df['Neighborhood'] == 'Not assigned'
Toronto_df.loc[unnamed, 'Neighborhood'] = Toronto_df.loc[unnamed, 'Borough']

Toronto_df.head()

Unnamed: 0,PostCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [280]:
Toronto_df.shape

(103, 5)

In [None]:
# 3. Get Postcode

In [None]:
Toronto_df.head()

In [None]:
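# The geocoder lookup is not working, so the loop below is left commented out and the coordinates are read from geo.csv instead.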

In [None]:
# for p in Toronto_df['PostCode']:
#     # initialize variable to None
#     lat_lng_coords = None

#     # loop until get the coordinates
#     while(lat_lng_coords is None):
#       g = geocoder.google('{}, Toronto, Ontario'.format(p))
#       lat_lng_coords = g.latlng

#     latitude = lat_lng_coords[0]
#     longitude = lat_lng_coords[1]
    
#     Toronto_df['Latitude'] = latitude
#     Toronto_df['Longitude'] = longitude

In [None]:
# Read the postcode coordinates from the local CSV and rename its 'Postal Code'
# column to 'PostCode' so it matches the key column of Toronto_df
geo_df = pd.read_csv('geo.csv').rename(columns={'Postal Code': 'PostCode'})



In [None]:
# Merge the two dataframes on their shared PostCode column
Toronto_df = Toronto_df.merge(geo_df, on='PostCode')


In [285]:
Toronto_df.head()

Unnamed: 0,PostCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [None]:
# Leave out the first column (PostCode), which is not needed for the analysis
without_postcode = Toronto_df.iloc[:, 1:]

# Keep only the boroughs whose name contains "Toronto"
is_toronto = without_postcode['Borough'].str.contains("Toronto")
Toronto_data = without_postcode[is_toronto].reset_index(drop=True)

# Check: distinct boroughs and number of rows that remain
remaining_boroughs = len(Toronto_data['Borough'].unique())
print('The dataframe has {} boroughs and {} neighborhoods.'.format(remaining_boroughs, Toronto_data.shape[0]))

In [None]:
# Latitude and longitude values of Toronto, used as the map centre
latitude = 43.6529
longitude = -79.3849

# Create a map of Toronto using the latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Options shared by every neighborhood marker
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# Add one circle marker per neighborhood, with the neighborhood name as popup
neighborhood_points = zip(Toronto_data['Latitude'], Toronto_data['Longitude'], Toronto_data['Neighborhood'])
for lat, lng, label in neighborhood_points:
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker([lat, lng], popup=label, **marker_style).add_to(map_toronto)

# show map
map_toronto

In [None]:
# Define Foursquare Credentials and Version
import os

# Credentials are read from the environment so that they are never stored in
# (or committed with) the notebook. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
VERSION = '20180605'  # value sent as the `v` parameter of each API request
LIMIT = 100  # value sent as the `limit` parameter of each API request

if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will fail until they are.')

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# Never echo the secret itself into the saved notebook output; show its length only
print('CLIENT_SECRET:' + '*' * len(CLIENT_SECRET))

In [None]:
# function that extracts the category of the venue
def get_category(row):
    """Return the name of the first category of a venue record.

    The category list is read from ``row['categories']``; when that key is
    missing, ``row['venue.categories']`` is used instead.

    Returns:
        The ``'name'`` of the first category, or ``None`` when the category
        list is empty.

    Raises:
        KeyError: if neither key is present in ``row``.
    """
    try:
        category_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem and must not be hidden.
        category_list = row['venue.categories']

    if len(category_list) == 0:
        return None
    return category_list[0]['name']
    
# function that gets the venues near each neighborhood from the Foursquare API

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare ``venues/explore`` endpoint once per neighborhood.

    Args:
        names: iterable of neighborhood names.
        latitudes: iterable of neighborhood latitudes, parallel to ``names``.
        longitudes: iterable of neighborhood longitudes, parallel to ``names``.
        radius: value sent as the ``radius`` request parameter (default 500).

    Returns:
        A DataFrame with one row per returned venue and the columns
        'Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
        'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category'.
        'Venue Category' is None for a venue that has no category. When no
        venue is returned at all the frame is empty but still has these columns.

    Raises:
        requests.HTTPError: if the API answers with an error status.

    Uses the module-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    # NOTE: the loop variable must be `lng`; it is used for both the request
    # and the output rows below.
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # make the GET request; requests builds and URL-encodes the query string
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
        )
        # fail with a clear HTTP error instead of a KeyError on the JSON below
        response.raise_for_status()

        groups = response.json()['response'].get('groups') or []
        results = groups[0]['items'] if groups else []

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue.get('categories') or []
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # passing `columns=` keeps the frame well-formed even when there are no rows
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)
    return nearby_venues

In [None]:
# Run the function and create a new dataframe called Toronto_venues.
# The three columns of Toronto_data are passed as parallel sequences; `radius`
# is left at the function's default.

Toronto_venues = getNearbyVenues(names=Toronto_data['Neighborhood'],
                                   latitudes=Toronto_data['Latitude'],
                                   longitudes=Toronto_data['Longitude'])


In [None]:
# Check the size of the dataframe

print(Toronto_venues.shape)
Toronto_venues

In [None]:
# Check how many venues were returned for each neighborhood
Toronto_venues.groupby('Neighborhood').count()

In [None]:
# Count the distinct venue categories among all the returned venues
distinct_categories = Toronto_venues['Venue Category'].unique()
print('There are {} uniques categories'.format(len(distinct_categories)))

In [None]:
# 4. Analyze Each Neighborhood

# one hot encoding: one indicator column per venue category
Toronto_onehot = pd.get_dummies(Toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# If a venue category is itself called 'Neighborhood', its indicator column would
# be overwritten by the neighborhood names added below. Rename it first so that
# no category is lost and the name column is unambiguous.
Toronto_onehot = Toronto_onehot.rename(columns={'Neighborhood': 'Neighborhood (Category)'})

# add the neighborhood column back to the dataframe, directly as the first column
Toronto_onehot.insert(0, 'Neighborhood', Toronto_venues['Neighborhood'])

Toronto_onehot.head()

In [None]:
Toronto_onehot.shape

In [None]:
# Average every indicator column per neighborhood, i.e. the frequency of
# occurrence of each category among that neighborhood's venues
Toronto_grouped = Toronto_onehot.groupby('Neighborhood', as_index=False).mean()
Toronto_grouped

In [None]:
Toronto_grouped.shape

In [None]:
# Find out each neighborhood along with the top 5 most common venues
num_top_venues = 5

for hood in Toronto_grouped['Neighborhood']:
    print("-----" + hood + "-----")
    # transpose the neighborhood's row into a two-column (venue, freq) table
    temp = Toronto_grouped[Toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue', 'freq']
    # Skip the first row (it holds the neighborhood name) and build a NEW frame
    # with numeric, rounded frequencies. assign() avoids writing into an iloc
    # slice, which is what raises SettingWithCopyWarning.
    temp = temp.iloc[1:].assign(freq=lambda t: t['freq'].astype(float)).round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

In [None]:
# function to return the venues
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of ``row``.

    The first entry of ``row`` is skipped; the remaining entries are sorted in
    descending order and the index labels of the first ``num_top_venues`` of
    them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [None]:
# Put the result into a dataframe

num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # ranks 1-3 take 'st' / 'nd' / 'rd', every later rank takes 'th'
    # (an explicit test instead of a bare `except:` used as control flow)
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe with one row per neighborhood
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_grouped['Neighborhood']

# fill the ranking columns of each row from the matching row of Toronto_grouped
for ind in np.arange(Toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

In [None]:
neighborhoods_venues_sorted.shape

In [None]:
# 5. Cluster Neighborhoods

In [None]:
# set number of clusters
kclusters = 5

# Feature matrix: every column of Toronto_grouped except the neighborhood name.
# `columns=` is used because recent pandas versions no longer accept the axis
# as a positional argument of drop().
Toronto_grouped_clustering = Toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering; n_init is pinned so the result does not depend on the
# default of the installed scikit-learn version
kmeans = KMeans(n_clusters=kclusters, random_state=0, n_init=10).fit(Toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
len(kmeans.labels_)

In [None]:
# Create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood

# add clustering labels as the first column; a copy left by a previous run of
# this cell is dropped first, because insert() raises on an existing column
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted = neighborhoods_venues_sorted.drop(columns='Cluster Labels')
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Merge: look up each row of Toronto_data in the sorted-venues table by its
# neighborhood name; rows without a match get NaN in the joined columns
Toronto_merged = Toronto_data.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

Toronto_merged['Cluster Labels'].unique()

In [None]:
# Drop the rows that do not have venues, then change the label to int.
# astype() has no `inplace` option and always returns a new object, so the
# conversion is chained and the result re-assigned.
Toronto_merged = Toronto_merged.dropna().astype({'Cluster Labels': int})


In [277]:
# Let's visualize the resulting clusters

# create a map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
color_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in color_array]

# add markers to the map
for lat, lng, poi, cluster in zip(Toronto_merged['Latitude'], Toronto_merged['Longitude'], Toronto_merged['Neighborhood'], Toronto_merged['Cluster Labels']):
    # a space separates the neighborhood name from the word 'Cluster' in the popup
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        # labels run from 0 to kclusters-1, so they index the palette directly
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [None]:
# Examine Clusters

In [None]:
# Cluster 1: rows labelled 0, showing column 1 and every column from position 5 on
# (presumably the neighborhood name and the most-common-venue columns)
in_cluster = Toronto_merged['Cluster Labels'] == 0
shown_columns = Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]
Toronto_merged.loc[in_cluster, shown_columns]

In [None]:
# Cluster 2: rows labelled 1, showing column 1 and every column from position 5 on
# (presumably the neighborhood name and the most-common-venue columns)
in_cluster = Toronto_merged['Cluster Labels'] == 1
shown_columns = Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]
Toronto_merged.loc[in_cluster, shown_columns]

In [None]:
# Cluster 3: rows labelled 2, showing column 1 and every column from position 5 on
# (presumably the neighborhood name and the most-common-venue columns)
in_cluster = Toronto_merged['Cluster Labels'] == 2
shown_columns = Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]
Toronto_merged.loc[in_cluster, shown_columns]

In [None]:
# Cluster 4: rows labelled 3, showing column 1 and every column from position 5 on
# (presumably the neighborhood name and the most-common-venue columns)
in_cluster = Toronto_merged['Cluster Labels'] == 3
shown_columns = Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]
Toronto_merged.loc[in_cluster, shown_columns]

In [None]:
# Cluster 5: rows labelled 4, showing column 1 and every column from position 5 on
# (presumably the neighborhood name and the most-common-venue columns)
in_cluster = Toronto_merged['Cluster Labels'] == 4
shown_columns = Toronto_merged.columns[[1] + list(range(5, Toronto_merged.shape[1]))]
Toronto_merged.loc[in_cluster, shown_columns]

In [None]:
# # Conclusion 

# Cluster 1: The 5th to 10th most common venues are basically the same. The 1st, 2nd and 3rd are Hot Dog Joint, Harbor / Marina and Lake.

# Cluster 2: Top 5 are mixed with Yoga Studio,Brewery and Playground
    
# Cluster 3: Venues are the same
    
# Cluster 4: Most of the venues are the same 

# Cluster 5: The top 4 are basically the same venues: Indian Restaurant, Café, Grocery Store and Sandwich Place.