# This code scrapes the List_of_postal_codes_of_Canada data from Wikipedia. The scraped data is then cleaned so that no row contains 'Not assigned' entries. In the Neighborhood column, multiple entries are separated by commas.

In [14]:
import os

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as Soup
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

# Download the Wikipedia page listing the Canadian postal codes starting with "M".
read = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,  # do not hang indefinitely if the site is unreachable
)
read.raise_for_status()  # stop on an HTTP error instead of parsing an error page

# Name the parser explicitly: otherwise BeautifulSoup picks whichever parser
# happens to be installed, which can change the resulting tree between machines.
soup = Soup(read.text, 'html.parser')
soup1 = soup.find('tbody')  # first table body found in the document
if soup1 is None:
    raise ValueError('No <tbody> element found; the page layout may have changed.')
j = soup1.find_all('tr')

# Column names are the stripped texts of the <th> cells of that table body.
header = [th.text.strip('\n') for th in soup1.find_all('th')]

# One list of stripped <td> texts per row. The first <tr> is skipped, exactly
# as before (presumably because it holds the header cells).
upper = [
    [td.text.strip('\n') for td in row.find_all('td')]
    for row in j[1:]
]
    
# Build the raw table, then drop every row whose borough is 'Not assigned'.
Data = pd.DataFrame(upper, columns=header)
Clean_Data = Data[Data['Borough'] != 'Not assigned'].reset_index(drop=True)

# Use a comma as the separator between multiple neighbourhoods.
# The vectorised .str accessor leaves missing cells untouched, whereas calling
# .replace on each value raises as soon as one cell is not a string.
Clean_Data['Neighborhood'] = Clean_Data['Neighborhood'].str.replace('/', ',', regex=False)

# Convert 'Not assigned' Neighborhood to Borough, for all such rows at once.
unassigned = Clean_Data['Neighborhood'] == 'Not assigned'
Clean_Data.loc[unassigned, 'Neighborhood'] = Clean_Data.loc[unassigned, 'Borough']
        

# Load the per-postal-code coordinates and attach them to the cleaned table.
df = pd.read_csv('Geospatial_Coordinates.csv')

# Inner join on the postal code (the two tables spell the column differently),
# keep the five columns of interest, and rename the key to 'PostalCode'.
Full_Data = (
    Clean_Data
    .merge(df, how='inner', left_on='Postal code', right_on='Postal Code')
    [['Postal code', 'Borough', 'Neighborhood', 'Latitude', 'Longitude']]
    .rename(columns={'Postal code': 'PostalCode'})
)

def Search(X, data=None):
    """Return the rows whose ``Borough`` contains the text ``X``.

    Parameters
    ----------
    X : str
        Text looked for inside each borough name. The match is a literal,
        case-sensitive substring test.
    data : pandas.DataFrame, optional
        Table with a ``Borough`` column. When omitted, the module-level
        ``Full_Data`` is searched, as before.

    Returns
    -------
    pandas.DataFrame
        The matching rows, keeping their original index labels. Rows with a
        missing borough never match.
    """
    frame = Full_Data if data is None else data
    # regex=False keeps this a plain substring test; na=False treats missing
    # boroughs as non-matching instead of raising.
    mask = frame['Borough'].str.contains(X, regex=False, na=False)
    return frame[mask]

# Keep the boroughs whose name contains 'Toronto' and renumber the rows from 0.
My_City = Search(X='Toronto').reset_index(drop=True)




# Foursquare API settings.
# NOTE(review): real credentials are committed here in plain text. They should
# be rotated and supplied only through the environment variables below; the
# literals remain as fallbacks so that existing runs keep working unchanged.
CLIENT_ID = os.environ.get(
    'FOURSQUARE_CLIENT_ID',
    'FYM5SY3HVL1E2BWQ41KAFY054LGCJOYYMGZVWG14ESRABPOK',
)  # your Foursquare ID
CLIENT_SECRET = os.environ.get(
    'FOURSQUARE_CLIENT_SECRET',
    'KAS3C0GHNLF2EYQWQQBLSRC50ZSNPM0SZEI054RKCUYIPENC',
)  # your Foursquare Secret
VERSION = '20180605'  # Foursquare API version
LIMIT = 500  # value sent as the `limit` query parameter of each request

def getNearbyVenues(names, latitudes, longitudes, radius=500, timeout=30):
    """Collect the Foursquare venues reported around each named location.

    One request is sent to the Foursquare ``venues/explore`` endpoint per
    (name, latitude, longitude) triple, using the module-level ``CLIENT_ID``,
    ``CLIENT_SECRET``, ``VERSION`` and ``LIMIT`` settings. Each name is
    printed before its request as a progress indicator.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Parallel sequences; they are consumed together with ``zip``.
    radius : int, optional
        Value sent as the ``radius`` query parameter.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns ``Neighborhood``,
        ``Neighborhood Latitude``, ``Neighborhood Longitude``, ``Venue``,
        ``Venue Latitude``, ``Venue Longitude`` and ``Venue Category``.
        The frame is empty, but still has these columns, when no venue is
        collected.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = [
        'Neighborhood',
        'Neighborhood Latitude',
        'Neighborhood Longitude',
        'Venue',
        'Venue Latitude',
        'Venue Longitude',
        'Venue Category',
    ]

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and escape the query string.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=timeout,
        )
        # Surface API errors here rather than as a KeyError on the payload.
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            categories = venue['categories']
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue may come without any category; record it as missing.
                categories[0]['name'] if categories else None,
            ))

    # Passing the columns up front keeps the schema when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

# Fetch the venues around every row of My_City, using the default radius.
My_City_venues = getNearbyVenues(
    My_City['Neighborhood'],
    My_City['Latitude'],
    My_City['Longitude'],
)


def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first entry of ``row`` is skipped. The remaining entries are ordered
    from largest to smallest value, and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

# Number of top venue categories kept per neighbourhood.
num_top_venues = 10

# Ordinal suffixes for the first three ranks; later ranks use 'th'.
indicators = ['st', 'nd', 'rd']

# One indicator column per venue category, one row per venue.
My_City_onehot = pd.get_dummies(My_City_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
My_City_onehot['Neighborhood'] = My_City_venues['Neighborhood']

# move neighborhood column to the first column
fixed_columns = [My_City_onehot.columns[-1]] + list(My_City_onehot.columns[:-1])
My_City_onehot = My_City_onehot[fixed_columns]

# Mean of the indicators per neighbourhood, i.e. the share of each category
# among that neighbourhood's venues.
My_City_grouped = My_City_onehot.groupby('Neighborhood').mean().reset_index()

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in range(num_top_venues):
    # Pick the suffix explicitly instead of catching an exception from the
    # list lookup, which would also hide unrelated errors.
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind + 1, suffix))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = My_City_grouped['Neighborhood']

# Fill each row with that neighbourhood's top categories, in rank order.
for ind in range(My_City_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(My_City_grouped.iloc[ind, :], num_top_venues)


# set number of clusters
# set number of clusters
kclusters = 5

# Cluster on the category columns only. `columns=` is used because passing the
# axis positionally (`drop('Neighborhood', 1)`) is rejected by pandas >= 2.0.
My_City_grouped_clustering = My_City_grouped.drop(columns='Neighborhood')

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(My_City_grouped_clustering)

# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Attach the cluster label and top venues to each row of My_City so they sit
# next to its latitude/longitude. `join` is a left join by default, so a
# neighbourhood absent from neighborhoods_venues_sorted keeps its row with
# missing values in the added columns.
My_City_merged = My_City.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')


# create map
latitude = 43.741667
longitude = -79.373333
My_City_map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour each
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(My_City_merged['Latitude'], My_City_merged['Longitude'], My_City_merged['Neighborhood'], My_City_merged['Cluster Labels']):
    # A row without a cluster label has nothing to colour by; skip it rather
    # than fail on a non-integer list index.
    if pd.isna(cluster):
        continue
    # Missing labels elsewhere in the column turn every label into a float.
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        # Label 0 maps to the last colour through the negative index; kept as
        # is so the colours of an existing map do not change.
        color=rainbow[cluster - 1],
        fill=True,
        fill_color=rainbow[cluster - 1],
        fill_opacity=0.7).add_to(My_City_map_clusters)

My_City_map_clusters

Regent Park , Harbourfront
Queen's Park , Ontario Provincial Government
Garden District, Ryerson
St. James Town
The Beaches
Berczy Park
Central Bay Street
Christie
Richmond , Adelaide , King
Dufferin , Dovercourt Village
Harbourfront East , Union Station , Toronto Islands
Little Portugal , Trinity
The Danforth West , Riverdale
Toronto Dominion Centre , Design Exchange
Brockton , Parkdale Village , Exhibition Place
India Bazaar , The Beaches West
Commerce Court , Victoria Hotel
Studio District
Lawrence Park
Roselawn
Davisville North
Forest Hill North & West
High Park , The Junction South
North Toronto West
The Annex , North Midtown , Yorkville
Parkdale , Roncesvalles
Davisville
University of Toronto , Harbord
Runnymede , Swansea
Moore Park , Summerhill East
Kensington Market , Chinatown , Grange Park
Summerhill West , Rathnelly , South Hill , Forest Hill SE , Deer Park
CN Tower , King and Spadina , Railway Lands , Harbourfront West , Bathurst
 Quay , South Niagara , Island airport
Rosed