## Installing Befautiful Soup 

In [1]:
#install Beautiful Soup and requests for Web Scaping
!pip install BeautifulSoup4



## Installing lxml to read Wifi

In [2]:
pip install lxml


The following command must be run outside of the IPython shell:

    $ pip install lxml

The Python package manager (pip) can only be used from outside of IPython.
Please reissue the `pip` command in a separate terminal or command prompt.

See the Python documentation for more information on how to install packages:

    https://docs.python.org/3/installing/


## Importing Libiaries

In [3]:
#imports
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

## Reading wiki link

In [4]:

#get html from wiki page and create soup object
source = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(source.text, 'lxml')

#using soup object, iterate the .wikitable to get the data from the HTML page and store it into a list
data = []
columns = []
table = soup.find(class_='wikitable')
for index, tr in enumerate(table.find_all('tr')):
    section = []
    for td in tr.find_all(['th','td']):
        section.append(td.text.rstrip())
    
    #First row of data is the header
    if (index == 0):
        columns = section
    else:
        data.append(section)

#convert list into Pandas DataFrame
canada_df = pd.DataFrame(data = data,columns = columns)
canada_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## Remove Borough Not Assigned

In [5]:
#Remove Boroughs that are 'Not assigned'
canada_df = canada_df[canada_df['Borough'] != 'Not assigned']
canada_df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


## Group by Postal Code

In [6]:
# More than one neighborhood can exist in one postal code area, combined these into one row with the neighborhoods separated with a comma
canada_df["Neighbourhood"] = canada_df.groupby("Postcode")["Neighbourhood"].transform(lambda neigh: ', '.join(neigh))

#remove duplicates"
canada_df = canada_df.drop_duplicates()

#update index to be postcode if it isn't already
if(canada_df.index.name != 'Postcode'):
    canada_df = canada_df.set_index('Postcode')
    
canada_df.head()

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,Harbourfront
M6A,North York,"Lawrence Heights, Lawrence Manor"
M7A,Queen's Park,Not assigned


## If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough

In [9]:
canada_df['Neighbourhood'].replace("Not assigned", canada_df["Borough"],inplace=True)
canada_df.head()

Unnamed: 0_level_0,Borough,Neighbourhood
Postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,Harbourfront
M6A,North York,"Lawrence Heights, Lawrence Manor"
M7A,Queen's Park,Queen's Park


## Printing Number of rows using shape()

In [10]:
canada_df.shape

(103, 2)

## Reading Geospatial Data via link

In [11]:
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner

link = "http://cocl.us/Geospatial_data"
dfgeo = pd.read_csv(link)

dfgeo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## Chainging Columns Name 

In [12]:
dfgeo.columns = ['Postcode','Latitude','Longitude']

cols = dfgeo.columns.tolist()
cols

['Postcode', 'Latitude', 'Longitude']

## Merge Canada & dfgeo

In [13]:
df_canda_loc = pd.merge(canada_df, dfgeo, on='Postcode')
df_canda_loc.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,"Lawrence Heights, Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


## Import Libraries

In [15]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

# All requested packages already installed.

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    folium-0.5.0               |             py_0          45 KB  conda-forge
    altair-4.0.0               |             py_0         606 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         704 KB

The following NEW packages will be INSTALLED:

    altair:  4.0.0-py_0 conda-forge
    branca:  0.3.1-py_0 conda-forge
    folium:  0.5.0-py_0 conda-forge
    vincent: 0.4.4-py_1 conda-forge


Down

## Creating Toronto Dataset

In [30]:
Toronto_data = df_canda_loc[df_canda_loc['Borough'].str.contains('Toronto')].reset_index(drop=True)# select Boroughs with "Toronto"
Toronto_data.head()


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
1,M9A,Downtown Toronto,Queen's Park,43.667856,-79.532242
2,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


## Creating Map using Toronto

In [33]:

address = 'Toronto, Ontario'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))


# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(Toronto_data['Latitude'], Toronto_data['Longitude'], Toronto_data['Borough'], Toronto_data['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto



The geograpical coordinate of Toronto City are 43.653963, -79.387207.


## Explore Neighborhoods in Toronto

In [38]:
CLIENT_ID = 'S1DLN2WDUCQNN2LFQEC2FGL4XXTGYNOZOWXVG5S0F0HQODUZ' # your Foursquare ID
CLIENT_SECRET = 'W1KQ0UIOSXFGTVEOWCLEZYID3KGQBGOCYFKAUCMAGJ2OMQUZ' # your Foursquare Secret
VERSION = '20180604' # Foursquare API version


Toronto_data.loc[0, 'Neighbourhood']

neighborhood_latitude = Toronto_data.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = Toronto_data.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = Toronto_data.loc[0, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

LIMIT = 30 # limit of number of venues returned by Foursquare API

radius = 500 # define radius

Latitude and longitude values of Harbourfront are 43.6542599, -79.3606359.


In [40]:
 #create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5df43f4caba297274336a03e'},
 'response': {'suggestedFilters': {'header': 'Tap to show:',
   'filters': [{'name': 'Open now', 'key': 'openNow'}]},
  'headerLocation': 'Corktown',
  'headerFullLocation': 'Corktown, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 50,
  'suggestedBounds': {'ne': {'lat': 43.6587599045, 'lng': -79.3544279001486},
   'sw': {'lat': 43.6497598955, 'lng': -79.36684389985142}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '54ea41ad498e9a11e9e13308',
       'name': 'Roselle Desserts',
       'location': {'address': '362 King St E',
        'crossStreet': 'Trinity St',
        'lat': 43.653446723052674,
        'lng': -79.3620167174383,
        'labeledLatLngs': [{'label': 'display',
 

In [41]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']
    
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues

Unnamed: 0,name,categories,lat,lng
0,Roselle Desserts,Bakery,43.653447,-79.362017
1,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Cooper Koo Family YMCA,Gym / Fitness Center,43.653191,-79.357947
3,Body Blitz Spa East,Spa,43.654735,-79.359874
4,Morning Glory Cafe,Breakfast Spot,43.653947,-79.361149
5,Impact Kitchen,Restaurant,43.656369,-79.35698
6,Figs Breakfast & Lunch,Breakfast Spot,43.655675,-79.364503
7,Corktown Common,Park,43.655618,-79.356211
8,The Distillery Historic District,Historic Site,43.650244,-79.359323
9,Dominion Pub and Kitchen,Pub,43.656919,-79.358967


## Clustering

In [65]:
# one hot encoding
toronto_onehot = pd.get_dummies(nearby_venues[['categories']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = nearby_venues['name'] 
toronto_onehot['Latitude'] = nearby_venues['lat'] 
toronto_onehot['Longitude'] = nearby_venues['lng'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

Unnamed: 0,Longitude,Bakery,Breakfast Spot,Café,Chocolate Shop,Coffee Shop,Dessert Shop,Farmers Market,French Restaurant,Greek Restaurant,Gym / Fitness Center,Historic Site,Mexican Restaurant,Park,Performing Arts Venue,Pub,Restaurant,Spa,Theater,Neighborhood,Latitude
0,-79.362017,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Roselle Desserts,43.653447
1,-79.361809,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,Tandem Coffee,43.653559
2,-79.357947,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,Cooper Koo Family YMCA,43.653191
3,-79.359874,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,Body Blitz Spa East,43.654735
4,-79.361149,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Morning Glory Cafe,43.653947


#### Group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [66]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighborhood,Longitude,Bakery,Breakfast Spot,Café,Chocolate Shop,Coffee Shop,Dessert Shop,Farmers Market,French Restaurant,Greek Restaurant,Gym / Fitness Center,Historic Site,Mexican Restaurant,Park,Performing Arts Venue,Pub,Restaurant,Spa,Theater,Latitude
0,Alumnae Theatre,-79.364753,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,43.652756
1,Arvo,-79.361442,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,43.649963
2,Body Blitz Spa East,-79.359874,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,43.654735
3,Brick Street Bakery,-79.359539,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,43.650574
4,Cacao 70,-79.360723,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,43.650067
5,Caffe Furbo,-79.358849,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,43.64997
6,Cluny Bistro & Boulangerie,-79.357843,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,43.650565
7,Cocina Economica,-79.365657,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,43.654959
8,Cooper Koo Family YMCA,-79.357947,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,43.653191
9,Corktown Common,-79.356211,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,43.655618


#### Print each neighborhood along with the top 5 most common venues

In [67]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Alumnae Theatre----
        venue   freq
0    Latitude  43.65
1     Theater   1.00
2      Bakery   0.00
3         Spa   0.00
4  Restaurant   0.00


----Arvo----
           venue   freq
0       Latitude  43.65
1    Coffee Shop   1.00
2  Historic Site   0.00
3        Theater   0.00
4            Spa   0.00


----Body Blitz Spa East----
        venue   freq
0    Latitude  43.65
1         Spa   1.00
2      Bakery   0.00
3     Theater   0.00
4  Restaurant   0.00


----Brick Street Bakery----
        venue   freq
0    Latitude  43.65
1      Bakery   1.00
2     Theater   0.00
3         Spa   0.00
4  Restaurant   0.00


----Cacao 70----
           venue   freq
0       Latitude  43.65
1   Dessert Shop   1.00
2  Historic Site   0.00
3        Theater   0.00
4            Spa   0.00


----Caffe Furbo----
           venue   freq
0       Latitude  43.65
1           Café   1.00
2  Historic Site   0.00
3        Theater   0.00
4            Spa   0.00


----Cluny Bistro & Boulangerie----
             

## Putting into Dataframe

In [93]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]


num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood','Latitude','Longitude']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))
# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']
neighborhoods_venues_sorted['Latitude'] = toronto_grouped['Latitude']
neighborhoods_venues_sorted['Longitude'] = toronto_grouped['Longitude']


for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 3:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Alumnae Theatre,43.652756,-79.364753,Latitude,Theater,Bakery,Breakfast Spot,Café,Chocolate Shop,Coffee Shop,Dessert Shop,Farmers Market,French Restaurant
1,Arvo,43.649963,-79.361442,Latitude,Coffee Shop,Theater,Bakery,Breakfast Spot,Café,Chocolate Shop,Dessert Shop,Farmers Market,French Restaurant
2,Body Blitz Spa East,43.654735,-79.359874,Latitude,Spa,French Restaurant,Bakery,Breakfast Spot,Café,Chocolate Shop,Coffee Shop,Dessert Shop,Farmers Market
3,Brick Street Bakery,43.650574,-79.359539,Latitude,Bakery,Theater,Breakfast Spot,Café,Chocolate Shop,Coffee Shop,Dessert Shop,Farmers Market,French Restaurant
4,Cacao 70,43.650067,-79.360723,Latitude,Dessert Shop,Theater,Bakery,Breakfast Spot,Café,Chocolate Shop,Coffee Shop,Farmers Market,French Restaurant


In [94]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([3, 2, 3, 4, 4, 0, 4, 3, 4, 1], dtype=int32)

In [102]:
#Add Clustering label
neighborhoods_venues_sorted.insert(0, 'ClusterLabels', kmeans.labels_)

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]


# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(neighborhoods_venues_sorted['Latitude'], neighborhoods_venues_sorted['Longitude'], neighborhoods_venues_sorted['Neighborhood'], neighborhoods_venues_sorted['ClusterLabels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters
