# Scrape the Wikipedia Page

### Import libraries that we need

In [1]:
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

### Set the url to the website and access the site with the requests library

In [2]:
# Wikipedia list of Canadian postal codes beginning with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# requests applies no timeout by default, so an unresponsive server would hang
# the notebook; bound the wait, and fail here on an HTTP error status rather
# than parsing an error page in the cells below.
response = requests.get(url, timeout=30)
response.raise_for_status()

### A status code of 200 means the request succeeded

In [3]:
# Display the response object; its repr includes the HTTP status code
response

<Response [200]>

### Use the BeautifulSoup library to get data on the website

In [4]:
# Parse the decoded page body into a navigable tree using the 'html.parser' backend
soup = BeautifulSoup(markup=response.text, features='html.parser')


### Find the target table

In [5]:
# The first <table> carrying the "wikitable" class is taken as the target
target_table = soup.find_all('table', class_='wikitable')[0]

### Get the column titles we need

In [6]:
# Header cells carry surrounding whitespace (a trailing newline on this page).
# strip() removes it without dropping any character of the title itself, which
# the previous slice-based cleanup did for the character after the newline.
column_titles = [header.text.strip() for header in target_table.find_all('th')]
column_titles

['Postal Code', 'Borough', 'Neighbourhood']

### Get the column data we need

In [7]:
def boroughIsValid(borough):
    """Tell whether a borough value is a real one.

    Returns True for any value other than the 'Not assigned' placeholder
    (exact, case-sensitive comparison), False for the placeholder itself.
    """
    placeholder = 'Not assigned'
    return borough != placeholder

In [8]:
def neibourhoodIsValid(neibourhood):
    """Tell whether a neighbourhood value is a real one.

    Returns False only when the value equals the 'Not assigned' placeholder
    exactly (case-sensitive); every other value yields True.
    """
    unassigned_marker = 'Not assigned'
    return neibourhood != unassigned_marker

In [9]:
from collections import defaultdict

# Neighbourhoods grouped by (postal code, borough). Dicts keep insertion order,
# so the rows built below follow the order in which codes appear in the table.
columns_data = defaultdict(list)
# Walk the rows of the table selected earlier instead of the first <tbody>
# found anywhere on the page, so headers and data come from the same table.
rows = target_table.find_all('tr')
for row in rows:
    cols = row.find_all('td')
    if not cols:
        # Rows without <td> cells (e.g. the header row) carry no data
        continue
    postal_code, borough, neighbourhood = [x.text.strip() for x in cols]
    # Only process the cells that have an assigned borough
    if not boroughIsValid(borough):
        continue
    # A row with a borough but a 'Not assigned' neighbourhood uses the borough as its neighbourhood
    if not neibourhoodIsValid(neighbourhood):
        neighbourhood = borough
    columns_data[(postal_code, borough)].append(neighbourhood)

# Combine rows sharing the same postal code and borough into one row. Join with
# ', ' (the separator the page itself uses inside a cell) so merged values are
# formatted like the rest of the column.
data = [
    [postal_code, borough, ', '.join(neighbourhoods)]
    for (postal_code, borough), neighbourhoods in columns_data.items()
]

data

[['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Regent Park, Harbourfront'],
 ['M6A', 'North York', 'Lawrence Manor, Lawrence Heights'],
 ['M7A', 'Downtown Toronto', "Queen's Park, Ontario Provincial Government"],
 ['M9A', 'Etobicoke', 'Islington Avenue, Humber Valley Village'],
 ['M1B', 'Scarborough', 'Malvern, Rouge'],
 ['M3B', 'North York', 'Don Mills'],
 ['M4B', 'East York', 'Parkview Hill, Woodbine Gardens'],
 ['M5B', 'Downtown Toronto', 'Garden District, Ryerson'],
 ['M6B', 'North York', 'Glencairn'],
 ['M9B',
  'Etobicoke',
  'West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale'],
 ['M1C', 'Scarborough', 'Rouge Hill, Port Union, Highland Creek'],
 ['M3C', 'North York', 'Don Mills'],
 ['M4C', 'East York', 'Woodbine Heights'],
 ['M5C', 'Downtown Toronto', 'St. James Town'],
 ['M6C', 'York', 'Humewood-Cedarvale'],
 ['M9C',
  'Etobicoke',
  'Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood

### Transform the data into a pandas dataframe

In [10]:
# Build the frame straight from the list of rows: the NumPy round-trip added
# nothing, and an empty list would become a 1-D array that cannot be matched
# against the three column titles.
df1 = pd.DataFrame(data, columns=column_titles)
df1

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [11]:
# (number of rows, number of columns) of the scraped table
df1.shape

(103, 3)

# Create another dataframe with latitude and longitude

### Read the geographical coordinates csv

In [12]:
# Load the per-postal-code coordinates; the relative path is resolved against
# the current working directory
df2 = pd.read_csv(filepath_or_buffer='Geospatial_Coordinates.csv')
df2

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


### Join the two tables

In [13]:
# Left-merge on the shared 'Postal Code' column so every scraped row is kept.
# validate='many_to_one' makes a repeated postal code in the coordinates file
# raise an error instead of silently duplicating rows in the result.
df = df1.merge(df2, how='left', on='Postal Code', validate='many_to_one')
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


# Explore and cluster the neighborhoods in Toronto

### Import libraries we need

In [14]:
import json

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [15]:
# df holds one row per (postal code, borough) group, so the second figure is
# the row count of the joined frame.
print(
    'The dataframe has {} boroughs and {} neighborhoods.'.format(
        df['Borough'].nunique(dropna=False), len(df)
    )
)

The dataframe has 10 boroughs and 103 neighborhoods.


### Get Toronto latitude and longitude

In [16]:
# Coordinates used as the centre of Toronto for the map below
latitude, longitude = 43.653225, -79.383186
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.653225, -79.383186.


### Generate maps

In [17]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# The left join can leave rows without coordinates; those have no position to
# plot, so only rows with both values get a marker.
plottable = df.dropna(subset=['Latitude', 'Longitude'])

# add markers to map
for lat, lng, borough, neighborhood in zip(
        plottable['Latitude'], plottable['Longitude'],
        plottable['Borough'], plottable['Neighbourhood']):
    # parse_html=True makes the popup treat the label as text rather than markup
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    # parse_html is a Popup option, not a CircleMarker one, so it is only
    # passed to the Popup above.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto