In [2]:
# !pip install beautifulsoup4

Collecting beautifulsoup4
[?25l  Downloading https://files.pythonhosted.org/packages/cb/a1/c698cf319e9cfed6b17376281bd0efc6bfc8465698f54170ef60a485ab5d/beautifulsoup4-4.8.2-py3-none-any.whl (106kB)
[K     |████████████████████████████████| 112kB 23.6MB/s eta 0:00:01
[?25hCollecting soupsieve>=1.2 (from beautifulsoup4)
  Downloading https://files.pythonhosted.org/packages/81/94/03c0f04471fc245d08d0a99f7946ac228ca98da4fa75796c507f61e688c2/soupsieve-1.9.5-py2.py3-none-any.whl
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.8.2 soupsieve-1.9.5


In [17]:
# Import libraries

import requests
import pandas as pd
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from bs4 import BeautifulSoup
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [18]:
# %reload_ext autoreload
# %autoreload 2 # autoreload reloads modules automatically before entering the execution of code typed.
%matplotlib inline

In [19]:
page = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
page.status_code # A status_code of 200 means that the page downloaded successfully. 

200

In [20]:
soup = BeautifulSoup(page.content, 'html.parser')

In [21]:
table  = soup.find('table', { "class" : "wikitable sortable"})

# Transform the data into a pandas dataframe
1. **We will work on the columns first**

In [22]:
header_rows  = table.find_all('th')
header = [column_name.get_text().rstrip() for column_name in header_rows] # use rstrip() to remove all the trailing newline
header

['Postcode', 'Borough', 'Neighborhood']

2. **We proceed then on our data**
    1. Ignore cells with a borough that is Not assigned.
    2. If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [23]:
tr = table.findAll(['tr'])
data = []
for cell in tr:
    row    = cell.find_all('td')
    if len(row) == 0: # skip emtpy row
        continue
    else:
        if row[1].text == 'Not assigned': # Ignore cells with a borough that is "Not assigned". 
            continue
        else:
            if row[2].text.rstrip()  == 'Not assigned': # If a cell has a borough but that cell has a "Not assigned" neighborhood
                row[2] = row[1]                                   # then the neighborhood will be the same as the borough.
    df_row = [x.get_text().rstrip() for x in row]
    data.append(df_row)    

In [24]:
df =  pd.DataFrame(data=data,columns=header)
df.columns = ['PostalCode', 'Borough', 'Neighborhood'] # The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Queen's Park,Queen's Park
6,M9A,Downtown Toronto,Queen's Park
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


3. **Combined same postal code area into one row**

In [25]:
df_grouped = df.groupby(['PostalCode', 'Borough'])['Neighborhood'].apply(','.join).reset_index()
df_grouped.head(11)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


4. **Use the .shape method to print the number of rows of your dataframe.**

In [26]:
df_grouped.shape

(103, 3)

# Get the latitude and the longitude coordinates of each neighborhood anbd merge

In [27]:
# Since the geocoder API didnt work,  we will use the CSV
path = 'http://cocl.us/Geospatial_data'
df_lon_lat = pd.read_csv(path)

In [28]:
df_lon_lat.rename(columns={'Postal Code':'PostalCode'}, inplace = True)
df_lon_lat.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [29]:
df_2 =pd.merge(df, df_lon_lat, on='PostalCode')
df_2.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
3,M6A,North York,Lawrence Heights,43.718518,-79.464763
4,M6A,North York,Lawrence Manor,43.718518,-79.464763


# Explore and cluster the neighborhoods in Toronto. Replicate the same analysis we did to the New York City data.

In [30]:
address = 'Toronto'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto, Canada are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto, Canada are 43.653963, -79.387207.


#### Create a map of Toronto with neighborhoods superimposed on top.

In [31]:
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_2['Latitude'], df_2['Longitude'], df_2['Borough'], df_2['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto