In [None]:
# Imports and display configuration used by the rest of the notebook.
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Passing None removes the display limits, so dataframes are shown in full
# (every column and every row).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# NOTE(review): the `pandas.io.json.json_normalize` import path is deprecated since
# pandas 1.0 in favor of `pd.json_normalize` — confirm the installed pandas still provides it.
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

# import beautifulsoup for web scraping
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
print('Libraries imported.')

In [None]:
# 1. Scrape The Data Into DataFrame

In [None]:
# Wikipedia page listing the Canadian postal codes that start with "M".
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Parse inside a `with` block so the HTTP response is closed as soon as
# BeautifulSoup has consumed it, instead of leaving the connection open.
with urlopen(url) as html:
    soup = BeautifulSoup(html, 'lxml')

# Print only the beginning of the document as a sanity check; dumping the
# whole page floods the cell output and bloats the saved notebook.
PREVIEW_CHARS = 1000
print(soup.prettify()[:PREVIEW_CHARS])

In [None]:
# Locate the postal-code table, identified by class="wikitable sortable".
table = soup.find('table', attrs={'class': 'wikitable sortable'})
if table is None:
    # find() returns None when nothing matches; fail here with a clear message
    # rather than with an AttributeError on the find_all() call below.
    raise ValueError('No table with class "wikitable sortable" was found on the page.')

# Grab all the rows
table_rows = table.find_all('tr')

# Build one list of raw cell texts per <tr>.
# - A row without any <td> cells yields an empty list, which pandas turns into
#   an all-None row; it is kept here because a later cell removes it.
# - Cell text is kept verbatim (no stripping) because the cleaning cells below
#   operate on the raw values.
l = [[cell.text for cell in tr.find_all('td')] for tr in table_rows]

# To dataframe
columns_name = ['PostCode', 'Borough', 'Neighborhood']
Toronto_df = pd.DataFrame(l, columns=columns_name)

Toronto_df.head()

In [None]:
# Quick look at the size of the scraped dataframe:
# number of distinct Borough values and total number of rows.
borough_count = len(Toronto_df['Borough'].unique())
row_count = Toronto_df.shape[0]
summary_template = 'The dataframe has {} boroughs and {} neighborhoods.'
print(summary_template.format(borough_count, row_count))

In [None]:
# 2. Clean The Table

In [None]:
# Remove the empty row left by the table header and reset the index.
# dropna(how='all') only discards rows in which every column is missing, so
# re-running this cell is harmless; the previous iloc[1:] dropped one more real
# row on every re-run. reset_index(drop=True) does the reset the comment
# promised, giving a clean 0-based index.
Toronto_df = Toronto_df.dropna(how='all').reset_index(drop=True)

Toronto_df


In [None]:
# Keep only the rows whose Borough is something other than 'Not assigned',
# then renumber the remaining rows from 0.
has_borough = Toronto_df['Borough'] != 'Not assigned'
Toronto_df = Toronto_df[has_borough].reset_index(drop=True)

Toronto_df

In [None]:
# Remove the trailing '\n' at the end of Neighborhood.
# rstrip('\n') only removes newline characters, so values without a trailing
# newline are left intact and re-running the cell is safe; the previous
# .str[:-1] slice removed the last character unconditionally.
Toronto_df['Neighborhood'] = Toronto_df['Neighborhood'].str.rstrip('\n')

Toronto_df

In [None]:
# Collapse rows sharing the same (PostCode, Borough) pair into a single row
# whose Neighborhood is the comma-separated list of the group's neighborhoods.
neighborhoods_by_postcode = Toronto_df.groupby(by=['PostCode', 'Borough'])['Neighborhood']
joined_neighborhoods = neighborhoods_by_postcode.apply(', '.join)
Toronto_df = joined_neighborhoods.reset_index()

Toronto_df

In [None]:
# Rows that have a borough but a 'Not assigned' neighborhood take the borough
# name as their neighborhood. The right-hand side is aligned on the index, so
# each selected row receives the Borough value from that same row.
neighborhood_missing = Toronto_df['Neighborhood'] == 'Not assigned'
borough_names = Toronto_df['Borough']
Toronto_df.loc[neighborhood_missing, 'Neighborhood'] = borough_names

Toronto_df

In [170]:
# Sanity check after the cleaning steps: (number of rows, number of columns).
Toronto_df.shape

(103, 3)