# Web Scraping

### Set up the environment: install libraries and import modules

In [207]:
# !pip install --user beautifulsoup4

from bs4 import BeautifulSoup
import requests
import pandas as pd

### Read the webpage and select the table rows

In [208]:
# Download the Wikipedia page that lists the Canadian postal codes starting with "M".
# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
r = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
r.raise_for_status()
data = r.text

soup = BeautifulSoup(data, 'lxml')

# Locate the table through its CSS class; find() returns None when nothing matches,
# so fail with a clear message rather than an AttributeError on the next line.
table = soup.find("table", {"class" : "wikitable sortable"})
if table is None:
    raise ValueError("No table with class 'wikitable sortable' was found on the page")

table_rows = table.find_all('tr')

### Define a function for adding row data to a dictionary

In [209]:
# function for adding a row list to the pbn (postalcode-borough-neighbourhood) dictionary
def add_to_pbndict(row, target=None):
    """Add one table row to a postalcode-borough-neighbourhood dictionary.

    Parameters
    ----------
    row : sequence of str
        Cell texts of one table row. The first three items are read as
        postal code, borough and neighbourhood. Rows with fewer than three
        items are ignored.
    target : dict, optional
        Dictionary to update in place. When omitted, the notebook-level
        ``pbndict`` is used, so existing ``add_to_pbndict(row)`` calls keep
        working.

    Returns
    -------
    None
        The dictionary is modified in place. Each value has the form
        ``{"Borough": ..., "Neighbourhood": ...}``.
    """
    if len(row) < 3:
        # Incomplete row: there is no postal code / borough / neighbourhood triple to store
        return
    if target is None:
        target = pbndict  # fall back to the dictionary defined at notebook level

    p, b, n = row[0], row[1], row[2]

    if b == 'Not assigned':  # don't add an unassigned borough
        return

    # Use the borough name when the neighbourhood is 'Not assigned'
    if n == 'Not assigned':
        n = b

    if p in target:
        # Postal code already present: append this neighbourhood to the stored
        # comma-separated list (the borough is overwritten with the current value)
        target[p] = {"Borough": b, "Neighbourhood": target[p]['Neighbourhood'] + ", " + n}
    else:
        # Postal code seen for the first time
        target[p] = {"Borough": b, "Neighbourhood": n}

### Add the html table row data to the dictionary

In [210]:
# Build the pbn (postalcode borough neighbourhood) dictionary from the table rows
pbndict = {}

for table_row in table_rows:
    # Whitespace-trimmed text of every <td> cell in this row
    cells = [cell.text.strip() for cell in table_row.find_all('td')]
    if not cells:
        # A row without any <td> cells (e.g. the table headings) has nothing to add
        continue
    add_to_pbndict(cells)

### Copy the pbn (postalcode borough neighbourhood) dictionary contents to a DataFrame

In [211]:
# Create the DataFrame from the dictionary in a single step.
# Collecting all records first and constructing the frame once avoids growing
# it row by row with df.loc[len(df)], which re-allocates on every insertion.
df = pd.DataFrame(
    [(code, entry['Borough'], entry['Neighbourhood']) for code, entry in pbndict.items()],
    columns=['PostalCode', 'Borough', 'Neighbourhood'],
)

In [214]:
# Display the DataFrame: as the last expression in the cell, `df` is shown as the cell's output
df

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M9A,Etobicoke,Islington Avenue
1,M4H,East York,Thorncliffe Park
2,M9L,North York,Humber Summit
3,M9W,Etobicoke,Northwest
4,M5S,Downtown Toronto,"Harbord, University of Toronto"
5,M3J,North York,"Northwood Park, York University"
6,M2H,North York,Hillcrest Village
7,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So..."
8,M6C,York,Humewood-Cedarvale
9,M6L,North York,"Maple Leaf Park, North Park, Upwood Park"


### Print the number of rows in the DataFrame

In [213]:
# Report how many rows (one per postal code) the DataFrame holds
print(f"There are {len(df)} rows in the DataFrame")

There are 103 rows in the DataFrame
