# Web Scraping with Longitude and Latitude

### Set up environment - install libraries and imports

In [3]:
# !pip install --user beautifulsoup4

from bs4 import BeautifulSoup
import requests
import pandas as pd

### Read the webpage and select the table rows

In [4]:
# Download the Wikipedia page. The timeout keeps the cell from hanging forever
# on a stalled connection, and raise_for_status() fails fast on an HTTP error
# instead of silently parsing an error page.
r  = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M", timeout=30)
r.raise_for_status()
data = r.text

soup = BeautifulSoup(data, 'lxml')

# Locate the postal-code table by its CSS classes
table = soup.find("table", {"class" : "wikitable sortable"})
if table is None:
    # find() returns None when nothing matches; report that clearly rather
    # than failing with an AttributeError on the next line
    raise ValueError("Could not find a 'wikitable sortable' table on the page")

table_rows = table.find_all('tr')

### Define a function for adding row data to a dictionary

In [5]:
# Fold one scraped table row into the pbn (postalcode-borough-neighbourhood) dictionary
def add_to_pbndict(row):
    """Record one [postal code, borough, neighbourhood] row in the global ``pbndict``.

    Rows whose borough is 'Not assigned' are ignored. A neighbourhood of
    'Not assigned' is replaced by the borough name. When the postal code is
    already present, the neighbourhood is appended to the stored one
    (separated by ", ") and the borough is overwritten with this row's borough.

    Args:
        row: sequence whose first three items are the postal code, the
            borough and the neighbourhood.

    Returns:
        None. ``pbndict`` is modified in place.
    """
    postal_code, borough, neighbourhood = row[0], row[1], row[2]

    # Nothing to store for a postal code without a borough
    if borough == 'Not assigned':
        return

    # Fall back to the borough name when the neighbourhood is missing
    if neighbourhood == 'Not assigned':
        neighbourhood = borough

    # Postal code seen before: extend the neighbourhood list already stored
    if postal_code in pbndict:
        neighbourhood = pbndict[postal_code]['Neighbourhood'] + ", " + neighbourhood

    # Add a new entry or replace the existing one
    pbndict[postal_code] = {"Borough": borough, "Neighbourhood": neighbourhood}

### Add the html table row data to the dictionary

In [6]:
# Walk every <tr> of the scraped table and feed its cell texts into the
# pbn (postalcode borough neighbourhood) dictionary
pbndict = {}

for tr in table_rows:
    tds = tr.find_all('td')
    # strip() trims the surrounding whitespace/newline from each cell's text
    row = [cell.text.strip() for cell in tds]
    if not row:
        # a row without <td> cells (the table headings) yields an empty list
        continue
    add_to_pbndict(row)

### Copy the pbn (postalcode borough neighbourhood) dictionary contents to the DataFrame

In [7]:
# Build the DataFrame from the dictionary in a single constructor call.
# One record per postal code is collected first; appending rows one at a time
# with df.loc[len(df)] re-allocates the frame on every insert.
df = pd.DataFrame(
    [
        (postal_code, entry['Borough'], entry['Neighbourhood'])
        for postal_code, entry in pbndict.items()
    ],
    columns=['PostalCode', 'Borough', 'Neighbourhood'],
)

# * * * Add Longitude and Latitude * * *

In [8]:
# Preview the first rows of df, the postal code / borough / neighbourhood
# DataFrame created at the end of the first part of the assignment
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M9A,Etobicoke,Islington Avenue
1,M4H,East York,Thorncliffe Park
2,M9L,North York,Humber Summit
3,M9W,Etobicoke,Northwest
4,M5S,Downtown Toronto,"Harbord, University of Toronto"


### Read the Geospatial_Coordinates.csv file

In [46]:
# Template code for accessing a data file in the IBM Watson Studio environment
# For security I have redacted my credentials from the code below
# Purpose of this cell: read Geospatial_Coordinates.csv from IBM Cloud Object
# Storage into the DataFrame df_geo.

import sys
import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

# Stub that is bound to the response body below when it has no __iter__ of its
# own; it only needs to exist (see the hasattr check), so it just returns 0.
def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share your notebook.
client = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='redacted',
    ibm_auth_endpoint="https://iam.ng.bluemix.net/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

# Fetch the object and keep only its 'Body' entry, which is handed to pandas below
body = client.get_object(Bucket='courseacapstone-donotdelete-pr-62ipahmxygi73y',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

# Parse the CSV into a DataFrame and preview the first rows
df_geo = pd.read_csv(body)
df_geo.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### To enable merging the two DataFrames, rename the `Postal Code` column to `PostalCode` so both share the same key column

In [47]:
# Give df_geo the same key column name as df ('Postal Code' -> 'PostalCode')
df_geo = df_geo.rename(columns={'Postal Code': 'PostalCode'})
df_geo.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge the dataframes on matching PostalCode

In [48]:
# Left join: every row of df is kept and the Latitude/Longitude columns of
# df_geo are attached where the PostalCode values match
result = pd.merge(df, df_geo, on='PostalCode', how='left')
result

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
1,M4H,East York,Thorncliffe Park,43.705369,-79.349372
2,M9L,North York,Humber Summit,43.756303,-79.565963
3,M9W,Etobicoke,Northwest,43.706748,-79.594054
4,M5S,Downtown Toronto,"Harbord, University of Toronto",43.662696,-79.400049
5,M3J,North York,"Northwood Park, York University",43.767980,-79.487262
6,M2H,North York,Hillcrest Village,43.803762,-79.363452
7,M8Y,Etobicoke,"Humber Bay, King's Mill Park, Kingsway Park So...",43.636258,-79.498509
8,M6C,York,Humewood-Cedarvale,43.693781,-79.428191
9,M6L,North York,"Maple Leaf Park, North Park, Upwood Park",43.713756,-79.490074
