<h3> web scraping</h3>

<p>Web scraping (also known as screen scraping, data scraping, web harvesting, web data extraction and a multitude of other aliases) is a method for extracting data from web pages.</p>

In [1]:
# import the library we use to open URLs

import urllib.request

In [2]:
# specify which URL/web page we are going to be scraping

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [3]:
# open the url using urllib.request and put the HTML into the page variable

page = urllib.request.urlopen(url)

In [4]:
# import the BeautifulSoup library so we can parse HTML and XML documents

from bs4 import BeautifulSoup

In [5]:
# parse the HTML from our URL into the BeautifulSoup parse tree format

soup = BeautifulSoup(page, "lxml")

In [15]:
#print(soup.prettify())

In [16]:
soup.title

<title>List of postal codes of Canada: M - Wikipedia</title>

In [17]:
soup.title.string

'List of postal codes of Canada: M - Wikipedia'

In [18]:
# use the 'find_all' function to bring back all instances of the 'table' tag in the HTML and store in 'all_tables' variable

all_tables=soup.find_all("table")

#all_tables

In [21]:
right_table=soup.find('table', class_='wikitable sortable')

#right_table

In [22]:
A=[]

B=[]

C=[]


for row in right_table.findAll('tr'):

    cells=row.findAll('td')

    if len(cells)==3:

        A.append(cells[0].find(text=True).replace('\n', ''))

        B.append(cells[1].find(text=True).replace('\n', ''))

        C.append(cells[2].find(text=True).replace('\n', ''))


In [23]:
import pandas as pd

df=pd.DataFrame(A,columns=['PostalCode'])

df['Borough']=B

df['Neighbourhood']=C

df

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [24]:
# #Rename Postal code to PostalCode
# df.rename(columns={"Postal Code": "PostalCode"}, inplace = True )

Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [25]:
#Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
dropNotAssigned = df[df.Borough != 'Not assigned'].reset_index(drop=True)

More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11  in the above table.

In [26]:
grouped = dropNotAssigned.groupby(['PostalCode','Borough'], as_index=False).agg(lambda x: ','.join(x))

If a cell has a borough but a Not assigned  neighborhood, then the neighborhood will be the same as the borough.

In [27]:
mask = grouped['Neighbourhood'] == "Not assigned"
grouped.loc[mask, 'Neighbourhood'] = grouped.loc[mask, 'Borough']


In [28]:
grouped.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [30]:
coordenades = pd.read_csv('Geospatial_Coordinates.csv')
coordenades

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [34]:
coordenades.rename(index=str, columns={"Postal Code": "PostalCode"}, inplace = True)
grouped.rename(index=str, columns={"Neighbourhood": "Neighborhood"}, inplace = True)
neighborhood = pd.merge(grouped, coordenades, on='PostalCode')
neighborhood.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [35]:
neighborhood.shape

(103, 5)

In [33]:
import geocoder # import geocoder

# initialize your variable to None
lat_lng_coords = None

# loop until you get the coordinates
while(lat_lng_coords is None):
  g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
  lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

ModuleNotFoundError: No module named 'geocoder'