<h3>Web scraping</h3>
<p>Web scraping (also known as screen scraping, data scraping, web harvesting, web data extraction and a multitude of other aliases) is a method for extracting data from web pages.</p>

In [2]:
# import the library we use to open URLs

import urllib.request

In [3]:
# specify which URL/web page we are going to be scraping:
# the Wikipedia list of Canadian postal codes beginning with "M"

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [4]:
# Open the URL with urllib.request and read the raw HTML bytes into the `page` variable.
# The `with` block closes the HTTP connection as soon as the body has been read,
# so re-running this cell does not leave open connections behind.

with urllib.request.urlopen(url) as response:
    page = response.read()

In [5]:
# import the BeautifulSoup library so we can parse HTML and XML documents
#!pip install BeautifulSoup4
from bs4 import BeautifulSoup

In [6]:
# Build the BeautifulSoup parse tree from the downloaded page, using the html5lib parser
#!pip3 install html5lib
soup = BeautifulSoup(page, features='html5lib')

In [7]:
#print(soup.prettify())

In [8]:
# Display the parsed document's <title> element
soup.title

<title>List of postal codes of Canada: M - Wikipedia</title>

In [9]:
# Display only the text inside the <title> element
soup.title.string

'List of postal codes of Canada: M - Wikipedia'

In [10]:
# Collect every <table> element of the document in 'all_tables'

all_tables = soup.find_all(name="table")

In [11]:
# Locate the postal-code table: the first <table> matching class 'wikitable sortable'.
right_table = soup.find('table', class_='wikitable sortable')

# soup.find returns None when nothing matches; fail here with a clear message
# instead of an AttributeError when the rows are read from `right_table` later on.
if right_table is None:
    raise ValueError("No <table> with class 'wikitable sortable' was found in the page")

In [12]:
# Scrape the table row by row into three parallel lists.
A = []  # postal codes   (first <td> of each row)
B = []  # boroughs       (second <td>)
C = []  # neighbourhoods (third <td>)

for row in right_table.find_all('tr'):
    cells = row.find_all('td')

    # Rows that do not hold exactly three <td> cells are skipped.
    if len(cells) != 3:
        continue

    # get_text() gathers all text inside the cell, including text nested in child
    # tags such as links, and never returns None; find(text=True) only returned the
    # first text node. Newlines are removed as before, and outer whitespace is
    # trimmed so that later comparisons against 'Not assigned' are exact.
    postal_code, borough, neighbourhood = (
        cell.get_text().replace('\n', '').strip() for cell in cells
    )
    A.append(postal_code)
    B.append(borough)
    C.append(neighbourhood)


In [13]:
import pandas as pd

# Assemble the three scraped lists into one table, one column per list.
df = pd.DataFrame({
    'PostalCode': A,
    'Borough': B,
    'Neighbourhood': C,
})

df

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [14]:
# #Rename Postal code to PostalCode
# df.rename(columns={"Postal Code": "PostalCode"}, inplace = True )

<p>Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.</p>

In [33]:
# Keep only the rows whose borough has been assigned, then renumber the rows from 0.
has_borough = df['Borough'].ne('Not assigned')
dropNotAssigned = df.loc[has_borough].reset_index(drop=True)
dropNotAssigned

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


<p>More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above table.</p>

In [16]:
# Merge rows that share a postal code and borough; the string values of the
# remaining columns are concatenated with commas.
grouped = (
    dropNotAssigned
    .groupby(['PostalCode', 'Borough'], as_index=False)
    .agg(','.join)
)

<h3>If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.</h3>

In [17]:
# Wherever the neighbourhood is the placeholder "Not assigned", fall back to the borough name.
mask = grouped['Neighbourhood'].eq("Not assigned")
grouped['Neighbourhood'] = grouped['Neighbourhood'].mask(mask, grouped['Borough'])


In [18]:
# Preview the first 10 rows of the grouped table
grouped.head(10)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [20]:
# Download the geospatial data (postal code, latitude, longitude) as a .csv file.
# -q silences wget's progress output; -O sets the name of the local file.
!wget -q -O 'Geospatial_data.csv' http://cocl.us/Geospatial_data

In [22]:
# Read the downloaded CSV into a DataFrame and display it.
coordenades = pd.read_csv(filepath_or_buffer='Geospatial_data.csv')
coordenades

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [23]:
# Give the key column the same name as in `grouped` (row labels are converted to str as well),
# then join the coordinates onto the grouped table by postal code.
# merge() uses its default inner join, so only postal codes present in both tables are kept.
coordenades = coordenades.rename(index=str, columns={"Postal Code": "PostalCode"})
neighborhood = grouped.merge(coordenades, on='PostalCode')
neighborhood.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [24]:
# (rows, columns) of the merged table
neighborhood.shape

(103, 5)

In [25]:
# Report the number of distinct boroughs and the number of rows in the merged table.
# Note: the second figure is the row count of `neighborhood`, which was grouped
# by postal code and borough.
borough_count = len(neighborhood['Borough'].unique())
row_count = len(neighborhood)
print('The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, row_count))

The dataframe has 10 boroughs and 103 neighborhoods.


In [28]:
#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
# Look up the latitude/longitude of Toronto with the Nominatim geocoder;
# these coordinates are stored in `latitude` and `longitude`.
address = 'Toronto, canada'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; stop with a clear
# message rather than an AttributeError on `location.latitude`.
if location is None:
    raise ValueError('Nominatim returned no result for address {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [31]:
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# Add one circle marker per row of the `neighborhood` DataFrame.
# The loop variables deliberately use names different from the DataFrame:
# reusing `neighborhood` as a loop variable would replace the DataFrame with a
# plain string after the first pass, so this cell could not be run a second time.
for lat, lng, borough_name, neighbourhood_name in zip(
        neighborhood['Latitude'],
        neighborhood['Longitude'],
        neighborhood['Borough'],
        neighborhood['Neighbourhood']):
    label = '{}, {}'.format(neighbourhood_name, borough_name)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

map_toronto