# Scrape wiki website

In [1]:
!pip install BeautifulSoup4

Collecting BeautifulSoup4
[?25l  Downloading https://files.pythonhosted.org/packages/e8/b5/7bb03a696f2c9b7af792a8f51b82974e51c268f15e925fc834876a4efa0b/beautifulsoup4-4.9.0-py3-none-any.whl (109kB)
[K     |████████████████████████████████| 112kB 5.1MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2 (from BeautifulSoup4)
  Downloading https://files.pythonhosted.org/packages/05/cf/ea245e52f55823f19992447b008bcbb7f78efc5960d77f6c34b5b45b36dd/soupsieve-2.0-py2.py3-none-any.whl
Installing collected packages: soupsieve, BeautifulSoup4
Successfully installed BeautifulSoup4-4.9.0 soupsieve-2.0


In [2]:
import requests
import pandas as pd
import numpy as np # library to handle data in a vectorized manner
from pandas.io.json import json_normalize
from bs4 import BeautifulSoup

# Get URL

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
results = requests.get(url).text
re = BeautifulSoup(results, 'html.parser')
table = re.find('table')
table

<table class="wikitable sortable">
<tbody><tr>
<th>Postal code
</th>
<th>Borough
</th>
<th>Neighborhood
</th></tr>
<tr>
<td>M1A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M2A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M3A
</td>
<td>North York
</td>
<td>Parkwoods
</td></tr>
<tr>
<td>M4A
</td>
<td>North York
</td>
<td>Victoria Village
</td></tr>
<tr>
<td>M5A
</td>
<td>Downtown Toronto
</td>
<td>Regent Park / Harbourfront
</td></tr>
<tr>
<td>M6A
</td>
<td>North York
</td>
<td>Lawrence Manor / Lawrence Heights
</td></tr>
<tr>
<td>M7A
</td>
<td>Downtown Toronto
</td>
<td>Queen's Park / Ontario Provincial Government
</td></tr>
<tr>
<td>M8A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M9A
</td>
<td>Etobicoke
</td>
<td>Islington Avenue
</td></tr>
<tr>
<td>M1B
</td>
<td>Scarborough
</td>
<td>Malvern / Rouge
</td></tr>
<tr>
<td>M2B
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M3B
</td>
<td>North York
</td>
<td>Don Mills
</td></tr>
<tr>
<td>M4B
</td>
<td>Ea

# Create empty dataframe

In [4]:
column_names = ['PostalCode', 'Borough', 'Neighborhood']
frame = pd.DataFrame(columns = column_names)

# Find info under 'td' and filter out not assigned

In [6]:
for tr in table.findAll("tr"):
    row = []
    for td in tr.findAll('td'):
        row.append(td.text.strip())

    if len(row) == 3 and row[2] != '':
            frame.loc[len(frame)] = row
#print (rows)

frame.isnull().sum() #check for bad rows
frame.Neighborhood = frame.Neighborhood.replace('Not assigned', frame.Borough)  

In [11]:
final_frame =frame.groupby(['PostalCode','Borough'])['Neighborhood'].agg(','.join).reset_index()
final_frame

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern / Rouge,Malvern / Rouge"
1,M1C,Scarborough,"Rouge Hill / Port Union / Highland Creek,Rouge..."
2,M1E,Scarborough,"Guildwood / Morningside / West Hill,Guildwood ..."
3,M1G,Scarborough,"Woburn,Woburn"
4,M1H,Scarborough,"Cedarbrae,Cedarbrae"
...,...,...,...
98,M9N,York,"Weston,Weston"
99,M9P,Etobicoke,"Westmount,Westmount"
100,M9R,Etobicoke,Kingsview Village / St. Phillips / Martin Grov...
101,M9V,Etobicoke,South Steeles / Silverstone / Humbergate / Jam...


In [13]:
final_frame.shape

(103, 3)

In [16]:
!pip install geopy

Collecting geopy
[?25l  Downloading https://files.pythonhosted.org/packages/53/fc/3d1b47e8e82ea12c25203929efb1b964918a77067a874b2c7631e2ec35ec/geopy-1.21.0-py2.py3-none-any.whl (104kB)
[K     |████████████████████████████████| 112kB 26.5MB/s eta 0:00:01
[?25hCollecting geographiclib<2,>=1.49 (from geopy)
  Downloading https://files.pythonhosted.org/packages/8b/62/26ec95a98ba64299163199e95ad1b0e34ad3f4e176e221c40245f211e425/geographiclib-1.50-py3-none-any.whl
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.50 geopy-1.21.0


In [19]:
from  geopy.geocoders import Nominatim

In [21]:
file = pd.read_csv('http://cocl.us/Geospatial_data')
file.head(10)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [27]:
file = file.rename(columns = {'Postal Code':'PostalCode'})


In [30]:
final_frame.set_index('PostalCode').join(file.set_index('PostalCode')).reset_index()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern / Rouge,Malvern / Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill / Port Union / Highland Creek,Rouge...",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood / Morningside / West Hill,Guildwood ...",43.763573,-79.188711
3,M1G,Scarborough,"Woburn,Woburn",43.770992,-79.216917
4,M1H,Scarborough,"Cedarbrae,Cedarbrae",43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,"Weston,Weston",43.706876,-79.518188
99,M9P,Etobicoke,"Westmount,Westmount",43.696319,-79.532242
100,M9R,Etobicoke,Kingsview Village / St. Phillips / Martin Grov...,43.688905,-79.554724
101,M9V,Etobicoke,South Steeles / Silverstone / Humbergate / Jam...,43.739416,-79.588437
