# Web Scraping to Obtain Data on Toronto Neighborhoods

### First Method: Using read_html function in pandas

In [1]:
#uncomment if lxml is not installed. you may need to restart the kernel if lxml is still not found
#!pip install lxml

In [2]:
import pandas as pd
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Parse the tables of the HTML page: the first row of each table becomes the header
# and the 'Not assigned' placeholder used on the web page is read as NaN
tables = pd.read_html(url, header=0, na_values=['Not assigned'])
# There are two tables on the page; only the first one is needed
df_toronto = tables[0]
df_toronto.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [3]:
# Number of rows and columns before cleaning; rows whose Borough is NaN are still included
df_toronto.shape

(180, 3)

In [4]:
# Count the missing (NaN) values in each column of the dataframe
df_toronto.isna().sum()

Postal code      0
Borough         77
Neighborhood    77
dtype: int64

In [5]:
# Drop every row whose Borough is missing.
# inplace=True modifies df_toronto itself rather than returning a new dataframe
df_toronto.dropna(axis=0, subset=['Borough'], inplace=True)

In [6]:
# Re-count the missing values per column; every count should now be zero
df_toronto.isna().sum()

Postal code     0
Borough         0
Neighborhood    0
dtype: int64

In [7]:
# Count the distinct postal codes. The assignment asks for rows sharing a postal
# code to be combined, but it turns out Wikipedia has already done that in recent updates.
# dropna=False counts a missing value as a distinct entry, like len(...unique()) does
df_toronto['Postal code'].nunique(dropna=False)

103

In [8]:
# Replace ' /' in the Neighborhood column with ',' so that 'A / B' becomes 'A, B'.
# regex=False makes this a plain substring replacement, no regular expression needed
df_toronto['Neighborhood'] = df_toronto.Neighborhood.str.replace(' /',',', regex=False)

In [9]:
# Renumber the rows from 0 after the drop.
# drop=True discards the old index instead of keeping it as a new column
df_toronto.reset_index(drop = True, inplace=True)

In [10]:
# Preview the first rows of the cleaned dataframe
df_toronto.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [11]:
# Shape of the cleaned dataset.
# 77 rows were dropped (one for each missing value in Borough), leaving 180 - 77 = 103 rows
print('The shape of the dataset is:',df_toronto.shape)

The shape of the dataset is: (103, 3)


In [12]:
# Export the cleaned table to a CSV file; index=False leaves the row index out of the file
df_toronto.to_csv('toronto_neigh.csv', index=False)

### Alternatively: Using BeautifulSoup

In [13]:
#Uncomment if requests not installed
#!pip install requests

In [14]:
import requests
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Without a timeout requests.get can wait forever on an unresponsive server
r = requests.get(url, timeout=30)
# Raise an HTTPError on a 4xx/5xx response instead of silently parsing an error page later
r.raise_for_status()

In [15]:
# Show the first 500 characters of the downloaded HTML as a quick look at the response
print(r.text[:500])


<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>List of postal codes of Canada: M - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"XptJpgpAICsAAFpdq2oAA


In [16]:
#uncomment if Beautifulsoup not installed
#!pip install beautifulsoup4

In [17]:
from bs4 import BeautifulSoup
# Parse the downloaded HTML text using the 'html.parser' parser
soup = BeautifulSoup(r.text, 'html.parser')

In [18]:
# Collect every <td> cell in the whole document, not only the cells of the postal-code table
results = soup.find_all('td')

In [19]:
# Total number of <td> cells found in the document
len(results)

573

In [20]:
# Not all cells in results are needed: keep the first 540,
# i.e. 180 table rows x 3 cells each (postal code, borough, neighborhood).
# NOTE(review): the remaining cells presumably belong to other tables on the page - re-check if the page layout changes
results = results[:540]

In [21]:
# trial code: first cell (the postal code)
# the cell text ends with a newline character, so [0:-1] drops that last character
# remove [0:-1] to see the raw text including the newline
results[0].text[0:-1]

'M1A'

In [22]:
# trial code: second cell (the borough of the first row)
results[1].text[0:-1]

'Not assigned'

In [23]:
# trial code: third cell (the neighborhood of the first row), which is empty here
results[2].text[0:-1]

''

In [24]:
def _cell_text(cell):
    """Return the text of a table cell with surrounding whitespace removed.

    strip() is used instead of slicing off the last character, so a cell that
    does not end with a newline keeps all of its characters.
    """
    return cell.text.strip()


def _cell_value(cell):
    """Return the cleaned cell text, or None for an empty or 'Not assigned' cell."""
    text = _cell_text(cell)
    return None if text in ('', 'Not assigned') else text


# Each table row is three consecutive cells: postal code, borough, neighborhood.
# zip() over the three strided slices stops at the last complete row, so a cell
# count that is not a multiple of 3 cannot raise an IndexError.
records = [
    (_cell_text(postal_cell), _cell_value(borough_cell), _cell_value(neighbor_cell))
    for postal_cell, borough_cell, neighbor_cell
    in zip(results[0::3], results[1::3], results[2::3])
]

In [25]:
# Inspect the parsed (postal code, borough, neighborhood) tuples
records

[('M1A', None, None),
 ('M2A', None, None),
 ('M3A', 'North York', 'Parkwoods'),
 ('M4A', 'North York', 'Victoria Village'),
 ('M5A', 'Downtown Toronto', 'Regent Park / Harbourfront'),
 ('M6A', 'North York', 'Lawrence Manor / Lawrence Heights'),
 ('M7A', 'Downtown Toronto', "Queen's Park / Ontario Provincial Government"),
 ('M8A', None, None),
 ('M9A', 'Etobicoke', 'Islington Avenue'),
 ('M1B', 'Scarborough', 'Malvern / Rouge'),
 ('M2B', None, None),
 ('M3B', 'North York', 'Don Mills'),
 ('M4B', 'East York', 'Parkview Hill / Woodbine Gardens'),
 ('M5B', 'Downtown Toronto', 'Garden District, Ryerson'),
 ('M6B', 'North York', 'Glencairn'),
 ('M7B', None, None),
 ('M8B', None, None),
 ('M9B',
  'Etobicoke',
  'West Deane Park / Princess Gardens / Martin Grove / Islington / Cloverdale'),
 ('M1C', 'Scarborough', 'Rouge Hill / Port Union / Highland Creek'),
 ('M2C', None, None),
 ('M3C', 'North York', 'Don Mills'),
 ('M4C', 'East York', 'Woodbine Heights'),
 ('M5C', 'Downtown Toronto', 'St. 

In [26]:
# 540 cells / 3 cells per row = 180 records
len(records)

180

In [27]:
import pandas as pd
# Build the dataframe from the parsed tuples, one column per tuple position
column_names = ['Postal code', 'Borough', 'Neighborhood']
df_toronto = pd.DataFrame(records, columns=column_names)
df_toronto.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Regent Park / Harbourfront


In [28]:
# Preview the last rows of the dataframe as well
df_toronto.tail()

Unnamed: 0,Postal code,Borough,Neighborhood
175,M5Z,,
176,M6Z,,
177,M7Z,,
178,M8Z,Etobicoke,Mimico NW / The Queensway West / South of Bloo...
179,M9Z,,


In [29]:
# Number of rows and columns before cleaning; rows whose Borough is missing are still included
df_toronto.shape

(180, 3)

In [30]:
# Count the missing values in each column of the dataframe
df_toronto.isna().sum()

Postal code      0
Borough         77
Neighborhood    77
dtype: int64

In [31]:
# Drop every row whose Borough is missing.
# inplace=True modifies df_toronto itself rather than returning a new dataframe
df_toronto.dropna(axis=0, subset=['Borough'], inplace=True)

In [32]:
# Re-count the missing values per column; every count should now be zero
df_toronto.isna().sum()

Postal code     0
Borough         0
Neighborhood    0
dtype: int64

In [33]:
# Replace ' /' in the Neighborhood column with ',' so that 'A / B' becomes 'A, B'.
# regex=False makes this a plain substring replacement, no regular expression needed
df_toronto['Neighborhood'] = df_toronto.Neighborhood.str.replace(' /',',', regex=False)

In [34]:
# Renumber the rows from 0 after the drop.
# drop=True discards the old index instead of keeping it as a new column
df_toronto.reset_index(drop = True, inplace=True)

In [35]:
# Preview the first rows of the cleaned dataframe
df_toronto.head()

Unnamed: 0,Postal code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [36]:
# Shape of the cleaned dataset.
# 77 rows were dropped (one for each missing value in Borough), leaving 180 - 77 = 103 rows
print('The shape of the dataset is:',df_toronto.shape)

The shape of the dataset is: (103, 3)


In [37]:
# Export the cleaned table to a CSV file; index=False leaves the row index out of the file.
# This is the same file name used by the read_html export earlier in the notebook, so that file is overwritten
df_toronto.to_csv('toronto_neigh.csv', index=False)