# Parsing Toronto neighborhood data from Wikipedia page

#### Install BeautifulSoup and HTML Parser

In [1]:
# Optional one-time setup: uncomment the two lines below to install the HTML
# parsing packages this notebook relies on (BeautifulSoup and the lxml parser).
#!pip install Beautifulsoup4
#!pip install lxml

#### Import libraries

In [2]:
import requests # library to handle requests
from bs4 import BeautifulSoup
import pandas as pd

#### Parse HTML page

In [3]:
# Download the Wikipedia page and parse it into a BeautifulSoup tree.
# - timeout: without one, requests waits indefinitely on a stalled connection
#   and the cell never finishes.
# - raise_for_status(): stops the notebook on an HTTP error (4xx/5xx) instead
#   of silently parsing an error page as if it were the postal-code table.
response = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
response.raise_for_status()
source = response.text
soup = BeautifulSoup(source, 'lxml')

#### Convert HTML table into a dataframe

In [5]:
# Convert the first <tbody> of the parsed page into a three-column DataFrame.
table = soup.body.find('tbody')
if table is None:
    # Fail with a clear message rather than an AttributeError on None below.
    raise ValueError('No <tbody> element found in the page; its layout may have changed.')

# The first <tr> is discarded (it is treated as the table's header row).
rows = table.find_all('tr')[1:]

col_names = ['PostalCode', 'Borough', 'Neighborhood']

# Collect one record per table row and build the frame in a single call,
# instead of pre-sizing a frame and writing it cell by cell through .iat
# with hand-maintained row/column counters.
records = []
for row in rows:
    values = [cell.text.rstrip() for cell in row.find_all('td')]
    if len(values) > len(col_names):
        # More cells than columns means the table no longer has the expected
        # shape; report it explicitly instead of an opaque indexing error.
        raise ValueError(
            'Expected at most {} cells per row, got {}: {!r}'.format(
                len(col_names), len(values), values
            )
        )
    # A row with fewer cells than columns simply omits the trailing keys,
    # which pandas fills with NaN.
    records.append(dict(zip(col_names, values)))

df = pd.DataFrame(records, columns=col_names)

df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [6]:
# (rows, columns) of the scraped table, before any filtering is applied.
df.shape

(289, 3)

#### Remove rows that do not contain Borough name

In [7]:
# Keep only the rows whose Borough is something other than the
# 'Not assigned' placeholder, then show the (rows, columns) that remain.
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough]
df.shape

(212, 3)

In [8]:
# Preview the first seven rows left after the Borough filter.
df.head(7)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned


#### When the Neighborhood name is not assigned, use the Borough name instead

In [9]:
# Where Neighborhood is the 'Not assigned' placeholder, use the row's Borough.
#
# Series.where keeps each value for which the condition is True and takes the
# index-aligned value from df['Borough'] everywhere else. Assigning through
# DataFrame.assign rebinds df to a new frame, which avoids calling an in-place
# method on a column selected from a filtered frame: that pattern raises
# SettingWithCopyWarning on older pandas and leaves df unchanged once
# Copy-on-Write is enabled. The cell is also safe to re-run.
has_neighborhood = df['Neighborhood'] != 'Not assigned'
df = df.assign(Neighborhood=df['Neighborhood'].where(has_neighborhood, df['Borough']))
df.head(7)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Queen's Park


#### Save neighborhood data in CSV file

In [10]:
# Write the cleaned, not-yet-aggregated table to a CSV file in the working
# directory; index=False leaves the DataFrame's row labels out of the file.
df.to_csv("TorontoCodesRaw.csv", index=False)

#### Combine Neighborhood names that belong to one PostalCode area

In [11]:
# Collapse the table to one row per (PostalCode, Borough) pair: the
# Neighborhood values of each pair are joined into one ', '-separated string,
# and reset_index turns the group keys back into ordinary columns.
neighborhoods_by_code = df.groupby(['PostalCode', 'Borough'])['Neighborhood']
df_new = neighborhoods_by_code.agg(', '.join).reset_index()
df_new.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


#### Save aggregated neighborhood data in CSV file

In [12]:
# Write the aggregated table (one row per PostalCode/Borough pair) to a CSV
# file in the working directory; index=False leaves the row labels out.
df_new.to_csv("TorontoCodesAgg.csv", index=False)

#### Final dataframe dimensions

In [13]:
# (rows, columns) of the aggregated table.
df_new.shape

(103, 3)