## Notebook to scrape Toronto neighbourhoods
This notebook builds a dataframe holding the postal code, borough name and neighbourhood name(s) for each Toronto postal code.   
The data comes from: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M   

The page is downloaded with `requests` and its HTML is parsed with BeautifulSoup.

### Import libraries

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

### Read URL and find the table

In [2]:
website_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page. The timeout stops the cell from hanging indefinitely on a
# stalled connection, and raise_for_status() turns an HTTP error (404, 5xx, ...)
# into an exception here instead of letting an error page be parsed below.
response = requests.get(website_url, timeout=30)
response.raise_for_status()
tmp = response.text  # raw HTML of the page (name kept for any cell that reads `tmp`)

soup = BeautifulSoup(tmp, 'lxml')

# Locate the postal-code table by its CSS classes.
my_table = soup.find('table', {'class': 'wikitable sortable'})
if my_table is None:
    # soup.find() returns None when nothing matches; fail here with a clear
    # message rather than with an AttributeError in a later cell.
    raise ValueError(
        "No <table class='wikitable sortable'> found at " + website_url
        + "; the page layout may have changed."
    )

### Save table content to dataframe

In [3]:
# Column names come from the table's header cells (<th>): take the first text
# node of each one and trim the surrounding whitespace (including the trailing
# newline).
headers = my_table.find_all('th')
columns = [header.find(string=True).strip() for header in headers]

# Empty dataframe that only carries the column layout; displayed to check the names.
df = pd.DataFrame(columns=columns)
df

Unnamed: 0,Postcode,Borough,Neighbourhood


#### Save all the row content to the dataframe (dropping rows with borough = 'Not assigned'; a 'Not assigned' neighbourhood falls back to the borough name)

In [4]:
# Collect one record per table row, then build the dataframe in a single step.
# Why not df.append() in the loop:
#   * DataFrame.append was removed in pandas 2.0;
#   * it copies the whole frame on every row (quadratic work);
#   * appending to the existing `df` made this cell non-idempotent -- running it
#     twice stored every row twice. Rebuilding `df` from `records` avoids that.
records = []
for row in my_table.find_all('tr'):
    cells = row.find_all('td')
    if not cells:
        # Rows without <td> cells (e.g. the header row) carry no data.
        continue

    # First text node of each of the first three cells, whitespace trimmed.
    postcode = cells[0].find(string=True).strip()
    borough = cells[1].find(string=True).strip()
    neighbourhood = cells[2].find(string=True).strip()

    # Postal codes with no borough are dropped entirely.
    if borough == 'Not assigned':
        continue

    # A missing neighbourhood name falls back to the borough name.
    if neighbourhood == 'Not assigned':
        neighbourhood = borough

    records.append({'Postcode': postcode,
                    'Borough': borough,
                    'Neighbourhood': neighbourhood})

# Explicit column list keeps the column order fixed even when `records` is empty.
df = pd.DataFrame(records, columns=['Postcode', 'Borough', 'Neighbourhood'])

print(df.shape)
df.head()

(211, 3)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


#### Group rows by Postcode and Borough

In [5]:
# Collapse the rows to one per (Postcode, Borough) pair: the neighbourhood names
# that share a pair are concatenated into a single comma-separated string.
group_keys = ['Postcode', 'Borough']
joined_neighbourhoods = df.groupby(group_keys)['Neighbourhood'].apply(', '.join)

# reset_index() turns the group keys back into ordinary columns.
df_2 = joined_neighbourhoods.reset_index()
df_2.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [6]:
# Size of the grouped frame, printed as a (rows, columns) tuple.
row_count, column_count = df_2.shape
print((row_count, column_count))

(103, 3)
