In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Download the Wikipedia page and locate the postal-code table.
# A timeout keeps the cell from hanging forever if the server never answers.
r = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
# Stop on HTTP errors (4xx/5xx) instead of silently parsing an error page.
r.raise_for_status()
html_doc = r.content.decode('utf-8')
soup = BeautifulSoup(html_doc, 'html.parser')
table_tag = soup.find('table', {'class': 'wikitable sortable'})
# soup.find returns None when nothing matches; fail here with a clear message
# rather than with an AttributeError in a later cell.
if table_tag is None:
    raise ValueError("No table with class 'wikitable sortable' found on the page")

table_tag

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>

In [3]:
# Column names of the final dataframe (fixed here, not read from the page)
columns = ['PostalCode', 'Borough', 'Neighborhood']

# Collect one cleaned [postal code, borough, neighborhood] list per table row
rows = []
for tr in table_tag.find_all('tr'):
    cells = [td.text.strip() for td in tr.find_all('td')]
    # Rows without exactly one <td> per column (e.g. the <th> header row) are skipped
    if len(cells) != len(columns):
        continue
    postal_code, borough, neighborhood = cells
    # Drop postal codes that have no borough assigned
    if borough == 'Not assigned':
        continue
    # An unassigned neighborhood falls back to the borough name
    if neighborhood == 'Not assigned':
        neighborhood = borough
    rows.append([postal_code, borough, neighborhood])

# Requirements

* The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood.
* Only process the cells that have an assigned borough. Ignore cells with a borough that is **Not assigned**.
* More than one neighborhood can exist in one postal code area. These rows will be combined into one row with the neighborhoods separated with a comma.
* If a cell has a borough but a **Not assigned** neighborhood, then the neighborhood will be the same as the borough.
* Clean your Notebook and add Markdown cells to explain your work and any assumptions you are making.
* In the last cell of your notebook, use the `.shape` attribute to print the number of rows of your dataframe.

In [4]:
# Build the dataframe, then merge rows that share a postal code and borough
# into a single row whose neighborhoods are joined with ", ".
df = pd.DataFrame(rows, columns=columns)
# dict.fromkeys drops duplicate neighborhoods while keeping first-seen order.
# A set() would also de-duplicate, but its iteration order for strings changes
# between kernel sessions (hash randomization), making the output irreproducible.
df = (
    df.groupby(['PostalCode', 'Borough'])
    .agg(lambda x: ', '.join(dict.fromkeys(x)))
    .reset_index()
)
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, East Birchmount Park, Ionview"
7,M1L,Scarborough,"Clairlea, Oakridge, Golden Mile"
8,M1M,Scarborough,"Scarborough Village West, Cliffside, Cliffcrest"
9,M1N,Scarborough,"Cliffside West, Birch Cliff"


In [5]:
# Dimensions of the final dataframe as a (rows, columns) tuple
df.shape

(103, 3)