## Task 1: Extract data from Wikipedia and create a Dataframe

#### 1. Import libraries

In [1]:
import requests
import bs4
import pandas as pd
from bs4 import BeautifulSoup

#### 2. Create dataframe

In [2]:
# Download the raw HTML of the Wikipedia page listing the "M" postal codes.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops the notebook on an HTTP error (4xx/5xx) instead
# of letting an error page be parsed as if it were the postal-code table.
r = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
r.raise_for_status()

In [3]:
# Parse the response body into a BeautifulSoup document tree.
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed (and emits GuessedAtParserWarning), so the
# resulting tree could differ from one environment to another.
html = BeautifulSoup(r.text, 'html.parser')

In [4]:
# Build the dataframe from the data rows of the first <table> in the document.
# A data row is a <tr> whose direct children include <td> cells; header rows
# only contain <th> cells, yield no values, and are skipped.
postcode = []
borough = []
neighborhood = []

# Fail with a clear message if the page layout is not what this cell expects,
# rather than with an AttributeError on None.
table_body = html.table.tbody if html.table is not None else None
if table_body is None:
    raise ValueError("Could not find a <table> with a <tbody> in the downloaded page")

# recursive=False limits each search to direct children, matching the original
# row/cell traversal and ignoring whitespace text nodes between tags.
for tr in table_body.find_all('tr', recursive=False):
    values = [td.text.strip() for td in tr.find_all('td', recursive=False)]
    # Rows with fewer than three cells (header, empty or malformed rows) are
    # skipped; indexing values[1] / values[2] on them would raise IndexError.
    if len(values) < 3:
        continue
    postcode.append(values[0])
    borough.append(values[1])
    neighborhood.append(values[2])

df = pd.DataFrame({
    "PostalCode": postcode,
    "Borough": borough,
    "Neighborhood": neighborhood
})

In [5]:
# Show the raw dataframe before any cleaning; it still contains rows whose
# Borough and Neighborhood are 'Not assigned'.
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


#### 3. Modify the dataframe as instructed

#### 3.1. Remove rows with a borough that is **Not assigned**

In [12]:
# Keep only the rows whose borough has actually been assigned.
df = df.loc[df['Borough'].ne('Not assigned')]

In [13]:
# Show the dataframe after the 'Not assigned' boroughs were filtered out;
# the remaining rows keep their original index labels.
df

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


#### 3.2. Check if there are any **Not assigned** neighborhoods

In [19]:
# Select any remaining rows whose neighborhood is still 'Not assigned';
# an empty result means no further clean-up is needed.
df.loc[df['Neighborhood'].eq('Not assigned')]

Unnamed: 0,PostalCode,Borough,Neighborhood


#### 3.3. Check the size of the final dataframe

In [20]:
# (number of rows, number of columns) of the cleaned dataframe
df.shape

(103, 3)

##### The final dataframe has 103 rows and 3 columns.

#### **Optional:** Export the final dataframe so that it can be used for Task 2 and 3.

In [21]:
# Write the cleaned dataframe to 'can_df.csv' in the current working directory.
# index=False leaves the row index out of the file, so the CSV holds only the
# PostalCode, Borough and Neighborhood columns.
df.to_csv('can_df.csv', index=False)