### Scrape the following Wikipedia page: https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [1]:
# import the library we use to open URLs
import urllib.request
# specify which URL/web page we are going to be scraping
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# open the url using urllib.request and put the HTML into the page variable
page = urllib.request.urlopen(url)

In [2]:
from bs4 import BeautifulSoup

In [3]:
soup = BeautifulSoup(page, "lxml")

In [4]:
right_table=soup.find('table', class_='wikitable sortable')
right_table

<table class="wikitable sortable">
<tbody><tr>
<th>Postal Code
</th>
<th>Borough
</th>
<th>Neighborhood
</th></tr>
<tr>
<td>M1A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M2A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M3A
</td>
<td>North York
</td>
<td>Parkwoods
</td></tr>
<tr>
<td>M4A
</td>
<td>North York
</td>
<td>Victoria Village
</td></tr>
<tr>
<td>M5A
</td>
<td>Downtown Toronto
</td>
<td>Regent Park, Harbourfront
</td></tr>
<tr>
<td>M6A
</td>
<td>North York
</td>
<td>Lawrence Manor, Lawrence Heights
</td></tr>
<tr>
<td>M7A
</td>
<td>Downtown Toronto
</td>
<td>Queen's Park, Ontario Provincial Government
</td></tr>
<tr>
<td>M8A
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M9A
</td>
<td>Etobicoke
</td>
<td>Islington Avenue, Humber Valley Village
</td></tr>
<tr>
<td>M1B
</td>
<td>Scarborough
</td>
<td>Malvern, Rouge
</td></tr>
<tr>
<td>M2B
</td>
<td>Not assigned
</td>
<td>
</td></tr>
<tr>
<td>M3B
</td>
<td>North York
</td>
<td>Don Mills
</td></tr>
<tr>
<

In [5]:
A=[]
B=[]
C=[]

for row in right_table.findAll('tr'):
    cells=row.findAll('td')
    if len(cells)==3:
        A.append(cells[0].find(text=True))
        B.append(cells[1].find(text=True))
        C.append(cells[2].find(text=True))
    

In [6]:
import pandas as pd
df=pd.DataFrame(A,columns=['PostalCode'])
df['Borough']=B
df['Neighborhood']=C
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


### Ignore rows that have Boroughs Not Assigned

#### Went the route of converting 'Not assigned' to NaN in numpy. More than one way to achieve results.

In [7]:
import numpy as np
df = df.replace('Not assigned',np.nan, regex=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,,
1,M2A,,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [8]:
df.dropna(subset=["Borough"], axis=0, inplace=True)
df = df.reset_index(drop=True)
df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
PostalCode      103 non-null object
Borough         103 non-null object
Neighborhood    103 non-null object
dtypes: object(3)
memory usage: 2.5+ KB


### Cell has a borough but a Not assigned neighborhood, the neighborhood will be the same as the borough

In [15]:
df[df['Neighborhood'] == 0]
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [16]:
mask = df['Neighborhood'] == "Not assigned"
df.loc[mask, 'Neighborhood'] = df.loc[mask, 'Borough']
df


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


### Verify that there are no repeating postal codes

In [17]:
df['PostalCode'].unique()

array(['M3A\n', 'M4A\n', 'M5A\n', 'M6A\n', 'M7A\n', 'M9A\n', 'M1B\n',
       'M3B\n', 'M4B\n', 'M5B\n', 'M6B\n', 'M9B\n', 'M1C\n', 'M3C\n',
       'M4C\n', 'M5C\n', 'M6C\n', 'M9C\n', 'M1E\n', 'M4E\n', 'M5E\n',
       'M6E\n', 'M1G\n', 'M4G\n', 'M5G\n', 'M6G\n', 'M1H\n', 'M2H\n',
       'M3H\n', 'M4H\n', 'M5H\n', 'M6H\n', 'M1J\n', 'M2J\n', 'M3J\n',
       'M4J\n', 'M5J\n', 'M6J\n', 'M1K\n', 'M2K\n', 'M3K\n', 'M4K\n',
       'M5K\n', 'M6K\n', 'M1L\n', 'M2L\n', 'M3L\n', 'M4L\n', 'M5L\n',
       'M6L\n', 'M9L\n', 'M1M\n', 'M2M\n', 'M3M\n', 'M4M\n', 'M5M\n',
       'M6M\n', 'M9M\n', 'M1N\n', 'M2N\n', 'M3N\n', 'M4N\n', 'M5N\n',
       'M6N\n', 'M9N\n', 'M1P\n', 'M2P\n', 'M4P\n', 'M5P\n', 'M6P\n',
       'M9P\n', 'M1R\n', 'M2R\n', 'M4R\n', 'M5R\n', 'M6R\n', 'M7R\n',
       'M9R\n', 'M1S\n', 'M4S\n', 'M5S\n', 'M6S\n', 'M1T\n', 'M4T\n',
       'M5T\n', 'M1V\n', 'M4V\n', 'M5V\n', 'M8V\n', 'M9V\n', 'M1W\n',
       'M4W\n', 'M5W\n', 'M8W\n', 'M9W\n', 'M1X\n', 'M4X\n', 'M5X\n',
       'M8X\n', 'M4Y

### Use the .shape method to print the number of rows of your dataframe

In [20]:
print('The scrubbed dataframe has' , df.shape)

The scrubbed dataframe has (103, 3)
