## Install beautifulsoup4 to scrap webpage

In [2]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


## Import requests package to get the html content of a webpage 
## Then get the html content of the canada postal codes webpage into a variable

In [None]:
import requests
html_content = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

### Parse the content using beautiful soup html parser and print it

In [2]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')
#print(soup.prettify()) # print the parsed data of html

## Print the title of the webpage

In [3]:
print(soup.title)

<title>List of postal codes of Canada: M - Wikipedia</title>


### By doing webpage inspect get the class name of the table and assign that table to a variable
### Get the header of the table using tag "th" and print the headers after removing spaces and new line character


In [4]:
canada_table = soup.find("table", attrs={"class": "wikitable sortable"})

t_headers = []
for th in canada_table.find_all("th"):
 # remove any newlines and extra spaces from left and right
 t_headers.append(th.text.replace('\n', ' ').strip())        
   
    
print(t_headers)


['Postal Code', 'Borough', 'Neighborhood']


## Get all rows from the table's body and assign that to an array

In [5]:
    table_data = []
    for tr in canada_table.tbody.find_all("tr"): # find all tr's from table's tbody
        t_row = {}
        # Each table row is stored in the form of
        # t_row = {'Postal Code': '', 'Borough': '', 'Neighborhood': ''}

        # find all td's(3) in tr and zip it with t_header
        for td, th in zip(tr.find_all("td"), t_headers): 
            t_row[th] = td.text.replace('\n', '').strip()
        table_data.append(t_row)

#Remove first record
table_data.pop(0) 
print(table_data[0])

{'Postal Code': 'M1A', 'Borough': 'Not assigned', 'Neighborhood': 'Not assigned'}


## Import pandas to create data frame

In [6]:
import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


## Define the data frame columns and instantiate the data frame

In [7]:
# define the dataframe columns
column_names = ['PostalCode', 'Borough', 'Neighborhood'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)
neighborhoods

Unnamed: 0,PostalCode,Borough,Neighborhood


## Get the values form array and assign it to the corresponding columns in the data frame

In [8]:
for data in table_data:
    Borough = data['Borough']
    PostalCode = data['Postal Code']
    Neighborhood = data['Neighborhood']
    
    
    neighborhoods = neighborhoods.append({'Borough': Borough,
                                          'PostalCode': PostalCode,
                                          'Neighborhood': Neighborhood}, ignore_index=True)
neighborhoods.shape

(180, 3)

## Ignore the cells where a borough is not assigned

In [9]:
neighborhoods.drop(neighborhoods[neighborhoods['Borough']=="Not assigned"].index,axis=0, inplace=True)
neighborhoods = neighborhoods.reset_index()
neighborhoods.shape

(103, 4)

## Combine rows into one row with the neighborhoods separated with a comma if there are more than one neighborhood for a given Postalcode and Borough

In [10]:
neighborhoods_grouped = neighborhoods.groupby(['PostalCode', 'Borough']).agg(','.join)
neighborhoods_grouped = neighborhoods_grouped.reset_index()

In [11]:
neighborhoods_grouped.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


## If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough.

In [12]:
neighborhoods_grouped.loc[neighborhoods_grouped['Neighborhood'] == 'Not assigned' , 'Neighborhood'] = neighborhoods_grouped['Borough']
neighborhoods_grouped.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


## To print the number of rows of dataframe

In [13]:
neighborhoods_grouped.shape

(103, 3)