### Importing necessary modules 
#### Using BeautifulSoup for web scraping

In [42]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# URL of the Wikipedia page to scrape (list of Canadian postal codes starting with "M")
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# Download the page. The timeout stops the cell from hanging forever on a stalled
# connection, and raise_for_status() turns an HTTP error response (4xx/5xx) into an
# exception instead of letting an error page be parsed as if it were the data.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.text

# Parse the downloaded markup with the lxml parser
soup = BeautifulSoup(html, "lxml")

#### Extracting column names 

In [57]:
# Locate the table carrying the "wikitable sortable" class and gather every <tr> of its body
hood = soup.find("table", attrs={"class": "wikitable sortable"})
hood_data = hood.find("tbody").find_all("tr")

# The first <tr> is the header row: each <th> holds one column title.
# Newlines inside a cell become spaces, then surrounding whitespace is trimmed.
header_row = hood_data[0]
columns = [
    header_cell.get_text().replace('\n', ' ').strip()
    for header_cell in header_row.find_all("th")
]

print(columns)

['Postal Code', 'Borough', 'Neighborhood']


#### Extracting row details

In [93]:
# One list of cleaned cell strings per table row, skipping the header row at position 0.
# Each <td> text has its newlines turned into spaces and its outer whitespace trimmed.
rows = [
    [cell.get_text().replace('\n', ' ').strip() for cell in table_row.find_all("td")]
    for table_row in hood_data[1:]
]

### Creating the DataFrame

In [95]:
# Build the table from the scraped rows, labelling columns with the scraped header names
df = pd.DataFrame(data=rows, columns=columns)

# Preview the first five rows
df.head(5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [107]:
# Count how many rows each 'Borough' value has; the 'Not assigned' entry in the
# result is the number of rows without a borough.
df['Borough'].value_counts()

Not assigned        77
North York          24
Downtown Toronto    19
Scarborough         17
Etobicoke           12
Central Toronto      9
West Toronto         6
East York            5
York                 5
East Toronto         5
Mississauga          1
Name: Borough, dtype: int64

#### Removing all rows whose 'Borough' value is 'Not assigned'

In [112]:
# Keep only the rows whose 'Borough' is something other than 'Not assigned'.
# A single boolean mask replaces the per-row loop, which performed one chained
# lookup and one in-place drop for every row. The surviving rows keep their
# original index labels, exactly as the loop left them.
# .copy() makes the result an independent frame rather than a slice of the old one.
df = df.loc[df['Borough'] != 'Not assigned'].copy()

In [113]:
# Show (row count, column count) of the dataframe after the 'Not assigned' boroughs were removed
df.shape

(103, 3)

#### Filling any unassigned 'Neighborhood' values with the corresponding 'Borough'

In [119]:
# Give every row without a neighbourhood the name of its borough.
# A neighbourhood counts as unassigned when the cell is empty or holds the
# placeholder text. Both 'Not Assigned' and 'Not assigned' are accepted because
# the placeholder appears as 'Not assigned' in the 'Borough' column of this table.
unassigned = df['Neighborhood'].isin(['Not Assigned', 'Not assigned', ''])

# Write through a single .loc call: the chained form df['Neighborhood'][index] = ...
# assigns into an intermediate object, which raises SettingWithCopyWarning and
# leaves df unchanged when pandas Copy-on-Write is active.
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']

In [128]:
# Renumber the rows 0..n-1 after the modifications made above; drop=True discards
# the old index labels instead of keeping them as a new column.
df.reset_index(drop=True, inplace=True)

# Preview the first five rows of the cleaned table
df.head(5)

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [129]:
# Show the final (row count, column count) of the cleaned dataframe
df.shape

(103, 3)