# Neighborhood Data from Wikipedia

##### Import Python Libraries

In [1]:
from bs4 import BeautifulSoup

import urllib.request as urb
import requests
import pandas as pd

##### Using BeautifulSoup for Scrapping the Data

In [3]:
link = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
html = urb.urlopen(link)
soup = BeautifulSoup(html, "lxml")

html_table = soup.select_one(".wikitable")

##### Converting Html table to Pandas Dataframe

In [231]:
raw_table = pd.read_html(html_table.prettify() ,skiprows=1)
rdf = pd.DataFrame(raw_table[0])
rdf.head()

Unnamed: 0,M1A,Not assigned,Not assigned.1
0,M2A,Not assigned,Not assigned
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,Harbourfront
4,M5A,Downtown Toronto,Regent Park


Giving Columns Name

In [66]:
rdf.columns = ['Postal Code','Borough','Neighborhood']
rdf.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M2A,Not assigned,Not assigned
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,Harbourfront
4,M5A,Downtown Toronto,Regent Park


##### Drop Not Assigned Values

In [72]:
drop_indexes = list(rdf[rdf['Borough'] == 'Not assigned'].index)

In [77]:
df = rdf.drop(drop_indexes, axis = 0)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
1,M3A,North York,Parkwoods
2,M4A,North York,Victoria Village
3,M5A,Downtown Toronto,Harbourfront
4,M5A,Downtown Toronto,Regent Park
5,M6A,North York,Lawrence Heights


In [142]:
df = df.reset_index()
df.drop('index',axis = 1,inplace = True)

In [146]:
df.head()

Unnamed: 0,level_0,Postal Code,Borough,Neighborhood
0,0,M3A,North York,Parkwoods
1,1,M4A,North York,Victoria Village
2,2,M5A,Downtown Toronto,Harbourfront
3,3,M5A,Downtown Toronto,Regent Park
4,4,M6A,North York,Lawrence Heights


In [185]:
grp = df.groupby('Postal Code')

In [222]:
grp.get_group('M5A')['Borough'].values[1]

'Downtown Toronto'

##### Grouping the Neighborhood according to Postal Code

In [190]:
Postal_Code = df['Postal Code'].unique()

In [225]:
Neighbourhood = []
Borough = []
for i in Postal_Code:
    Neighbourhood.append(", ".join(grp.get_group(i)['Neighborhood'].values))
    Borough.append(grp.get_group(i)['Borough'].values[0])

In [229]:
Neighborhood_Data = pd.DataFrame({
    'Postal Code':Postal_Code,
    'Borough':Borough,
    'Neighborhood':Neighbourhood
})

In [230]:
Neighborhood_Data

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront, Regent Park"
3,M6A,North York,"Lawrence Heights, Lawrence Manor"
4,M7A,Queen's Park,Not assigned
5,M9A,Etobicoke,Islington Avenue
6,M1B,Scarborough,"Rouge, Malvern"
7,M3B,North York,Don Mills North
8,M4B,East York,"Woodbine Gardens, Parkview Hill"
9,M5B,Downtown Toronto,"Ryerson, Garden District"


In [233]:
Neighborhood_Data.shape

(103, 3)