# Toronto Neighborhoods - Part 1

### Installing required libraries and packages for web scraping

In [1]:
# Install (or upgrade to the latest release of) the packages used for scraping.
# The versions are not pinned, so a re-run may pick up newer releases.
# beautifulsoup4 provides the `bs4` module imported in the next cell.
!pip install --upgrade beautifulsoup4
# lxml is the parser name passed to BeautifulSoup in the scraping cell.
!pip install --upgrade lxml
# html5lib is an alternative parser; it is not referenced by name in the visible cells.
!pip install --upgrade html5lib
# requests is used to download the Wikipedia page.
!pip install --upgrade requests

Requirement already up-to-date: beautifulsoup4 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages
Requirement not upgraded as not directly required: soupsieve>=1.2 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from beautifulsoup4)
Requirement already up-to-date: lxml in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages
Requirement already up-to-date: html5lib in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages
Requirement not upgraded as not directly required: six>=1.9 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from html5lib)
Requirement not upgraded as not directly required: webencodings in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from html5lib)
Requirement already up-to-date: requests in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages
Requirement not upgraded as not directly required: chardet<3.1.0,>=3.0.2 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from requests)
Requirement not upgra

#### Importing required packages

In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

### Web scraping from Wikipedia

In [3]:
# Download the Wikipedia page listing the Canadian postal codes starting with "M".
# A timeout stops the cell from hanging forever, and raise_for_status() turns an
# HTTP error page (404, 5xx, ...) into an exception instead of parsing it as data.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(WIKI_URL, timeout=30)
response.raise_for_status()
source = response.text

# Parse the HTML with BeautifulSoup using the lxml parser.
soup = BeautifulSoup(source, 'lxml')

# Locate the postal-code table on the page.
Toronto_table = soup.find('table', {'class': 'wikitable sortable'})
if Toronto_table is None:
    # Fail with a clear message if the page layout changed, rather than with an
    # AttributeError on None further down.
    raise ValueError("No table with class 'wikitable sortable' found at " + WIKI_URL)

# Collect the table as a list of rows, each row being a list of cell texts.
# Cells are read one by one (header <th> and data <td>) so that an empty cell
# stays in its column as '' instead of being dropped and shifting the columns.
PostalCodeList = []
for table_row in Toronto_table.find_all('tr'):
    cell_texts = [cell.get_text().strip() for cell in table_row.find_all(['th', 'td'])]
    if cell_texts:  # ignore rows that contain no cells at all
        PostalCodeList.append(cell_texts)
PostalCodeList

[['Postcode', 'Borough', 'Neighbourhood'],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Harbourfront'],
 ['M5A', 'Downtown Toronto', 'Regent Park'],
 ['M6A', 'North York', 'Lawrence Heights'],
 ['M6A', 'North York', 'Lawrence Manor'],
 ['M7A', "Queen's Park", 'Not assigned'],
 ['M8A', 'Not assigned', 'Not assigned'],
 ['M9A', 'Etobicoke', 'Islington Avenue'],
 ['M1B', 'Scarborough', 'Rouge'],
 ['M1B', 'Scarborough', 'Malvern'],
 ['M2B', 'Not assigned', 'Not assigned'],
 ['M3B', 'North York', 'Don Mills North'],
 ['M4B', 'East York', 'Woodbine Gardens'],
 ['M4B', 'East York', 'Parkview Hill'],
 ['M5B', 'Downtown Toronto', 'Ryerson'],
 ['M5B', 'Downtown Toronto', 'Garden District'],
 ['M6B', 'North York', 'Glencairn'],
 ['M7B', 'Not assigned', 'Not assigned'],
 ['M8B', 'Not assigned', 'Not assigned'],
 ['M9B', 'Etobicoke', 'Cloverdale'],
 [

### Data Wrangling

In [4]:
#Converting the list into data frame
# The first scraped row holds the column names; every later row is a record.
data_rows = PostalCodeList[1:]
column_names = PostalCodeList[0]
PostalCodeDf = pd.DataFrame(data=data_rows, columns=column_names)

# Show (rows, columns) of the raw table.
PostalCodeDf.shape

(289, 3)

In [5]:
# Keep only the rows whose Borough is assigned.
# .copy() makes FormattedDf an independent frame, so the assignment below
# modifies it reliably and does not raise SettingWithCopyWarning.
FormattedDf = PostalCodeDf[PostalCodeDf['Borough'] != 'Not assigned'].copy()

# Where the Neighbourhood is 'Not assigned', use the Borough name instead.
# A boolean mask with .loc writes into the frame itself; the previous
# `FormattedDf.iloc[i]['Neighbourhood'] = ...` form is chained assignment, which
# can write to a temporary row object and leave the frame unchanged.
neighbourhood_missing = FormattedDf['Neighbourhood'] == 'Not assigned'
FormattedDf.loc[neighbourhood_missing, 'Neighbourhood'] = FormattedDf.loc[neighbourhood_missing, 'Borough']

# Renumber the rows 0..n-1 and discard the old index.
FormattedDf = FormattedDf.reset_index(drop=True)
FormattedDf.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


##### Getting the shape after formatting

In [6]:
# Show (rows, columns) of FormattedDf, the frame produced by the data wrangling cell.
FormattedDf.shape

(212, 3)

### Grouping the Neighbourhoods using Postal Code

In [7]:
# Join all neighbourhood names that share a postal code into one comma-separated
# string. Aggregating the 'Neighbourhood' column directly avoids passing a lambda
# that indexes the whole group, which depends on a fallback in older pandas and
# fails when pandas hands each column to the function as a Series.
# groupby sorts by 'Postcode' by default, so the result is ordered by postal code.
GroupedDf = (
    FormattedDf.groupby('Postcode')['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)

# One row per distinct (Postcode, Borough) pair, in order of first appearance.
PostcodeBoroughDf = FormattedDf[['Postcode', 'Borough']].drop_duplicates()

# Attach the Borough to every grouped postal code. The left merge keeps the
# postal-code order of GroupedDf and yields a fresh 0..n-1 index, so no suffix
# columns, de-duplication, or index reset are needed afterwards.
FinalDf = pd.merge(GroupedDf, PostcodeBoroughDf, on='Postcode', how='left')

# Put the columns in their final order.
FinalDf = FinalDf[['Postcode', 'Borough', 'Neighbourhood']]
FinalDf.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Shape of the Final Dataframe

In [8]:
# Show (rows, columns) of FinalDf, the table produced by the grouping cell.
FinalDf.shape

(103, 3)

In [9]:
# Write the final table to 'Toronto_Part1.csv' (relative path, so it lands in the
# kernel's current working directory).
# NOTE(review): to_csv writes the row index as an extra unnamed first column by
# default; pass index=False if downstream readers should not see that column.
FinalDf.to_csv('Toronto_Part1.csv')