## Applied Data Science Capstone Peer-graded Assignment week #3
### Segmenting and Clustering Neighborhoods in Toronto
#### Part One
Transform the data in the table on the Wikipedia page into a pandas DataFrame.

In [1]:
#Install lxml to process HTML with Python
pip install lxml

Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/ec/be/5ab8abdd8663c0386ec2dd595a5bc0e23330a0549b8a91e32f38c20845b6/lxml-4.4.1-cp36-cp36m-manylinux1_x86_64.whl (5.8MB)
[K     |████████████████████████████████| 5.8MB 20.8MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.4.1
Note: you may need to restart the kernel to use updated packages.


In [1]:
import requests
import pandas as pd
import numpy as np
import lxml.html as lh

In [4]:
#Wiki data page url
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

#Request the wikipage; the timeout stops the cell from hanging forever on a stalled connection
wikipage = requests.get(url, timeout=30)

#Fail loudly on an HTTP error status instead of silently parsing an error page
wikipage.raise_for_status()

#Parse the raw response bytes into an lxml HTML tree stored under doc
doc = lh.fromstring(wikipage.content)

In [7]:
 #Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr')

#The first <tr> is the header row, so the data rows start at index 1.
#Rows belonging to other tables do not have exactly 3 cells; reading stops at the first such row.
column_names = ['Postcode', 'Borough', 'Neighbourhood']

#Collect one record (list of cell texts) per table row
records = []
for table_row in tr_elements[1:]:
    #A row that is not of size 3 means the //tr data is no longer from our table
    if len(table_row) != 3:
        break

    #Take the text of every cell in the row, with newlines removed
    records.append(
        [cell.text_content().replace('\n', '') for cell in table_row.iterchildren()]
    )

#Create the pandas dataframe from the collected records
df = pd.DataFrame(records, columns=column_names)

print(df)
df.shape

    Postcode           Borough          Neighbourhood
0        M1A      Not assigned           Not assigned
1        M2A      Not assigned           Not assigned
2        M3A        North York              Parkwoods
3        M4A        North York       Victoria Village
4        M5A  Downtown Toronto           Harbourfront
..       ...               ...                    ...
283      M8Z         Etobicoke              Mimico NW
284      M8Z         Etobicoke     The Queensway West
285      M8Z         Etobicoke  Royal York South West
286      M8Z         Etobicoke         South of Bloor
287      M9Z      Not assigned           Not assigned

[288 rows x 3 columns]


(288, 3)

In [8]:
#Keep only the rows whose Borough is assigned, then renumber the index from 0
has_borough = df['Borough'].ne('Not assigned')
df = df.loc[has_borough].reset_index(drop=True)
print(df)

    Postcode           Borough             Neighbourhood
0        M3A        North York                 Parkwoods
1        M4A        North York          Victoria Village
2        M5A  Downtown Toronto              Harbourfront
3        M5A  Downtown Toronto               Regent Park
4        M6A        North York          Lawrence Heights
..       ...               ...                       ...
206      M8Z         Etobicoke  Kingsway Park South West
207      M8Z         Etobicoke                 Mimico NW
208      M8Z         Etobicoke        The Queensway West
209      M8Z         Etobicoke     Royal York South West
210      M8Z         Etobicoke            South of Bloor

[211 rows x 3 columns]


In [9]:
#Where the Neighbourhood is not assigned, copy the Borough of the same row into it
no_neighbourhood = df['Neighbourhood'].eq('Not assigned')
df.loc[no_neighbourhood, 'Neighbourhood'] = df.loc[no_neighbourhood, 'Borough']
print(df)

    Postcode           Borough             Neighbourhood
0        M3A        North York                 Parkwoods
1        M4A        North York          Victoria Village
2        M5A  Downtown Toronto              Harbourfront
3        M5A  Downtown Toronto               Regent Park
4        M6A        North York          Lawrence Heights
..       ...               ...                       ...
206      M8Z         Etobicoke  Kingsway Park South West
207      M8Z         Etobicoke                 Mimico NW
208      M8Z         Etobicoke        The Queensway West
209      M8Z         Etobicoke     Royal York South West
210      M8Z         Etobicoke            South of Bloor

[211 rows x 3 columns]


In [10]:
#Collapse rows sharing a Postcode and Borough into one row,
#joining their Neighbourhood values into a single comma-separated string
neighbor_df = (
    df.groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)
print(neighbor_df)

    Postcode      Borough                                      Neighbourhood
0        M1B  Scarborough                                     Rouge, Malvern
1        M1C  Scarborough             Highland Creek, Rouge Hill, Port Union
2        M1E  Scarborough                  Guildwood, Morningside, West Hill
3        M1G  Scarborough                                             Woburn
4        M1H  Scarborough                                          Cedarbrae
..       ...          ...                                                ...
98       M9N         York                                             Weston
99       M9P    Etobicoke                                          Westmount
100      M9R    Etobicoke  Kingsview Village, Martin Grove Gardens, Richv...
101      M9V    Etobicoke  Albion Gardens, Beaumond Heights, Humbergate, ...
102      M9W    Etobicoke                                          Northwest

[103 rows x 3 columns]
