# Scraping Zip codes data from Wikipedia

**This worksheet scrapes Canadian postal code data from Wikipedia.**

In [1]:
! pip install lxml 

Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/dd/ba/a0e6866057fc0bbd17192925c1d63a3b85cf522965de9bc02364d08e5b84/lxml-4.5.0-cp36-cp36m-manylinux1_x86_64.whl (5.8MB)
[K     |████████████████████████████████| 5.8MB 11.2MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.5.0


In [41]:
import requests
import lxml.html as lh
import pandas as pd
import numpy as np

## Scrape the table rows from the website

In [54]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Fetch the page; the timeout keeps the cell from hanging forever on a stalled connection
page = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page
page.raise_for_status()
# Parse the raw response body into an lxml element tree
doc = lh.fromstring(page.content)
# Collect every element stored between <tr>..</tr> in the document
tr_elements = doc.xpath('//tr')

In [4]:
# Peek at the text content of the <tr> at index 1 to see what one row looks like
T = tr_elements[1].text_content()
T

'\nM1A\nNot assigned\nNot assigned\n'

In [5]:
# Size of each of the first 12 <tr> elements
[len(row) for row in tr_elements[:12]]

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

## Parse table header

In [55]:
tr_elements = doc.xpath('//tr')
# One (header name, empty value list) pair per column, in table order
col = []
# Walk the cells of the first row, which holds the column headers
for position, header_cell in enumerate(tr_elements[0], start=1):
    title = header_cell.text_content().strip('\n')
    print(position, title)
    col.append((title, []))

1 Postcode
2 Borough
3 Neighbourhood


## Create Pandas Data Frame

In [56]:
# Start from empty columns so that re-running this cell does not append duplicate rows
for _, values in col:
    values.clear()

# The first row is the header, so the data is stored from the second row onwards
for row in tr_elements[1:]:
    # If a row is not of size 3, the //tr data is no longer from our table: stop
    if len(row) != 3:
        break

    # Text of each cell in this row, with the surrounding newlines removed
    cells = [cell.text_content().strip('\n') for cell in row.iterchildren()]

    # Ignore rows whose borough (second cell) is 'Not assigned'.
    # Reading the cell itself avoids depending on fixed character offsets in the row text.
    if cells[1].strip() == 'Not assigned':
        continue

    # i is the index of the column this cell belongs to
    for i, data in enumerate(cells):
        # For every column after the first, store purely numeric text as an integer
        if i > 0:
            try:
                data = int(data)
            except ValueError:
                # Not a number: keep the text unchanged
                pass
        # Append the value to the list of the i'th column
        col[i][1].append(data)

In [8]:
# Number of values collected for each column
[len(values) for _, values in col]

[210, 210, 210]

In [57]:
# Map each column title to its list of values, then build the frame from that mapping
Dict = dict(col)
df = pd.DataFrame(Dict)

In [10]:
# Preview the first rows (up to 20) of the scraped table
df.head(20)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M6A,North York,Lawrence Heights
4,M6A,North York,Lawrence Manor
5,M7A,Downtown Toronto,Queen's Park
6,M9A,Etobicoke,Islington Avenue
7,M1B,Scarborough,Rouge
8,M1B,Scarborough,Malvern
9,M3B,North York,Don Mills North


In [11]:
# (rows, columns) of the scraped table
df.shape

(210, 3)

## Concatenate the Neighbourhood column, grouped by Postcode and Borough

In [58]:
# Collapse rows that share a Postcode/Borough pair into a single row,
# joining their neighbourhood names with ', '
df = (
    df.groupby(['Postcode', 'Borough'])['Neighbourhood']
    .agg(', '.join)
    .reset_index()
)
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv..."
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ..."


In [13]:
# (rows, columns) after grouping by Postcode and Borough
df.shape

(103, 3)

## Add geographical coordinates to the table

In [14]:
!pip install geocoder

Collecting geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K     |████████████████████████████████| 102kB 16.6MB/s ta 0:00:01
[?25hCollecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Collecting future (from geocoder)
[?25l  Downloading https://files.pythonhosted.org/packages/45/0b/38b06fd9b92dc2b68d58b75f900e97884c45bedd2ff83203d933cf5851c9/future-0.18.2.tar.gz (829kB)
[K     |████████████████████████████████| 829kB 28.4MB/s eta 0:00:01
Collecting click (from geocoder)
[?25l  Downloading https://files.pythonhosted.org/packages/fa/37/45185cb5abbc30d7257104c434fe0b07e5a195a6847506c074527aa599ec/Click-7.0-py2.py3-none-any.whl (81kB)
[K     |████████████████████████████████| 81kB 16.1MB/s eta 0:00:01
Building wheels 

In [59]:
import time  # only needed for the pause between geocoding retries below

import geocoder

# Upper bound on attempts per postcode, so a persistent failure cannot loop forever
MAX_GEOCODE_ATTEMPTS = 5

# Coordinates are collected in the same order as the rows of df
latitude = []
longitude = []
zipcode = df['Postcode']

for code in zipcode:
    lat_lng_coords = None
    # A lookup can come back without coordinates, so retry a few times before giving up
    for attempt in range(MAX_GEOCODE_ATTEMPTS):
        g = geocoder.arcgis('{}, Toronto, Ontario'.format(code))
        lat_lng_coords = g.latlng
        if lat_lng_coords:
            break
        # Brief pause before asking the service again
        time.sleep(1)
    if not lat_lng_coords:
        # Fail with a clear message instead of a TypeError from indexing an empty result
        raise RuntimeError(
            'No coordinates returned for postcode {} after {} attempts'.format(
                code, MAX_GEOCODE_ATTEMPTS))
    latitude.append(lat_lng_coords[0])
    longitude.append(lat_lng_coords[1])

In [60]:
# Attach the geocoded coordinates as two new columns, then display the result
df = df.assign(latitude=latitude, longitude=longitude)
df

Unnamed: 0,Postcode,Borough,Neighbourhood,latitude,longitude
0,M1B,Scarborough,"Rouge, Malvern",43.811525,-79.195517
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.785665,-79.158725
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.765815,-79.175193
3,M1G,Scarborough,Woburn,43.768369,-79.217590
4,M1H,Scarborough,Cedarbrae,43.769688,-79.239440
...,...,...,...,...,...
98,M9N,York,Weston,43.704845,-79.517546
99,M9P,Etobicoke,Westmount,43.696505,-79.530252
100,M9R,Etobicoke,"Kingsview Village, Martin Grove Gardens, Richv...",43.686810,-79.557284
101,M9V,Etobicoke,"Albion Gardens, Beaumond Heights, Humbergate, ...",43.743145,-79.584664


## Cluster the Toronto Boroughs and visualization

Cluster the boroughs by their geographical coordinates.

In [69]:
#!conda install -c conda-forge folium=0.4.0 --yes 
#conda update -n base -c defaults conda
!conda install -c conda-forge geopy --yes # installs geopy, which provides the Nominatim import below
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
# import k-means from clustering stage
from sklearn.cluster import KMeans
import folium # map rendering library
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

Solving environment: done


  current version: 4.5.11
  latest version: 4.8.2

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    scikit-learn-0.20.1        |   py36h22eb022_0         5.7 MB
    liblapack-3.8.0            |      11_openblas          10 KB  conda-forge
    numpy-1.18.1               |   py36h95a1406_0         5.2 MB  conda-forge
    liblapacke-3.8.0           |      11_openblas          10 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    libopenblas-0.3.6          |       h5a2b251_2         7.7 MB
    scipy-1.4.1                |   py36h921218d_0        

In [70]:
# Geocode the city itself; the resulting coordinates are used as the map location below
g = geocoder.arcgis('Toronto, Ontario')
lat_lng_coords = g.latlng
lat_lng_coords

[43.648690000000045, -79.38543999999996]

Create a map of Toronto with boroughs superimposed on top.

In [71]:
# create map of Toronto using latitude and longitude values
# (the variable name is kept unchanged so existing references to it keep working)
map_newyork = folium.Map(location=lat_lng_coords, zoom_start=10)
geolocator = Nominatim(user_agent="ny_explorer")

# add one circle marker per row of df, with the borough name as its popup
for lat, lng, borough in zip(df['latitude'], df['longitude'], df['Borough']):
    # parse_html is an option of Popup, so it is set here and not on the marker
    label = folium.Popup(str(borough), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_newyork)

map_newyork

Now cluster the boroughs using the k-means method

In [81]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

# Drop the text column and move Postcode/Borough into the index, so that only
# latitude and longitude remain as features for clustering.
# `columns=` replaces the positional axis argument, which newer pandas no longer accepts.
df_cluster = df.drop(columns='Neighbourhood')
df_cluster = df_cluster.set_index(['Postcode','Borough'])


# run k-means clustering (random_state is fixed so the labels are reproducible)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_cluster)

# check cluster labels generated for the first 10 rows in the dataframe
kmeans.labels_[0:10]

array([1, 1, 1, 1, 1, 1, 3, 3, 1, 3], dtype=int32)

Add the cluster labels and the Neighbourhood column to the Toronto data

In [82]:
df_cluster['Cluster Labels'] = kmeans.labels_
# df_cluster is indexed by (Postcode, Borough) while df has a plain integer index, so assigning
# the Series directly would align on the index and match no rows. df_cluster was derived from df
# without reordering, so copy the values positionally instead.
df_cluster['Neighbourhood'] = df['Neighbourhood'].values

Visualize the borough clusters on a map

In [83]:
# create map
map_clusters = folium.Map(location=lat_lng_coords, zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow colour per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(df_cluster['latitude'], df_cluster['longitude'],df_cluster['Neighbourhood'], df_cluster['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels start at 0, so index the palette directly instead of
    # relying on negative-index wraparound for cluster 0
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters