In [1]:
# Code of Last assignment

# Start by installing necessary libraries
!conda install --yes beautifulsoup4 #Installing beautifulsoup
!conda install --yes lxml #Installing Parser
!conda install --yes requests #Installing Requests


# Importing
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np # library to handle data in a vectorized manner

# Getting the HTML file
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source,'lxml')

# Parsing Column Names
column_names = []
n_rows=0

rows = soup.find('table', class_='wikitable').find_all('tr')

for row in rows:
    n_rows+=1
    th_tags = row.find_all('th')
    for th in th_tags:
        text = th.get_text()
        column_names.append(text.rstrip("\n\r"))

# Creating Data Frame

df = pd.DataFrame(columns = column_names,index= range(0,n_rows))

row_index = 0
for row in rows:
    column_index = 0
    columns = row.find_all('td')
    for column in columns:
        text = column.get_text()
        df.iat[row_index,column_index] = text.rstrip("\n\r") # Cleaning \n
        column_index += 1
    row_index += 1
# Cleaning Data Frame from NaN's and Not Assigned

df.drop([0], axis=0, inplace=True) # Droping first NaN line

df_valuable = df[(df['Borough'] != 'Not assigned')] #Removing Not Assigned Borough

df_clean = df_valuable.where(df_valuable != 'Not assigned',df['Borough'], axis =0) #Copying Neighbourhood

# Not possible to use Geocoder, so I'm loading the csv file

df_coordinates = pd.read_csv('http://cocl.us/Geospatial_data')

dic_coordinates = df_coordinates.set_index('Postal Code').to_dict() #Creating a dictionary for coordinates

# This For-loop checks the latitude and longitude in the dictionary and passes it to a new dataframe

df_latitude_and_longitude = df_clean

for index, row in df_latitude_and_longitude.iterrows():
    df_latitude_and_longitude.at[index,'Latitude'] = dic_coordinates["Latitude"][row['Postcode']]
    df_latitude_and_longitude.at[index,'Longitude'] = dic_coordinates["Longitude"][row['Postcode']]
df_latitude_and_longitude.head()

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/DSX-Python35

  added / updated specs: 
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    beautifulsoup4-4.6.3       |           py35_0         140 KB

The following packages will be UPDATED:

    beautifulsoup4: 4.6.0-py35h442a8c9_1 --> 4.6.3-py35_0


Downloading and Extracting Packages
beautifulsoup4-4.6.3 | 140 KB    | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/DSX-Python35

  added / updated specs: 
    - requests


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    requests-2.19.1            |           py35_0 

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
3,M3A,North York,Parkwoods,43.753259,-79.329656
4,M4A,North York,Victoria Village,43.725882,-79.315572
5,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
6,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
7,M6A,North York,Lawrence Heights,43.718518,-79.464763


Installing and Importing necessaring libraries

In [2]:
!conda install -c conda-forge folium=0.5.0 --yes #

import numpy as np # library to handle data in a vectorized manner
import folium # map rendering library

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/DSX-Python35

  added / updated specs: 
    - folium=0.5.0


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    folium-0.5.0               |             py_0          45 KB  conda-forge
    certifi-2018.8.24          |        py35_1001         139 KB  conda-forge
    openssl-1.0.2r             |       h14c3975_0         3.1 MB  conda-forge
    branca-0.3.1               |             py_0          25 KB  conda-forge
    ca-certificates-2019.3.9   |       hecc5488_0         146 KB  conda-forge
    altair-2.2.2               |           py35_1         462 KB  conda-forge
    vincent-0.4.4              |             py_1          28 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         4.0 MB

The following NEW packages will

In [3]:
# create map of Toronto using latitude and longitude values
map_newyork = folium.Map(location=[43.70011, -79.4163], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_latitude_and_longitude['Latitude'], df_latitude_and_longitude['Longitude'], df_latitude_and_longitude['Borough'], df_latitude_and_longitude['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

Clustering the Neighborhoods based on their Proximity

In [10]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 5

df_latitude_and_longitude_clustering = df_latitude_and_longitude.drop(['Postcode','Neighbourhood', 'Borough'], 1) #Droping sting columns

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_latitude_and_longitude_clustering)

#check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([2, 2, 4, 4, 0, 0, 4, 1, 2, 2], dtype=int32)

Creating a new dataframe with the clusters

In [11]:
# add clustering labels
df_latitude_and_longitude.insert(0, 'Cluster Labels', kmeans.labels_)

df_latitude_and_longitude.head() # check the last columns!

Unnamed: 0,Cluster Labels,Postcode,Borough,Neighbourhood,Latitude,Longitude
3,2,M3A,North York,Parkwoods,43.753259,-79.329656
4,2,M4A,North York,Victoria Village,43.725882,-79.315572
5,4,M5A,Downtown Toronto,Harbourfront,43.65426,-79.360636
6,4,M5A,Downtown Toronto,Regent Park,43.65426,-79.360636
7,0,M6A,North York,Lawrence Heights,43.718518,-79.464763


Visualizing Clusters

In [12]:
# create map
map_clusters = folium.Map(location=[43.70011, -79.4163], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_latitude_and_longitude['Latitude'], df_latitude_and_longitude['Longitude'], df_latitude_and_longitude['Neighbourhood'], df_latitude_and_longitude['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

NameError: name 'cm' is not defined