In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd

## 1. Use BeautifulSoup to transform the data in the table on the Wikipedia page into pandas dataframe

Use BeautifulSoup retrive data from the web

In [2]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml')
table = soup.find('table')

In [3]:
data=[]
head = [items.text.strip() for items in table.find_all('th')]

for rows in table.find_all('tr'):
    cols = [items.text.strip() for items in rows.find_all('td')]
    data.append(cols)
    
data[0]=head
data[0:5]

[['Postcode', 'Borough', 'Neighbourhood'],
 ['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village']]

In [4]:
# create the dataframe with retrived data
df=pd.DataFrame(data[1:], columns=data[0])
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [5]:
df.shape

(288, 3)

In [6]:
pc_canada = df.copy()
# remove rows with a borough that is Not assigned.
pc_canada=pc_canada[pc_canada['Borough']!='Not assigned']
pc_canada.sort_values(by = ['Postcode','Borough']).head()               

Unnamed: 0,Postcode,Borough,Neighbourhood
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern
27,M1C,Scarborough,Highland Creek
28,M1C,Scarborough,Rouge Hill
29,M1C,Scarborough,Port Union


In [7]:
pc_canada.shape

(211, 3)

In [8]:
# Combine rows with same Postcode and Borough
pc_canada_g = pc_canada.groupby(['Postcode','Borough'])['Neighbourhood'].apply(lambda x: ','.join(x)).reset_index()

# If a cell has a borough but a Not assigned neighborhood, replace with the borough
pc_canada_g['Neighbourhood'] = np.where(pc_canada_g['Neighbourhood'] == 'Not assigned', pc_canada_g['Borough'], pc_canada_g['Neighbourhood'])

pc_canada_g.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [9]:
pc_canada_g.shape

(103, 3)

In [10]:
#!conda install -c conda-forge geocoder
import json # library to handle JSON files
from geopy import geocoders
#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Collecting package metadata: done
Solving environment: / 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/linux-64::anaconda==5.3.1=py37_0
  - defaults/linux-64::astropy==3.0.4=py37h14c3975_0
  - defaults/linux-64::bkcharts==0.2=py37_0
  - defaults/linux-64::blaze==0.11.3=py37_0
  - defaults/linux-64::bokeh==0.13.0=py37_0
  - defaults/linux-64::bottleneck==1.2.1=py37h035aef0_1
  - defaults/linux-64::dask==0.19.1=py37_0
  - defaults/linux-64::datashape==0.5.4=py37_1
  - defaults/linux-64::mkl-service==1.1.2=py37h90e4bf4_5
  - defaults/linux-64::numba==0.39.0=py37h04863e7_0
  - defaults/linux-64::numexpr==2.6.8=py37hd89afb7_0
  - defaults/linux-64::odo==0.5.1=py37_0
  - defaults/linux-64::pytables==3.4.4=py37ha205bf6_0
  - defaults/linux-64::pytest-arraydiff==0.2=py37h39e3cac_0
  - defaults/linux-64::pytest-astropy==0.4.0=py37_0
  - defaults/linux-64::pytest-doctestplus==0.1.3=py37_0
  - defaults

## 2. The dataframe of the postal code of each neighborhood along with the borough name,  neighborhood name, latitude and the longitude coordinates

In [11]:
location = pd.read_csv('http://cocl.us/Geospatial_data')
location.rename(columns={'Postal Code': "Postcode"}, inplace=True)
location.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
toronto = pc_canada_g.merge(location, how='left', on ='Postcode')
toronto.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [17]:
# get latitude, longitude of Toronto
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto City are 43.653963, -79.387207.


## 3. Explore and cluster the neighborhoods in Toronto

In [19]:
# explore interested area
neighborhood_latitude = toronto.loc[66, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = toronto.loc[66, 'Longitude'] # neighborhood longitude value

neighborhood_name = toronto.loc[66, 'Neighbourhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Harbord,University of Toronto are 43.6626956, -79.4000493.


In [20]:
CLIENT_ID = '4EGBIM5Y5ENSBXZ11VSSLSJUFD3KWSYKAJ30KAC0XO3BZEAT' # your Foursquare ID
CLIENT_SECRET = 'NHI5XOICEHGV0LAVHIXMJIQMG0HVZYGNZKYVZPIYM2G1JBEP' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius

# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=4EGBIM5Y5ENSBXZ11VSSLSJUFD3KWSYKAJ30KAC0XO3BZEAT&client_secret=NHI5XOICEHGV0LAVHIXMJIQMG0HVZYGNZKYVZPIYM2G1JBEP&v=20180605&ll=43.6626956,-79.4000493&radius=500&limit=100'

In [21]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Retrieve data from foursqure for the neibourhood with only boroughs that contain the word Toronto
analysis will only focus on boroughs that contain the word Toronto 

In [22]:
##### retrieve infor from foursqure #####
toronto_t=toronto[toronto['Borough'].str.contains('Toronto')]
print("toronto borough with 'Toronto' data shap", toronto_t.shape)
toronto_venues = getNearbyVenues(names=toronto_t['Neighbourhood'],
                                   latitudes=toronto_t['Latitude'],
                                   longitudes=toronto_t['Longitude']
                                  )
print('toronto_venues data shape', toronto_venues.shape)
toronto_venues.head()

toronto borough with 'Toronto' data shap (38, 5)
toronto_venues data shape (1700, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
1,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
2,The Beaches,43.676357,-79.293031,St-Denis Studios Inc.,43.675031,-79.288022,Music Venue
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"The Danforth West,Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


In [23]:
toronto_venues.groupby('Neighborhood').count().head()


Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide,King,Richmond",100,100,100,100,100,100
Berczy Park,57,57,57,57,57,57
"Brockton,Exhibition Place,Parkdale Village",19,19,19,19,19,19
Business Reply Mail Processing Centre 969 Eastern,17,17,17,17,17,17
"CN Tower,Bathurst Quay,Island airport,Harbourfront West,King and Spadina,Railway Lands,South Niagara",15,15,15,15,15,15


In [28]:
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))
print('There are {} Neighborboods '. format(toronto_venues.groupby('Neighborhood').count().shape[0]))

There are 236 uniques categories.
There are 38 Neighborboods 


##  Below, the venues data will be anylyzed with clustering

In [25]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

toronto_onehot.drop(['Neighborhood'], axis=1, inplace=True)
# add neighborhood column back to dataframe
toronto_Neighborhood= toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
toronto_onehot.insert(0, 'Neighborhood', toronto_Neighborhood)
#toronto_onehot['Neighborhood'].index
print("data shape", toronto_onehot.shape)
toronto_onehot.head()

data shape (1700, 236)


Unnamed: 0,Neighborhood,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"The Danforth West,Riverdale",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
toronto_grouped=toronto_onehot.groupby('Neighborhood').sum().reset_index()
print("data shape", toronto_grouped.shape)
toronto_grouped.head()

data shape (38, 236)


Unnamed: 0,Neighborhood,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Yoga Studio
0,"Adelaide,King,Richmond",0,0,0,0,0,0,0,0,4,...,0,0,0,0,1,0,0,1,0,0
1,Berczy Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,"Brockton,Exhibition Place,Parkdale Village",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Business Reply Mail Processing Centre 969 Eastern,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,"CN Tower,Bathurst Quay,Island airport,Harbourf...",0,0,1,1,1,2,3,2,0,...,0,0,0,0,0,0,0,0,0,0


#### Run k-means to cluster the neighborhood into 5 clusters.

In [29]:
k = 5
neighborhoods_venues_sorted = pd.DataFrame(toronto_grouped['Neighborhood'])
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=k, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([3, 1, 4, 4, 4, 1, 3, 2, 4, 1, 3, 1, 4, 4, 3, 4, 3, 4, 1, 3, 1, 4,
       4, 1, 4, 4, 4, 4, 4, 1, 0, 0, 0, 1, 1, 4, 4, 1], dtype=int32)

In [30]:
# add clustering labels
#neighborhoods_venues_sorted.drop(['Cluster Labels'], axis=1, inplace=True)
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_t
toronto_merged.rename(columns={'Neighbourhood': 'Neighborhood'}, inplace=True)

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood').reset_index(drop=True)

toronto_merged.head() # check the last columns!

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,4
1,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,1
2,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572,4
3,M4M,East Toronto,Studio District,43.659526,-79.340923,1
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,4


In [31]:
toronto_merged['Cluster Labels'].value_counts()

4    17
1    11
3     6
0     3
2     1
Name: Cluster Labels, dtype: int64

In [32]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(k)
ys = [i + x + (i*x)**2 for i in range(k)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [None]:
pwd