# Segmenting and Clustering Neighborhoods in Toronto - 1


In [2]:
import pandas as pd
import numpy as np
import bs4 as bs
import requests

### Importing data through wiki and scrapping it through Beautiful soup and finding the table

In [3]:

url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
request = requests.get(url)
bsoup = bs.BeautifulSoup(request.content,'lxml') 
table = bsoup.find_all('table')[0]
df = pd.read_html(str(table))
data = pd.read_json(df[0].to_json(orient='records'))

In [4]:
data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Postal Code   180 non-null    object
 1   Borough       180 non-null    object
 2   Neighborhood  180 non-null    object
dtypes: object(3)
memory usage: 4.3+ KB


In [7]:
data.isnull().sum()

Postal Code     0
Borough         0
Neighborhood    0
dtype: int64

##### Choosing only data where field Borough doesn't have not assigned value

In [19]:

new_data = data[data['Borough'] != 'Not assigned' ]
new_data

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


##### Grouping data

In [20]:

new_data = new_data.groupby(['Borough', 'Postal Code'], as_index=False).agg(','.join)
new_data

Unnamed: 0,Borough,Postal Code,Neighborhood
0,Central Toronto,M4N,Lawrence Park
1,Central Toronto,M4P,Davisville North
2,Central Toronto,M4R,"North Toronto West, Lawrence Park"
3,Central Toronto,M4S,Davisville
4,Central Toronto,M4T,"Moore Park, Summerhill East"
...,...,...,...
98,York,M6C,Humewood-Cedarvale
99,York,M6E,Caledonia-Fairbanks
100,York,M6M,"Del Ray, Mount Dennis, Keelsdale and Silverthorn"
101,York,M6N,"Runnymede, The Junction North"


In [26]:
raw_data = new_data[new_data['Neighborhood']== 'Not assigned']
raw_data.head()

Unnamed: 0,Borough,Postal Code,Neighborhood


#### There are no NA values in Neighborhood but in case there is we can run the code by assigning borough value to neighborhood

In [29]:
new_data['Neighborhood'] = np.where(new_data['Neighborhood'] == 'Not assigned',
                                     new_data['Borough'],new_data['Neighborhood'])

In [31]:
new_data.shape

(103, 3)

# In order to utilize the Foursquare location data, we need to get the latitude and the longitude coordinates of each neighborhood. 

In [32]:
geospatial_url = "http://cocl.us/Geospatial_data"
geo_df = pd.read_csv(geospatial_url)
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [33]:
Merged_data = pd.merge(new_data, geo_df, on = 'Postal Code')
Merged_data.head()

Unnamed: 0,Borough,Postal Code,Neighborhood,Latitude,Longitude
0,Central Toronto,M4N,Lawrence Park,43.72802,-79.38879
1,Central Toronto,M4P,Davisville North,43.712751,-79.390197
2,Central Toronto,M4R,"North Toronto West, Lawrence Park",43.715383,-79.405678
3,Central Toronto,M4S,Davisville,43.704324,-79.38879
4,Central Toronto,M4T,"Moore Park, Summerhill East",43.689574,-79.38316


# Locating all the Neighborhoods in Toronto

#### importing folium for generating maps

In [35]:
!conda install -c conda-forge folium=0.5.0 --yes
import folium

print('Folium installed and imported!')

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Folium installed and imported!


In [40]:

!conda install -c conda-forge geopy --yes


Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: d:\Users\MAHE\anaconda3

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.22.0               |     pyh9f0ad1d_0          63 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          97 KB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.50-py_0
  geopy              conda-forge/noarch::geopy-1.22.0-pyh9f0ad1d_0



Downloading and Extracting Packages

geographiclib-1.50   | 34 KB     |            |   0% 
geographiclib-1.50   | 34 KB     | ####7      |  47% 
geographiclib-1.50   

###  convert an address into latitude and longitude values

In [41]:
from geopy.geocoders import Nominatim 

In [44]:
# use geopy to obtain latitude/longitude of Toronto.
# coordinates will be used for map visualization
address = 'Toronto, Canada'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of the City of Toronto are {}, {}.'.format(latitude, longitude))



  """


The geograpical coordinate of the City of Toronto are 43.6534817, -79.3839347.


In [49]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(Merged_data['Latitude'], Merged_data['Longitude'], Merged_data['Borough'], Merged_data['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3199cc',
        fill_opacity=0.3,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

# Let's explore Top 5 venues in each postcode within 500 meters radius with Foursquare

## Foursquare credentials

In [50]:
CLIENT_ID = 'LHR2NCA41XWDN34VMGDBBWKE3RE3HNH1JQ2DLJN0PKXDFO4H' # your Foursquare ID
CLIENT_SECRET = 'IN0OKUHEYZYDSTSCBNAEOK0L5ET3LDGOGVXBIOSW0BME3H24' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)



Your credentails:
CLIENT_ID: LHR2NCA41XWDN34VMGDBBWKE3RE3HNH1JQ2DLJN0PKXDFO4H
CLIENT_SECRET:IN0OKUHEYZYDSTSCBNAEOK0L5ET3LDGOGVXBIOSW0BME3H24


In [57]:
from urllib.request import urlopen

In [58]:
# define limit = 5 (limit to 5 venues only) & radius = 500 (meters)
LIMIT = 5
radius = 500

location_list = [] # initiate a list to store data from Foursquare API requests

for neighborhood, latitude, longitude in zip(Merged_data.Neighborhood, Merged_data.Latitude, Merged_data.Longitude):
    
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    latitude, 
    longitude, 
    radius, 
    LIMIT)
    
    data = requests.get(url).json()
    
    # use len() to check if any data within "items" (len = 0 if nothing)
    length = len(data['response']['groups'][0]['items'])
    if length == 0:
        continue # skip the row if nothing is found
    else:
        venue = data['response']['groups'][0]['items'][0]['venue']
        
        # extract info within 'venue'
        name = venue['name']
        lat = venue['location']['lat']
        lon = venue['location']['lng']
        cat = venue['categories'][0]['name']
        
        location_list.append([(neighborhood, latitude, longitude, name, lat, lon, cat)])

In [65]:
temp = pd.DataFrame(x for row in location_list for x in row)
temp.columns = ['Neighborhood','N_Latitude','N_Longitude','Venue','V_Latitude','V_Longitude','category']
temp.head()

Unnamed: 0,Neighborhood,N_Latitude,N_Longitude,Venue,V_Latitude,V_Longitude,category
0,Lawrence Park,43.72802,-79.38879,Lawrence Park Ravine,43.726963,-79.394382,Park
1,Davisville North,43.712751,-79.390197,Homeway Restaurant & Brunch,43.712641,-79.391557,Breakfast Spot
2,"North Toronto West, Lawrence Park",43.715383,-79.405678,Barreworks,43.71407,-79.400109,Yoga Studio
3,Davisville,43.704324,-79.38879,Jules Cafe Patisserie,43.704138,-79.388413,Dessert Shop
4,"Moore Park, Summerhill East",43.689574,-79.38316,Loring-Wyle Parkette,43.69027,-79.383438,Park


In [62]:
print("{} nearby locations downloaded for {} neighborhood.".format(len(temp.Venue), len(Merged_data.Neighborhood)))

99 nearby locations downloaded for 103 neighborhood.


## Making dummy values

In [66]:
cat = pd.get_dummies(temp.category) # one hot encoding with get_dummies()
df_01 = pd.concat([temp[['Neighborhood']], cat], axis=1) # combine neighbourhood & category tables
df_01.head()



Unnamed: 0,Neighborhood,Airport,Arts & Crafts Store,Bakery,Bank,Bar,Baseball Field,Boutique,Breakfast Spot,Brewery,...,Sandwich Place,Skating Rink,Sports Bar,Supermarket,Theme Restaurant,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Warehouse Store,Yoga Studio
0,Lawrence Park,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Davisville North,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,"North Toronto West, Lawrence Park",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,Davisville,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Moore Park, Summerhill East",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Clustering  the data set

In [67]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

In [78]:
df_02 = df_01.drop('Neighborhood', axis=1)

n_group = 6 # we will group neighbourhoods into 6 clusters

# run k-means clustering
kmeans = KMeans(n_clusters=n_group, random_state=0).fit(df_02)

# check cluster labels generated for each row in the dataframe
labels = kmeans.labels_
labels[0:10]

array([3, 0, 0, 0, 3, 0, 0, 0, 0, 0])

In [79]:

# add clustering labels
df_01.insert(1, 'label', kmeans.labels_)


df_01.head(5)

ValueError: cannot insert label, already exists