# Week 3 - Segmenting and Clustering Neighborhoods in Toronto

## Question 1

### Importing libraries 

In [1]:
import pandas as pd
import numpy as np 
import requests 
from bs4 import BeautifulSoup 

### Getting data from Wikipedia

In [2]:
wikiurl = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
wiki = requests.get(wikiurl).text
soup = BeautifulSoup(wiki,'lxml')

table = soup.find('table')
table_rows = table.find_all('tr')

data = []
for row in table_rows:
    data.append([t.text.strip() for t in row.find_all('td')])
    
df = pd.DataFrame(data, columns=['PostalCode', 'Borough', 'Neighbourhood'])
df = df[~df['PostalCode'].isnull()]

df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Cleaning the dataframe

In [3]:
df.drop(df[df['Borough']=="Not assigned"].index,axis=0, inplace=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [4]:
df1 = df.groupby('PostalCode').agg(lambda x: ','.join(x))
df1.head()

Unnamed: 0_level_0,Borough,Neighbourhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern, Rouge"
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


In [5]:
df1.loc[df1['Neighbourhood']=="Not assigned",'Neighbourhood']=df1.loc[df1['Neighbourhood']=="Not assigned",'Borough']
df1.head()

Unnamed: 0_level_0,Borough,Neighbourhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern, Rouge"
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae


In [6]:
df2 = df1.reset_index()
df2.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [7]:
df2['Borough']= df2['Borough'].str.replace('nan|[{}\s]','').str.split(',').apply(set).str.join(',').str.strip(',').str.replace(",{2,}",",")
df2.tail()

Unnamed: 0,PostalCode,Borough,Neighbourhood
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
102,M9W,Etobicoke,Northwest


In [8]:
df2.shape

(103, 3)

## Question 2

### Getting latitude & longitude information 

In [9]:
!wget -q -O "Geospatial_Coordinates.csv" http://cocl.us/Geospatial_data
coor = pd.read_csv('Geospatial_Coordinates.csv')
coor.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Combining above with cleanded dataframe in Q1

In [10]:
df_pc = df2.set_index('PostalCode')
coor_pc = coor.set_index('Postal Code')
df_geo = pd.concat([df_pc, coor_pc], axis=1, join='inner')

df_geo.index.name = 'PostalCode'
df_geo.reset_index(inplace=True)

df_geo.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Question 3

## Importing libraries

In [11]:
import json
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [12]:
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim

!conda install -c conda-forge folium=0.5.0 --yes 
import folium

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.22.0               |     pyh9f0ad1d_0          63 KB  conda-forge

The following packages will be UPDATED:

    geopy: 1.21.0-py_0 conda-forge --> 1.22.0-pyh9f0ad1d_0 conda-forge


Downloading and Extracting Packages
geopy-1.22.0         | 63 KB     | ##################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Solving environment: done

# All requested packages already installed.



### Getting Toronto's latitude & longitude

In [13]:
address = 'Toronto, Ontario'

geolocator = Nominatim(user_agent="tr_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### Creating a map of Toronto 

In [14]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, long, post, borough, neigh in zip(df_geo['Latitude'], df_geo['Longitude'], df_geo['PostalCode'], df_geo['Borough'], df_geo['Neighbourhood']):
    label = "{} ({}): {}".format(borough, post, neigh)
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
    
map_toronto

### Choosing a borough to analyze

In [15]:
df_geo['Borough'].value_counts()

NorthYork          24
DowntownToronto    19
Scarborough        17
Etobicoke          12
CentralToronto      9
WestToronto         6
EastToronto         5
York                5
EastYork            5
Mississauga         1
Name: Borough, dtype: int64

In [16]:
df_boroughs = ['NorthYork']
df_geo1 = df_geo[df_geo['Borough'].isin(df_boroughs)].reset_index(drop=True)
df_geo2 = df_geo1.groupby(['PostalCode','Borough', 'Neighbourhood']).mean().reset_index()

df_geo2.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M2H,NorthYork,Hillcrest Village,43.803762,-79.363452
1,M2J,NorthYork,"Fairview, Henry Farm, Oriole",43.778517,-79.346556
2,M2K,NorthYork,Bayview Village,43.786947,-79.385975
3,M2L,NorthYork,"York Mills, Silver Hills",43.75749,-79.374714
4,M2M,NorthYork,"Willowdale, Newtonbrook",43.789053,-79.408493


In [17]:
df_geo2.shape

(24, 5)

In [18]:
map_toronto1 = folium.Map(location=[latitude, longitude], zoom_start=10)

for lat, long, post, borough, neigh in zip(df_geo1['Latitude'], df_geo1['Longitude'], df_geo1['PostalCode'], df_geo1['Borough'], df_geo1['Neighbourhood']):
    label = "{} ({}): {}".format(borough, post, neigh)
    popup = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, long],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto1)
    
map_toronto1

### Exploring neighbourhoods in North York with Foursquare API

In [19]:
CLIENT_ID = 'L5AKWJ1I3VSHFI5DWFJVNRRDVSPCCEG4QOCL5M0IF5NEE0AC' 
CLIENT_SECRET = '0HR4VPXLTRDKFA2NM4VBTEIZJXLDCNQFI4EYHRRPQTCIGWA2' 
VERSION = '20200312' 

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: L5AKWJ1I3VSHFI5DWFJVNRRDVSPCCEG4QOCL5M0IF5NEE0AC
CLIENT_SECRET:0HR4VPXLTRDKFA2NM4VBTEIZJXLDCNQFI4EYHRRPQTCIGWA2


In [20]:
radius = 500
LIMIT = 100

venues = []

for lat, long, post, borough, neighbourhood in zip(df_geo1['Latitude'], df_geo1['Longitude'], df_geo1['PostalCode'], df_geo1['Borough'], df_geo1['Neighbourhood']):
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    for venue in results:
        venues.append((
            post, 
            borough,
            neighbourhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

        
venues_df = pd.DataFrame(venues)
venues_df.columns = ['PostalCode', 'Borough', 'Neighbourhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(venues_df.shape)
venues_df.head()

(233, 9)


Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,M2H,NorthYork,Hillcrest Village,43.803762,-79.363452,Eagle's Nest Golf Club,43.805455,-79.364186,Golf Course
1,M2H,NorthYork,Hillcrest Village,43.803762,-79.363452,AY Jackson Pool,43.804515,-79.366138,Pool
2,M2H,NorthYork,Hillcrest Village,43.803762,-79.363452,Villa Madina,43.801685,-79.363938,Mediterranean Restaurant
3,M2H,NorthYork,Hillcrest Village,43.803762,-79.363452,Duncan Creek Park,43.805539,-79.360695,Dog Run
4,M2J,NorthYork,"Fairview, Henry Farm, Oriole",43.778517,-79.346556,The LEGO Store,43.778207,-79.343483,Toy / Game Store


In [21]:
northyork_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

northyork_onehot['PostalCode'] = venues_df['PostalCode'] 
northyork_onehot['Borough'] = venues_df['Borough'] 
northyork_onehot['Neighbourhood'] = venues_df['Neighbourhood'] 

fixed_columns = list(northyork_onehot.columns[-3:]) + list(northyork_onehot.columns[:-3])
northyork_onehot = northyork_onehot[fixed_columns]

print(northyork_onehot.shape)
northyork_onehot.head()

(233, 106)


Unnamed: 0,PostalCode,Borough,Neighbourhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,...,Supplement Shop,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Video Store,Vietnamese Restaurant,Women's Store
0,M2H,NorthYork,Hillcrest Village,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M2H,NorthYork,Hillcrest Village,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M2H,NorthYork,Hillcrest Village,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M2H,NorthYork,Hillcrest Village,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M2J,NorthYork,"Fairview, Henry Farm, Oriole",0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [22]:
northyork_grouped = northyork_onehot.groupby(['PostalCode', 'Borough', 'Neighbourhood']).mean().reset_index()

northyork_grouped.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Accessories Store,Airport,American Restaurant,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Bakery,...,Supplement Shop,Sushi Restaurant,Tea Room,Thai Restaurant,Theater,Toy / Game Store,Video Game Store,Video Store,Vietnamese Restaurant,Women's Store
0,M2H,NorthYork,Hillcrest Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,M2J,NorthYork,"Fairview, Henry Farm, Oriole",0.0,0.0,0.015385,0.0,0.015385,0.0,0.030769,...,0.015385,0.0,0.015385,0.0,0.015385,0.030769,0.015385,0.0,0.0,0.015385
2,M2K,NorthYork,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,M2L,NorthYork,"York Mills, Silver Hills",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,M2N,NorthYork,Willowdale,0.0,0.0,0.0,0.029412,0.0,0.0,0.0,...,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.029412,0.0


In [23]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

areaColumns = ['PostalCode', 'Borough', 'Neighbourhood']
freqColumns = []
for ind in np.arange(num_top_venues):
    try:
        freqColumns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        freqColumns.append('{}th Most Common Venue'.format(ind+1))
columns = areaColumns+freqColumns

neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['PostalCode'] = northyork_grouped['PostalCode']
neighborhoods_venues_sorted['Borough'] = northyork_grouped['Borough']
neighborhoods_venues_sorted['Neighbourhood'] = northyork_grouped['Neighbourhood']

for ind in np.arange(northyork_grouped.shape[0]):
    row_categories = northyork_grouped.iloc[ind, :].iloc[3:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    neighborhoods_venues_sorted.iloc[ind, 3:] = row_categories_sorted.index.values[0:num_top_venues]
neighborhoods_venues_sorted.sort_values(freqColumns, inplace=True)

neighborhoods_venues_sorted.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,M3H,NorthYork,"Bathurst Manor, Wilson Heights, Downsview North",Bank,Coffee Shop,Sandwich Place,Bridal Shop,Restaurant,Ice Cream Shop,Diner,Shopping Mall,Pizza Place,Pharmacy
3,M2L,NorthYork,"York Mills, Silver Hills",Cafeteria,Women's Store,Coffee Shop,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop
1,M2J,NorthYork,"Fairview, Henry Farm, Oriole",Clothing Store,Coffee Shop,Fast Food Restaurant,Shoe Store,Toy / Game Store,Japanese Restaurant,Kids Store,Bank,Restaurant,Bakery
9,M3C,NorthYork,Don Mills,Coffee Shop,Clothing Store,Restaurant,Asian Restaurant,Gym,Beer Store,Chinese Restaurant,Dim Sum Restaurant,Discount Store,Sandwich Place
11,M3J,NorthYork,"Northwood Park, York University",Coffee Shop,Miscellaneous Shop,Caribbean Restaurant,Massage Studio,Bar,Discount Store,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega


### Clustering neighbourhoods

In [24]:
kclusters = 4

toronto_cluster = northyork_grouped.drop(['PostalCode', 'Borough', 'Neighbourhood'], 1)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_cluster)

northyork_merged = df_geo2
neighborhoods_venues_sorted.insert(0, 'Cluster Label', kmeans.labels_)

northyork_merged = northyork_merged.join(neighborhoods_venues_sorted.drop(
    ['Borough', 'Neighbourhood'], 1).set_index('PostalCode'), on='PostalCode')
northyork_merged.sort_values(['Cluster Label'] + freqColumns, inplace=True)

In [35]:
northyork_merged.dropna(subset = ['Cluster Label'], inplace = True)
northyork_merged['Cluster Label'] = northyork_merged['Cluster Label'].astype('int')
northyork_merged.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude,Cluster Label,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
20,M6B,NorthYork,Glencairn,43.709577,-79.445073,0,Japanese Restaurant,Pizza Place,Bakery,Pub,Discount Store,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega
6,M2P,NorthYork,York Mills West,43.752758,-79.400049,0,Park,Convenience Store,Bank,Distribution Center,Concert Hall,Construction & Landscaping,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop
11,M3H,NorthYork,"Bathurst Manor, Wilson Heights, Downsview North",43.754328,-79.442259,1,Bank,Coffee Shop,Sandwich Place,Bridal Shop,Restaurant,Ice Cream Shop,Diner,Shopping Mall,Pizza Place,Pharmacy
3,M2L,NorthYork,"York Mills, Silver Hills",43.75749,-79.374714,1,Cafeteria,Women's Store,Coffee Shop,Concert Hall,Construction & Landscaping,Convenience Store,Cosmetics Shop,Deli / Bodega,Department Store,Dessert Shop
1,M2J,NorthYork,"Fairview, Henry Farm, Oriole",43.778517,-79.346556,1,Clothing Store,Coffee Shop,Fast Food Restaurant,Shoe Store,Toy / Game Store,Japanese Restaurant,Kids Store,Bank,Restaurant,Bakery


### Mapping the clusters

In [36]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, post, bor, poi, cluster in zip(northyork_merged['Latitude'], 
                                             northyork_merged['Longitude'], northyork_merged['PostalCode'], 
                                             northyork_merged['Borough'], northyork_merged['Neighbourhood'], 
                                             northyork_merged['Cluster Label']):
    label = folium.Popup('{} ({}): {} - Cluster {}'.format(bor, post, poi, cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Assignment finished!