<h1 align=center>Peer-graded Assignment</h1> 
<h2 align=center>Segmenting and Clustering Neighborhoods in Toronto</h2>

In [42]:
# Import Libraries 
import numpy as np 

from bs4 import BeautifulSoup # library for webscrapping 
import requests # library to handle requests 

import pandas as pd # library for data analysis 
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle json files 

!conda install -c conda-forge geopy 
from geopy.geocoders import Nominatim # convert address into latitude and longitude 

from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe 

# Matplotlib and associated plotting modules 
import matplotlib.cm as cm 
import matplotlib.colors as colors 

# import k-means fro clustering stage 
from sklearn.cluster import KMeans 

!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library 

print ('Libraries imported')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.6.16  |       hecc5488_0         145 KB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    certifi-2019.6.16          |           py36_1         149 KB  conda-forge
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.49-py_0         conda-forge
    geopy:           1.20.0-py_0       conda-forge

The following packages will be UPDATED:

    ca-

### 1. Websrape Wikipedia page 

In [27]:
URL = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(URL)
soup = BeautifulSoup(response.text, 'html.parser')

In [28]:
# extract the necesarry table 
table = soup.find('table', {'class':"wikitable sortable"}).tbody

In [29]:
# identify the rows and columns 
rows = table.find_all('tr')
columns = [v.text.replace('\n','') for v in rows[0].find_all('th')]

print(columns)

['Postcode', 'Borough', 'Neighbourhood']


In [30]:
# create the dataset by populating the columns 

df = pd.DataFrame(columns=columns)

for i in range(1, len(rows)):
    tds = rows[i].find_all('td')
    
    if len(tds) ==4:
        values = [tds[0].text, tds[1].text, tds[2].text.replace('\n','')]
    else:
        values = [td.text.replace('\n','') for td in tds]
    
# lastly append the column title to the instances

    df = df.append(pd.Series(values, index=columns), ignore_index=True)


In [31]:
neighborhood = df

In [32]:
# delete boroughs that are 'not assigned'
neighborhood = neighborhood.set_index("Borough")
neighborhood = neighborhood.drop("Not assigned", axis=0)


In [33]:
# combine neighborhoods with same postcode and borough
neighborhood = neighborhood.groupby(['Postcode', 'Borough']).sum()

In [34]:
neighborhood.reset_index(inplace = True)

In [35]:
# rearrange the columns and view the dataset

neighborhood = neighborhood[['Postcode', 'Borough', 'Neighbourhood']]


In [36]:
neighborhood

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,RougeMalvern
1,M1C,Scarborough,Highland CreekRouge HillPort Union
2,M1E,Scarborough,GuildwoodMorningsideWest Hill
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,East Birchmount ParkIonviewKennedy Park
7,M1L,Scarborough,ClairleaGolden MileOakridge
8,M1M,Scarborough,CliffcrestCliffsideScarborough Village West
9,M1N,Scarborough,Birch CliffCliffside West


The dataframe has 11 boroughs and 103 neighborhoods.


## 2. Get latitude and longitude coordinates for locations in dataset 

In [22]:
# import libraries 
import geocoder # import geocoder 

ModuleNotFoundError: No module named 'geocoder'

In [23]:
# imitialize the variable to None 
lat_lng_coords = None 

# loop until I get the coordinates 
while(lat_lng_coords is None):
    g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
    lat_lng_coords = g.latlng
    
latitude = lat_lng_coords[0]
longitude = lat lng[1]

SyntaxError: invalid syntax (<ipython-input-23-18f02d796461>, line 10)

In [38]:
# import the already prepaerd data set 
data = pd.read_csv("http://cocl.us/Geospatial_data")
data 

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [39]:
# rename column 'Postal code' to 'Postcode' 
data = data.rename(columns={"Postal Code": "Postcode"})

In [48]:
# merge the two datasets along the postal Code. 
df_merge = pd.merge(neighborhood, data, on ='Postcode')
df_merge

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,RougeMalvern,43.806686,-79.194353
1,M1C,Scarborough,Highland CreekRouge HillPort Union,43.784535,-79.160497
2,M1E,Scarborough,GuildwoodMorningsideWest Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,East Birchmount ParkIonviewKennedy Park,43.727929,-79.262029
7,M1L,Scarborough,ClairleaGolden MileOakridge,43.711112,-79.284577
8,M1M,Scarborough,CliffcrestCliffsideScarborough Village West,43.716316,-79.239476
9,M1N,Scarborough,Birch CliffCliffside West,43.692657,-79.264848


In [58]:
neighborhoods = df_merge 
neighborhoods

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,RougeMalvern,43.806686,-79.194353
1,M1C,Scarborough,Highland CreekRouge HillPort Union,43.784535,-79.160497
2,M1E,Scarborough,GuildwoodMorningsideWest Hill,43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,East Birchmount ParkIonviewKennedy Park,43.727929,-79.262029
7,M1L,Scarborough,ClairleaGolden MileOakridge,43.711112,-79.284577
8,M1M,Scarborough,CliffcrestCliffsideScarborough Village West,43.716316,-79.239476
9,M1N,Scarborough,Birch CliffCliffside West,43.692657,-79.264848


In [59]:
# check how many boroughs and neighbourhoods the dataset has 

print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

The dataframe has 11 boroughs and 103 neighborhoods.


## 3. Explore and cluster neighborhoods in Toronto

#### Use geopy library to get the latitude and the longitude values of Toronto City 

In [57]:
address = 'Toronto'

geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(address) 
latitude = location.latitude 
longitude = location.longitude

print('The geographical coordinate of Toronto City are {}, {}'.format(latitude, longitude))

The geographical coordinate of Toronto City are 43.653963, -79.387207


In [61]:
# create map of Toronto using latitude and longitude values 
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map 
for lat, lng, borough, neighbourhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat,lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)
    
map_toronto

In [68]:
toronto_data = neighborhoods

Define Foursquare Credentials and Version

In [112]:
# @hidden cell 
# Define Foursquare credentials and version 
CLIENT_ID = '5LF3C42N1NUBGTAIXCJEOTF0YHE2BH5DJSAJTPFXWDKAMNXF'
CLIENT_SECRET = 'PHWX5L324PRFHSXGCBAX2MJWTSBFNZNEKGUGJO5KMCNHWBZE'
VERSION = '20180907'

print('Your credentials:')


Your credentials:


In [113]:
# Exploring the first neighborhood 

# get the neighborhoods name 
toronto_data.loc[0, 'Neighbourhood']

'RougeMalvern'

In [114]:
# Get the neighbourhood's latitude and longitude value 
neighborhood_latitude = toronto_data.loc[0, 'Latitude'] 
neighborhood_longitude = toronto_data.loc[0, 'Longitude']

neighborhood_name = toronto_data.loc[0,'Neighbourhood']

print('latitude and longitude values of {} are {}, {}.'.format(neighborhood_name,
                                                              neighborhood_latitude,
                                                              neighborhood_longitude))

latitude and longitude values of RougeMalvern are 43.806686299999996, -79.19435340000001.


In [115]:
# Now lets get for the top 100 values in RougeMalvern within a radius of 500 meters 
# First let's create the GET request URL naming the URL url 

LIMIT = 100 # limiting the number of request returned by Foursquare API

radius = 500 # define radius 

# create the url 
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID,
    CLIENT_SECRET,
    VERSION,
    neighborhood_latitude,
    neighborhood_longitude,
    radius,
    LIMIT) 

url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=5LF3C42N1NUBGTAIXCJEOTF0YHE2BH5DJSAJTPFXWDKAMNXF&client_secret=PHWX5L324PRFHSXGCBAX2MJWTSBFNZNEKGUGJO5KMCNHWBZE&v=20180907&ll=43.806686299999996,-79.19435340000001&radius=500&limit=100'

In [116]:
results = requests.get(url).json()
results 

{'meta': {'code': 200, 'requestId': '5d73b67f787dba0038bcb8c6'},
  'headerLocation': 'Malvern',
  'headerFullLocation': 'Malvern, Toronto',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 1,
  'suggestedBounds': {'ne': {'lat': 43.8111863045, 'lng': -79.18812958073042},
   'sw': {'lat': 43.80218629549999, 'lng': -79.2005772192696}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '4bb6b9446edc76b0d771311c',
       'name': "Wendy's",
       'location': {'crossStreet': 'Morningside & Sheppard',
        'lat': 43.80744841934756,
        'lng': -79.19905558052072,
        'labeledLatLngs': [{'label': 'display',
          'lat': 43.80744841934756,
          'lng': -79.19905558052072}],
        'distance': 387,
        'cc': 'CA',
        'city': 'Toronto',
    

In [117]:
# function that extract the category of the venue 
def get_category_type(row):
    try: 
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
    
    if len(categories_list) == 0: 
        return None
    else:
        return categories_list[0]['name']

In [118]:
# now we can clean the json and structure it into pandas dataframe 
venues = results['response']['groups'][0]['items']

nearby_venues = json_normalize(venues) # flatten JSON

# filter columns 
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns]

# filter the category for each row 
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns 
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Wendy's,Fast Food Restaurant,43.807448,-79.199056


In [119]:
print('{} venues were returned by Foursquare'.format(nearby_venues.shape[0]))

1 venues were returned by Foursquare


#### Explore neighborhoods in Toronto 

Now we create a function to repeat the same process to all neighborhoods in Toronto 

In [120]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
        # create the API request URL 
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        lng,
        radius,
        LIMIT)
    
    # make the GET request 
    results = requests.get(url).json()['response']['groups'][0]['items']
    
    # return only relevant information for each nearby venue 
    venues_list.append([(
        name,
        lat,
        lng,
        v['venues']['name'],
        v['venue']['location']['lat'],
        v['venue']['location']['lng'],
        v['venue']['categories'][0]['name']) for v in results])
    
    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns =['Neighbourhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']
    
    return(nearby_venues)

In [123]:
# now we create a dataframe called toronto_venues 

toronto_venues = getNearbyVenues(names=toronto_data['Neighbourhood'],
                                 latitudes=toronto_data['Latitude'],
                                 longitudes=toronto_data['Longitude'])

RougeMalvern
Highland CreekRouge HillPort Union
GuildwoodMorningsideWest Hill
Woburn
Cedarbrae
Scarborough Village
East Birchmount ParkIonviewKennedy Park
ClairleaGolden MileOakridge
CliffcrestCliffsideScarborough Village West
Birch CliffCliffside West
Dorset ParkScarborough Town CentreWexford Heights
MaryvaleWexford
Agincourt
Clarks CornersSullivanTam O'Shanter
Agincourt NorthL'Amoreaux EastMillikenSteeles East
L'Amoreaux West
Upper Rouge
Hillcrest Village
FairviewHenry FarmOriole
Bayview Village
Silver HillsYork Mills
NewtonbrookWillowdale
Willowdale South
York Mills West
Willowdale West
Parkwoods
Don Mills North
Flemingdon ParkDon Mills South
Bathurst ManorDownsview NorthWilson Heights
Northwood ParkYork University
CFB TorontoDownsview East
Downsview West
Downsview Central
Downsview Northwest
Victoria Village
Woodbine GardensParkview Hill
Woodbine Heights
The Beaches
Leaside
Thorncliffe Park
East Toronto
The Danforth WestRiverdale
The Beaches WestIndia Bazaar
Studio District
Lawrenc

KeyError: 'venues'

In [122]:
print(toronto_venues.shape)
toronto_venues.head()

NameError: name 'toronto_venues' is not defined

In [None]:
# Check how many unique categories can be curated from all the returned venues 
print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))

Analysis of each neighborhood

In [None]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = manhattan_onehot[fixed_columns]

toronto_onehot.head()

In [None]:
# dataframe size 
toronto_onehot.shape

In [None]:
toronto_grouped = manhattan_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped

In [None]:
# Top 5 common venues for each neighbourhood
num_top_venues = 5

for hood in toronto_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

In [None]:
# Function to sort the venues in descending order 
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [None]:
# top ten venues for each neighborhood
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(manhattan_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(manhattan_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()


### Cluster Neighborhoods 
Run k-means to cluster the neighborhoods into 5 clusters

In [None]:
# set number of clusters
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

In [None]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

manhattan_merged = manhattan_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns!

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(manhattan_merged['Latitude'], manhattan_merged['Longitude'], manhattan_merged['Neighborhood'], manhattan_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters