# Applied Data Science Capstone

## Segmenting and Clustering Neighborhoods in Toronto

### START part 1

### Importing the required libraries

In [None]:
import pandas as pd
import numpy as np
import requests
import time
!conda install -c conda-forge geocoder --yes
import geocoder
!conda install -c conda-forge bs4 --yes
from bs4 import BeautifulSoup
import ssl
import json
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium
# Swap the ssl module's default HTTPS context factory for the unverified one, so
# HTTPS contexts created through that default no longer validate certificates.
# NOTE(review): presumably a workaround for local certificate errors — confirm it is
# still needed, because it removes protection against man-in-the-middle attacks.
ssl._create_default_https_context = ssl._create_unverified_context
print('Libraries imported')

Solving environment: / 

### Getting the file from the url

In [None]:
# Wikipedia page listing the Canadian postal codes that begin with "M"
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# a timeout keeps the cell from hanging forever if the server never answers
r = requests.get(url, timeout=30)
# stop here with the HTTP error instead of parsing an error page further down
r.raise_for_status()
r.status_code

### Scraping the contents

In [None]:
# parse the downloaded HTML with Python's built-in parser
soup = BeautifulSoup(r.text, features="html.parser")

# first element whose class attribute is "wikitable sortable"
postal_table = soup.find(attrs={"class": "wikitable sortable"})


### Transforming the data into a pandas dataframe

In [None]:
# collect the non-empty cell texts of every table row that has data cells
table_rows = postal_table.find_all('tr')
row_values = []
for table_row in table_rows:
    stripped_cells = (cell.text.strip() for cell in table_row.find_all('td'))
    row_text = [cell_text for cell_text in stripped_cells if cell_text]
    # rows that yield no <td> text contribute nothing
    if not row_text:
        continue
    row_values.append(row_text)

# each kept row must supply exactly these three fields
toronto_df = pd.DataFrame(row_values, columns=["PostalCode", "Borough", "Neighborhood"])
toronto_df.head(10)

### Cleaning the dataframe

In [None]:
# keep only the rows whose Borough is something other than 'Not assigned'
has_borough = toronto_df['Borough'] != 'Not assigned'
borough_df = toronto_df[has_borough]

# one row per (PostalCode, Borough), with its neighborhoods joined by ', '
neighborhoods_by_code = borough_df.groupby(['PostalCode', 'Borough'])['Neighborhood']
combined_df = neighborhoods_by_code.apply(', '.join).reset_index()

combined_df.head(10)

In [None]:
# (rows, columns) of the cleaned dataframe
row_count, column_count = combined_df.shape
print((row_count, column_count))

### END part 1

### START part 2

### Getting the Latitude & Longitude Coordinates of Each Neighborhood and Creating a Dataframe

In [None]:
# read latitude & longitude from the linked csv file, as geocoder appears unstable
url = 'http://cocl.us/Geospatial_data'
lat_long_df = pd.read_csv(url)

# assemble the combined frame column by column, in display order
# NOTE(review): the coordinates are attached by row label, not by postal code, so this
# presumably relies on the csv listing the same postal codes in the same order — confirm.
column_sources = [
    (combined_df, ['PostalCode', 'Borough', 'Neighborhood']),
    (lat_long_df, ['Latitude', 'Longitude']),
]
detailed_df = pd.DataFrame({column: frame[column]
                            for frame, frame_columns in column_sources
                            for column in frame_columns})

borough_count = len(detailed_df['Borough'].unique())
neighborhood_count = len(detailed_df['Neighborhood'].unique())
print('Toronto has a total of {} boroughs and {} neighborhoods.'.format(borough_count, neighborhood_count))
print('\n')
detailed_df.head()

### END part 2

### START part 3

### Conduct cluster analysis

In [None]:
# count the postal codes in each of the four 'Toronto' boroughs
toronto_borough = ['Downtown Toronto', 'Central Toronto', 'West Toronto', 'East Toronto']

for tor in toronto_borough:
    in_borough = detailed_df['Borough'] == tor
    postal_code_total = detailed_df.loc[in_borough, 'PostalCode'].count()
    print("{} has a total of {} postal codes.".format(tor, postal_code_total))

In [None]:
# one sub-frame per borough; the order here fixes the row order of the result
d_t, c_t, w_t, e_t = (
    detailed_df[detailed_df['Borough'] == borough_name]
    for borough_name in ('Downtown Toronto', 'Central Toronto', 'West Toronto', 'East Toronto')
)

# stack the four sub-frames and renumber the rows from 0
combined = pd.concat([d_t, c_t, w_t, e_t], sort=False)
toronto_dataframe = combined.reset_index(drop=True)

print(toronto_dataframe.shape)
toronto_dataframe.head(10)

In [None]:
# get the latitude & longitude values of Toronto
address = 'Toronto, Ontario'
geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)

# geocode() yields None when nothing matches; fail with a clear message instead of
# an AttributeError on the attribute reads below
if location is None:
    raise ValueError('Nominatim returned no result for {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude

print('Coordinates of Toronto are {}, {}. '.format(latitude, longitude))

### Create a Map of Toronto with its Neighborhoods Superimposed on Top

In [None]:
# base map centered on the coordinates looked up for Toronto
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# one circle marker per row, with a "<neighborhood>, <borough>" popup
marker_fields = toronto_dataframe[['Latitude', 'Longitude', 'Borough', 'Neighborhood']]
for lat, lng, borough, neighborhood in marker_fields.itertuples(index=False, name=None):
    popup_text = '{}, {}'.format(neighborhood, borough)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.5,
        parse_html=False)
    marker.add_to(map_toronto)

map_toronto

### Explore the Neighborhoods Utilizing the Foursquare API and Segment Them

In [None]:
# access Foursquare APIs
# Credentials are read from the environment so that no secret is stored in the
# notebook; set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before running.
# NOTE(review): the key pair that used to be hard-coded in this cell has been
# exposed in the notebook history and should be revoked.
import os

client_id = os.environ.get('FOURSQUARE_CLIENT_ID', '')
client_secret = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
version = '20200504'  # sent as the "v" query parameter of every request

if not (client_id and client_secret):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will be rejected.')

In [None]:
# neighborhood stored in the row labelled 0 of toronto_dataframe
toronto_dataframe.at[0, 'Neighborhood']

In [None]:
# coordinates of the row labelled 0, which this notebook treats as Rosedale
rosedale_latitude, rosedale_longitude = (
    toronto_dataframe.at[0, coordinate] for coordinate in ('Latitude', 'Longitude')
)

print("Rosedale's latitude and longitude values are {}, {}.".format(rosedale_latitude, rosedale_longitude))

In [None]:
# get the top 100 venues that are in Rosedale within a radius of 500 meters
no_of_venues = 100
radius = 500
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(client_id, client_secret, version, rosedale_latitude, rosedale_longitude, radius, no_of_venues)

# bounded wait, and an explicit HTTP error rather than decoding an error body blindly
http_response = requests.get(url, timeout=30)
http_response.raise_for_status()

response = http_response.json()
response

In [None]:
# function to extract the venue's category
def get_category_type(row):
    """Return the name of the first category attached to a venue row.

    Parameters
    ----------
    row : mapping-like (e.g. a dict or a pandas Series)
        Must provide a list of category dicts under 'categories' or, when that
        key is absent, under 'venue.categories'.

    Returns
    -------
    The 'name' entry of the first category, or None when the list is empty.

    Raises
    ------
    KeyError
        If neither key is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # only a missing key triggers the fallback; any other error is a real
        # problem and is allowed to propagate
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [None]:
# venue items of the first group in the explore response
venues = response['response']['groups'][0]['items']

# columns kept from the flattened json
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']

# flatten -> keep the needed columns -> reduce each category list to a single
# name -> keep only the last dotted component of every column name
nearby_venues = (
    json_normalize(venues)
    .loc[:, filtered_columns]
    .assign(**{'venue.categories': lambda frame: frame.apply(get_category_type, axis=1)})
    .rename(columns=lambda col: col.split(".")[-1])
)

nearby_venues

### Explore All Neighborhoods and Their Venues

In [None]:
# explore all the neighborhoods of four different boroughs using Foursquare APIs
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare explore endpoint once per neighborhood.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Walked in parallel; one request is sent per (name, lat, lng) triple.
    radius : int, default 500
        Passed as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty (but keeps those columns) when no venue is collected.

    Raises
    ------
    requests.HTTPError
        If a request comes back with an error status.

    Notes
    -----
    Reads the notebook-level names ``client_id``, ``client_secret``,
    ``version`` and ``no_of_venues``.
    """
    column_names = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
                    'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(client_id, client_secret, version, lat, lng, radius, no_of_venues)
        # bounded wait, and surface HTTP errors instead of a KeyError on the payload
        http_response = requests.get(url, timeout=30)
        http_response.raise_for_status()
        results = http_response.json()["response"]['groups'][0]['items']

        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                name, lat, lng,
                venue['name'], venue['location']['lat'], venue['location']['lng'],
                # a venue may carry no category at all; record None rather than fail
                categories[0]['name'] if categories else None))

    # passing the column names to the constructor keeps an empty result valid
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)
    return nearby_venues

In [None]:
# fetch the venues around every neighborhood of toronto_dataframe into toronto_venues
toronto_venues = getNearbyVenues(
    toronto_dataframe['Neighborhood'],
    toronto_dataframe['Latitude'],
    toronto_dataframe['Longitude'],
)

In [None]:
# overall size of the result: (number of venues, number of columns)
venue_row_count, venue_column_count = toronto_venues.shape
print((venue_row_count, venue_column_count))
print('\n')
toronto_venues.head(10)

In [None]:
# number of venue rows returned for each neighborhood
venues_by_neighborhood = toronto_venues.groupby('Neighborhood')
venues_by_neighborhood['Venue'].count()

### One-Hot Encode All Neighborhoods Based on the Venue Category

In [None]:
# use one-hot encoding: one indicator column per venue category
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# A venue category that is itself called 'Neighborhood' would collide with the key
# column added below (and previously got overwritten by it); keep it under a
# distinct name instead.
if 'Neighborhood' in toronto_onehot.columns:
    toronto_onehot = toronto_onehot.rename(columns={'Neighborhood': 'Neighborhood (Venue Category)'})

# add the neighborhood column back as the first column, by name rather than by
# assuming it ends up in the last position
toronto_onehot.insert(0, 'Neighborhood', toronto_venues['Neighborhood'])
fixed_columns = list(toronto_onehot.columns)

toronto_onehot.shape

### Group Rows by Neighborhood and Take the Mean of the Frequency of Occurrence of Each Category

In [None]:
# the mean of each indicator column within a neighborhood is the share of that
# neighborhood's venue rows falling in the category
category_means = toronto_onehot.groupby('Neighborhood').mean()
toronto_grouped = category_means.reset_index()

print(toronto_grouped.shape)
toronto_grouped.head(5)

### Each Neighborhood with Top 5 Common Venues

In [None]:
# how many categories to list per neighborhood
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("[--------"+hood+"--------]")
    # turn the neighborhood's single row into a two-column (venue, freq) table
    is_current_hood = toronto_grouped['Neighborhood'] == hood
    temp = toronto_grouped[is_current_hood].T.reset_index()
    temp.columns = ['venue', 'freq']
    # the first transposed row is the Neighborhood label itself, not a frequency
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq':2})
    ranked = temp.sort_values('freq', ascending=False).reset_index(drop=True)
    print(ranked.head(num_top_venues))
    print('\n')

### Create a Dataframe that Has Top 10 Venues for Each Neighborhood

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first entry of ``row`` is skipped, the remaining entries are sorted in
    descending order, and the index labels of the first ``num_top_venues`` of
    them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [None]:
# create a dataframe to display top 10 venues for each neighborhood
num_top_venues = 10
indicators = ['st', 'nd', 'rd', 'th']

# column headers: '1st Most Common Venue', '2nd ...', '3rd ...', '4th ...', '5th ...', ...
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    # ranks past the listed suffixes all take 'th'; an explicit bounds check
    # replaces the former bare except, which would have hidden any other error
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

# create the new dataframe
neighborhood_venues_sorted = pd.DataFrame(columns=columns)
neighborhood_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

# fill every row with that neighborhood's top categories, most frequent first
for ind in np.arange(toronto_grouped.shape[0]):
    neighborhood_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhood_venues_sorted.shape

### Run K-Means to Cluster the Neighborhoods

In [None]:
# set the number of clusters
kclusters = 4

# feature matrix: every column except the neighborhood name
# (the axis is given by keyword; pandas 2.x no longer accepts it positionally)
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood')

# run k-means clustering
kmeans = KMeans(n_init=300, n_clusters=kclusters, random_state=5).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:20]

### Create a New Dataframe that Includes the Clusters

In [None]:
# add cluster labels
# drop a label column left over from an earlier run so the cell can be executed
# again; insert() raises when the column already exists
if 'Cluster Labels' in neighborhood_venues_sorted.columns:
    neighborhood_venues_sorted = neighborhood_venues_sorted.drop(columns='Cluster Labels')
neighborhood_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# join the labels and top venues onto toronto_dataframe, which carries the latitude &
# longitude of each neighborhood; rows of toronto_dataframe whose neighborhood is
# missing from neighborhood_venues_sorted get NaN in the joined columns
toronto_merged = toronto_dataframe.join(neighborhood_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
toronto_merged.head()

### Visualize the Clusters

In [None]:
# create a map
cluster_map = folium.Map(location=[latitude, longitude], zoom_start=11)

# one hex color per cluster, spread evenly over the rainbow colormap
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    # rows without a cluster label cannot be colored, so they are skipped
    if pd.isna(cluster):
        continue
    # a NaN anywhere in the column turns the labels into floats; list indexing needs an int
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=9,
        popup=label,
        # the option is `color`; the former `colors=` keyword was not a marker option
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        # opacity is a fraction in [0, 1]; 1.0 is fully opaque
        fill_opacity=1.0).add_to(cluster_map)

cluster_map

### Examine Each Cluster and Determine the Discriminating Venue Categories that Distinguish Each Cluster

In [None]:
# Cluster 1 (label 0)
# NOTE(review): columns are picked by position (1, then 4 onward); with the column order
# built earlier in this notebook that appears to be Borough plus Longitude onward, not
# Neighborhood — confirm this is the intended view.
in_cluster = toronto_merged['Cluster Labels'] == 0
shown_columns = toronto_merged.columns[[1] + list(range(4, len(toronto_merged.columns)))]
toronto_merged.loc[in_cluster, shown_columns]

In [None]:
# Cluster 2 (label 1)
# NOTE(review): columns are picked by position (1, then 4 onward); with the column order
# built earlier in this notebook that appears to be Borough plus Longitude onward, not
# Neighborhood — confirm this is the intended view.
in_cluster = toronto_merged['Cluster Labels'] == 1
shown_columns = toronto_merged.columns[[1] + list(range(4, len(toronto_merged.columns)))]
toronto_merged.loc[in_cluster, shown_columns]

In [None]:
# Cluster 3 (label 2)
# NOTE(review): columns are picked by position (1, then 4 onward); with the column order
# built earlier in this notebook that appears to be Borough plus Longitude onward, not
# Neighborhood — confirm this is the intended view.
in_cluster = toronto_merged['Cluster Labels'] == 2
shown_columns = toronto_merged.columns[[1] + list(range(4, len(toronto_merged.columns)))]
toronto_merged.loc[in_cluster, shown_columns]

In [None]:
# Cluster 4 (label 3)
# NOTE(review): columns are picked by position (1, then 4 onward); with the column order
# built earlier in this notebook that appears to be Borough plus Longitude onward, not
# Neighborhood — confirm this is the intended view.
in_cluster = toronto_merged['Cluster Labels'] == 3
shown_columns = toronto_merged.columns[[1] + list(range(4, len(toronto_merged.columns)))]
toronto_merged.loc[in_cluster, shown_columns]