<center><img src = "https://external-content.duckduckgo.com/iu/?u=https%3A%2F%2Ftse1.mm.bing.net%2Fth%3Fid%3DOIP.WmN3Js3CIU7Yy072-JKSEQHaDH%26pid%3DApi&f=1" width = 400></center>

# Segmenting and Clustering Neighborhoods in Toronto
Toronto is a beautiful city with many diverse neighborhoods. I will explore and analyze them by scraping postal codes from Wikipedia and using Foursquare's API to derive additional insights.

In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from pandas.io.json import json_normalize
import json
import requests
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt 
from sklearn.cluster import KMeans

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

!conda install -c conda-forge folium=0.5.0 --yes
import folium # map rendering library

!pip install geocoder
import geocoder

!pip install bs4
from bs4 import BeautifulSoup

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done


# Section 1: Scraping Postal Codes from Wikipedia

In [None]:
# Scraping the postal-code page from Wikipedia
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
response = requests.get(url, timeout=30)  # a timeout keeps "Run All" from hanging on a stalled connection
response.raise_for_status()  # stop here on an HTTP error instead of parsing an error page
soup = BeautifulSoup(response.text, 'html5lib')
tables = soup.find_all('table')
len(tables)  # Checking how many tables there are

In [None]:
# Getting details about the tables to identify the correct one
for i, table in enumerate(tables):
    # A table without any <tr> would make `table.tr` None; report 0 columns instead of crashing
    first_row = table.find('tr')
    n_cells = len(first_row.find_all('td')) if first_row is not None else 0
    print('Table ', i)
    print(table.attrs)
    print('# rows: ', len(table.find_all('tr')))
    print('# columns: ', n_cells, '\n')

In [None]:
# Getting postal codes from the first table
pcTable = []  # One dict per assigned postal code

for cell in tables[0].find_all('td'):
    # Skip cells that lack the <p> / <span> markup the parsing below relies on
    if cell.span is None or cell.p is None:
        continue

    label = cell.span.text
    if label == 'Not assigned':  # Removing unassigned codes
        continue

    # Text before the first '(' becomes the borough; the first parenthesised
    # group becomes the neighborhood list (' /' separators turned into commas).
    parts = label.split('(')
    borough = parts[0]
    if len(parts) > 1:
        neighborhood = parts[1].strip(')').replace(' /', ',').replace(')', ' ').strip(' ')
    else:
        # No parenthesised part: fall back to the borough text rather than raising IndexError
        neighborhood = borough

    pcTable.append({'Postal Code': cell.p.text[:3],
                    'Borough': borough,
                    'Neighborhood': neighborhood})

# Passing the column names keeps the expected columns even if nothing was parsed
boroughs = pd.DataFrame(pcTable, columns=['Postal Code', 'Borough', 'Neighborhood'])

# Cleaning borough names that had extra text fused onto them in the scraped cell
boroughs['Borough'] = boroughs['Borough'].replace({
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto Stn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga'})
boroughs.head()

In [None]:
# Show (rows, columns) of the scraped postal-code table
print('DataFrame size: ', boroughs.shape)

# Section 2: Collecting Latitude and Longitude for Toronto Neighborhoods

I could never get geocoder to return a response, so I relied on the CSV of coordinates provided in the hints instead.

In [None]:
# Coordinates CSV provided in the assignment hints (used instead of geocoder)
GEO_CSV_URL = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs_v1/Geospatial_Coordinates.csv'
gs_data = pd.read_csv(GEO_CSV_URL)
gs_data.head()

In [None]:
type(gs_data)  # debugging leftover: confirms the CSV loaded as a DataFrame (was chasing "'NoneType' has no attribute" errors)

In [None]:
type(boroughs)  # debugging leftover: confirms the scraped table is a DataFrame (was chasing "'NoneType' has no attribute" errors)

In [None]:
# Attach the CSV coordinates to the scraped table, matching rows on 'Postal Code'.
# This is an inner join (the pandas default), so codes missing from either side are dropped.
df = pd.merge(boroughs, gs_data, how='inner', on='Postal Code')
df.head()

In [None]:
# Show (rows, columns) after joining the coordinates onto the postal-code table
print('New dataframe size: ', df.shape)

# Section 3: Extracting Toronto Data From Foursquare API


In [None]:
# The code was removed by Watson Studio for sharing.

In [None]:
# Query-string fragment carrying the Foursquare credentials and API version.
# YOUR_CLIENT_ID / YOUR_CLIENT_SECRET are not defined in any visible cell
# (presumably set in the cell removed by Watson Studio above -- confirm).
VERSION = '20180605'
cred_template = '&client_id={}&client_secret={}&v={}'
cred = cred_template.format(YOUR_CLIENT_ID, YOUR_CLIENT_SECRET, VERSION)

#### Creating a function to get venues and categories for each neighborhood:

In [None]:
def getVenues(neighs, latitudes, longitudes, radius=500, limit=500):
    """Fetch Foursquare 'explore' venues around each neighborhood.

    Parameters
    ----------
    neighs, latitudes, longitudes : iterables
        Neighborhood names and their coordinates, consumed in parallel with
        ``zip`` (iteration stops at the shortest one).
    radius : int, default 500
        Forwarded as the ``radius`` query parameter.
    limit : int, default 500
        Forwarded as the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'.
        The frame is empty, but keeps these columns, when no venue is found.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    KeyError, IndexError
        If the JSON payload lacks the expected response/groups/items/venue fields.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    rows = []

    for neigh, lat, lng in zip(neighs, latitudes, longitudes):
        print(neigh)  # progress indicator, one line per neighborhood

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?'+cred+'&ll={},{}&radius={}&limit={}'.format(
            lat,
            lng,
            radius,
            limit)

        # GET request; the timeout stops one stalled call from hanging the whole loop
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            rows.append((
                neigh,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                venue['categories'][0]['name']))

    # Naming the columns in the constructor (rather than assigning them afterwards)
    # also works when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)

#### I chose to work with just neighborhoods in boroughs that had 'Toronto' in their name, so I created a new dataframe with just those:

In [None]:
# Keep only the rows whose borough name contains 'Toronto'.
# A boolean mask replaces the manual loop, which paired enumerate() positions with
# label-based lookups (df['Postal Code'][i]) and so relied on df having a 0..n-1 index.
toronto_mask = df['Borough'].str.contains('Toronto', regex=False, na=False)
tdf = (
    df.loc[toronto_mask, ['Postal Code', 'Borough', 'Neighborhood', 'Longitude', 'Latitude']]
    .reset_index(drop=True)
)
tdf.head()

In [None]:
# Show (rows, columns) of the Toronto-only table
print('Size: ', tdf.shape)

In [None]:
# Fetching venues for every neighborhood in the Toronto-only dataframe
toronto_venues = getVenues(tdf['Neighborhood'], tdf['Latitude'], tdf['Longitude'])

In [None]:
# Preview the first venue rows returned by getVenues
toronto_venues.head()

In [None]:
# Non-null count of every column per neighborhood, i.e. how many venues each one returned
toronto_venues.groupby('Neighborhood').count()

In [None]:
# Total venue rows and number of distinct venue categories
venue_total = toronto_venues.shape[0]
category_total = len(toronto_venues['Venue Category'].unique())
print('There are {} venues total and {} unique venue categories.'.format(venue_total, category_total))

# Section 4: Neighborhood Analysis and Segmenting

In [None]:
# One-hot encode the venue categories: one indicator column per category.
# The empty prefix/separator leaves each column named exactly after its category.
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

#### After multiple errors, I discovered that one of the venue categories is 'Neighborhood'. So when I add my Neighborhood column back into the dataframe, I will need to add an underscore to differentiate them.

In [None]:
# Checking to see which venues have the neighborhood category, out of curiosity.
# Iterating the two columns together avoids looking venues up by enumerate() position.
for venue_name, category in zip(toronto_venues['Venue'], toronto_venues['Venue Category']):
    if 'Neigh' in category:
        print('{} - {}'.format(venue_name, category))

In [None]:
# Category columns only; excluding 'Neighborhood_' makes this cell safe to re-run
# (otherwise a second run would list 'Neighborhood_' twice in fixed_columns).
category_columns = [col for col in toronto_onehot.columns if col != 'Neighborhood_']

# Setting Neighborhood_ to the first column
fixed_columns = ['Neighborhood_'] + category_columns

# Adding Neighborhood_ (underscore avoids clashing with the 'Neighborhood' venue category)
toronto_onehot['Neighborhood_'] = toronto_venues['Neighborhood']

toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()

In [None]:
# Show (rows, columns) of the one-hot table: one row per venue
print('Onehot dataframe size: ', toronto_onehot.shape)


#### Grouping by Neighborhood_ and getting the frequency for each category:

In [None]:
# Mean of each one-hot column per neighborhood = share of that neighborhood's venues in each category
toronto_grouped = toronto_onehot.groupby('Neighborhood_').mean().reset_index()
toronto_grouped.head()

In [None]:
# Show (rows, columns) of the grouped table: one row per neighborhood
print('Grouped dataframe size is ', toronto_grouped.shape)

#### Creating the function to sort venues in descending order:

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of ``row``.

    The first element of ``row`` is skipped (in this notebook it holds the
    neighborhood name); the remaining values are sorted in descending order and
    the index labels of the first ``num_top_venues`` are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    top_labels = ranked.index.values
    return top_labels[:num_top_venues]

#### Creating a dataframe of the top ten categories per neighborhood, based on the frequencies above:

In [None]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']


def _ordinal(rank):
    """Return ``rank`` with its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, 21st, ...)."""
    if rank % 100 in (11, 12, 13):  # the teens always take 'th'
        suffix = 'th'
    elif 1 <= rank % 10 <= len(indicators):
        suffix = indicators[rank % 10 - 1]
    else:
        suffix = 'th'
    return '{}{}'.format(rank, suffix)


# Column headers: 'Neighborhood', then '1st Most Common Venue' ... '<n>th Most Common Venue'.
# The suffix is chosen explicitly instead of relying on a bare `except:` around a list lookup.
columns = ['Neighborhood'] + [
    '{} Most Common Venue'.format(_ordinal(rank))
    for rank in range(1, num_top_venues + 1)
]

# New dataframe
toronto_venues_sorted = pd.DataFrame(columns=columns)
toronto_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood_']

# Filling each row with that neighborhood's most frequent categories
for ind in range(toronto_grouped.shape[0]):
    toronto_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

toronto_venues_sorted.head()

In [None]:
# Show (rows, columns) of the top-venues table
print('Size: ', toronto_venues_sorted.shape)

# Section 5: Neighborhood Clustering

#### Creating a K-means clustering model for the data:

In [None]:
# Making 5 clusters
k = 5

# Removing the string column so only the category frequencies remain.
# `columns=` replaces the positional axis argument (`drop('Neighborhood_', 1)`),
# which newer pandas versions no longer accept.
toronto_grouped_clustering = toronto_grouped.drop(columns='Neighborhood_')

# Fitting the model (random_state fixed so the labels are reproducible)
kmeans = KMeans(n_clusters=k, random_state=0).fit(toronto_grouped_clustering)

# Double-checking the labels generated for each row in the dataframe
kmeans.labels_[0:10]

#### Inserting cluster information into a final dataframe:

In [None]:
# Adding the labels to the dataframe.
# DataFrame.insert raises if the column already exists, so drop a previous
# 'Cluster' column first; this keeps the cell safe to re-run.
if 'Cluster' in toronto_venues_sorted.columns:
    toronto_venues_sorted = toronto_venues_sorted.drop(columns='Cluster')
toronto_venues_sorted.insert(0, 'Cluster', kmeans.labels_)

# Joining with the Toronto ('tdf') dataframe to get lat/long for each neighborhood.
# This is a left join on tdf: a neighborhood without any venue rows gets NaN in the
# cluster and venue columns.
toronto_merged = tdf.join(toronto_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head()

In [None]:
# Show (rows, columns) of the merged table
print('Merged dataframe size: ', toronto_merged.shape)

#### Generating the cluster map:

In [None]:
# Did a google search to get Toronto coords since geocoder wasn't working (ironic)
latitude = 43.7001100
longitude = -79.4163000

map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster label
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster']):
    # Rows without a cluster (NaN after the left join) cannot be colored; skip them
    if pd.isna(cluster):
        continue
    # The column becomes float when NaNs are present, and list indices must be ints
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels are 0-based, so index the palette directly instead of `cluster-1`,
    # which only worked for label 0 through negative-index wraparound.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters