# Assignment: Segmenting and Clustering Neighborhoods in Toronto #

### code to scrape the following Wikipedia page, https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M ####

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
data  = requests.get(url).text 
soup = BeautifulSoup(data,"html5lib")
tabla = soup.find('table') # It's the first table on the page
neighborhoods_postalcode = pd.DataFrame(columns=["Postal Code","Borough","Neighborhood"])

for row in tabla.tbody.find_all("tr"):
    col = row.find_all("td")
    if (col != []):
        postalcode = col[0].text.strip()
        borough = col[1].text.strip()
        neighborhood = col[2].text.strip()
        neighborhoods_postalcode = neighborhoods_postalcode.append({"Postal Code":postalcode, "Borough":borough,"Neighborhood":neighborhood}, ignore_index=True)

### Ignore cells with a borough that is Not assigned.

In [2]:
indexNotAssigned = neighborhoods_postalcode[neighborhoods_postalcode['Borough']=='Not assigned'].index
neighborhoods_postalcode.drop(indexNotAssigned, inplace=True)
neighborhoods_postalcode.reset_index(inplace=True, drop=True)
neighborhoods_postalcode

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


### <i>"More than one neighborhood can exist in one postal code area." </i>The table at the webpage is already with no duplicates postal codes 

In [3]:
neighborhoods_postalcode.groupby('Postal Code').count().max()


Borough         1
Neighborhood    1
dtype: int64

In [4]:
neighborhoods_postalcode.shape

(103, 3)

###  geographical coordinates neighborhoods

In [5]:
!wget -q -O 'coordinates.csv' http://cocl.us/Geospatial_data
print('¡recogida la info!')

¡recogida la info!


In [6]:
coordinates_all = pd.read_csv('coordinates.csv')
coordinates_all.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [7]:
postlcodes_list= neighborhoods_postalcode['Postal Code'].tolist()

In [8]:
neighborhoods = neighborhoods_postalcode.join(coordinates_all.set_index('Postal Code'), on='Postal Code')
neighborhoods.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


### Visualize Toronto map

In [9]:
# install, import folium to visualize maps
#!conda install -c conda-forge folium=0.5.0 --yes
import folium
# create map of Toronto using latitude and longitude values
from geopy.geocoders import Nominatim
# create map of Toronto using latitude and longitude values
address = 'Toronto, ON'

geolocator = Nominatim(user_agent="on_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


Only neighborhoods in DownTown Toronto 

In [10]:
toronto_data = neighborhoods[neighborhoods['Borough']=='Downtown Toronto'].reset_index(drop=True)
toronto_data.groupby('Borough').count()
toronto_data.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


#### Foursquare data

In [11]:
# @hidden_cell
CLIENT_ID = 'A2KB1ZZUM0KTQ301DGYGSSX10ISFUQ4QN4WGXLVYHQ5EIRME' # your Foursquare ID
CLIENT_SECRET = 'YFAK1FYCEQUZQRI2JJUD00RSCYC3NIVEOZU35LNY55H2XSO' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
ACCESS_TOKEN = '45RZ10W5FD4FL2KMGKC0IGLL5Q3CTQKDPZDXKTJVA452QIIF' 
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: A2KB1ZZUM0KTQ301DGYGSSX10ISFUQ4QN4WGXLVYHQ5EIRME
CLIENT_SECRET:YFAK1FYCEQUZQRI2JJUD00RSCYC3NIVEOZU35LNY55H2XSO


#### We use the function from the NY neighborhoods exercise

In [12]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&ll={},{}&oauth_token={}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET,  lat, lng, ACCESS_TOKEN, VERSION,  radius, LIMIT)
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [13]:
toronto_venues = getNearbyVenues(names=toronto_data['Neighborhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )

Regent Park, Harbourfront
Queen's Park, Ontario Provincial Government
Garden District, Ryerson
St. James Town
Berczy Park
Central Bay Street
Christie
Richmond, Adelaide, King
Harbourfront East, Union Station, Toronto Islands
Toronto Dominion Centre, Design Exchange
Commerce Court, Victoria Hotel
University of Toronto, Harbord
Kensington Market, Chinatown, Grange Park
CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport
Rosedale
Stn A PO Boxes
St. James Town, Cabbagetown
First Canadian Place, Underground city
Church and Wellesley


In [14]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Roselle Desserts,43.653447,-79.362017,Bakery
1,"Regent Park, Harbourfront",43.65426,-79.360636,Tandem Coffee,43.653559,-79.361809,Coffee Shop
2,"Regent Park, Harbourfront",43.65426,-79.360636,Cooper Koo Family YMCA,43.653249,-79.358008,Distribution Center
3,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
4,"Regent Park, Harbourfront",43.65426,-79.360636,Impact Kitchen,43.656369,-79.35698,Restaurant


In [15]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot['Neighborhood'] = toronto_venues['Neighborhood'] 
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]
toronto_onehot.head()

Unnamed: 0,Yoga Studio,Accessories Store,Adult Boutique,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Clustering neighborhoods

In [16]:
import numpy as np
#!pip install scikit-learn
from sklearn.cluster import KMeans

In [17]:
kclusters = 5
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([1, 0, 1, 4, 1, 1, 1, 1, 1, 3], dtype=int32)