In [1]:
import requests
import pandas as pd
import numpy as np

### Gettting the wikipedia page.

In [2]:
Toronto_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
# Now that we have the url where our data from Toronto is, we use BeautifulSoup to extract our data.
from bs4 import BeautifulSoup
soup = BeautifulSoup(Toronto_url,'lxml')



### Finding where our table with Boroughs and Neighborhoods starts

In [3]:
Toronto_BN = soup.find('table',{'class':'wikitable sortable'})


### Creating 3 lists with all Postal Codes, Borough and Neighborhoods

In [4]:
PostalCodes = []
Borough =[]
Neighborhood=[]
for row in Toronto_BN.findAll("tr"):
    cells = row.findAll("td")
    #For each "tr", assign each "td" to a variable.
    if len(cells) == 3:
        PostalCodes.append(cells[0].find(text=True))
        Borough.append(cells[1].find(text=True))
        Neighborhood.append(cells[2].find(text=True))


### Creating my Data Frame

In [5]:
Toronto_df = pd.DataFrame({'PostalCode': PostalCodes,'Neighborhood':Neighborhood,'Borough': Borough})


### Cleaning up table to adjust it as requested (Dropping the "Not assigned" Borough and One Postal Code per Borough and all Neighborhood in it )

In [6]:
# Dropping all rows where Borough is equal to 'Not assigned'
df_Toronto = Toronto_df[Toronto_df.Borough != 'Not assigned']
df_Toronto.reset_index(inplace=True)
df_Toronto = df_Toronto.replace('\n','', regex=True) # This removes all new lines ('\n') that I may have in all columns.
df_Toronto.drop('index',axis=1,inplace=True)


In [7]:
## Assigning the name of the Borough to neigborhoods with value 'Not assigned'
ind=[]
a = df_Toronto['Neighborhood'].str.find('Not assigned')
a = a.tolist()
for i in range(len(a)):
    if a[i] == 0:
        ind.append(i)
for id in ind:
    df_Toronto.loc[id,'Neighborhood']=df_Toronto.loc[id,'Borough']


In [8]:
# Creating one Postal Code per Borough and grouping in the Neighborhood column all Neighborhoods in the same Postal Code 
df_Torontonew = df_Toronto.groupby(df_Toronto['PostalCode'],as_index=False).aggregate({'Borough': 'first', 'Neighborhood': lambda x: ', '.join(x)},sort=False)


In [9]:
df_Torontonew.shape

(103, 3)

### Including LAT and LON (Using CSV file)

### Reading CSV file

In [10]:
LatLon_df = pd.read_csv(r'C:\Users\v.hernandez.byd\Documents\Data Science\Geospatial_Coordinates.csv')
     

### Changing the name of column in LatLon_df and the merging Data Frames (df_Torontonew and LatLon_df)

In [11]:
LatLon_df = LatLon_df.rename(columns={'Postal Code': 'PostalCode'})
Toronto_data_df = pd.merge(df_Torontonew, LatLon_df, on='PostalCode')

# Clustering Toronto

### Importing all needed libraries

In [18]:
import matplotlib.cm as cm
from geopy.geocoders import Nominatim
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

##  As per suggested in the assignment I'm just taking Borough = Downtown Toronto

In [38]:
dtw_Toronto=Toronto_data_df[Toronto_data_df['Borough']=='Downtown Toronto'].reset_index(drop=True)
dtw_Toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529
1,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316
3,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636
4,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937


In [39]:
address = 'Downtown Toronto, Canada'
geolocator=Nominatim()
location = geolocator.geocode(address)
Latdtw=location.latitude
Londtw=location.longitude
print('The geograpical coordinates of Downtown Toronto are: {}, {}'.format(Latdtw,Londtw))

  from ipykernel import kernelapp as app


The geograpical coordinates of Downtown Toronto are: 43.655115, -79.380219


In [47]:
# Generating Downtown Toronto Map
Toronto_map = folium.Map(location=[Latdtw,Londtw],zoom_start=12)
# Including points for all Boroughs
for la,lg,PostalCode,Borough,Neighborhood in zip(dtw_Toronto['Latitude'],dtw_Toronto['Longitude'],dtw_Toronto['PostalCode'],dtw_Toronto['Borough'],dtw_Toronto['Neighborhood']):
    label = '{},{},{}'.format(PostalCode,Borough,Neighborhood)
    label = folium.Popup(label,parse_html=True)
    folium.CircleMarker([la,lg],radius=5,popup=label,color='blue',fill=True,fill_color='#3186cc',fill_opacity=0.7).add_to(Toronto_map)
Toronto_map

## Starting Exploring and Clustering using Fuorsquare data and Downtown Toronto data.

### Defining Foursquare credentials

In [49]:
CLIENT_ID ='SVDP1YJGJZ5EMR51DKHDVEITMXMDUSZ2AS4HCHNI4QSX1RX3'
CLIENT_SECRET= 'DTG3XJSF3XWPLC2KUVX0V4KSUEZNGBAIUHOLP3LQK0MX4WAL'
VERSION ='20180605'

### Exploring all Neighborhoods in Downtown Toronto

In [60]:
# Creating a funtion to explore all neighborhoods in Downtown Toronto
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    LIMIT=100
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [63]:
Toronto_dtw_venues = getNearbyVenues(names=dtw_Toronto['Neighborhood'],
                                   latitudes=dtw_Toronto['Latitude'],
                                   longitudes=dtw_Toronto['Longitude'])

print(Toronto_dtw_venues.shape)

Rosedale
Cabbagetown, St. James Town
Church and Wellesley
Harbourfront, Regent Park
Ryerson, Garden District
St. James Town
Berczy Park
Central Bay Street
Adelaide, King, Richmond
Harbourfront East, Toronto Islands, Union Station
Design Exchange, Toronto Dominion Centre
Commerce Court, Victoria Hotel
Harbord, University of Toronto
Chinatown, Grange Park, Kensington Market
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara
Stn A PO Boxes 25 The Esplanade
First Canadian Place, Underground city
Christie
(1280, 7)


In [64]:
Toronto_dtw_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rosedale,43.679563,-79.377529,Rosedale Park,43.682328,-79.378934,Playground
1,Rosedale,43.679563,-79.377529,Whitney Park,43.682036,-79.373788,Park
2,Rosedale,43.679563,-79.377529,Alex Murray Parkette,43.6783,-79.382773,Park
3,Rosedale,43.679563,-79.377529,Milkman's Lane,43.676352,-79.373842,Trail
4,"Cabbagetown, St. James Town",43.667967,-79.367675,Cranberries,43.667843,-79.369407,Diner


### Checking venues per neighborhood

In [66]:
Toronto_dtw_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Adelaide, King, Richmond",100,100,100,100,100,100
Berczy Park,55,55,55,55,55,55
"CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara",14,14,14,14,14,14
"Cabbagetown, St. James Town",46,46,46,46,46,46
Central Bay Street,84,84,84,84,84,84
"Chinatown, Grange Park, Kensington Market",100,100,100,100,100,100
Christie,15,15,15,15,15,15
Church and Wellesley,85,85,85,85,85,85
"Commerce Court, Victoria Hotel",100,100,100,100,100,100
"Design Exchange, Toronto Dominion Centre",100,100,100,100,100,100


### Checking all different kind of venues

In [84]:
Toronto_kv = pd.get_dummies(Toronto_dtw_venues[['Venue Category']], prefix="", prefix_sep="")
Toronto_kv['Neighborhood'] = Toronto_dtw_venues['Neighborhood'] 
Toronto_kv.head()

Unnamed: 0,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [85]:
# Shape of Data Frame
Toronto_kv.shape

(1280, 205)

In [86]:
# Grouping by Neighborhood and mean of frequency of ocurrance of each venue
Toronto_mean = Toronto_kv.groupby('Neighborhood').mean().reset_index()
Toronto_mean

Unnamed: 0,Neighborhood,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.01,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.0,0.071429,0.071429,0.071429,0.142857,0.142857,0.142857,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Cabbagetown, St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011905,...,0.0,0.0,0.011905,0.0,0.0,0.011905,0.0,0.0,0.0,0.011905
5,"Chinatown, Grange Park, Kensington Market",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.05,0.0,0.05,0.01,0.0,0.0,0.0,0.0
6,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Church and Wellesley,0.011765,0.011765,0.0,0.0,0.0,0.0,0.0,0.0,0.011765,...,0.0,0.0,0.0,0.011765,0.011765,0.0,0.011765,0.011765,0.0,0.011765
8,"Commerce Court, Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0
9,"Design Exchange, Toronto Dominion Centre",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0


In [87]:
Toronto_mean.shape

(18, 205)

## Clustering with Kmean

In [112]:
# Number of clusters
kclusters = 6
Toronto_clustering = Toronto_mean.drop('Neighborhood', 1)
# running k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Toronto_clustering)
# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:5] 

array([1, 5, 3, 5, 2])

### Looking for the top 5 venues in the Neighborhoods

In [113]:
# Creating the function to return Top venues
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

# Now lets create a column for Top venues
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Toronto_mean['Neighborhood']

for ind in np.arange(Toronto_mean.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Toronto_mean.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted


Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,"Adelaide, King, Richmond",Coffee Shop,Café,American Restaurant,Steakhouse,Thai Restaurant
1,Berczy Park,Coffee Shop,Cocktail Bar,Restaurant,Pub,Beer Bar
2,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Service,Airport Terminal,Airport Lounge,Harbor / Marina,Sculpture Garden
3,"Cabbagetown, St. James Town",Coffee Shop,Restaurant,Pizza Place,Café,Italian Restaurant
4,Central Bay Street,Coffee Shop,Café,Sandwich Place,Italian Restaurant,Burger Joint
5,"Chinatown, Grange Park, Kensington Market",Café,Bar,Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Chinese Restaurant
6,Christie,Grocery Store,Café,Park,Diner,Italian Restaurant
7,Church and Wellesley,Coffee Shop,Japanese Restaurant,Gay Bar,Sushi Restaurant,Restaurant
8,"Commerce Court, Victoria Hotel",Coffee Shop,Café,Hotel,Restaurant,American Restaurant
9,"Design Exchange, Toronto Dominion Centre",Coffee Shop,Café,Hotel,Restaurant,American Restaurant


### Creating new Data Frame with cluster labels and top 5 venues in every neighborhood

In [114]:
Toronto_Top = dtw_Toronto
# add clustering labels
Toronto_Top['Cluster Labels'] = kmeans.labels_
# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
Toronto_Top = Toronto_Top.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
Toronto_Top.head() 

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M4W,Downtown Toronto,Rosedale,43.679563,-79.377529,1,Park,Playground,Trail,Yoga Studio,Dim Sum Restaurant
1,M4X,Downtown Toronto,"Cabbagetown, St. James Town",43.667967,-79.367675,5,Coffee Shop,Restaurant,Pizza Place,Café,Italian Restaurant
2,M4Y,Downtown Toronto,Church and Wellesley,43.66586,-79.38316,3,Coffee Shop,Japanese Restaurant,Gay Bar,Sushi Restaurant,Restaurant
3,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65426,-79.360636,5,Coffee Shop,Bakery,Pub,Park,Theater
4,M5B,Downtown Toronto,"Ryerson, Garden District",43.657162,-79.378937,2,Coffee Shop,Clothing Store,Cosmetics Shop,Café,Middle Eastern Restaurant


## Now we put on a map all Clusters

In [115]:
map_clusters = folium.Map(location=[Latdtw,Londtw], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Toronto_Top['Latitude'], Toronto_Top['Longitude'], Toronto_Top['Neighborhood'], Toronto_Top['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

## Examining each cluster and the discriminating venue categories that distinguish each cluster

### Cluster 1

In [116]:
Toronto_Top.loc[Toronto_Top['Cluster Labels'] == 0, Toronto_Top.columns[[1] + list(range(5, Toronto_Top.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
14,Downtown Toronto,0,Airport Service,Airport Terminal,Airport Lounge,Harbor / Marina,Sculpture Garden


### Cluster 2

In [117]:
Toronto_Top.loc[Toronto_Top['Cluster Labels'] == 1, Toronto_Top.columns[[1] + list(range(5, Toronto_Top.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Downtown Toronto,1,Park,Playground,Trail,Yoga Studio,Dim Sum Restaurant
5,Downtown Toronto,1,Coffee Shop,Café,Restaurant,Hotel,Bakery
7,Downtown Toronto,1,Coffee Shop,Café,Sandwich Place,Italian Restaurant,Burger Joint
15,Downtown Toronto,1,Coffee Shop,Restaurant,Café,Beer Bar,Seafood Restaurant


### Cluster 3

In [118]:
Toronto_Top.loc[Toronto_Top['Cluster Labels'] == 2, Toronto_Top.columns[[1] + list(range(5, Toronto_Top.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
4,Downtown Toronto,2,Coffee Shop,Clothing Store,Cosmetics Shop,Café,Middle Eastern Restaurant
8,Downtown Toronto,2,Coffee Shop,Café,American Restaurant,Steakhouse,Thai Restaurant
9,Downtown Toronto,2,Coffee Shop,Aquarium,Hotel,Café,Pizza Place
10,Downtown Toronto,2,Coffee Shop,Café,Hotel,Restaurant,American Restaurant
12,Downtown Toronto,2,Café,Bar,Japanese Restaurant,Coffee Shop,Bookstore


### Cluster 4

In [119]:
Toronto_Top.loc[Toronto_Top['Cluster Labels'] == 3, Toronto_Top.columns[[1] + list(range(5, Toronto_Top.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
2,Downtown Toronto,3,Coffee Shop,Japanese Restaurant,Gay Bar,Sushi Restaurant,Restaurant


### Cluster 5

In [120]:
Toronto_Top.loc[Toronto_Top['Cluster Labels'] == 4, Toronto_Top.columns[[1] + list(range(5, Toronto_Top.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
6,Downtown Toronto,4,Coffee Shop,Cocktail Bar,Restaurant,Pub,Beer Bar


### Cluster 5

In [122]:
Toronto_Top.loc[Toronto_Top['Cluster Labels'] == 5, Toronto_Top.columns[[1] + list(range(5, Toronto_Top.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,Downtown Toronto,5,Coffee Shop,Restaurant,Pizza Place,Café,Italian Restaurant
3,Downtown Toronto,5,Coffee Shop,Bakery,Pub,Park,Theater
11,Downtown Toronto,5,Coffee Shop,Café,Hotel,Restaurant,American Restaurant
13,Downtown Toronto,5,Café,Bar,Vietnamese Restaurant,Vegetarian / Vegan Restaurant,Chinese Restaurant
16,Downtown Toronto,5,Coffee Shop,Café,Hotel,Restaurant,Steakhouse
17,Downtown Toronto,5,Grocery Store,Café,Park,Diner,Italian Restaurant
