# Part1: creating dataframe from wiki page

In [115]:
import requests
import lxml.html as lh
import pandas as pd
import numpy as np

Load page and parse it into <tr> elements. Check that all rows have 3 columns.

In [22]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# Bound the wait and fail loudly on an HTTP error status instead of
# silently parsing an error page as if it were the article.
page = requests.get(url, timeout=30)
page.raise_for_status()
doc = lh.fromstring(page.content)

# All <tr> rows of the first <table> in the document
# (presumably the postal-code table -- verify if the page layout changes).
tr_elements = doc.xpath('//table')[0].xpath(".//tr")

# Sanity check: cell counts of the rows that do NOT have exactly 3 cells (expected: []).
[len(T) for T in tr_elements if len(T)!=3]

[]

Now let's fetch the headers from tr_elements. For that, let's take the first row:

In [23]:
# The first <tr> is the header row; collect the raw text of each of its cells.
header_row = tr_elements[0]
headers = [cell.text_content() for cell in header_row]
headers

['Postal Code\n', 'Borough\n', 'Neighborhood\n']

Let's remove the useless \n characters:

In [24]:
# Remove every newline character from each header title.
headers = [title.replace("\n", "") for title in headers]
headers

['Postal Code', 'Borough', 'Neighborhood']

Create list of columns with names and empty data

In [25]:
# One (title, values) pair per header; every values list starts out empty
# and is a separate list object.
columns = []
for title in headers:
    columns.append((title, []))
columns


[('Postal Code', []), ('Borough', []), ('Neighborhood', [])]

Add data to columns and remove useless \n from data

In [26]:
# Walk the data rows (everything after the header row) and append the text of
# each of the first three cells, newlines removed, to the matching column list.
for row in tr_elements[1:]:
    for position in range(3):
        _title, values = columns[position]
        values.append(row[position].text_content().replace("\n", ""))

Create Dict for creating pandas dataframe

In [27]:
# `columns` is a list of (title, values) pairs, which dict() consumes directly.
d = dict(columns)

Create dataframe

In [28]:
# Each dict key becomes a column name, each list the values of that column.
df = pd.DataFrame(data=d)
df.shape

(180, 3)

Keep only the rows whose borough is assigned (drop the "Not assigned" boroughs):

In [29]:
# Keep only rows with an assigned borough. `.copy()` makes the result an
# independent frame, so the in-place sort and the column assignments in later
# cells modify `df` itself rather than a slice of the scraped table (which
# makes pandas emit SettingWithCopyWarning).
df = df[df["Borough"]!="Not assigned"].copy()
df.tail()

Unnamed: 0,Postal Code,Borough,Neighborhood
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


Let's look for rows with an empty Neighborhood now:

In [30]:
# Boolean mask of the rows whose neighborhood text is the empty string.
neighborhood_names = df["Neighborhood"]
rows_empty_neighborhood = neighborhood_names == ""
df.loc[rows_empty_neighborhood]

Unnamed: 0,Postal Code,Borough,Neighborhood


No such rows.

So, here is the final shape:

In [31]:
# (rows, columns) of df after the "Not assigned" boroughs were filtered out.
df.shape

(103, 3)

# Part2: Geocoding

Read geo data from csv into dataframe:

In [32]:
# CSV with one row per postal code and its latitude / longitude.
geo_csv_url = "http://cocl.us/Geospatial_data"
geo_df = pd.read_csv(geo_csv_url)

Take a look at geo dataframe:

In [33]:
# Preview the first rows of the coordinates table.
geo_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Sort neighborhood and geo dataframes by postal code

In [34]:
# Order both tables by postal code. The sorted result is rebound instead of
# using `inplace=True`: `df` was produced by a boolean filter, and mutating
# such a frame in place makes pandas emit SettingWithCopyWarning.
geo_df = geo_df.sort_values(by="Postal Code")
df = df.sort_values(by="Postal Code")

Check sizes of both dataframes:

In [35]:
# Print the (rows, columns) of the coordinates table, then of the neighborhood table.
for frame in (geo_df, df):
    print(frame.shape)

(103, 3)
(103, 3)


The sizes are the same!
Now copy the latitude and longitude columns into the neighborhood dataframe:

In [36]:
# Look the coordinates up by postal code rather than copying them by row
# position: equal row counts alone do not guarantee that both tables hold the
# same codes in the same order.
coords_by_code = geo_df.drop_duplicates(subset="Postal Code").set_index("Postal Code")
df["Latitude"] = df["Postal Code"].map(coords_by_code["Latitude"])
df["Longitude"] = df["Postal Code"].map(coords_by_code["Longitude"])

# Fail here, with the offending codes, rather than later with NaN coordinates.
missing_codes = df.loc[df["Latitude"].isna() | df["Longitude"].isna(), "Postal Code"].tolist()
assert not missing_codes, f"No coordinates found for postal codes: {missing_codes}"

df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
9,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
18,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
27,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
36,M1G,Scarborough,Woburn,43.770992,-79.216917
45,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# Part3: explore and cluster

Let's look at a map of Toronto neighborhoods.
First, let's get the coordinates of Toronto:

In [37]:
from geopy.geocoders import Nominatim

address = 'Toronto, Canada'

geolocator = Nominatim(user_agent="toronto_explorer")
location = geolocator.geocode(address)
# geocode() returns None when the service finds no match; stop with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError(f'Could not geocode address: {address!r}')
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


In [38]:
!pip install folium



In [39]:
import folium

# Named map_toronto (not `map`) so the built-in map() is not shadowed for the
# rest of the session.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per postal-code row; the popup shows its borough and neighborhoods.
for lat, lng, borough, neighborhood in zip(df['Latitude'], df['Longitude'], df['Borough'], df['Neighborhood']):
    label = f"Borough: {borough}, Neighborhoods: {neighborhood}"
    # parse_html=True escapes the text (as the cluster map below already does),
    # so names containing apostrophes or markup characters cannot break the popup.
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,  # was misspelled as `full=True`
        fill_color='red',
        fill_opacity=0.7,
    ).add_to(map_toronto)

map_toronto

Let's get Downtown Toronto only:

In [40]:
# Rows of the Downtown Toronto borough only.
is_downtown = df['Borough'] == 'Downtown Toronto'
df_dt = df[is_downtown]
df_dt.shape

(19, 5)

FourSquare initialization:

In [41]:
import os
import warnings

# Credentials are read from the environment so they are never stored in the
# notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before
# starting the kernel.
# NOTE(review): the key pair that used to be hard-coded here has been published
# with the notebook and should be treated as leaked -- rotate it.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

if not (CLIENT_ID and CLIENT_SECRET):
    warnings.warn(
        'FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
        'Foursquare requests are expected to fail.'
    )

Create a function that fetches the venues around each neighborhood, given its latitude and longitude:

In [59]:
import requests
import json

def venues_for_neighborhood(latitudes, longitudes, neighborhoods, radius=500, limit=100, timeout=30):
    """Collect the venues that Foursquare's "explore" endpoint returns around each neighborhood.

    Parameters
    ----------
    latitudes, longitudes, neighborhoods : iterables
        Coordinates and display name of each neighborhood, matched by position.
    radius : int, default 500
        Value sent as the ``radius`` query parameter.
    limit : int, default 100
        Value sent as the ``limit`` query parameter.
    timeout : float, default 30
        Seconds to wait for each HTTP response.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue, with the neighborhood name and coordinates,
        the venue name and coordinates, and the name of the venue's first
        category (``None`` when the venue lists no category). The frame is empty,
        with the same columns, when no venue is returned at all.

    Raises
    ------
    requests.HTTPError
        If a request is answered with an HTTP error status.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venues_list = []
    for lat, lng, neib_name in zip(latitudes, longitudes, neighborhoods):
        # https, not http: the client secret travels in the query string.
        # `params` lets requests do the URL encoding.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': f'{lat},{lng}',
                'radius': radius,
                'limit': limit,
            },
            timeout=timeout,
        )
        response.raise_for_status()

        # An answer without groups contributes no venues instead of raising IndexError.
        groups = response.json()['response'].get('groups') or []
        venues = groups[0]['items'] if groups else []

        for v in venues:
            venue = v['venue']
            categories = venue.get('categories') or []
            venues_list.append((
                neib_name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None,
            ))

    # Passing the column names to the constructor also works for an empty list,
    # where assigning `.columns` afterwards would raise a length mismatch.
    return pd.DataFrame(venues_list, columns=columns)

Get a dataframe of venues for all Downtown Toronto neighborhoods:

In [60]:
# Arguments are passed in the function's positional order:
# latitudes, longitudes, neighborhoods.
downtown_venues = venues_for_neighborhood(
    df_dt['Latitude'],
    df_dt['Longitude'],
    df_dt['Neighborhood'],
)

downtown_venues.shape

(1211, 7)

In [61]:
# Preview the first rows of the venues table.
downtown_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Rosedale,43.679563,-79.377529,Rosedale Park,43.682328,-79.378934,Playground
1,Rosedale,43.679563,-79.377529,Whitney Park,43.682036,-79.373788,Park
2,Rosedale,43.679563,-79.377529,Alex Murray Parkette,43.6783,-79.382773,Park
3,Rosedale,43.679563,-79.377529,Milkman's Lane,43.676352,-79.373842,Trail
4,"St. James Town, Cabbagetown",43.667967,-79.367675,Cranberries,43.667843,-79.369407,Diner


Let's count number of venues in each neighborhood:

In [67]:
# Number of venue rows per neighborhood, kept as a one-column ('Venue') frame.
venues_by_neighborhood = downtown_venues.groupby('Neighborhood')
df_count = venues_by_neighborhood[['Venue']].count()
df_count

Unnamed: 0_level_0,Venue
Neighborhood,Unnamed: 1_level_1
Berczy Park,58
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",14
Central Bay Street,62
Christie,17
Church and Wellesley,77
"Commerce Court, Victoria Hotel",100
"First Canadian Place, Underground city",100
"Garden District, Ryerson",100
"Harbourfront East, Union Station, Toronto Islands",100
"Kensington Market, Chinatown, Grange Park",58


### Let's investigate categories a bit
Unique categories:

In [69]:
# Count the distinct venue categories.
unique_categories = downtown_venues['Venue Category'].unique()
len(unique_categories)

208

Frequency of occurrence of each category in each neighborhood:

In [141]:
# One indicator column per venue category, one row per venue.
dummies = pd.get_dummies(downtown_venues[['Venue Category']], prefix='', prefix_sep='')

# One venue category is itself called "Neighborhood"; give it a trailing
# underscore so it cannot clash with the neighborhood-name column added below.
dummies.columns = ['Neighborhood_' if name == 'Neighborhood' else name for name in dummies.columns]

# Add the neighborhood name of each venue and move it in front of the indicators.
category_columns = list(dummies.columns)
dummies['Neighborhood'] = downtown_venues['Neighborhood']
fixed_columns = ['Neighborhood'] + category_columns
dummies = dummies[fixed_columns]

# Mean of each indicator per neighborhood = share of its venues in that category.
frequency = dummies.groupby('Neighborhood').mean()
frequency.head()

Unnamed: 0_level_0,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,Art Gallery,...,Theme Restaurant,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wings Joint,Women's Store,Yoga Studio
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017241,...,0.0,0.0,0.0,0.017241,0.0,0.0,0.0,0.0,0.0,0.0
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",0.0,0.071429,0.071429,0.142857,0.142857,0.142857,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.016129,0.0,0.0,0.016129,0.0,0.0,0.016129
Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Church and Wellesley,0.012987,0.0,0.0,0.0,0.0,0.0,0.012987,0.0,0.0,0.0,...,0.012987,0.0,0.0,0.0,0.0,0.0,0.0,0.012987,0.0,0.025974


Let's take a look at top 5 categories for each neighborhood:

In [182]:
# Flat copy of the frequency table with 'Neighborhood' as an ordinary first column.
t = frequency.reset_index()

for neighborhood in t['Neighborhood']:
    # Category frequencies of this neighborhood only (every column after the name).
    shares = t.loc[t['Neighborhood'] == neighborhood, t.columns[1:]]
    # Turn the single row into a two-column (Category, Frequency) table.
    temp = shares.T.reset_index()
    temp.columns = ['Category', 'Frequency']
    temp['Frequency'] = temp['Frequency'].astype(float)
    temp = temp.sort_values('Frequency', ascending=False).reset_index(drop=True)
    print(f'\nNeighborhood: {neighborhood}:')
    print(temp.head())


Neighborhood: Berczy Park:
       Category  Frequency
0   Coffee Shop   0.068966
1  Cocktail Bar   0.051724
2          Café   0.034483
3   Cheese Shop   0.034483
4        Bakery   0.034483

Neighborhood: CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport:
           Category  Frequency
0    Airport Lounge   0.142857
1   Airport Service   0.142857
2  Airport Terminal   0.142857
3   Harbor / Marina   0.071429
4           Airport   0.071429

Neighborhood: Central Bay Street:
             Category  Frequency
0         Coffee Shop   0.177419
1      Sandwich Place   0.064516
2  Italian Restaurant   0.064516
3                Café   0.048387
4        Burger Joint   0.032258

Neighborhood: Christie:
        Category  Frequency
0  Grocery Store   0.235294
1           Café   0.176471
2           Park   0.117647
3     Baby Store   0.058824
4     Restaurant   0.058824

Neighborhood: Church and Wellesley:
              Category  Frequency
0   

Form a DataFrame of 10 most common venue categories for each neighborhood:

In [213]:
def ten_most_common_categories(row, num_top_venues=10):
    """Return the labels of the highest-valued entries of one frequency row.

    Parameters
    ----------
    row : pandas.Series
        One row of the per-neighborhood frequency table. Its first entry is
        skipped by position (in this notebook it holds the neighborhood name);
        the remaining entries are sorted by value.
    num_top_venues : int, default 10
        Maximum number of labels to return.

    Returns
    -------
    numpy.ndarray
        Index labels of the remaining entries, ordered from the largest value
        to the smallest and cut to at most `num_top_venues` items.
    """
    row_categories = row.iloc[1:]
    row_categories = row_categories.sort_values(ascending=False)
    return row_categories.index.values[:num_top_venues]

def _ordinal(n):
    """Return n followed by its English ordinal suffix (1 -> '1st', 2 -> '2nd', 11 -> '11th')."""
    if 10 <= n % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
    return f'{n}{suffix}'

# Column titles: 'Neighborhood', then '1st Most Common Venue' ... '10th Most Common Venue'.
# The suffix is computed explicitly instead of relying on a bare `except:` to
# catch the IndexError of a three-element lookup list.
columns = ['Neighborhood'] + [f'{_ordinal(rank)} Most Common Venue' for rank in range(1, 11)]

# Build all rows first and create the frame once, rather than assigning
# row by row into an initially empty frame.
top_categories = [ten_most_common_categories(t.iloc[position]) for position in range(len(t))]
df_most_ten = pd.DataFrame(top_categories, columns=columns[1:])
df_most_ten.insert(0, 'Neighborhood', t['Neighborhood'].values)

### Cluster neighborhoods

In [207]:
from sklearn.cluster import KMeans

In [237]:
clusters = 5
# Feature matrix: the category frequencies only (the leading 'Neighborhood' column is left out).
df_without_neighborhood = t.iloc[:, 1:]

# n_init is passed explicitly because its default differs between scikit-learn
# releases; 10 is the value older releases used implicitly.
kmeans = KMeans(n_clusters=clusters, random_state=0, n_init=10).fit(df_without_neighborhood)

#let's look on top 10 categories and clusters together:
df_most_ten_with_clusters = df_most_ten.copy()
df_most_ten_with_clusters.insert(0, 'Cluster', kmeans.labels_)

fixed_columns = ['Postal Code', 'Borough', 'Latitude', 'Longitude'] + list(df_most_ten_with_clusters.columns)

# Attach postal code, borough and coordinates. The lookup uses the Downtown
# rows (df_dt), which are the rows the venues were fetched for, and keeps one
# row per neighborhood name: joining on a name that occurs more than once in
# the lookup would duplicate rows here and break the one-row-per-cluster-label layout.
downtown_lookup = df_dt.drop_duplicates(subset='Neighborhood').set_index('Neighborhood')
df_most_ten_with_clusters = df_most_ten_with_clusters.join(downtown_lookup, on='Neighborhood')
df_most_ten_with_clusters = df_most_ten_with_clusters[fixed_columns]
df_most_ten_with_clusters.head()

Unnamed: 0,Postal Code,Borough,Latitude,Longitude,Cluster,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M5E,Downtown Toronto,43.644771,-79.373306,1,Berczy Park,Coffee Shop,Cocktail Bar,Bakery,Pub,Cheese Shop,Restaurant,Beer Bar,Seafood Restaurant,Café,Gourmet Shop
1,M5V,Downtown Toronto,43.628947,-79.39442,0,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Airport Terminal,Boat or Ferry,Boutique,Sculpture Garden,Plane,Bar,Harbor / Marina,Airport Food Court
2,M5G,Downtown Toronto,43.657952,-79.387383,4,Central Bay Street,Coffee Shop,Sandwich Place,Italian Restaurant,Café,Burger Joint,Japanese Restaurant,Salad Place,Bubble Tea Shop,Park,Hotel
3,M6G,Downtown Toronto,43.669542,-79.422564,3,Christie,Grocery Store,Café,Park,Candy Store,Italian Restaurant,Diner,Restaurant,Baby Store,Athletics & Sports,Nightclub
4,M4Y,Downtown Toronto,43.66586,-79.38316,1,Church and Wellesley,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Restaurant,Gay Bar,Café,Pub,Smoke Shop,Hotel,Yoga Studio


Visualize resulting clusters:

In [242]:
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster label
colors_array = cm.rainbow(np.linspace(0, 1, clusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(df_most_ten_with_clusters['Latitude'], df_most_ten_with_clusters['Longitude'], df_most_ten_with_clusters['Neighborhood'], df_most_ten_with_clusters['Cluster']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Cluster labels start at 0, so they index the palette directly. The former
    # `cluster-1` sent cluster 0 to the last color through a negative index.
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

### Let's take a look at the contents of each cluster:

In [243]:
# Rows of cluster 0; show column 1 (Borough) plus every column from position 5 on.
cluster_rows = df_most_ten_with_clusters[df_most_ten_with_clusters['Cluster'] == 0]
cluster_rows.iloc[:, [1] + list(range(5, cluster_rows.shape[1]))]

Unnamed: 0,Borough,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har...",Airport Lounge,Airport Service,Airport Terminal,Boat or Ferry,Boutique,Sculpture Garden,Plane,Bar,Harbor / Marina,Airport Food Court


In [244]:
# Rows of cluster 1; show column 1 (Borough) plus every column from position 5 on.
cluster_rows = df_most_ten_with_clusters[df_most_ten_with_clusters['Cluster'] == 1]
cluster_rows.iloc[:, [1] + list(range(5, cluster_rows.shape[1]))]

Unnamed: 0,Borough,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Downtown Toronto,Berczy Park,Coffee Shop,Cocktail Bar,Bakery,Pub,Cheese Shop,Restaurant,Beer Bar,Seafood Restaurant,Café,Gourmet Shop
4,Downtown Toronto,Church and Wellesley,Coffee Shop,Japanese Restaurant,Sushi Restaurant,Restaurant,Gay Bar,Café,Pub,Smoke Shop,Hotel,Yoga Studio
5,Downtown Toronto,"Commerce Court, Victoria Hotel",Coffee Shop,Café,Restaurant,Hotel,American Restaurant,Gym,Deli / Bodega,Japanese Restaurant,Italian Restaurant,Seafood Restaurant
6,Downtown Toronto,"First Canadian Place, Underground city",Coffee Shop,Café,Hotel,Restaurant,Japanese Restaurant,Gym,Asian Restaurant,Seafood Restaurant,Steakhouse,Deli / Bodega
7,Downtown Toronto,"Garden District, Ryerson",Clothing Store,Coffee Shop,Café,Italian Restaurant,Cosmetics Shop,Middle Eastern Restaurant,Bubble Tea Shop,Japanese Restaurant,Restaurant,Diner
8,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",Coffee Shop,Aquarium,Hotel,Café,Scenic Lookout,Brewery,Sporting Goods Shop,Restaurant,Fried Chicken Joint,Italian Restaurant
9,Downtown Toronto,"Kensington Market, Chinatown, Grange Park",Café,Coffee Shop,Dessert Shop,Bakery,Vietnamese Restaurant,Mexican Restaurant,Gaming Cafe,Grocery Store,Bar,Vegetarian / Vegan Restaurant
12,Downtown Toronto,"Richmond, Adelaide, King",Coffee Shop,Café,Restaurant,Gym,Clothing Store,Deli / Bodega,Thai Restaurant,Hotel,American Restaurant,Cosmetics Shop
14,Downtown Toronto,St. James Town,Coffee Shop,Café,Cocktail Bar,Gastropub,American Restaurant,Restaurant,Beer Bar,Gym,Hotel,Italian Restaurant
15,Downtown Toronto,"St. James Town, Cabbagetown",Coffee Shop,Chinese Restaurant,Restaurant,Pub,Café,Italian Restaurant,Pizza Place,Bakery,Butcher,Playground


In [245]:
# Rows of cluster 2; show column 1 (Borough) plus every column from position 5 on.
cluster_rows = df_most_ten_with_clusters[df_most_ten_with_clusters['Cluster'] == 2]
cluster_rows.iloc[:, [1] + list(range(5, cluster_rows.shape[1]))]

Unnamed: 0,Borough,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
13,Downtown Toronto,Rosedale,Park,Playground,Trail,Dance Studio,Eastern European Restaurant,Donut Shop,Doner Restaurant,Dog Run,Distribution Center,Discount Store


In [246]:
# Rows of cluster 3; show column 1 (Borough) plus every column from position 5 on.
cluster_rows = df_most_ten_with_clusters[df_most_ten_with_clusters['Cluster'] == 3]
cluster_rows.iloc[:, [1] + list(range(5, cluster_rows.shape[1]))]

Unnamed: 0,Borough,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Downtown Toronto,Christie,Grocery Store,Café,Park,Candy Store,Italian Restaurant,Diner,Restaurant,Baby Store,Athletics & Sports,Nightclub


In [247]:
# Rows of cluster 4; show column 1 (Borough) plus every column from position 5 on.
cluster_rows = df_most_ten_with_clusters[df_most_ten_with_clusters['Cluster'] == 4]
cluster_rows.iloc[:, [1] + list(range(5, cluster_rows.shape[1]))]

Unnamed: 0,Borough,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Downtown Toronto,Central Bay Street,Coffee Shop,Sandwich Place,Italian Restaurant,Café,Burger Joint,Japanese Restaurant,Salad Place,Bubble Tea Shop,Park,Hotel
10,Downtown Toronto,"Queen's Park, Ontario Provincial Government",Coffee Shop,Hobby Shop,Restaurant,Bank,Bar,Diner,Smoothie Shop,Discount Store,Café,Yoga Studio
11,Downtown Toronto,"Regent Park, Harbourfront",Coffee Shop,Pub,Bakery,Park,Breakfast Spot,Café,Theater,Yoga Studio,Farmers Market,Performing Arts Venue
