In [1]:
import json
from io import StringIO

import folium
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans

# Load data

1. Get html content using `requests.get`
2. Parse the html using `BeautifulSoup`
3. Load the parsed table into pandas dataframe using `pd.read_html`
4. Drop `NaN` values in column Borough
5. Replace `NaN` values in column Neighbourhood with the value from Borough (ZERO cases found in the dataset)
6. Merge two neighbourhoods with the same Postal Code (ZERO case found for the dataset)
7. Reorder the dataset

In [2]:
# auxiliary function: merge rows that share a postal code
# (no such rows were found in the current dataset)
def merge_duplicate(df):
    """
    Find all the rows in [df] with same 'Postal Code' value
        then join their 'Neighbourhood' values, inplace

    The first row of each group (in row order) is kept and receives the
    comma-joined neighbourhood names; the other rows of the group are
    dropped. Rows are addressed by index *label*, so the frame does not
    need a default RangeIndex. Index labels are assumed to be unique.

    Args:
        df: DataFrame with 'Postal Code' and 'Neighbourhood' columns.
            It is modified in place.

    Returns:
        None

    Example:
        >>> test = pd.DataFrame({ \
                'Postal Code': ('M1A', 'M1A', 'M2A', 'M3A', 'M1A', 'M2A'), \
                'Neighbourhood': ('A', 'B', 'C', 'D', 'E', 'F'), \
            })
        >>> merge_duplicate(test)
        >>> test.loc[test['Postal Code'] == 'M1A']['Neighbourhood'].values[0]
        'A, B, E'
        >>> test.loc[test['Postal Code'] == 'M2A']['Neighbourhood'].values[0]
        'C, F'
    """
    codes = df['Postal Code']
    # keep=False flags every member of a duplicated group, not only the repeats
    duplicated_codes = codes[codes.duplicated(keep=False)].unique()

    for code in duplicated_codes:
        # Recompute on the current frame: earlier iterations dropped rows.
        labels = df.index[df['Postal Code'] == code]
        merged_names = ', '.join(df.loc[labels, 'Neighbourhood'])
        # Single .loc assignment avoids chained indexing, which may write
        # to a temporary copy instead of df.
        df.loc[labels[0], 'Neighbourhood'] = merged_names
        df.drop(labels[1:], inplace=True)


# step 1
url_to_parse = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(url_to_parse, timeout=30)
page.raise_for_status()  # fail loudly instead of parsing an HTTP error page
html = page.text

# step 2
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed and emits a warning.
parsed = BeautifulSoup(html, 'html.parser')
table_html = str(parsed.table)

# step 3
# Wrap the markup in StringIO: passing literal HTML to read_html is deprecated.
df = pd.read_html(StringIO(table_html))[0]

# step 4
# Only a missing Borough disqualifies a row; a missing Neighbourhood is
# repaired in step 5 and must not be dropped here.
df = (
    df.replace({'Borough': {'Not assigned': np.nan}})
      .dropna(subset=['Borough'])
)

# step 5
# Treat both NaN and the 'Not assigned' placeholder (any letter case) as
# missing, and fall back to the Borough name.
neighbourhood = df['Neighbourhood']
unassigned = neighbourhood.isna() | neighbourhood.str.strip().str.lower().eq('not assigned')
df['Neighbourhood'] = neighbourhood.mask(unassigned, df['Borough'])

# step 6
# Rows were dropped in step 4, so renumber first: index labels and row
# positions then agree while duplicates are merged.
df = df.reset_index(drop=True)
merge_duplicate(df)

# step 7
df.reset_index(inplace=True, drop=True)
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


# Append Geographic Data

## Download the data

shame on Google

In [3]:
# CSV file with one latitude/longitude pair per postal code.
geo_data_url = 'https://cocl.us/Geospatial_data'
df_geo = pd.read_csv(filepath_or_buffer=geo_data_url)

## Reorder the new data frame

The rows of the `df_geo` table are re-ordered to match the row order of `df`.

In [4]:
# Keep the first row per postal code, mirroring a "first match wins" lookup.
geo_unique = df_geo.drop_duplicates(subset='Postal Code', keep='first')

# Fail with an explicit message when a postal code has no coordinates.
missing_codes = sorted(set(df['Postal Code']) - set(geo_unique['Postal Code']))
if missing_codes:
    raise KeyError(
        'No geographic data for postal codes: {}'.format(missing_codes)
    )

# A left merge preserves the row order of `df` and returns a fresh 0..n-1
# index, so the result lines up row-by-row with `df`.
df_geo_ordered = df[['Postal Code']].merge(geo_unique, on='Postal Code', how='left')
df_geo_ordered.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M3A,43.753259,-79.329656
1,M4A,43.725882,-79.315572
2,M5A,43.65426,-79.360636
3,M6A,43.718518,-79.464763
4,M7A,43.662301,-79.389494


## Concatenate the latitude and longitude

In [5]:
# Guard against re-running the cell: the coordinates are appended only once.
already_has_coordinates = 'Latitude' in df.columns
if not already_has_coordinates:
    # The postal code is already in `df`; keep only the coordinate columns.
    coordinates = df_geo_ordered.drop(columns='Postal Code')
    df = pd.concat([df, coordinates], axis=1, ignore_index=False)
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


# Clustering

## Visualise the neighbourhoods without labels

In [6]:
# Centre the map on the mean coordinate of all postal codes.
location = df['Latitude'].mean(), df['Longitude'].mean()
map_toronto = folium.Map(location=location, zoom_start=11)

# Every postal code gets the same unlabelled marker.
marker_style = dict(color='tomato', radius=4, fill_color='white', parse_html=False)
for lat, lng in df[['Latitude', 'Longitude']].itertuples(index=False, name=None):
    folium.CircleMarker([lat, lng], **marker_style).add_to(map_toronto)
map_toronto

## Explore each neighbourhood and obtain features


1. Load my credential data for the Foursquare API
2. Make the request to get the nearby venues of each neighbourhood
3. Use the count of each venue category as the feature
4. Cluster the features and label each neighbourhood
5. Append the cluster label information to the original dataframe
6. Visualise

In [7]:
# Step 1.
with open('.credential.json', 'r') as f:
    cred = json.load(f)

# Step 2.
radius = 1000
limit = 100
explore_url = 'https://api.foursquare.com/v2/venues/explore'
feature_dfs = []
for code, lat, lng in zip(df['Postal Code'], df['Latitude'], df['Longitude']):
    # Let requests build and escape the query string.
    params = {
        'client_id': cred['client_id'],
        'client_secret': cred['client_secret'],
        'v': cred['version'],
        'll': '{},{}'.format(lat, lng),
        'radius': radius,
        'limit': limit,
    }
    payload = requests.get(explore_url, params=params, timeout=30).json()
    response = payload.get('response', {})
    try:
        results = response['groups'][0]['items']
    except (KeyError, IndexError, TypeError):
        # Skip this postal code. Falling through would reuse the previous
        # iteration's `results` (or hit a NameError on the first iteration)
        # and attribute another neighbourhood's venues to this code.
        print('No venues returned for {}:'.format(code), response)
        continue
    # categories of nearby venues; venues without any category are ignored
    venue_cat = [
        v['venue']['categories'][0]['name']
        for v in results
        if v['venue']['categories']
    ]
    temp_df = pd.DataFrame({'Category': venue_cat, 'Postal Code': code})
    feature_dfs.append(temp_df)

Step 3

In [8]:
# Stack the per-postal-code venue tables into one long table.
df_feature = pd.concat(feature_dfs)
print(df_feature.sample(3))

# One indicator column per venue category, summed per postal code, which
# gives a count of each category around every postal code.
df_feature = (
    pd.get_dummies(df_feature, columns=['Category'], prefix='', prefix_sep='')
    .groupby('Postal Code')
    .sum()
    .reset_index()
)
print("\nThe shape of the feature table is", df_feature.shape)
df_feature.head()

                 Category Postal Code
55            Coffee Shop         M4M
48       Asian Restaurant         M5L
88  Portuguese Restaurant         M5B

The shape of the feature table is (102, 328)


Unnamed: 0,Postal Code,Accessories Store,Afghan Restaurant,African Restaurant,Airport,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Aquarium,...,Video Store,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio,Zoo
0,M1B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,M1C,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,M1E,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,M1G,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,M1H,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0


Step 4

I chose 3 clusters because the result seems reasonable.

In [9]:
cluster_num = 3
# K-means starts from random centroids. A fixed seed keeps the cluster ids,
# and therefore the marker colours discussed below, stable across re-runs.
# n_init is pinned explicitly because its default changed between
# scikit-learn releases.
model = KMeans(n_clusters=cluster_num, n_init=10, random_state=0)
model.fit(df_feature.drop('Postal Code', axis=1))
labels = model.labels_
centres = model.cluster_centers_

Step 5

I re-ordered the labels so that their order matches the row order of the `df` data frame

In [10]:
# Build a postal code -> cluster label lookup once, remembering any postal
# code that appears more than once in the feature table.
label_by_code = {}
duplicated_codes = set()
for code, label in zip(df_feature['Postal Code'].values, labels):
    if code in label_by_code:
        duplicated_codes.add(code)
    label_by_code[code] = label

# -1 marks postal codes that have no row in the feature table.
labels_ordered = np.full(df.shape[0], -1.0)

# enumerate() yields row positions, so the array is filled correctly even
# when `df` does not carry a default 0..n-1 index.
for position, code in enumerate(df['Postal Code']):
    if code in duplicated_codes:
        raise RuntimeError("Invalid duplicating Neighbourhood name found")
    if code in label_by_code:
        labels_ordered[position] = label_by_code[code]

Step 6


The most satisfying cluster number I found is 3. The clustering result shown here is nicely separated in space.

- The blue markers around the harbour seem to be related to commercial & entertainment areas.
- The teal/green markers seem to be related to working environments.
- The orange markers seem to be far away from the town centre and might be related to housing areas.
- There is one black dot without nearby venues.

In [11]:
# Palette indexed by cluster label.
colors = ['teal', 'tomato', 'royalblue', 'crimson', 'green', 'orange', 'darkviolet']

# Centre the map on the mean coordinate of all postal codes.
location = df['Latitude'].mean(), df['Longitude'].mean()
map_toronto = folium.Map(location=location, zoom_start=11)

for label, lat, lng in zip(labels_ordered, df['Latitude'], df['Longitude']):
    # Negative labels get a black marker instead of a palette colour.
    color = colors[int(label)] if label >= 0 else 'black'
    folium.CircleMarker(
        [lat, lng],
        color=color,
        radius=8,
        fill_color='white',
        parse_html=False,
    ).add_to(map_toronto)

map_toronto