# Step 1 - scrape and load table

In [191]:
import os
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.exc import GeocoderTimedOut

# Download the Wikipedia page that lists the Toronto ("M") postal codes.

url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
# Fail here on an HTTP error rather than parsing an error page as if it were the table.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')

In [192]:
# Turn the first table on the page into a dict of column lists, one entry per cell.

data_dict = {'Postcode':[], 'Borough':[], 'Neighborhood':[]}
cols = list(data_dict.keys())
the_table = soup.find('table')

# [1:] skips the header row
for table_row in the_table.find_all('tr')[1:]:
    # the row's children include bare '\n' strings between cells; drop them
    cells = [child for child in table_row if child != '\n']
    for position, cell in enumerate(cells):
        cell_children = list(cell.children)
        if cell_children:
            # use the first child only: either the cell's text or its link tag
            cell_text = cell_children[0].string
        else:
            # the cell has no children at all, so .string is None
            cell_text = cell.string
        data_dict[cols[position]].append(cell_text)

# Load the collected columns into a dataframe.

nbs_df = pd.DataFrame(data_dict, columns=cols)

In [193]:
# drop rows with no borough
# .copy() makes the filtered frame independent, so the assignment below
# modifies it directly instead of triggering a SettingWithCopyWarning.
nbs_df = nbs_df[nbs_df['Borough'] != 'Not assigned'].copy()

# Neighborhoods marked 'Not assigned' take the name of their borough.
# Cells may carry a trailing '\n', hence the substring test instead of equality.
# A missing value (an empty table cell is stored as None) is treated as
# unassigned too, rather than raising TypeError on the `in` test.
bad_rows = nbs_df['Neighborhood'].apply(lambda name: pd.isna(name) or 'assigned' in name)
nbs_df.loc[bad_rows, 'Neighborhood'] = nbs_df.loc[bad_rows, 'Borough']

In [194]:
# Collapse the table to one row per (postcode, borough) pair.

def _join_neighborhoods(group):
    """Join a group's neighborhood names with ', ' and strip the stray '\\n' characters."""
    return ', '.join(group['Neighborhood']).replace('\n','')

grouped = nbs_df.groupby(['Postcode','Borough'])
collapse_nbs = grouped.apply(_join_neighborhoods)

# Back to a flat dataframe: the group keys become columns again and the
# unnamed result column (0) is given its proper name.
final_df = (
    pd.DataFrame(collapse_nbs)
    .reset_index()
    .rename(columns={'Postcode': 'PostalCode', 0: 'Neighborhood'})
)
final_df.shape

(103, 3)

# Step 2 - get coordinates of each neighborhood

I first tried to use Nominatim for this, but it was unable to parse the Canadian postal codes. When it also failed to find the coordinates by neighborhood, I made a switch to the Google version. My API key has been taken out of the notebook for obvious reasons!

In [196]:
from geopy.geocoders import GoogleV3
import json


# start the geocoder
# The API key is read from the GOOGLE_API_KEY environment variable so that it
# is never stored in the notebook; a missing variable raises KeyError here.

geolocator = GoogleV3(api_key=os.environ['GOOGLE_API_KEY'])

In [199]:
# make some blank columns
# Filling with float('nan') gives both columns a float dtype up front. An
# argument-less pd.Series() carries no explicit dtype (object in pandas 2.x),
# which would leave the coordinates in object columns and break the numeric
# describe() check later in the notebook.

final_df['Latitude'] = float('nan')
final_df['Longitude'] = float('nan')

def get_location(string, altstring, max_attempts=10):
    '''
    Geocode `string`, falling back to `altstring` when a request fails.

    Since there are sometimes timeouts or empty results with the coordinate
    requests, this function keeps asking until a valid result comes in,
    alternating between the two query strings (primary first).

    Parameters
    ----------
    string : str
        The primary query sent to the module-level `geolocator`.
    altstring : str
        The alternative query, tried whenever the previous attempt failed.
    max_attempts : int, optional
        Total number of requests to make before giving up (default 10).

    Returns
    -------
    The first non-None location object returned by `geolocator.geocode`.

    Raises
    ------
    RuntimeError
        If no attempt produced a location within `max_attempts` requests.
    '''
    queries = (string, altstring)
    for attempt in range(max_attempts):
        # even attempts use the primary query, odd attempts the alternative
        query = queries[attempt % 2]
        print('Getting coords for', query)
        try:
            # only the query is passed: a second positional argument would be
            # taken by geocode() as an option, not as a fallback query
            location = geolocator.geocode(query)
        except GeocoderTimedOut:
            print('Geocoder Timed Out! Trying again...')
            continue
        if location is None:
            print('Returned none! Trying again...')
            continue
        return location
    raise RuntimeError(
        'No coordinates found for {!r} or {!r} after {} attempts'.format(
            string, altstring, max_attempts))

# Geocode every postal code and store the coordinates in final_df.
# NOTE: this loop issues live geocoding requests every time the cell runs.
# Sometimes the API can't locate something based on the postal code,
# so the neighborhood name is used for a second search.

lookups = list(zip(final_df.index, final_df['PostalCode'], final_df['Neighborhood']))
for i, postal_code, neighborhoods in lookups:
    # primary query: the postal code
    search_string = postal_code + ' 8T0, Canada'
    # fallback query: the first neighborhood listed for this postal code
    alt_string = neighborhoods.split(',')[0] + ', Toronto, Canada'
    location = get_location(search_string, alt_string)
    final_df.loc[i, 'Latitude'] = location.latitude
    final_df.loc[i, 'Longitude'] = location.longitude

final_df



Getting coords for M1B 8T0, Canada
Getting coords for M1C 8T0, Canada
Geocoder Timed Out! Trying again...
Getting coords for Highland Creek, Toronto, Canada
Getting coords for M1E 8T0, Canada
Getting coords for M1G 8T0, Canada
Returned none! Trying again...
Getting coords for Woburn, Toronto, Canada
Getting coords for M1H 8T0, Canada
Getting coords for M1J 8T0, Canada
Getting coords for M1K 8T0, Canada
Getting coords for M1L 8T0, Canada
Getting coords for M1M 8T0, Canada
Getting coords for M1N 8T0, Canada
Getting coords for M1P 8T0, Canada
Getting coords for M1R 8T0, Canada
Getting coords for M1S 8T0, Canada
Returned none! Trying again...
Getting coords for Agincourt, Toronto, Canada
Getting coords for M1T 8T0, Canada
Returned none! Trying again...
Getting coords for Clarks Corners, Toronto, Canada
Getting coords for M1V 8T0, Canada
Getting coords for M1W 8T0, Canada
Getting coords for M1X 8T0, Canada
Getting coords for M2H 8T0, Canada
Getting coords for M2J 8T0, Canada
Getting coords 

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.790121,-79.173392
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.776470,-79.231728
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [200]:
# Sanity check: summary statistics of the geocoded coordinates.

final_df.loc[:, ['Latitude', 'Longitude']].describe()

Unnamed: 0,Latitude,Longitude
count,103.0,103.0
mean,43.704513,-79.397188
std,0.05246,0.096227
min,43.602414,-79.615819
25%,43.661372,-79.464763
50%,43.696948,-79.388516
75%,43.74532,-79.340923
max,43.836125,-79.173392


# Step 3 - Explore and cluster!

Start by simply making a map of the neighborhoods. Then, we'll do some clustering.

In [201]:
import folium

# Map centre: downtown Toronto.
toronto_lat = 43.6532
toronto_lng = -79.3832

my_map = folium.Map(location=[toronto_lat, toronto_lng], zoom_start=10)

# One marker per postal code, labelled with the first neighborhood in its list.
for i in final_df.index:
    lat = final_df.loc[i, 'Latitude']
    lng = final_df.loc[i, 'Longitude']
    folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=final_df.loc[i, 'Neighborhood'].split(',')[0],
        color='blue',
        fill=True,
        # fill_color is folium's documented keyword (and the spelling used by
        # the cluster map later in this notebook); the camelCase `fillColor`
        # is not honoured by every folium version.
        fill_color='white',
    ).add_to(my_map)

my_map

I'll cluster all the neighborhoods. The steps are:

1. Read in the API key
2. Query the foursquare servers for venue data
3. Form a table whose index is the various zipcodes and whose columns are the venue categories. For each venue, add 1 to the appropriate zipcode-category entry.
4. Run k means clustering on the zip code's venue category counts, plot the results on a map, and examine the top ten kinds of venue for each cluster

Results: I'm a bit disappointed to see that the "Uncategorized" and "Building" classifications pop up quite a bit. It's hard to interpret the results when such vague categories are this popular and we don't know what sorts of places get labelled as uncategorized. Things like 'offices' and 'spas' come up pretty often. I'm afraid that the classifier is actually distinguishing neighborhoods based only on the relative counts of these few big categories.

At any rate, running it with k=5 gives a couple of distinguishable neighborhood types. One type has churches, playgrounds, parks, lots of apartments, and bus lines among its top 10, so I assume that families and the like live there. Another area has tech startups, bookstores, and delis, so that's probably where I'd like to live!





In [202]:
# Load the Foursquare credentials from a local JSON file.

with open('foursquare_api_key.json') as key_file:
    creds = json.load(key_file)

# The file must provide the client id, the client secret and the API version string.
client_id, client_secret, v = creds['client_id'], creds['client_secret'], creds['v']

In [240]:
# read in data for each nbd
# nbd_info maps each postal code to the decoded Foursquare response fetched for its coordinates

nbd_info = {}

def askfoursquare(url, max_retries=5, retry_delay=2):
    """Fetch `url` from Foursquare and return the decoded JSON response.

    A response whose ``meta.code`` is not 200 is printed and the request is
    retried after a short pause. The number of retries is bounded, so a
    permanent failure (bad credentials, exhausted quota, ...) ends with an
    error instead of recursing without limit.

    Parameters
    ----------
    url : str
        Full request URL, including credentials and version parameters.
    max_retries : int, optional
        How many times to retry after the first failed request (default 5).
    retry_delay : float, optional
        Seconds to wait between attempts (default 2).

    Returns
    -------
    dict
        The decoded JSON body of the first response with ``meta.code == 200``.

    Raises
    ------
    RuntimeError
        If every attempt returned a non-200 ``meta.code``.
    """
    for attempt in range(max_retries + 1):
        result = requests.get(url)
        json_dict = json.loads(result.content.decode('utf-8'))
        if json_dict['meta']['code'] == 200:
            return json_dict
        print(json_dict['meta'])
        # no point sleeping after the final attempt
        if attempt < max_retries:
            sleep(retry_delay)
    raise RuntimeError(
        'Foursquare request failed after {} attempts: {}'.format(
            max_retries + 1, json_dict['meta']))

# Query the venue search endpoint once per postal code, keyed by that postal code.
url_template = 'https://api.foursquare.com/v2/venues/search?ll={},{}&client_id={}&client_secret={}&v={}'

for i in final_df.index:
    lat = final_df.at[i, 'Latitude']
    lng = final_df.at[i, 'Longitude']
    postal_code = final_df.at[i, 'PostalCode']
    url = url_template.format(lat, lng, client_id, client_secret, v)
    nbd_info[postal_code] = askfoursquare(url)

In [258]:
# Parse the JSON output for each postal code and count its venues per category.

from collections import Counter

category_counts = []
for nb_zip, payload in nbd_info.items():
    nbd_cat_counter = Counter()

    for venue in payload['response']['venues']:
        venue_categories = venue['categories']
        if venue_categories:
            # only the first listed category of a venue is counted
            first_category = venue_categories[0]
            nbd_cat_counter[first_category['name']] += 1
            if not first_category['primary']:
                print('Non-primary venue counted')
        else:
            nbd_cat_counter['Uncategorized'] += 1
    category_counts.append(dict(nbd_cat_counter))

# Build the table in one step (rows = postal codes, columns = categories,
# NaN where a postal code has no venue of that category). Growing a frame
# with DataFrame.append inside the loop is quadratic, and that method was
# removed in pandas 2.0.
category_table = pd.DataFrame(category_counts, index=list(nbd_info.keys()))

In [265]:
# A missing category means "no such venue here": fill with 0 and store the counts as ints.

category_table = category_table.fillna(0).astype(int)

# Sanity check: total number of venues counted per postal code.
venue_count = category_table.sum(axis=1)

venue_count
# this shows 30 for every zipcode, so presumably the default limit is 30?

M1B    30
M1C    30
M1E    30
M1G    30
M1H    30
M1J    30
M1K    30
M1L    30
M1M    30
M1N    30
M1P    30
M1R    30
M1S    30
M1T    30
M1V    30
M1W    30
M1X    30
M2H    30
M2J    30
M2K    30
M2L    30
M2M    30
M2N    30
M2P    30
M2R    30
M3A    30
M3B    30
M3C    30
M3H    30
M3J    30
       ..
M6C    30
M6E    30
M6G    30
M6H    30
M6J    30
M6K    30
M6L    30
M6M    30
M6N    30
M6P    30
M6R    30
M6S    30
M7A    30
M7R    30
M7Y    30
M8V    30
M8W    30
M8X    30
M8Y    30
M8Z    30
M9A    30
M9B    30
M9C    30
M9L    30
M9M    30
M9N    30
M9P    30
M9R    30
M9V    30
M9W    30
Length: 103, dtype: int64

In [354]:
# now let's do some clustering

from sklearn.cluster import KMeans
from matplotlib import cm
from matplotlib.colors import rgb2hex


    
def _legend_html(colors):
    """Return the HTML of a fixed-position map legend with one row per cluster colour."""
    rows = ''.join(
        '''
            Label {} <i class='fa fa-circle fa-1x' style="color:{}"></i><br>
            '''.format(label, color)
        for label, color in enumerate(colors)
    )
    # height is left to auto so the box grows with the number of labels
    return '''
        <div style="position:fixed;
            bottom: 50px;
            left: 50px;
            width: 100px;
            height: auto;
            border: 2px solid grey;
            z-index: 9999;
            font-size: 14px;
            background-color: white">
        ''' + rows + '</div>'


def cluster(k = 5, random_state = 0):
    """Cluster the postal codes by venue-category counts and draw them on a map.

    Runs k-means on the module-level `category_table`, writes the resulting
    cluster id into a 'Label' column of both `category_table` and `final_df`,
    and returns a folium map with one marker per row of `final_df`, filled
    with its cluster's colour, plus a colour legend.

    Parameters
    ----------
    k : int, optional
        Number of clusters (default 5).
    random_state : int or None, optional
        Seed passed to KMeans so repeated runs give the same labels
        (default 0); pass None for a different random initialisation per run.

    Returns
    -------
    folium.Map
    """
    # Fit on the venue counts only: a 'Label' column left behind by an earlier
    # call must not be fed back into the model as if it were a feature.
    features = category_table.drop(columns='Label', errors='ignore')
    clusterer = KMeans(n_clusters=k, random_state=random_state)
    clusterer.fit(features)

    # NOTE(review): labels are assigned by position, which assumes final_df and
    # category_table list the postal codes in the same order - confirm.
    final_df['Label'] = clusterer.labels_
    category_table['Label'] = clusterer.labels_

    # one hex colour per cluster id, sampled from the 'RdBu' colormap
    cmap = cm.get_cmap('RdBu', k)
    colors = [rgb2hex(cmap(label)) for label in range(k)]

    # create map and add the circle markers
    my_map = folium.Map(location=[toronto_lat, toronto_lng], zoom_start=10)

    for i in final_df.index:
        lat = final_df.loc[i, 'Latitude']
        lng = final_df.loc[i, 'Longitude']
        folium.CircleMarker(
            location=[lat, lng],
            radius=5,
            popup=final_df.loc[i, 'Neighborhood'].split(',')[0],
            color='black',
            fill=True,
            fill_color=colors[final_df.loc[i, 'Label']],
            fill_opacity=1,
            weight=1,
        ).add_to(my_map)

    # attach the legend as raw HTML on top of the map
    my_map.get_root().html.add_child(folium.Element(_legend_html(colors)))

    return my_map
    

the_map = cluster(5)

# Mean venue count per category within each cluster; print every cluster's
# ten most common categories.
agg_table = category_table.groupby('Label').mean()
for label, row in agg_table.iterrows():
    row = row.sort_values(ascending=False)
    print(row.head(10))
    print('')
the_map

# thanks to https://medium.com/@bobhaffner/creating-a-legend-for-a-folium-map-c1e0ffc34373
# for the map legend code

Uncategorized           1.615385
Office                  1.423077
Bank                    1.153846
Coffee Shop             0.961538
Spa                     0.730769
Pharmacy                0.692308
Building                0.653846
Automotive Shop         0.653846
Dentist's Office        0.615385
Fast Food Restaurant    0.538462
Name: 0, dtype: float64

Office                                      1.222222
Uncategorized                               1.194444
Coffee Shop                                 0.888889
Building                                    0.722222
Park                                        0.666667
Clothing Store                              0.611111
Medical Center                              0.527778
Residential Building (Apartment / Condo)    0.500000
Pizza Place                                 0.500000
Spa                                         0.416667
Name: 1, dtype: float64

Office                                      7.666667
Building                             