Determine similar places of interest by locations (e.g. Camden, Dartford, etc), venue names (e.g. Cafe Mexicana), venue category

## Imports

In [1]:
# import libraries
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl
import pandas as pd
import re
import requests
import numpy as np
from geopy.geocoders import Nominatim
import folium
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import io

## Data collection via web scraping

In [2]:
def getDataFromURL(url = 'https://en.wikipedia.org/wiki/List_of_areas_of_London'):
    
    # Ignore SSL certificate errors
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    # Ask for url, open it and parse html
    url = "https://en.wikipedia.org/wiki/List_of_areas_of_London"
    html = urllib.request.urlopen(url, context=ctx).read()
    soup = BeautifulSoup(html, 'html.parser')

    # find table row tags
    table = soup.find('table', {"class": "wikitable sortable"})
    table_rows = table.find_all('tr')
    table_headers = [str(header.text).replace(u'\xa0', u' ').strip('\n') for header in table.find_all('th')]
    
    # extract rows to dataframe
    data = []
    for tr in table_rows:
        td = tr.find_all('td')
        row = [tr.text.strip() for tr in td if tr.text.strip()]
        if row:
            data.append(row)
    
    df = pd.DataFrame(data, columns=table_headers)
    
    return df

In [3]:
df = getDataFromURL(url = 'https://en.wikipedia.org/wiki/List_of_areas_of_London')
df.head(10)

Unnamed: 0,Location,London borough,Post town,Postcode district,Dial code,OS grid ref
0,Abbey Wood,"Bexley, Greenwich [7]",LONDON,SE2,20,TQ465785
1,Acton,"Ealing, Hammersmith and Fulham[8]",LONDON,"W3, W4",20,TQ205805
2,Addington,Croydon[8],CROYDON,CR0,20,TQ375645
3,Addiscombe,Croydon[8],CROYDON,CR0,20,TQ345665
4,Albany Park,Bexley,"BEXLEY, SIDCUP","DA5, DA14",20,TQ478728
5,Aldborough Hatch,Redbridge[9],ILFORD,IG2,20,TQ455895
6,Aldgate,City[10],LONDON,EC3,20,TQ334813
7,Aldwych,Westminster[10],LONDON,WC2,20,TQ307810
8,Alperton,Brent[11],WEMBLEY,HA0,20,TQ185835
9,Anerley,Bromley[11],LONDON,SE20,20,TQ345695


Next, let's remove the footnote symbols contained within brackets in the "London borough" column

In [4]:
df["London borough"] = df["London borough"].apply(lambda x: re.sub(r'\[[^\[]*\]', '', x))
df.head(10)

Unnamed: 0,Location,London borough,Post town,Postcode district,Dial code,OS grid ref
0,Abbey Wood,"Bexley, Greenwich",LONDON,SE2,20,TQ465785
1,Acton,"Ealing, Hammersmith and Fulham",LONDON,"W3, W4",20,TQ205805
2,Addington,Croydon,CROYDON,CR0,20,TQ375645
3,Addiscombe,Croydon,CROYDON,CR0,20,TQ345665
4,Albany Park,Bexley,"BEXLEY, SIDCUP","DA5, DA14",20,TQ478728
5,Aldborough Hatch,Redbridge,ILFORD,IG2,20,TQ455895
6,Aldgate,City,LONDON,EC3,20,TQ334813
7,Aldwych,Westminster,LONDON,WC2,20,TQ307810
8,Alperton,Brent,WEMBLEY,HA0,20,TQ185835
9,Anerley,Bromley,LONDON,SE20,20,TQ345695


Let's now further clean the data by removing unneccessary columns and filtering out the rows whose "Post town" is not London.

In [5]:
# Keep only the datapoints where the post town is London and drop unneccessary columns
try:
    df = df[df["Post town"] == "LONDON"].reset_index()
    df.drop(columns=["Post town", "Dial code", "OS grid ref"], inplace = True)
except:
    print("The dataframe does not contain one or more of the following columns: Post town, Dial code, OS grid ref")
    raise

In [6]:
df.head()

Unnamed: 0,index,Location,London borough,Postcode district
0,0,Abbey Wood,"Bexley, Greenwich",SE2
1,1,Acton,"Ealing, Hammersmith and Fulham","W3, W4"
2,6,Aldgate,City,EC3
3,7,Aldwych,Westminster,WC2
4,9,Anerley,Bromley,SE20


In [7]:
print("The dataframe has a length of " + str(len(df)) + " but there are " + str(len(df["Postcode district"].unique())) + 
      " unique post code values")

The dataframe has a length of 299 but there are 151 unique post code values


To solve this issue, we will combine all locations with the same post code into one row. 

In [8]:
df = df.groupby("Postcode district").agg({"Location": ", ".join, "London borough": "first"}).reset_index()
print(df.shape)
df.head()

(151, 3)


Unnamed: 0,Postcode district,Location,London borough
0,DA5,Dartford,Dartford
1,E1,"Mile End, Ratcliff, Shadwell, Spitalfields, St...",Tower Hamlets
2,E10,Lea Bridge,Hackney
3,"E10, E15",Leyton,Waltham Forest
4,E11,"Cann Hall, Leytonstone, Snaresbrook, Wanstead",Waltham Forest


## Mapping the postcodes to their respective coordinates

To determine the coordinates of the postcodes, we will use the *Outcode Area Postcodes* dataset taken from the following [link](https://www.freemaptools.com/download-uk-postcode-lat-lng.htm)

In [9]:
# import the csv file from the given link into a dataframe
url="https://www.freemaptools.com/download/outcode-postcodes/postcode-outcodes.csv"
url_content = requests.get(url).content
uk_postcodes_data =pd.read_csv(io.StringIO(url_content.decode('utf-8')), index_col = False)

In [10]:
# drop any unneccesary columns and set the postcode as the index of the dataframe
uk_postcodes_data.drop(columns = "id", inplace=True)
uk_postcodes_data = uk_postcodes_data.set_index('postcode')
print("The dataframe has the following shape: " +str(uk_postcodes_data.shape) + " and the columns are of the following types.\n")
print(uk_postcodes_data.dtypes)
uk_postcodes_data.head()

The dataframe has the following shape: (3003, 2) and the columns are of the following types.

latitude     float64
longitude    float64
dtype: object


Unnamed: 0_level_0,latitude,longitude
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
AB10,57.13514,-2.11731
AB11,57.13875,-2.09089
AB12,57.101,-2.1106
AB13,57.10801,-2.23776
AB14,57.10076,-2.27073


One thing to notice is that some postcodes return a value of 0 in terms of longitude and latitude which is possibly due to missing data (see below). This is impossible and will be treated later on.

In [11]:
nb_invalid = len(uk_postcodes_data[(uk_postcodes_data["latitude"] == 0.0) & (uk_postcodes_data["longitude"] == 0.0)])

print(str(nb_invalid) + " postcodes lead to a 0 0 coordinate, which is impossible as it's in the Atlantic Ocean.")
uk_postcodes_data[(uk_postcodes_data["latitude"] == 0.0) & (uk_postcodes_data["longitude"] == 0.0)].head()

28 postcodes lead to a 0 0 coordinate, which is impossible as it's in the Atlantic Ocean.


Unnamed: 0_level_0,latitude,longitude
postcode,Unnamed: 1_level_1,Unnamed: 2_level_1
WF90,0.0,0.0
N81,0.0,0.0
SW95,0.0,0.0
RH77,0.0,0.0
WD99,0.0,0.0


Next we will determine the coordinates for the postcodes through the following function.

In [12]:
def mapPostcodeToCoordinates(data, postcode, outputColName = 'latitude'):
    
    '''
    This function returns either the latitude or the longitude of a 
    provided postcode. 
    
    In the case where more than one postcode is found, this function 
    will return the average value
    '''
    
    coord = 0
    codes = postcode.split(', ')
    
    for code in codes:
        try:
            coord += data[outputColName].loc[code]
        except:
            coord += 0.0
        
    return coord/len(codes)

In [13]:
# obtain the latitudes and longitudes corresponding to the postcodes
df["latitude"] = df["Postcode district"].apply(lambda x: mapPostcodeToCoordinates(uk_postcodes_data, x,
                                                                                  outputColName = 'latitude'))
df["longitude"] = df["Postcode district"].apply(lambda x: mapPostcodeToCoordinates(uk_postcodes_data, x,
                                                                                  outputColName = 'longitude'))

In [14]:
df.head()

Unnamed: 0,Postcode district,Location,London borough,latitude,longitude
0,DA5,Dartford,Dartford,51.44033,0.14698
1,E1,"Mile End, Ratcliff, Shadwell, Spitalfields, St...",Tower Hamlets,51.51766,-0.05841
2,E10,Lea Bridge,Hackney,51.56814,-0.01153
3,"E10, E15",Leyton,Waltham Forest,51.553625,-0.00423
4,E11,"Cann Hall, Leytonstone, Snaresbrook, Wanstead",Waltham Forest,51.56769,0.01443


After this stage, it would seem that we are lucky in the sense that no points lead us to the middle of the Atlantic Ocean! 

However, to make this code more robust, we will write a line of code to treat this in case it arises later on. In this case, we will simply drop such rows. 

In [15]:
len(df[(df["latitude"] == 0.0) & (df["longitude"] == 0.0)])

0

In [16]:
# remove any eventual rows that would lead us to the middle of the Atlantic Ocean
df = df[(df.latitude != 0.0) & (df.longitude != 0.0)]

## Map of London with its various locations

Let's now display a map of London with the locations contained within our dataframe.

In [17]:
def createMapWithLocations(df, address = 'London, UK'):
    
    geolocator = Nominatim(user_agent="ny_explorer")
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    print('The geograpical coordinates of {} are {}, {}.'.format(address, latitude, longitude))
    
    # create map using latitude and longitude values
    map_address = folium.Map(location=[latitude, longitude], zoom_start=11)

    # add markers to map
    for lat, lng, label in zip(df['latitude'], df['longitude'], df['Location']):
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=label,
            color='blue',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            parse_html=False).add_to(map_address)  
        
    return map_address, latitude, longitude

In [18]:
map_london, latitude, longitude = createMapWithLocations(df, address = 'London, UK')
map_london

The geograpical coordinates of London, UK are 51.5073219, -0.1276474.


In [19]:
CLIENT_ID = 'FYW1ZHUDL5ACFOR1POXG4I2GUGLETLLZPJCLMZAGORJQO4RO' # your Foursquare ID
CLIENT_SECRET = 'O0PWLL4R41CU0VVJKAREDCL4DR1VMRBJ24QDCV5YIAK5A021' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # limit of number of venues returned by Foursquare API

In [20]:
# function to obtain the venues within a given radius of a given location
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Location', 
                  'Location Latitude', 
                  'Location Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [21]:
london_venues = getNearbyVenues(names=df['Location'],
                                   latitudes=df['latitude'],
                                   longitudes=df['longitude']
                                  )

# Example of search by location name

In [22]:
london_venues[london_venues["Location"].str.contains("Camden")].head()

Unnamed: 0,Location,Location Latitude,Location Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
1664,"Camden Town, Chalk Farm, Somerstown",51.53252,-0.14286,Lost Boys Pizza,51.533883,-0.13808,Pizza Place
1665,"Camden Town, Chalk Farm, Somerstown",51.53252,-0.14286,Asakusa,51.534107,-0.138225,Japanese Restaurant
1666,"Camden Town, Chalk Farm, Somerstown",51.53252,-0.14286,La Patagonia,51.535337,-0.139257,Argentinian Restaurant
1667,"Camden Town, Chalk Farm, Somerstown",51.53252,-0.14286,Ferreira Delicatessen,51.536488,-0.143739,Deli / Bodega
1668,"Camden Town, Chalk Farm, Somerstown",51.53252,-0.14286,Cafe Mexicana,51.535685,-0.139336,Mexican Restaurant


In [23]:
# one hot encoding for the different venue categories
london_onehot = pd.get_dummies(london_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
london_onehot['Location'] = london_venues['Location'] 

# move location column to the first column
col_order = ['Location'] + [col for col in london_onehot.columns if col != 'Location']
london_onehot = london_onehot[col_order]

london_onehot.head()

Unnamed: 0,Location,Afghan Restaurant,African Restaurant,American Restaurant,Antique Shop,Arcade,Arepa Restaurant,Argentinian Restaurant,Art Gallery,Art Museum,...,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo Exhibit
0,Dartford,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Dartford,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Dartford,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Dartford,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Dartford,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
london_grouped = london_onehot.groupby('Location').mean().reset_index()
london_grouped.head()

Unnamed: 0,Location,Afghan Restaurant,African Restaurant,American Restaurant,Antique Shop,Arcade,Arepa Restaurant,Argentinian Restaurant,Art Gallery,Art Museum,...,Warehouse Store,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Xinjiang Restaurant,Yoga Studio,Zoo Exhibit
0,"Abbey Wood, Crossness, West Heath",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Acton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Aldgate, Tower Hill",0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,...,0.0,0.0,0.02,0.01,0.0,0.0,0.0,0.0,0.0,0.0
3,"Aldwych, Charing Cross, Covent Garden, St Giles",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.01,0.0,0.0,0.01,0.0,0.0,0.0
4,"Anerley, Penge",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# function to determine the num_top_venues values in a row
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [26]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Location']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Location'] = london_grouped['Location']

for ind in np.arange(london_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(london_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Location,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Abbey Wood, Crossness, West Heath",Supermarket,Grocery Store,Pub,Convenience Store,Platform,Train Station,Coffee Shop,Campground,Falafel Restaurant,English Restaurant
1,Acton,Café,Gastropub,French Restaurant,Bakery,Bus Stop,Fast Food Restaurant,Ethiopian Restaurant,Event Space,Exhibit,Falafel Restaurant
2,"Aldgate, Tower Hill",Coffee Shop,Restaurant,Hotel,Cocktail Bar,Gym / Fitness Center,Italian Restaurant,English Restaurant,Salad Place,French Restaurant,Pub
3,"Aldwych, Charing Cross, Covent Garden, St Giles",Theater,Coffee Shop,Clothing Store,Ice Cream Shop,Bakery,Gym,Deli / Bodega,Dessert Shop,Tea Room,Pizza Place
4,"Anerley, Penge",Supermarket,Fast Food Restaurant,Pub,Pizza Place,Shopping Mall,Hotel,Furniture / Home Store,Coffee Shop,Falafel Restaurant,English Restaurant


## Cluster the neighborhoods

In [27]:
def clusterDataAndAddLabel(data, data_grouped, nb_clusters = 5):
    
    # Drop the 'Neighborhood' column
    data_clustering = data_grouped.drop('Location', 1).copy()

    # run k-means clustering
    kmeans = KMeans(n_clusters=nb_clusters, random_state=0).fit(data_clustering)
    
    # add clustering labels
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

    data_merged = data.copy()

    # merge data_grouped with data to add latitude/longitude for each neighborhood
    data_merged = data_merged.join(neighborhoods_venues_sorted.set_index('Location'), on='Location')
    
    return data_merged

In [29]:
k_clusters = 10
london_merged = clusterDataAndAddLabel(df, london_grouped, nb_clusters = k_clusters)
london_merged.head()

Unnamed: 0,Postcode district,Location,London borough,latitude,longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,DA5,Dartford,Dartford,51.44033,0.14698,0,Fast Food Restaurant,Pub,Breakfast Spot,Greek Restaurant,Toy / Game Store,Bar,Train Station,Chinese Restaurant,Indian Restaurant,Italian Restaurant
1,E1,"Mile End, Ratcliff, Shadwell, Spitalfields, St...",Tower Hamlets,51.51766,-0.05841,4,Hotel,Pub,Coffee Shop,Bakery,Sandwich Place,Indian Restaurant,Grocery Store,Fast Food Restaurant,Gym / Fitness Center,Burger Joint
2,E10,Lea Bridge,Hackney,51.56814,-0.01153,4,Indian Restaurant,Asian Restaurant,Convenience Store,Coffee Shop,Park,Farm,Grocery Store,Train Station,Fried Chicken Joint,Cricket Ground
3,"E10, E15",Leyton,Waltham Forest,51.553625,-0.00423,0,Pub,Coffee Shop,Park,Fast Food Restaurant,Electronics Store,Sandwich Place,Gym / Fitness Center,Clothing Store,Pharmacy,Supermarket
4,E11,"Cann Hall, Leytonstone, Snaresbrook, Wanstead",Waltham Forest,51.56769,0.01443,8,Pub,Café,Platform,Grocery Store,Fast Food Restaurant,Music Venue,Sandwich Place,Supermarket,Thai Restaurant,Mediterranean Restaurant


In [30]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(k_clusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(london_merged['latitude'], london_merged['longitude'], london_merged['Location'], 
                                  london_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters