In [1]:
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans

try:
    from geopy.geocoders import Nominatim
except: #install package if it already does not exist
    !conda install -c conda-forge geopy --yes
    from geopy.geocoders import Nominatim
        
try:
    import folium # plotting library
except:
    !conda install -c conda-forge folium=0.5.0 --yes
    import folium # plotting library
import matplotlib.cm as cm
import matplotlib.colors as colors

### Web data extraction to data frame
Here, the location and borough anme data will exracted from a wikitable and compiled into a pandas dataframe for later venue identificationa dn clustering.

In [2]:
url = r'https://en.wikipedia.org/wiki/List_of_London_boroughs'
df = pd.DataFrame(columns = ['Borough', 'Area', 'Longitude', 'Latitude'])

In [3]:
res = requests.get(url).text
soup = BeautifulSoup(res,'lxml')
nulCase = 'Not assigned'
skipRow = False
for items in soup.find('table', class_='wikitable').find_all('tr')[1::1]:
    data = items.find_all(['th','td'])
    size = float(data[6].text[0:-1]) * 2.58999 #Convert sq miles to sq km
    loc = data[8].text[0:-1].split('/')[-1]
    loc = loc.split('(')
    borough = loc[-1][0:-1]
    loc = loc[0][0:-2].split('; ')

    if borough == nulCase:
        skipRow = True

    newRow = {'Borough' : borough, 'Area' : size, 'Latitude' : float(loc[0]), 'Longitude' : float(loc[1])}
    
    if skipRow == False:
        df = df.append(newRow, ignore_index = True)
    skipRow = False

In [4]:
df.head(5)

Unnamed: 0,Borough,Area,Longitude,Latitude
0,Barking and Dagenham,36.078561,0.1557,51.5607
1,Barnet,86.738765,-0.1517,51.6252
2,Bexley,60.553966,0.1505,51.4549
3,Brent,43.252833,-0.2817,51.5588
4,Bromley,150.14172,0.0198,51.4039


### Venue Identification
Food venues will be identified within each borough. A conservative search radius calculated from 60% of the borough size will be used to find the venues with minimal overlap between adjacent boroughs. The location data collected above will be passed through foursquare to collect the venue information.

In [5]:
# import data from user file here
file = open('./userCredentials.txt', 'r')
data = file.readlines()

idx = 0
userInfo = []

for line in data:
    values = line.split(':')[-1]
    if values[-1] is '\n':
        userInfo.append(values[1:-1])
    else:
        userInfo.append(values[1:])
    idx += 1

In [6]:
def getNearbyVenues(names, latitudes, longitudes, userInfo, radius=500, LIMIT = 30):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print('{} {} {} {}'.format(name, lat, lng, radius))
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            userInfo[0], 
            userInfo[1], 
            userInfo[2], 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Borough', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    return(nearby_venues)

def return_most_common_venues(row, num_top_venues = 5):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

def returnLeastCommonVenues(row, venueCount = 5):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[-num_top_venues:]

In [7]:
venueDF = None

for borough in df['Borough']:
    nbDF = df[df['Borough'] == borough]
    #Radius value needs to be whole number, so recast to int and change unit to metres
    r = int(np.sqrt((0.6 * nbDF['Area'] * 1000**2) / np.pi))
    
    venues = getNearbyVenues(names=nbDF['Borough'],
                                       latitudes=nbDF['Latitude'],
                                       longitudes=nbDF['Longitude'], userInfo = userInfo, radius = r )
    if venueDF is None:
        venueDF = venues
    else:
        venueDF = pd.concat([venueDF, venues])

Barking and Dagenham 51.5607 0.1557 2624
Barnet 51.6252 -0.1517 4070
Bexley 51.4549 0.1505 3400
Brent 51.5588 -0.2817 2874
Bromley 51.4039 0.0198 5354
Camden 51.529 -0.1255 2038
Croydon 51.3714 -0.0977 4065
Ealing 51.513 -0.3089 3256
Enfield 51.6538 -0.0799 3962
Greenwich 51.4892 0.0648 3007
Hackney 51.545 -0.0553 1908
Hammersmith and Fulham 51.4927 -0.2339 1769
Haringey 51.6 -0.1119 2376
Harrow 51.5898 -0.3346 3104
Havering 51.5812 0.1837 4630
Hillingdon 51.5441 -0.476 4700
Hounslow 51.4746 -0.368 3269
Islington 51.5416 -0.1022 1685
Kensington and Chelsea 51.502 -0.1947 1521
Kingston upon Thames 51.4085 -0.3064 2667
Lambeth 51.4607 -0.1163 2263
Lewisham 51.4452 -0.0209 2590
Merton 51.4014 -0.1958 2679
Newham 51.5077 0.0469 2629
Redbridge 51.559 0.0741 3282
Richmond upon Thames 51.4479 -0.326 3311
Southwark 51.5035 -0.0804 2347
Sutton 51.3618 -0.1945 2893
Tower Hamlets 51.5099 -0.0059 1942
Waltham Forest 51.5908 -0.0134 2723
Wandsworth 51.4567 -0.191 2558
Westminster 51.4973 -0.1372 20

In [8]:
venueDF.head()

Unnamed: 0,Borough,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Barking and Dagenham,51.5607,0.1557,Central Park,51.55956,0.161981,Park
1,Barking and Dagenham,51.5607,0.1557,Lara Grill,51.562445,0.147178,Turkish Restaurant
2,Barking and Dagenham,51.5607,0.1557,Costa Coffee,51.57689,0.179497,Coffee Shop
3,Barking and Dagenham,51.5607,0.1557,The Range,51.57555,0.180254,Furniture / Home Store
4,Barking and Dagenham,51.5607,0.1557,Asda,51.56577,0.143393,Supermarket


In [9]:
# one hot encoding
filterByFood = venueDF[venueDF['Venue Category'].str.contains('Restaurant', 'Cafe')]
filterByFood.groupby('Borough').count()
oneHot = pd.get_dummies(filterByFood[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
oneHot['Borough'] = filterByFood['Borough'] 

# move neighborhood column to the first column
fixed_columns = [oneHot.columns[-1]] + list(oneHot.columns[:-1])
oneHot = oneHot[fixed_columns]

groupedDF = oneHot.groupby('Borough').mean().reset_index()
groupedDF.head(20)

Unnamed: 0,Borough,African Restaurant,American Restaurant,Argentinian Restaurant,Asian Restaurant,Brazilian Restaurant,Cajun / Creole Restaurant,Caribbean Restaurant,Chinese Restaurant,Eastern European Restaurant,...,Restaurant,Seafood Restaurant,South American Restaurant,Spanish Restaurant,Sushi Restaurant,Tapas Restaurant,Thai Restaurant,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant
0,Barking and Dagenham,0.0,0.2,0.0,0.0,0.2,0.0,0.0,0.0,0.0,...,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0
1,Barnet,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.166667,0.0,0.0,0.0,0.166667,0.0,0.0,0.166667,0.0,0.0
2,Bexley,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Brent,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Bromley,0.0,0.166667,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0
5,Camden,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0
6,Croydon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,...,0.0,0.0,0.0,0.1,0.1,0.0,0.0,0.0,0.0,0.0
7,Ealing,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
8,Enfield,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0
9,Greenwich,0.25,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25


Identify the most common food venues within each borough

In [10]:
columns = ['Borough']
num_top_venues = 5
indicators = ['st', 'nd', 'rd']


for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
venuesSorted = pd.DataFrame(columns=columns)
venuesSorted['Borough'] = groupedDF['Borough']

for ind in np.arange(groupedDF.shape[0]):
    venuesSorted.iloc[ind, 1:] = return_most_common_venues(groupedDF.iloc[ind, :], num_top_venues)

venuesSorted.head()

Unnamed: 0,Borough,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Barking and Dagenham,Turkish Restaurant,American Restaurant,Italian Restaurant,Brazilian Restaurant,Restaurant
1,Barnet,Italian Restaurant,Turkish Restaurant,Indian Restaurant,Sushi Restaurant,Restaurant
2,Bexley,Italian Restaurant,English Restaurant,American Restaurant,Mediterranean Restaurant,Greek Restaurant
3,Brent,Indian Restaurant,Latin American Restaurant,Restaurant,English Restaurant,Italian Restaurant
4,Bromley,Turkish Restaurant,American Restaurant,Italian Restaurant,Asian Restaurant,Indian Restaurant


...And the 

In [11]:
columns = ['Borough']
num_top_venues = 5
indicators = ['st', 'nd', 'rd']


for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Least Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Least Common Venue'.format(ind+1))

# create a new dataframe
esotericVenuesSorted = pd.DataFrame(columns=columns)
esotericVenuesSorted['Borough'] = groupedDF['Borough']

for ind in np.arange(groupedDF.shape[0]):
    esotericVenuesSorted.iloc[ind, 1:] = returnLeastCommonVenues(groupedDF.iloc[ind, :], num_top_venues)

esotericVenuesSorted.head()

Unnamed: 0,Borough,1st Least Common Venue,2nd Least Common Venue,3rd Least Common Venue,4th Least Common Venue,5th Least Common Venue
0,Barking and Dagenham,Modern European Restaurant,Middle Eastern Restaurant,Mediterranean Restaurant,Malay Restaurant,African Restaurant
1,Barnet,Middle Eastern Restaurant,Mediterranean Restaurant,Malay Restaurant,Lebanese Restaurant,African Restaurant
2,Bexley,Okonomiyaki Restaurant,Modern European Restaurant,Middle Eastern Restaurant,Malay Restaurant,African Restaurant
3,Brent,Modern European Restaurant,Middle Eastern Restaurant,Mediterranean Restaurant,Malay Restaurant,African Restaurant
4,Bromley,Modern European Restaurant,Middle Eastern Restaurant,Mediterranean Restaurant,Malay Restaurant,African Restaurant


In [12]:
# compile data frame of venues for clustering here
clusterCount = num_top_venues
DFForClustering = groupedDF.drop('Borough', 1)

kmeans = KMeans(n_clusters=clusterCount, random_state=0).fit(DFForClustering)

venuesSorted.insert(0, 'Cluster Labels', kmeans.labels_)

venueMergedDF = df

# merge clustered data with the main data frame
venueMergedDF = venueMergedDF.join(venuesSorted.set_index('Borough'), on='Borough')
# some clusters are unlabelled, remove these
venueMergedDF.dropna(subset = ['Cluster Labels'], inplace = True)

In [13]:
# compile data frame of venues for clustering here
clusterCount = num_top_venues
DFForClustering = groupedDF.drop('Borough', 1)

kmeans = KMeans(n_clusters=clusterCount, random_state=0).fit(DFForClustering)

esotericVenuesSorted.insert(0, 'Cluster Labels', kmeans.labels_)

esotericVenueMergedDF = df

# merge clustered data with the main data frame
esotericVenueMergedDF = esotericVenueMergedDF.join(esotericVenuesSorted.set_index('Borough'), on='Borough')
# some clusters are unlabelled, remove these
esotericVenueMergedDF.dropna(subset = ['Cluster Labels'], inplace = True)

In [14]:
address = 'London, UK'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)

map_clusters = folium.Map(location=[location.latitude, location.longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(clusterCount)
ys = [i + x + (i*x)**2 for i in range(clusterCount)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(venueMergedDF['Latitude'], venueMergedDF['Longitude'], venueMergedDF['Borough'], venueMergedDF['Cluster Labels']):
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [15]:
address = 'London, UK'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)

map_clusters = folium.Map(location=[location.latitude, location.longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(clusterCount)
ys = [i + x + (i*x)**2 for i in range(clusterCount)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(esotericVenueMergedDF['Latitude'], esotericVenueMergedDF['Longitude'], esotericVenueMergedDF['Borough'], esotericVenueMergedDF['Cluster Labels']):
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [30]:
columns = ['Cluster Labels', 'Boroughs in Cluster']

for ind in np.arange(4):
    try:
        columns.append('{}{} Most Speciality'.format(ind+1, indicators[ind]))
        columns.append('{}{} Fraction'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Speciality'.format(ind+1))
        columns.append('{}th Fraction'.format(ind+1))

weightedClusterRankDF = pd.DataFrame(columns = columns)

for clusterLbl in venueMergedDF['Cluster Labels'].unique():
    newRow = {}
    clusterDF = venueMergedDF[venueMergedDF['Cluster Labels'] == clusterLbl]
    boroughList = clusterDF['Borough'].tolist()
    colIdx = 2
    for i in np.arange(4):
        idx = -5 + i
        venueNames = clusterDF.iloc[:, idx].unique()
        venuePop = clusterDF.iloc[:, idx].shape[0]
        commonVenueDF = pd.DataFrame(columns = ['Venue', 'Count', 'Fraction'])
        for n in venueNames:
            s = n.split(' ')
            if len(s) < 2:
                s = ['Unknown Type']
            count = clusterDF[clusterDF.iloc[:, idx].str.contains(s[0])].shape[0]
            row = {'Venue' : n, 'Count' : count, 'Fraction' : count / venuePop}
            commonVenueDF = commonVenueDF.append(row, ignore_index = True)
        commonVenueDF.sort_values(by = 'Count', ascending = False, inplace = True)
        venue = commonVenueDF.iloc[0, 0]
        frac = commonVenueDF.iloc[0, -1]
        newColumns = {columns[colIdx] : venue, columns[colIdx + 1] : frac}
        newRow = dict(newRow, **newColumns)
        colIdx += 2
    weightedClusterRankDF = weightedClusterRankDF.append(dict({'Cluster Labels' : clusterLbl, 'Boroughs in Cluster' : boroughList}, **newRow), ignore_index = True)
    
weightedClusterRankDF = weightedClusterRankDF.sort_values('Cluster Labels', ascending = True)
weightedClusterRankDF.set_index('Cluster Labels').head()

Unnamed: 0_level_0,Boroughs in Cluster,1st Most Speciality,1st Fraction,2nd Most Speciality,2nd Fraction,3rd Most Speciality,3rd Fraction,4th Most Speciality,4th Fraction
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,"[Enfield, Hillingdon, Lambeth, Waltham Forest]",Portuguese Restaurant,0.25,Turkish Restaurant,0.5,Middle Eastern Restaurant,0.25,Vietnamese Restaurant,0.25
1,"[Brent, Camden, Croydon, Harrow, Hounslow, Isl...",Indian Restaurant,0.625,Portuguese Restaurant,0.25,Middle Eastern Restaurant,0.25,English Restaurant,0.25
2,"[Ealing, Greenwich, Hackney, Richmond upon Tha...",Vietnamese Restaurant,0.75,Caribbean Restaurant,0.25,Fast Food Restaurant,0.25,Japanese Restaurant,0.25
3,"[Bexley, Hammersmith and Fulham, Newham, South...",Italian Restaurant,0.5,English Restaurant,0.166667,Italian Restaurant,0.333333,Mediterranean Restaurant,0.166667
4,"[Barking and Dagenham, Barnet, Bromley, Haring...",Turkish Restaurant,0.3,American Restaurant,0.3,Italian Restaurant,0.3,Indian Restaurant,0.2


In [31]:
columns = ['Cluster Labels', 'Boroughs in Cluster']

for ind in np.arange(4):
    try:
        columns.append('{}{} Least Common Speciality'.format(ind+1, indicators[ind]))
        columns.append('{}{} Fraction'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Least Common Speciality'.format(ind+1))
        columns.append('{}th Fraction'.format(ind+1))

weightedClusterRankDF = pd.DataFrame(columns = columns)

for clusterLbl in esotericVenueMergedDF['Cluster Labels'].unique():
    newRow = {}
    clusterDF = esotericVenueMergedDF[esotericVenueMergedDF['Cluster Labels'] == clusterLbl]
    boroughList = clusterDF['Borough'].tolist()
    colIdx = 2
    for i in np.arange(4):
        idx = -5 + i
        venueNames = clusterDF.iloc[:, idx].unique()
        venuePop = clusterDF.iloc[:, idx].shape[0]
        commonVenueDF = pd.DataFrame(columns = ['Venue', 'Count', 'Fraction'])
        for n in venueNames:
            s = n.split(' ')
            if len(s) < 2:
                s = ['Unknown Type']
            count = clusterDF[clusterDF.iloc[:, idx].str.contains(s[0])].shape[0]
            row = {'Venue' : n, 'Count' : count, 'Fraction' : count / venuePop}
            commonVenueDF = commonVenueDF.append(row, ignore_index = True)
        commonVenueDF.sort_values(by = 'Count', ascending = False, inplace = True)
        venue = commonVenueDF.iloc[0, 0]
        frac = commonVenueDF.iloc[0, -1]
        newColumns = {columns[colIdx] : venue, columns[colIdx + 1] : frac}
        newRow = dict(newRow, **newColumns)
        colIdx += 2
    weightedClusterRankDF = weightedClusterRankDF.append(dict({'Cluster Labels' : clusterLbl, 'Boroughs in Cluster' : boroughList}, **newRow), ignore_index = True)
    
weightedClusterRankDF = weightedClusterRankDF.sort_values('Cluster Labels', ascending = True)
weightedClusterRankDF.set_index('Cluster Labels').head()

Unnamed: 0_level_0,Boroughs in Cluster,1st Least Common Speciality,1st Fraction,2nd Least Common Speciality,2nd Fraction,3rd Least Common Speciality,3rd Fraction,4th Least Common Speciality,4th Fraction
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,"[Enfield, Hillingdon, Lambeth, Waltham Forest]",Modern European Restaurant,0.75,Mediterranean Restaurant,0.75,Malay Restaurant,0.75,Lebanese Restaurant,0.75
1,"[Brent, Camden, Croydon, Harrow, Hounslow, Isl...",Okonomiyaki Restaurant,0.5,Modern European Restaurant,0.5,Mediterranean Restaurant,0.375,Malay Restaurant,0.375
2,"[Ealing, Greenwich, Hackney, Richmond upon Tha...",Okonomiyaki Restaurant,0.5,Middle Eastern Restaurant,0.5,Mediterranean Restaurant,0.5,Malay Restaurant,0.5
3,"[Bexley, Hammersmith and Fulham, Newham, South...",Modern European Restaurant,0.5,Middle Eastern Restaurant,0.5,Mediterranean Restaurant,0.5,Malay Restaurant,0.666667
4,"[Barking and Dagenham, Barnet, Bromley, Haring...",Modern European Restaurant,0.4,Middle Eastern Restaurant,0.5,Mediterranean Restaurant,0.4,Malay Restaurant,0.5


array([-5, -4, -3, -2, -1])