# Segmenting and Clustering Neighborhoods in London City

In [6]:
import requests 
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

In [7]:
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
# import k-means from clustering stage
from sklearn.cluster import KMeans

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# install and import folium library 
!pip -q install folium
import folium 

# import k-means from clustering stage
from sklearn.cluster import KMeans

In [8]:
URL = "https://en.wikipedia.org/wiki/List_of_London_boroughs"
res = requests.get(URL).text
soup = BeautifulSoup(res,'lxml')

df_list = []
# print(soup)
for items in soup.find('table', class_= 'wikitable sortable').find_all('tr')[1::]:
    data = items.find_all(['td'])
    data1 = data[0]
    data8 = data[8]    
    try:
        borough_name = data1.get_text()
        borough_name = borough_name[0]
        ll = data8.get_text()
        lat_long = ll[2]
        latitude = lat_long[0]
        longitude = lat_long[1]
#       Append the borough name, latitude and logitude in a list
        df_list.append((borough_name, latitude, longitude))
    except IndexError:pass

### 1. Download and Explore the Dataset

##### Read the Latitude and Longitude coordinates of all Boroughs in London from a Wikipedia link.

In [9]:
URL = "https://en.wikipedia.org/wiki/List_of_London_boroughs"
res = requests.get(URL).text
soup = BeautifulSoup(res,'lxml')

df_list = []
# print(soup)
for items in soup.find('table', class_= 'wikitable sortable').find_all('tr')[1::]:
    data = items.find_all(['td'])
    data1 = data[0]
    data8 = data[8]    
    try:
        borough_name = data1.get_text()
        borough_name = borough_name.split('[')
        borough_name = borough_name[0]
        borough_name = borough_name.strip()
        
        ll = data8.get_text()
        ll = ll.split('/')
        lat_long = ll[2]
        lat_long = lat_long.split('(')
        lat_long = lat_long[0]
        lat_long = lat_long.split(';')
        latitude = lat_long[0]
        latitude = latitude.strip()
        longitude = lat_long[1]
        longitude = longitude.strip()
        longitude = longitude.replace(u'\ufeff', '')
        latitude = float(latitude)
        longitude = float(longitude)

#       Append the borough name, latitude and logitude in a list
        df_list.append((borough_name, latitude, longitude))
    except IndexError:pass

In [10]:
df_boroughs = pd.DataFrame(df_list, columns=['Borough', 'Latitude' , 'Longitude'])

In [11]:
df_boroughs.shape

(32, 3)

In [12]:
df_boroughs.head()

Unnamed: 0,Borough,Latitude,Longitude
0,Barking and Dagenham,51.5607,0.1557
1,Barnet,51.6252,-0.1517
2,Bexley,51.4549,0.1505
3,Brent,51.5588,-0.2817
4,Bromley,51.4039,0.0198


In [13]:
df_boroughs.dtypes

Borough       object
Latitude     float64
Longitude    float64
dtype: object

In [14]:
df_boroughs.loc[df_boroughs['Borough'] == 'Newham']

Unnamed: 0,Borough,Latitude,Longitude
23,Newham,51.5077,0.0469


### Get the Latitude and Longitude of London using geopy library

In [15]:
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
address = 'London, UK'
geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of London City are {}, {}.'.format(latitude, longitude))

  app.launch_new_instance()


The geograpical coordinate of London City are 51.5073219, -0.1276474.


### Create a map of London with Boroughs superimposed on top.

In [16]:
import folium 

# create map of London using latitude and longitude values
map_london = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough in zip(df_boroughs['Latitude'], df_boroughs['Longitude'], df_boroughs['Borough']):
    label = '{}'.format(borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
fill_opacity=0.7).add_to(map_london)  
map_london

### Preferred location for Asian restaurant - Newham Borough

#### Read the latitude and longtitude coordinates of all neighborhoods(areas) in Newham Borough

In [17]:
from urllib.request import urlopen
import re
URL = "https://en.wikipedia.org/wiki/List_of_areas_of_London"
res = requests.get(URL).text
soup = BeautifulSoup(res,'lxml')

codes = []
areas_list = []
href_links_list = []
for items in soup.find('table', class_= 'wikitable sortable').find_all('tr')[1::]:
    data = items.find_all(['td'])
    data0 = data[0]
    area_name = data0.text

    data1 = data[1]
    data1 = data1.text
    borough = data1.split('[')
    borough_name = borough[0]
    data5 = data[5]
    code = data5.text
    code = code.strip()
    
    if borough_name == 'Newham':
        codes.append(code)
        areas_list.append((borough_name,area_name,code))

                
for link in soup.findAll('a', attrs={'href': re.compile("^https://tools.wmflabs.org")}):
            htext = link.text
            if htext in codes:
                hlink = link.get('href')
                href_links_list.append((htext, hlink))

#### Create a DataFrame from the Areas List

In [18]:
df_areas = pd.DataFrame(areas_list, columns=['Borough', 'Area', 'Code'])

In [19]:
df_areas.head()

Unnamed: 0,Borough,Area,Code
0,Newham,Beckton,TQ435815
1,Newham,Canning Town,TQ405815
2,Newham,Custom House,TQ408807
3,Newham,East Ham,TQ425835
4,Newham,Forest Gate,TQ405855


In [20]:
df_areas.columns

Index(['Borough', 'Area', 'Code'], dtype='object')

In [21]:
df_areas.shape

(14, 3)

#### Create a DataFrame from the list of href links

In [22]:
df_links = pd.DataFrame(href_links_list, columns=['Code','href'])

In [23]:
df_links.columns

Index(['Code', 'href'], dtype='object')

In [24]:
df_links.shape

(15, 2)

In [25]:
df_links

Unnamed: 0,Code,href
0,TQ435815,https://tools.wmflabs.org/geohack/en/51.514205...
1,TQ405815,https://tools.wmflabs.org/geohack/en/51.514959...
2,TQ408807,https://tools.wmflabs.org/geohack/en/51.507695...
3,TQ425835,https://tools.wmflabs.org/geohack/en/51.532429...
4,TQ405855,https://tools.wmflabs.org/geohack/en/51.550902...
5,TQ435855,https://tools.wmflabs.org/geohack/en/51.550147...
6,TQ425855,https://tools.wmflabs.org/geohack/en/51.550401...
7,TQ391849,https://tools.wmflabs.org/geohack/en/51.545857...
8,TQ435795,https://tools.wmflabs.org/geohack/en/51.496234...
9,TQ405825,https://tools.wmflabs.org/geohack/en/51.523944...


#### Merge the Areas and hfef links

In [26]:
cols = df_links.columns.difference(df_areas.columns)

In [27]:
cols

Index(['href'], dtype='object')

In [28]:
df_areas_links = pd.concat([df_areas, df_links[cols]], axis=1)

In [29]:
df_areas_links.shape

(15, 4)

In [30]:
df_areas_links

Unnamed: 0,Borough,Area,Code,href
0,Newham,Beckton,TQ435815,https://tools.wmflabs.org/geohack/en/51.514205...
1,Newham,Canning Town,TQ405815,https://tools.wmflabs.org/geohack/en/51.514959...
2,Newham,Custom House,TQ408807,https://tools.wmflabs.org/geohack/en/51.507695...
3,Newham,East Ham,TQ425835,https://tools.wmflabs.org/geohack/en/51.532429...
4,Newham,Forest Gate,TQ405855,https://tools.wmflabs.org/geohack/en/51.550902...
5,Newham,Little Ilford,TQ435855,https://tools.wmflabs.org/geohack/en/51.550147...
6,Newham,Manor Park,TQ425855,https://tools.wmflabs.org/geohack/en/51.550401...
7,Newham,Maryland,TQ391849,https://tools.wmflabs.org/geohack/en/51.545857...
8,Newham,North Woolwich,TQ435795,https://tools.wmflabs.org/geohack/en/51.496234...
9,Newham,Plaistow,TQ405825,https://tools.wmflabs.org/geohack/en/51.523944...


#### Remove the row where no data exists.

In [31]:
df_areas_links = df_areas_links.dropna(how='any')

In [32]:
df_areas_links

Unnamed: 0,Borough,Area,Code,href
0,Newham,Beckton,TQ435815,https://tools.wmflabs.org/geohack/en/51.514205...
1,Newham,Canning Town,TQ405815,https://tools.wmflabs.org/geohack/en/51.514959...
2,Newham,Custom House,TQ408807,https://tools.wmflabs.org/geohack/en/51.507695...
3,Newham,East Ham,TQ425835,https://tools.wmflabs.org/geohack/en/51.532429...
4,Newham,Forest Gate,TQ405855,https://tools.wmflabs.org/geohack/en/51.550902...
5,Newham,Little Ilford,TQ435855,https://tools.wmflabs.org/geohack/en/51.550147...
6,Newham,Manor Park,TQ425855,https://tools.wmflabs.org/geohack/en/51.550401...
7,Newham,Maryland,TQ391849,https://tools.wmflabs.org/geohack/en/51.545857...
8,Newham,North Woolwich,TQ435795,https://tools.wmflabs.org/geohack/en/51.496234...
9,Newham,Plaistow,TQ405825,https://tools.wmflabs.org/geohack/en/51.523944...


In [33]:
geo_codes = []
for row in df_areas_links.itertuples():
    url = row.href
    code = row.Code
    res = requests.get(url).text
    soup1 = BeautifulSoup(res,'lxml')
    
    for lat in soup1.find('span',{'class':'latitude'}):
        latitude = lat
        latitude = float(latitude)
            
    for long in soup1.find('span',{'class':'longitude'}):    
        longitude = long
        longitude = float(longitude)
        
    geo_codes.append((code, latitude, longitude))

print(geo_codes)

[('TQ435815', 51.514206, 0.066634), ('TQ405815', 51.514959, 0.023429), ('TQ408807', 51.507696, 0.027431), ('TQ425835', 51.53243, 0.053041), ('TQ405855', 51.550902, 0.025024), ('TQ435855', 51.550148, 0.068263), ('TQ425855', 51.550401, 0.05385), ('TQ391849', 51.545857, 0.004608), ('TQ435795', 51.496234, 0.065821), ('TQ405825', 51.523945, 0.023828), ('TQ415795', 51.496738, 0.037029), ('TQ385845', 51.54241, -0.004196), ('TQ405837', 51.534728, 0.024306), ('TQ405837', 51.534728, 0.024306)]


In [34]:
df_geo_codes = pd.DataFrame(geo_codes, columns=['Code','Latitude','Longitude'])

In [35]:
df_geo_codes

Unnamed: 0,Code,Latitude,Longitude
0,TQ435815,51.514206,0.066634
1,TQ405815,51.514959,0.023429
2,TQ408807,51.507696,0.027431
3,TQ425835,51.53243,0.053041
4,TQ405855,51.550902,0.025024
5,TQ435855,51.550148,0.068263
6,TQ425855,51.550401,0.05385
7,TQ391849,51.545857,0.004608
8,TQ435795,51.496234,0.065821
9,TQ405825,51.523945,0.023828


#### Now Merge the Neighborhoods and GeoCodes DataFrames

In [36]:
df_areas.columns

Index(['Borough', 'Area', 'Code'], dtype='object')

In [37]:
df_areas.shape

(14, 3)

In [38]:
df_geo_codes.columns

Index(['Code', 'Latitude', 'Longitude'], dtype='object')

In [39]:
df_geo_codes.shape

(14, 3)

In [40]:
cols = df_geo_codes.columns.difference(df_areas.columns)

In [41]:
cols

Index(['Latitude', 'Longitude'], dtype='object')

In [42]:
Newham_borough = pd.concat([df_areas, df_geo_codes[cols]], axis=1)

In [43]:
Newham_borough

Unnamed: 0,Borough,Area,Code,Latitude,Longitude
0,Newham,Beckton,TQ435815,51.514206,0.066634
1,Newham,Canning Town,TQ405815,51.514959,0.023429
2,Newham,Custom House,TQ408807,51.507696,0.027431
3,Newham,East Ham,TQ425835,51.53243,0.053041
4,Newham,Forest Gate,TQ405855,51.550902,0.025024
5,Newham,Little Ilford,TQ435855,51.550148,0.068263
6,Newham,Manor Park,TQ425855,51.550401,0.05385
7,Newham,Maryland,TQ391849,51.545857,0.004608
8,Newham,North Woolwich,TQ435795,51.496234,0.065821
9,Newham,Plaistow,TQ405825,51.523945,0.023828


In [44]:
Newham_borough = Newham_borough.rename(columns={'Area' :'Neighborhood'})

#### We do not need the column Code for our further analysis, so dropping them.

In [45]:
Newham_borough.drop(['Code'], axis=1, inplace=True)

In [46]:
Newham_borough.columns

Index(['Borough', 'Neighborhood', 'Latitude', 'Longitude'], dtype='object')

In [47]:
Newham_borough

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Newham,Beckton,51.514206,0.066634
1,Newham,Canning Town,51.514959,0.023429
2,Newham,Custom House,51.507696,0.027431
3,Newham,East Ham,51.53243,0.053041
4,Newham,Forest Gate,51.550902,0.025024
5,Newham,Little Ilford,51.550148,0.068263
6,Newham,Manor Park,51.550401,0.05385
7,Newham,Maryland,51.545857,0.004608
8,Newham,North Woolwich,51.496234,0.065821
9,Newham,Plaistow,51.523945,0.023828


In [48]:
Newham_borough.dtypes

Borough          object
Neighborhood     object
Latitude        float64
Longitude       float64
dtype: object

In [49]:
# Getting Coordinates of Newham Borough

address = 'Newham, London'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Newham are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Newham are 51.52999955, 0.0293179602938221.


In [50]:
# Lets Visualize the Areas

# create map of Newham using latitude and longitude values
map_Newham = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(Newham_borough['Latitude'], Newham_borough['Longitude'], Newham_borough['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_Newham)  
    
map_Newham

In [51]:
# Defining FourSquare Credentials
# Lets explore the first Neighborhood(Area) in Newham Borough

Newham_borough.shape

(14, 4)

In [52]:
Newham_borough.columns

Index(['Borough', 'Neighborhood', 'Latitude', 'Longitude'], dtype='object')

In [53]:
Newham_borough.loc[0, 'Neighborhood']

'Beckton'

In [54]:
Newham_borough.loc[0]

Borough           Newham
Neighborhood     Beckton
Latitude         51.5142
Longitude       0.066634
Name: 0, dtype: object

In [55]:
neighborhood_latitude = Newham_borough.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = Newham_borough.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = Newham_borough.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Beckton are 51.514206, 0.066634.


In [70]:
# Create and GET request URL. Name your URL url.
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id=SPY52SJPSAXHU0VE1CXUNJJS5ZYBT5RRNF5MCM1A3ZVFR3D1&client_secret=TW5HFTZGVYAJ1SSAT21T0IINPNUVRGD1CRKL0NUN5PREWS4T&v=20191120&ll=51.51504,0.066634&radius=500&limit=100'.format()
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=SPY52SJPSAXHU0VE1CXUNJJS5ZYBT5RRNF5MCM1A3ZVFR3D1&client_secret=TW5HFTZGVYAJ1SSAT21T0IINPNUVRGD1CRKL0NUN5PREWS4T&v=20191120&ll=51.51504,0.066634&radius=500&limit=100'

In [71]:
results = requests.get(url).json()
results

{'meta': {'code': 200, 'requestId': '5de1e2f70f59680022abc5a6'},
 'response': {'headerLocation': 'Beckton',
  'headerFullLocation': 'Beckton, London',
  'headerLocationGranularity': 'neighborhood',
  'totalResults': 9,
  'suggestedBounds': {'ne': {'lat': 51.5195400045, 'lng': 0.0738516381179073},
   'sw': {'lat': 51.5105399955, 'lng': 0.059416361882092684}},
  'groups': [{'type': 'Recommended Places',
    'name': 'recommended',
    'items': [{'reasons': {'count': 0,
       'items': [{'summary': 'This spot is popular',
         'type': 'general',
         'reasonName': 'globalInteractionReason'}]},
      'venue': {'id': '52cecf5711d25db28fc70b17',
       'name': 'Home Bargains',
       'location': {'address': 'Alpine Way',
        'lat': 51.51680527454938,
        'lng': 0.06280401308506929,
        'labeledLatLngs': [{'label': 'display',
          'lat': 51.51680527454938,
          'lng': 0.06280401308506929}],
        'distance': 330,
        'cc': 'GB',
        'city': 'London',
   

In [72]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [73]:
results = requests.get(url).json()

venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Home Bargains,Discount Store,51.516805,0.062804
1,East london Gymnastics Club,Gym / Fitness Center,51.514107,0.060155
2,Lituanica,Grocery Store,51.516442,0.062927
3,Premier Inn London Beckton,Hotel,51.515125,0.061209
4,Matalan,Clothing Store,51.516004,0.062635


In [74]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id=SPY52SJPSAXHU0VE1CXUNJJS5ZYBT5RRNF5MCM1A3ZVFR3D1&client_secret=TW5HFTZGVYAJ1SSAT21T0IINPNUVRGD1CRKL0NUN5PREWS4T&v=20191120&ll=51.51504,0.066634&radius=500&limit=100'.format()
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, lat, lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 
                             'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
    
    return(nearby_venues)

In [75]:
Newham_venues = getNearbyVenues(names=Newham_borough['Neighborhood'],
                                   latitudes=Newham_borough['Latitude'],
                                   longitudes=Newham_borough['Longitude']
                                  )

In [76]:
Newham_venues.shape

(126, 7)

In [77]:
Newham_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Beckton,51.514206,0.066634,Home Bargains,51.516805,0.062804,Discount Store
1,Beckton,51.514206,0.066634,East london Gymnastics Club,51.514107,0.060155,Gym / Fitness Center
2,Beckton,51.514206,0.066634,Lituanica,51.516442,0.062927,Grocery Store
3,Beckton,51.514206,0.066634,Premier Inn London Beckton,51.515125,0.061209,Hotel
4,Beckton,51.514206,0.066634,Matalan,51.516004,0.062635,Clothing Store


In [78]:
Newham_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Beckton,9,9,9,9,9,9
Canning Town,9,9,9,9,9,9
Custom House,9,9,9,9,9,9
East Ham,9,9,9,9,9,9
Forest Gate,9,9,9,9,9,9
Little Ilford,9,9,9,9,9,9
Manor Park,9,9,9,9,9,9
Maryland,9,9,9,9,9,9
North Woolwich,9,9,9,9,9,9
Plaistow,9,9,9,9,9,9


In [79]:

# one hot encoding
Newham_onehot = pd.get_dummies(Newham_venues[['Venue Category']], prefix="", prefix_sep="")# add neighborhood column back to dataframe
Newham_onehot['Neighborhood'] = Newham_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [Newham_onehot.columns[-1]] + list(Newham_onehot.columns[:-1])
Newham_onehot = Newham_onehot[fixed_columns]

Newham_onehot.head()

Unnamed: 0,Neighborhood,Clothing Store,Discount Store,Furniture / Home Store,Grocery Store,Gym / Fitness Center,Hotel,Light Rail Station,Pub,Shopping Plaza
0,Beckton,0,1,0,0,0,0,0,0,0
1,Beckton,0,0,0,0,1,0,0,0,0
2,Beckton,0,0,0,1,0,0,0,0,0
3,Beckton,0,0,0,0,0,1,0,0,0
4,Beckton,1,0,0,0,0,0,0,0,0


In [80]:
Newham_onehot.shape

(126, 10)

In [99]:
# Lets group by Neighborhoods

Newham_grouped = Newham_onehot.groupby('Neighborhood').mean().reset_index()
Newham_grouped

Unnamed: 0,Neighborhood,Clothing Store,Discount Store,Furniture / Home Store,Grocery Store,Gym / Fitness Center,Hotel,Light Rail Station,Pub,Shopping Plaza
0,Beckton,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
1,Canning Town,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
2,Custom House,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
3,East Ham,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
4,Forest Gate,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
5,Little Ilford,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
6,Manor Park,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
7,Maryland,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
8,North Woolwich,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
9,Plaistow,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111


In [100]:
Newham_grouped.columns

Index(['Neighborhood', 'Clothing Store', 'Discount Store',
       'Furniture / Home Store', 'Grocery Store', 'Gym / Fitness Center',
       'Hotel', 'Light Rail Station', 'Pub', 'Shopping Plaza'],
      dtype='object')

In [101]:
Newham_grouped.shape

(14, 10)

In [102]:
# Lets print each neighborhood with 5 most common venues

num_top_venues = 5
for hood in Newham_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = Newham_grouped[Newham_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Beckton----
                    venue  freq
0          Clothing Store  0.11
1          Discount Store  0.11
2  Furniture / Home Store  0.11
3           Grocery Store  0.11
4    Gym / Fitness Center  0.11


----Canning Town----
                    venue  freq
0          Clothing Store  0.11
1          Discount Store  0.11
2  Furniture / Home Store  0.11
3           Grocery Store  0.11
4    Gym / Fitness Center  0.11


----Custom House----
                    venue  freq
0          Clothing Store  0.11
1          Discount Store  0.11
2  Furniture / Home Store  0.11
3           Grocery Store  0.11
4    Gym / Fitness Center  0.11


----East Ham----
                    venue  freq
0          Clothing Store  0.11
1          Discount Store  0.11
2  Furniture / Home Store  0.11
3           Grocery Store  0.11
4    Gym / Fitness Center  0.11


----Forest Gate----
                    venue  freq
0          Clothing Store  0.11
1          Discount Store  0.11
2  Furniture / Home Store  0.11
3

In [103]:
# Put into pandas database

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

### Lets create one for the top 10

In [108]:
# Here is one for top 10


num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = Newham_grouped['Neighborhood']

for ind in np.arange(Newham_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Newham_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

ValueError: could not broadcast input array from shape (9) into shape (10)

In [109]:
neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Beckton,,,,,,,,,,
1,Canning Town,,,,,,,,,,
2,Custom House,,,,,,,,,,
3,East Ham,,,,,,,,,,
4,Forest Gate,,,,,,,,,,


### Cluster the Neighborhoods

In [110]:
Newham_grouped.head()

Unnamed: 0,Neighborhood,Clothing Store,Discount Store,Furniture / Home Store,Grocery Store,Gym / Fitness Center,Hotel,Light Rail Station,Pub,Shopping Plaza
0,Beckton,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
1,Canning Town,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
2,Custom House,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
3,East Ham,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111
4,Forest Gate,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111,0.111111


In [111]:
# set number of clusters
kclusters = 5
Newham_grouped_clustering = Newham_grouped.drop('Neighborhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Newham_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

  return_n_iter=True)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [91]:
kmeans

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=0, tol=0.0001, verbose=0)

In [92]:
Newham_merged = Newham_borough
# add clustering labels
Newham_merged['Cluster Labels'] = kmeans.labels_

# merge Neighborhoods dataframe with Newham borough dataframe to add latitude/longitude for each neighborhood
Newham_merged = Newham_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

Newham_merged.head() # check the last columns!

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Newham,Beckton,51.514206,0.066634,0,,,,,,,,,,
1,Newham,Canning Town,51.514959,0.023429,0,,,,,,,,,,
2,Newham,Custom House,51.507696,0.027431,0,,,,,,,,,,
3,Newham,East Ham,51.53243,0.053041,0,,,,,,,,,,
4,Newham,Forest Gate,51.550902,0.025024,0,,,,,,,,,,


In [112]:
# Lets Visualize the Cluster

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
y = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(y)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Newham_merged['Latitude'], Newham_merged['Longitude'], Newham_merged['Neighborhood'], Newham_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [113]:
# Examine the Clusters

Newham_merged.loc[Newham_merged['Cluster Labels'] == 0, Newham_merged.columns[[1] + list(range(5, Newham_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Beckton,,,,,,,,,,
1,Canning Town,,,,,,,,,,
2,Custom House,,,,,,,,,,
3,East Ham,,,,,,,,,,
4,Forest Gate,,,,,,,,,,
5,Little Ilford,,,,,,,,,,
6,Manor Park,,,,,,,,,,
7,Maryland,,,,,,,,,,
8,North Woolwich,,,,,,,,,,
9,Plaistow,,,,,,,,,,


In [114]:
Newham_merged.loc[Newham_merged['Cluster Labels'] == 1, Newham_merged.columns[[1] + list(range(5, Newham_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


In [115]:
Newham_merged.loc[Newham_merged['Cluster Labels'] == 2, Newham_merged.columns[[1] + list(range(5, Newham_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


In [116]:
Newham_merged.loc[Newham_merged['Cluster Labels'] == 3, Newham_merged.columns[[1] + list(range(5, Newham_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


In [117]:
Newham_merged.loc[Newham_merged['Cluster Labels'] == 4, Newham_merged.columns[[1] + list(range(5, Newham_merged.shape[1]))]]

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


##  Conclusion: After examining the above 5 clusters, we can recommend our stakeholders that Beckton,Custom House, Maryland, Eastham and Manor Park are the best neighborhoods in Newham borough, to open their Asian restaurant. This is because in these areas, the most common venue visited by the public is the restaurants and as these areas are highly populated with Asians, opening an Asian restaurant would definitley be a good idea.