#### Import dependencies

In [1]:
!pip -q install folium
print('folium installed...')
import folium 
print('folium imported...')

#library to handle data in a vectorized manner
import numpy as np
print('numpy imported...')

#library for data analysis
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
print('pandas imported...')

#library to handle JSON file
import json
print('json imported...')

!pip -q install geopy
# conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
print('geopy installed...')
# convert an address into latitude and longitude values
from geopy.geocoders import Nominatim
print('Nominatim imported...')

# library to handle request
import requests
print('requests imported...')

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
%matplotlib inline
print('matplotlib imported...')

# transform JSON file into a pandas dataframe
from pandas.io.json import json_normalize
print('json_normalize imported...')

# library used for data visualization
import seaborn as sns
print('seaborn imported...')

# import k-means from clustering stage
from sklearn.cluster import KMeans
print('k-means imported...')

#import time
import time
print('time imported...')

# library for pulling data out of HTML and XML files
from bs4 import BeautifulSoup
print('BeautifulSoup imported...')

# install Geocoder
!pip -q install geocoder
print('geocoder installed...')
import geocoder
print('geocoder imported...')

print('DONE - All Dependencies imported...')

folium installed...
folium imported...
numpy imported...
pandas imported...
json imported...
geopy installed...
Nominatim imported...
requests imported...
matplotlib imported...
json_normalize imported...
seaborn imported...
k-means imported...
time imported...
BeautifulSoup imported...
geocoder installed...
geocoder imported...
DONE - All Dependencies imported...


#### Retrieve Data from Data source

In [2]:
wiki_data_source_link = 'https://en.wikipedia.org/wiki/List_of_areas_of_London' 
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0'}
wikpedia_page = requests.get(wiki_data_source_link, headers = headers)
wikpedia_page

<Response [200]>

In [3]:
# Clean html file
soup = BeautifulSoup(wikpedia_page.content, 'html.parser')

# This extract the 'tbody' within the table where class is 'Wikitable sortable'
table = soup.find('table', {'class':'wikitable sortable'}).tbody

#Extraact all "tr" within the table above
rows = table.find_all('tr')

#Extracts the column headers, remove and replaces possible '\n' with space f or the "th" tag
columns = [i.text.replace('\n', '')
          for i in rows[0].find_all('th')]

#Converts columns to pd dataframe
df = pd.DataFrame(columns = columns)
df

Unnamed: 0,Location,London borough,Post town,Postcode district,Dial code,OS grid ref


In [4]:
# Extracts every row with corresponding colums, then append the values to the created dataframe "df"
for i in range(1, len(rows)):
    tds = rows[i].find_all('td')
    
    if len(tds) == 7:
        values = [tds[0].text, tds[1].text, tds[2].text.replace('\n', ''.replace('\xa0','')),
                 tds[3].text, tds[4].text.replace('\n', ''.replace('\xa0','')),
                 tds[5].text.replace('\n', ''.replace('\xa0','')), tds[6].text.replace('\n', ''.replace('\xa0',''))]
    else:
        values = [td.text.replace('\n', '').replace('\xa0','') for td in tds]
        df = df.append(pd.Series(values, index = columns), ignore_index = True)
        df

In [5]:
df.head()

Unnamed: 0,Location,London borough,Post town,Postcode district,Dial code,OS grid ref
0,Abbey Wood,"Bexley, Greenwich [7]",LONDON,SE2,20,TQ465785
1,Acton,"Ealing, Hammersmith and Fulham[8]",LONDON,"W3, W4",20,TQ205805
2,Addington,Croydon[8],CROYDON,CR0,20,TQ375645
3,Addiscombe,Croydon[8],CROYDON,CR0,20,TQ345665
4,Albany Park,Bexley,"BEXLEY, SIDCUP","DA5, DA14",20,TQ478728


In [6]:
# Remane row header
df = df.rename(index=str, columns = {'Location': 'Location', 'London\xa0borough': 'Borough', 'Post town':'Town',
                                     'Postcode\xa0district': 'Postcode', 'Dial\xa0code': 'Dial Code', 'OS grid ref':'OSGridRe'})

In [7]:
df.head()

Unnamed: 0,Location,Borough,Town,Postcode,Dial Code,OSGridRe
0,Abbey Wood,"Bexley, Greenwich [7]",LONDON,SE2,20,TQ465785
1,Acton,"Ealing, Hammersmith and Fulham[8]",LONDON,"W3, W4",20,TQ205805
2,Addington,Croydon[8],CROYDON,CR0,20,TQ375645
3,Addiscombe,Croydon[8],CROYDON,CR0,20,TQ345665
4,Albany Park,Bexley,"BEXLEY, SIDCUP","DA5, DA14",20,TQ478728


In [8]:
# Remove [] from borough name extracted from wikipidia
df['Borough'] = df['Borough'].map(lambda x: x.rstrip(']').rstrip('123456789').rstrip('['))
df.head(5)

Unnamed: 0,Location,Borough,Town,Postcode,Dial Code,OSGridRe
0,Abbey Wood,"Bexley, Greenwich",LONDON,SE2,20,TQ465785
1,Acton,"Ealing, Hammersmith and Fulham",LONDON,"W3, W4",20,TQ205805
2,Addington,Croydon,CROYDON,CR0,20,TQ375645
3,Addiscombe,Croydon,CROYDON,CR0,20,TQ345665
4,Albany Park,Bexley,"BEXLEY, SIDCUP","DA5, DA14",20,TQ478728


In [9]:
df.shape

(533, 6)

In [10]:
# To identify distint location, we'll need to assign individual location to each post code
df_unique_postcode = df.drop('Postcode', axis=1).join(df['Postcode'].str.split(',', expand=True).stack().reset_index(level=1, drop=True).rename('Postcode'))

In [47]:
df_unique_postcode

Unnamed: 0,Location,Borough,Town,Dial Code,OSGridRe,Postcode
0,Abbey Wood,"Bexley, Greenwich",LONDON,020,TQ465785,SE2
1,Acton,"Ealing, Hammersmith and Fulham",LONDON,020,TQ205805,W3
1,Acton,"Ealing, Hammersmith and Fulham",LONDON,020,TQ205805,W4
10,Angel,Islington,LONDON,020,TQ345665,EC1
10,Angel,Islington,LONDON,020,TQ345665,N1
100,Church End,Brent,LONDON,020,TQ205785,NW10
101,Church End,Barnet,LONDON,020,TQ255905,N3
102,Clapham,"Lambeth, Wandsworth",LONDON,020,TQ295755,SW4
103,Clerkenwell,Islington,LONDON,020,TQ315825,EC1
104,Cockfosters,"Barnet, Enfield",BARNET,020,TQ275965,EN4


In [48]:
df_unique_postcode.shape

(637, 6)

Create a new dataframe from existing 'df_unique_postcode' with columns Location, Borough, Postcode and Town

In [49]:
df_source = df_unique_postcode[['Location', 'Borough', 'Postcode', 'Town']].reset_index(drop=True)
df_source.head(5)

Unnamed: 0,Location,Borough,Postcode,Town
0,Abbey Wood,"Bexley, Greenwich",SE2,LONDON
1,Acton,"Ealing, Hammersmith and Fulham",W3,LONDON
2,Acton,"Ealing, Hammersmith and Fulham",W4,LONDON
3,Angel,Islington,EC1,LONDON
4,Angel,Islington,N1,LONDON


We'll be using only London Town for this project. Filter out none-london town location

In [50]:
df_source_london = df_source
df_source_london = df_source_london[df_source_london['Town'].str.contains('LONDON')]
df_source_london.head(5)

Unnamed: 0,Location,Borough,Postcode,Town
0,Abbey Wood,"Bexley, Greenwich",SE2,LONDON
1,Acton,"Ealing, Hammersmith and Fulham",W3,LONDON
2,Acton,"Ealing, Hammersmith and Fulham",W4,LONDON
3,Angel,Islington,EC1,LONDON
4,Angel,Islington,N1,LONDON


Since we only not have Towns = London, Drop column 'Town' from dataframe

In [51]:
df_source_london_data = df_source_london[['Location', 'Borough', 'Postcode']].reset_index(drop=True)
df_source_london_data

Unnamed: 0,Location,Borough,Postcode
0,Abbey Wood,"Bexley, Greenwich",SE2
1,Acton,"Ealing, Hammersmith and Fulham",W3
2,Acton,"Ealing, Hammersmith and Fulham",W4
3,Angel,Islington,EC1
4,Angel,Islington,N1
5,Church End,Brent,NW10
6,Church End,Barnet,N3
7,Clapham,"Lambeth, Wandsworth",SW4
8,Clerkenwell,Islington,EC1
9,Colindale,Barnet,NW9


In [52]:
df_London = df_source_london_data
df_London.to_csv('LondonLocations.csv', index = False)

We will be using Geocoder library to obtain the longitude and latitude of each locator

In [53]:
#defind function getLngLat
def get_latlng(arcgis_geocoder):
    lat_lng_coords = None
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, London, United Kingdom'.format(arcgis_geocoder))
        lat_lng_coords = g.latlng
    return lat_lng_coords

In [54]:
# Test getLngLat function
testResult = get_latlng('EC1')
testResult

[51.523610000000076, -0.09876999999994496]

In [55]:
#reave geocoding using geocodefarm geocoder
gg = geocoder.geocodefarm(testResult, method = 'reverse')
gg

<[OK] Geocodefarm - Reverse [84 Goswell Road, London, EC1V 7DB, United Kingdom]>

Get time to get lat and long of all location

In [56]:
start = time.time()

post_codes =df_London['Postcode']
coordinates = [get_latlng(post_code) for post_code in post_codes.tolist()]

end = time.time()
print("Time of execution: ", end - start, "seconds")

Time of execution:  239.14979457855225 seconds


In [57]:
df_london_loc = df_London

df_london_coordinates = pd.DataFrame(coordinates, columns = ['Latitude', 'Longitude'])
df_london_loc['Latitude'] = df_london_coordinates['Latitude']
df_london_loc['Longitude'] = df_london_coordinates['Longitude']
df_london_loc.head(5)

Unnamed: 0,Location,Borough,Postcode,Latitude,Longitude
0,Abbey Wood,"Bexley, Greenwich",SE2,51.49245,0.12127
1,Acton,"Ealing, Hammersmith and Fulham",W3,51.51324,-0.26746
2,Acton,"Ealing, Hammersmith and Fulham",W4,51.48944,-0.26194
3,Angel,Islington,EC1,51.52361,-0.09877
4,Angel,Islington,N1,51.53792,-0.09983


In [58]:
df_london_loc.to_csv('LondonLocationsAndCoordinates.csv', index=False)
df_london_loc.shape

(381, 5)

In [59]:
df_read_london = pd.read_csv('LondonLocationsAndCoordinates.csv')
df_read_london.head()

Unnamed: 0,Location,Borough,Postcode,Latitude,Longitude
0,Abbey Wood,"Bexley, Greenwich",SE2,51.49245,0.12127
1,Acton,"Ealing, Hammersmith and Fulham",W3,51.51324,-0.26746
2,Acton,"Ealing, Hammersmith and Fulham",W4,51.48944,-0.26194
3,Angel,Islington,EC1,51.52361,-0.09877
4,Angel,Islington,N1,51.53792,-0.09983


In [46]:
borough_colors = {i: '#%02X%02X%02X' % tuple(np.random.choice(range(256),size=3)) for i in df_london_loc['Postcode'].unique()}
borough_colors

map_london = folium.Map(location=get_latlng('London'), zoom_start=11)

# #add makers to map
for lat, lng, Location, Borough in zip(df_london_loc['Latitude'], df_london_loc['Longitude'], df_london_loc['Borough'], df_london_loc['Postcode']):
        label = '{}, {}'.format(Location, Borough)
        label = folium.Popup(label, parse_html=True)
        folium.CircleMarker(
            [lat, lng], 
            radius=5, 
            popup=label, 
            color=borough_colors[Borough], 
            fill=True, 
            fill_color=borough_colors[Borough], 
            fill_opacity=0.7, 
            parse_html=False).add_to(map_london)
map_london