#### Import dependencies

In [33]:
!pip -q install folium
print('folium installed...')
import folium 
print('folium imported...')

#library to handle data in a vectorized manner
import numpy as np
print('numpy imported...')

#library for data analysis
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
print('pandas imported...')

#library to handle JSON file
import json
print('json imported...')

!pip -q install geopy
# conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
print('geopy installed...')
# convert an address into latitude and longitude values
from geopy.geocoders import Nominatim
print('Nominatim imported...')

# library to handle request
import requests
print('requests imported...')

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
%matplotlib inline
print('matplotlib imported...')

# transform JSON file into a pandas dataframe
from pandas.io.json import json_normalize
print('json_normalize imported...')

# library used for data visualization
import seaborn as sns
print('seaborn imported...')

# import k-means from clustering stage
from sklearn.cluster import KMeans
print('k-means imported...')

#import time
import time
print('time imported...')

# library for pulling data out of HTML and XML files
from bs4 import BeautifulSoup
print('BeautifulSoup imported...')

print('DONE - All Dependencies imported...')

folium installed...
folium imported...
numpy imported...
pandas imported...
json imported...
geopy installed...
Nominatim imported...
requests imported...
matplotlib imported...
json_normalize imported...
seaborn imported...
k-means imported...
time imported...
BeautifulSoup imported...
DONE - All Dependencies imported...


#### Retrieve Data from Data source

In [34]:
# Download the Wikipedia page that lists the areas of London.
wiki_data_source_link = 'https://en.wikipedia.org/wiki/List_of_areas_of_London'

# A browser-like User-Agent header is sent along with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:64.0) Gecko/20100101 Firefox/64.0'}

# requests has no default timeout; without one this cell can hang indefinitely
# when the server does not answer.
wikpedia_page = requests.get(wiki_data_source_link, headers=headers, timeout=30)

# Fail here on a 4xx/5xx answer instead of silently parsing an error page in the next cell.
wikpedia_page.raise_for_status()

# Echo the response object so the status code is visible in the cell output.
wikpedia_page

<Response [200]>

In [35]:
# Parse the downloaded HTML into a navigable tree
soup = BeautifulSoup(wikpedia_page.content, 'html.parser')

# Take the <tbody> of the first table whose class attribute is 'wikitable sortable'
table = soup.find('table', attrs={'class': 'wikitable sortable'}).tbody

# Collect every <tr> of that table; the first one holds the column headers
rows = table.find_all('tr')

# Column names come from the <th> cells of the first row, with any '\n' removed
columns = [th.text.replace('\n', '') for th in rows[0].find_all('th')]

# Start from an empty dataframe that only carries those column names
df = pd.DataFrame(columns=columns)
df

Unnamed: 0,Location,London borough,Post town,Postcode district,Dial code,OS grid ref


In [36]:
# Turn every data row of the table into one record and build "df" in a single step.
#
# * '\n' and '\xa0' are stripped from every cell. The previous 7-cell branch
#   passed ''.replace('\xa0', '') (which is just '') as the replacement string,
#   so '\xa0' was never removed there, and its values were never appended.
# * Rows whose number of <td> cells differs from the number of column headers
#   are skipped; pairing them with the header would raise a length mismatch.
# * The records are collected in a list first: DataFrame.append was removed in
#   pandas 2.0 and growing a frame row by row is quadratic.
# * "df" is rebuilt from scratch, so re-running this cell does not duplicate rows.
records = []
for row in rows[1:]:  # rows[0] is the header row
    cells = [td.text.replace('\n', '').replace('\xa0', '') for td in row.find_all('td')]
    if len(cells) == len(columns):
        records.append(cells)

df = pd.DataFrame(records, columns=columns)

In [37]:
# Preview the first five rows of the scraped table
df.head()

Unnamed: 0,Location,London borough,Post town,Postcode district,Dial code,OS grid ref
0,Abbey Wood,"Bexley, Greenwich [7]",LONDON,SE2,20,TQ465785
1,Acton,"Ealing, Hammersmith and Fulham[8]",LONDON,"W3, W4",20,TQ205805
2,Addington,Croydon[8],CROYDON,CR0,20,TQ375645
3,Addiscombe,Croydon[8],CROYDON,CR0,20,TQ345665
4,Albany Park,Bexley,"BEXLEY, SIDCUP","DA5, DA14",20,TQ478728


In [40]:
# Rename the column headers to shorter names.
# Some of the scraped header texts contain a non-breaking space ('\xa0'), so the
# keys below must spell it out to match. index=str additionally passes every
# index label through str().
df = df.rename(
    index=str,
    columns={
        'Location': 'Location',
        'London\xa0borough': 'Borough',
        'Post town': 'Town',
        'Postcode\xa0district': 'Postcode',
        'Dial\xa0code': 'Dial Code',
        'OS grid ref': 'OSGridRe',
    },
)

In [41]:
# Check the column headers after renaming
df.head()

Unnamed: 0,Location,Borough,Town,Postcode,Dial Code,OSGridRe
0,Abbey Wood,"Bexley, Greenwich [7]",LONDON,SE2,20,TQ465785
1,Acton,"Ealing, Hammersmith and Fulham[8]",LONDON,"W3, W4",20,TQ205805
2,Addington,Croydon[8],CROYDON,CR0,20,TQ375645
3,Addiscombe,Croydon[8],CROYDON,CR0,20,TQ345665
4,Albany Park,Bexley,"BEXLEY, SIDCUP","DA5, DA14",20,TQ478728


In [42]:
# Remove Wikipedia citation markers such as "[7]" or "[10]" from the borough names.
#
# A regular expression is used instead of chained rstrip() calls because
# rstrip('123456789') does not strip the digit 0 (so "[10]" was left as "[10"),
# only a single trailing marker was handled, and the blank in front of a marker
# such as " [7]" was left behind. The final .str.strip() drops any remaining
# leading/trailing whitespace.
df['Borough'] = (
    df['Borough']
    .str.replace(r'\s*\[\d+\]', '', regex=True)
    .str.strip()
)
df.head(5)

Unnamed: 0,Location,Borough,Town,Postcode,Dial Code,OSGridRe
0,Abbey Wood,"Bexley, Greenwich",LONDON,SE2,20,TQ465785
1,Acton,"Ealing, Hammersmith and Fulham",LONDON,"W3, W4",20,TQ205805
2,Addington,Croydon,CROYDON,CR0,20,TQ375645
3,Addiscombe,Croydon,CROYDON,CR0,20,TQ345665
4,Albany Park,Bexley,"BEXLEY, SIDCUP","DA5, DA14",20,TQ478728
