# Get London Neighborhood data

### Required libraries

In [1]:
# Libraries & Imports
# !conda install -c conda-forge beautifulsoup4 --yes
!conda install -c conda-forge geopy --yes
from bs4 import BeautifulSoup
import requests
import pandas as pd
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import time

Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.0.2p             |       h470a237_1         3.1 MB  conda-forge
    certifi-2018.10.15         |        py36_1000         138 KB  conda-forge
    geopy-1.17.0               |             py_0          49 KB  conda-forge
    ca-certificates-2018.10.15 |       ha4d7672_0         135 KB  conda-forge
    conda-4.5.11               |        py36_1000         651 KB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         4.1 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.49-py_0            conda-forge
    geopy:           

### Function to clean neighborhood values

In [2]:
# Function to clean the London neighborhood name of extraneous information
# e.g. and as in 'Bromley Common and Keston'
# e.g. comma as in 'Ham, Petersfield'
# e.g. bracket as in 'Aldersgate (incl. Cheap)'
def clean_london_name(name):
    pos_comma = name.find(',')
    pos_and = name.find(' and ')
    pos_bracket = name.find('(')
    pos_with = name.find(' with ')
    pos_amp = name.find(' & ')
    if pos_bracket != -1:
        pos = pos_bracket
    elif pos_and != -1:
        pos = pos_and
    elif pos_with != -1:
        pos = pos_with
    elif pos_amp != -1:
        pos = pos_amp
    elif pos_comma != -1:
        pos = pos_comma
    else:
        pos = len(name)
    revised = name[0:pos].strip()
    
    # some specific substitutions
    if revised == 'Holland':
        revised = 'Holland Park'
    if revised == 'Chelsea Riverside':
        revised = 'Chelsea'
    if revised == 'Farringdon Within':
        revised = 'Farringdon'
    
    # print('Input: >{}<  Output: >{}<'.format(name, revised))
    return revised

### Get contents of London web page

In [3]:
r  = requests.get("https://www.citypopulation.de/php/uk-wards-london.php")
data = r.text
soup = BeautifulSoup(data, "lxml")

In [4]:
def get_ll(address):
    time.sleep(1) # wait a second, to conform with Nominatim's terms of use
    geolocator = Nominatim(user_agent="tommccann")
    location = geolocator.geocode(address)
    
    # Need a test for location = None here
    if location:
        latitude = location.latitude
        longitude = location.longitude
        return([latitude, longitude])
    else:
        # return -1 values for long and lat to indicate no coordinates returned
        return [-1,-1]

### Create a DataFrame and add neighborhood coordinates

In [5]:
# Create the dataframe
dfLondon = pd.DataFrame(columns=['Borough', 'Neighborhood', 'Latitude', 'Longitude'])

table = soup.find("table", {"id" : "tl"})

table_rows = table.find_all('tr')

borough = ''
for tr in table_rows:
    tds = tr.find_all('td')
    row = [td.text.strip() for td in tds]  # strip() removes the newline
    if row:  # removes empty row (the table headings)
        if row[1] == 'Borough':
            borough = row[0]
        elif row[1] == 'Ward':
            wardname = clean_london_name(row[0])
            dfLondon.loc[len(dfLondon)] = [borough, wardname, 0, 0] # dummy values for lat, long for now
            
# How many valid neighborhoods did we get?
dfLondon.shape

(637, 4)

### Select only valid central London neighborhoods

In [7]:
# Narrow it down to central London boroughs
central_boroughs = pd.Series(['Camden',
                            'Greenwich',
                            'Hackney'
                            'Hammersmith and Fulham',
                            'Islington',
                            'Kensington and Chelsea',
                            'Lambeth',
                            'Lewisham',
                            'Southwark',
                            'Tower Hamlets',
                            'Wandsworth',
                            'Westminster'])

# Create another DataFrame of central London boroughs only 
dfCentralBoroughs = dfLondon[dfLondon['Borough'].isin(central_boroughs)]
                           
# How many neighborhoods did we get?
dfCentralBoroughs.shape

(177, 4)

### Get geocordinates for neighborhoods

In [13]:
# Iterate through the central london neighborhoods and store lat, long
for index, row in dfCentralBoroughs.iterrows():
    address = row['Neighborhood'] + ', London'
    [lat, long] = get_ll(address)
    if lat != -1 and long != -1: # valid neighborhoods
        row['Latitude'] = lat
        row['Longitude'] = long
        # print('Cordinates found for {}, {} - long:{}, lat:{}'.format(row['Borough'], row['Neighborhood'], long, lat))

dfCentralBoroughs.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
102,Camden,Belsize,51.5474,-0.1683
103,Camden,Bloomsbury,51.5168,-0.125741
104,Camden,Camden Town,51.5418,-0.139128
105,Camden,Cantelowes,51.5469,-0.133241
106,Camden,Fortune Green,51.5546,-0.197622


### Filter out anything wrongly geocoded as being outside the greater London area

In [14]:
# Filter out anything that was misclassified outside the London area between 51.63,-0.47 and 51.33,0.13
dfCentralLondon = dfCentralBoroughs[(dfCentralBoroughs['Latitude'] > 51.33) & (dfCentralBoroughs['Latitude'] < 51.63)
                           & (dfCentralBoroughs['Longitude'] > -0.47) & (dfCentralBoroughs['Longitude'] < 0.13)]

# How many do we end up with?
dfCentralLondon.shape

(150, 4)

### Look at the results

In [15]:
dfCentralLondon.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
102,Camden,Belsize,51.5474,-0.1683
103,Camden,Bloomsbury,51.5168,-0.125741
104,Camden,Camden Town,51.5418,-0.139128
105,Camden,Cantelowes,51.5469,-0.133241
106,Camden,Fortune Green,51.5546,-0.197622


### Cache the central London neighborhood data in a csv file for use later

In [16]:
# Cache the Central London DataFrame
dfCentralLondon.to_csv('/resources/data/CentralLondon.csv', columns=['Borough','Neighborhood','Latitude','Longitude'], index=False)