# IBM Data Science Professional
**Week 4 assignment**

Mapping London crimes and schools to understand the safest areas

In [0]:
# import libraries
import pandas as pd
import numpy as np
import folium
import requests
import os
import json

from geopy import Nominatim
from sklearn import metrics
from sklearn.cluster import KMeans
from folium.plugins import HeatMap
from folium import plugins
from pandas.io.json import json_normalize
from branca.colormap import linear



### Get geo locations

We'll use London tube station latitude and longitude to query the Foursquare API.


In [0]:
# Read every HTML table on the OpenStreetMap wiki page that lists the London
# Underground stations, then preview the first table.
link = 'https://wiki.openstreetmap.org/wiki/List_of_London_Underground_stations'
tables = pd.read_html(link)

tables[0].head()

Unnamed: 0,Name,Latitude,Longitude,Platform / Entrance,Collected By,Collected On,Line,Step free
0,Acton Town,51.5025,-0.278126,Platform,User:Gagravarr,24/11/06,"District, Piccadilly",
1,Acton Central,51.50883531,-0.263033174,Entrance,User:Firefishy,08/05/2007,London Overground,
2,Acton Central,51.50856013,-0.262879534,Platform,User:Firefishy,08/05/2007,London Overground,
3,Aldgate,51.51394,-0.07537,Aldgate High Street entrance,User:Morwen,28/4/2007,Metropolitan,No
4,Aldgate East,51.51514,-0.07178,Entrance,User:Parsingphase,(2006),"District, Hammersmith & City",


In [0]:
# Keep only the station name and coordinates, one row per station name,
# and discard rows with any missing value.
df_tube_stations = (
    tables[0]
    .loc[:, ['Name', 'Latitude', 'Longitude']]
    .drop_duplicates(['Name'])
    .dropna(axis=0)
)

df_tube_stations.describe()

Unnamed: 0,Name,Latitude,Longitude
count,291,291.0,291.0
unique,291,289.0,291.0
top,Brondesbury Park,51.57565,-0.20931
freq,1,2.0,1.0


### Get crime data

We'll use this data for the choropleth map.

In [2]:
from pandas import DataFrame

London total crime data summarized from london.gov

In [3]:

# Total recorded crime per London borough, kept as (borough, amount) pairs so
# each figure stays next to the borough it belongs to.
# 'Camden' was previously misspelled 'Camdenm', which would stop that borough
# from matching a borough name in the boundary data used for the choropleth.
_borough_totals = [
    ('Barking and Dagenham', 44195),
    ('Barnet', 47586),
    ('Bexley', 30686),
    ('Brent', 65867),
    ('Bromley', 47917),
    ('Camden', 53753),
    ('City of London', 68705),
    ('Croydon', 69268),
    ('Ealing', 70484),
    ('Enfield', 45031),
    ('Greenwich', 58298),
    ('Hackney', 60485),
    ('Hammersmith and Fulham', 44741),
    ('Haringey', 53458),
    ('Harrow', 30917),
    ('Havering', 35961),
    ('Hillingdon', 54712),
    ('Hounslow', 53622),
    ('Islington', 55426),
    ('Kensington and Chelsea', 30058),
    ('Kingston upon Thames', 22018),
    ('Lambeth', 74856),
    ('Lewisham', 66854),
    ('Merton', 28902),
    ('Newham', 68493),
    ('Redbridge', 43173),
    ('Richmond upon Thames', 21054),
    ('Southwark', 71844),
    ('Sutton', 25796),
    ('Tower Hamlets', 63523),
    ('Waltham Forest', 52481),
    ('Wandsworth', 47956),
    ('Westminster', 78522),
]

# same {'Borough': [...], 'Amount': [...]} layout as before
crimelondon = {
    'Borough': [borough for borough, _ in _borough_totals],
    'Amount': [amount for _, amount in _borough_totals],
}
df_london_crime = DataFrame(crimelondon, columns=['Borough', 'Amount'])
print(df_london_crime)

                   Borough  Amount
0     Barking and Dagenham   44195
1                   Barnet   47586
2                   Bexley   30686
3                    Brent   65867
4                  Bromley   47917
5                  Camdenm   53753
6           City of London   68705
7                  Croydon   69268
8                   Ealing   70484
9                  Enfield   45031
10               Greenwich   58298
11                 Hackney   60485
12  Hammersmith and Fulham   44741
13                Haringey   53458
14                  Harrow   30917
15                Havering   35961
16              Hillingdon   54712
17                Hounslow   53622
18               Islington   55426
19  Kensington and Chelsea   30058
20    Kingston upon Thames   22018
21                 Lambeth   74856
22                Lewisham   66854
23                  Merton   28902
24                  Newham   68493
25               Redbridge   43173
26    Richmond upon Thames   21054
27               Sou

In [0]:
# preview the first rows of the borough crime table
df_london_crime.head()

In [0]:
# `df_london_crime` is the summarised london.gov table: one row per borough
# with a single 'Amount' total. It has no 'major_category' or lowercase
# 'borough' column, so the earlier per-category filter and groupby raised a
# KeyError. The figures are total crime, not only 'Violence Against the
# Person'; the `*_violent_crime` names are kept because later cells use them.
df_violent_crime = df_london_crime.copy()

# one row per borough, with the total exposed as 'Total_Crimes'
df_borough_violent_crime = (
    df_violent_crime
    .groupby('Borough', as_index=False)['Amount'].sum()
    .rename(columns={'Amount': 'Total_Crimes'})
)

df_borough_violent_crime.head()

In [0]:
# Series indexed by borough holding each borough's total; it is the lookup
# used to bind values to map features in the choropleth
crime_dict = df_borough_violent_crime.set_index('Borough').loc[:, 'Total_Crimes']

crime_dict



### Get data from Foursquare API

We'll use the venue category ID for *Elementary School* from the API [resources](https://developer.foursquare.com/docs/resources/categories).

#### Test case

API Query parameters:

In [4]:
#@hidden_cell
# API query parameters
# Credentials are read from the environment so they are never stored in the
# notebook. NOTE: the key pair that used to be hard-coded in this cell should
# be treated as compromised and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET')
if not (CLIENT_ID and CLIENT_SECRET):
    # warn instead of raising so the rest of the notebook can still be read and run
    print('Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before calling the Foursquare API')

VERSION = '20190812'  # sent as the `v` query parameter
CATEGORY_ID = '4f4533804b9074f6e4fb0105'  # venue category used for the school search

In [0]:
# Parameters for one sample category search centred on London
limit = 10
radius = 1500
intent = 'browse'

# geocode the search centre
geolocator = Nominatim(user_agent='IBM Capstone')
london_location = geolocator.geocode('London, UK')

# assemble the search-endpoint URL from named fields
url = (
    'https://api.foursquare.com/v2/venues/search?&client_id={client_id}'
    '&client_secret={client_secret}&v={version}&ll={lat},{lng}'
    '&radius={radius}&limit={limit}&intent={intent}'
    '&categoryId={category_id}'
).format(
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    version=VERSION,
    lat=london_location.latitude,
    lng=london_location.longitude,
    radius=radius,
    limit=limit,
    intent=intent,
    category_id=CATEGORY_ID,
)

In [0]:
# Fetch the sample search and decode the JSON body
result = requests.get(url).json()

# keep the raw venue records found under response -> venues
data_school = list(result['response']['venues'])


In [0]:
# number of venues returned by the sample search
len(result['response']['venues'])

In [0]:
# Flatten the nested venue records into a DataFrame and show the first two rows
df_school_example = json_normalize(data_school)
df_school_example.iloc[:2]

In [0]:
# Keep the venue name and coordinates, then strip the JSON path prefix from
# the column labels (e.g. 'location.lat' -> 'lat')
filtered_columns = ['name', 'location.lat', 'location.lng']
df_school_filtered = (
    df_school_example
    .loc[:, filtered_columns]
    .rename(columns=lambda col: col.split(".")[-1])
)

df_school_filtered.head()

#### Generalise process and apply to London locations

Select a random subset of locations from the tube stations dataframe to make the API request more manageable.

In [0]:
# function to generalise data extraction and return a dataframe
def get_venues(names, latitudes, longitudes,
               radius=2000, limit=10,
               intent='browse'):
    """Search Foursquare around each location and tabulate the venues found.

    One GET request is sent to the ``venues/search`` endpoint per
    ``(name, latitude, longitude)`` triple, using the module-level
    ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and ``CATEGORY_ID``.

    Parameters
    ----------
    names, latitudes, longitudes : iterable
        Location labels and coordinates, consumed in lockstep with ``zip``.
    radius, limit, intent
        Passed through unchanged as the ``radius``, ``limit`` and ``intent``
        query parameters.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns ``name``, ``lat``,
        ``lng`` (the venue) and ``Location``, ``Location Latitude``,
        ``Location Longitude`` (the location that was searched). The frame is
        empty, with the same columns, when nothing is found or no locations
        are given.
    """
    columns = ['name', 'lat', 'lng',
               'Location', 'Location Latitude', 'Location Longitude']

    # same request URL as before; only the coordinates change per location
    url_template = ('https://api.foursquare.com/v2/venues/search?&client_id={}'
                    '&client_secret={}&v={}&ll={},{}&radius={}&limit={}&intent={}'
                    '&categoryId={}')

    records = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        url = url_template.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            limit,
            intent,
            CATEGORY_ID)

        # make the GET request
        results = requests.get(url).json()

        # an error payload may lack 'response'/'venues'; treat it as no results
        venues = results.get('response', {}).get('venues', [])
        if not venues:
            print('No schools found near {}'.format(name))
            continue

        for venue in venues:
            location = venue.get('location', {})
            records.append({
                'name': venue.get('name'),
                'lat': location.get('lat'),
                'lng': location.get('lng'),
                'Location': name,
                'Location Latitude': lat,
                'Location Longitude': lng,
            })

    # build the frame once, after all requests, instead of once per location
    return pd.DataFrame(records, columns=columns)

In [0]:
# sample 20% of the stations (frac=.2; the old comment said 30%); the fixed
# seed makes every run select the same subset
df_tube_stations_sample = df_tube_stations.sample(frac=.2, random_state=42)

df_tube_stations_sample.describe()

Retrieve schools in the proximity of the tube stations subset.

In [0]:
# Query Foursquare once per sampled station and collect the venues found
df_schools = get_venues(
    names=df_tube_stations_sample['Name'],
    latitudes=df_tube_stations_sample['Latitude'],
    longitudes=df_tube_stations_sample['Longitude'],
)

df_schools.head()

In [0]:
# Collapse to one row per station: keep the station coordinates and the first
# venue's coordinates, and join every venue name into a single string
df_schools_grouped = (
    df_schools
    .groupby('Location')
    .agg({
        'Location Latitude': 'first',
        'Location Longitude': 'first',
        'lat': 'first',
        'lng': 'first',
        'name': ', '.join,
    })
    .reset_index()
)

df_schools_grouped.head()

In [0]:
# cast lat and long as float
df_schools_grouped = df_schools_grouped.astype({'Location Latitude': 'float',
                                                'Location Longitude': 'float',
                                                'lat': 'float',
                                                'lng': 'float'})

### Create map

In [0]:
# cast lat and long as float
df_tube_stations_sample = df_tube_stations_sample.astype({'Latitude': 'float',
                                                          'Longitude': 'float'})

In [0]:
# Base map centred on the geocoded London location
london_map = folium.Map(
    location=[london_location.latitude, london_location.longitude],
    zoom_start=10,
    control_scale=True,
    width='100%',
    height='100%',
    tiles='CartoDB positron',
)

# add tube stations
station_rows = df_tube_stations_sample[['Name', 'Latitude', 'Longitude']]
for name, lat, lng in station_rows.itertuples(index=False, name=None):
    marker = folium.CircleMarker([lat, lng],
                                 radius=2,
                                 popup=name,
                                 color='#001528')
    marker.add_to(london_map)

In [0]:
# add schools 
for index, row in df_schools_grouped.iterrows():
    folium.CircleMarker([row['lat'], row['lng']],
                        radius=2,
                        popup=row['name'],
                        color='red').add_to(london_map)

In [0]:
# render the map with the markers added so far
london_map

![London map](https://i.imgur.com/rbmhqfR.png)



#### Choropleth map

In [0]:
# load geo.json
london_geo = os.path.join('/content/drive/My Drive/datasets/', 'london_boroughs_proper.geojson')

# set colormap
colormap = linear.YlGn_09.scale(df_borough_violent_crime.Total_Crimes.min(), 
                                df_borough_violent_crime.Total_Crimes.max())

print(colormap(5.0))
colormap

#geo_json_data = json.loads('/content/drive/My Drive/datasets/london_boroughs_proper.json')

In [0]:
# add choropleth to the map
london_map.choropleth(geo_data=london_geo,
                    data=crime_dict,
                    columns=['Borough', 'Total_Crimes'],
                    fill_color='YlGn',
                    key_on='feature.properties.name',
                    fill_opacity=0.6,
                    line_opacity=0.2,
                    legend_name='Crime')
                    
folium.LayerControl().add_to(london_map)
                   
                   
london_map.save(os.path.join('/content/drive/My Drive/datasets/',
                             'london_crime_choropleth.html'))              
                   

In [0]:
# render the map again, now including the choropleth layer
london_map

![London Crime Map](https://i.imgur.com/L2pCmds.png)
