In [1]:
!conda install -c anaconda beautifulsoup4 
!conda install -c anaconda requests
!conda install -c conda-forge geocoder 
!conda install -c conda-forge folium=0.5.0 --yes 
!conda install -c conda-forge geopy --yes 

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - beautifulsoup4


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    beautifulsoup4-4.7.1       |           py36_1         143 KB  anaconda
    certifi-2019.6.16          |           py36_0         154 KB  anaconda
    openssl-1.1.1              |       h7b6447c_0         5.0 MB  anaconda
    ca-certificates-2019.5.15  |                0         133 KB  anaconda
    ------------------------------------------------------------
                                           Total:         5.4 MB

The following packages will be UPDATED:

    beautifulsoup4:  4.7.1-py36_1      --> 4.7.1-py36_1     anaconda
    ca-certificates: 2019.5.15-0       --> 2019.5.15-0      anaconda
    certifi:         2019.6.16-py36_0  --> 2019.6.16-py36_0 anaconda
    openssl:         1.1.1c-h7b

### Importing libraries

In [2]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import geocoder # import geocoder
import folium 
from geopy.geocoders import Nominatim
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import re
print('Libraries Imported')

Libraries Imported


### Use the BeautifulSoup package to extract the neighbourhoods of Washington, D.C. from Wikipedia

In [3]:
# Download the Wikipedia article that lists the neighbourhoods of Washington, D.C.
wiki_response = requests.get('https://en.wikipedia.org/wiki/Neighborhoods_in_Washington,_D.C.')
wiki_response.raise_for_status()  # stop here instead of parsing an HTTP error page
website_url = wiki_response.text  # despite the name, this holds the page HTML
soup = BeautifulSoup(website_url, 'lxml')

# Collect the <li> items of the 3rd to 10th <ul> elements on the page.
# NOTE(review): these positions are assumed to be the neighbourhood lists and
# depend on the page layout at scrape time -- confirm against the live page.
headers = soup.find_all('ul')
links = []
for neighbourhood_list in headers[2:10]:  # slicing tolerates pages with fewer lists
    links.extend(neighbourhood_list.find_all('li'))

neighbourhoods = []
seen_neighbourhoods = set()  # constant-time duplicate check; the list keeps first-seen order
for item in links:
    anchor = item.find('a')
    if anchor is not None:
        # Linked entry: use the link title, cut at the first " (" or ", "
        # (e.g. "Name (Washington, D.C.)" -> "Name"). Fall back to the link
        # text when the anchor carries no title attribute.
        title = anchor.get('title') or anchor.get_text()
        name = re.split(r' \(|, ', title)[0]
    else:
        # Plain entry without a hyperlink: use the list item text as is.
        name = item.text
    if name not in seen_neighbourhoods:
        seen_neighbourhoods.add(name)
        neighbourhoods.append(name)

wash_dc_neighbourhoods = pd.DataFrame({'Neighbourhoods': neighbourhoods})
wash_dc_neighbourhoods.head()

Unnamed: 0,Neighbourhoods
0,Adams Morgan
1,Columbia Heights
2,Howard University
3,Kalorama
4,LeDroit Park


In [4]:
# Report how many neighbourhoods were scraped (one row per neighbourhood).
neighbourhood_count = len(wash_dc_neighbourhoods.index)
print('There are {} neighbourhoods in Washington D.C.'.format(neighbourhood_count))

There are 131 neighbourhoods in Washington D.C.


### Use Nominatim from OpenStreetMap to retrieve latitude and longitude of the neighborhoods

In [5]:
# Geocode every neighbourhood with the Nominatim search endpoint.
# Neighbourhoods for which Nominatim returns no match are removed from the frame.
neighbourhoodlatitude = []
neighbourhoodlongitude = []
unresolved_index = []  # index labels of neighbourhoods without any match
for index, name in wash_dc_neighbourhoods['Neighbourhoods'].items():
    # Passing the query through `params` URL-encodes it, so names containing
    # characters such as '&', '#' or '+' cannot corrupt the query string.
    response = requests.get(
        'https://nominatim.openstreetmap.org/search.php',
        params={'q': name + ', Washington D.C.', 'format': 'json', 'polygon': 0},
        headers={'User-Agent': 'dc_explorer'},  # same identifier as the geopy geocoder below
    )
    response.raise_for_status()
    matches = response.json()
    if len(matches) == 0:
        unresolved_index.append(index)
        continue
    best_match = matches[0]  # keep the first match returned
    neighbourhoodlatitude.append(float(best_match['lat']))
    neighbourhoodlongitude.append(float(best_match['lon']))

# Drop the unresolved rows after the loop so the frame is not modified while
# it is being iterated; the remaining rows line up with the coordinate lists.
wash_dc_neighbourhoods = wash_dc_neighbourhoods.drop(index=unresolved_index)
wash_dc_neighbourhoods['Latitude'] = neighbourhoodlatitude
wash_dc_neighbourhoods['Longitude'] = neighbourhoodlongitude
wash_dc_neighbourhoods.head()

Unnamed: 0,Neighbourhoods,Latitude,Longitude
0,Adams Morgan,38.9215,-77.042199
1,Columbia Heights,38.925672,-77.029429
2,Howard University,38.921525,-77.019536
3,Kalorama,38.916778,-77.052477
4,LeDroit Park,38.915907,-77.015721


### Retrieve latitude, longitude of Washington, D.C.

In [6]:
# Look up the coordinates of the city itself; they are used to centre the maps.
geolocator = Nominatim(user_agent="dc_explorer")
address = 'Washington, D.C.'
wash_dc_location = geolocator.geocode(address)
wash_dc_latitude, wash_dc_longitude = wash_dc_location.latitude, wash_dc_location.longitude
print(
    'The geographical co-ordinates of Washington, D.C. are {}, {}.'.format(
        wash_dc_latitude,
        wash_dc_longitude,
    )
)

The geographical co-ordinates of Washington, D.C. are 38.8950092, -77.0365625.


### Plot the neighbourhoods of Washington, D.C. on a map using the Folium package

In [7]:
# Base map centred on the coordinates of Washington, D.C.
map_washington_dc = folium.Map(
    location=[wash_dc_latitude, wash_dc_longitude],
    zoom_start=12,
)

# One circle marker per neighbourhood; the popup shows the neighbourhood name.
neighbourhood_points = zip(
    wash_dc_neighbourhoods['Latitude'],
    wash_dc_neighbourhoods['Longitude'],
    wash_dc_neighbourhoods['Neighbourhoods'],
)
for point_lat, point_lng, point_name in neighbourhood_points:
    neighbourhood_marker = folium.CircleMarker(
        [point_lat, point_lng],
        radius=5,
        popup=folium.Popup(point_name, parse_html=True),
        color='navy',
        fill=True,
        fill_color='royalblue',
        fill_opacity=0.7,
        parse_html=False,
    )
    neighbourhood_marker.add_to(map_washington_dc)

map_washington_dc

### Foursquare API inputs

In [8]:
import os

# Foursquare credentials are read from the environment so that secrets are not
# stored in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel. Credentials that were previously committed in
# this cell should be treated as leaked and regenerated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
if not (CLIENT_ID and CLIENT_SECRET):
    print('Warning: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; Foursquare requests will fail.')

VERSION = '20190801'  # Foursquare API version
# Limit of number of venues returned by Foursquare API.
# NOTE(review): the venue counts shown further down top out at 50 per
# neighbourhood -- confirm the maximum the API actually honours.
LIMIT = 250

### Function to repeat the Foursquare API search query for all the neighbourhoods in Washington, D.C. for different venue types

In [9]:
import urllib
def getNearbyVenues(neighbourhood, latitudes, longitudes, categoryIds, radius):
    """Query the Foursquare venue search once per neighbourhood and tabulate the hits.

    Parameters
    ----------
    neighbourhood : iterable
        Neighbourhood names; iterated in parallel with the coordinates.
    latitudes, longitudes : iterable
        Coordinates sent as the ``ll`` query parameter for each neighbourhood.
    categoryIds : str
        Value sent as the ``categoryId`` query parameter.
    radius : number
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighbourhoods',
        'Latitude', 'Longitude', 'Venue_Name', 'Venue_Latitude',
        'Venue_Longitude' and 'Venue_Category'. The frame is empty (but has
        these columns) when nothing was found.

    Notes
    -----
    Uses the notebook-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    A neighbourhood whose request fails is reported and skipped instead of
    aborting the whole search; the message omits the request URL because it
    contains the client secret.
    """
    columns = ['Neighbourhoods', 'Latitude', 'Longitude', 'Venue_Name',
               'Venue_Latitude', 'Venue_Longitude', 'Venue_Category']
    venue_rows = []

    for name, lat, lng in zip(neighbourhood, latitudes, longitudes):
        # Let requests build and encode the query string.
        query = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
            'categoryId': categoryIds,
        }
        try:
            response = requests.get('https://api.foursquare.com/v2/venues/search',
                                    params=query, timeout=30)
            response.raise_for_status()
            results = response.json()['response']['venues']
        except (requests.RequestException, ValueError, KeyError, TypeError) as error:
            print('Foursquare request failed for {} ({}, {}): {}'.format(
                name, lat, lng, type(error).__name__))
            continue

        # Keep only the relevant fields; venues that lack a category (or any
        # other required field) are skipped.
        for venue in results:
            try:
                venue_rows.append((
                    name, lat, lng,
                    venue['name'],
                    venue['location']['lat'],
                    venue['location']['lng'],
                    venue['categories'][0]['name'],
                ))
            except (KeyError, IndexError, TypeError):
                continue

    # Passing `columns` here keeps the schema stable even when no venue was found.
    return pd.DataFrame(venue_rows, columns=columns)

### Retrieve nearby pizza places for all neighbourhoods

In [10]:
# Pizza places around every neighbourhood.
wash_dc_venues_pizza = getNearbyVenues(
    neighbourhood=wash_dc_neighbourhoods['Neighbourhoods'],
    latitudes=wash_dc_neighbourhoods['Latitude'],
    longitudes=wash_dc_neighbourhoods['Longitude'],
    categoryIds='4bf58dd8d48988d1ca941735',  # Foursquare category id used for pizza places
    radius=750,
)
wash_dc_venues_pizza.head()


Unnamed: 0,Neighbourhoods,Latitude,Longitude,Venue_Name,Venue_Latitude,Venue_Longitude,Venue_Category
0,Adams Morgan,38.9215,-77.042199,Pizza Mart,38.921607,-77.0424,Pizza Place
1,Adams Morgan,38.9215,-77.042199,Mellow Mushroom,38.921605,-77.042495,Pizza Place
2,Adams Morgan,38.9215,-77.042199,Jumbo Slice Pizza,38.920594,-77.041958,Pizza Place
3,Adams Morgan,38.9215,-77.042199,&pizza,38.92214,-77.04217,Pizza Place
4,Adams Morgan,38.9215,-77.042199,Pizza Boli’s,38.922018,-77.042394,Pizza Place


### Plot all the pizza places in Washington, D.C.

In [11]:
# Base map centred on Washington, D.C.
map_washington_dc_pizza_places = folium.Map(
    location=[wash_dc_latitude, wash_dc_longitude],
    zoom_start=12,
)

# One marker per pizza place; the popup reads "<venue> - <neighbourhood>".
pizza_points = zip(
    wash_dc_venues_pizza['Venue_Latitude'],
    wash_dc_venues_pizza['Venue_Longitude'],
    wash_dc_venues_pizza['Neighbourhoods'],
    wash_dc_venues_pizza['Venue_Name'],
)
for point_lat, point_lng, point_neighbourhood, point_venue in pizza_points:
    popup_text = '{} - {}'.format(point_venue, point_neighbourhood)
    pizza_marker = folium.CircleMarker(
        [point_lat, point_lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='maroon',
        fill=True,
        fill_color='lightcoral',
        fill_opacity=0.7,
        parse_html=False,
    )
    pizza_marker.add_to(map_washington_dc_pizza_places)

map_washington_dc_pizza_places


### Retrieve nearby high schools for all neighbourhoods

In [12]:
# High schools around every neighbourhood.
wash_dc_venues_highschools = getNearbyVenues(
    neighbourhood=wash_dc_neighbourhoods['Neighbourhoods'],
    latitudes=wash_dc_neighbourhoods['Latitude'],
    longitudes=wash_dc_neighbourhoods['Longitude'],
    categoryIds='4bf58dd8d48988d13d941735',  # Foursquare category id used for high schools
    radius=750,
)
wash_dc_venues_highschools.head()


Unnamed: 0,Neighbourhoods,Latitude,Longitude,Venue_Name,Venue_Latitude,Venue_Longitude,Venue_Category
0,Columbia Heights,38.925672,-77.029429,Carlos Rosario Int'l Public Charter School,38.926513,-77.027777,High School
1,Columbia Heights,38.925672,-77.029429,Cardozo High School,38.921728,-77.028846,High School
2,Columbia Heights,38.925672,-77.029429,Next Step Public Charter School,38.928617,-77.035839,High School
3,Columbia Heights,38.925672,-77.029429,Benjamin Banneker Academic High School,38.923689,-77.023762,High School
4,Columbia Heights,38.925672,-77.029429,Youth Build - LAYC,38.927639,-77.032693,High School


### Plot all the high schools in Washington, D.C.

In [13]:
# Base map centred on Washington, D.C.
map_washington_dc_highschools = folium.Map(
    location=[wash_dc_latitude, wash_dc_longitude],
    zoom_start=12,
)

# One marker per high school; the popup reads "<venue> - <neighbourhood>".
highschool_points = zip(
    wash_dc_venues_highschools['Venue_Latitude'],
    wash_dc_venues_highschools['Venue_Longitude'],
    wash_dc_venues_highschools['Neighbourhoods'],
    wash_dc_venues_highschools['Venue_Name'],
)
for point_lat, point_lng, point_neighbourhood, point_venue in highschool_points:
    popup_text = '{} - {}'.format(point_venue, point_neighbourhood)
    highschool_marker = folium.CircleMarker(
        [point_lat, point_lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='green',
        fill=True,
        fill_color='honeydew',
        fill_opacity=0.7,
        parse_html=False,
    )
    highschool_marker.add_to(map_washington_dc_highschools)

map_washington_dc_highschools


### Retrieve nearby universities for all neighbourhoods

In [14]:
# Universities around every neighbourhood.
wash_dc_venues_university = getNearbyVenues(
    neighbourhood=wash_dc_neighbourhoods['Neighbourhoods'],
    latitudes=wash_dc_neighbourhoods['Latitude'],
    longitudes=wash_dc_neighbourhoods['Longitude'],
    categoryIds='4bf58dd8d48988d1ae941735',  # Foursquare category id used for universities
    radius=750,
)
wash_dc_venues_university.head()


Unnamed: 0,Neighbourhoods,Latitude,Longitude,Venue_Name,Venue_Latitude,Venue_Longitude,Venue_Category
0,Adams Morgan,38.9215,-77.042199,Arizona State University Washington Center,38.915488,-77.046366,College Administrative Building
1,Howard University,38.921525,-77.019536,Howard University,38.921143,-77.020954,University
2,Howard University,38.921525,-77.019536,Howard University School Of Education,38.923174,-77.018431,University
3,Howard University,38.921525,-77.019536,Howard University Department Of Psychology,38.920852,-77.020021,University
4,Kalorama,38.916778,-77.052477,Arizona State University Washington Center,38.915488,-77.046366,College Administrative Building


### Plot all the universities in Washington, D.C.

In [15]:
# Base map centred on Washington, D.C.
map_washington_dc_universities = folium.Map(
    location=[wash_dc_latitude, wash_dc_longitude],
    zoom_start=12,
)

# One marker per university venue; the popup reads "<venue> - <neighbourhood>".
university_points = zip(
    wash_dc_venues_university['Venue_Latitude'],
    wash_dc_venues_university['Venue_Longitude'],
    wash_dc_venues_university['Neighbourhoods'],
    wash_dc_venues_university['Venue_Name'],
)
for point_lat, point_lng, point_neighbourhood, point_venue in university_points:
    popup_text = '{} - {}'.format(point_venue, point_neighbourhood)
    university_marker = folium.CircleMarker(
        [point_lat, point_lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='rebeccapurple',
        fill=True,
        fill_color='plum',
        fill_opacity=0.7,
        parse_html=False,
    )
    university_marker.add_to(map_washington_dc_universities)

map_washington_dc_universities


### Retrieve nearby offices for all neighbourhoods

In [16]:
# Offices around every neighbourhood.
wash_dc_venues_offices = getNearbyVenues(
    neighbourhood=wash_dc_neighbourhoods['Neighbourhoods'],
    latitudes=wash_dc_neighbourhoods['Latitude'],
    longitudes=wash_dc_neighbourhoods['Longitude'],
    categoryIds='4bf58dd8d48988d124941735',  # Foursquare category id used for offices
    radius=750,
)
wash_dc_venues_offices.head()


Unnamed: 0,Neighbourhoods,Latitude,Longitude,Venue_Name,Venue_Latitude,Venue_Longitude,Venue_Category
0,Adams Morgan,38.9215,-77.042199,Maga Design Inc.,38.921628,-77.043156,Print Shop
1,Adams Morgan,38.9215,-77.042199,Citizen Effect,38.922,-77.042501,Tech Startup
2,Adams Morgan,38.9215,-77.042199,Housing Counseling Services,38.922014,-77.039566,Office
3,Adams Morgan,38.9215,-77.042199,Aleberry DC,38.915859,-77.045041,Office
4,Adams Morgan,38.9215,-77.042199,CubeSmart Self Storage,38.918288,-77.039099,Storage Facility


### Plot all the offices in Washington, D.C.

In [17]:
# Base map centred on Washington, D.C.
map_washington_dc_offices = folium.Map(
    location=[wash_dc_latitude, wash_dc_longitude],
    zoom_start=12,
)

# The markers come from a separate office query with radius=300.
# NOTE(review): the office frame used elsewhere in this notebook is queried
# with radius=750 -- confirm that the narrower radius for the map is intended.
wash_dc_venues_offices_map = getNearbyVenues(
    neighbourhood=wash_dc_neighbourhoods['Neighbourhoods'],
    latitudes=wash_dc_neighbourhoods['Latitude'],
    longitudes=wash_dc_neighbourhoods['Longitude'],
    categoryIds='4bf58dd8d48988d124941735',
    radius=300,
)

# One marker per office; the popup reads "<venue> - <neighbourhood>".
office_points = zip(
    wash_dc_venues_offices_map['Venue_Latitude'],
    wash_dc_venues_offices_map['Venue_Longitude'],
    wash_dc_venues_offices_map['Neighbourhoods'],
    wash_dc_venues_offices_map['Venue_Name'],
)
for point_lat, point_lng, point_neighbourhood, point_venue in office_points:
    popup_text = '{} - {}'.format(point_venue, point_neighbourhood)
    office_marker = folium.CircleMarker(
        [point_lat, point_lng],
        radius=5,
        popup=folium.Popup(popup_text, parse_html=True),
        color='black',
        fill=True,
        fill_color='silver',
        fill_opacity=0.7,
        parse_html=False,
    )
    office_marker.add_to(map_washington_dc_offices)

map_washington_dc_offices


### Function to merge dataframes

In [18]:
def mergedf(wash_dc_neighbourhoods_venues, df, newcolname):
    """Add a per-neighbourhood venue count column to the neighbourhood frame.

    Parameters
    ----------
    wash_dc_neighbourhoods_venues : pandas.DataFrame
        Frame with a 'Neighbourhoods' column. It is modified in place.
    df : pandas.DataFrame
        Venue frame with 'Neighbourhoods' and 'Venue_Name' columns.
    newcolname : str
        Name of the column that receives the counts.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed as ``wash_dc_neighbourhoods_venues``.
        ``newcolname`` holds the number of non-null 'Venue_Name' entries per
        neighbourhood. Neighbourhoods absent from ``df`` get NaN, or keep
        their previous value if the column already existed.
    """
    # Number of venues per neighbourhood, indexed by neighbourhood name.
    venue_counts = df.groupby('Neighbourhoods')['Venue_Name'].count()

    # Look up each neighbourhood's count; names without venues map to NaN.
    # The float dtype is fixed so the column type does not depend on whether
    # any NaN happens to be present.
    new_values = wash_dc_neighbourhoods_venues['Neighbourhoods'].map(venue_counts).astype(float)

    # Only overwrite rows that have a count when the column is already there.
    if newcolname in wash_dc_neighbourhoods_venues.columns:
        new_values = new_values.fillna(wash_dc_neighbourhoods_venues[newcolname])

    # Always assigning guarantees that the column exists afterwards, even
    # when no neighbourhood has a venue.
    wash_dc_neighbourhoods_venues[newcolname] = new_values
    return wash_dc_neighbourhoods_venues


### Call function to merge dataframes

In [19]:
# Start from a copy so the geocoded neighbourhood frame stays untouched.
wash_dc_neighbourhoods_venues = wash_dc_neighbourhoods.copy()

# Add one venue-count column per venue type, in this order.
venue_count_columns = [
    (wash_dc_venues_pizza, 'Pizza'),
    (wash_dc_venues_highschools, 'High_Schools'),
    (wash_dc_venues_university, 'Universities'),
    (wash_dc_venues_offices, 'Offices'),
]
for venue_frame, count_column in venue_count_columns:
    wash_dc_neighbourhoods_venues = mergedf(wash_dc_neighbourhoods_venues, venue_frame, count_column)

# Neighbourhoods without venues of a type have no count yet; treat them as zero.
wash_dc_neighbourhoods_venues = wash_dc_neighbourhoods_venues.fillna(value=0)
wash_dc_neighbourhoods_venues.head()

Unnamed: 0,Neighbourhoods,Latitude,Longitude,Pizza,High_Schools,Universities,Offices
0,Adams Morgan,38.9215,-77.042199,11.0,0.0,1.0,50.0
1,Columbia Heights,38.925672,-77.029429,6.0,9.0,0.0,44.0
2,Howard University,38.921525,-77.019536,6.0,6.0,3.0,38.0
3,Kalorama,38.916778,-77.052477,3.0,0.0,1.0,48.0
4,LeDroit Park,38.915907,-77.015721,8.0,7.0,6.0,46.0


### Assigning weights to categories

In [20]:
# Weight applied to each venue count when scoring a neighbourhood.

# Existing pizza places count against a neighbourhood, because Alex wants to
# avoid competition as much as possible.
weight_pizza = -1
# High school customers.
weight_highschools = 1
# University customers.
weight_universities = 1.5
# Office customers.
weight_offices = 2

### Create a new dataframe with the weighted score

In [21]:
# Score = weighted sum of the four venue counts; highest score first.
weighted_wash_dc_neighbourhoods_venues = (
    wash_dc_neighbourhoods_venues[['Neighbourhoods']]
    .assign(
        Score=(
            wash_dc_neighbourhoods_venues['Pizza'] * weight_pizza
            + wash_dc_neighbourhoods_venues['High_Schools'] * weight_highschools
            + wash_dc_neighbourhoods_venues['Universities'] * weight_universities
            + wash_dc_neighbourhoods_venues['Offices'] * weight_offices
        )
    )
    .sort_values(by=['Score'], ascending=False)
)
weighted_wash_dc_neighbourhoods_venues

Unnamed: 0,Neighbourhoods,Score
15,Foggy Bottom,117.5
17,Logan Circle,109.0
8,Shaw,109.0
13,Dupont Circle,107.0
24,Cathedral Heights,104.5
84,NoMa,104.0
21,West End,103.0
75,Truxton Circle,102.0
83,Near Northeast,100.5
4,LeDroit Park,100.0


### Therefore, Foggy Bottom, the neighbourhood with the highest weighted score, is the best place for Alex to open his new pizza place

### Plotting all the venues of Foggy Bottom, Washington D.C.

In [22]:
# Neighbourhood shown in detail: the one with the highest weighted score.
# NOTE: the variable names below still say "dupont" for backward compatibility,
# but every frame and the map itself refer to Foggy Bottom.
best_neighbourhood = 'Foggy Bottom'
best_neighbourhood_row = wash_dc_neighbourhoods[wash_dc_neighbourhoods['Neighbourhoods'] == best_neighbourhood]

# Centre the map on Foggy Bottom; latitude and longitude are both taken from
# the same neighbourhood row.
map_dupontcircle_venues = folium.Map(
    location=[
        best_neighbourhood_row['Latitude'].values[0],
        best_neighbourhood_row['Longitude'].values[0],
    ],
    zoom_start=15,
)

# Venues of each type that were found around Foggy Bottom.
dupont_venues_pizza = wash_dc_venues_pizza[wash_dc_venues_pizza['Neighbourhoods'] == best_neighbourhood]
dupont_venues_highschools = wash_dc_venues_highschools[wash_dc_venues_highschools['Neighbourhoods'] == best_neighbourhood]
dupont_venues_universties = wash_dc_venues_university[wash_dc_venues_university['Neighbourhoods'] == best_neighbourhood]
dupont_venues_offices = wash_dc_venues_offices[wash_dc_venues_offices['Neighbourhoods'] == best_neighbourhood]

# (venue frame, outline colour, fill colour), drawn in this order. The colours
# match the city-wide maps of the same venue types.
venue_layers = [
    (dupont_venues_pizza, 'maroon', 'lightcoral'),           # pizza places
    (dupont_venues_highschools, 'green', 'honeydew'),        # high schools
    (dupont_venues_universties, 'rebeccapurple', 'plum'),    # universities
    (dupont_venues_offices, 'black', 'silver'),              # offices
]
for layer_venues, outline_colour, fill_colour in venue_layers:
    layer_points = zip(
        layer_venues['Venue_Latitude'],
        layer_venues['Venue_Longitude'],
        layer_venues['Neighbourhoods'],
        layer_venues['Venue_Name'],
    )
    for venue_latitude, venue_longitude, neighbourhood, venue_name in layer_points:
        # The popup reads "<venue> - <neighbourhood>".
        label = folium.Popup('{} - {}'.format(venue_name, neighbourhood), parse_html=True)
        folium.CircleMarker(
            [venue_latitude, venue_longitude],
            radius=5,
            popup=label,
            color=outline_colour,
            fill=True,
            fill_color=fill_colour,
            fill_opacity=0.7,
            parse_html=False).add_to(map_dupontcircle_venues)

map_dupontcircle_venues
