In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import requests # library to handle requests
from bs4 import BeautifulSoup
print('Libraries imported.')

Libraries imported.


Grab district-level data from website

In [2]:
url = requests.get('https://en.wikipedia.org/wiki/List_of_districts_of_Lima')
print(url.status_code) # A code of 200 means the page is indeed present

200


In [4]:
src = url.content
soup = BeautifulSoup(src, 'lxml')
my_table = soup.find('table',{'class':'wikitable sortable'})

Extract the needed data elements from the xml

In [5]:
data_extract = []

for row in my_table.findAll('tr'):
    columns = row.find_all('td')
    columns = [element.text.strip() for element in columns]
    data_extract.append(columns)

Frame the data

In [6]:
df = pd.DataFrame(data_extract) # Convert the data from the list into a dataframe
df = df.drop([0]) # Drop the first row
df = df.reset_index(drop=True) # Reset the index so that it starts again from 0..1..2
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,Ancón,150102,299.22,29419,98.3,18741029 29 October 1874,2,
1,Ate,150103,77.72,419663,5399.7,18570102 2 January 1857,3,
2,Barranco,150104,3.33,45922,13790.4,18741026 26 October 1874,4,
3,Breña,150105,3.22,94808,29443.5,19490715 15 July 1949,5,
4,Carabayllo,150106,346.88,188764,544.2,18210804 4 August 1821,6,


In [7]:
df = df.drop([2,3,4,5,6,7], axis = 1) # Drop columns indexed from 2 to 7

In [8]:
df.head(1)

Unnamed: 0,0,1
0,Ancón,150102


In [9]:
df = df.rename(columns = {
    0: "District",
    1: "Code"
    })
df['District'] = df['District'] + ", " + "Lima"
df.head()

Unnamed: 0,District,Code
0,"Ancón, Lima",150102
1,"Ate, Lima",150103
2,"Barranco, Lima",150104
3,"Breña, Lima",150105
4,"Carabayllo, Lima",150106


In [10]:
df['District'] = df['District'].replace(['Santa María del Mar District, Lima'],'Santa María del Mar, Lima')
df.head()

Unnamed: 0,District,Code
0,"Ancón, Lima",150102
1,"Ate, Lima",150103
2,"Barranco, Lima",150104
3,"Breña, Lima",150105
4,"Carabayllo, Lima",150106


In [11]:
import geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import geocoder # to get coordinates

In [12]:
service = geopy.Nominatim(user_agent = "myGeocoder")

In [13]:
lat = []
for District in df.District:
    lat.append(service.geocode(District).latitude)

In [14]:
long = []
for District in df.District:
    long.append(service.geocode(District).longitude) 

In [15]:
geo = {'Latitude':lat,'Longitude':long}
df_geo = pd.DataFrame(geo)
df_geo.head(1)

Unnamed: 0,Latitude,Longitude
0,-11.696554,-77.111655


In [16]:
df = pd.concat([df, df_geo], axis=1)
df.head(1)

Unnamed: 0,District,Code,Latitude,Longitude
0,"Ancón, Lima",150102,-11.696554,-77.111655


In [17]:
import folium
from IPython.display import display

In [18]:
address = 'Lima, Peru'
geolocator = Nominatim(user_agent = "myGeocoder")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Lima, Peru are', latitude, ',' ,longitude)

The geograpical coordinate of Lima, Peru are -12.0621065 , -77.0365256


In [19]:
# create map of Lima using latitude and longitude values
map_lima = folium.Map(location=[latitude, longitude], zoom_start=13)

In [20]:
# add markers to map
for lat, lng, district in zip(df['Latitude'], df['Longitude'], df['District']):
  label = '{},{},{}'.format(district, lat, lng)
  label = folium.Popup(label, parse_html=True)
  folium.CircleMarker(
      [lat, lng],
      radius=5,
      popup=label,
      color='blue',
      fill=True,
      fill_color='#3186cc',
      fill_opacity=0.7).add_to(map_lima) 
    
display(map_lima)

In [22]:
# define Foursquare Credentials and Version
CLIENT_ID = 'your Foursquare ID' # your Foursquare ID
CLIENT_SECRET = 'your Foursquare Secret' # your Foursquare Secret
VERSION = '20180724' # Foursquare API version

Grab the top 100 venues that are within a radius of 2000 meters.

In [23]:
radius = 2000
LIMIT = 100

venues = []

for lat, long, district in zip(
                                                    df['Latitude'], 
                                                    df['Longitude'], 
                                                    df['District']
                                                ):
    url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
        
    results = requests.get(url).json()['response']['groups'][0]['items']
    
    for venue in results:
        venues.append((
            district, 
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [24]:
# convert the venues list into a new DataFrame
venues_df = pd.DataFrame(venues)
venues_df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,"Ate, Lima",-12.038728,-76.896873,"Hotel Thunderbird ""El Pueblo""",-12.034299,-76.881955,Hotel
1,"Ate, Lima",-12.038728,-76.896873,La Granja Azul,-12.033249,-76.885768,Peruvian Restaurant
2,"Ate, Lima",-12.038728,-76.896873,Granja Azul Golf Club,-12.033012,-76.886601,Farm
3,"Ate, Lima",-12.038728,-76.896873,Toboganes Grill,-12.03564,-76.883482,Steakhouse
4,"Ate, Lima",-12.038728,-76.896873,Bar Marino,-12.036286,-76.881872,Seafood Restaurant


In [25]:
# define the column names
venues_df.columns = ['District', 
                     'DistrictLatitude', 
                     'DistructLongitude', 
                     'Venue', 
                     'VenueLongitude', 
                     'VenueLatitude', 
                     'VenueCategory']

print(venues_df.shape)
venues_df.head()

(2106, 7)


Unnamed: 0,District,DistrictLatitude,DistructLongitude,Venue,VenueLongitude,VenueLatitude,VenueCategory
0,"Ate, Lima",-12.038728,-76.896873,"Hotel Thunderbird ""El Pueblo""",-12.034299,-76.881955,Hotel
1,"Ate, Lima",-12.038728,-76.896873,La Granja Azul,-12.033249,-76.885768,Peruvian Restaurant
2,"Ate, Lima",-12.038728,-76.896873,Granja Azul Golf Club,-12.033012,-76.886601,Farm
3,"Ate, Lima",-12.038728,-76.896873,Toboganes Grill,-12.03564,-76.883482,Steakhouse
4,"Ate, Lima",-12.038728,-76.896873,Bar Marino,-12.036286,-76.881872,Seafood Restaurant


View the number of venues per district

In [26]:
venues_df.groupby(["District"]).count()

Unnamed: 0_level_0,DistrictLatitude,DistructLongitude,Venue,VenueLongitude,VenueLatitude,VenueCategory
District,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Ate, Lima",10,10,10,10,10,10
"Barranco, Lima",100,100,100,100,100,100
"Breña, Lima",100,100,100,100,100,100
"Carabayllo, Lima",2,2,2,2,2,2
"Chaclacayo, Lima",21,21,21,21,21,21
"Chorrillos, Lima",61,61,61,61,61,61
"Cieneguilla, Lima",10,10,10,10,10,10
"Comas, Lima",11,11,11,11,11,11
"El Agustino, Lima",25,25,25,25,25,25
"Independencia, Lima",50,50,50,50,50,50


In [27]:
print('There are {} uniques categories.'.format(len(venues_df['VenueCategory'].unique())))

There are 207 uniques categories.


In [40]:
# print out the unique venue categories
venues_df['VenueCategory'].unique()[:10

array(['Hotel', 'Peruvian Restaurant', 'Farm', 'Steakhouse',
       'Seafood Restaurant', 'Restaurant', 'Bar', 'Breakfast Spot',
       'BBQ Joint', 'Nightclub'], dtype=object)

In [53]:
# check if the results contain "Athletics & Sports"
"Athletics & Sports" in venues_df['VenueCategory'].unique()

True

### Analyze the districts

In [31]:
# one hot encoding
kl_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
kl_onehot['District'] = venues_df['District'] 

# move neighborhood column to the first column
fixed_columns = [kl_onehot.columns[-1]] + list(kl_onehot.columns[:-1])
kl_onehot = kl_onehot[fixed_columns]

print(kl_onehot.shape)
kl_onehot.head()

(2106, 208)


Unnamed: 0,District,Accessories Store,Airport,American Restaurant,Arcade,Arepa Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Arts & Entertainment,...,Trade School,Trail,Turkish Restaurant,Vegetarian / Vegan Restaurant,Veterinarian,Vietnamese Restaurant,Water Park,Wings Joint,Women's Store,Yoga Studio
0,"Ate, Lima",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"Ate, Lima",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Ate, Lima",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Ate, Lima",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Ate, Lima",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Find the average frequencey of each venue, grouped by District

In [32]:
kl_grouped = kl_onehot.groupby(["District"]).mean().reset_index()

print(kl_grouped.shape)
kl_grouped.head(1)

(40, 208)


Unnamed: 0,District,Accessories Store,Airport,American Restaurant,Arcade,Arepa Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Arts & Entertainment,...,Trade School,Trail,Turkish Restaurant,Vegetarian / Vegan Restaurant,Veterinarian,Vietnamese Restaurant,Water Park,Wings Joint,Women's Store,Yoga Studio
0,"Ate, Lima",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
len(kl_grouped[kl_grouped["Athletics & Sports"] > 0])

9

In [55]:
kl_active_lifestyle = kl_grouped[["District","Athletics & Sports"]]

In [59]:
kl_active_lifestyle.sort_values(by='Athletics & Sports', ascending=False)[0:10]

Unnamed: 0,District,Athletics & Sports
26,"San Borja, Lima",0.03
16,"Magdalena del Mar, Lima",0.03
32,"San Miguel, Lima",0.010309
20,"Pueblo Libre, Lima",0.01
13,"Lima, Lima",0.01
2,"Breña, Lima",0.01
37,"Surquillo, Lima",0.01
36,"Santiago de Surco, Lima",0.01
27,"San Isidro, Lima",0.01
24,"Rímac, Lima",0.0


## Cluster Districts

In [63]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# set number of clusters
kclusters = 3

kl_clustering = kl_active_lifestyle.drop(["District"], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(kl_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

array([0, 0, 2, 0, 0, 0, 0, 0, 0, 0])

In [65]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
kl_merged = kl_active_lifestyle.copy()

# add clustering labels
kl_merged["Cluster Labels"] = kmeans.labels_
kl_merged.head(1)

Unnamed: 0,District,Athletics & Sports,Cluster Labels
0,"Ate, Lima",0.0,0


In [66]:
kl_merged = kl_merged.join(df.set_index("District"), on="District")

print(kl_merged.shape)
kl_merged.head()

(40, 6)


Unnamed: 0,District,Athletics & Sports,Cluster Labels,Code,Latitude,Longitude
0,"Ate, Lima",0.0,0,150103,-12.038728,-76.896873
1,"Barranco, Lima",0.0,0,150104,-12.143959,-77.020268
2,"Breña, Lima",0.01,2,150105,-12.0597,-77.050119
3,"Carabayllo, Lima",0.0,0,150106,-11.794993,-76.989292
4,"Chaclacayo, Lima",0.0,0,150107,-11.97574,-76.769871


In [67]:
# sort the results by Cluster Labels
print(kl_merged.shape)
kl_merged.sort_values(["Cluster Labels"], inplace=True)
kl_merged

(40, 6)


Unnamed: 0,District,Athletics & Sports,Cluster Labels,Code,Latitude,Longitude
0,"Ate, Lima",0.0,0,150103,-12.038728,-76.896873
35,"Santa Rosa, Lima",0.0,0,150139,-12.035851,-77.086616
34,"Santa María del Mar, Lima",0.0,0,150138,-12.401403,-76.775465
33,"Santa Anita, Lima",0.0,0,150137,-12.223383,-76.847707
31,"San Martín de Porres, Lima",0.0,0,150135,-11.986759,-77.097655
30,"San Luis, Lima",0.0,0,150134,-12.072356,-76.995891
29,"San Juan de Miraflores, Lima",0.0,0,150133,-12.155852,-76.972129
28,"San Juan de Lurigancho, Lima",0.0,0,150132,-11.948751,-76.977911
25,"San Bartolo, Lima",0.0,0,150129,-12.387071,-76.777945
24,"Rímac, Lima",0.0,0,150128,-12.020304,-77.035463


In [69]:
# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(kl_merged['Latitude'], kl_merged['Longitude'], kl_merged['District'], kl_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters