In [62]:
import pip
package_names=['geocoder'] #packages to install
pip.main(['install'] + package_names + ['--upgrade'])


import numpy as np # library to handle data in a vectorized manner

import geocoder
import pandas as pd # library for data analsysis
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

#Next, scrape the target Wikipedia webpage 

url='https://en.wikipedia.org/wiki/Planning_Areas_of_Singapore'
response = requests.get(url)
data = response.text
soup = BeautifulSoup(data,'html.parser')
neighborhoodList = []

table = soup.find('table',attrs={"class":"wikitable sortable"})

df = pd.read_html(str(table))[0]
df.head()

#Create a dataframe containing all districts in Singapore

districts_df = df['Name (English)']
districts_df = pd.DataFrame({'District':districts_df})

#identify how many districts are present in Singapore 

districts_df.drop([df.index[10],df.index[26], df.index[43], df.index[24],df.index[51]],inplace=True)

#TThis section is for defining a function that would return coordinates of each district in Singapore 
# I have commented outed this section and the next as running them continually caused Jupyter Notebooks to freeze 

#def get_latlng(neighborhood):
#    # initialize your variable to None
#    lat_lng_coords = None
#    # loop until you get the coordinates
#    while(lat_lng_coords is None):
#        g = geocoder.arcgis('{}, Singapore, Singapore'.format(districts))
#        lat_lng_coords = g.latlng
#    return lat_lng_coords


# call the function to get the coordinates, store in a new list using list comprehension
# coords = [ get_latlng(neighborhood) for neighborhood in districts_df["Districts"].tolist() ]


#Instead, I continue by taking data from a csv file filled with data obtained from the earlier steps. 

df_geo=pd.read_csv('singapore_districts_geo.csv')


# Merge the two dfs together 

df_merged = pd.merge(districts_df,df_geo, on = 'District')
df_merged.head()

print('There are a total of',df_merged.shape[0], 'districts in Singapore')

df_merged.to_csv("merged.csv",index=False)

# Create a new column as Label and get the date from 'Borough' as integer
df_merged['Label']= df_merged.index
display(df_merged.head())

# get the coordinates of Singapore
address = 'Singapore, Singapore'

geolocator = Nominatim(user_agent="singapore_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Singapore, Singapore {}, {}.'.format(latitude, longitude))

#create a cluster map of singapore by setting N(clusters) as N(labels)
kclusters=len(df_merged.Label.unique())

#create cluster map
map_singapore = folium.Map(location=[latitude, longitude], zoom_start=11)

x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, cluster in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['Label']):
    label = folium.Popup(str(df_merged['District']) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_singapore)

map_singapore

#use Foursquare API to explore districts 
# define Foursquare Credentials and Version

CLIENT_ID = 'FSA2B5AZLLCQ41RRZ2JFA5CGTWNXRPN5SQTF55DHNNPEX1HE' # your Foursquare ID
CLIENT_SECRET = '1TVFBJ5WXGCCIJQERSWZ4WQTIVENOM3NNZIAASH2K1JCPTM5' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentials:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

#get the top 100 venues in Singapore within a radius of 2000 metres.

radius = 2000
LIMIT = 100

venues = []

for lat, long, neighborhood in zip(df_merged['Latitude'], df_merged['Longitude'], df_merged['District']):
    
    # create the API request URL
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    # make the GET request
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    # return only relevant information for each nearby venue
    for venue in results:
        venues.append((
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))
        
# convert the venues list into a new DataFrame
venues_df = pd.DataFrame(venues)

# define the column names
venues_df.columns = ['District', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

print(venues_df.shape)
venues_df.head()    

#check how many venues were returned for each district
venues_df.groupby(["District"]).count()

#identify how many unique categories are amongst all venues 

print('There are {} uniques categories.'.format(len(venues_df['VenueCategory'].unique())))

# print out the list of categories
venues_df['VenueCategory'].unique()[:50]

# check if the results contain "Gym / Fitness Center"
"Gym / Fitness Center" in venues_df['VenueCategory'].unique()

# use one hot encoding to determine if a certain venue type exists in a particular district 
sg_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
sg_onehot['Districts'] = venues_df['District'] 

# move neighborhood column to the first column
fixed_columns = [sg_onehot.columns[-1]] + list(sg_onehot.columns[:-1])
sg_onehot = sg_onehot[fixed_columns]

print(sg_onehot.shape)
sg_onehot.head()

# group rows by district and take mean of frequency of occurrence for each venue type. 

sg_grouped = sg_onehot.groupby(["Districts"]).mean().reset_index()

print(sg_grouped.shape)
sg_grouped

#create a new dataframe for gym / fitness center data only 

sg_gyms = sg_grouped[['Districts','Gym / Fitness Center']]
sg_gyms.head()

sg_gyms = sg_gyms.sort_values(by = "Gym / Fitness Center", ascending=False)
sg_gyms.head(50)


Requirement already up-to-date: geocoder in c:\users\taytw\anaconda3\lib\site-packages (1.38.1)


Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


There are a total of 49 districts in Singapore


Unnamed: 0,District,Latitude,Longitude,Label
0,Ang Mo Kio,1.3691,103.8454,0
1,Bedok,1.3236,103.9273,1
2,Bishan,1.3526,103.8352,2
3,Boon Lay,1.3386,103.7058,3
4,Bukit Batok,1.359,103.7637,4


The geograpical coordinate of Singapore, Singapore 1.357107, 103.8194992.
Your credentials:
CLIENT_ID: FSA2B5AZLLCQ41RRZ2JFA5CGTWNXRPN5SQTF55DHNNPEX1HE
CLIENT_SECRET:1TVFBJ5WXGCCIJQERSWZ4WQTIVENOM3NNZIAASH2K1JCPTM5
(4303, 7)
There are 308 uniques categories.
(4303, 309)
(49, 309)


Unnamed: 0,Districts,Gym / Fitness Center
39,Straits View,0.03
25,Outram,0.03
30,Queenstown,0.03
40,Sungei Kadut,0.025
46,Western Water Catchment,0.021739
3,Boon Lay,0.021277
12,Downtown Core,0.02
38,Singapore River,0.02
28,Pioneer,0.02
9,Changi,0.016949
