In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

In [2]:
df = pd.read_csv('median-rent-by-town-and-flat-type.csv')

# get only 2020 data
df = df[df['quarter'] == '2020-Q1']

# get only 4-RM flats
df = df[df['flat_type'] == '4-RM'].reset_index().drop('index', axis = 1)

# remove NA
df['median_rent'] = df['median_rent'].map(lambda x: None if x == '-' else int(x))
df.dropna(axis=0, inplace=True)
df.reset_index(inplace=True)

df

Unnamed: 0,index,quarter,town,flat_type,median_rent
0,0,2020-Q1,ANG MO KIO,4-RM,2100.0
1,1,2020-Q1,BEDOK,4-RM,2000.0
2,2,2020-Q1,BISHAN,4-RM,2300.0
3,3,2020-Q1,BUKIT BATOK,4-RM,1900.0
4,4,2020-Q1,BUKIT MERAH,4-RM,2500.0
5,5,2020-Q1,BUKIT PANJANG,4-RM,1800.0
6,7,2020-Q1,CENTRAL,4-RM,2600.0
7,8,2020-Q1,CHOA CHU KANG,4-RM,1800.0
8,9,2020-Q1,CLEMENTI,4-RM,2350.0
9,10,2020-Q1,GEYLANG,4-RM,2300.0


**Get all neighborhoods under General location column and check its shape**

In [3]:
neighborhoods = df['town']

print('There are {} neighborhoods'.format(len(neighborhoods)))
print(neighborhoods[:])

There are 25 neighborhoods
0        ANG MO KIO
1             BEDOK
2            BISHAN
3       BUKIT BATOK
4       BUKIT MERAH
5     BUKIT PANJANG
6           CENTRAL
7     CHOA CHU KANG
8          CLEMENTI
9           GEYLANG
10          HOUGANG
11      JURONG EAST
12      JURONG WEST
13          KALLANG
14    MARINE PARADE
15        PASIR RIS
16          PUNGGOL
17       QUEENSTOWN
18        SEMBAWANG
19         SENGKANG
20        SERANGOON
21         TAMPINES
22        TOA PAYOH
23        WOODLANDS
24           YISHUN
Name: town, dtype: object


### Get latlng of all neighborhoods

In [5]:
columns = ['Neighborhood','Latitude','Longitude']
df_sg = pd.DataFrame(columns=columns)

for neighbor in neighborhoods:
    try:
        address = ('{}, Singapore, SG').format(neighbor)
        geolocator = Nominatim(user_agent="sg_explorer")
        location = geolocator.geocode(address,timeout=10000)
        latitude = location.latitude
        longitude = location.longitude
        df_sg = df_sg.append(pd.DataFrame(data={'Neighborhood': [neighbor], 'Latitude':[latitude], 'Longitude': [longitude]}), ignore_index=True)
    except:
        pass

In [6]:
df_sg.insert(column='median_rent', value=df['median_rent'], loc = 3)
df_sg

Unnamed: 0,Neighborhood,Latitude,Longitude,median_rent
0,ANG MO KIO,1.370073,103.849516,2100.0
1,BEDOK,1.323976,103.930216,2000.0
2,BISHAN,1.350986,103.848255,2300.0
3,BUKIT BATOK,1.349057,103.749591,1900.0
4,BUKIT MERAH,1.270439,103.828318,2500.0
5,BUKIT PANJANG,1.378629,103.762136,1800.0
6,CENTRAL,1.288645,103.846575,2600.0
7,CHOA CHU KANG,1.384749,103.744534,1800.0
8,CLEMENTI,1.3151,103.765231,2350.0
9,GEYLANG,1.318186,103.887056,2300.0


### Map of all the neighborhoods

In [7]:
address = 'Singapore, SG'

geolocator = Nominatim(user_agent="sg_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Singapore are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Singapore are 1.357107, 103.8194992.


In [8]:
# create map of Toronto Boroughs using latitude and longitude values
map_sg = folium.Map(location=[latitude, longitude], zoom_start=11)  

# add markers to map
for lat, lng, label in zip(df_sg['Latitude'], df_sg['Longitude'], df_sg['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_sg)  
    
map_sg

### Analyse each neighborhood

In [9]:
# foursqaure credentials 
CLIENT_ID = '0IPZJR41N0LMXCBDDY32TWFI3QQJHZ4AM4A3MUHA10GXT4CA' 
CLIENT_SECRET = 'KFTOVKEMGJA13NVACDMWGMFVCYQP4ZCJTLH2XR15YYVRH1LZ' 
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 0IPZJR41N0LMXCBDDY32TWFI3QQJHZ4AM4A3MUHA10GXT4CA
CLIENT_SECRET:KFTOVKEMGJA13NVACDMWGMFVCYQP4ZCJTLH2XR15YYVRH1LZ


#### Get venues around neighborhood

In [10]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 1000 # define radius

# function that get nearby venues for each geo coordinate
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
#         print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [11]:
sg_venues = getNearbyVenues(names=df_sg['Neighborhood'],
                                   latitudes=df_sg['Latitude'],
                                   longitudes=df_sg['Longitude']
                                  )

In [12]:
print(sg_venues.shape)
sg_venues.head()

(1088, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,ANG MO KIO,1.370073,103.849516,FairPrice Xtra,1.369279,103.848886,Supermarket
1,ANG MO KIO,1.370073,103.849516,Old Chang Kee,1.369094,103.848389,Snack Place
2,ANG MO KIO,1.370073,103.849516,Face Ban Mian 非板面 (Ang Mo Kio),1.372031,103.847504,Noodle House
3,ANG MO KIO,1.370073,103.849516,MOS Burger,1.36917,103.847831,Burger Joint
4,ANG MO KIO,1.370073,103.849516,NTUC FairPrice,1.371507,103.847082,Supermarket


In [13]:
sg_onehot = pd.get_dummies(sg_venues['Venue Category'])
sg_onehot = sg_venues[['Neighborhood']].join(sg_onehot, rsuffix='_venuecat')
sg_grouped = sg_onehot.groupby('Neighborhood').mean()
sg_grouped.reset_index(inplace=True)
print(sg_grouped.shape)
sg_grouped.head()

(25, 164)


Unnamed: 0,Neighborhood,ATM,Accessories Store,American Restaurant,Arcade,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,Baby Store,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Waterfront,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,ANG MO KIO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,BEDOK,0.0,0.0,0.016667,0.0,0.0,0.05,0.0,0.0,0.0,...,0.0,0.0,0.016667,0.0,0.0,0.0,0.0,0.016667,0.0,0.0
2,BISHAN,0.0,0.0,0.0,0.0,0.0,0.023256,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,BUKIT BATOK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,BUKIT MERAH,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
sg_grouped = sg_grouped.fillna(0)

In [15]:
# this function takes in a neighborhood series 
# and returns the top venues related to that borough
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [16]:
indicators = ['st', 'nd', 'rd']
num_top_venues = 10

# create columns according to number of top venues
columns = ['Neighborhood', 'median_rent']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = sg_grouped['Neighborhood']
neighborhoods_venues_sorted['median_rent'] = df_sg['median_rent']

for ind in np.arange(sg_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 2:] = return_most_common_venues(sg_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,median_rent,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,ANG MO KIO,2100.0,Coffee Shop,Dessert Shop,Food Court,Sandwich Place,Snack Place,Supermarket,Japanese Restaurant,Fast Food Restaurant,Bubble Tea Shop,Modern European Restaurant
1,BEDOK,2000.0,Chinese Restaurant,Coffee Shop,Asian Restaurant,Sandwich Place,Food Court,Japanese Restaurant,Supermarket,Sushi Restaurant,Fast Food Restaurant,Karaoke Bar
2,BISHAN,2300.0,Food Court,Coffee Shop,Bubble Tea Shop,Ice Cream Shop,Japanese Restaurant,Supermarket,Café,Chinese Restaurant,Cosmetics Shop,Dumpling Restaurant
3,BUKIT BATOK,1900.0,Coffee Shop,Food Court,Chinese Restaurant,Bus Line,Frozen Yogurt Shop,Mobile Phone Shop,Grocery Store,Bowling Alley,Café,Shopping Mall
4,BUKIT MERAH,2500.0,Juice Bar,Coffee Shop,Chinese Restaurant,Hotel,Cafeteria,Flower Shop,Furniture / Home Store,Frozen Yogurt Shop,Fried Chicken Joint,French Restaurant


### Attempt to cluster

In [17]:
# set number of clusters
kclusters = 6

sg_grouped_clustering = sg_grouped.drop(['Neighborhood'], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(sg_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

array([1, 0, 1, 1, 2, 0, 4, 1, 1, 3, 1, 1, 0, 5, 4, 1, 0, 3, 0, 1, 0, 0,
       3, 0, 1])

In [18]:
df_sg[~df_sg['Neighborhood'].isin(sg_grouped['Neighborhood'])]

Unnamed: 0,Neighborhood,Latitude,Longitude,median_rent


In [19]:
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', pd.Series(kmeans.labels_))
neighborhoods_venues_sorted.drop('median_rent', axis=1, inplace=True)

sg_merged = df_sg

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
sg_merged = sg_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on=['Neighborhood'])

sg_merged.head() # check the last columns!

Unnamed: 0,Neighborhood,Latitude,Longitude,median_rent,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,ANG MO KIO,1.370073,103.849516,2100.0,1,Coffee Shop,Dessert Shop,Food Court,Sandwich Place,Snack Place,Supermarket,Japanese Restaurant,Fast Food Restaurant,Bubble Tea Shop,Modern European Restaurant
1,BEDOK,1.323976,103.930216,2000.0,0,Chinese Restaurant,Coffee Shop,Asian Restaurant,Sandwich Place,Food Court,Japanese Restaurant,Supermarket,Sushi Restaurant,Fast Food Restaurant,Karaoke Bar
2,BISHAN,1.350986,103.848255,2300.0,1,Food Court,Coffee Shop,Bubble Tea Shop,Ice Cream Shop,Japanese Restaurant,Supermarket,Café,Chinese Restaurant,Cosmetics Shop,Dumpling Restaurant
3,BUKIT BATOK,1.349057,103.749591,1900.0,1,Coffee Shop,Food Court,Chinese Restaurant,Bus Line,Frozen Yogurt Shop,Mobile Phone Shop,Grocery Store,Bowling Alley,Café,Shopping Mall
4,BUKIT MERAH,1.270439,103.828318,2500.0,2,Juice Bar,Coffee Shop,Chinese Restaurant,Hotel,Cafeteria,Flower Shop,Furniture / Home Store,Frozen Yogurt Shop,Fried Chicken Joint,French Restaurant


In [20]:
sg_merged['Cluster Labels'].value_counts()

1    10
0     8
3     3
4     2
5     1
2     1
Name: Cluster Labels, dtype: int64

### Visualize cluster

In [21]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, rental, cluster in zip(sg_merged['Latitude'], sg_merged['Longitude'], sg_merged['Neighborhood'], df_sg['median_rent'], sg_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster) + '\n' + str(rental), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [22]:
sg_clustered = sg_merged[['median_rent','Cluster Labels']].groupby('Cluster Labels').mean()
sg_clustered

Unnamed: 0_level_0,median_rent
Cluster Labels,Unnamed: 1_level_1
0,1943.75
1,2010.0
2,2500.0
3,2383.333333
4,2325.0
5,2400.0


In [23]:
sg_grouped_clustered = sg_merged[['Cluster Labels']].join(sg_grouped)
sg_grouped_clustered.drop('Neighborhood', axis=1, inplace=True)
sg_grouped_clustered = sg_grouped_clustered.groupby('Cluster Labels').mean()
sg_grouped_clustered = sg_clustered.join(sg_grouped_clustered)
sg_grouped_clustered

Unnamed: 0_level_0,median_rent,ATM,Accessories Store,American Restaurant,Arcade,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,Baby Store,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Waterfront,Wine Shop,Wings Joint,Women's Store,Yoga Studio
Cluster Labels,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1943.75,0.005952,0.0,0.015365,0.002083,0.0,0.048855,0.0,0.013886,0.005102,...,0.002083,0.0,0.002083,0.0,0.005102,0.0,0.0,0.003895,0.0,0.0
1,2010.0,0.0,0.001408,0.001563,0.002041,0.001563,0.02666,0.002326,0.001563,0.0,...,0.001408,0.0,0.003449,0.003603,0.0,0.0,0.0,0.0,0.003448,0.0
2,2500.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2383.333333,0.0,0.0,0.0,0.0,0.0,0.041479,0.0,0.0,0.0,...,0.0,0.032323,0.052525,0.0,0.0,0.0,0.0,0.010101,0.0,0.0
4,2325.0,0.0,0.0,0.005,0.0,0.0,0.0,0.0,0.01,0.0,...,0.012821,0.0,0.015,0.0,0.005,0.005,0.005,0.005,0.0,0.027821
5,2400.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
indicators = ['st', 'nd', 'rd']
num_top_venues = 10

# create columns according to number of top venues
columns = ['Cluster Labels', 'median_rent']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
cluster_sorted = pd.DataFrame(columns=columns)
cluster_sorted['Cluster Labels'] = sg_grouped_clustered.index.values
cluster_sorted['median_rent'] = sg_grouped_clustered['median_rent']

for ind in np.arange(sg_grouped_clustered.shape[0]):
    cluster_sorted.iloc[ind, 2:] = return_most_common_venues(sg_grouped_clustered.iloc[ind, 1:], num_top_venues)

cluster_sorted.sort_values(by = 'median_rent')

Unnamed: 0,Cluster Labels,median_rent,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,0,1943.75,Fast Food Restaurant,Coffee Shop,Japanese Restaurant,Chinese Restaurant,Asian Restaurant,Café,Shopping Mall,Supermarket,Food Court,Clothing Store
1,1,2010.0,Coffee Shop,Food Court,Fast Food Restaurant,Chinese Restaurant,Supermarket,Sandwich Place,Café,Shopping Mall,Asian Restaurant,Italian Restaurant
4,4,2325.0,Hotel,Japanese Restaurant,Massage Studio,Multiplex,Indian Restaurant,Salad Place,Yoga Studio,Seafood Restaurant,Chinese Restaurant,Nightclub
3,3,2383.333333,Chinese Restaurant,Noodle House,Food Court,Vegetarian / Vegan Restaurant,Thai Restaurant,Asian Restaurant,Seafood Restaurant,Pool,Train Station,Café
5,5,2400.0,Dessert Shop,Restaurant,Indian Restaurant,Coffee Shop,Supermarket,Hostel,Flower Shop,Furniture / Home Store,Frozen Yogurt Shop,Fried Chicken Joint
2,2,2500.0,Juice Bar,Coffee Shop,Cafeteria,Chinese Restaurant,Hotel,Dumpling Restaurant,Electronics Store,Fast Food Restaurant,Donut Shop,Fish & Chips Shop
