# IBM Data Science Capstone Project 

## New Shopping Mall in Kuala Lumpur, Malaysia

##### Build a dataframe of neighborhoods in Kuala Lumpur, Malaysia by web scraping 
##### Get the geographical coordinates of the neighborhoods
##### Obtain the venue data for the neighborhoods from Foursquare API
##### Explore and cluster the neighborhoods
##### Select the best cluster to open a new shopping mall

In [3]:
import numpy as np #library

import pandas as pd #data analsysis
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import json 

from geopy.geocoders import Nominatim # convert address lat and long values
!pip install geocoder

import geocoder # to get Geo coordinates

import requests 
from bs4 import BeautifulSoup 

from pandas.io.json import json_normalize 

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

!pip install folium

import folium # map library

print("Libraries imported.")


Libraries imported.


In [4]:
# Download the Wikipedia category page that lists the suburbs of Kuala Lumpur.
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() fails here on an HTTP error instead of letting an error
# page be parsed later as if it were the category listing.
response = requests.get("https://en.wikipedia.org/wiki/Category:Suburbs_in_Kuala_Lumpur", timeout=30)
response.raise_for_status()
data = response.text

In [5]:
# Parse the downloaded HTML text into a BeautifulSoup tree (using the 'html.parser' backend)
# so the page can be searched by tag and CSS class.
soup = BeautifulSoup(data, 'html.parser')

In [7]:
# Start with an empty list that will hold the scraped neighborhood names
neighborhoodList = []

In [8]:
# Collect the text of every <li> inside the first "mw-category" container.
# The list is rebuilt from scratch (not appended to) so re-running this cell
# cannot duplicate entries.
category_sections = soup.find_all("div", class_="mw-category")
if not category_sections:
    # Fail with an explicit message instead of a bare IndexError when the
    # expected container is absent from the parsed page.
    raise ValueError('No <div class="mw-category"> element found in the downloaded page')

neighborhoodList = [row.text for row in category_sections[0].find_all("li")]

In [9]:
# Wrap the scraped names in a single-column table
kl_df = pd.DataFrame(neighborhoodList, columns=["Neighborhood"])

# Preview the first rows
kl_df.head()

Unnamed: 0,Neighborhood
0,Alam Damai
1,"Ampang, Kuala Lumpur"
2,Bandar Menjalara
3,Bandar Sri Permaisuri
4,Bandar Tasik Selatan


In [10]:
# (rows, columns) of the neighborhood table
kl_df.shape

(71, 1)

In [11]:
# function to get coordinates
def get_latlong(neighborhood, max_attempts=10, retry_delay=1.0):
    """Geocode a Kuala Lumpur neighborhood through geocoder's ArcGIS provider.

    Args:
        neighborhood: Neighborhood name; ", Kuala Lumpur, Malaysia" is appended
            to it to form the geocoding query.
        max_attempts: Maximum number of lookups before giving up.
        retry_delay: Seconds to wait between unsuccessful lookups.

    Returns:
        The ``latlng`` value reported by the geocoder for the query.

    Raises:
        RuntimeError: If no coordinates were obtained within ``max_attempts``
            lookups. The previous implementation retried forever with no
            pause, which could hang the notebook on an unresolvable name.
    """
    import time

    query = '{}, Kuala Lumpur, Malaysia'.format(neighborhood)
    for attempt in range(max_attempts):
        lat_lng_coords = geocoder.arcgis(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords
        # Pause before retrying, but not after the final failed attempt.
        if attempt < max_attempts - 1:
            time.sleep(retry_delay)

    raise RuntimeError(
        'Could not geocode {!r} after {} attempts'.format(query, max_attempts))

In [12]:
# Geocode every neighborhood, keeping the results in table order
coords = list(map(get_latlong, kl_df["Neighborhood"]))

In [13]:
# Display the geocoding results, one entry per neighborhood
coords

[[3.0576900000000364, 101.74388000000005],
 [3.1484921138724, 101.69672653590337],
 [3.1903500000000236, 101.62545000000006],
 [3.1039100000000417, 101.71226000000007],
 [3.072750000000042, 101.71461000000005],
 [3.08280000000002, 101.72281000000004],
 [3.1292000000000257, 101.67844000000008],
 [3.1347800000000348, 101.67262000000005],
 [3.111020000000053, 101.66283000000004],
 [3.09898000000004, 101.73499000000004],
 [3.1357600000000616, 101.70837000000006],
 [3.12916000000007, 101.68406000000004],
 [3.147770000000037, 101.70855000000006],
 [3.057800000000043, 101.68965000000009],
 [3.1434800000000678, 101.64433000000008],
 [3.152017197420035, 101.70102760046613],
 [3.129290000000026, 101.69892000000004],
 [3.17381000000006, 101.68276000000009],
 [3.061870000000056, 101.74675000000008],
 [3.163590000000056, 101.69811000000004],
 [3.147980000000075, 101.66798000000006],
 [3.1387586696676304, 101.6840455304707],
 [3.1387586696676304, 101.6840455304707],
 [3.156685175474611, 101.69807679

In [14]:
# Turn the list of coordinate pairs into a two-column table (Latitude, Longitude)
df_coords = pd.DataFrame(coords, columns=['Latitude', 'Longitude'])


In [15]:
# Attach the coordinates to the neighborhood table. pandas aligns the assigned
# Series on the index, so this relies on kl_df and df_coords sharing the same
# row index (both were built in the same neighborhood order).
kl_df['Latitude'] = df_coords['Latitude']
kl_df['Longitude'] = df_coords['Longitude']

In [16]:
# Print the table dimensions after adding the coordinate columns, then display the table
print(kl_df.shape)
kl_df

(71, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,Alam Damai,3.05769,101.74388
1,"Ampang, Kuala Lumpur",3.148492,101.696727
2,Bandar Menjalara,3.19035,101.62545
3,Bandar Sri Permaisuri,3.10391,101.71226
4,Bandar Tasik Selatan,3.07275,101.71461
5,Bandar Tun Razak,3.0828,101.72281
6,Bangsar,3.1292,101.67844
7,Bangsar Park,3.13478,101.67262
8,Bangsar South,3.11102,101.66283
9,Batu 11 Cheras,3.09898,101.73499


In [17]:
# Save the neighborhood/coordinate table to kl_df.csv, omitting the row index
kl_df.to_csv("kl_df.csv", index=False)

In [18]:
# Look up the coordinates of Kuala Lumpur itself; they are used to centre the map.
address = 'Kuala Lumpur, Malaysia'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
if location is None:
    # geocode() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the attribute accesses below.
    raise ValueError('Nominatim returned no result for {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Kuala Lumpur, Malaysiae {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Kuala Lumpur, Malaysiae 3.1516964, 101.6942371.


In [19]:
# Base map centred on the Kuala Lumpur coordinates looked up earlier
map_kl = folium.Map(location=[latitude, longitude], zoom_start=11)

# Draw one circle marker per neighborhood; its popup shows the neighborhood name
for lat, lng, neighborhood in zip(kl_df['Latitude'], kl_df['Longitude'], kl_df['Neighborhood']):
    label = folium.Popup('{}'.format(neighborhood), parse_html=True)
    marker = folium.CircleMarker(
        location=[lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    )
    marker.add_to(map_kl)

map_kl

In [20]:
# Save the neighborhood map to map_kl.html
map_kl.save('map_kl.html')

In [21]:
# define Foursquare Credentials and Version
# The credentials are read from the environment so that no secret is stored in
# the notebook (or leaked through its saved outputs). Set FOURSQUARE_CLIENT_ID
# and FOURSQUARE_CLIENT_SECRET before starting the kernel.
import os

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

if not (CLIENT_ID and CLIENT_SECRET):
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will be rejected.')

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# Never echo the secret itself; only report whether one was provided.
print('CLIENT_SECRET:' + ('<set>' if CLIENT_SECRET else '<missing>'))

Your credentails:
CLIENT_ID: R1DXCTW41SY1D0KSX3YSQ002ZULHMARD3L5WIQDQBO43DRN1
CLIENT_SECRET:<redacted>


In [23]:
# Query the Foursquare "explore" endpoint once per neighborhood and flatten the
# returned venues into tuples of
# (neighborhood, neighborhood lat, neighborhood lng, venue name, venue lat, venue lng, category).
radius = 2000  # search radius sent to the API for each neighborhood centre
LIMIT = 100    # maximum number of venues requested per neighborhood

venues = []

for lat, lng, neighborhood in zip(kl_df['Latitude'], kl_df['Longitude'], kl_df['Neighborhood']):

    # Pass the query as params so requests handles URL encoding and the
    # credentials are not hand-formatted into a URL string.
    response = requests.get(
        "https://api.foursquare.com/v2/venues/explore",
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        },
        timeout=30,
    )
    # Surface authentication/quota/server errors here rather than as a
    # KeyError on the missing "response" payload.
    response.raise_for_status()

    groups = response.json()["response"].get('groups') or []
    results = groups[0]['items'] if groups else []

    # return only relevant information for each nearby venue
    for item in results:
        venue = item['venue']
        # A venue without categories is kept with a missing category instead
        # of aborting the whole download with an IndexError.
        categories = venue.get('categories') or []
        venues.append((
            neighborhood,
            lat,
            lng,
            venue['name'],
            venue['location']['lat'],
            venue['location']['lng'],
            categories[0]['name'] if categories else None))

In [25]:
# convert the venues list into a new DataFrame
# The column names are passed to the constructor so that an empty `venues` list
# yields an empty table with the expected columns; assigning `.columns` after
# construction fails with a length mismatch in that case.
venues_df = pd.DataFrame(
    venues,
    columns=['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory'])

print(venues_df.shape)
venues_df.head()

(7079, 7)


Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Alam Damai,3.05769,101.74388,Pengedar Shaklee Kuala Lumpur,3.061235,101.740696,Supplement Shop
1,Alam Damai,3.05769,101.74388,Machi Noodle 妈子面,3.057695,101.746635,Noodle House
2,Alam Damai,3.05769,101.74388,Minang Tomyam,3.057185,101.749812,Seafood Restaurant
3,Alam Damai,3.05769,101.74388,628火焰鑫茶室,3.058442,101.747947,Chinese Restaurant
4,Alam Damai,3.05769,101.74388,Ivy Sekinchan Seafood Noodle House 适耕莊特制魚丸海鲜面,3.065749,101.748718,Noodle House


In [26]:
# Count the non-null entries in each column, per neighborhood
venues_df.groupby("Neighborhood").count()

Unnamed: 0_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alam Damai,100,100,100,100,100,100
"Ampang, Kuala Lumpur",100,100,100,100,100,100
Bandar Menjalara,100,100,100,100,100,100
Bandar Sri Permaisuri,100,100,100,100,100,100
Bandar Tasik Selatan,96,96,96,96,96,96
Bandar Tun Razak,100,100,100,100,100,100
Bangsar,100,100,100,100,100,100
Bangsar Park,100,100,100,100,100,100
Bangsar South,100,100,100,100,100,100
Batu 11 Cheras,100,100,100,100,100,100


In [27]:
# Report how many distinct venue categories were returned (missing values count as a category)
print('There are {} uniques categories.'.format(venues_df['VenueCategory'].nunique(dropna=False)))

There are 304 uniques categories.


In [32]:
# List the distinct venue categories; the slice caps the display at the first 350 entries
venues_df['VenueCategory'].unique()[:350]

array(['Supplement Shop', 'Noodle House', 'Seafood Restaurant',
       'Chinese Restaurant', 'Breakfast Spot', 'Restaurant', 'Food Court',
       'Dim Sum Restaurant', 'Other Great Outdoors',
       'Vegetarian / Vegan Restaurant', 'Park', 'Coffee Shop',
       'Asian Restaurant', 'Indian Restaurant', 'Bubble Tea Shop', 'Spa',
       'Food Truck', 'Convenience Store', 'Chinese Breakfast Place',
       'Snack Place', 'Outlet Store', 'Dessert Shop', 'Pet Store',
       'Bakery', 'Japanese Restaurant', 'Café', 'Cantonese Restaurant',
       'Malay Restaurant', 'Gym / Fitness Center', 'Farmers Market',
       'Steakhouse', 'Athletics & Sports', 'Fast Food Restaurant',
       'Hakka Restaurant', 'Middle Eastern Restaurant', 'Badminton Court',
       'Pharmacy', 'Mamak Restaurant', 'Winery', 'Burger Joint',
       'College Bookstore', 'Grocery Store', 'Hostel', 'Exhibit',
       'South Indian Restaurant', 'Chettinad Restaurant', 'Hotel',
       'Juice Bar', 'Speakeasy', 'Monument / Landmark'

In [33]:
# Check whether any venue category is literally named "Neighborhood".
# NOTE(review): presumably this guards against a clash with the 'Neighborhood'
# column in a later step — confirm against the cells that follow.
"Neighborhood" in venues_df['VenueCategory'].unique()

False