# Introduction

A customer is interested in starting their business in Bangkok, Thailand. They are focusing on opening a restaurant but they do not know what kind of cuisine is the best choice so they would like to know the current profiles of restaurant bussiness in Bangkok.

In [3]:
#importing libraries
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


# Data

We will scrape the data from foursquare api to uunderstand the current status of this kind of business in Berkeley.

In [60]:
latitude = 13.7573
longitude = 100.4951
print('The geograpical coordinate of Bangkok are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Bangkok are 13.7573, 100.4951.


In [61]:
# create map of Bangkok using latitude and longitude values
map_bkk = folium.Map(location=[latitude, longitude], zoom_start=12)

label = '{}'.format('Bangkok')
label = folium.Popup(label, parse_html=True)
folium.CircleMarker(
    [latitude, longitude],
    radius=5,
    popup=label,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False).add_to(map_bkk)  

map_bkk

In [62]:
import urllib.request

# import the BeautifulSoup library so we can parse HTML and XML documents
from bs4 import BeautifulSoup

In [63]:
# specify which URL/web page we are going to be scraping
url = "https://en.wikipedia.org/wiki/List_of_districts_of_Bangkok"

# open the url using urllib.request and put the HTML into the page variable
page = urllib.request.urlopen(url)

# parse the HTML from our URL into the BeautifulSoup parse tree format
soup = BeautifulSoup(page, "lxml")

#print(soup.prettify())

In [64]:
# use the 'find_all' function to bring back all instances of the 'table' tag in the HTML and store in 'all_tables' variable
all_tables=soup.find_all("table")
#all_tables
right_table=soup.find('table', class_='wikitable sortable')

In [73]:
A=[]
B=[]
C=[]
D=[]

for row in right_table.findAll('tr'):
    cells=row.findAll('td')
    if (len(cells)==8) :
        A.append(cells[0].find(text=True))
        B.append(cells[6].find(text=True))
        C.append(cells[7].find(text=True))
        D.append(cells[2].find(text=True))
        

df=pd.DataFrame(D,columns=['ZipCode'])
df['District']=A
df['Latitude']=B
df['Longitude']=C


df = df.sort_values(by=['ZipCode'])
df = df = df.reset_index(drop=True)

df.head()

Unnamed: 0,ZipCode,District,Latitude,Longitude
0,10100,Samphanthawong,13.731389,100.514167
1,10100,Pom Prap Sattru Phai,13.758056,100.513056
2,10110,Watthana,13.742222,100.585833
3,10110,Khlong Toei,13.708056,100.583889
4,10120,Bang Kho Laem,13.693333,100.5025


In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
ZipCode      50 non-null object
District     50 non-null object
Latitude     50 non-null object
Longitude    50 non-null object
dtypes: object(4)
memory usage: 1.6+ KB


In [75]:
df['Longitude'] = df['Longitude'].astype(float)
df['Latitude'] = df['Latitude'].astype(float)

In [76]:
# create map of Bangkok using latitude and longitude values
map_bkk = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng in zip(df['Latitude'], df['Longitude']):
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_bkk)  
    
map_bkk



## Finding sub-groups of BKK 

We will group all districts into 10 groups and focusing on only one of them by using KNN

### 1. Examplifying by using one district

In [77]:
# set number of clusters
kclusters = 7

bkk_grouped_clustering = df.drop('District', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(bkk_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3], dtype=int32)

In [78]:
# add clustering labels
bkk_clust = df.insert(0, 'Cluster Labels', kmeans.labels_)

bkk_merged = df

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
bkk_merged = bkk_merged.join(bkk_clust.set_index('District'), on='District')

bkk_merged.head() # check the last columns!

AttributeError: 'NoneType' object has no attribute 'set_index'

In [79]:
# create map of Bangkok using latitude and longitude values
map_bkk = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to map
for lat, lng, poi, cluster in zip(bkk_merged['Latitude'], bkk_merged['Longitude'], bkk_merged['District'], bkk_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_bkk) 
    
map_bkk

In [94]:
fin_df = df[df['Cluster Labels']==2]
fin_df = fin_df.reset_index(drop=True)
fin_df

Unnamed: 0,Cluster Labels,ZipCode,District,Latitude,Longitude
0,2,10400,Phaya Thai,13.78,100.542778
1,2,10400,Din Daeng,13.769722,100.552778
2,2,10400,Ratchathewi,13.758889,100.534444


We choosed this district because it has a high potential for tourists and local people. This district has a public transit and connectivity.

In [97]:
lat1 = fin_df.loc[0,'Latitude']
long1 = fin_df.loc[0,'Longitude']

# create map of Bangkok using latitude and longitude values
map_bkk_fin = folium.Map(location=[lat1, long1], zoom_start=13)

# add markers to map
for lat, lng in zip(fin_df['Latitude'], fin_df['Longitude']):
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_bkk_fin)  
    
map_bkk_fin

# Getting sample venues of Bangkok from Foursquare

In [98]:
CLIENT_ID = 'QQGXNALTUQ3WZMBWLVSPYKXF54V5EFKVT3JLWNOBYU3FUQ1T' # your Foursquare ID
CLIENT_SECRET = 'SW2VNWJ3AO0I3G2V20IBTFDHWZUBEQDNPABXC4LISYBESVP5' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: QQGXNALTUQ3WZMBWLVSPYKXF54V5EFKVT3JLWNOBYU3FUQ1T
CLIENT_SECRET:SW2VNWJ3AO0I3G2V20IBTFDHWZUBEQDNPABXC4LISYBESVP5


We will limit the number of searches to be lower than 200 and assuming that this proportion can infer to the proportion of different tyoes of venues in all three districts.

In [101]:
LIMIT = 200
radius = 500 

In [108]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['District', 
                  'District Latitude', 
                  'District Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [109]:
bkk_venues = getNearbyVenues(names=fin_df['District'],
                                   latitudes=fin_df['Latitude'],
                                   longitudes=fin_df['Longitude']
                                  )

Phaya Thai
Din Daeng
Ratchathewi


In [110]:
print(bkk_venues.shape)
bkk_venues.head()

(136, 7)


Unnamed: 0,District,District Latitude,District Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Phaya Thai,13.78,100.542778,O'glee (โอ-กลี),13.77988,100.543729,Bar
1,Phaya Thai,13.78,100.542778,Hero Izakaya,13.781721,100.543566,Japanese Restaurant
2,Phaya Thai,13.78,100.542778,Coffee No.9,13.780003,100.543816,Coffee Shop
3,Phaya Thai,13.78,100.542778,Greyhound Café (เกรฮาวด์ คาเฟ่),13.780069,100.544988,Restaurant
4,Phaya Thai,13.78,100.542778,Sousaku (โซซาคุ),13.780215,100.540466,Sushi Restaurant


In [112]:
print('There are {} uniques categories.'.format(len(bkk_venues['Venue Category'].unique())))

There are 55 uniques categories.


## Next step

1. We can analyze what type of restaurants that is still lacking but interesting to start in this area, so we will analyze it fr