#### Assignment: Capstone Project - The Battle of Neighborhoods
### Week 4 Report
**_Opening a Seafood Restaurant in Ho Chi Minh City, Vietnam_**
- Build a dataframe of neighborhoods in Ho Chi Minh City, Vietnam by scraping the data from a Wikipedia page
- Get the geographical coordinates of the neighborhoods
- Obtain the venue data for the neighborhoods from Foursquare API
- Explore and cluster the neighborhoods
- Select the best cluster to open a new Seafood Restaurant
***
### 1. Import libraries

In [1]:
import json # library to handle JSON files
import os # library to read API credentials from environment variables
import time # library to pause between geocoding retries

import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import geocoder # to get coordinates

import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents

from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print("Libraries imported.")

Libraries imported.


### 2. Scrape data from a Wikipedia page into a DataFrame

In [2]:
# send the GET request
# A timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of letting the
# following cells silently parse an error page.
response = requests.get(
    "https://en.wikipedia.org/wiki/Category:Districts_of_Ho_Chi_Minh_City",
    timeout=30,
)
response.raise_for_status()
data = response.text

In [3]:
# build a BeautifulSoup tree from the downloaded HTML using Python's built-in parser
soup = BeautifulSoup(markup=data, features='html.parser')

In [4]:
# start with an empty container for the scraped neighborhood names
neighborhoodList = list()

In [5]:
# collect the text of every <li> inside the first "mw-category" block of the page
# The list is rebuilt (not appended to) so that re-running this cell does not
# duplicate every neighborhood.
category_listing = soup.find("div", class_="mw-category")
if category_listing is None:
    # fail with a readable message instead of an IndexError on an empty result
    raise ValueError("No 'mw-category' block found; the Wikipedia page layout may have changed.")
neighborhoodList = [row.text for row in category_listing.find_all("li")]

In [6]:
# wrap the scraped names in a single-column DataFrame
hcm_df = pd.DataFrame(neighborhoodList, columns=["Neighborhood"])

# preview the first rows
hcm_df.head()

Unnamed: 0,Neighborhood
0,Bình Chánh District
1,"Bình Tân District, Ho Chi Minh City"
2,Bình Thạnh District
3,Cần Giờ District
4,Củ Chi District


In [7]:
# show the (rows, columns) shape of the dataframe
hcm_df.shape

(24, 1)

In [8]:
# display the neighborhood dataframe built from the scraped list
hcm_df

Unnamed: 0,Neighborhood
0,Bình Chánh District
1,"Bình Tân District, Ho Chi Minh City"
2,Bình Thạnh District
3,Cần Giờ District
4,Củ Chi District
5,"District 1, Ho Chi Minh City"
6,"District 2, Ho Chi Minh City"
7,"District 3, Ho Chi Minh City"
8,"District 4, Ho Chi Minh City"
9,"District 5, Ho Chi Minh City"


### 3. Get the geographical coordinates

In [9]:
# define a function to get coordinates
def get_latlng(neighborhood, max_attempts=5, retry_delay=1.0, geocode=None):
    """Return the coordinates of a neighborhood in Ho Chi Minh City.

    The query sent to the geocoder is
    '<neighborhood>, Ho Chi Minh City, Vietnam'.

    Parameters
    ----------
    neighborhood : str
        Neighborhood name to look up.
    max_attempts : int, default 5
        Maximum number of geocoder calls before giving up. The previous
        implementation retried forever, which hung the notebook whenever the
        service kept returning no result.
    retry_delay : float, default 1.0
        Seconds to wait between failed attempts.
    geocode : callable, optional
        Function taking the query string and returning an object with a
        ``latlng`` attribute. Defaults to ``geocoder.arcgis``.

    Returns
    -------
    The ``latlng`` value of the first geocoder response that is not None.

    Raises
    ------
    RuntimeError
        If no attempt produced coordinates.
    """
    if geocode is None:
        geocode = geocoder.arcgis

    query = '{}, Ho Chi Minh City, Vietnam'.format(neighborhood)

    for attempt in range(1, max_attempts + 1):
        lat_lng_coords = geocode(query).latlng
        if lat_lng_coords is not None:
            return lat_lng_coords
        # only pause when another attempt is still coming
        if attempt < max_attempts:
            time.sleep(retry_delay)

    raise RuntimeError(
        "Could not geocode {!r} after {} attempts".format(query, max_attempts)
    )

In [10]:
# geocode every neighborhood name in order and keep the results as a list
coords = list(map(get_latlng, hcm_df["Neighborhood"].tolist()))

In [11]:
# inspect the raw coordinate values returned by get_latlng, one entry per neighborhood
coords

[[10.679220000000043, 106.57654000000008],
 [10.75908000000004, 106.59206000000006],
 [10.805180000000064, 106.69280000000003],
 [10.41566000000006, 106.96130000000005],
 [10.977340000000027, 106.50223000000005],
 [10.780950000000075, 106.69911000000008],
 [10.791990000000055, 106.74985000000004],
 [10.775660000000073, 106.68674000000004],
 [10.766700000000071, 106.70647000000008],
 [10.755690000000072, 106.66637000000009],
 [10.745780000000025, 106.64777000000004],
 [10.70515000000006, 106.73748000000006],
 [10.74771000000004, 106.66334000000006],
 [10.820040000000063, 106.83185000000009],
 [10.768670000000043, 106.66564000000005],
 [10.763080000000059, 106.64294000000007],
 [10.850440000000049, 106.62731000000008],
 [10.833790000000022, 106.66557000000006],
 [10.888360000000034, 106.59640000000007],
 [10.701530000000048, 106.73818000000006],
 [10.795650000000023, 106.67464000000007],
 [10.75908000000004, 106.59206000000006],
 [10.782320000000027, 106.63667000000004],
 [10.84626000000

In [12]:
# temporary two-column frame holding the geocoded values
df_coords = pd.DataFrame(data=coords, columns=['Latitude', 'Longitude'])

In [13]:
# copy both coordinate columns onto the neighborhood dataframe (aligned on the row index)
for coord_column in ('Latitude', 'Longitude'):
    hcm_df[coord_column] = df_coords[coord_column]

In [14]:
# check the neighborhoods and the coordinates: print the shape, then display the frame
print(hcm_df.shape)
hcm_df

(24, 3)


Unnamed: 0,Neighborhood,Latitude,Longitude
0,Bình Chánh District,10.67922,106.57654
1,"Bình Tân District, Ho Chi Minh City",10.75908,106.59206
2,Bình Thạnh District,10.80518,106.6928
3,Cần Giờ District,10.41566,106.9613
4,Củ Chi District,10.97734,106.50223
5,"District 1, Ho Chi Minh City",10.78095,106.69911
6,"District 2, Ho Chi Minh City",10.79199,106.74985
7,"District 3, Ho Chi Minh City",10.77566,106.68674
8,"District 4, Ho Chi Minh City",10.7667,106.70647
9,"District 5, Ho Chi Minh City",10.75569,106.66637


In [15]:
# persist the neighborhoods and their coordinates to disk, without the row index
hcm_df.to_csv(path_or_buf="HCM_df.csv", index=False)

### 4. Create a map of Ho Chi Minh City, Vietnam with neighborhoods superimposed on top

In [16]:
# get the coordinates of HCM, VN
address = 'Ho Chi Minh City, Vietnam'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
if location is None:
    # geocode() yields None when nothing matches; stop here with a clear
    # message rather than failing on the attribute access below
    raise RuntimeError("Nominatim returned no result for {!r}".format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Ho Chi Minh City, Vietnam {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Ho Chi Minh City, Vietnam 10.6497452, 106.76197937344351.


In [17]:
# base map centred on the city coordinates
map_hcm = folium.Map(location=[latitude, longitude], zoom_start=11)

# appearance shared by every neighborhood marker
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
)

# one circle marker per neighborhood, with the neighborhood name as popup text
for lat, lng, neighborhood in zip(hcm_df['Latitude'], hcm_df['Longitude'], hcm_df['Neighborhood']):
    popup = folium.Popup('{}'.format(neighborhood), parse_html=True)
    marker = folium.CircleMarker([lat, lng], popup=popup, **marker_style)
    marker.add_to(map_hcm)

map_hcm

In [18]:
# write the neighborhood map to a standalone HTML file
map_hcm.save(outfile='map_Ho Chi Minh City_VN.html')

### 5. Use the Foursquare API to explore the neighborhoods

In [19]:
# define Foursquare credentials and version
# Credentials are read from the environment so they are never stored in the
# notebook or echoed into its output. Set FOURSQUARE_CLIENT_ID and
# FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE(review): the key pair that used to be hardcoded in this cell has been
# exposed and should be rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        'Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET environment variables.'
    )

# confirm the values are present without printing them
print('Foursquare credentials loaded from the environment.')

Your credentails:
CLIENT_ID: [REDACTED]
CLIENT_SECRET:[REDACTED]


**Now, let's get the top 100 venues that are within a radius of 2000 meters.**

In [20]:
radius = 2000  # search radius around each neighborhood centre, in meters
LIMIT = 100    # maximum number of venues requested per neighborhood

# rebuilt from scratch on every run so the cell stays idempotent
venues = []

for lat, lng, neighborhood in zip(hcm_df['Latitude'], hcm_df['Longitude'], hcm_df['Neighborhood']):

    # query parameters are passed via `params` so requests builds and encodes
    # the URL; the credentials are no longer formatted into a string by hand
    response = requests.get(
        "https://api.foursquare.com/v2/venues/explore",
        params={
            "client_id": CLIENT_ID,
            "client_secret": CLIENT_SECRET,
            "v": VERSION,
            "ll": "{},{}".format(lat, lng),
            "radius": radius,
            "limit": LIMIT,
        },
        timeout=30,
    )
    # surface HTTP errors (bad credentials, quota exceeded) at the failing request
    response.raise_for_status()

    # an area with no recommendations may come back without any group
    groups = response.json().get("response", {}).get("groups", [])
    results = groups[0]["items"] if groups else []

    # keep only the relevant information for each nearby venue
    for item in results:
        venue = item['venue']
        categories = venue.get('categories') or []
        if not categories:
            # the analysis below is category-based, so a venue without any
            # category is skipped instead of raising IndexError
            continue
        venues.append((
            neighborhood,
            lat,
            lng,
            venue['name'],
            venue['location']['lat'],
            venue['location']['lng'],
            categories[0]['name']))

In [21]:
# convert the venues list into a new DataFrame
# Column names are supplied at construction time: assigning `.columns`
# afterwards raises a length-mismatch error when `venues` is empty.
venues_df = pd.DataFrame(
    venues,
    columns=['Neighborhood', 'Latitude', 'Longitude', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory'],
)

print(venues_df.shape)
venues_df.head()

(1041, 7)


Unnamed: 0,Neighborhood,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Bình Chánh District,10.67922,106.57654,Kedai Sarah,10.688974,106.574965,Women's Store
1,Bình Chánh District,10.67922,106.57654,Lò Bánh Mì Vạn Hoà,10.665982,106.570857,Bakery
2,Bình Chánh District,10.67922,106.57654,Xí Nghiep Sx Hang Thu Cong My Nghe 27-7,10.683414,106.562306,Arts & Crafts Store
3,Bình Chánh District,10.67922,106.57654,National Road 1A,10.683168,106.561552,Bus Station
4,Bình Chánh District,10.67922,106.57654,Ốc chị Lượm,10.66373,106.570333,Seafood Restaurant


**Let's check how many venues were returned for each neighborhood**

In [22]:
# number of non-null values per column for each neighborhood, i.e. venues returned
venues_by_neighborhood = venues_df.groupby(["Neighborhood"])
venues_by_neighborhood.count()

Unnamed: 0_level_0,Latitude,Longitude,VenueName,VenueLatitude,VenueLongitude,VenueCategory
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bình Chánh District,6,6,6,6,6,6
Bình Thạnh District,88,88,88,88,88,88
"Bình Tân District, Ho Chi Minh City",8,8,8,8,8,8
Cần Giờ District,4,4,4,4,4,4
Củ Chi District,6,6,6,6,6,6
"District 1, Ho Chi Minh City",100,100,100,100,100,100
"District 10, Ho Chi Minh City",100,100,100,100,100,100
"District 11, Ho Chi Minh City",50,50,50,50,50,50
"District 12, Ho Chi Minh City",8,8,8,8,8,8
"District 2, Ho Chi Minh City",40,40,40,40,40,40


**Let's find out how many unique categories can be curated from all the returned venues**

In [23]:
# count the distinct venue categories across all neighborhoods
category_count = len(venues_df['VenueCategory'].unique())
print('There are {} uniques categories.'.format(category_count))

There are 130 uniques categories.


In [24]:
# list every distinct venue category
# (no hard-coded slice length, so nothing is cut off if the number of categories changes)
venues_df['VenueCategory'].unique()

array(["Women's Store", 'Bakery', 'Arts & Crafts Store', 'Bus Station',
       'Seafood Restaurant', 'Diner', 'Café', 'Asian Restaurant',
       'Vietnamese Restaurant', 'Karaoke Bar', 'Korean Restaurant',
       'Snack Place', 'Coffee Shop', 'French Restaurant', 'Spa',
       'BBQ Joint', 'Supermarket', 'Hostel', 'Japanese Restaurant',
       'Convention Center', 'Breakfast Spot',
       'Vegetarian / Vegan Restaurant', 'Travel Agency', 'Park',
       'Yoga Studio', 'Design Studio', 'Gym / Fitness Center',
       'Russian Restaurant', 'Dessert Shop', 'Food Truck', 'Noodle House',
       'Beer Garden', 'Bookstore', 'Sushi Restaurant', 'Soup Place',
       'Ice Cream Shop', 'Dim Sum Restaurant', 'Flea Market', 'Beach',
       'Pharmacy', 'Restaurant', 'Hotel', 'Pizza Place', 'Hotel Bar',
       'Cupcake Shop', 'Bar', 'Italian Restaurant', 'Massage Studio',
       'Middle Eastern Restaurant', 'Brewery', 'Hotpot Restaurant',
       'Steakhouse', 'North Indian Restaurant', 'Thai Restaurant

In [25]:
# confirm that at least one returned venue is categorised as "Seafood Restaurant"
distinct_categories = venues_df['VenueCategory'].unique()
"Seafood Restaurant" in distinct_categories

True

### 6. Analyze Each Neighborhood

In [26]:
# one indicator column per venue category (no prefix, so the column name is the category)
hcm_onehot = pd.get_dummies(venues_df[['VenueCategory']], prefix="", prefix_sep="")

# attach the neighborhood of each venue as the last column
hcm_onehot['Neighborhoods'] = venues_df['Neighborhood']

# rotate the column order so that the last column comes first
column_order = hcm_onehot.columns.tolist()
hcm_onehot = hcm_onehot[column_order[-1:] + column_order[:-1]]

print(hcm_onehot.shape)
hcm_onehot.head()

(1041, 131)


Unnamed: 0,Neighborhoods,Airport Food Court,Airport Lounge,Airport Service,Arcade,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,Bakery,Bar,Basketball Stadium,Bath House,Bathing Area,Beach,Bed & Breakfast,Beer Bar,Beer Garden,Bistro,Boat or Ferry,Bookstore,Boutique,Breakfast Spot,Brewery,Bubble Tea Shop,Buffet,Burger Joint,Bus Station,Café,Cantonese Restaurant,Chinese Restaurant,Clothing Store,Cocktail Bar,Coffee Shop,Convenience Store,Convention Center,Cupcake Shop,Department Store,Design Studio,Dessert Shop,Dim Sum Restaurant,Diner,Dumpling Restaurant,Duty-free Shop,Electronics Store,Fast Food Restaurant,Flea Market,Flower Shop,Food,Food Court,Food Truck,French Restaurant,Fried Chicken Joint,German Restaurant,Gift Shop,Golf Course,Grocery Store,Gym,Gym / Fitness Center,Hawaiian Restaurant,Hostel,Hotel,Hotel Bar,Hotpot Restaurant,Ice Cream Shop,Indian Restaurant,Intersection,Italian Restaurant,Japanese Restaurant,Jazz Club,Jewelry Store,Juice Bar,Karaoke Bar,Korean Restaurant,Lounge,Market,Massage Studio,Mattress Store,Mexican Restaurant,Middle Eastern Restaurant,Movie Theater,Multiplex,Museum,Music Venue,Nightclub,Noodle House,North Indian Restaurant,Opera House,Outdoors & Recreation,Paintball Field,Park,Pharmacy,Pizza Place,Pool,Pub,Ramen Restaurant,Residential Building (Apartment / Condo),Rest Area,Restaurant,Rock Club,Russian Restaurant,Sandwich Place,Seafood Restaurant,Shoe Store,Shopping Mall,Shopping Plaza,Snack Place,Soup Place,Spa,Spanish Restaurant,Speakeasy,Steakhouse,Supermarket,Sushi Restaurant,Tapas Restaurant,Tattoo Parlor,Tea Room,Temple,Thai Restaurant,Theater,Theme Park,Travel Agency,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Water Park,Whisky Bar,Women's Store,Yoga Studio
0,Bình Chánh District,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,Bình Chánh District,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Bình Chánh District,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Bình Chánh District,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Bình Chánh District,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


**Next, let's group rows by neighborhood and take the mean of the frequency of occurrence of each category**

In [27]:
# mean of each indicator column per neighborhood = share of that category among its venues
neighborhood_means = hcm_onehot.groupby(["Neighborhoods"]).mean()
hcm_grouped = neighborhood_means.reset_index()

print(hcm_grouped.shape)
hcm_grouped

(24, 131)


Unnamed: 0,Neighborhoods,Airport Food Court,Airport Lounge,Airport Service,Arcade,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,BBQ Joint,Bakery,Bar,Basketball Stadium,Bath House,Bathing Area,Beach,Bed & Breakfast,Beer Bar,Beer Garden,Bistro,Boat or Ferry,Bookstore,Boutique,Breakfast Spot,Brewery,Bubble Tea Shop,Buffet,Burger Joint,Bus Station,Café,Cantonese Restaurant,Chinese Restaurant,Clothing Store,Cocktail Bar,Coffee Shop,Convenience Store,Convention Center,Cupcake Shop,Department Store,Design Studio,Dessert Shop,Dim Sum Restaurant,Diner,Dumpling Restaurant,Duty-free Shop,Electronics Store,Fast Food Restaurant,Flea Market,Flower Shop,Food,Food Court,Food Truck,French Restaurant,Fried Chicken Joint,German Restaurant,Gift Shop,Golf Course,Grocery Store,Gym,Gym / Fitness Center,Hawaiian Restaurant,Hostel,Hotel,Hotel Bar,Hotpot Restaurant,Ice Cream Shop,Indian Restaurant,Intersection,Italian Restaurant,Japanese Restaurant,Jazz Club,Jewelry Store,Juice Bar,Karaoke Bar,Korean Restaurant,Lounge,Market,Massage Studio,Mattress Store,Mexican Restaurant,Middle Eastern Restaurant,Movie Theater,Multiplex,Museum,Music Venue,Nightclub,Noodle House,North Indian Restaurant,Opera House,Outdoors & Recreation,Paintball Field,Park,Pharmacy,Pizza Place,Pool,Pub,Ramen Restaurant,Residential Building (Apartment / Condo),Rest Area,Restaurant,Rock Club,Russian Restaurant,Sandwich Place,Seafood Restaurant,Shoe Store,Shopping Mall,Shopping Plaza,Snack Place,Soup Place,Spa,Spanish Restaurant,Speakeasy,Steakhouse,Supermarket,Sushi Restaurant,Tapas Restaurant,Tattoo Parlor,Tea Room,Temple,Thai Restaurant,Theater,Theme Park,Travel Agency,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Water Park,Whisky Bar,Women's Store,Yoga Studio
0,Bình Chánh District,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0
1,Bình Thạnh District,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.022727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022727,0.0,0.0,0.011364,0.0,0.022727,0.0,0.0,0.0,0.0,0.0,0.193182,0.0,0.0,0.0,0.0,0.079545,0.0,0.011364,0.0,0.0,0.011364,0.011364,0.011364,0.0,0.0,0.0,0.0,0.0,0.011364,0.0,0.0,0.0,0.022727,0.011364,0.0,0.0,0.0,0.0,0.0,0.0,0.011364,0.0,0.011364,0.0,0.0,0.0,0.011364,0.0,0.0,0.0,0.022727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022727,0.0,0.0,0.0,0.0,0.022727,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011364,0.0,0.022727,0.0,0.0,0.0,0.022727,0.011364,0.034091,0.0,0.0,0.0,0.011364,0.011364,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011364,0.0,0.045455,0.215909,0.0,0.0,0.0,0.0,0.011364
2,"Bình Tân District, Ho Chi Minh City",0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0
3,Cần Giờ District,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0
4,Củ Chi District,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0
5,"District 1, Ho Chi Minh City",0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.02,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.09,0.0,0.0,0.0,0.02,0.06,0.01,0.01,0.01,0.01,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13,0.03,0.01,0.0,0.01,0.0,0.02,0.02,0.01,0.01,0.0,0.0,0.0,0.02,0.0,0.03,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.02,0.0,0.04,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.01,0.01,0.01,0.01,0.01,0.0,0.0,0.0,0.0,0.02,0.01,0.0,0.0,0.0,0.04,0.11,0.0,0.0,0.01,0.0,0.0
6,"District 10, Ho Chi Minh City",0.0,0.0,0.0,0.0,0.0,0.01,0.02,0.0,0.03,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.09,0.0,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.03,0.03,0.0,0.01,0.0,0.01,0.0,0.0,0.01,0.01,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.03,0.0,0.01,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.01,0.0,0.05,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.01,0.02,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.17,0.0,0.0,0.0,0.0,0.0
7,"District 11, Ho Chi Minh City",0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.02,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.22,0.04,0.14,0.0,0.0,0.02,0.0,0.0,0.0,0.02,0.0,0.04,0.02,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.08,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.02,0.0,0.0,0.02,0.06,0.0,0.02,0.0,0.0,0.0
8,"District 12, Ho Chi Minh City",0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,"District 2, Ho Chi Minh City",0.0,0.0,0.0,0.0,0.025,0.0,0.075,0.025,0.05,0.025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.025,0.0,0.0,0.0,0.025,0.025,0.025,0.0,0.15,0.0,0.0,0.0,0.0,0.075,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025,0.0,0.0,0.025,0.0,0.0,0.0,0.025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0,0.025,0.025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025,0.0,0.025,0.0,0.0,0.0,0.0,0.05,0.025,0.0,0.0,0.0,0.0,0.0


In [28]:
# number of neighborhoods in which at least one seafood restaurant was returned
has_seafood = hcm_grouped["Seafood Restaurant"] > 0
len(hcm_grouped[has_seafood])

14

**Create a new DataFrame for Seafood Restaurant data only**

In [30]:
# keep only the neighborhood name and its seafood-restaurant frequency
seafood_columns = ["Neighborhoods", "Seafood Restaurant"]
hcm_mall = hcm_grouped[seafood_columns]

In [31]:
# display the seafood-restaurant frequency of every neighborhood
hcm_mall

Unnamed: 0,Neighborhoods,Seafood Restaurant
0,Bình Chánh District,0.166667
1,Bình Thạnh District,0.022727
2,"Bình Tân District, Ho Chi Minh City",0.125
3,Cần Giờ District,0.0
4,Củ Chi District,0.0
5,"District 1, Ho Chi Minh City",0.0
6,"District 10, Ho Chi Minh City",0.04
7,"District 11, Ho Chi Minh City",0.08
8,"District 12, Ho Chi Minh City",0.0
9,"District 2, Ho Chi Minh City",0.025


### 7. Cluster Neighborhoods
Run k-means to cluster the neighborhoods in Ho Chi Minh City, VN into 3 clusters.

In [32]:
# set number of clusters
kclusters = 3

# keep only the numeric feature column for clustering
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts
hcm_clustering = hcm_mall.drop(columns=["Neighborhoods"])

# run k-means clustering
# n_init is pinned to 10 (the long-standing default) so the result does not
# depend on the newer 'auto' default of recent scikit-learn releases
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(hcm_clustering)

# check cluster labels generated for the first rows of the dataframe
kmeans.labels_[0:10]

array([1, 2, 1, 0, 0, 0, 2, 2, 0, 2])

In [33]:
# copy of the seafood-frequency frame with the k-means label of each neighborhood added
hcm_merged = hcm_mall.assign(**{"Cluster Labels": kmeans.labels_})

In [34]:
# use the same key column name as hcm_df ("Neighborhood") so the two frames can be joined
hcm_merged = hcm_merged.rename(columns={"Neighborhoods": "Neighborhood"})
hcm_merged.head()

Unnamed: 0,Neighborhood,Seafood Restaurant,Cluster Labels
0,Bình Chánh District,0.166667,1
1,Bình Thạnh District,0.022727,2
2,"Bình Tân District, Ho Chi Minh City",0.125,1
3,Cần Giờ District,0.0,0
4,Củ Chi District,0.0,0


In [35]:
# look up latitude/longitude for each clustered neighborhood by name in hcm_df
coords_by_neighborhood = hcm_df.set_index("Neighborhood")
hcm_merged = hcm_merged.join(coords_by_neighborhood, on="Neighborhood")

print(hcm_merged.shape)
hcm_merged.head() # the last two columns are the joined coordinates

(24, 5)


Unnamed: 0,Neighborhood,Seafood Restaurant,Cluster Labels,Latitude,Longitude
0,Bình Chánh District,0.166667,1,10.67922,106.57654
1,Bình Thạnh District,0.022727,2,10.80518,106.6928
2,"Bình Tân District, Ho Chi Minh City",0.125,1,10.75908,106.59206
3,Cần Giờ District,0.0,0,10.41566,106.9613
4,Củ Chi District,0.0,0,10.97734,106.50223


In [36]:
# order the rows by cluster label so that members of a cluster appear together
print(hcm_merged.shape)
hcm_merged = hcm_merged.sort_values(["Cluster Labels"])
hcm_merged

(24, 5)


Unnamed: 0,Neighborhood,Seafood Restaurant,Cluster Labels,Latitude,Longitude
11,"District 4, Ho Chi Minh City",0.0,0,10.7667,106.70647
21,Thủ Đức District,0.0,0,10.84626,106.76992
19,Nhà Bè District,0.0,0,10.70153,106.73818
16,"District 9, Ho Chi Minh City",0.0,0,10.82004,106.83185
14,"District 7, Ho Chi Minh City",0.0,0,10.70515,106.73748
8,"District 12, Ho Chi Minh City",0.0,0,10.85044,106.62731
23,"Tân Phú District, Ho Chi Minh City",0.0,0,10.78232,106.63667
5,"District 1, Ho Chi Minh City",0.0,0,10.78095,106.69911
4,Củ Chi District,0.0,0,10.97734,106.50223
3,Cần Giờ District,0.0,0,10.41566,106.9613


**Finally, let's visualize the resulting clusters**

In [37]:
# create map centred on the city coordinates
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters: one evenly spaced rainbow color per cluster
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(hcm_merged['Latitude'], hcm_merged['Longitude'], hcm_merged['Neighborhood'], hcm_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    # labels start at 0, so index the palette directly; the former
    # `cluster-1` only worked for cluster 0 through negative-index wraparound
    cluster_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [38]:
# write the cluster map to a standalone HTML file
map_clusters.save(outfile='map_clusters.html')

### 8. Examine Clusters

#### Cluster 0

In [40]:
# neighborhoods assigned to cluster 0
in_cluster_0 = hcm_merged['Cluster Labels'].eq(0)
hcm_merged.loc[in_cluster_0]

Unnamed: 0,Neighborhood,Seafood Restaurant,Cluster Labels,Latitude,Longitude
11,"District 4, Ho Chi Minh City",0.0,0,10.7667,106.70647
21,Thủ Đức District,0.0,0,10.84626,106.76992
19,Nhà Bè District,0.0,0,10.70153,106.73818
16,"District 9, Ho Chi Minh City",0.0,0,10.82004,106.83185
14,"District 7, Ho Chi Minh City",0.0,0,10.70515,106.73748
8,"District 12, Ho Chi Minh City",0.0,0,10.85044,106.62731
23,"Tân Phú District, Ho Chi Minh City",0.0,0,10.78232,106.63667
5,"District 1, Ho Chi Minh City",0.0,0,10.78095,106.69911
4,Củ Chi District,0.0,0,10.97734,106.50223
3,Cần Giờ District,0.0,0,10.41566,106.9613


#### Cluster 1

In [41]:
# neighborhoods assigned to cluster 1
in_cluster_1 = hcm_merged['Cluster Labels'].eq(1)
hcm_merged.loc[in_cluster_1]

Unnamed: 0,Neighborhood,Seafood Restaurant,Cluster Labels,Latitude,Longitude
0,Bình Chánh District,0.166667,1,10.67922,106.57654
22,Tân Bình District,0.125,1,10.75908,106.59206
2,"Bình Tân District, Ho Chi Minh City",0.125,1,10.75908,106.59206
18,Hóc Môn District,0.142857,1,10.88836,106.5964


#### Cluster 2

In [42]:
# neighborhoods assigned to cluster 2
in_cluster_2 = hcm_merged['Cluster Labels'].eq(2)
hcm_merged.loc[in_cluster_2]

Unnamed: 0,Neighborhood,Seafood Restaurant,Cluster Labels,Latitude,Longitude
1,Bình Thạnh District,0.022727,2,10.80518,106.6928
20,Phú Nhuận District,0.05,2,10.79565,106.67464
17,Gò Vấp District,0.02439,2,10.83379,106.66557
7,"District 11, Ho Chi Minh City",0.08,2,10.76308,106.64294
13,"District 6, Ho Chi Minh City",0.020408,2,10.74578,106.64777
12,"District 5, Ho Chi Minh City",0.03,2,10.75569,106.66637
10,"District 3, Ho Chi Minh City",0.02,2,10.77566,106.68674
9,"District 2, Ho Chi Minh City",0.025,2,10.79199,106.74985
6,"District 10, Ho Chi Minh City",0.04,2,10.76867,106.66564
15,"District 8, Ho Chi Minh City",0.037736,2,10.74771,106.66334
