<a href="https://colab.research.google.com/github/truongson203/IBM/blob/master/Week5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries


In [0]:
!pip install geopy

In [0]:
!pip install folium

In [0]:
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd 
# Lift pandas' display limits: None means every column and every row of a
# DataFrame is rendered instead of being truncated.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import codecs
import json
import os

import folium
import requests

try:
  import geocoder
except:
  !pip install geocoder
  import geocoder

from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
from pandas.core.common import flatten

# json_normalize lives in the top-level pandas namespace since pandas 1.0 and
# is no longer importable from pandas.io.json on current releases; fall back
# to the old location only for legacy installations.
try:
  from pandas import json_normalize
except ImportError:
  from pandas.io.json import json_normalize

print("Libraries imported.")

# Data

In [0]:
!wget https://en.wikipedia.org/wiki/Category:Districts_of_Hanoi
print("Data downloaded successfully!")

In [0]:
# Read the downloaded page as UTF-8 text; the context manager closes the file
# handle as soon as the content is in memory.
with open('Category:Districts_of_Hanoi', encoding='utf-8') as page_file:
  html = page_file.read()

# Parse the markup with the standard-library HTML parser backend.
soup = BeautifulSoup(html, 'html.parser')

In [0]:
def clean_district_name(district_name):
  """Normalise a district label.

  Every ', Hanoi' fragment is removed, and ' District' is appended when the
  word 'District' does not already occur in the remaining text.
  """
  stripped = district_name.replace(', Hanoi', '')
  return stripped if 'District' in stripped else stripped + ' District'

In [0]:
# Take the text of the first 13 <ul> elements of the page, split it into lines
# and normalise every line with clean_district_name().
districts = [
    clean_district_name(line)
    for ul in soup.find_all('ul')[0:13]
    for line in ul.text.split('\n')
]

print('There are {} districts in Hanoi, VN.'.format(len(districts)))
districts

In [0]:
# Nominatim geocoding client from geopy; user_agent is the application name sent with each request.
geolocator = Nominatim(user_agent='ibm-capstone')

In [0]:
# Two independent, initially empty lists that will receive the latitude and
# longitude of each district.
districts_lat, districts_lng = [], []

In [0]:
# Geocode every district name. The result lists are rebuilt from scratch so
# that re-running this cell cannot append a second copy of the coordinates.
districts_lat = []
districts_lng = []
districts_not_found = []

for dist in districts:
  # geopy timeouts are expressed in seconds; 10 s is ample for one lookup.
  location = geolocator.geocode(dist, timeout=10)
  if location is None:
    # geocode() returns None when the service has no match for the query.
    districts_not_found.append(dist)
    continue
  districts_lat.append(location.latitude)
  districts_lng.append(location.longitude)

# Fail loudly and name the offenders: silently skipping them would leave the
# coordinate lists shorter than (and misaligned with) `districts`.
if districts_not_found:
  raise ValueError('Could not geocode: {}'.format(', '.join(districts_not_found)))

print('{} latitudes and {} longitudes were found.'.format(len(districts_lat), len(districts_lng)))

In [0]:
# Coordinates of the city itself (used later as the map centre).
city = 'Hanoi, VN'
# geopy timeouts are expressed in seconds; 10 s is ample for one lookup.
hanoi_geo = geolocator.geocode(city, timeout=10)
if hanoi_geo is None:
  # geocode() returns None when the service has no match for the query.
  raise ValueError('Could not geocode {!r}.'.format(city))
hanoi_lat = hanoi_geo.latitude
hanoi_lng = hanoi_geo.longitude

print('The latitude and longitude of {} are {}, {}.'.format(city, hanoi_lat, hanoi_lng))

In [0]:
# One row per district: its name next to the geocoded latitude and longitude.
df = pd.DataFrame(
    dict(District=districts, Latitude=districts_lat, Longitude=districts_lng)
)
df.head()

In [0]:
# Write the district table to a CSV file in the working directory, without the row index.
df.to_csv('Hanoi_Districts_data.csv', index=False)

# Get venues of each district within a radius of 10km

In [0]:
# Foursquare credentials and API settings.
# The credentials are read from the environment so that secrets are never
# stored in the notebook: set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before running (a missing variable raises KeyError here, not deep inside a
# request). Any key pair that was ever committed with this notebook should be
# treated as leaked and rotated.
CLIENT_ID = os.environ['FOURSQUARE_CLIENT_ID']
CLIENT_SECRET = os.environ['FOURSQUARE_CLIENT_SECRET']
VERSION = '20200101'  # sent as the `v` query parameter
SECTION = 'arts'      # sent as the `section` query parameter
LIMIT = 50            # sent as the `limit` query parameter
RADIUS = 10000        # sent as the `radius` query parameter (10 km per the section title)

In [0]:
def getNearbyVenues(districts, latitudes, longitudes):
    """Collect the Foursquare "explore" results around each district.

    The three arguments are iterated in parallel; one request is sent per
    district using the module-level CLIENT_ID, CLIENT_SECRET, VERSION, RADIUS,
    LIMIT and SECTION settings. Each district name is printed as it is queried.

    Parameters
    ----------
    districts : iterable of str
        District names, copied into the 'District' column of the result.
    latitudes, longitudes : iterable of float
        Search centre of each district.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'District',
        'District Latitude', 'District Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Venue Category'. The frame has these columns
        even when no venue was found. 'Venue Category' is None for a venue
        that carries no category.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    column_names = ['District',
                    'District Latitude',
                    'District Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for district, lat, lng in zip(districts, latitudes, longitudes):
        print(district)

        # Let requests assemble and URL-encode the query string.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': RADIUS,
            'limit': LIMIT,
            'section': SECTION,
        }

        # A timeout keeps a stalled connection from hanging the notebook, and
        # raise_for_status() turns an API error into a clear exception instead
        # of a KeyError while digging through the payload.
        response = requests.get(endpoint, params=params, timeout=30)
        response.raise_for_status()

        groups = response.json()['response'].get('groups') or []
        items = groups[0]['items'] if groups else []

        # Keep only the relevant fields of each nearby venue.
        for item in items:
            venue = item['venue']
            categories = venue.get('categories') or []
            rows.append((
                district,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Passing `columns=` up front also yields a correctly shaped empty frame.
    return pd.DataFrame(rows, columns=column_names)

In [0]:
# Query the venues around every district listed in df.
hanoi_venues = getNearbyVenues(df['District'], df['Latitude'], df['Longitude'])

In [0]:
# Preview the first rows of the venue table.
hanoi_venues.head()

In [0]:
# (number of venue rows, number of columns)
hanoi_venues.shape

# Data Wrangling

In [0]:
# Column dtypes and non-null counts of the venue table.
hanoi_venues.info()

In [0]:
# Distinct venue categories present in the data.
hanoi_venues['Venue Category'].unique()

# Data Analysis

In [0]:
# Number of (non-null) venue names per district, kept as a one-column DataFrame.
venues_by_dist = hanoi_venues.groupby('District')[['Venue']].count()
venues_by_dist

In [0]:
# Bar chart of the venue count per district, largest first. The legend is
# switched off through the plotting call (legend=False) rather than by handing
# an empty label string to plt.legend().
fig, ax = plt.subplots()
venues_by_dist.sort_values(by='Venue', ascending=False).plot.bar(ax=ax, legend=False)
ax.set_xlabel('District')
ax.set_ylabel('Number of venues')
ax.set_title('Number of venues in Hanoi')
plt.show()

In [0]:
# Number of (non-null) venue names per venue category, kept as a one-column DataFrame.
venues_by_category = hanoi_venues.groupby('Venue Category')[['Venue']].count()
venues_by_category

In [0]:
# Bar chart of the venue count per category, largest first. The legend is
# switched off through the plotting call (legend=False) rather than by handing
# an empty label string to plt.legend().
fig, ax = plt.subplots()
venues_by_category.sort_values(by='Venue', ascending=False).plot.bar(ax=ax, legend=False)
ax.set_xlabel('Venue Category')
ax.set_ylabel('Number of venues')
ax.set_title('Top most common venue categories in Hanoi')
plt.show()

In [0]:
# Report how many distinct venue categories exist, then list them.
print(f"There are {len(hanoi_venues['Venue Category'].unique())} unique values in Hanoi Venue Category")
hanoi_venues['Venue Category'].unique()

In [0]:
# One indicator column per venue category (empty prefix, so the column names
# are the bare category names).
hanoi_onehot = pd.get_dummies(hanoi_venues[['Venue Category']], prefix='', prefix_sep='')

# Attach the district of every venue as an additional column.
hanoi_onehot['District'] = hanoi_venues[['District']]

# Rotate the column order by one so that the last column comes first.
fixed_columns = hanoi_onehot.columns.tolist()
fixed_columns = fixed_columns[-1:] + fixed_columns[:-1]
hanoi_onehot = hanoi_onehot[fixed_columns]

hanoi_onehot.head()

In [0]:
# (number of venue rows, 1 district column + number of category columns)
hanoi_onehot.shape

In [0]:
# Mean of each indicator column per district, i.e. the share of the district's
# venues that fall in each category; 'District' stays an ordinary column.
hanoi_grouped = hanoi_onehot.groupby('District', as_index=False).mean()
hanoi_grouped

In [0]:
# (number of districts with at least one venue, 1 district column + number of category columns)
hanoi_grouped.shape

In [0]:
# Print the most common venue categories of each district.
num_top_venues = 5

for dist in hanoi_grouped['District']:
    print('----'+dist+'----')

    # Transpose the district's row so that every category becomes a row.
    temp = hanoi_grouped[hanoi_grouped['District'] == dist].T.reset_index()
    temp.columns = ['Venue Category', 'Frequency']

    # Every step below returns a new frame, so nothing is assigned into a
    # slice (which would trigger pandas' SettingWithCopyWarning).
    temp = (
        temp.iloc[1:]                         # drop the former 'District' row
        .astype({'Frequency': float})
        .round({'Frequency': 2})
        .sort_values(by='Frequency', ascending=False)
        .reset_index(drop=True)
    )

    # Honour num_top_venues instead of a second hard-coded 5.
    print(temp.head(num_top_venues))
    print('')

In [0]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest entries of `row`, ignoring its first entry.

    `row` is a pandas Series whose first element is skipped (the district name
    in this notebook). The remaining values are ranked in descending order and
    the index labels of the first `num_top_venues` are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [0]:
num_top_venues = 15


def _ordinal(n):
    """Return n with its English ordinal suffix, e.g. 1 -> '1st', 12 -> '12th', 22 -> '22nd'."""
    if 10 <= n % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
    return '{}{}'.format(n, suffix)


# A district cannot rank more categories than hanoi_grouped holds (every
# column except 'District'), so cap the number of rank columns accordingly.
num_rank_columns = min(num_top_venues, hanoi_grouped.shape[1] - 1)

# Column names: 'District', '1st Most Common Venue', '2nd Most Common Venue', ...
columns = ['District'] + [
    '{} Most Common Venue'.format(_ordinal(rank))
    for rank in range(1, num_rank_columns + 1)
]

# Rank the categories of every district, then build the table in one step
# instead of assigning row by row into an empty frame.
top_venues = [
    return_most_common_venues(hanoi_grouped.iloc[ind, :], num_rank_columns)
    for ind in range(hanoi_grouped.shape[0])
]
neighborhoods_venues_sorted = pd.DataFrame(
    top_venues, columns=columns[1:], index=hanoi_grouped.index
)
neighborhoods_venues_sorted.insert(0, 'District', hanoi_grouped['District'])

neighborhoods_venues_sorted

# Modelling

In [0]:
# set number of clusters
kclusters = 5

# Feature matrix for clustering: the per-district category frequencies without
# the name column. `columns=` is used because recent pandas releases no longer
# accept the axis as a second positional argument of drop().
hanoi_grouped_clustering = hanoi_grouped.drop(columns='District')

# run k-means clustering (fixed random_state for repeatable labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(hanoi_grouped_clustering)

# check cluster labels generated for the first rows of the dataframe
kmeans.labels_[0:10]

In [0]:
# add clustering labels
# Store the cluster label of every district as the first column. Overwriting
# the column when it already exists keeps this cell re-runnable and refreshes
# the labels after k-means has been re-fitted; any other failure (for example
# a length mismatch) is no longer swallowed.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
  neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
  neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

# Add the latitude/longitude of each district: join the labelled top-venue
# table onto df by district name. join() returns a new frame, so df itself is
# left untouched.
hanoi_merged = df.join(neighborhoods_venues_sorted.set_index('District'), on='District')

In [0]:
# Keep only the districts whose row has no missing value in any column.
hanoi_merged = hanoi_merged.dropna()
hanoi_merged

In [0]:
# create map centred on the city
map_clusters = folium.Map(location=[hanoi_lat, hanoi_lng], zoom_start=10)

# One evenly spaced rainbow colour per cluster, as hex strings.
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# add one circle marker per district, coloured by its cluster
for lat, lon, poi, cluster in zip(hanoi_merged['Latitude'], hanoi_merged['Longitude'], hanoi_merged['District'], hanoi_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to kclusters - 1, so they index the palette directly
    # (the former `cluster - 1` only worked because index -1 wraps around).
    # int() because the label may be stored as a float.
    marker_color = rainbow[int(cluster)]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [0]:
# Cluster 0: first column plus every column from position 4 onwards.
hanoi_merged[hanoi_merged['Cluster Labels'] == 0].iloc[:, [0] + list(range(4, hanoi_merged.shape[1]))]

In [0]:
# Cluster 1: first column plus every column from position 4 onwards.
hanoi_merged[hanoi_merged['Cluster Labels'] == 1].iloc[:, [0] + list(range(4, hanoi_merged.shape[1]))]

In [0]:
# Cluster 2: first column plus every column from position 4 onwards.
hanoi_merged[hanoi_merged['Cluster Labels'] == 2].iloc[:, [0] + list(range(4, hanoi_merged.shape[1]))]

In [0]:
# Cluster 3: first column plus every column from position 4 onwards.
hanoi_merged[hanoi_merged['Cluster Labels'] == 3].iloc[:, [0] + list(range(4, hanoi_merged.shape[1]))]

In [0]:
# Cluster 4: first column plus every column from position 4 onwards.
hanoi_merged[hanoi_merged['Cluster Labels'] == 4].iloc[:, [0] + list(range(4, hanoi_merged.shape[1]))]