# Capstone Project

#### Author: Viet Nguyen

#### Purpose: cluster venues (restaurants) in Melbourne's inner suburbs to see how similar the areas are to one another

### Import necessary libraries

In [None]:
# import libraries
import pandas as pd
import numpy as np

# convert an address into latitude and longitude values
!pip install geopy
from geopy.geocoders import Nominatim 

# map rendering library
!pip install folium
import folium

# library to handle requests
import requests

# to generate random colors
import random

# to handle processing strings
import re

#K-Means
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Turn off the warnings
pd.options.mode.chained_assignment = None

### Define lists of Melbourne suburbs to be used

In [None]:
# Suburb groups used throughout the notebook. The trailing note on each group is
# the author's original "(chance) | (comp)" annotation for that group.
city_suburbs = np.array(['South Melbourne', 'Southbank', 'Fitzroy', 'Fitzroy North',
                         'Carlton', 'Collingwood', 'Melbourne'])  # 0.7 (chance) | 1 (comp)

southyarra_suburbs = np.array(['Prahran', 'Windsor', 'Cremorne', 'South Yarra',
                               'East Melbourne'])  # 0.75 (chance) | 0.9 (comp)

stkilda_suburbs = np.array(['St Kilda', 'Balaclava', 'St Kilda East',
                            'St Kilda West'])  # 0.7 (chance) | 0.8 (comp)

richmond_suburbs = np.array(['Abbotsford', 'Richmond', 'Fairfield', 'Burnley',
                             'Clifton Hill'])  # 0.7 (chance) | 0.7 (comp)

hawthorn_suburbs = np.array(['Hawthorn', 'Hawthorn East', 'Camberwell', 'Kew',
                             'Box Hill'])  # 0.7 (chance) | 0.6 (comp)

brighton_suburbs = np.array(['Ripponlea', 'Elsternwick', 'Elwood', 'Brighton',
                             'Gardenvale', 'Ashwood'])  # 0.65 (chance) | 0.5 (comp)

malvern_suburbs = np.array(['Malvern', 'Malvern East', 'Ashburton', 'Glen Iris', 'Kooyong',
                            'Toorak', 'Armadale', 'Caulfield North',
                            'Caulfield'])  # 0.6 (chance) | 0.4 (comp)

balwyn_suburbs = np.array(['Balwyn', 'Balwyn North', 'Deepdene', 'Canterbury',
                           'Surrey Hills', 'Kew East'])  # 0.55 (chance) | 0.3 (comp)

# Every suburb from all groups, de-duplicated; np.unique also returns them sorted.
inner_suburbs = np.unique(np.concatenate((
    city_suburbs, southyarra_suburbs, stkilda_suburbs, richmond_suburbs,
    hawthorn_suburbs, brighton_suburbs, malvern_suburbs, balwyn_suburbs,
)))

print("Completed", ' - Size', len(inner_suburbs))

### Define necessary variables to be used by Foursquare API

In [None]:
# Define Foursquare credentials, API version and the settings used by the suburb lookup.
#
# SECURITY: credentials should not be committed with the notebook. They are read
# from the FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET environment variables
# when those are set; the literals below remain only as a fallback so existing
# runs keep working. Rotate these keys and remove the fallbacks.
import os

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'QEC0BVYHEJCHQD5ZR4DJA1GCHNRQ5EIH3XOKPGG3Y3FLTV34') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'KWKBGZQ2GO14GX21K2IVEJEVIXHDA323JQEE1PEKXICD42BC') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
radius = 200   # search radius sent with the suburb lookup request
LIMIT = 10     # maximum number of venues requested per suburb lookup
state = "VIC"  # appended to each suburb name in the lookup's `near` parameter
country = "Australia"

### Function to format collected address information into a standard form

Addresses returned by the Foursquare API come in many different formats.
They need to be converted to a single standard format before they can be analysed.

In [None]:
road_types = [' road', ' street', ' rd', ' st', ' avenue', ' ave', ' parade', ' pde', ' lane', ' ln']
delimiters = ['/', ',']

def address_extract(raw_address, road_types):
    """Reduce a raw venue address to a title-cased "<Name> <Road type>" string.

    Steps, in order:
      1. lower-case the text;
      2. for each entry of the module-level ``delimiters``, drop everything
         before its first occurrence (the delimiter itself is kept here and
         removed by step 3);
      3. remove every character that is not an ASCII letter or a space;
      4. take the first entry of ``road_types`` found in the text and replace
         all of its occurrences with the canonical road word for that position;
      5. remove remaining spaces and title-case the result.

    Matching in step 4 is by plain substring, so an abbreviation such as
    ' st' is also replaced when it starts a longer word.

    Parameters
    ----------
    raw_address : str
        Address text as received.
    road_types : list of str
        Road-type spellings. Positions 0-9 are compared against the matched
        spelling, so the list is expected to hold at least ten entries.

    Returns
    -------
    str
        The normalised address. The literal result ' Streetation Street' is
        returned as 'Flinders Street'.
    """
    # Canonical replacement for road_types[0] .. road_types[9].
    canonical_by_position = ('_road', '_street', '_road', '_street', '_avenue',
                             '_avenue', '_parade', '_parade', '_lane', '_lane')

    text = raw_address.lower()

    for marker in delimiters:
        cut = text.find(marker)
        if cut >= 0:
            text = text[cut:]

    # Character-by-character filter using the same pattern as the per-character check.
    text = ''.join(ch for ch in text if re.match('^[a-zA-Z ]*$', ch))

    for spelling in road_types:
        if spelling not in text:
            continue
        for position, canonical in enumerate(canonical_by_position):
            if spelling == road_types[position]:
                text = text.replace(spelling, canonical)
        # Only the first road type found is normalised.
        break

    # Underscores mark the single space to keep between name and road type.
    text = text.replace(' ', '').replace('_', ' ').title()

    return 'Flinders Street' if text == ' Streetation Street' else text

### Searching for suburb's postcode and geographical coordinate data (with high accuracy) with Foursquare API

In [None]:
# Define lists that hold each suburb's postcode and geographical coordinates
postcodes = []
suburbs = []
latitudes = []
longitudes = []

search_endpoint = 'https://api.foursquare.com/v2/venues/search'

for suburb in inner_suburbs:
    # Let requests build and URL-encode the query string (suburb names contain spaces).
    search_params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'near': '{0},{1},{2}'.format(suburb, state, country),
        'v': VERSION,
        'radius': radius,
        'limit': LIMIT,
    }

    response = requests.get(search_endpoint, params=search_params, timeout=30)
    # Surface HTTP failures here instead of as a KeyError on the parsed payload.
    response.raise_for_status()
    venues = response.json()['response']['venues']

    # Use the first returned venue that reports a postcode; zeros mean "not found".
    postcode = lat = lng = 0
    for venue in venues:
        location = venue['location']
        if 'postalCode' in location:
            postcode = location['postalCode']
            lat = location['lat']
            lng = location['lng']
            break
    else:
        print('Warning: no venue with a postcode found for', suburb)

    # Filling data to defined lists
    postcodes.append(postcode)
    suburbs.append(suburb)
    latitudes.append(lat)
    longitudes.append(lng)

# Make the Python lists become Numpy arrays
postcodes = np.array(postcodes)
suburbs = np.array(suburbs)
latitudes = np.array(latitudes)
longitudes = np.array(longitudes)

print("Lists initialisation completed")

### Generate suburbs_geo dataframe with suburb's data lists defined and initialised from the above step

In [None]:
# Build the suburb lookup table from the four parallel arrays collected above.
# NOTE(review): stacking the arrays into one NumPy array forces a single common
# dtype, so the coordinate columns are presumably stored as text — TODO confirm.
columns = ['Postcode', 'Suburb', 'Latitude', 'Longitude']
data = np.array([postcodes, suburbs, latitudes, longitudes])

# One row per suburb, ordered by postcode, with a fresh 0..n-1 index.
suburbs_geo = (
    pd.DataFrame(data.T, columns=columns)
    .sort_values(by='Postcode')
    .reset_index(drop=True)
)
print("Completed")

### Save the suburbs_geo dataframe to a csv file

In [None]:
# Write suburbs_geo to suburb_geo.csv (UTF-8, without the index column).
suburbs_geo.to_csv("suburb_geo.csv", encoding='utf-8', index = False)

### Function to format collected venue's city into a standard format

In [None]:
# Compacted (lower-case, no spaces) city names that map straight to a fixed result.
_CANONICAL_CITY_NAMES = {
    'southmelbourne': 'South Melbourne',
    'eastmelbourne': 'East Melbourne',
    'southyarra': 'South Yarra',
    'northcote': 'Northcote',
    'northrichmond': 'Richmond',
    'southbank': 'Southbank',
    'saintkilda': 'St Kilda',
    'elsternwick': 'Elsternwick',
}


def city_name_extract(city):
    """Normalise a venue's city string to a title-cased suburb name.

    The text is lower-cased and stripped of spaces, anything from the first
    comma onwards is dropped, and dots are removed. If the result is one of the
    known compact names it is mapped directly to its canonical form. Otherwise
    a space is inserted after every 'st' and before the first direction/suffix
    word found ('east', 'west', 'south', 'north', 'hill', 'hills', 'iris'),
    and the result is title-cased.

    Changes from the previous behaviour:
      * comma/dot clean-up runs before the canonical lookup, so inputs such as
        'South Yarra, VIC' now resolve to 'South Yarra';
      * surrounding and repeated whitespace is removed from the result, so
        'Hawthorn East' no longer comes back as 'Hawthorn East ' (which never
        matched the suburb group lists).

    Known limitation (unchanged): 'st' is expanded wherever it occurs, so a
    name containing 'st' mid-word is still split (e.g. 'Preston' -> 'Pre Ston').

    Parameters
    ----------
    city : str
        City name as received.

    Returns
    -------
    str
        The normalised suburb name.
    """
    city = city.lower().replace(' ', '')

    # Keep only the part before the first comma (e.g. drop a trailing state).
    comma_at = city.find(',')
    if comma_at != -1:
        city = city[:comma_at]

    city = city.replace('.', '')

    if city in _CANONICAL_CITY_NAMES:
        return _CANONICAL_CITY_NAMES[city]

    # Separate the "St" abbreviation from the name that follows it.
    if 'st' in city:
        city = city.replace('st', 'st ')

    words = ['east', 'west', 'south', 'north', 'hill', 'hills', 'iris']

    # Only the first matching word gets a space inserted in front of it.
    for value in words:
        if value in city:
            city = city.replace(value, ' ' + value)
            break

    # Collapse the stray leading/trailing/double spaces the insertions can create.
    return ' '.join(city.split()).title()

### Function to explore venues in a suburb and put them all into a single dataframe

In [None]:
def _suburb_rates(suburb):
    """Return the (chance, comp) pair assigned to ``suburb``.

    The suburb is looked up in each notebook-level suburb group in turn; a
    suburb that belongs to no group gets the defaults (0.6, 0.75). If a suburb
    appeared in more than one group, the last matching group would win.
    """
    chance, comp = 0.6, 0.75

    group_rates = (
        (city_suburbs, 0.7, 0.9),          #1
        (southyarra_suburbs, 0.75, 0.85),  #2
        (stkilda_suburbs, 0.7, 0.8),       #3
        (richmond_suburbs, 0.7, 0.75),     #4
        (hawthorn_suburbs, 0.7, 0.7),      #5
        (brighton_suburbs, 0.6, 0.6),      #6
        (malvern_suburbs, 0.6, 0.55),      #7
        (balwyn_suburbs, 0.5, 0.5),        #8
    )

    for group, group_chance, group_comp in group_rates:
        if suburb in group:
            chance, comp = group_chance, group_comp

    return chance, comp


def exploreVenues(data, query, radius, limit):
    """Query the Foursquare explore endpoint around each suburb in ``data``.

    For every row of ``data`` one request is made at that row's
    Latitude/Longitude. A returned venue is kept only when its location has
    both an 'address' and a 'city' and the address is not empty or digits-only.
    The city is normalised with city_name_extract, the address with
    address_extract, and the suburb's chance/comp rates are attached.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide 'Postcode', 'Latitude' and 'Longitude' columns.
    query : str
        Search term sent as the request's ``query`` parameter.
    radius, limit
        Sent unchanged as the request's ``radius`` and ``limit`` parameters.

    Returns
    -------
    pandas.DataFrame
        One row per kept venue with columns Suburb, Postcode, Suburb_Latitude,
        Suburb_Longitude, Venue, Category, Address, Venue_Latitude,
        Venue_Longitude, Label (always 0 here), Chance and Comp. The frame is
        empty (same columns) when nothing was kept.

    Raises
    ------
    requests.HTTPError
        If a request returns an HTTP error status.
    """
    df_columns = ['Suburb', 'Postcode', 'Suburb_Latitude',
                  'Suburb_Longitude', 'Venue', 'Category',
                  'Address', 'Venue_Latitude', 'Venue_Longitude', 'Label', 'Chance', 'Comp']

    explore_endpoint = 'https://api.foursquare.com/v2/venues/explore'
    rows = []

    for postcode, lat, lng in zip(data['Postcode'], data['Latitude'], data['Longitude']):

        # Let requests build and URL-encode the query string.
        explore_params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'll': '{0},{1}'.format(lat, lng),
            'v': VERSION,
            'query': query,
            'radius': radius,
            'limit': limit,
        }

        response = requests.get(explore_endpoint, params=explore_params, timeout=30)
        response.raise_for_status()

        # A successful response without any group simply contributes no venues.
        groups = response.json().get('response', {}).get('groups', [])
        items = groups[0]['items'] if groups else []

        for item in items:
            venue = item['venue']
            location = venue['location']

            if 'address' not in location or 'city' not in location:
                continue

            address = location['address']
            # Skip addresses that are empty or consist of digits only.
            if re.match('^[0-9]*$', address):
                continue

            # format suburb's name
            suburb = city_name_extract(location['city'])
            # format address information
            address = address_extract(address, road_types)

            label = 0  # placeholder cluster id; real labels are assigned by clustering
            chance, comp = _suburb_rates(suburb)

            # Venues without any category get None instead of raising IndexError.
            categories = venue.get('categories') or []
            category = categories[0]['name'] if categories else None

            rows.append([suburb, postcode, lat, lng, venue['name'],
                         category, address,
                         location['lat'], location['lng'],
                         label, chance, comp])

    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(rows, columns=df_columns)

### Function to explore venues in all suburbs

In [None]:
def search_venues(keyword, radius = 1300, limit = 100):
    """Run exploreVenues for every suburb in the notebook-level ``suburbs_geo``.

    Parameters
    ----------
    keyword : str
        Search term forwarded to exploreVenues as ``query``.
    radius, limit
        Forwarded unchanged to exploreVenues.

    Returns
    -------
    pandas.DataFrame
        The per-suburb results stacked into one frame with a fresh 0..n-1
        index; an empty frame when suburbs_geo has no rows.
    """
    suburb_frames = [
        # iloc[[i]] keeps a one-row DataFrame, which is what exploreVenues iterates over.
        exploreVenues(data=suburbs_geo.iloc[[i]], query=keyword, radius=radius, limit=limit)
        for i in range(suburbs_geo['Postcode'].count())
    ]

    # pd.concat rejects an empty list, so keep the "no suburbs" case explicit.
    if not suburb_frames:
        return pd.DataFrame()

    # Concatenate once; DataFrame.append was removed in pandas 2.0.
    return pd.concat(suburb_frames).reset_index(drop=True)

### Function to generate a clustered venues map

In [None]:
def generate_map_cluster(venues_data, suburbs_geo, query):
    """Draw every venue on a folium map, coloured by its cluster label.

    The number of clusters is taken as ``max(Label) + 1``. With a single
    cluster all markers are blue; otherwise each label gets a colour drawn at
    random from a fixed palette, and the mean Chance and Comp of each cluster
    are printed.

    Parameters
    ----------
    venues_data : pandas.DataFrame
        Needs 'Venue_Latitude', 'Venue_Longitude', 'Label', 'Chance' and 'Comp'.
    suburbs_geo : pandas.DataFrame
        Needs 'Suburb', 'Latitude' and 'Longitude'.
    query : str
        Name of the suburb the map is centred on.

    Returns
    -------
    folium.Map

    Raises
    ------
    ValueError
        If ``query`` is not present in ``suburbs_geo['Suburb']``.
    """
    number_of_clusters = int(venues_data['Label'].max()) + 1

    distinguishable_colors = ['red', 'blue', 'black', 'orange', 'green', 'cyan', 'purple', 'yellow', 'lime', 'fuchsia', 'indigo', 'brown']
    # Random colour order; the palette is reused cyclically when there are more
    # clusters than colours instead of raising from random.sample.
    shuffled_colors = random.sample(distinguishable_colors, len(distinguishable_colors))
    colors = [shuffled_colors[i % len(shuffled_colors)] for i in range(number_of_clusters)]

    start_data = suburbs_geo[suburbs_geo['Suburb'] == query]
    if start_data.empty:
        raise ValueError("Suburb '{0}' was not found in suburbs_geo".format(query))
    # Take the first match explicitly; calling float() on a Series is deprecated.
    start_lat = float(start_data['Latitude'].iloc[0])
    start_lng = float(start_data['Longitude'].iloc[0])

    # create map of the suburbs using latitude and longitude values
    result_map = folium.Map(location=[start_lat, start_lng], zoom_start=13, width = '80%', height = '80%')

    # add markers (venues) to map
    for lat, lng, label in zip(venues_data['Venue_Latitude'], venues_data['Venue_Longitude'], venues_data['Label']):
        color = colors[label] if number_of_clusters > 1 else 'blue'
        folium.CircleMarker([float(lat), float(lng)], radius=3, color=color, fill=True, fill_color=color, fill_opacity=1).add_to(result_map)

    if number_of_clusters > 1:
        # Select only the columns needed for the summary, so frames that lack
        # the other venue/suburb columns are accepted as well.
        stats_df = venues_data[['Label', 'Chance', 'Comp']].copy()
        stats_df[['Chance', 'Comp']] = stats_df[['Chance', 'Comp']].astype('float')
        stats_df = stats_df.groupby('Label').mean()

        for i in range(number_of_clusters):
            # A label id with no venues has no statistics to report.
            if i not in stats_df.index:
                continue

            chance = stats_df.loc[i, 'Chance']
            comp = stats_df.loc[i, 'Comp']

            print('Cluster', i, ' color', ': ', colors[i], '\t', ' | ',
                  'Chance getting customers rate: ', chance * 100, '%', ' | ',
                  'Competition rate: ', comp * 100, '%')

    return result_map

### Function to clean the collected venues dataframe: fix suburb names, then keep only sufficiently frequent suburbs and addresses

In [None]:
def generate_venues_dataframe(data):
    """Clean the raw venue search results and keep only frequent suburbs/addresses.

    Steps:
      1. drop duplicate venues by name (first occurrence kept) and the
         'Postcode' column;
      2. venues whose suburb is 'Melbourne' are treated as having an incorrect
         suburb: their suburb is replaced by the suburb in the notebook-level
         ``suburbs_geo`` whose Latitude/Longitude equal the venue's
         Suburb_Latitude/Suburb_Longitude;
      3. keep only suburbs with at least 4 venues (counted on 'Address');
      4. of those, keep only addresses that occur at least 2 times.

    Parameters
    ----------
    data : pandas.DataFrame
        Venue frame with the columns produced by exploreVenues.

    Returns
    -------
    pandas.DataFrame
        The filtered venues. Because of the final merge, 'Address' is the
        first column; the suburb-centre columns and 'Postcode' are not included.
    """
    venues_list = data.drop_duplicates(subset = ['Venue'], keep = 'first').reset_index(drop = True)
    venues_list = venues_list.drop(labels = 'Postcode', axis = 1)

    # split off the venues with an incorrect suburb; their suburb column is discarded
    is_melbourne = venues_list['Suburb'] == 'Melbourne'
    incorrect_suburb_venues = venues_list[is_melbourne].drop(labels = ['Suburb'], axis = 1)
    venues_list = venues_list[~is_melbourne]

    # use suburbs_geo to find the suburb each of those venues was searched from
    fixed_suburb_venues = incorrect_suburb_venues.merge(
        suburbs_geo, how = 'left',
        left_on = ['Suburb_Latitude', 'Suburb_Longitude'],
        right_on = ['Latitude', 'Longitude']).reset_index(drop = True)

    # drop unnecessary columns after merging
    fixed_suburb_venues = fixed_suburb_venues.drop(
        labels = ['Postcode', 'Suburb_Latitude', 'Suburb_Longitude', 'Latitude', 'Longitude'], axis = 1)

    # and reorder the columns to match venues_list
    fixed_suburb_venues = fixed_suburb_venues[['Suburb', 'Venue', 'Category', 'Address', 'Venue_Latitude', 'Venue_Longitude', 'Label', 'Chance', 'Comp']]

    # put the corrected venues back; DataFrame.append was removed in pandas 2.0
    venues_list = venues_list.drop(labels = ['Suburb_Latitude', 'Suburb_Longitude'], axis = 1)
    venues_list = pd.concat([venues_list, fixed_suburb_venues]).reset_index(drop = True)

    # count venues per suburb and keep the suburbs that appear at least 4 times
    suburb_count_df = venues_list.groupby(['Suburb']).count().reset_index()
    suburb_count_df = suburb_count_df[suburb_count_df['Address'] >= 4]
    # only the suburb names are needed; the merge restores the venue rows
    satisfied_suburbs_df = suburb_count_df[['Suburb']]
    satisfied_suburbs_df = satisfied_suburbs_df.merge(venues_list, how = 'left', left_on = ['Suburb'], right_on = ['Suburb'])

    venues_list = satisfied_suburbs_df

    # count venues per address and keep the addresses that appear at least 2 times
    address_count_df = satisfied_suburbs_df.groupby('Address').count().reset_index()
    address_count_df = address_count_df[address_count_df['Suburb'] >= 2]
    # only the addresses are needed; the merge restores the venue rows
    satisfied_addresses_df = address_count_df[['Address']]
    satisfied_addresses_df = satisfied_addresses_df.merge(venues_list, how = 'left', right_on = ['Address'], left_on = ['Address'])

    venues_list = satisfied_addresses_df

    return venues_list

### Function to process clustering with K-means algorithm

In [None]:
def kmeans_algorithm(venues_data, n_clusters, random_state = None):
    """Cluster venues with K-means and return the venue frame with new labels.

    Features are the one-hot encoded 'Suburb' and 'Address' columns plus 'Comp'.

    NOTE(review): the rows are read from the notebook-level ``data`` frame, not
    from the ``venues_data`` argument, which is currently unused. This looks
    unintended, but switching to ``venues_data`` changes both the rows that are
    clustered and the columns of the returned frame, which the map cell run
    afterwards depends on — confirm and change both together.

    NOTE(review): ``[:, 1:]`` removes the first feature column, which here is
    'Chance', so 'Chance' does not influence the clustering — confirm intended.

    Parameters
    ----------
    venues_data : pandas.DataFrame
        Currently unused (see note above).
    n_clusters : int
        Number of clusters to form.
    random_state : int or None, optional
        Forwarded to KMeans; pass an int to make the labels reproducible.
        The default (None) keeps the previous, unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        ``data`` with its 'Label' column replaced by the K-means labels
        (placed as the last column).
    """
    # Explicit copy: this frame is only a working view of the feature columns.
    kmeans_df = data[['Suburb', 'Address', 'Chance', 'Comp']].copy()

    train_df = pd.concat(
        [kmeans_df[['Chance', 'Comp']],
         pd.get_dummies(kmeans_df['Suburb']),
         pd.get_dummies(kmeans_df['Address'])],
        axis = 1,
    ).astype('float')

    X = np.nan_to_num(train_df.values[:, 1:])

    k_means = KMeans(init = "k-means++", n_clusters = n_clusters, n_init = 12, random_state = random_state)
    k_means.fit(X)

    # Labels are in row order of `data`, so assign them positionally.
    result_df = data.drop(labels = 'Label', axis = 1)
    result_df['Label'] = k_means.labels_

    return result_df

### With all the necessary pieces defined, it's time to run the analysis

In [None]:
# Search for restaurants around every suburb in suburbs_geo.
# NOTE: `data` is a notebook-level name that kmeans_algorithm reads directly,
# so this cell must run before the clustering cell.
data = search_venues(keyword = 'Restaurant')
# save search results
data.to_csv('search_results.csv', encoding='utf-8', index = False)
# generate the cleaned dataframe that holds the found venues' information
venues_list = generate_venues_dataframe(data)
print("Completed!")

In [None]:
# before clustering: plot the cleaned venues (every Label is still the initial 0)
generate_map_cluster(venues_list, suburbs_geo, 'Hawthorn') # Hawthorn is the centre of the map

In [None]:
# fit the K-means model with 5 clusters
fit_venues_list = kmeans_algorithm(venues_list, n_clusters = 5)
print('Completed!') # 5 clusters

In [None]:
# after clustering: visualise the clusters on the map and print per-cluster statistics
generate_map_cluster(fit_venues_list, suburbs_geo, 'Hawthorn') # Hawthorn is the centre of the map