In [217]:
import pandas as pd
import io
import requests
import lxml
import numpy as np
import folium
from sklearn.cluster import KMeans
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import matplotlib.cm as cm
import matplotlib.colors as colors
url="https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
dfs = pd.read_html(url)

In [218]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

# !conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

# !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

In [219]:
print(len(dfs))
df = dfs[0]
df.info()

3
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Postal Code    180 non-null    object
 1   Borough        180 non-null    object
 2   Neighbourhood  180 non-null    object
dtypes: object(3)
memory usage: 4.3+ KB


- The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood  

- Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.     

- More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed   twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11  in the above table.

- If a cell has a borough but a Not assigned  neighborhood, then the neighborhood will be the same as the borough.

- Clean your Notebook and add Markdown cells to explain your work and any assumptions you are making.

- In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.

In [220]:
# Dropping row within Borough that's not assigned
df['Borough'] =df['Borough'][~df.Borough.str.contains("Not assigned")]

In [221]:
# Group by postal code and combined neightbourhood if sharing same postal code
df = df.groupby(['Postal Code', 'Borough'], sort=False).agg(', '.join)
df.reset_index(inplace=True)

In [222]:
# If neightbourhood not assigned, then assigned it with borough
df['Neighbourhood'] = np.where(df['Neighbourhood'] == 'Not assigned', df['Borough'], df['Neighbourhood'])

In [223]:
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


## Import in the geospatial coordinate

In [224]:
df2 = pd.read_csv('Geospatial_Coordinates.csv')

In [225]:
# join both table on postal 
df = df.join(df2.set_index('Postal Code'), on='Postal Code')

### Since we only want specific location which is Toronto downtown, We will only extract those within the coordinate

In [226]:
df = df.loc[df['Borough']== 'Downtown Toronto']

In [227]:
df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306


In [228]:
latitude = 	43.654260
longitude = -79.360636

In [229]:
venues_map = folium.Map(location=[latitude, longitude], zoom_start=13) # generate map centred around the Conrad Hotel

# add a red circle marker to represent the Downtown Toronto
folium.CircleMarker(
    [latitude, longitude],
    radius=10,
    color='red',
    popup='Downtown Toronto',
    fill = True,
    fill_color = 'green',
    fill_opacity = 0.5
).add_to(venues_map)

# add the Borough as blue circle markers
for lat, lng, label in zip(df['Latitude'], df['Longitude'], df['Borough']):
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        color='blue',
        popup=label,
        fill = True,
        fill_color='blue',
        fill_opacity=0.6
    ).add_to(venues_map)

# # display map
venues_map

# Clustering the Neighborhood 

In [230]:
# df.head()

In [231]:
# Toronto_cluser = df.transpose()
# Toronto_cluser.columns = ['Group-{}'.format(i) for i in range(0,len(Toronto_cluser.columns))]
# Toronto_cluser

## Seperation into 3 different cluster

In [232]:
# set number of clusters
kclusters = 3

manhattan_grouped_clustering = df.drop(['Postal Code', 'Borough','Neighbourhood'], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(manhattan_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

# Insert k cluster as column into df

df.insert(0, 'cluster label', kmeans.labels_)

# Mapping the cluster

In [233]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df['Latitude'], df['Longitude'], df['Neighbourhood'],df['cluster label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

# BOBA Drink exploration in Bay Area

In [234]:
# Hidden cell
CLIENT_ID = 'NVHSI1USRDZDFZ2P2FJTTKTDG343TF2K5M4YI1CASRQ2VX2N' # your Foursquare ID
CLIENT_SECRET = '3RNICWVFXC35EOGADJTN3LQBHLU0LHAAL4JQM1OPAWWVREO2' # your Foursquare Secret
ACCESS_TOKEN = 'L024RG4LPXZA4GEHDJ324JPSHGZRFWVL32XL5S4L2S5HNKDF' # your FourSquare Access Token
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: NVHSI1USRDZDFZ2P2FJTTKTDG343TF2K5M4YI1CASRQ2VX2N
CLIENT_SECRET:3RNICWVFXC35EOGADJTN3LQBHLU0LHAAL4JQM1OPAWWVREO2


In [235]:
# Specify the location of our data point
address = 'San Jose, CA'

geolocator = Nominatim(user_agent="foursquare_agent")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of San Jose City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of San Jose City are 37.3361905, -121.890583.


In [236]:
# Generate a foursquare query for RESTAURANTS in the area
search_query = 'boba'
radius = 200000
# url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&oauth_token={}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude,ACCESS_TOKEN, VERSION, radius, LIMIT)
url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&oauth_token={}&v={}&query={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude,ACCESS_TOKEN, VERSION, search_query, radius, LIMIT)


# Get our result return from the foursqaure dbase
results = requests.get(url).json()
v=results['response']['venues']   # extract the result from json file
dframe = json_normalize(v)  # normalize our result with json normalize function

  dframe = json_normalize(v)  # normalize our result with json normalize function


In [237]:
# keep only columns that include venue name, and anything that is associated with location
filtered_columns = ['name', 'categories'] + [col for col in dframe.columns if col.startswith('location.')] + ['id']
dataframe_filtered = dframe.loc[:, filtered_columns]

# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

# filter the category for each row
dataframe_filtered['categories'] = dataframe_filtered.apply(get_category_type, axis=1)

# clean column names by keeping only last term
dataframe_filtered.columns = [column.split('.')[-1] for column in dataframe_filtered.columns]

# dataframe_filtered

In [238]:
dataframe_filtered

Unnamed: 0,name,categories,address,crossStreet,lat,lng,labeledLatLngs,distance,postalCode,cc,city,state,country,formattedAddress,neighborhood,id
0,Boba Bar Teahouse & Eatery,Bubble Tea Shop,310 S 3rd St,at E San Carlos St,37.332368,-121.884731,"[{'label': 'display', 'lat': 37.3323681755391,...",670,95112.0,US,San Jose,CA,United States,"[310 S 3rd St (at E San Carlos St), San Jose, ...",,53cf2ef3498e1e5b6248251d
1,Tiger Milk Boba,Bubble Tea Shop,72 N Almaden Ave,,37.336055,-121.894399,"[{'label': 'display', 'lat': 37.336055, 'lng':...",338,95110.0,US,San Jose,CA,United States,"[72 N Almaden Ave, San Jose, CA 95110]",,5ec8592ba823280008d502fb
2,Cafe Boba,Coffee Shop,110 E San Fernando St,,37.335346,-121.886551,"[{'label': 'display', 'lat': 37.33534622192383...",369,95112.0,US,San Jose,CA,United States,"[110 E San Fernando St, San Jose, CA 95112]",,4f3262fd19836c91c7d2c424
3,BOBATEANI,Bubble Tea Shop,75 E Santa Clara St,,37.337212,-121.889275,"[{'label': 'display', 'lat': 37.33721160888672...",162,95113.0,US,San Jose,CA,United States,"[75 E Santa Clara St, San Jose, CA 95113]",,5963f134364d9774dfe2603c
4,Boba Pub,Coffee Shop,,,37.25348,-121.901566,"[{'label': 'display', 'lat': 37.25347953453237...",9258,,US,San Jose,CA,United States,"[San Jose, CA]",,541dea34498e9ab326788618
5,Boba Guys,Bubble Tea Shop,855 El Camino Real #120,,37.438476,-122.159122,"[{'label': 'display', 'lat': 37.43847553201563...",26339,94301.0,US,Palo Alto,CA,United States,"[855 El Camino Real #120, Palo Alto, CA 94301]",,5c3518fd2b274a002c12c626
6,Boba Fitt,Bubble Tea Shop,1051 E Capitol Expy,,37.30131,-121.82282,"[{'label': 'display', 'lat': 37.30131, 'lng': ...",7145,95121.0,US,San Jose,CA,United States,"[1051 E Capitol Expy, San Jose, CA 95121]",South San Jose,5abc614d2c7eb9726c9ee34e
7,Oh Boba,Bubble Tea Shop,,,37.350555,-121.94401,"[{'label': 'display', 'lat': 37.35055503603879...",4991,,US,Santa Clara,CA,United States,"[Santa Clara, CA]",,51c88905498e52a0369b41ce
8,Boba Tea Express,Café,4100 Monterey Hwy,Marina,37.279468,-121.834097,"[{'label': 'display', 'lat': 37.279468, 'lng':...",8055,95111.0,US,San Jose,CA,United States,"[4100 Monterey Hwy (Marina), San Jose, CA 95111]",,5153a474e4b04e3a8d061b99
9,Boba,Coffee Shop,1710 N Milpitas Blvd,,37.455524,-121.910233,"[{'label': 'display', 'lat': 37.45552444458008...",13397,95035.0,US,Milpitas,CA,United States,"[1710 N Milpitas Blvd, Milpitas, CA 95035]",,4f32367a19836c91c7c14327


In [239]:
df = dataframe_filtered[['name', 'categories','address','lat','lng', 'postalCode','formattedAddress', 'id','distance','city']]

In [240]:
df.head()

Unnamed: 0,name,categories,address,lat,lng,postalCode,formattedAddress,id,distance,city
0,Boba Bar Teahouse & Eatery,Bubble Tea Shop,310 S 3rd St,37.332368,-121.884731,95112.0,"[310 S 3rd St (at E San Carlos St), San Jose, ...",53cf2ef3498e1e5b6248251d,670,San Jose
1,Tiger Milk Boba,Bubble Tea Shop,72 N Almaden Ave,37.336055,-121.894399,95110.0,"[72 N Almaden Ave, San Jose, CA 95110]",5ec8592ba823280008d502fb,338,San Jose
2,Cafe Boba,Coffee Shop,110 E San Fernando St,37.335346,-121.886551,95112.0,"[110 E San Fernando St, San Jose, CA 95112]",4f3262fd19836c91c7d2c424,369,San Jose
3,BOBATEANI,Bubble Tea Shop,75 E Santa Clara St,37.337212,-121.889275,95113.0,"[75 E Santa Clara St, San Jose, CA 95113]",5963f134364d9774dfe2603c,162,San Jose
4,Boba Pub,Coffee Shop,,37.25348,-121.901566,,"[San Jose, CA]",541dea34498e9ab326788618,9258,San Jose


In [241]:
sanjose_map = folium.Map(location=[latitude, longitude], zoom_start=12)


incidents = folium.map.FeatureGroup()

for lat, lng, in zip(df.lat, df.lng):
    incidents.add_child(
        folium.features.CircleMarker(
            [lat, lng],
            radius=5, 
            color='yellow',
            fill=True,
            fill_color='blue',
            fill_opacity=0.6
        )
    )

# add pop-up text to each marker on the map
latitudes = df.lat
longitudes = df.lng
label = df.categories

for lat, lng, label in zip(latitudes, longitudes, label):
    folium.Marker([lat, lng], popup=label).add_to(sanjose_map)    
    
# add incidents to map
sanjose_map.add_child(incidents)

In [242]:
df.columns

Index(['name', 'categories', 'address', 'lat', 'lng', 'postalCode',
       'formattedAddress', 'id', 'distance', 'city'],
      dtype='object')

In [243]:
# set number of clusters
kclusters = 4

group_cluster = df.drop(['name', 'categories','address', 'postalCode','formattedAddress','id','distance','city'], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(group_cluster)

# check cluster labels generated for each row in the dataframe
kmeans.labels_

# Insert k cluster as column into df

df.insert(0, 'cluster label', kmeans.labels_)

In [244]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df['lat'], df['lng'],df['city'],df['cluster label']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)

In [245]:
map_clusters

In [246]:
cluster_2 = df.loc[df['cluster label'] == 0]
cluster_2

Unnamed: 0,cluster label,name,categories,address,lat,lng,postalCode,formattedAddress,id,distance,city
0,0,Boba Bar Teahouse & Eatery,Bubble Tea Shop,310 S 3rd St,37.332368,-121.884731,95112.0,"[310 S 3rd St (at E San Carlos St), San Jose, ...",53cf2ef3498e1e5b6248251d,670,San Jose
1,0,Tiger Milk Boba,Bubble Tea Shop,72 N Almaden Ave,37.336055,-121.894399,95110.0,"[72 N Almaden Ave, San Jose, CA 95110]",5ec8592ba823280008d502fb,338,San Jose
2,0,Cafe Boba,Coffee Shop,110 E San Fernando St,37.335346,-121.886551,95112.0,"[110 E San Fernando St, San Jose, CA 95112]",4f3262fd19836c91c7d2c424,369,San Jose
3,0,BOBATEANI,Bubble Tea Shop,75 E Santa Clara St,37.337212,-121.889275,95113.0,"[75 E Santa Clara St, San Jose, CA 95113]",5963f134364d9774dfe2603c,162,San Jose
4,0,Boba Pub,Coffee Shop,,37.25348,-121.901566,,"[San Jose, CA]",541dea34498e9ab326788618,9258,San Jose
6,0,Boba Fitt,Bubble Tea Shop,1051 E Capitol Expy,37.30131,-121.82282,95121.0,"[1051 E Capitol Expy, San Jose, CA 95121]",5abc614d2c7eb9726c9ee34e,7145,San Jose
7,0,Oh Boba,Bubble Tea Shop,,37.350555,-121.94401,,"[Santa Clara, CA]",51c88905498e52a0369b41ce,4991,Santa Clara
8,0,Boba Tea Express,Café,4100 Monterey Hwy,37.279468,-121.834097,95111.0,"[4100 Monterey Hwy (Marina), San Jose, CA 95111]",5153a474e4b04e3a8d061b99,8055,San Jose
11,0,Pho 21 & Boba 21,Vietnamese Restaurant,,37.31947,-121.82376,95122.0,"[San Jose, CA 95122]",59a10b74135b395e9dc4a1f9,6201,San Jose
12,0,Simply Boba,Bubble Tea Shop,3005 Silver Creek Rd Ste 192,37.309512,-121.813647,95121.0,"[3005 Silver Creek Rd Ste 192, San Jose, CA 95...",54cbe525498e1886f9b7c459,7429,San Jose


# Cluster result
In comparison with other county and city within the area, We can observe San Jose attracted majority of the boba locations

In [247]:
sanjose_map = folium.Map(location=[latitude, longitude], zoom_start=12)


map_feature = folium.map.FeatureGroup()

for lat, lng, in zip(cluster_2.lat, cluster_2.lng):
    map_feature.add_child(
        folium.features.CircleMarker(
            [lat, lng],
            radius=5, 
            color='yellow',
            fill=True,
            fill_color='blue',
            fill_opacity=0.6
        )
    )

# add pop-up text to each marker on the map
latitudes = list(df.lat)
longitudes = list(df.lng)
labels = list(df.name)

for lat, lng, label in zip(cluster_2.lat,cluster_2.lng, df.categories):
    folium.Marker([lat, lng], popup=label).add_to(sanjose_map)    
    
# add incidents to map
sanjose_map.add_child(map_feature)

# Check out some of the boba shop within cluster 0

#### Tiger Milk Tea

In [249]:
venue_id = '53cf2ef3498e1e5b6248251d'
radius = 200
# url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&oauth_token={}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude,ACCESS_TOKEN, VERSION, radius, LIMIT)
url = 'https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&ll={},{}&oauth_token={}&v={}&radius={}&limit={}'.format(venue_id,CLIENT_ID, CLIENT_SECRET, latitude, longitude,ACCESS_TOKEN, VERSION, radius, LIMIT)

# Get our result return from the foursqaure dbase
results = requests.get(url).json()
print('total like: ', result['response']['venue']['likes']['count'])
print('price: ', result['response']['venue'][])

total like:  0


KeyError: 'price'

### Boba Tea house

In [250]:
venue_id = '53cf2ef3498e1e5b6248251d'
radius = 200
# url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&oauth_token={}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude,ACCESS_TOKEN, VERSION, radius, LIMIT)
url = 'https://api.foursquare.com/v2/venues/{}?client_id={}&client_secret={}&ll={},{}&oauth_token={}&v={}&radius={}&limit={}'.format(venue_id,CLIENT_ID, CLIENT_SECRET, latitude, longitude,ACCESS_TOKEN, VERSION, radius, LIMIT)

# Get our result return from the foursqaure dbase
results = requests.get(url).json()
print('total like: ', result['response']['venue']['likes']['count'])
print('price: ', result['response']['venue']['price'])

total like:  0


#### The last one doesnt look too promising because it only have 0 rating

# Conclusion
    By using folium and K-mean cluster analysis, we was able to figure out which region within the Bay area contain the most boba shop. We also check out two of those boba shop within the San Jose city which is a variable of cluster 0. We can see that Tiger Milk Tea have higher rating and also have more information about price.