### COURSERA | IBM DATA SCIENCE | NEIGHBORHOOD SEGMENTATION

The purpose of this program is to classify neighborhoods within Toronto into distinct groups based on the similarity of the types of venues located nearby. <br> This program uses Canadian postal code data from Wikipedia, location data from the Foursquare API, and geographical data from Coursera. <br> The Folium package is used to render all geographical data. <br><br>Venues are limited to 100 within a 500 m radius of each neighborhood, and the 5 most frequent venue types per neighborhood are reported. <br> Neighborhoods are partitioned into K=5 clusters using K-Means clustering.

In [1]:
# @hidden_cell
# Secrets are read from environment variables so they are never stored in the
# notebook itself. Set these before starting the kernel:
#   IBM_IAM_SERVICE_ID, IBM_API_KEY_ID, FOURSQUARE_CLIENT_ID, FOURSQUARE_CLIENT_SECRET
# NOTE(review): the keys that used to be hardcoded in this cell should be
# treated as exposed and rotated.
import os
import warnings

credentials = {
    'IAM_SERVICE_ID': os.environ.get('IBM_IAM_SERVICE_ID', ''),
    'IBM_API_KEY_ID': os.environ.get('IBM_API_KEY_ID', ''),
    'ENDPOINT': 'https://s3-api.us-geo.objectstorage.service.networklayer.com',
    'IBM_AUTH_ENDPOINT': 'https://iam.ng.bluemix.net/oidc/token',
    'BUCKET': 'project-donotdelete-pr-8ka5t6nq466jou',
    'FILE': 'canada_postal_codes.csv'
}

CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
VERSION = '20190812'

# Warn early (instead of failing later with an opaque authentication error)
# when a required secret has not been provided.
_missing_env = [
    env_name
    for env_name in ('IBM_IAM_SERVICE_ID', 'IBM_API_KEY_ID',
                     'FOURSQUARE_CLIENT_ID', 'FOURSQUARE_CLIENT_SECRET')
    if not os.environ.get(env_name)
]
if _missing_env:
    warnings.warn('Missing environment variables: ' + ', '.join(_missing_env))

In [2]:
from ibm_botocore.client import Config
import ibm_boto3

def download_file(credentials):
    """Download ``credentials['FILE']`` from object storage into the working directory.

    Parameters
    ----------
    credentials : dict
        Must contain the keys 'IBM_API_KEY_ID', 'IAM_SERVICE_ID',
        'IBM_AUTH_ENDPOINT', 'ENDPOINT', 'BUCKET' and 'FILE'. The object is
        saved locally under the same name as its key ('FILE').

    Returns
    -------
    bool
        True if the download succeeded, False otherwise. A status line is
        printed in both cases; on failure the underlying error is included so
        the cause is not hidden.
    """
    cos = ibm_boto3.client(
        service_name='s3',
        ibm_api_key_id=credentials['IBM_API_KEY_ID'],
        ibm_service_instance_id=credentials['IAM_SERVICE_ID'],
        ibm_auth_endpoint=credentials['IBM_AUTH_ENDPOINT'],
        config=Config(signature_version='oauth'),
        endpoint_url=credentials['ENDPOINT'],
    )

    try:
        cos.download_file(Bucket=credentials['BUCKET'],
                          Key=credentials['FILE'],
                          Filename=credentials['FILE'])
    except Exception as exc:
        # Kept best-effort (no re-raise) so a previously downloaded copy can
        # still be used, but the reason for the failure is now reported.
        print('FAILED TO DOWNLOAD FILE:', exc)
        return False

    print('SUCCESSFULLY DOWNLOADED FILE')
    return True

#### PART 1: DOWNLOAD & CLEAN POSTAL CODE DATA

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Fetch the postal-code CSV into the working directory, then load it from the
# local copy (the file is saved under the name given by credentials['FILE']).
download_file(credentials)
df = pd.read_csv(credentials['FILE'])
# Preview the raw table.
df.head()

SUCCESSFULLY DOWNLOADED FILE


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [5]:
# Use the column spellings the rest of the notebook expects.
df = df.rename(columns={'Postcode': 'Postal Code', 'Neighbourhood': 'Neighborhood'})

# Drop postal codes that have no borough. The explicit copy makes the column
# assignment below act on an independent frame rather than on a slice.
df = df[df['Borough'] != 'Not assigned'].copy()

# Where a borough has no named neighborhood, fall back to the borough name.
# `where` keeps existing values when the condition holds and otherwise takes
# the index-aligned value from the Borough column.
df['Neighborhood'] = df['Neighborhood'].where(df['Neighborhood'] != 'Not assigned', df['Borough'])

# Collapse to one row per (postal code, borough), joining neighborhood names,
# then order by postal code with a fresh 0..n-1 index.
df = (
    df.groupby(['Postal Code', 'Borough'], sort=False)['Neighborhood']
      .apply(', '.join)
      .reset_index()
      .sort_values('Postal Code')
      .reset_index(drop=True)
)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [6]:
# Number of rows in the cleaned table. After the grouping step each row is one
# (postal code, borough) pair whose neighborhood names were joined together.
print('NEIGHBORHOODS IN CANADA:', df.shape[0])

NEIGHBORHOODS IN CANADA: 103


#### PART 2: CONCATENATE GEOGRAPHICAL DATA

In [7]:
coord_path = 'https://cocl.us/Geospatial_data'
df_coord = pd.read_csv(coord_path)

# Look coordinates up by postal code rather than copying the columns by row
# position: positional copying silently attaches wrong coordinates if the two
# tables differ in order or length.
# NOTE(review): assumes the coordinates file has a 'Postal Code' column with
# one row per code — confirm against the downloaded CSV.
coord_lookup = df_coord.set_index('Postal Code')
df['Latitude'] = df['Postal Code'].map(coord_lookup['Latitude'])
df['Longitude'] = df['Postal Code'].map(coord_lookup['Longitude'])
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


#### PART 3: VISUALIZE NEIGHBORHOODS IN TORONTO

In [8]:
# Keep only rows whose borough name contains "Toronto", renumbering from 0.
df = (
    df.loc[df['Borough'].str.contains('Toronto')]
      .reset_index(drop=True)
)
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [9]:
# Row count of the Toronto-only table.
print('NEIGHBORHOODS IN TORONTO:', df.shape[0])

NEIGHBORHOODS IN TORONTO: 38


In [10]:
!pip install folium
import folium
print('SUCCESSFULLY DOWNLOADED FOLIUM')

SUCCESSFULLY DOWNLOADED FOLIUM


In [11]:
# Fixed centre point for the map views in this notebook.
LAT = 43.6532
LNG = -79.3832
toronto_map = folium.Map(location=[LAT, LNG], zoom_start=12)

# Draw one circle per row of df; the popup reads "<neighborhood>, <borough>".
marker_rows = df[['Borough', 'Neighborhood', 'Latitude', 'Longitude']].itertuples(index=False, name=None)
for borough, neighborhood, lat, lng in marker_rows:
    label = folium.Popup('{}, {}'.format(neighborhood, borough))
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186CC',
        fill_opacity=0.8,
    )
    marker.add_to(toronto_map)
toronto_map

#### PART 4: EXPLORE NEIGHBORHOODS AND NEARBY VENUES IN TORONTO

In [12]:
import requests
import json

In [13]:
RADIUS = 500
LIMIT = 100
def get_venues(name, lat, lng, radius=RADIUS, limit=LIMIT):
    """Query the Foursquare "explore" endpoint for venues around one point.

    Parameters
    ----------
    name : str
        Neighborhood label copied into every returned row.
    lat, lng : float
        Coordinates the search is centred on.
    radius : int, optional
        Value sent as the request's ``radius`` parameter (defaults to RADIUS).
    limit : int, optional
        Value sent as the request's ``limit`` parameter (defaults to LIMIT).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Category'. The frame is empty
        (but still has these columns) when no venues are returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    requests.Timeout
        If the API does not answer within the timeout.
    """
    venue_columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
                     'Venue', 'Venue Latitude', 'Venue Longitude', 'Category']

    # Let requests build the query string so every value is encoded correctly.
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(lat, lng),
        'radius': radius,
        'limit': limit,
    }
    response = requests.get('https://api.foursquare.com/v2/venues/explore',
                            params=params, timeout=30)
    # Fail loudly on quota/authentication errors instead of a later KeyError.
    response.raise_for_status()

    groups = response.json()['response']['groups']
    items = groups[0]['items'] if groups else []

    venues = []
    for item in items:
        venue = item['venue']
        categories = venue['categories']
        # A venue without any category gets None rather than raising IndexError.
        category = categories[0]['name'] if categories else None
        venues.append([name, lat, lng,
                       venue['name'], venue['location']['lat'], venue['location']['lng'],
                       category])

    # Passing the columns to the constructor keeps the schema intact even when
    # `venues` is empty (assigning .columns afterwards fails on an empty frame).
    return pd.DataFrame(venues, columns=venue_columns)

In [14]:
# Collect one frame per neighborhood and concatenate once at the end; growing
# a DataFrame with pd.concat inside the loop re-copies all previous rows on
# every iteration.
venue_frames = []
for name, lat, lng in zip(df['Neighborhood'], df['Latitude'], df['Longitude']):
    venue_frames.append(get_venues(name, lat, lng))

# ignore_index=True gives the same fresh 0..n-1 index the reset produced.
venue_df = pd.concat(venue_frames, ignore_index=True) if venue_frames else pd.DataFrame()
venue_df.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"The Danforth West, Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


In [15]:
# Number of venues returned for each neighborhood.
venue_df.groupby('Neighborhood')['Venue'].count()

Neighborhood
Adelaide, King, Richmond                                                                                      100
Berczy Park                                                                                                    56
Brockton, Exhibition Place, Parkdale Village                                                                   21
Business Reply Mail Processing Centre 969 Eastern                                                              19
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara     15
Cabbagetown, St. James Town                                                                                    45
Central Bay Street                                                                                             84
Chinatown, Grange Park, Kensington Market                                                                     100
Christie                                                                   

In [16]:
# Total number of venue rows collected across all neighborhoods.
print('VENUES IN TORONTO:', venue_df.shape[0])

VENUES IN TORONTO: 1689


In [17]:
# Count of distinct values in the Category column.
print('UNIQUE VENUE TYPES IN TORONTO:', venue_df['Category'].drop_duplicates().shape[0])

UNIQUE VENUE TYPES IN TORONTO: 236


#### PART 5: ANALYZE NEIGHBORHOODS AND VENUE TYPE FREQUENCY

In [18]:
# One indicator column per venue category (empty prefix, so the column names
# are the category names themselves).
onehot_df = pd.get_dummies(venue_df[['Category']], prefix='', prefix_sep='')

# Attach the neighborhood label and move it to the front. The upper-case name
# keeps it distinct from any category column.
onehot_df['NEIGHBORHOOD'] = venue_df['Neighborhood']
new_columns = ['NEIGHBORHOOD'] + [col for col in onehot_df.columns if col != 'NEIGHBORHOOD']
onehot_df = onehot_df.loc[:, new_columns]
onehot_df.head()

Unnamed: 0,NEIGHBORHOOD,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"The Danforth West, Riverdale",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Mean of each indicator column per neighborhood, i.e. the share of that
# neighborhood's venues falling in each category.
grouped_df = onehot_df.groupby('NEIGHBORHOOD', as_index=False).mean()
grouped_df.head()

Unnamed: 0,NEIGHBORHOOD,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.01,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017857,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.066667,0.066667,0.133333,0.2,0.133333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
NUM_TOP_VENUES = 5
def top_venues(row):
    """Return the labels of the NUM_TOP_VENUES largest entries of a row.

    The first element of `row` is skipped (it holds the neighborhood label in
    the frames this is applied to); the remaining entries are ranked from
    highest to lowest value and the index labels of the leading ones are
    returned as an array. Fewer labels come back if the row has fewer entries.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.head(NUM_TOP_VENUES).index.values

In [21]:
# Column layout: the neighborhood name followed by "Top Venue 1..N".
columns = ['Neighborhood'] + ['Top Venue {}'.format(rank) for rank in range(1, NUM_TOP_VENUES + 1)]

# Rank the categories of every neighborhood row, then assemble the table in
# one step instead of filling it cell by cell.
top_rows = [top_venues(row) for _, row in grouped_df.iterrows()]
venues_sorted_df = pd.DataFrame(top_rows, columns=columns[1:], index=grouped_df.index)
venues_sorted_df.insert(0, 'Neighborhood', grouped_df['NEIGHBORHOOD'])
venues_sorted_df.head()

Unnamed: 0,Neighborhood,Top Venue 1,Top Venue 2,Top Venue 3,Top Venue 4,Top Venue 5
0,"Adelaide, King, Richmond",Coffee Shop,Café,Bar,Steakhouse,Thai Restaurant
1,Berczy Park,Coffee Shop,Cocktail Bar,Beer Bar,Seafood Restaurant,Cheese Shop
2,"Brockton, Exhibition Place, Parkdale Village",Breakfast Spot,Coffee Shop,Café,Italian Restaurant,Pet Store
3,Business Reply Mail Processing Centre 969 Eastern,Light Rail Station,Yoga Studio,Spa,Garden Center,Garden
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",Airport Service,Airport Lounge,Airport Terminal,Coffee Shop,Harbor / Marina


#### PART 6: CLUSTER NEIGHBORHOODS BASED ON VENUE TYPE SIMILARITY

In [22]:
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [23]:
K = 5
# Cluster on the category frequencies only; the label column is not a feature.
clustering_df = grouped_df.drop('NEIGHBORHOOD', axis=1)
# n_init is pinned because the library default differs between scikit-learn
# releases, which would otherwise change the clustering from one environment
# to another even with a fixed random_state.
kmeans = KMeans(n_clusters=K, n_init=10, random_state=0).fit(clustering_df)
# Preview the labels assigned to the first five neighborhoods.
kmeans.labels_[:5]

array([0, 0, 0, 0, 0], dtype=int32)

In [24]:
# Put the cluster label in the first column of the top-venue table. Dropping
# any existing 'Cluster' column first makes the cell safe to re-run
# (DataFrame.insert raises if the column is already present).
venues_sorted_df = venues_sorted_df.drop(columns='Cluster', errors='ignore')
venues_sorted_df.insert(0, 'Cluster', kmeans.labels_)

# Attach cluster and top venues to the location table by neighborhood name.
# Neighborhoods absent from venues_sorted_df get NaN in the joined columns.
merged_df = df.join(venues_sorted_df.set_index('Neighborhood'), on='Neighborhood')
merged_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude,Cluster,Top Venue 1,Top Venue 2,Top Venue 3,Top Venue 4,Top Venue 5
0,M4E,East Toronto,The Beaches,43.676357,-79.293031,0,Health Food Store,Neighborhood,Trail,Pub,Filipino Restaurant
1,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,0,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Furniture / Home Store
2,M4L,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,0,Park,Sandwich Place,Fish & Chips Shop,Pub,Fast Food Restaurant
3,M4M,East Toronto,Studio District,43.659526,-79.340923,0,Café,Coffee Shop,Italian Restaurant,American Restaurant,Bakery
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,4,Park,Bus Line,Swim School,Yoga Studio,Donut Shop


In [25]:
# One hex colour per cluster label, evenly spaced along the rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, K))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Centre on the fixed map coordinates (LAT, LNG) rather than on whatever
# `lat`/`lng` values an earlier loop happened to leave behind.
cluster_map = folium.Map(location=[LAT, LNG], zoom_start=12)
for lat, lng, neighborhood, cluster in zip(merged_df['Latitude'], merged_df['Longitude'],
                                           merged_df['Neighborhood'], merged_df['Cluster']):
    if pd.isna(cluster):
        # No cluster label after the join: draw the marker in neutral grey
        # instead of failing on the colour lookup.
        cluster_text = 'N/A'
        marker_color = '#808080'
    else:
        cluster_id = int(cluster)
        cluster_text = cluster_id
        # Index directly by label; `cluster - 1` only worked for label 0
        # through negative-index wraparound.
        marker_color = rainbow[cluster_id]
    label = folium.Popup('{}, CLUSTER: {}'.format(neighborhood, cluster_text))
    folium.CircleMarker([lat, lng], radius=5, popup=label,
                        color=marker_color, fill=True, fill_color=marker_color,
                        fill_opacity=0.8).add_to(cluster_map)
cluster_map