In [1]:
##Import some of the required libraries

import requests # library to handle requests
import pandas as pd # library for data analsysis
import numpy as np # library to handle data in a vectorized manner
import random # library for random number generation

In [2]:
#Install geopy
!conda install -c conda-forge geopy --yes

Collecting package metadata: done
Solving environment: done

# All requested packages already installed.



In [3]:
from geopy.geocoders import Nominatim # module to convert an address into latitude and longitude values

# libraries for displaying images
from IPython.display import Image 
from IPython.core.display import HTML 

In [4]:
# tranforming json file into a pandas dataframe library
from pandas.io.json import json_normalize

In [5]:
#Import folium for plotting on map
!conda install -c conda-forge folium=0.5.0 --yes
import folium # plotting library

print('Folium installed')
print('Libraries imported.')

Collecting package metadata: done
Solving environment: done

# All requested packages already installed.

Folium installed
Libraries imported.


In [6]:
#Import KMeans for k-means analysis
from sklearn.cluster import KMeans

#### Read in csv of Toronto Boroughs from provided file

In [7]:
TorBo = pd.read_csv('TorontoBo.csv')

##### Check to see file looks correct

In [8]:
TorBo.head()

Unnamed: 0.1,Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,0,M3A,North York,Parkwoods,43.753259,-79.329656
1,1,M4A,North York,Victoria Village,43.725882,-79.315572
2,2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


##### Make a list of postal codes' latitude and longitude to use in drawing on map

In [9]:
Boroughs = TorBo[['Latitude', 'Longitude']]  #subset just the lat and lon columns
Borlist = Boroughs.values.tolist()
len(Borlist) #check that the length of the list is correct

103

#### Create an initial map of Toronto

In [10]:
Tor_map = folium.Map(location=[43.654260,-79.360636], zoom_start=11) # generate map centered around downtown

##### Add a circle marker to the map for each postal code and display map

In [11]:
Tor_map = folium.Map(location=[43.654260,-79.360636], zoom_start=11)

for point in range(0, len(Borlist)):
    folium.CircleMarker(Borlist[point], popup=TorBo['PostalCode'][point]).add_to(Tor_map)
    
Tor_map

##### Because the downtown postal codes are small areas and overlap at a distance closer than outer ones, I will remove them and focus on just the outer postal codes.

In [12]:
#Create a new data named 'NoT' that has the postal codes with 'Toronto' in the name removed
# The ~ in front means 'not'
NoT=TorBo[~TorBo['Borough'].str.contains('Toronto')].reset_index()
len(NoT)

65

In [13]:
NoT.head() #Check that the data looks correct

Unnamed: 0.1,index,Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,0,0,M3A,North York,Parkwoods,43.753259,-79.329656
1,1,1,M4A,North York,Victoria Village,43.725882,-79.315572
2,3,3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
3,4,4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494
4,5,5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242


In [14]:
NoTmp = NoT[['Latitude', 'Longitude']]
NoTmplist = NoTmp.values.tolist()

#### Map just these postal codes

In [15]:
NoTmap = folium.Map(location=[43.654260,-79.360636], zoom_start=12)
for point in range(0, len(NoTmplist)):
    folium.Circle(NoTmplist[point], radius=800).add_to(NoTmap)
NoTmap    

#### Just a few postal codes overlap. I will call that a good balance between including enough area to have plenty of data but not having overlapping areas that will interfere with unique clustering.

### Foursquare queries

In [16]:
# The code was removed by Watson Studio for sharing.

In [17]:
#Create separate lists for the latitude and longitude in the data to use in for loops
lats = NoT['Latitude']
longs = NoT['Longitude']

In [18]:
#Check that the lists came out correctly
lats[0:3], longs[0:3]

(0    43.753259
 1    43.725882
 2    43.718518
 Name: Latitude, dtype: float64, 0   -79.329656
 1   -79.315572
 2   -79.464763
 Name: Longitude, dtype: float64)

#### Create an initial dataframe with first set of latitude/longitude numbers. That will get appended to with a subesquent for loop for the rest of the postal codes.

In [19]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

#Set constant values for intent, radius, limit
intent = 'browse'
radius = 800
LIMIT = 400

#set the latitude and longitude for the first foursquare search to be for the first borough
latitude=lats[0]
longitude=longs[0]

url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&intent={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, intent, radius, LIMIT)
allven = requests.get(url).json()
Allven = allven['response']['venues']
Allvendf = json_normalize(Allven)

All_filt_col = ['name', 'categories'] + [col for col in Allvendf.columns if col.startswith('name')] + ['id']
Allven_filtered = Allvendf.loc[:,All_filt_col ]


# filter the category for each row
Allven_filtered['categories'] = Allven_filtered.apply(get_category_type, axis=1)

# clean column names by keeping only last term
Allven_filtered.columns = [column.split('.')[-1] for column in Allven_filtered.columns]

CatFilt = Allven_filtered.categories
tempdf = CatFilt.value_counts().reset_index(name='latitude')

tempdf = pd.DataFrame(tempdf)

    
tempdf.head()  



Unnamed: 0,index,latitude
0,Residential Building (Apartment / Condo),9
1,Bus Stop,6
2,Park,4
3,Other Great Outdoors,3
4,Elementary School,3


In [20]:
#make a copy of the initial data frame in case I make an error and want to restore original initil data frame.
playdf=tempdf

In [21]:
#change the name of the venues column from 'index' to 'venue'
playdf=playdf.rename(columns={'index':'venue'})

##### Subset the latitudes and longitudes to remove the first entries that were used to make the initial data frame.

In [22]:
lat2=lats[1:len(lats)]
long2=longs[1:len(longs)]

#### Loop through the remaining entries, perform foursquare search, reformat data, and append to the initial df

In [23]:
for lat,lon in zip(lat2,long2):
    latitude=lat
    longitude=lon
    url2 = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&intent={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, intent, radius, LIMIT)

    allven = requests.get(url2).json()
    Allven = allven['response']['venues']
    Allvendf = json_normalize(Allven)

    All_filt_col = ['name', 'categories'] + [col for col in Allvendf.columns if col.startswith('name')] + ['id']
    Allven_filtered = Allvendf.loc[:,All_filt_col ]

    # filter the category for each row
    Allven_filtered['categories'] = Allven_filtered.apply(get_category_type, axis=1)

    # clean column names by keeping only last term
    Allven_filtered.columns = [column.split('.')[-1] for column in Allven_filtered.columns]
    
    CatFilt = Allven_filtered.categories
    tempdf3 = CatFilt.value_counts().reset_index(name=latitude)

    tempdf3 = pd.DataFrame(tempdf3)
    tempdf3=tempdf3.rename(columns={'index':'venue'})
    
    playdf=playdf.merge(tempdf3,how="outer")
    
playdf.head()  #check that the full data frame looks good

Unnamed: 0,venue,latitude,43.7258823,43.718518,43.6623015,43.6678556,43.8066863,43.7459058,43.7063972,43.709577,...,43.8152522,43.6056466,43.7394164,43.79952520000001,43.60241370000001,43.7067483,43.836124700000006,43.65365360000001,43.6362579,43.6288408
0,Residential Building (Apartment / Condo),9.0,14.0,,,5.0,1.0,2.0,2.0,5.0,...,,2.0,1.0,12.0,2.0,,,4.0,8.0,
1,Bus Stop,6.0,4.0,,,1.0,,,4.0,3.0,...,,,,1.0,,1.0,,2.0,,2.0
2,Park,4.0,3.0,,1.0,7.0,2.0,3.0,1.0,2.0,...,2.0,2.0,1.0,3.0,1.0,1.0,6.0,2.0,3.0,
3,Other Great Outdoors,3.0,1.0,1.0,,1.0,,3.0,,1.0,...,,,1.0,,2.0,1.0,7.0,2.0,,1.0
4,Elementary School,3.0,,,,,1.0,,,1.0,...,1.0,,,,,,1.0,,3.0,1.0


In [24]:
finaldf=playdf #once again make a new copy of this data frame in case I want to restore it to this state

In [25]:
finaldf.shape #check how large the output data frame is

(474, 66)

##### The compilation of all the data yielded a lot of different venues! I will reduce the size of the data set by removing venues that are less common overall.

In [26]:
sums = finaldf.sum(axis=1) #Create a sum of entries for each venue

In [27]:
finaldf['Total'] = finaldf.sum(axis=1) #Create new column labeled 'Total' that has the sums

In [28]:
finaldf.shape #Check that shape looks correct

(474, 67)

In [29]:
finaldfsorted = finaldf.sort_values(by=['Total'],ascending=False) #Sort by totals just to see
finaldfsorted.head()

Unnamed: 0,venue,latitude,43.7258823,43.718518,43.6623015,43.6678556,43.8066863,43.7459058,43.7063972,43.709577,...,43.6056466,43.7394164,43.79952520000001,43.60241370000001,43.7067483,43.836124700000006,43.65365360000001,43.6362579,43.6288408,Total
7,Office,3.0,5.0,8.0,3.0,6.0,8.0,25.0,5.0,6.0,...,7.0,1.0,4.0,9.0,19.0,5.0,6.0,3.0,6.0,399.0
0,Residential Building (Apartment / Condo),9.0,14.0,,,5.0,1.0,2.0,2.0,5.0,...,2.0,1.0,12.0,2.0,,,4.0,8.0,,321.0
19,Building,2.0,4.0,3.0,3.0,2.0,3.0,9.0,5.0,3.0,...,2.0,1.0,2.0,4.0,13.0,3.0,,1.0,4.0,244.0
10,Automotive Shop,2.0,8.0,5.0,,,8.0,1.0,2.0,2.0,...,,2.0,,,8.0,,1.0,,5.0,214.0
14,Salon / Barbershop,2.0,,1.0,,4.0,6.0,,1.0,4.0,...,1.0,15.0,3.0,3.0,,1.0,10.0,1.0,1.0,192.0


In [30]:
reducedfinaldf = finaldfsorted[finaldfsorted['Total']>50] #Keep only venues that have more than 50 instances

In [31]:
df2 =reducedfinaldf.fillna(0,) #Replace the NaN values with zeroes 
df2.shape  #Check that the data frame shape looks good and see how many venues are left

(39, 67)

In [32]:
df2.head() 

Unnamed: 0,venue,latitude,43.7258823,43.718518,43.6623015,43.6678556,43.8066863,43.7459058,43.7063972,43.709577,...,43.6056466,43.7394164,43.79952520000001,43.60241370000001,43.7067483,43.836124700000006,43.65365360000001,43.6362579,43.6288408,Total
7,Office,3.0,5.0,8.0,3.0,6.0,8.0,25.0,5.0,6.0,...,7.0,1.0,4.0,9.0,19.0,5.0,6.0,3.0,6.0,399.0
0,Residential Building (Apartment / Condo),9.0,14.0,0.0,0.0,5.0,1.0,2.0,2.0,5.0,...,2.0,1.0,12.0,2.0,0.0,0.0,4.0,8.0,0.0,321.0
19,Building,2.0,4.0,3.0,3.0,2.0,3.0,9.0,5.0,3.0,...,2.0,1.0,2.0,4.0,13.0,3.0,0.0,1.0,4.0,244.0
10,Automotive Shop,2.0,8.0,5.0,0.0,0.0,8.0,1.0,2.0,2.0,...,0.0,2.0,0.0,0.0,8.0,0.0,1.0,0.0,5.0,214.0
14,Salon / Barbershop,2.0,0.0,1.0,0.0,4.0,6.0,0.0,1.0,4.0,...,1.0,15.0,3.0,3.0,0.0,1.0,10.0,1.0,1.0,192.0


In [33]:
df4=df2.T  #transpose the data frame for k-mean analysis

In [34]:
#Change the header row of the new data frame to the venue names
header=df4.iloc[0]
df4=df4[1:]
df4.columns=header

In [35]:
df4.head()

venue,Office,Residential Building (Apartment / Condo),Building,Automotive Shop,Salon / Barbershop,Park,Doctor's Office,Church,Bank,Dentist's Office,...,Fast Food Restaurant,Clothing Store,Gym,Caribbean Restaurant,Nail Salon,Sandwich Place,Laundry Service,Factory,General Entertainment,Government Building
latitude,3,9,2,2,2,4,1,2,1,0,...,2,0,0,2,1,1,1,0,0,0
43.7258823,5,14,4,8,0,3,1,0,3,2,...,0,0,0,0,1,0,0,1,0,2
43.718518,8,0,3,5,1,0,1,1,1,0,...,0,12,0,0,0,0,0,2,0,1
43.6623015,3,0,3,0,0,1,2,2,0,0,...,0,0,0,0,0,2,0,0,1,12
43.6678556,6,5,2,0,4,7,4,3,4,4,...,1,1,3,0,0,0,0,2,1,0


In [36]:
df4.tail()

venue,Office,Residential Building (Apartment / Condo),Building,Automotive Shop,Salon / Barbershop,Park,Doctor's Office,Church,Bank,Dentist's Office,...,Fast Food Restaurant,Clothing Store,Gym,Caribbean Restaurant,Nail Salon,Sandwich Place,Laundry Service,Factory,General Entertainment,Government Building
43.836124700000006,5,0,3,0,1,6,0,3,0,1,...,1,1,1,0,0,0,0,1,1,0
43.65365360000001,6,4,0,1,10,2,5,1,8,8,...,0,0,1,0,2,2,0,0,1,0
43.6362579,3,8,1,0,1,3,0,1,0,0,...,0,0,0,0,2,1,0,1,1,0
43.6288408,6,0,4,5,1,0,0,1,1,1,...,3,0,1,0,0,1,0,2,0,1
Total,399,321,244,214,192,175,174,171,159,148,...,71,69,65,63,62,61,56,55,55,53


In [37]:
df4.shape

(66, 39)

In [53]:
#Remove the bottom row, which has the totals
df5=df4.iloc[0:65,]

In [39]:
df5.tail() #Check that the the total row is gone and the bottom postal code row is still there

venue,Office,Residential Building (Apartment / Condo),Building,Automotive Shop,Salon / Barbershop,Park,Doctor's Office,Church,Bank,Dentist's Office,...,Fast Food Restaurant,Clothing Store,Gym,Caribbean Restaurant,Nail Salon,Sandwich Place,Laundry Service,Factory,General Entertainment,Government Building
43.706748,19,0,13,8,0,1,4,5,1,0,...,0,0,2,3,0,1,0,11,3,1
43.836125,5,0,3,0,1,6,0,3,0,1,...,1,1,1,0,0,0,0,1,1,0
43.653654,6,4,0,1,10,2,5,1,8,8,...,0,0,1,0,2,2,0,0,1,0
43.636258,3,8,1,0,1,3,0,1,0,0,...,0,0,0,0,2,1,0,1,1,0
43.628841,6,0,4,5,1,0,0,1,1,1,...,3,0,1,0,0,1,0,2,0,1


In [54]:
#The data frame needs to have floats in order to do the scaling manipulations
df5=df5.astype(float) 

In [41]:
#Standardize data
from sklearn.preprocessing import StandardScaler

X = df5
X = np.nan_to_num(X)
cluster_dataset = StandardScaler().fit_transform(X)
cluster_dataset

array([[-0.48061371,  0.82385833, -0.58111856, ..., -0.47944053,
        -0.87733694, -0.4797215 ],
       [-0.17434027,  1.83807787,  0.0815605 , ...,  0.08717101,
        -0.87733694,  0.69695388],
       [ 0.2850699 , -1.00173683, -0.24977903, ...,  0.65378254,
        -0.87733694,  0.10861619],
       ...,
       [-0.02120355, -0.19036121, -1.24379763, ..., -0.47944053,
         0.15951581, -0.4797215 ],
       [-0.48061371,  0.62101442, -0.91245809, ...,  0.08717101,
         0.15951581, -0.4797215 ],
       [-0.02120355, -1.00173683,  0.0815605 , ...,  0.65378254,
        -0.87733694,  0.10861619]])

In [42]:
##Modeling
##I orginally tried 5 clusters, but one borough was alone in a cluster.
num_clusters = 4

k_means5 = KMeans(init="k-means++", n_clusters=num_clusters, n_init=20)
k_means5.fit(cluster_dataset)
labels = k_means5.labels_

In [43]:
labels #Check that the clusters seem good

array([3, 3, 2, 2, 3, 1, 3, 1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 0, 1, 1, 3, 1,
       1, 1, 1, 2, 1, 0, 3, 1, 3, 3, 3, 1, 2, 1, 1, 1, 1, 1, 2, 3, 1, 0,
       3, 3, 2, 3, 1, 1, 3, 1, 3, 2, 1, 3, 1, 0, 0, 1, 2, 3, 1, 3, 2],
      dtype=int32)

In [55]:
df5["Labels"] = labels  #Add a column with the cluster assignments to the data frame

In [56]:
#Find means for each column by the assigned cluster number
df8=df5.groupby('Labels').mean()

#### Transpose frame again to make it easier to see results

In [57]:
df9=df8.T
df9

Labels,0,1,2,3
venue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Office,3.4,6.806452,8.333333,4.8
Residential Building (Apartment / Condo),8.4,4.096774,1.111111,7.1
Building,3.8,3.354839,5.777778,3.45
Automotive Shop,1.4,1.870968,11.111111,2.45
Salon / Barbershop,6.6,3.290323,1.777778,2.05
Park,1.6,2.193548,1.0,4.5
Doctor's Office,5.0,2.806452,1.666667,2.35
Church,4.0,2.290323,2.222222,3.0
Bank,3.6,2.935484,1.222222,1.95
Dentist's Office,4.8,3.096774,0.444444,1.2


### Summary
#### I left a lot of venues in the final data set, so the clusters are a complex mix of characteristic venues. 
#### For this implementation, clusters 1 & 2 were high in offices, and clusters 0 & 3 were high in apartments. Cluster 2 was particularly high in automotive shops, cluster 0 is high in grocery stores and doctor and dental offices, cluster 3 was high in parks.
##### If one wanted to spend more time it, you could combine some venues into more general categories such as restaurants, salons, transportation, business offices, medical venues, etc.

-------------------