# The Battle of Neighborhoods - Relocation Helper

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes 
!conda install -c conda-forge folium=0.5.0 --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

print('Libraries imported.')

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda/envs/python

  added / updated specs:
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.22.0               |     pyh9f0ad1d_0          63 KB  conda-forge
    ------------------------------------------------------------
                                           Total:          97 KB

The following NEW packages will be INSTALLED:

  geographiclib      conda-forge/noarch::geographiclib-1.50-py_0
  geopy              conda-forge/noarch::geopy-1.22.0-pyh9f0ad1d_0



Downloading and Extracting Packages
geopy-1.22.0         | 63 KB     | ##################################### | 100% 
geographiclib-1.50   | 34 KB     | ###############################

In [2]:
pip install BeautifulSoup4

Collecting BeautifulSoup4
[?25l  Downloading https://files.pythonhosted.org/packages/e8/b5/7bb03a696f2c9b7af792a8f51b82974e51c268f15e925fc834876a4efa0b/beautifulsoup4-4.9.0-py3-none-any.whl (109kB)
[K     |████████████████████████████████| 112kB 3.9MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2 (from BeautifulSoup4)
  Downloading https://files.pythonhosted.org/packages/6f/8f/457f4a5390eeae1cc3aeab89deb7724c965be841ffca6cfca9197482e470/soupsieve-2.0.1-py3-none-any.whl
Installing collected packages: soupsieve, BeautifulSoup4
Successfully installed BeautifulSoup4-4.9.0 soupsieve-2.0.1
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install lxml

Collecting lxml
[?25l  Downloading https://files.pythonhosted.org/packages/dd/ba/a0e6866057fc0bbd17192925c1d63a3b85cf522965de9bc02364d08e5b84/lxml-4.5.0-cp36-cp36m-manylinux1_x86_64.whl (5.8MB)
[K     |████████████████████████████████| 5.8MB 6.3MB/s eta 0:00:01     |███████████████████▌            | 3.5MB 6.3MB/s eta 0:00:01
[?25hInstalling collected packages: lxml
Successfully installed lxml-4.5.0
Note: you may need to restart the kernel to use updated packages.


### Neighborhoods in Toronto

In [4]:
from bs4 import BeautifulSoup
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
source = requests.get(url).text
Canada_data = BeautifulSoup(source, 'lxml')

# creat a new Dataframe
column_names = ['PostalCode','Borough','Neighborhood']
toronto = pd.DataFrame(columns = column_names)

# loop through to find postcode, borough, neighborhood 
content = Canada_data.find('div', class_='mw-parser-output')
table = content.table.tbody
postcode = 0
borough = 0
neighborhood = 0

for tr in table.find_all('tr'):
    i = 0
    for td in tr.find_all('td'):
        if i == 0:
            postcode = td.text.strip('\n')
            i = i + 1
        elif i == 1:
            borough = td.text.strip('\n')
            i = i + 1
        elif i == 2: 
            neighborhood = td.text.strip('\n').replace(']','')
            
    toronto = toronto.append({'PostalCode': postcode,'Borough': borough,'Neighborhood': neighborhood},ignore_index=True)

# clean dataframe 
toronto = toronto[toronto.Borough!='Not assigned']
toronto = toronto[toronto.Borough!= 0]
toronto.reset_index(drop = True, inplace = True)
i = 0
for i in range(0,toronto.shape[0]):
    if toronto.iloc[i][2] == 'Not assigned':
        toronto.iloc[i][2] = toronto.iloc[i][1]
        i = i+1
                                 
df = toronto.groupby(['PostalCode','Borough'])['Neighborhood'].apply(', '.join).reset_index()

!wget -q -O 'Toronto_long_lat_data.csv'  http://cocl.us/Geospatial_data
df_lon_lat = pd.read_csv('Toronto_long_lat_data.csv')
df_lon_lat = df_lon_lat.rename(columns={"Postal Code":"PostalCode"})

Toronto_df = pd.merge(df,
                 df_lon_lat[['PostalCode','Latitude', 'Longitude']],
                 on='PostalCode')
Toronto_df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


### Map of Neighborhoods in Toronto

In [5]:
address = 'Toronto, CA'

geolocator = Nominatim(user_agent="TO_CA_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

map_Toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(Toronto_df['Latitude'], Toronto_df['Longitude'], Toronto_df['Borough'], Toronto_df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Toronto)  
    
map_Toronto

# Add Information from Foursquare

In [6]:
CLIENT_ID = 'ZXPGFJJXJYHSZUZOGHJ5JMCY0Z02JK1VDZIPGU4JRS3LIUCT'
CLIENT_SECRET = 'Z0SC2CCUEWGM0YDFPIGIDXGLWJ44DC1C0AGLSD4LKVYL03R0'
VERSION = '20180605'

def getNearbyVenues(names, latitudes, longitudes, radius):
    
    venues_list=[]
    remove = []
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        #Log postal codees without nearby addresses 
        if not results:
            remove.append(name)

        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postal_Code', 
                  'Postal_Latitude', 
                  'Postal_Longitude', 
                  'Venue', 
                  'Venue_Latitude', 
                  'Venue_Longitude', 
                  'Venue_Category']
    
    return(nearby_venues, remove)

#Radius of 750m and a limit of 100 venues
radius = 750
LIMIT = 100

#Get the venues near Toronto postal codes
toronto_venues,remove = getNearbyVenues(names=Toronto_df['PostalCode'],
                                   latitudes=Toronto_df['Latitude'],
                                   longitudes=Toronto_df['Longitude'],
                                   radius = radius
                                  )

#Remove those postal codes with no nearby venues
for item in remove:
    indexNames = Toronto_df[ Toronto_df['PostalCode'] == item ].index
    # Delete these row indexes from dataFrame
    Toronto_df.drop(indexNames , inplace=True)
    
# Display the shape of the dataframe and the first 5 rows
print(toronto_venues.shape)
toronto_venues.head()


(3706, 7)


Unnamed: 0,Postal_Code,Postal_Latitude,Postal_Longitude,Venue,Venue_Latitude,Venue_Longitude,Venue_Category
0,M1B,43.806686,-79.194353,Wendy’s,43.807448,-79.199056,Fast Food Restaurant
1,M1B,43.806686,-79.194353,Wendy's,43.802008,-79.19808,Fast Food Restaurant
2,M1B,43.806686,-79.194353,Staples Morningside,43.800285,-79.196607,Paper / Office Supplies Store
3,M1B,43.806686,-79.194353,Tim Hortons,43.802,-79.198169,Coffee Shop
4,M1B,43.806686,-79.194353,Images Salon & Spa,43.802283,-79.198565,Spa


# Rank Venues from Foursquare

In [7]:
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue_Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot["Postal_Code"] = toronto_venues["Postal_Code"] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

# group rows by Postal Code and by taking the mean of the frequency of occurrence of each category
toronto_grouped = toronto_onehot.groupby("Postal_Code").mean().reset_index()
print(toronto_grouped.shape)

#Identify top venues
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]

num_top_venues = 5 #Interested in the top 5 venues
indicators = ['st', 'nd', 'rd'] #Numbering indicators (1st, 2nd, 3rd)

# create columns according to number of top venues
columns = ["Postal_Code"]
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
postal_venues_sorted = pd.DataFrame(columns=columns)
postal_venues_sorted["Postal_Code"] = toronto_grouped["Postal_Code"]

for ind in np.arange(toronto_grouped.shape[0]):
    postal_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)
    
kclusters = 5 # set number of clusters
toronto_grouped_clustering = toronto_grouped.drop("Postal_Code", 1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering) # run k-means clustering

toronto_merged = Toronto_df
toronto_merged['Cluster Labels'] = kmeans.labels_  # add clustering labels

# merge toronto_grouped with toronto_data to add latitude/longitude for each postal code
toronto_merged = toronto_merged.join(postal_venues_sorted.set_index('Postal_Code'), on='PostalCode')
toronto_merged.head(5)

(102, 323)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353,3,Fast Food Restaurant,Coffee Shop,Bus Station,African Restaurant,Construction & Landscaping
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,1,Breakfast Spot,Bar,Italian Restaurant,Burger Joint,Yoga Studio
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,1,Fast Food Restaurant,Pizza Place,Restaurant,Sports Bar,Fried Chicken Joint
3,M1G,Scarborough,Woburn,43.770992,-79.216917,2,Coffee Shop,Park,Convenience Store,Business Service,Yoga Studio
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476,3,Coffee Shop,Indian Restaurant,Yoga Studio,Chinese Restaurant,Burger Joint


# Map Toronto clusters

In [8]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11) # create map

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i+x+(i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

### Neighborhoods in New York

In [9]:
!wget -q -O 'newyork_data.json' https://cocl.us/new_york_dataset #get data for NY
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)
neighborhoods_data = newyork_data['features']

#transform into pandas df
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']  # define the dataframe columns
neighborhoods = pd.DataFrame(columns=column_names) # instantiate the dataframe

#loop through to fill the df
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)
neighborhoods.head(10)

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585
5,Bronx,Kingsbridge,40.881687,-73.902818
6,Manhattan,Marble Hill,40.876551,-73.91066
7,Bronx,Woodlawn,40.898273,-73.867315
8,Bronx,Norwood,40.877224,-73.879391
9,Bronx,Williamsbridge,40.881039,-73.857446


### Map of Neighborhoods in New York

In [10]:
address = 'New York City, NY'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

# Segment only the neighborhoods in Manhattan

In [11]:
manhattan_data = neighborhoods[neighborhoods['Borough'] == 'Manhattan'].reset_index(drop=True)
address = 'Manhattan, NY'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude

# create map of Manhattan using latitude and longitude values
map_manhattan = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(manhattan_data['Latitude'], manhattan_data['Longitude'], manhattan_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_manhattan)  
    
map_manhattan

In [12]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [13]:
manhattan_venues = getNearbyVenues(names=manhattan_data['Neighborhood'],
                                   latitudes=manhattan_data['Latitude'],
                                   longitudes=manhattan_data['Longitude']
                                  )
manhattan_venues.head()

Marble Hill
Chinatown
Washington Heights
Inwood
Hamilton Heights
Manhattanville
Central Harlem
East Harlem
Upper East Side
Yorkville
Lenox Hill
Roosevelt Island
Upper West Side
Lincoln Square
Clinton
Midtown
Murray Hill
Chelsea
Greenwich Village
East Village
Lower East Side
Tribeca
Little Italy
Soho
West Village
Manhattan Valley
Morningside Heights
Gramercy
Battery Park City
Financial District
Carnegie Hill
Noho
Civic Center
Midtown South
Sutton Place
Turtle Bay
Tudor City
Stuyvesant Town
Flatiron
Hudson Yards


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Marble Hill,40.876551,-73.91066,Arturo's,40.874412,-73.910271,Pizza Place
1,Marble Hill,40.876551,-73.91066,Bikram Yoga,40.876844,-73.906204,Yoga Studio
2,Marble Hill,40.876551,-73.91066,Tibbett Diner,40.880404,-73.908937,Diner
3,Marble Hill,40.876551,-73.91066,Starbucks,40.877531,-73.905582,Coffee Shop
4,Marble Hill,40.876551,-73.91066,Dunkin',40.877136,-73.906666,Donut Shop


In [14]:
#how many venues were returned for each neighborhood
print('There are {} uniques categories.'.format(len(manhattan_venues['Venue Category'].unique())))
manhattan_venues.groupby('Neighborhood').count()

There are 328 uniques categories.


Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Battery Park City,61,61,61,61,61,61
Carnegie Hill,84,84,84,84,84,84
Central Harlem,45,45,45,45,45,45
Chelsea,100,100,100,100,100,100
Chinatown,100,100,100,100,100,100
Civic Center,88,88,88,88,88,88
Clinton,100,100,100,100,100,100
East Harlem,43,43,43,43,43,43
East Village,100,100,100,100,100,100
Financial District,100,100,100,100,100,100


# Analyze Each Neighborhood

In [15]:
# one hot encoding
manhattan_onehot = pd.get_dummies(manhattan_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
manhattan_onehot['Neighborhood'] = manhattan_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [manhattan_onehot.columns[-1]] + list(manhattan_onehot.columns[:-1])
manhattan_onehot = manhattan_onehot[fixed_columns]

# group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
manhattan_grouped = manhattan_onehot.groupby('Neighborhood').mean().reset_index()

# print each neighborhood along with the top 5 most common venues
num_top_venues = 5
for hood in manhattan_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = manhattan_grouped[manhattan_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Battery Park City----
           venue  freq
0           Park  0.11
1          Hotel  0.07
2  Memorial Site  0.05
3            Gym  0.05
4    Coffee Shop  0.05


----Carnegie Hill----
           venue  freq
0    Coffee Shop  0.08
1    Pizza Place  0.05
2    Yoga Studio  0.04
3  Grocery Store  0.04
4           Café  0.04


----Central Harlem----
                 venue  freq
0   African Restaurant  0.07
1       Cosmetics Shop  0.07
2                  Bar  0.04
3   Chinese Restaurant  0.04
4  American Restaurant  0.04


----Chelsea----
                 venue  freq
0          Art Gallery  0.16
1          Coffee Shop  0.07
2                 Café  0.04
3       Ice Cream Shop  0.03
4  American Restaurant  0.03


----Chinatown----
                venue  freq
0  Chinese Restaurant  0.08
1        Cocktail Bar  0.04
2              Bakery  0.04
3         Coffee Shop  0.03
4                 Spa  0.03


----Civic Center----
                 venue  freq
0          Coffee Shop  0.06
1             

In [16]:
# put that into a pandas df
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    return row_categories_sorted.index.values[0:num_top_venues]

# display top venues per neighborhood
num_top_venues = 5
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = manhattan_grouped['Neighborhood']

for ind in np.arange(manhattan_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(manhattan_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head(5)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Battery Park City,Park,Hotel,Coffee Shop,Gym,Memorial Site
1,Carnegie Hill,Coffee Shop,Pizza Place,Café,Japanese Restaurant,Gym
2,Central Harlem,African Restaurant,Cosmetics Shop,Bar,American Restaurant,French Restaurant
3,Chelsea,Art Gallery,Coffee Shop,Café,Ice Cream Shop,American Restaurant
4,Chinatown,Chinese Restaurant,Cocktail Bar,Bakery,Optical Shop,American Restaurant


# Cluster Neighborhoods in NY

In [17]:
kclusters = 5 # set number of clusters
manhattan_grouped_clustering = manhattan_grouped.drop('Neighborhood', 1)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(manhattan_grouped_clustering)   # run k-means clustering
kmeans.labels_[0:10]     # check cluster labels generated for each row in the dataframe

neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)     # add clustering labels

manhattan_merged = manhattan_data

# merge to add latitude/longitude for each neighborhood
manhattan_merged = manhattan_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

manhattan_merged.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Manhattan,Marble Hill,40.876551,-73.91066,2,Sandwich Place,Gym,Coffee Shop,Diner,Steakhouse
1,Manhattan,Chinatown,40.715618,-73.994279,0,Chinese Restaurant,Cocktail Bar,Bakery,Optical Shop,American Restaurant
2,Manhattan,Washington Heights,40.851903,-73.9369,2,Café,Bakery,Mobile Phone Shop,Chinese Restaurant,Latin American Restaurant
3,Manhattan,Inwood,40.867684,-73.92121,2,Mexican Restaurant,Restaurant,Café,Lounge,Pizza Place
4,Manhattan,Hamilton Heights,40.823604,-73.949688,0,Pizza Place,Coffee Shop,Café,Deli / Bodega,Mexican Restaurant


# Map NY clusters

In [18]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(manhattan_merged['Latitude'], manhattan_merged['Longitude'], manhattan_merged['Neighborhood'], manhattan_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
map_clusters

### NYC Cluster 1

In [19]:
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 0, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]].sort_values(by=['1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue'])

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
6,Central Harlem,African Restaurant,Cosmetics Shop,Bar,American Restaurant,French Restaurant
20,Lower East Side,Chinese Restaurant,Art Gallery,Café,Cocktail Bar,Yoga Studio
1,Chinatown,Chinese Restaurant,Cocktail Bar,Bakery,Optical Shop,American Restaurant
30,Carnegie Hill,Coffee Shop,Pizza Place,Café,Japanese Restaurant,Gym
25,Manhattan Valley,Coffee Shop,Spa,Bar,Mexican Restaurant,Hawaiian Restaurant
12,Upper West Side,Italian Restaurant,Dessert Shop,Wine Bar,Bakery,Coffee Shop
4,Hamilton Heights,Pizza Place,Coffee Shop,Café,Deli / Bodega,Mexican Restaurant
19,East Village,Pizza Place,Coffee Shop,Cocktail Bar,Juice Bar,Wine Bar


### NYC Cluster 2

In [20]:
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 1, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]].sort_values(by=['1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue'])

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
32,Civic Center,Coffee Shop,French Restaurant,American Restaurant,Park,Hotel
29,Financial District,Coffee Shop,Hotel,American Restaurant,Sandwich Place,Pizza Place
15,Midtown,Coffee Shop,Hotel,Clothing Store,Theater,Pizza Place
16,Murray Hill,Coffee Shop,Hotel,Sandwich Place,Gym / Fitness Center,Steakhouse
39,Hudson Yards,Hotel,American Restaurant,Gym / Fitness Center,Italian Restaurant,Café
11,Roosevelt Island,Hotel,Outdoors & Recreation,Metro Station,Supermarket,Food & Drink Shop
13,Lincoln Square,Italian Restaurant,Plaza,Café,Gym / Fitness Center,Theater
33,Midtown South,Korean Restaurant,Hotel,Burger Joint,Japanese Restaurant,Coffee Shop
28,Battery Park City,Park,Hotel,Coffee Shop,Gym,Memorial Site
14,Clinton,Theater,Coffee Shop,Gym / Fitness Center,Gym,Wine Shop


### NYC Cluster 3

In [21]:
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 2, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]].sort_values(by=['1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue'])

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
2,Washington Heights,Café,Bakery,Mobile Phone Shop,Chinese Restaurant,Latin American Restaurant
36,Tudor City,Café,Park,Mexican Restaurant,Deli / Bodega,Pizza Place
5,Manhattanville,Coffee Shop,Seafood Restaurant,Italian Restaurant,Mexican Restaurant,Chinese Restaurant
7,East Harlem,Mexican Restaurant,Bakery,Deli / Bodega,Thai Restaurant,Latin American Restaurant
3,Inwood,Mexican Restaurant,Restaurant,Café,Lounge,Pizza Place
26,Morningside Heights,Park,Bookstore,American Restaurant,Coffee Shop,Deli / Bodega
0,Marble Hill,Sandwich Place,Gym,Coffee Shop,Diner,Steakhouse


### NYC Cluster 4

In [22]:
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 3, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]].sort_values(by=['1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue'])

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
17,Chelsea,Art Gallery,Coffee Shop,Café,Ice Cream Shop,American Restaurant
22,Little Italy,Chinese Restaurant,Spa,Italian Restaurant,Bakery,Mediterranean Restaurant
35,Turtle Bay,Coffee Shop,Italian Restaurant,Café,Wine Bar,Park
38,Flatiron,Gym / Fitness Center,Italian Restaurant,American Restaurant,Wine Shop,Salon / Barbershop
27,Gramercy,Italian Restaurant,Bagel Shop,Coffee Shop,Playground,Pizza Place
8,Upper East Side,Italian Restaurant,Bakery,Gym / Fitness Center,Juice Bar,Spa
31,Noho,Italian Restaurant,Coffee Shop,Grocery Store,Japanese Restaurant,Sandwich Place
9,Yorkville,Italian Restaurant,Coffee Shop,Gym,Deli / Bodega,Bar
18,Greenwich Village,Italian Restaurant,Coffee Shop,Gym,Pizza Place,Ice Cream Shop
34,Sutton Place,Italian Restaurant,Coffee Shop,Gym / Fitness Center,Park,Bagel Shop


### NYC Cluster 5

In [23]:
manhattan_merged.loc[manhattan_merged['Cluster Labels'] == 4, manhattan_merged.columns[[1] + list(range(5, manhattan_merged.shape[1]))]].sort_values(by=['1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue'])

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
37,Stuyvesant Town,Boat or Ferry,Park,Bar,Gym / Fitness Center,Baseball Field


### Toronto cluster 1

In [24]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]].sort_values(by=['1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue'])

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
96,North York,0,Bakery,Arts & Crafts Store,Pizza Place,Yoga Studio,Dumpling Restaurant
79,North York,0,Home Service,Bakery,Yoga Studio,Dumpling Restaurant,Distribution Center


### Toronto cluster 2

In [25]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]].sort_values(by=['1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue'])

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
1,Scarborough,1,Breakfast Spot,Bar,Italian Restaurant,Burger Joint,Yoga Studio
15,Scarborough,1,Fast Food Restaurant,Pizza Place,Chinese Restaurant,Grocery Store,Noodle House
2,Scarborough,1,Fast Food Restaurant,Pizza Place,Restaurant,Sports Bar,Fried Chicken Joint
24,North York,1,Grocery Store,Coffee Shop,Park,Pizza Place,Bank
101,Etobicoke,1,Grocery Store,Pizza Place,Beer Store,Fast Food Restaurant,Fried Chicken Joint
31,North York,1,Grocery Store,Shopping Mall,Coffee Shop,Vietnamese Restaurant,Pizza Place
98,York,1,Pharmacy,Pizza Place,Park,Fried Chicken Joint,Gift Shop
99,Etobicoke,1,Pizza Place,Discount Store,Middle Eastern Restaurant,Coffee Shop,Gas Station
11,Scarborough,1,Pizza Place,Middle Eastern Restaurant,Burger Joint,Grocery Store,Restaurant
94,Etobicoke,1,Pizza Place,Theater,Mexican Restaurant,Coffee Shop,Hotel


### Toronto cluster 3

In [26]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]].sort_values(by=['1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue'])

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
91,Etobicoke,2,Baseball Field,Park,Convenience Store,Gym / Fitness Center,Dumpling Restaurant
25,North York,2,Bus Stop,Park,Food & Drink Shop,Road,Yoga Studio
3,Scarborough,2,Coffee Shop,Park,Convenience Store,Business Service,Yoga Studio
23,North York,2,Park,Gym,Bowling Alley,Convenience Store,Bank
50,Downtown Toronto,2,Park,Trail,Candy Store,Playground,Yoga Studio


### Toronto cluster 4

In [27]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]].sort_values(by=['1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue'])

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
90,Etobicoke,3,Bakery,Bank,Breakfast Spot,Sushi Restaurant,Bar
83,West Toronto,3,Bar,Café,Thai Restaurant,Breakfast Spot,Restaurant
27,North York,3,Beer Store,Japanese Restaurant,Asian Restaurant,Gym,Coffee Shop
81,York,3,Brewery,Pizza Place,Furniture / Home Store,Coffee Shop,Athletics & Sports
44,Central Toronto,3,Business Service,Bus Line,Coffee Shop,Swim School,Electronics Store
67,Downtown Toronto,3,Café,Bar,Vegetarian / Vegan Restaurant,Coffee Shop,Mexican Restaurant
84,West Toronto,3,Café,Coffee Shop,Bakery,Italian Restaurant,Park
43,East Toronto,3,Café,Coffee Shop,Bar,Bakery,Sandwich Place
58,Downtown Toronto,3,Café,Coffee Shop,Hotel,Theater,Clothing Store
95,Etobicoke,3,Café,Coffee Shop,Liquor Store,Cosmetics Shop,Pizza Place


### Toronto cluster 5

In [28]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]].sort_values(by=['1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue'])

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
20,North York,4,Pool,Yoga Studio,Dumpling Restaurant,Distribution Center,Dive Bar
