**Final Week Capstone**

**(1) Imports**

In [1]:
import pandas as pd
import numpy as np

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    openssl-1.0.2p             |       h470a237_1         3.1 MB  conda-forge
    certifi-2018.10.15         |        py36_1000         138 KB  conda-forge
    geopy-1.17.0               |             py_0          49 KB  conda-forge
    ca-certificates-2018.10.15 |       ha4d7672_0         135 KB  conda-forge
    conda-4.5.11               |        py36_1000         651 KB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         4.1 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.49-py_0            conda-forge
    geopy:           

**(2) Load Neighborhoods Data of Chicago and New York**

In [2]:
# Chicago Data is saved through "Week4Capstone-DataChicago.csv" combined with manual data cleaning/correction
df_chi = pd.read_csv('Chicago Neighborhoods_WZ.csv')

# New York Data is saved to csv via "3-3-2-Neighborhoods-New-York-py-v1.0/ipynb"
df_ny = pd.read_csv('NY_Neighborhoods.csv')

df_ny.drop('Unnamed: 0', axis=1, inplace=True)
df_chi.rename(columns={'Community area': 'Borough'}, inplace=True)
df_ny['City'] = 'New York'
df_chi['City'] = 'Chicago'
df_raw_all = pd.concat([df_ny, df_chi])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  # This is added back by InteractiveShellApp.init_path()


**(3) Visualize Chicago and New York Neighborhoods (before any clustering)**

Chicago

In [3]:
address = 'Chicago, IL'
geolocator = Nominatim()
location = geolocator.geocode(address)
latitude_chi = location.latitude
longitude_chi = location.longitude

# create map of New York using latitude and longitude values
map_chi = folium.Map(location=[latitude_chi, longitude_chi], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_chi['Latitude'], df_chi['Longitude'], df_chi['Borough'], df_chi['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_chi)  
    
map_chi



New York

In [4]:
address = 'New York City, New York'
geolocator = Nominatim()
location = geolocator.geocode(address)
latitude_nyc = location.latitude
longitude_nyc = location.longitude

# create map of New York using latitude and longitude values
map_ny = folium.Map(location=[latitude_nyc, longitude_nyc], zoom_start=11)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_ny['Latitude'], df_ny['Longitude'], df_ny['Borough'], df_ny['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_ny)  
    
map_ny



In [6]:
len(df_ny['Neighborhood'].unique())

302

In [7]:
len(df_chi['Neighborhood'].unique())

119

**(4) Grab Venues Data using Foursquare API**

In [8]:
CLIENT_ID = 'VEZFXJGSG5RCDYPNCTHNALGRCVSCP5RBBAK1X4FL4TK3DG11' # your Foursquare ID
CLIENT_SECRET = '01JSCVYSC42P4XXUQWW5WKVACORF3OBSLR00HR4D1APVB5HT' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

LIMIT = 200 # limit of number of venues returned by Foursquare API
radius = 1000 # define radius

In [8]:
def get_category_type(row):
    """
    This function will extract the category of the given venue
    """
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

def getNearbyVenues(names, latitudes, longitudes, boroughs, radius):
    """
    This function will get nearby venues for neighborhood
    """
    venues_list=[]
    for name, lat, lng, borough in zip(names, latitudes, longitudes, boroughs):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
        
        try:
            # make the GET request
            results = requests.get(url).json()["response"]['groups'][0]['items']

            # return only relevant information for each nearby venue
            venues_list.append([(
                name, 
                lat, 
                lng, 
                borough,
                v['venue']['name'], 
                v['venue']['location']['lat'], 
                v['venue']['location']['lng'],  
                v['venue']['categories'][0]['name']) for v in results])
        except:
            pass

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude', 'Borough', 'Venue', 'Venue Latitude', 'Venue Longitude', 'Venue Category']
    
    return(nearby_venues)

In [48]:
# Grab Nearby Venues for New York Neighborhoods (and save it to csv to avoid doing this again)
venues_ny = getNearbyVenues(names=df_ny['Neighborhood'], latitudes=df_ny['Latitude'], longitudes=df_ny['Longitude'], boroughs = df_ny['Borough'], radius=radius)
venues_ny.to_csv('Venues_NY.csv')

Wakefield
Co-op City
Eastchester
Fieldston
Riverdale
Kingsbridge
Marble Hill
Woodlawn
Norwood
Williamsbridge
Baychester
Pelham Parkway
City Island
Bedford Park
University Heights
Morris Heights
Fordham
East Tremont
West Farms
High  Bridge
Melrose
Mott Haven
Port Morris
Longwood
Hunts Point
Morrisania
Soundview
Clason Point
Throgs Neck
Country Club
Parkchester
Westchester Square
Van Nest
Morris Park
Belmont
Spuyten Duyvil
North Riverdale
Pelham Bay
Schuylerville
Edgewater Park
Castle Hill
Olinville
Pelham Gardens
Concourse
Unionport
Edenwald
Bay Ridge
Bensonhurst
Sunset Park
Greenpoint
Gravesend
Brighton Beach
Sheepshead Bay
Manhattan Terrace
Flatbush
Crown Heights
East Flatbush
Kensington
Windsor Terrace
Prospect Heights
Brownsville
Williamsburg
Bushwick
Bedford Stuyvesant
Brooklyn Heights
Cobble Hill
Carroll Gardens
Red Hook
Gowanus
Fort Greene
Park Slope
Cypress Hills
East New York
Starrett City
Canarsie
Flatlands
Mill Island
Manhattan Beach
Coney Island
Bath Beach
Borough Park
Dyker

In [9]:
# Grab Nearby Venues for Chicago Neighborhoods (and save it to csv to avoid doing this again)
venues_chi = getNearbyVenues(names=df_chi['Neighborhood'], latitudes=df_chi['Latitude'], longitudes=df_chi['Longitude'], boroughs = df_chi['Community area'], radius=radius)
venues_chi.to_csv('Venues_Chicago.csv')

The Loop
Near East Side
Printer's Row
South Loop
Cabrini-Green
Gold Coast
Goose Island
Magnificent Mile
Near North Side
Old Town
River North
Streeterville
Central Station
Dearborn Park
Museum Campus
Prairie Avenue Historic District
Fulton River District
Greektown
Illinois Medical District
Little Italy/University Village
Near West Side
Tri-Taylor
West Loop
Albany Park
Archer Heights
Armour Square
Chinatown
Wentworth Gardens
Ashburn
Auburn Gresham
Galewood
The Island
North Austin
South Austin
West Humboldt Park
Avalon Park
Avondale
Irving Park
Belmont Central
Beverly
Bridgeport
Brighton Park
Burnside
Calumet Heights
Chatham
Chicago Lawn
Chrysler Village
Bronzeville
Dearborn Homes
Groveland Park
Lake Meadows
Prairie Shores
South Commons
Stateway Gardens
Dunning
East Garfield Park
East Side
Edgewater
Edison Park
Englewood
Forest Glen
Fuller Park
Gage Park
Garfield Ridge
Grand Boulevard
Greater Grand Crossing
Hegewisch
Hermosa
Humboldt Park
East Hyde Park
Hyde Park
Jefferson Park
Kenwood
La

In [9]:
df_venues_ny = pd.read_csv('Venues_NY.csv')
df_venues_chi = pd.read_csv('Venues_Chicago.csv')

Combine Chicago and NY Datasets:

In [11]:
df_venues_ny.drop('Unnamed: 0', axis=1, inplace=True)
df_venues_chi.drop('Unnamed: 0', axis=1, inplace=True)
df_venues_ny['City'] = 'New York'
df_venues_chi['City'] = 'Chicago'
df_venues_all = pd.concat([df_venues_ny, df_venues_chi])

In [12]:
print('There are {} uniques categories.'.format(len(df_venues_ny['Venue Category'].unique())))
print('There are {} uniques categories.'.format(len(df_venues_chi['Venue Category'].unique())))
print('There are {} uniques categories.'.format(len(df_venues_all['Venue Category'].unique())))

There are 462 uniques categories.
There are 355 uniques categories.
There are 501 uniques categories.


**(5) Data Preparation for Clustering**

In [28]:
# one hot encoding
onehot_all = pd.get_dummies(df_venues_all[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
onehot_all['Neighborhood'] = df_venues_all['Neighborhood']
onehot_all['Borough'] = df_venues_all['Borough'] 
onehot_all['City'] = df_venues_all['City']

# move Borough and City columns to the first two columns
fixed_columns = list(onehot_all.columns[-2:]) + list(onehot_all.columns[:-2])
onehot_all = onehot_all[fixed_columns]

df_grouped = onehot_all.groupby(['Neighborhood', 'Borough', 'City']).mean().reset_index()
df_grouped.head(2)

Unnamed: 0,Neighborhood,Borough,City,ATM,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport,Airport Lounge,...,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,Albany Park,Albany Park,Chicago,0.0,0.011628,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.011628,0.0,0.0,0.0,0.0
1,Allerton,Bronx,New York,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[3:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [30]:
num_top_venues = 10
indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood', 'Borough', 'City']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = df_grouped['Neighborhood']
neighborhoods_venues_sorted['Borough'] = df_grouped['Borough']
neighborhoods_venues_sorted['City'] = df_grouped['City']

for ind in np.arange(df_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 3:] = return_most_common_venues(df_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head(2)

Unnamed: 0,Neighborhood,Borough,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Albany Park,Albany Park,Chicago,Mexican Restaurant,Grocery Store,Discount Store,Pizza Place,Hookah Bar,Donut Shop,Sandwich Place,Chinese Restaurant,Bakery,Middle Eastern Restaurant
1,Allerton,Bronx,New York,Pizza Place,Fast Food Restaurant,Donut Shop,Caribbean Restaurant,Bus Station,Sandwich Place,Pharmacy,Mexican Restaurant,Dessert Shop,Supermarket


**(6) Cluster Analysis with K-Means - Using All Neighborhoods Data**

In [39]:
kclusters = 5 # We tried different values, and the main conclusions don't vary much
df_grouped_clustering = df_grouped.drop(['Neighborhood', 'Borough', 'City'], axis=1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_grouped_clustering)

df_grouped['Cluster Labels'] = kmeans.labels_
df_merged = neighborhoods_venues_sorted.merge(df_grouped[['Cluster Labels', 'Neighborhood', 'Borough', 'City']], how='inner', on=['Neighborhood', 'Borough', 'City'])
df_merged_all = df_merged.merge(df_raw_all, how='inner', on=['Neighborhood', 'Borough', 'City'])

In [40]:
def plot_clustered_map(lat_start, lng_start, zoom_start, df_to_plot):
    """
    This function will plot a map with clusters for each neighborhood
    df_to_plot: This dataframe should have the following columns: Latitude, Longitude, Neighborhood, Borough, Cluster Labels
    """
    
    # create map
    map_clusters = folium.Map(location=[lat_start, lng_start], zoom_start=zoom_start)

    kclusters = len(df_merged_all['Cluster Labels'].unique())
    # set color scheme for the clusters
    x = np.arange(kclusters)
    ys = [i+x+(i*x)**2 for i in range(kclusters)]
    colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
    rainbow = [colors.rgb2hex(i) for i in colors_array]

    # add markers to the map
    markers_colors = []
    for lat, lon, poi, bor, cluster in zip(df_to_plot['Latitude'], df_to_plot['Longitude'], df_to_plot['Neighborhood'], df_to_plot['Borough'], df_to_plot['Cluster Labels']):
        label = folium.Popup('%s - %s - Cluster%s'%(poi, bor, cluster), parse_html=True)
        folium.CircleMarker(
            [lat, lon],
            radius=5,
            popup=label,
            color=rainbow[cluster-1],
            fill=True,
            fill_color=rainbow[cluster-1],
            fill_opacity=0.7).add_to(map_clusters)

    return map_clusters

In [41]:
map_clusters = plot_clustered_map(lat_start=latitude_nyc, lng_start=longitude_nyc, zoom_start=11, df_to_plot=df_merged_all)
map_clusters

In [42]:
map_clusters = plot_clustered_map(lat_start=latitude_chi, lng_start=longitude_chi, zoom_start=11, df_to_plot=df_merged_all)
map_clusters

In [45]:
df_merged_all_display = pd.DataFrame()
for col in ['Neighborhood', 'Borough', 'City', 'Cluster Labels']:
    df_merged_all_display[col] = df_merged_all[col]
df_merged_all_display['Key'] = df_merged_all_display['Borough'] + '-' + df_merged_all_display['City'].map({'New York': 'NYC', 'Chicago': 'CHI'})

In [56]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [57]:
df_merged_all_display.groupby('Cluster Labels')['Key'].value_counts()

Cluster Labels  Key                       
0               Queens-NYC                    43
                Staten Island-NYC             32
                Bronx-NYC                     31
                Brooklyn-NYC                  27
                Armour Square-CHI              2
                Austin-CHI                     2
                Albany Park-CHI                1
                Avondale-CHI                   1
                Belmont Cragin-CHI             1
                Beverly-CHI                    1
                Bridgeport-CHI                 1
                Douglas-CHI                    1
                Dunning-CHI                    1
                East Side-CHI                  1
                Garfield Ridge-CHI             1
                Hegewisch-CHI                  1
                Hermosa-CHI                    1
                Irving Park-CHI                1
                Manhattan-NYC                  1
                Mount Gree

In [62]:
# Generate helper dataframe to see the percentage of which Cluster(s) each Borough belongs to
# (1) Get total # of neighborhoods each borough has
df_display = pd.DataFrame(df_merged_all_display.groupby(['Borough', 'City'])['Neighborhood'].count())
df_display.reset_index(drop=False, inplace=True)
df_display.rename(columns={'Neighborhood': 'NumNeighborhoods'}, inplace=True)

# (2) Get total counts of neighborhoods belong to each cluster for each Borough
df_display_cluster = pd.DataFrame(df_merged_all_display.groupby(['Borough'])['Cluster Labels'].value_counts()) 
df_display_cluster.columns=['NumClusterLabels']
df_display_cluster.reset_index(drop=False, inplace=True)
df_display_cluster = df_display_cluster.pivot(index='Borough', columns='Cluster Labels', values='NumClusterLabels').fillna(0)
df_display_cluster.reset_index(drop=False, inplace=True)

# (3) Merge above two dataframe
df_display = df_display.merge(df_display_cluster, how='inner', on='Borough')

# (4) Convert from count to percentage
for i in range(kclusters):
    df_display[i] = df_display[i]/df_display['NumNeighborhoods']

df_display.to_csv('Cluster_All.csv')

**(6) Cluster Analysis with K-Means - Using Downtown Neighborhoods Data Only**

In [116]:
# New York Downtown Area: Manhattan
# Chicago Downtown Area: The Loop', Near North Side, Near South Side, Near West Side
downtown_borough_list = ['Manhattan', 'The Loop', 'Near North Side', 'Near South Side', 'Near West Side']
df_grouped_downtown = df_grouped[df_grouped['Borough'].isin(downtown_borough_list)]

In [117]:
neighborhoods_venues_downtown_sorted = neighborhoods_venues_sorted[neighborhoods_venues_sorted['Borough'].isin(downtown_borough_list)]

In [121]:
kclusters = 5
df_grouped_downtown_clustering = df_grouped_downtown.drop(['Neighborhood', 'Borough', 'City'], axis=1)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_grouped_downtown_clustering)

df_grouped_downtown['Cluster Labels'] = kmeans.labels_
df_merged_downtown = neighborhoods_venues_downtown_sorted.merge(df_grouped_downtown[['Cluster Labels', 'Neighborhood', 'Borough', 'City']], how='inner', on=['Neighborhood', 'Borough', 'City'])
df_merged_downtown = df_merged_downtown.merge(df_raw_all, how='inner', on=['Neighborhood', 'Borough', 'City'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [129]:
map_clusters = plot_clustered_map(lat_start=latitude_nyc, lng_start=longitude_nyc, zoom_start=11, df_to_plot=df_merged_downtown)
map_clusters

In [128]:
df_merged_downtown[df_merged_downtown['Neighborhood']=='Little Italy']

Unnamed: 0,Neighborhood,Borough,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels,Latitude,Longitude
25,Little Italy,Manhattan,New York,Men's Store,Clothing Store,Café,Italian Restaurant,Cocktail Bar,Chinese Restaurant,Coffee Shop,Shoe Store,Pizza Place,Women's Store,1,40.719324,-73.997305


In [123]:
map_clusters = plot_clustered_map(lat_start=latitude_chi, lng_start=longitude_chi, zoom_start=13, df_to_plot=df_merged_downtown)
map_clusters

In [126]:
# Generate helper dataframe to see the percentage of which Cluster(s) each Borough belongs to
# (1) Get total # of neighborhoods each borough has
df_display = pd.DataFrame(df_grouped_downtown.groupby(['Borough', 'City'])['Neighborhood'].count())
df_display.reset_index(drop=False, inplace=True)
df_display.rename(columns={'Neighborhood': 'NumNeighborhoods'}, inplace=True)

# (2) Get total counts of neighborhoods belong to each cluster for each Borough
df_display_cluster = pd.DataFrame(df_merged_downtown.groupby(['Borough'])['Cluster Labels'].value_counts()) 
df_display_cluster.columns=['NumClusterLabels']
df_display_cluster.reset_index(drop=False, inplace=True)
df_display_cluster = df_display_cluster.pivot(index='Borough', columns='Cluster Labels', values='NumClusterLabels').fillna(0)
df_display_cluster.reset_index(drop=False, inplace=True)

# (3) Merge above two dataframe
df_display = df_display.merge(df_display_cluster, how='inner', on='Borough')

# (4) Convert from count to percentage
for i in range(kclusters):
    df_display[i] = df_display[i]/df_display['NumNeighborhoods']

df_display.to_csv('Cluster_Downtown.csv')

**(7) Further Examine Clusters using Downtown Neighborhoods Data**

Cluster 1

In [160]:
df_merged_downtown.loc[df_merged_downtown['Cluster Labels'] == 0, :]

Unnamed: 0,Neighborhood,Borough,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels,Latitude,Longitude
12,Flatiron,Manhattan,New York,Gym,Gym / Fitness Center,Cycle Studio,New American Restaurant,American Restaurant,Italian Restaurant,Spa,Park,Wine Shop,Vegetarian / Vegan Restaurant,0,40.739673,-73.990947
13,Fulton River District,Near West Side,Chicago,New American Restaurant,Italian Restaurant,Coffee Shop,Bar,Restaurant,Mexican Restaurant,Gym,Burger Joint,Latin American Restaurant,Café,0,41.889495,-87.643364
16,Gramercy,Manhattan,New York,American Restaurant,New American Restaurant,Restaurant,Mediterranean Restaurant,Cheese Shop,Park,Mexican Restaurant,Cosmetics Shop,Cocktail Bar,Juice Bar,0,40.73721,-73.981376
17,Greektown,Near West Side,Chicago,Greek Restaurant,New American Restaurant,Coffee Shop,Italian Restaurant,Pizza Place,Sandwich Place,Café,Bar,Grocery Store,Gym,0,41.878564,-87.64705
33,Midtown South,Manhattan,New York,Korean Restaurant,Gym / Fitness Center,Coffee Shop,Sandwich Place,Japanese Restaurant,Yoga Studio,Pizza Place,Hotel,Italian Restaurant,Spa,0,40.74851,-73.988713
35,Murray Hill,Manhattan,New York,Gym / Fitness Center,Japanese Restaurant,Korean Restaurant,Coffee Shop,Gym,Pizza Place,Chinese Restaurant,Sandwich Place,American Restaurant,Gourmet Shop,0,40.748303,-73.978332
44,River North,Near North Side,Chicago,Steakhouse,Italian Restaurant,Restaurant,Mexican Restaurant,Gym,Coffee Shop,Bar,Hotel,Sushi Restaurant,Wine Bar,0,41.892385,-87.634075
59,West Loop,Near West Side,Chicago,New American Restaurant,Coffee Shop,Grocery Store,Breakfast Spot,Greek Restaurant,Pizza Place,Bar,Sandwich Place,Italian Restaurant,Mediterranean Restaurant,0,41.882457,-87.644678


In [159]:
df_merged_downtown.loc[df_merged_downtown['Cluster Labels'] == 0, ['Neighborhood', 'City']].values

array([['Flatiron', 'New York'],
       ['Fulton River District', 'Chicago'],
       ['Gramercy', 'New York'],
       ['Greektown', 'Chicago'],
       ['Midtown South', 'New York'],
       ['Murray Hill', 'New York'],
       ['River North', 'Chicago'],
       ['West Loop', 'Chicago']], dtype=object)

In [162]:
# Generate Venues that appear in the Top10 Categories by Frequency
venues0 = df_merged_downtown.loc[df_merged_downtown['Cluster Labels'] == 0, :].iloc[:, 3:13].values.flatten()
unique_elements, counts_elements = np.unique(venues0, return_counts=True)
df_venues0 = pd.DataFrame()
df_venues0['Counts'] = sorted(counts_elements)
df_venues0['VenueCategory'] = [x for _,x in sorted(zip(counts_elements,unique_elements))]
df_venues0.sort_values('Counts', ascending=False)

Unnamed: 0,Counts,VenueCategory
35,6,Italian Restaurant
34,6,Coffee Shop
33,5,New American Restaurant
32,5,Gym
31,4,Sandwich Place
30,4,Pizza Place
29,4,Bar
26,3,Gym / Fitness Center
25,3,American Restaurant
28,3,Restaurant


Cluster 2

In [161]:
# Generate Venues that appear in the Top10 Categories by Frequency
venues1 = df_merged_downtown.loc[df_merged_downtown['Cluster Labels'] == 1, :].iloc[:, 3:13].values.flatten()
unique_elements, counts_elements = np.unique(venues1, return_counts=True)
df_venues1 = pd.DataFrame()
df_venues1['Counts'] = sorted(counts_elements)
df_venues1['VenueCategory'] = [x for _,x in sorted(zip(counts_elements,unique_elements))]
df_venues1.sort_values('Counts', ascending=False)

Unnamed: 0,Counts,VenueCategory
52,15,Italian Restaurant
51,15,Coffee Shop
50,11,American Restaurant
49,8,French Restaurant
48,8,Bakery
47,7,Pizza Place
46,7,Hotel
45,6,Café
44,5,Gym / Fitness Center
43,5,Cocktail Bar


In [154]:
df_merged_downtown.loc[df_merged_downtown['Cluster Labels'] == 1, ['Neighborhood', 'City']].values

array([['Chelsea', 'New York'],
       ['Chinatown', 'New York'],
       ['Civic Center', 'New York'],
       ['Gold Coast', 'Chicago'],
       ['Greenwich Village', 'New York'],
       ['Lenox Hill', 'New York'],
       ['Lincoln Square', 'New York'],
       ['Little Italy', 'New York'],
       ['Lower East Side', 'New York'],
       ['Magnificent Mile', 'Chicago'],
       ['Near North Side', 'Chicago'],
       ['Noho', 'New York'],
       ['Soho', 'New York'],
       ['Sutton Place', 'New York'],
       ['Tribeca', 'New York'],
       ['Upper East Side', 'New York'],
       ['West Village', 'New York']], dtype=object)

In [130]:
df_merged_downtown.loc[df_merged_downtown['Cluster Labels'] == 1, :]

Unnamed: 0,Neighborhood,Borough,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels,Latitude,Longitude
4,Chelsea,Manhattan,New York,Art Gallery,American Restaurant,Coffee Shop,Hotel,Seafood Restaurant,Nightclub,Italian Restaurant,Bakery,Bagel Shop,Tapas Restaurant,1,40.744035,-74.003116
5,Chinatown,Manhattan,New York,Chinese Restaurant,Cocktail Bar,Ice Cream Shop,Wine Bar,Sandwich Place,Café,American Restaurant,French Restaurant,Shoe Store,Thai Restaurant,1,40.715618,-73.994279
6,Civic Center,Manhattan,New York,French Restaurant,Bakery,Coffee Shop,Hotel,Spa,Chinese Restaurant,Cocktail Bar,Men's Store,American Restaurant,Ice Cream Shop,1,40.715229,-74.005415
14,Gold Coast,Near North Side,Chicago,Hotel,Italian Restaurant,Gym,American Restaurant,Salon / Barbershop,Café,Coffee Shop,Steakhouse,Yoga Studio,Restaurant,1,41.906699,-87.625331
18,Greenwich Village,Manhattan,New York,Italian Restaurant,Coffee Shop,Seafood Restaurant,American Restaurant,Pizza Place,Clothing Store,Spa,Café,Indie Movie Theater,French Restaurant,1,40.726933,-73.999914
23,Lenox Hill,Manhattan,New York,Italian Restaurant,Sushi Restaurant,French Restaurant,Gym / Fitness Center,Dessert Shop,Bakery,Coffee Shop,Café,Burger Joint,Spanish Restaurant,1,40.768113,-73.95886
24,Lincoln Square,Manhattan,New York,Italian Restaurant,Gym / Fitness Center,French Restaurant,Jazz Club,Gym,Bakery,Theater,Coffee Shop,Performing Arts Venue,Concert Hall,1,40.773529,-73.985338
25,Little Italy,Manhattan,New York,Men's Store,Clothing Store,Café,Italian Restaurant,Cocktail Bar,Chinese Restaurant,Coffee Shop,Shoe Store,Pizza Place,Women's Store,1,40.719324,-73.997305
27,Lower East Side,Manhattan,New York,Italian Restaurant,Coffee Shop,Mexican Restaurant,Ice Cream Shop,Boutique,Japanese Restaurant,Wine Bar,Deli / Bodega,Shoe Store,Cocktail Bar,1,40.717807,-73.98089
28,Magnificent Mile,Near North Side,Chicago,American Restaurant,Hotel,New American Restaurant,Italian Restaurant,Department Store,Pizza Place,Grocery Store,Seafood Restaurant,Steakhouse,Restaurant,1,41.894809,-87.624214


Cluster 3

In [163]:
df_merged_downtown.loc[df_merged_downtown['Cluster Labels'] == 2, ['Neighborhood', 'City']].values

array([['Battery Park City', 'New York'],
       ['Cabrini-Green', 'Chicago'],
       ['Carnegie Hill', 'New York'],
       ['Central Harlem', 'New York'],
       ['Clinton', 'New York'],
       ['Dearborn Park', 'Chicago'],
       ['Financial District', 'New York'],
       ['Goose Island', 'Chicago'],
       ['Hudson Yards', 'New York'],
       ['Midtown', 'New York'],
       ['Museum Campus', 'Chicago'],
       ['Near East Side', 'Chicago'],
       ['Near West Side', 'Chicago'],
       ['Old Town', 'Chicago'],
       ['Prairie Avenue Historic District', 'Chicago'],
       ["Printer's Row", 'Chicago'],
       ['South Loop', 'Chicago'],
       ['Streeterville', 'Chicago'],
       ['The Loop', 'Chicago'],
       ['Tudor City', 'New York'],
       ['Turtle Bay', 'New York']], dtype=object)

In [164]:
df_merged_downtown.loc[df_merged_downtown['Cluster Labels'] == 2, :]

Unnamed: 0,Neighborhood,Borough,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels,Latitude,Longitude
0,Battery Park City,Manhattan,New York,Park,Coffee Shop,Wine Shop,Hotel,American Restaurant,Gym,Plaza,Dog Run,Gym / Fitness Center,BBQ Joint,2,40.711932,-74.016869
1,Cabrini-Green,Near North Side,Chicago,Coffee Shop,Gym / Fitness Center,Gym,American Restaurant,Breakfast Spot,Bar,Italian Restaurant,Café,Bakery,Park,2,41.901091,-87.641464
2,Carnegie Hill,Manhattan,New York,Pizza Place,Coffee Shop,Gym,Yoga Studio,Italian Restaurant,Bakery,Art Museum,Cocktail Bar,Café,Spa,2,40.782683,-73.953256
3,Central Harlem,Manhattan,New York,Southern / Soul Food Restaurant,Café,French Restaurant,African Restaurant,Seafood Restaurant,Theater,Sushi Restaurant,American Restaurant,Coffee Shop,Gym / Fitness Center,2,40.815976,-73.943211
7,Clinton,Manhattan,New York,Theater,Italian Restaurant,American Restaurant,Burger Joint,Hotel,Coffee Shop,Indie Theater,Wine Shop,Gym / Fitness Center,Bakery,2,40.759101,-73.996119
8,Dearborn Park,Near South Side,Chicago,Pizza Place,Grocery Store,Coffee Shop,Gym / Fitness Center,Park,Breakfast Spot,Burger Joint,American Restaurant,Sushi Restaurant,Yoga Studio,2,41.866429,-87.629
11,Financial District,Manhattan,New York,Coffee Shop,Hotel,Park,Steakhouse,Falafel Restaurant,Plaza,Sandwich Place,Cocktail Bar,Pizza Place,Jewelry Store,2,40.707107,-74.010665
15,Goose Island,Near North Side,Chicago,Pizza Place,Furniture / Home Store,Bar,Gym,Sporting Goods Shop,Clothing Store,Grocery Store,Burger Joint,Theater,American Restaurant,2,41.904455,-87.654414
20,Hudson Yards,Manhattan,New York,Dance Studio,Theater,Hotel,Gym / Fitness Center,Italian Restaurant,American Restaurant,Pizza Place,Indie Theater,Gym,Music Venue,2,40.756658,-74.000111
32,Midtown,Manhattan,New York,Theater,Coffee Shop,Sandwich Place,Gym,Plaza,Hotel,Chinese Restaurant,Sporting Goods Shop,Toy / Game Store,Steakhouse,2,40.754691,-73.981669


In [165]:
# Generate Venues that appear in the Top10 Categories by Frequency
venues2 = df_merged_downtown.loc[df_merged_downtown['Cluster Labels'] == 2, :].iloc[:, 3:13].values.flatten()
unique_elements, counts_elements = np.unique(venues2, return_counts=True)
df_venues2 = pd.DataFrame()
df_venues2['Counts'] = sorted(counts_elements)
df_venues2['VenueCategory'] = [x for _,x in sorted(zip(counts_elements,unique_elements))]
df_venues2.sort_values('Counts', ascending=False)

Unnamed: 0,Counts,VenueCategory
68,18,Coffee Shop
67,12,American Restaurant
66,11,Hotel
65,10,Pizza Place
64,10,Park
63,9,Gym / Fitness Center
62,8,Italian Restaurant
60,7,Gym
59,7,Burger Joint
61,7,Theater


Cluster 4

In [166]:
df_merged_downtown.loc[df_merged_downtown['Cluster Labels'] == 3, ['Neighborhood', 'City']].values

array([['East Harlem', 'New York'],
       ['East Village', 'New York'],
       ['Hamilton Heights', 'New York'],
       ['Inwood', 'New York'],
       ['Little Italy/University Village', 'Chicago'],
       ['Manhattan Valley', 'New York'],
       ['Manhattanville', 'New York'],
       ['Morningside Heights', 'New York'],
       ['Roosevelt Island', 'New York'],
       ['Stuyvesant Town', 'New York'],
       ['Upper West Side', 'New York'],
       ['Washington Heights', 'New York'],
       ['Yorkville', 'New York']], dtype=object)

In [167]:
df_merged_downtown.loc[df_merged_downtown['Cluster Labels'] == 3, :]

Unnamed: 0,Neighborhood,Borough,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels,Latitude,Longitude
9,East Harlem,Manhattan,New York,Bakery,Mexican Restaurant,Pizza Place,Café,Plaza,Thai Restaurant,Latin American Restaurant,Deli / Bodega,Park,Cocktail Bar,3,40.792249,-73.944182
10,East Village,Manhattan,New York,Cocktail Bar,Ice Cream Shop,Coffee Shop,Wine Bar,Italian Restaurant,Japanese Restaurant,Vegetarian / Vegan Restaurant,Bar,Chinese Restaurant,Pizza Place,3,40.727847,-73.982226
19,Hamilton Heights,Manhattan,New York,Coffee Shop,Mexican Restaurant,Café,Bar,Yoga Studio,Park,Scenic Lookout,Chinese Restaurant,Caribbean Restaurant,American Restaurant,3,40.823604,-73.949688
22,Inwood,Manhattan,New York,Latin American Restaurant,Pizza Place,Mexican Restaurant,Café,Wine Bar,Lounge,Deli / Bodega,Spanish Restaurant,Chinese Restaurant,Bakery,3,40.867684,-73.92121
26,Little Italy/University Village,Near West Side,Chicago,Sandwich Place,Italian Restaurant,Bar,Thai Restaurant,Pizza Place,Park,Breakfast Spot,Theater,Pharmacy,Donut Shop,3,41.868607,-87.660579
29,Manhattan Valley,Manhattan,New York,Park,Coffee Shop,Indian Restaurant,Pizza Place,Bar,Grocery Store,Mexican Restaurant,Chinese Restaurant,Ice Cream Shop,Dog Run,3,40.797307,-73.964286
30,Manhattanville,Manhattan,New York,Mexican Restaurant,American Restaurant,Seafood Restaurant,Italian Restaurant,Park,Café,Tennis Court,Coffee Shop,Southern / Soul Food Restaurant,Art Gallery,3,40.816934,-73.957385
34,Morningside Heights,Manhattan,New York,Coffee Shop,Italian Restaurant,Park,American Restaurant,Seafood Restaurant,Bakery,Mexican Restaurant,Bookstore,Restaurant,Café,3,40.808,-73.963896
45,Roosevelt Island,Manhattan,New York,Park,Sushi Restaurant,Coffee Shop,Pizza Place,Greek Restaurant,Deli / Bodega,Italian Restaurant,Bar,Mexican Restaurant,Café,3,40.76216,-73.949168
49,Stuyvesant Town,Manhattan,New York,Bar,Cocktail Bar,Pizza Place,Park,Coffee Shop,Italian Restaurant,Bagel Shop,American Restaurant,Ice Cream Shop,Dog Run,3,40.731,-73.974052


In [169]:
# Generate Venues that appear in the Top10 Categories by Frequency
venues3 = df_merged_downtown.loc[df_merged_downtown['Cluster Labels'] == 3, :].iloc[:, 3:13].values.flatten()
unique_elements, counts_elements = np.unique(venues3, return_counts=True)
df_venues3 = pd.DataFrame()
df_venues3['Counts'] = sorted(counts_elements)
df_venues3['VenueCategory'] = [x for _,x in sorted(zip(counts_elements,unique_elements))]
df_venues3.sort_values('Counts', ascending=False)

Unnamed: 0,Counts,VenueCategory
44,10,Park
43,9,Pizza Place
42,9,Mexican Restaurant
41,9,Coffee Shop
40,9,Bar
39,8,Italian Restaurant
38,7,Café
37,5,Ice Cream Shop
36,5,Bakery
35,5,American Restaurant


Cluster 5

In [170]:
df_merged_downtown.loc[df_merged_downtown['Cluster Labels'] == 4, ['Neighborhood', 'City']].values

array([['Illinois Medical District', 'Chicago'],
       ['Marble Hill', 'New York'],
       ['Tri-Taylor', 'Chicago']], dtype=object)

In [171]:
df_merged_downtown.loc[df_merged_downtown['Cluster Labels'] == 4, :]

Unnamed: 0,Neighborhood,Borough,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue,Cluster Labels,Latitude,Longitude
21,Illinois Medical District,Near West Side,Chicago,Sandwich Place,Pizza Place,Thai Restaurant,Mexican Restaurant,Train Station,Sports Bar,Middle Eastern Restaurant,Italian Restaurant,Bar,Hot Dog Joint,4,41.868494,-87.673975
31,Marble Hill,Manhattan,New York,Park,Mexican Restaurant,Pharmacy,Spanish Restaurant,Supermarket,Donut Shop,Café,Pizza Place,Coffee Shop,Shoe Store,4,40.876551,-73.91066
52,Tri-Taylor,Near West Side,Chicago,Fast Food Restaurant,Café,Bus Station,Sandwich Place,Park,Pizza Place,Hot Dog Joint,Bus Line,Lounge,Bike Rental / Bike Share,4,41.869102,-87.684744


In [172]:
# Generate Venues that appear in the Top10 Categories by Frequency
venues4 = df_merged_downtown.loc[df_merged_downtown['Cluster Labels'] == 4, :].iloc[:, 3:13].values.flatten()
unique_elements, counts_elements = np.unique(venues4, return_counts=True)
df_venues4 = pd.DataFrame()
df_venues4['Counts'] = sorted(counts_elements)
df_venues4['VenueCategory'] = [x for _,x in sorted(zip(counts_elements,unique_elements))]
df_venues4.sort_values('Counts', ascending=False)

Unnamed: 0,Counts,VenueCategory
22,3,Pizza Place
21,2,Sandwich Place
20,2,Park
19,2,Mexican Restaurant
18,2,Hot Dog Joint
17,2,Café
12,1,Spanish Restaurant
16,1,Train Station
15,1,Thai Restaurant
14,1,Supermarket
