Import libraries

In [1]:
import pandas as pd
import numpy as np

# !conda install -c conda-forge geopy --yes # already installed
from geopy.geocoders import Nominatim #converts address to latlong value

# matplotlib libraries to be used with mapping
import matplotlib.cm as cm
import matplotlib.colors as colors

# use for clustering
from sklearn.cluster import KMeans

# map library
# !conda install -c conda-forge folium=0.5.0 --yes # already installed
import folium

import requests
from pandas.io.json import json_normalize

Read NYC data in from ny.gov

In [2]:
dfNY = pd.read_html('https://www.health.ny.gov/statistics/cancer/registry/appendix/neighborhoods.htm')[0]
dfNY.head()

Unnamed: 0,Borough,Neighborhood,ZIP Codes
0,Bronx,Central Bronx,"10453, 10457, 10460"
1,Bronx,Bronx Park and Fordham,"10458, 10467, 10468"
2,Bronx,High Bridge and Morrisania,"10451, 10452, 10456"
3,Bronx,Hunts Point and Mott Haven,"10454, 10455, 10459, 10474"
4,Bronx,Kingsbridge and Riverdale,"10463, 10471"


Read Boston data from archive.boston.com

In [3]:
dfBST = pd.read_html('http://archive.boston.com/news/local/articles/2007/04/15/sixfigurezipcodes_city/')[6]
dfBST.head()

Unnamed: 0,Zip code,Community,Totaltax returnsfiled for 2004,"Total$100,000+returns","Percent of$100,000+returns"
0,,,,,
1,2101.0,Downtown Boston,34.0,,
2,2108.0,Beacon Hill,2440.0,963.0,39%
3,2109.0,Markets / Inner Harbor,3230.0,1175.0,36%
4,2110.0,Financial District / Wharves,2608.0,1229.0,47%


Clean Boston data

In [4]:
dfBST = dfBST[['Zip code','Community']]
dfBST.rename(columns={'Community':'Neighborhood'}, inplace = True)
dfBST.dropna(inplace = True)
dfBST.reset_index(inplace=True)
dfBST.drop(['index'], axis = 1, inplace = True)
dfBST['Zip code'] = dfBST['Zip code'].astype(int) #change zipcodes from floats to ints, removing decimal
dfBST['Zip code'] = '0' + dfBST['Zip code'].astype(str) #changes zipcodes to str and appends 0 that cannot be held by int
# all Boston zips have a leading zero, but the geolocational data needs the leading zero so it cannot remain an int
dfBST.head()

Unnamed: 0,Zip code,Neighborhood
0,2101,Downtown Boston
1,2108,Beacon Hill
2,2109,Markets / Inner Harbor
3,2110,Financial District / Wharves
4,2111,Chinatown / Tufts-New England Medical Center


Get latlong for both NYC and Boston

In [5]:
nyAddress = 'New York City, NY'
geolocator = Nominatim(user_agent="museum_explorer")
nyLocation = geolocator.geocode(nyAddress)
nyLatitude = nyLocation.latitude
nyLongitude = nyLocation.longitude
print("NYC is located at coordinates {}, {}".format(nyLatitude, nyLongitude))


bstAddress = 'Boston, MA'
#bstGeolocator = Nominatim(user_agent="museum_explorer")
bstLocation = geolocator.geocode(bstAddress)
bstLatitude = bstLocation.latitude
bstLongitude = bstLocation.longitude
print("Boston is located at coordinates {}, {}".format(bstLatitude,bstLongitude))

NYC is located at coordinates 40.7127281, -74.0060152
Boston is located at coordinates 42.3602534, -71.0582912


Break up Boston neighborhoods containing a '/'

In [6]:
dfBSTTempN = pd.DataFrame(columns = ['Zip code','Neighborhood'])
dfBSTTempN


for ind in range(0,len(dfBST)):
    if("/" in dfBST.iloc[ind, 1]):
        tempNeighborhood = (dfBST.iloc[ind,1]).split(" / ")
        for arr in tempNeighborhood:
            dfBSTTempN = dfBSTTempN.append((pd.DataFrame([dfBST.iloc[ind,0],arr]).transpose()),ignore_index = True)
    else:
        tempNeighborhood = dfBST.iloc[ind,1]
        dfBSTTempN = dfBSTTempN.append((pd.DataFrame([dfBST.iloc[ind,0], tempNeighborhood]).transpose()),ignore_index = True)
dfBST = dfBSTTempN.dropna(axis = 'columns')
dfBST.columns = ['Zip code', 'Neighborhood']
dfBST

Unnamed: 0,Zip code,Neighborhood
0,02101,Downtown Boston
1,02108,Beacon Hill
2,02109,Markets
3,02109,Inner Harbor
4,02110,Financial District
...,...,...
60,02228,East Boston
61,02238,Cambridge
62,02445,Brookline
63,02446,Brookline


Import CSV file with latlong of every U.S. zipcode

In [7]:
dfZips = pd.read_csv("zipLatLong.csv", index_col = False)
bstZips = dfZips[dfZips['State'] == "MA"]
bstZips = bstZips[bstZips['City'] == "Boston"]
#nyZips = dfZips[dfZips['City'] == "New York"]
#Test to see if above is what is responsible for disparity between zips and dfNY
nyZips = dfZips[dfZips['State'] == "NY"]
bstZips.reset_index(inplace = True)
nyZips.reset_index(inplace = True)
bstZips = bstZips[['Zip','Latitude','Longitude']]
nyZips = nyZips[['Zip','Latitude','Longitude']]
nyZips.head()




# todo

Unnamed: 0,Zip,Latitude,Longitude
0,10940,41.449364,-74.43951
1,12930,44.731571,-74.53485
2,14228,43.023347,-78.79494
3,10706,40.992084,-73.8729
4,13851,42.206745,-75.74488


Merge boston zip codes df with the Boston latlong df

In [8]:
#merge zip codes with their latlongs in Boston
bstZips['Zip'] = '0' + bstZips['Zip'].astype(str) #add leading zeros and cast to string
bstZips.set_index('Zip', inplace = True) #set zip to index
dfBST.set_index('Zip code', inplace = True)
dfBST = dfBST.join(bstZips, lsuffix='_caller',rsuffix='_other')
dfBST.dropna(inplace= True)
dfBST

Unnamed: 0,Neighborhood,Latitude,Longitude
2101,Downtown Boston,42.370567,-71.026964
2108,Beacon Hill,42.357903,-71.06408
2109,Markets,42.361477,-71.05417
2109,Inner Harbor,42.361477,-71.05417
2110,Financial District,42.356532,-71.05365
2110,Wharves,42.356532,-71.05365
2111,Chinatown,42.349838,-71.06101
2111,Tufts-New England Medical Center,42.349838,-71.06101
2112,Downtown Boston,42.338947,-70.919635
2113,North End,42.365028,-71.05636


Before we merge NYC zip codes df with NYC latlong df, we need to seperate the multiple zip codes for each neighborhood into individual rows.

In [9]:
dfNY.head()

Unnamed: 0,Borough,Neighborhood,ZIP Codes
0,Bronx,Central Bronx,"10453, 10457, 10460"
1,Bronx,Bronx Park and Fordham,"10458, 10467, 10468"
2,Bronx,High Bridge and Morrisania,"10451, 10452, 10456"
3,Bronx,Hunts Point and Mott Haven,"10454, 10455, 10459, 10474"
4,Bronx,Kingsbridge and Riverdale,"10463, 10471"


Create temporary dataframe to hold dfNY data with one zipcode per row

In [10]:
dfNYTemp = pd.DataFrame(columns = ['Borough','Neighborhood','Zip code'])
dfNYTemp

Unnamed: 0,Borough,Neighborhood,Zip code


Break zipcodes and neighborhoods into individual rows for NYC data

In [11]:
for ind in range(0,len(dfNY)):
    if("," in dfNY.loc[ind,'ZIP Codes']):
        tempZips = (dfNY.loc[ind,'ZIP Codes']).split(",")
        for arr in tempZips:
            dfNYTemp = dfNYTemp.append((pd.DataFrame([dfNY.iloc[ind,0], dfNY.iloc[ind,1],arr]).transpose()),ignore_index = True)
    else:
        tempZips = dfNY.loc[ind,'ZIP Codes']
        dfNYTemp = dfNYTemp.append((pd.DataFrame([dfNY.iloc[ind,0], dfNY.iloc[ind,1],tempZips]).transpose()),ignore_index = True)

dfNY = dfNYTemp.dropna(axis = 'columns')
dfNY.columns = ['Borough', 'Neighborhood', 'Zip code']
dfNY.head()

Unnamed: 0,Borough,Neighborhood,Zip code
0,Bronx,Central Bronx,10453
1,Bronx,Central Bronx,10457
2,Bronx,Central Bronx,10460
3,Bronx,Bronx Park and Fordham,10458
4,Bronx,Bronx Park and Fordham,10467


In [12]:
dfNYTempN = pd.DataFrame(columns = ['Borough','Neighborhood','Zip code'])
dfNYTempN


for ind in range(0,len(dfNY)):
    if("/" in dfNY.iloc[ind, 1]):
        tempNeighborhood = (dfNY.iloc[ind,1]).split("/")
        for arr in tempNeighborhood:
            dfNYTempN = dfNYTempN.append((pd.DataFream([dfNY.iloc[ind,0],arr,dfNY.iloc[ind,2]]).transpose()),ignore_index = True)
    else:
        tempNeighborhood = dfNY.iloc[ind,1]
        dfNYTempN = dfNYTempN.append((pd.DataFrame([dfNY.iloc[ind,0], tempNeighborhood, dfNY.iloc[ind,2]]).transpose()),ignore_index = True)
dfNY = dfNYTempN.dropna(axis = 'columns')
dfNY.columns = ['Borough','Neighborhood','Zip code']
dfNY
# TODO TESTING CODE

Unnamed: 0,Borough,Neighborhood,Zip code
0,Bronx,Central Bronx,10453
1,Bronx,Central Bronx,10457
2,Bronx,Central Bronx,10460
3,Bronx,Bronx Park and Fordham,10458
4,Bronx,Bronx Park and Fordham,10467
...,...,...,...
173,Staten Island,South Shore,10312
174,Staten Island,Stapleton and St. George,10301
175,Staten Island,Stapleton and St. George,10304
176,Staten Island,Stapleton and St. George,10305


In [13]:
nyZips['Zip'] = nyZips['Zip'].astype(str)
nyZips.dtypes

Zip           object
Latitude     float64
Longitude    float64
dtype: object

Now let's merge this dataframe with the latlong dataframe

In [14]:
nyZips.set_index('Zip', inplace = True) #set zip to index
dfNY.set_index('Zip code', inplace = True)
dfNY = dfNY.join(nyZips, lsuffix='_caller',rsuffix='_other')
nyZips
dfNY.head()

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
Zip code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10453,Bronx,Central Bronx,40.853017,-73.91214
10457,Bronx,Central Bronx,,
10460,Bronx,Central Bronx,,
10458,Bronx,Bronx Park and Fordham,40.864166,-73.88881
10467,Bronx,Bronx Park and Fordham,,


Get the shape of the dataframe for reference

In [15]:
dfNY.shape

(178, 4)

Remove values that were in NY state but not NYC

In [16]:
dfNY.dropna(inplace = True)
dfNY.shape

(42, 4)

Check the dataframe

In [17]:
dfNY

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
Zip code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10453,Bronx,Central Bronx,40.853017,-73.91214
10458,Bronx,Bronx Park and Fordham,40.864166,-73.88881
10451,Bronx,High Bridge and Morrisania,40.819729,-73.9223
10454,Bronx,Hunts Point and Mott Haven,40.805968,-73.91628
10463,Bronx,Kingsbridge and Riverdale,40.881086,-73.90749
10466,Bronx,Northeast Bronx,40.89095,-73.84702
10461,Bronx,Southeast Bronx,40.842917,-73.83819
10464,Bronx,Southeast Bronx,40.857017,-73.78903
11212,Brooklyn,Central Brooklyn,40.662191,-73.91328
11209,Brooklyn,Southwest Brooklyn,40.62327,-74.0295


Declare foursquare credentials as variables

In [47]:
#Credentials removed for privacy

print("Hush hush, credentials declared")

Hush hush, credentials declared


Venue category extracting function from lab

In [19]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Nearby venues function from lab

In [20]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()['response']['groups'][0]['items']
        #results = requests.get(url).json()
        #print(results)
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [21]:
dfNY.head()

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
Zip code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10453,Bronx,Central Bronx,40.853017,-73.91214
10458,Bronx,Bronx Park and Fordham,40.864166,-73.88881
10451,Bronx,High Bridge and Morrisania,40.819729,-73.9223
10454,Bronx,Hunts Point and Mott Haven,40.805968,-73.91628
10463,Bronx,Kingsbridge and Riverdale,40.881086,-73.90749


In [22]:
neighborhood_latitude = dfNY.iloc[0, 2] # neighborhood latitude value
neighborhood_longitude = dfNY.iloc[0, 3] # neighborhood longitude value

neighborhood_name = dfNY.iloc[0, 1] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of Central Bronx are 40.853017, -73.91214000000001.


In [23]:
LIMIT = 100

radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(CLIENT_ID,CLIENT_SECRET,VERSION,neighborhood_latitude,neighborhood_longitude,radius,LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=ZJZIGIA1FSCYFGW3J1IATTYGRZYNBUYPFVU0PMN0TJ0FYSEC&client_secret=QQQIO1UM1YM1MQF0J12BAF4XQ3XKU15NEJS2OGGK3CKRODP1&v=20180605&ll=40.853017,-73.91214000000001&radius=500&limit=100'

In [24]:
results = requests.get(url).json()
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [25]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

  nearby_venues = json_normalize(venues) # flatten JSON


Unnamed: 0,name,categories,lat,lng
0,Accra Resturant,African Restaurant,40.853871,-73.908421
1,Bravo Supermarkets,Grocery Store,40.854107,-73.914162
2,Papa John's Pizza,Pizza Place,40.852429,-73.908976
3,T-Mobile,Mobile Phone Shop,40.850147,-73.916581
4,Food Dynasty,Supermarket,40.853772,-73.909267


Get venues from NYC and Boston Neighborhoods

In [26]:
nyVenues = getNearbyVenues(names=dfNY['Neighborhood'],
                          latitudes=dfNY['Latitude'],
                          longitudes=dfNY['Longitude'])
nyVenues

bstVenues = getNearbyVenues(names=dfBST['Neighborhood'],
                           latitudes=dfBST['Latitude'],
                           longitudes=dfBST['Longitude'])

bstVenues
#nyVenues
bstVenues

Central Bronx
Bronx Park and Fordham
High Bridge and Morrisania
Hunts Point and Mott Haven
Kingsbridge and Riverdale
Northeast Bronx
Southeast Bronx
Southeast Bronx
Central Brooklyn
Southwest Brooklyn
Borough Park
Canarsie and Flatlands
Southern Brooklyn
Northwest Brooklyn
Flatbush
East New York and New Lots
Greenpoint
Sunset Park
Bushwick and Williamsburg
Central Harlem
Chelsea and Clinton
East Harlem
Gramercy Park and Murray Hill
Greenwich Village and Soho
Lower East Side
Upper East Side
Upper West Side
Inwood and Washington Heights
Northeast Queens
North Queens
Central Queens
Jamaica
Northwest Queens
West Central Queens
Rockaways
Southeast Queens
Southwest Queens
West Queens
Port Richmond
South Shore
Stapleton and St. George
Mid-Island
Downtown Boston
Beacon Hill
Markets
Inner Harbor
Financial District
Wharves
Chinatown
Tufts-New England Medical Center
Downtown Boston
North End
West End
Back of the Hill
Fenway
East Fens
Longwood
Back Bay
Downtown Boston
South End
Roxbury
Roxbury Cro

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Downtown Boston,42.370567,-71.026964,Alamo Rent A Car,42.368561,-71.029946,Rental Car Location
1,Downtown Boston,42.370567,-71.026964,National Car Rental,42.368013,-71.030275,Rental Car Location
2,Downtown Boston,42.370567,-71.026964,Memorial Park,42.372588,-71.031452,Park
3,Downtown Boston,42.370567,-71.026964,Unified Rental Car Center,42.368447,-71.030115,Rental Car Location
4,Downtown Boston,42.370567,-71.026964,Enterprise Rent-A-Car,42.368700,-71.029824,Rental Car Location
...,...,...,...,...,...,...,...
1784,Boston University,42.346997,-71.102150,McDonald's,42.348541,-71.096742,Fast Food Restaurant
1785,Boston University,42.346997,-71.102150,Foundation Room,42.347280,-71.096302,Lounge
1786,Boston University,42.346997,-71.102150,Qdoba Mexican Grill,42.348423,-71.096802,Mexican Restaurant
1787,Boston University,42.346997,-71.102150,Dunkin',42.346603,-71.096306,Donut Shop


Begin analysis of each neighborhood in NYC

In [27]:
#one hot encoding
nyOnehot = pd.get_dummies(nyVenues[['Venue Category']], prefix = "", prefix_sep = "")

#re-add neighborhood column to dataframe
nyOnehot['Neighborhood'] = nyVenues['Neighborhood']

#move neighborhood column for ease of reading
fixedColumns = [nyOnehot.columns[-1]] + list(nyOnehot.columns[:-1])
nyOnehot = nyOnehot[fixedColumns]

nyOnehot.head()

Unnamed: 0,Neighborhood,Accessories Store,African Restaurant,American Restaurant,Arepa Restaurant,Argentinian Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Vegetarian / Vegan Restaurant,Venezuelan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,Central Bronx,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Central Bronx,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Central Bronx,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Central Bronx,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Central Bronx,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Begin analysis of each neighborhood in Boston

In [28]:
#one hot encoding
bstOnehot = pd.get_dummies(bstVenues[['Venue Category']], prefix = "", prefix_sep = "")

#re-add neighborhood column to dataframe
bstOnehot.drop(['Neighborhood'], axis = 1, inplace = True)
bstOnehot['Neighborhood'] = bstVenues['Neighborhood']

#move neighborhood column for ease of reading
fixedColumn = [bstOnehot.columns[-1]] + list(bstOnehot.columns[:-1])
bstOnehot = bstOnehot[fixedColumn]

bstOnehot

Unnamed: 0,Neighborhood,Accessories Store,African Restaurant,Airport,Airport Terminal,American Restaurant,Aquarium,Arepa Restaurant,Art Gallery,Art Museum,...,Track,Trail,Udon Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,Downtown Boston,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Downtown Boston,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Downtown Boston,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Downtown Boston,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Downtown Boston,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1784,Boston University,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1785,Boston University,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1786,Boston University,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1787,Boston University,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Group NY data

In [29]:
nyGrouped = nyOnehot.groupby('Neighborhood').mean().reset_index()
nyGrouped

Unnamed: 0,Neighborhood,Accessories Store,African Restaurant,American Restaurant,Arepa Restaurant,Argentinian Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Vegetarian / Vegan Restaurant,Venezuelan Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,Borough Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.064516,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Bronx Park and Fordham,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.027778,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Bushwick and Williamsburg,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Canarsie and Flatlands,0.0,0.0,0.035714,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bronx,0.0,0.030303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Central Brooklyn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Central Harlem,0.0,0.033898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.016949,0.0,0.0,0.0,0.0,0.0,0.016949,0.016949,0.0,0.0
7,Central Queens,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.033333,0.033333,...,0.0,0.0,0.033333,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Chelsea and Clinton,0.0,0.0,0.034091,0.0,0.011364,0.011364,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,East Harlem,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Group Boston data

In [30]:
bstGrouped = bstOnehot.groupby('Neighborhood').mean().reset_index()
bstGrouped

Unnamed: 0,Neighborhood,Accessories Store,African Restaurant,Airport,Airport Terminal,American Restaurant,Aquarium,Arepa Restaurant,Art Gallery,Art Museum,...,Track,Trail,Udon Restaurant,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,Back Bay,0.01,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.02,0.02,0.0
1,Back of the Hill,0.0,0.0,0.0,0.0,0.05,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025
2,Beacon Hill,0.0,0.0,0.0,0.0,0.034091,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.011364,0.0,0.0,0.0,0.0,0.011364
3,Boston University,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,0.0,0.0,...,0.0,0.015625,0.015625,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Cambridge,0.0,0.0,0.0,0.0,0.0,0.0,0.029412,0.0,0.0,...,0.0,0.0,0.0,0.029412,0.0,0.0,0.029412,0.0,0.0,0.0
5,Chinatown,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.01,0.0,0.01,0.01,0.0,0.0,0.01
6,Codman Square,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429
7,Dorchester,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.044444,0.0,0.0,0.0,0.022222
8,Downtown Boston,0.0,0.0,0.011628,0.034884,0.046512,0.0,0.0,0.011628,0.011628,...,0.0,0.0,0.0,0.011628,0.0,0.0,0.011628,0.0,0.0,0.0
9,East Boston,0.0,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.047619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Define function to fetch columns containing word museum in each grouped dataframe

In [33]:
def getMuseums(df):
    museumColumns = [col for col in df.columns if 'Museum' in col]
    return museumColumns

Fetch museum columns list

In [35]:
nyMuseumCols = getMuseums(nyGrouped)
#nyMuseumCols

bstMuseumCols = getMuseums(bstGrouped)
#bstMuseumCols

['Art Museum', 'History Museum', 'Museum']

In [44]:
def neighborhoodMax(df):
    max = 0
    bestNeighborhood = "none"
    for ind in range(0,len(df)):
        sum = df.loc[ind, 'Art Museum'] + df.loc[ind, 'History Museum'] + df.loc[ind, 'Museum']
        if sum > max:
            max = sum
            bestNeighborhood = df.iloc[ind,0]
    return [bestNeighborhood,str(max)]

In [46]:
bestNY = neighborhoodMax(nyGrouped)
bestBST = neighborhoodMax(bstGrouped)
print('The best neighborhood for museums in NYC is {}, with a proportion of {} museums in the neighborhood!'.format(bestNY[0],
                                                                                                                   bestNY[1]))
print('The best neighborhood for museums in Boston is {}, with a proportion of {} museums in the neighborhood!'.format(bestBST[0],
                                                                                                                      bestBST[1]))

The best neighborhood for museums in NYC is Bronx Park and Fordham, with a proportion of 0.027777777777777776 museums in the neighborhood!
The best neighborhood for museums in Boston is East Fens, with a proportion of 0.15625 museums in the neighborhood!


Ok! Boston seems like the place to be for museum density.