# The Battle of Neighborhoods Week-2

### Lets import the libraries

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

#!conda install -c conda-forge folium=0.5.0 --yes 

import csv # implements classes to read and write tabular data in CSV form

print('Libraries imported.')

Libraries imported.


In [2]:
import sys
!{sys.executable} -m pip install geocoder
!{sys.executable} -m pip install folium

print('Packages installed.')

Packages installed.


In [3]:
import folium # map rendering library

In [4]:
# Download New York Dataset
!wget -q -O 'newyork_data.json' https://ibm.box.com/shared/static/fbpwbovar7lf8p5sgddm06cgipa2rxpe.json
print('Data downloaded!')

Data downloaded!


Load and Explore Data

In [5]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [6]:
neighborhoods_data = newyork_data['features']
neighborhoods_data[0]

{'type': 'Feature',
 'id': 'nyu_2451_34572.1',
 'geometry': {'type': 'Point',
  'coordinates': [-73.84720052054902, 40.89470517661]},
 'geometry_name': 'geom',
 'properties': {'name': 'Wakefield',
  'stacked': 1,
  'annoline1': 'Wakefield',
  'annoline2': None,
  'annoline3': None,
  'annoangle': 0.0,
  'borough': 'Bronx',
  'bbox': [-73.84720052054902,
   40.89470517661,
   -73.84720052054902,
   40.89470517661]}}

### Tranform the data into a pandas dataframe
The next task is essentially transforming this data of nested Python dictionaries into a pandas dataframe. Start by creating an empty dataframe.

In [7]:
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

In [8]:
neighborhoods

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude


Then loop through the data and fill the dataframe one row at a time.

In [9]:
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [10]:
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


We will check that the dataset has all 5 boroughs and 306 neighborhoods.

In [11]:
print('The dataframe has {} boroughs and {} neighborhoods.'.format(
        len(neighborhoods['Borough'].unique()),
        neighborhoods.shape[0]
    )
)

The dataframe has 5 boroughs and 306 neighborhoods.


### Write file in CSV

In [12]:
neighborhoods.to_csv('Battle_of_Neighborhood.csv',index=False)

### Retrieve Geoloaction

In [13]:
address = 'New York City, NY'

geolocator = Nominatim(user_agent="Newyork")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 40.7127281, -74.0060152.


### Creating Map

In [14]:
# create map of Toronto using latitude and longitude values
map_NewYork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_NewYork)  
    
map_NewYork

### Web scrapping of Population and Demographics data of New York city from Wikipedia

### POPULATION DATA

Web scrapping of Population data from wikipedia page - https://en.wikipedia.org/wiki/New_York_City

In [15]:
!pip install beautifulsoup4
from bs4 import BeautifulSoup # package for parsing HTML and XML documents




##### Web scrapping of Population data from wikipedia page using BeautifulSoup.

Beautiful Soup is a Python package for parsing HTML and XML documents (including having malformed markup, i.e. non-closed tags, so named after tag soup). It creates a parse tree for parsed pages that can be used to extract data from HTML, which is useful for web scraping.


In [45]:
website_url = requests.get('https://en.wikipedia.org/wiki/Demographics_of_New_York_City').text
soup = BeautifulSoup(website_url,'lxml')
table = soup.find('table',{'class':'wikitable sortable'})
#print(soup.prettify())

headers = [header.text for header in table.find_all('th')]

table_rows = table.find_all('tr')        
rows = []
for row in table_rows:
   td = row.find_all('td')
   row = [row.text for row in td]
   rows.append(row)

with open('Battle_of Neighborhood_population.csv', 'w') as f:
   writer = csv.writer(f)
   writer.writerow(headers)
   writer.writerows(row for row in rows if row)

In [46]:
population=pd.read_csv('Battle_of Neighborhood_population.csv')
population

Unnamed: 0,New York City's five boroughsvte,Jurisdiction,Population,Gross Domestic Product,Land area,Density,Borough,County,Estimate (2018)[12],billions(US$)[13],per capita(US$),square miles,squarekm,persons / sq. mi,persons /km2
0,The Bronx\n,\n Bronx\n,"1,432,132\n",42.695\n,"29,200\n",42.10\n,109.04\n,"34,653\n","13,231\n",,,,,,
1,Brooklyn\n,\n Kings\n,"2,582,830\n",91.559\n,"34,600\n",70.82\n,183.42\n,"37,137\n","14,649\n",,,,,,
2,Manhattan\n,\n New York\n,"1,628,701\n",600.244\n,"360,900\n",22.83\n,59.13\n,"72,033\n","27,826\n",,,,,,
3,Queens\n,\n Queens\n,"2,278,906\n",93.310\n,"39,600\n",108.53\n,281.09\n,"21,460\n","8,354\n",,,,,,
4,Staten Island\n,\n Richmond\n,"476,179\n",14.514\n,"30,300\n",58.37\n,151.18\n,"8,112\n","3,132\n",,,,,,
5,City of New York,8398748,842.343,97700,302.64,783.83,28188,"10,947\n",,,,,,,
6,State of New York,19745289,1701.399,85700,47214,122284,416.4,159\n,,,,,,,
7,Sources:[14] and see individual borough articl...,,,,,,,,,,,,,,


In [47]:
population.drop(population.columns[[3,8,9,10,11,12,13,14]], axis=1,inplace=True)
population

Unnamed: 0,New York City's five boroughsvte,Jurisdiction,Population,Land area,Density,Borough,County
0,The Bronx\n,\n Bronx\n,"1,432,132\n","29,200\n",42.10\n,109.04\n,"34,653\n"
1,Brooklyn\n,\n Kings\n,"2,582,830\n","34,600\n",70.82\n,183.42\n,"37,137\n"
2,Manhattan\n,\n New York\n,"1,628,701\n","360,900\n",22.83\n,59.13\n,"72,033\n"
3,Queens\n,\n Queens\n,"2,278,906\n","39,600\n",108.53\n,281.09\n,"21,460\n"
4,Staten Island\n,\n Richmond\n,"476,179\n","30,300\n",58.37\n,151.18\n,"8,112\n"
5,City of New York,8398748,842.343,302.64,783.83,28188,"10,947\n"
6,State of New York,19745289,1701.399,47214,122284,416.4,159\n
7,Sources:[14] and see individual borough articl...,,,,,,


In [48]:
population.rename(columns = {'Jurisdiction\n':'County',
                   'Population\n':'Estimate_2018', 
                   'Landarea\n':'square_miles',
                    'Density\n':'square_km',
                    'Borough':'persons_sq_mi','County':'persons_sq_km'}, inplace=True)
population

Unnamed: 0,New York City's five boroughsvte,County,Estimate_2018,Land area,square_km,persons_sq_mi,persons_sq_km
0,The Bronx\n,\n Bronx\n,"1,432,132\n","29,200\n",42.10\n,109.04\n,"34,653\n"
1,Brooklyn\n,\n Kings\n,"2,582,830\n","34,600\n",70.82\n,183.42\n,"37,137\n"
2,Manhattan\n,\n New York\n,"1,628,701\n","360,900\n",22.83\n,59.13\n,"72,033\n"
3,Queens\n,\n Queens\n,"2,278,906\n","39,600\n",108.53\n,281.09\n,"21,460\n"
4,Staten Island\n,\n Richmond\n,"476,179\n","30,300\n",58.37\n,151.18\n,"8,112\n"
5,City of New York,8398748,842.343,302.64,783.83,28188,"10,947\n"
6,State of New York,19745289,1701.399,47214,122284,416.4,159\n
7,Sources:[14] and see individual borough articl...,,,,,,


In [49]:
population.rename(columns={"New York City's five boroughsvte\n":'Borough'}, inplace=1)
population.head()

Unnamed: 0,Borough,County,Estimate_2018,Land area,square_km,persons_sq_mi,persons_sq_km
0,The Bronx\n,\n Bronx\n,"1,432,132\n","29,200\n",42.10\n,109.04\n,"34,653\n"
1,Brooklyn\n,\n Kings\n,"2,582,830\n","34,600\n",70.82\n,183.42\n,"37,137\n"
2,Manhattan\n,\n New York\n,"1,628,701\n","360,900\n",22.83\n,59.13\n,"72,033\n"
3,Queens\n,\n Queens\n,"2,278,906\n","39,600\n",108.53\n,281.09\n,"21,460\n"
4,Staten Island\n,\n Richmond\n,"476,179\n","30,300\n",58.37\n,151.18\n,"8,112\n"


In [50]:
population.rename(columns={"Land area\n":'square_mi'}, inplace=1)
population

Unnamed: 0,Borough,County,Estimate_2018,square_mi,square_km,persons_sq_mi,persons_sq_km
0,The Bronx\n,\n Bronx\n,"1,432,132\n","29,200\n",42.10\n,109.04\n,"34,653\n"
1,Brooklyn\n,\n Kings\n,"2,582,830\n","34,600\n",70.82\n,183.42\n,"37,137\n"
2,Manhattan\n,\n New York\n,"1,628,701\n","360,900\n",22.83\n,59.13\n,"72,033\n"
3,Queens\n,\n Queens\n,"2,278,906\n","39,600\n",108.53\n,281.09\n,"21,460\n"
4,Staten Island\n,\n Richmond\n,"476,179\n","30,300\n",58.37\n,151.18\n,"8,112\n"
5,City of New York,8398748,842.343,302.64,783.83,28188,"10,947\n"
6,State of New York,19745289,1701.399,47214,122284,416.4,159\n
7,Sources:[14] and see individual borough articl...,,,,,,


In [51]:
population['Borough']=population['Borough'].replace(to_replace='\n', value='', regex=True)
population['County']=population['County'].replace(to_replace='\n', value='', regex=True)
population['Estimate_2018']=population['Estimate_2018'].replace(to_replace='\n', value='', regex=True)
population['square_mi']=population['square_mi'].replace(to_replace='\n', value='', regex=True)

population

Unnamed: 0,Borough,County,Estimate_2018,square_mi,square_km,persons_sq_mi,persons_sq_km
0,The Bronx,Bronx,1432132.0,29200.0,42.10\n,109.04\n,"34,653\n"
1,Brooklyn,Kings,2582830.0,34600.0,70.82\n,183.42\n,"37,137\n"
2,Manhattan,New York,1628701.0,360900.0,22.83\n,59.13\n,"72,033\n"
3,Queens,Queens,2278906.0,39600.0,108.53\n,281.09\n,"21,460\n"
4,Staten Island,Richmond,476179.0,30300.0,58.37\n,151.18\n,"8,112\n"
5,City of New York,8398748,842.343,302.64,783.83,28188,"10,947\n"
6,State of New York,19745289,1701.399,47214.0,122284,416.4,159\n
7,Sources:[14] and see individual borough articles,,,,,,


In [52]:
population['persons_sq_km']=population['persons_sq_km'].replace(to_replace='\n', value='', regex=True)
population['square_km']=population['square_km'].replace(to_replace='\n', value='', regex=True)
population['persons_sq_mi']=population['persons_sq_mi'].replace(to_replace='\n', value='', regex=True)
population

Unnamed: 0,Borough,County,Estimate_2018,square_mi,square_km,persons_sq_mi,persons_sq_km
0,The Bronx,Bronx,1432132.0,29200.0,42.1,109.04,34653.0
1,Brooklyn,Kings,2582830.0,34600.0,70.82,183.42,37137.0
2,Manhattan,New York,1628701.0,360900.0,22.83,59.13,72033.0
3,Queens,Queens,2278906.0,39600.0,108.53,281.09,21460.0
4,Staten Island,Richmond,476179.0,30300.0,58.37,151.18,8112.0
5,City of New York,8398748,842.343,302.64,783.83,28188.0,10947.0
6,State of New York,19745289,1701.399,47214.0,122284.0,416.4,159.0
7,Sources:[14] and see individual borough articles,,,,,,


In [53]:
population.loc[5:,['persons_sq_mi','persons_sq_km']] = population.loc[2:,['persons_sq_mi','persons_sq_km']].shift(1,axis=1)
population.loc[5:,['square_km','persons_sq_mi']] = population.loc[2:,['square_km','persons_sq_mi']].shift(1,axis=1)
population.loc[5:,['square_mi','square_km']] = population.loc[2:,['square_mi','square_km']].shift(1,axis=1)
population.loc[5:,['Estimate_2018','square_mi']] = population.loc[2:,['Estimate_2018','square_mi']].shift(1,axis=1)
population.loc[5:,['County','Estimate_2018']] = population.loc[2:,['County','Estimate_2018']].shift(1,axis=1)
population.loc[5:,['Borough','County']] = population.loc[2:,['Borough','County']].shift(1,axis=1)
population

Unnamed: 0,Borough,County,Estimate_2018,square_mi,square_km,persons_sq_mi,persons_sq_km
0,The Bronx,Bronx,1432132.0,29200.0,42.1,109.04,34653.0
1,Brooklyn,Kings,2582830.0,34600.0,70.82,183.42,37137.0
2,Manhattan,New York,1628701.0,360900.0,22.83,59.13,72033.0
3,Queens,Queens,2278906.0,39600.0,108.53,281.09,21460.0
4,Staten Island,Richmond,476179.0,30300.0,58.37,151.18,8112.0
5,,City of New York,8398748.0,842.343,302.64,783.83,28188.0
6,,State of New York,19745289.0,1701.399,47214.0,122284.0,416.4
7,,Sources:[14] and see individual borough articles,,,,,


In [54]:
population = population.fillna('')
population

Unnamed: 0,Borough,County,Estimate_2018,square_mi,square_km,persons_sq_mi,persons_sq_km
0,The Bronx,Bronx,1432132.0,29200.0,42.1,109.04,34653.0
1,Brooklyn,Kings,2582830.0,34600.0,70.82,183.42,37137.0
2,Manhattan,New York,1628701.0,360900.0,22.83,59.13,72033.0
3,Queens,Queens,2278906.0,39600.0,108.53,281.09,21460.0
4,Staten Island,Richmond,476179.0,30300.0,58.37,151.18,8112.0
5,,City of New York,8398748.0,842.343,302.64,783.83,28188.0
6,,State of New York,19745289.0,1701.399,47214.0,122284.0,416.4
7,,Sources:[14] and see individual borough articles,,,,,


In [55]:
i= population[((population .County == 'Sources:[14] and see individual borough articles'))].index
population.drop(i)

Unnamed: 0,Borough,County,Estimate_2018,square_mi,square_km,persons_sq_mi,persons_sq_km
0,The Bronx,Bronx,1432132,29200.0,42.1,109.04,34653.0
1,Brooklyn,Kings,2582830,34600.0,70.82,183.42,37137.0
2,Manhattan,New York,1628701,360900.0,22.83,59.13,72033.0
3,Queens,Queens,2278906,39600.0,108.53,281.09,21460.0
4,Staten Island,Richmond,476179,30300.0,58.37,151.18,8112.0
5,,City of New York,8398748,842.343,302.64,783.83,28188.0
6,,State of New York,19745289,1701.399,47214.0,122284.0,416.4


In [58]:
pop=population.drop(population.index[7])

In [59]:
pop

Unnamed: 0,Borough,County,Estimate_2018,square_mi,square_km,persons_sq_mi,persons_sq_km
0,The Bronx,Bronx,1432132,29200.0,42.1,109.04,34653.0
1,Brooklyn,Kings,2582830,34600.0,70.82,183.42,37137.0
2,Manhattan,New York,1628701,360900.0,22.83,59.13,72033.0
3,Queens,Queens,2278906,39600.0,108.53,281.09,21460.0
4,Staten Island,Richmond,476179,30300.0,58.37,151.18,8112.0
5,,City of New York,8398748,842.343,302.64,783.83,28188.0
6,,State of New York,19745289,1701.399,47214.0,122284.0,416.4


#### save data

In [60]:
pop.to_csv('Battle_of_Neighborhood_pop.csv',index=False)

In [35]:
# The code was removed by Watson Studio for sharing.

Unnamed: 0,Borough,Latitude,Longitude
0,The Bronx,40.8448,-73.8648
1,Brooklyn,40.6782,-73.9442
2,Manhattan,40.7831,-73.9712
3,Queens,40.7282,-73.7949
4,State Island,40.5795,-74.1502


In [61]:
pop

Unnamed: 0,Borough,County,Estimate_2018,square_mi,square_km,persons_sq_mi,persons_sq_km
0,The Bronx,Bronx,1432132,29200.0,42.1,109.04,34653.0
1,Brooklyn,Kings,2582830,34600.0,70.82,183.42,37137.0
2,Manhattan,New York,1628701,360900.0,22.83,59.13,72033.0
3,Queens,Queens,2278906,39600.0,108.53,281.09,21460.0
4,Staten Island,Richmond,476179,30300.0,58.37,151.18,8112.0
5,,City of New York,8398748,842.343,302.64,783.83,28188.0
6,,State of New York,19745289,1701.399,47214.0,122284.0,416.4


In [62]:
new_data=pd.merge(df_data_1,pop, how='right', on='Borough')
new_data

Unnamed: 0,Borough,County,Estimate_2018,square_mi,square_km,persons_sq_mi,persons_sq_km,Latitude,Longitude
0,The Bronx,Bronx,1432132.0,29200.0,42.1,109.04,34653.0,40.8448,-73.8648
1,Brooklyn,Kings,2582830.0,34600.0,70.82,183.42,37137.0,40.6782,-73.9442
2,Manhattan,New York,1628701.0,360900.0,22.83,59.13,72033.0,40.7831,-73.9712
3,Queens,Queens,2278906.0,39600.0,108.53,281.09,21460.0,40.7282,-73.7949
4,State Island,,,,,,,40.5795,-74.1502


In [64]:
new_data.shape

(5, 9)

In [65]:
from sklearn.cluster import KMeans
!pip install folium
import folium



### Mapping New York

In [66]:
latitude=40.730610
longitude= -73.935242

In [68]:
# create map of NEW YORK using latitude and longitude values above:
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(new_data['Latitude'], new_data['Longitude'], new_data['Borough']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork

### Explore Foursquare api

In [69]:
# The code was removed by Watson Studio for sharing.

My Credential


In [70]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 2000 # define radius

In [71]:

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['District', 
                  'District Latitude', 
                  'District Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category' ]
    
    return(nearby_venues)

In [72]:
NY_venues = getNearbyVenues(names=new_data['Borough'],
                                   latitudes=new_data['Latitude'],
                                   longitudes=new_data['Longitude']
                                    )

The Bronx
Brooklyn
Manhattan
Queens
State Island


In [73]:
print(NY_venues.shape)
NY_venues.head(15)

(206, 7)


Unnamed: 0,District,District Latitude,District Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,The Bronx,40.8448,-73.8648,Conti's Pastry Shoppe,40.845906,-73.862836,Coffee Shop
1,The Bronx,40.8448,-73.8648,New Morris Deli,40.846529,-73.863874,Deli / Bodega
2,The Bronx,40.8448,-73.8648,Primavera Pizzeria & Restaurant,40.845761,-73.863848,Pizza Place
3,The Bronx,40.8448,-73.8648,Morris Park Pizza,40.844962,-73.867606,Pizza Place
4,The Bronx,40.8448,-73.8648,Arth Aljanathain,40.847338,-73.866632,Middle Eastern Restaurant
5,The Bronx,40.8448,-73.8648,F & J Pine Tavern,40.848766,-73.862242,Italian Restaurant
6,The Bronx,40.8448,-73.8648,Anthony's Pizza,40.845269,-73.8661,Pizza Place
7,The Bronx,40.8448,-73.8648,Dunkin',40.845476,-73.865949,Donut Shop
8,The Bronx,40.8448,-73.8648,900 Park Restaurant,40.84669,-73.85946,Italian Restaurant
9,The Bronx,40.8448,-73.8648,Istanbul Cafe,40.847595,-73.867405,Hookah Bar


In [75]:
NY_venues.groupby('District').count()

Unnamed: 0_level_0,District Latitude,District Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
District,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Brooklyn,48,48,48,48,48,48
Manhattan,95,95,95,95,95,95
Queens,37,37,37,37,37,37
State Island,4,4,4,4,4,4
The Bronx,22,22,22,22,22,22


In [76]:
print('The number of unique categories is {}.'.format(len(NY_venues['Venue Category'].unique())))

The number of unique categories is 95.


In [78]:
# one hot encoding
NY_onehot = pd.get_dummies(NY_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
NY_onehot['District'] = NY_venues['District'] 

# move district column to the first column
cols=list(NY_onehot.columns.values)
cols.pop(cols.index('District'))
NY_onehot=NY_onehot[['District']+cols]

# rename Neighborhood for Districts so that future merge works
NY_onehot.rename(columns = {'District': 'District'}, inplace = True)
NY_onehot.head(15)

Unnamed: 0,District,American Restaurant,Asian Restaurant,Bakery,Bank,Bar,Baseball Field,Basketball Court,Bookstore,Bowling Alley,Breakfast Spot,Bridge,Burger Joint,Bus Station,Café,Caribbean Restaurant,Castle,Chinese Restaurant,Clothing Store,Coffee Shop,College Basketball Court,Convenience Store,Creperie,Deli / Bodega,Department Store,Dessert Shop,Diner,Dog Run,Donut Shop,Dumpling Restaurant,Exhibit,Farmers Market,Fast Food Restaurant,Flower Shop,Food Truck,Fried Chicken Joint,Garden,Gas Station,Gift Shop,Golf Course,Grocery Store,Gym,Historic Site,History Museum,Hookah Bar,Hotel,Ice Cream Shop,Indian Restaurant,Israeli Restaurant,Italian Restaurant,Japanese Restaurant,Kids Store,Korean Restaurant,Liquor Store,Locksmith,Market,Mattress Store,Mediterranean Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Mobile Phone Shop,Movie Theater,Museum,Music Venue,Nightclub,Outdoor Sculpture,Park,Pizza Place,Planetarium,Playground,Plaza,Public Art,Restaurant,Sandwich Place,Scenic Lookout,Science Museum,Sculpture Garden,Seafood Restaurant,Shipping Store,Southern / Soul Food Restaurant,Souvenir Shop,Spa,Spanish Restaurant,Speakeasy,Sports Bar,Supermarket,Sushi Restaurant,Taco Place,Theater,Trail,Vegetarian / Vegan Restaurant,Video Store,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,The Bronx,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,The Bronx,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,The Bronx,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,The Bronx,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,The Bronx,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,The Bronx,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,The Bronx,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,The Bronx,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,The Bronx,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,The Bronx,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [79]:
NY_onehot.shape

(206, 96)

In [80]:
NY_grouped = NY_onehot.groupby('District').mean().reset_index()
NY_grouped

Unnamed: 0,District,American Restaurant,Asian Restaurant,Bakery,Bank,Bar,Baseball Field,Basketball Court,Bookstore,Bowling Alley,Breakfast Spot,Bridge,Burger Joint,Bus Station,Café,Caribbean Restaurant,Castle,Chinese Restaurant,Clothing Store,Coffee Shop,College Basketball Court,Convenience Store,Creperie,Deli / Bodega,Department Store,Dessert Shop,Diner,Dog Run,Donut Shop,Dumpling Restaurant,Exhibit,Farmers Market,Fast Food Restaurant,Flower Shop,Food Truck,Fried Chicken Joint,Garden,Gas Station,Gift Shop,Golf Course,Grocery Store,Gym,Historic Site,History Museum,Hookah Bar,Hotel,Ice Cream Shop,Indian Restaurant,Israeli Restaurant,Italian Restaurant,Japanese Restaurant,Kids Store,Korean Restaurant,Liquor Store,Locksmith,Market,Mattress Store,Mediterranean Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Mobile Phone Shop,Movie Theater,Museum,Music Venue,Nightclub,Outdoor Sculpture,Park,Pizza Place,Planetarium,Playground,Plaza,Public Art,Restaurant,Sandwich Place,Scenic Lookout,Science Museum,Sculpture Garden,Seafood Restaurant,Shipping Store,Southern / Soul Food Restaurant,Souvenir Shop,Spa,Spanish Restaurant,Speakeasy,Sports Bar,Supermarket,Sushi Restaurant,Taco Place,Theater,Trail,Vegetarian / Vegan Restaurant,Video Store,Wine Bar,Wine Shop,Women's Store,Yoga Studio
0,Brooklyn,0.020833,0.0,0.020833,0.041667,0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.020833,0.0,0.041667,0.041667,0.0,0.041667,0.0,0.083333,0.0,0.041667,0.0,0.020833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020833,0.0,0.0,0.020833,0.0,0.020833,0.0,0.0,0.041667,0.020833,0.0,0.0,0.0,0.0,0.0,0.020833,0.0,0.0,0.020833,0.0,0.0,0.020833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020833,0.0,0.020833,0.0,0.0,0.020833,0.0,0.041667,0.020833,0.0,0.020833,0.0625,0.0,0.0,0.0,0.041667,0.0,0.0625,0.0,0.0,0.0,0.020833,0.0,0.020833,0.0,0.020833,0.0,0.0,0.0,0.020833,0.0,0.0,0.0,0.0
1,Manhattan,0.010526,0.0,0.0,0.0,0.010526,0.021053,0.0,0.010526,0.0,0.010526,0.010526,0.0,0.0,0.010526,0.0,0.010526,0.0,0.010526,0.031579,0.0,0.0,0.010526,0.0,0.0,0.010526,0.0,0.021053,0.0,0.0,0.157895,0.010526,0.0,0.010526,0.010526,0.0,0.031579,0.0,0.010526,0.0,0.010526,0.010526,0.010526,0.063158,0.0,0.010526,0.0,0.010526,0.010526,0.021053,0.0,0.010526,0.0,0.0,0.0,0.0,0.0,0.010526,0.010526,0.0,0.0,0.010526,0.010526,0.010526,0.0,0.010526,0.073684,0.010526,0.031579,0.042105,0.010526,0.010526,0.010526,0.0,0.010526,0.010526,0.010526,0.0,0.0,0.0,0.010526,0.0,0.0,0.0,0.010526,0.0,0.021053,0.0,0.031579,0.010526,0.010526,0.0,0.021053,0.010526,0.010526,0.010526
2,Queens,0.0,0.027027,0.027027,0.027027,0.0,0.0,0.027027,0.027027,0.0,0.0,0.0,0.0,0.081081,0.0,0.0,0.0,0.0,0.0,0.027027,0.027027,0.0,0.027027,0.0,0.027027,0.0,0.0,0.0,0.0,0.027027,0.0,0.0,0.0,0.0,0.027027,0.0,0.0,0.0,0.0,0.0,0.0,0.027027,0.0,0.0,0.0,0.0,0.027027,0.0,0.0,0.027027,0.054054,0.0,0.027027,0.0,0.027027,0.027027,0.027027,0.0,0.027027,0.027027,0.027027,0.0,0.0,0.0,0.0,0.0,0.027027,0.108108,0.0,0.027027,0.0,0.0,0.027027,0.054054,0.0,0.0,0.0,0.0,0.027027,0.0,0.0,0.027027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,State Island,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0
4,The Bronx,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.136364,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.0,0.0,0.181818,0.0,0.0,0.045455,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181818,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [81]:
NY_grouped.shape


(5, 96)

In [82]:
num_top_venues = 5

for hood in NY_grouped['District']:
    print("----"+hood+"----")
    temp = NY_grouped[NY_grouped['District'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Brooklyn----
                             venue  freq
0                      Coffee Shop  0.08
1                   Sandwich Place  0.06
2  Southern / Soul Food Restaurant  0.06
3               Seafood Restaurant  0.04
4             Caribbean Restaurant  0.04


----Manhattan----
            venue  freq
0         Exhibit  0.16
1            Park  0.07
2  History Museum  0.06
3      Playground  0.04
4          Garden  0.03


----Queens----
                 venue  freq
0          Pizza Place  0.11
1          Bus Station  0.08
2       Sandwich Place  0.05
3  Japanese Restaurant  0.05
4   Mexican Restaurant  0.03


----State Island----
           venue  freq
0    Golf Course  0.50
1          Trail  0.25
2  Bowling Alley  0.25
3  Movie Theater  0.00
4          Plaza  0.00


----The Bronx----
                venue  freq
0       Deli / Bodega  0.18
1         Pizza Place  0.18
2         Bus Station  0.14
3  Italian Restaurant  0.09
4               Diner  0.05




In [83]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)

    return row_categories_sorted.index.values[0:num_top_venues]

In [84]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['District']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
districts_venues_sorted = pd.DataFrame(columns=columns)
districts_venues_sorted['District'] = NY_grouped['District']

for ind in np.arange(NY_grouped.shape[0]):
    districts_venues_sorted.iloc[ind, 1:] = return_most_common_venues(NY_grouped.iloc[ind, :], num_top_venues)

districts_venues_sorted

Unnamed: 0,District,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Brooklyn,Coffee Shop,Sandwich Place,Southern / Soul Food Restaurant,Convenience Store,Caribbean Restaurant,Café,Playground,Seafood Restaurant,Chinese Restaurant,Grocery Store
1,Manhattan,Exhibit,Park,History Museum,Playground,Theater,Planetarium,Coffee Shop,Garden,Sushi Restaurant,Italian Restaurant
2,Queens,Pizza Place,Bus Station,Sandwich Place,Japanese Restaurant,Market,Department Store,Locksmith,Park,Dumpling Restaurant,Mexican Restaurant
3,State Island,Golf Course,Trail,Bowling Alley,Yoga Studio,Fried Chicken Joint,Diner,Dog Run,Donut Shop,Dumpling Restaurant,Exhibit
4,The Bronx,Deli / Bodega,Pizza Place,Bus Station,Italian Restaurant,Middle Eastern Restaurant,Spanish Restaurant,Supermarket,Coffee Shop,Donut Shop,Diner


In [85]:
NY_merged = new_data

# add clustering labels
Ny_merged['Cluster Labels'] = kmeans.labels_

# merge NY_grouped with NY_data to add latitude/longitude for each neighborhood
NY_merged = NY_merged.join(districts_venues_sorted.set_index('District'), on='District')
NY_merged.head(11) # check the last columns!


NameError: name 'kmeans' is not defined