Before we get the data and start exploring it, let's import all the dependencies that we will need.


In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analysis
# Lift the display limits so dataframes are rendered with every column and every row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import seaborn as sns

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
# NOTE(review): newer pandas releases expose this function as `pd.json_normalize`;
# the `pandas.io.json` import path looks deprecated -- confirm against the installed version.
from pandas.io.json import json_normalize # transform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

# NOTE(review): this import runs after the confirmation message above, so the
# message is printed even if `wget` is not installed.
import wget

Libraries imported.


In [2]:
import folium.plugins as plugins

In [3]:
# Download the New York neighborhoods file with the wget shell command and save
# it in the working directory as 'newyork_data.json' (-q: quiet, -O: output file).
!wget -q -O 'newyork_data.json' https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DS0701EN-SkillsNetwork/labs/newyork_data.json
# The message below is printed unconditionally; it does not check that wget succeeded.
print('Data downloaded!')

Data downloaded!


In [4]:
# Parse the downloaded file into a plain Python dict.
# The encoding is passed explicitly so the file is decoded the same way on
# every platform instead of falling back to the locale's default encoding.
with open('newyork_data.json', encoding='utf-8') as json_data:
    newyork_data = json.load(json_data)

In [5]:
newyork_data

{'type': 'FeatureCollection',
 'totalFeatures': 306,
 'features': [{'type': 'Feature',
   'id': 'nyu_2451_34572.1',
   'geometry': {'type': 'Point',
    'coordinates': [-73.84720052054902, 40.89470517661]},
   'geometry_name': 'geom',
   'properties': {'name': 'Wakefield',
    'stacked': 1,
    'annoline1': 'Wakefield',
    'annoline2': None,
    'annoline3': None,
    'annoangle': 0.0,
    'borough': 'Bronx',
    'bbox': [-73.84720052054902,
     40.89470517661,
     -73.84720052054902,
     40.89470517661]}},
  {'type': 'Feature',
   'id': 'nyu_2451_34572.2',
   'geometry': {'type': 'Point',
    'coordinates': [-73.82993910812398, 40.87429419303012]},
   'geometry_name': 'geom',
   'properties': {'name': 'Co-op City',
    'stacked': 2,
    'annoline1': 'Co-op',
    'annoline2': 'City',
    'annoline3': None,
    'annoangle': 0.0,
    'borough': 'Bronx',
    'bbox': [-73.82993910812398,
     40.87429419303012,
     -73.82993910812398,
     40.87429419303012]}},
  {'type': 'Feature',
 

In [6]:
# Keep only the list stored under the 'features' key; each element is one
# 'Feature' dict carrying a point geometry and a 'properties' dict.
neighborhoods_data = newyork_data['features']

In [7]:
neighborhoods_data[0]

{'type': 'Feature',
 'id': 'nyu_2451_34572.1',
 'geometry': {'type': 'Point',
  'coordinates': [-73.84720052054902, 40.89470517661]},
 'geometry_name': 'geom',
 'properties': {'name': 'Wakefield',
  'stacked': 1,
  'annoline1': 'Wakefield',
  'annoline2': None,
  'annoline3': None,
  'annoangle': 0.0,
  'borough': 'Bronx',
  'bbox': [-73.84720052054902,
   40.89470517661,
   -73.84720052054902,
   40.89470517661]}}

In [8]:
# Column labels of the neighborhood table, in display order.
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']

# Start with an empty dataframe that already carries those labels.
neighborhoods = pd.DataFrame(data=None, columns=column_names)

In [9]:
neighborhoods

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude


In [10]:
# Collect one record per feature and build the dataframe in a single step.
# Growing the frame with DataFrame.append inside the loop is quadratic, is no
# longer available in pandas 2.x, and duplicated every row whenever the cell
# was run a second time; rebuilding from a list makes the cell idempotent.
neighborhood_records = []
for data in neighborhoods_data:
    properties = data['properties']

    # The coordinate pair is stored as [longitude, latitude].
    neighborhood_latlon = data['geometry']['coordinates']

    neighborhood_records.append({'Borough': properties['borough'],
                                 'Neighborhood': properties['name'],
                                 'Latitude': neighborhood_latlon[1],
                                 'Longitude': neighborhood_latlon[0]})

# `columns=` fixes the column order and keeps the labels even for an empty input.
neighborhoods = pd.DataFrame(neighborhood_records, columns=column_names)

In [11]:
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [12]:
# Report the number of distinct boroughs and the number of rows in the frame.
borough_count = len(neighborhoods['Borough'].unique())
neighborhood_count = len(neighborhoods)
print('The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, neighborhood_count))

The dataframe has 5 boroughs and 306 neighborhoods.


In [13]:
# Look up the coordinates of New York City; they are used to centre the map.
address = 'New York City, NY'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 40.7127281, -74.0060152.


In [14]:
# Load the zip-code table (zip, borough, post office, neighborhood, population,
# density) from a CSV file expected in the working directory.
csv_path = 'ZipcodeNWC.csv'
df_postal = pd.read_csv(filepath_or_buffer=csv_path)

In [15]:
df_postal

Unnamed: 0,zip,borough,post_office,neighborhood,population,density
0,10001,Manhattan,"New York, NY",Chelsea and Clinton,21102,33959
1,10002,Manhattan,"New York, NY",Lower East Side,81410,92573
2,10003,Manhattan,"New York, NY",Lower East Side,56024,97188
3,10004,Manhattan,"New York, NY",Lower Manhattan,3089,5519
4,10005,Manhattan,"New York, NY",Lower Manhattan,7135,97048
5,10006,Manhattan,"New York, NY",Lower Manhattan,3011,32796
6,10007,Manhattan,"New York, NY",Lower Manhattan,6988,42751
7,10009,Manhattan,"New York, NY",Lower East Side,61347,99492
8,10010,Manhattan,"New York, NY",Gramercy Park and Murray Hill,31834,81487
9,10011,Manhattan,"New York, NY",Chelsea and Clinton,50984,77436


In [16]:
# Capitalise the two labels so they match the column names used in `neighborhoods`.
postal_column_renames = {'borough': 'Borough', 'neighborhood': 'Neighborhood'}
df_postal = df_postal.rename(columns=postal_column_renames)

In [17]:
df_postal.head()

Unnamed: 0,zip,Borough,post_office,Neighborhood,population,density
0,10001,Manhattan,"New York, NY",Chelsea and Clinton,21102,33959
1,10002,Manhattan,"New York, NY",Lower East Side,81410,92573
2,10003,Manhattan,"New York, NY",Lower East Side,56024,97188
3,10004,Manhattan,"New York, NY",Lower Manhattan,3089,5519
4,10005,Manhattan,"New York, NY",Lower Manhattan,7135,97048


In [18]:
# Attach zip codes to the neighborhoods by matching on the neighborhood name
# (inner join: rows without a match on either side are dropped, and a
# neighborhood with several zip codes appears once per zip code).
# NOTE(review): the join key is the name only. `neighborhoods` holds 306 rows,
# while the recorded NYC.info() output shows 36 rows after this merge, so most
# neighborhood names have no counterpart in df_postal -- confirm this is intended.
postal_zip_lookup = df_postal[['Neighborhood', 'zip']]
NYC = neighborhoods.merge(postal_zip_lookup, how='inner', on='Neighborhood')
NYC

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,zip
0,Brooklyn,Sunset Park,40.645103,-74.010316,11220
1,Brooklyn,Sunset Park,40.645103,-74.010316,11232
2,Brooklyn,Greenpoint,40.730201,-73.954241,11211
3,Brooklyn,Greenpoint,40.730201,-73.954241,11222
4,Brooklyn,Flatbush,40.636326,-73.958401,11203
5,Brooklyn,Flatbush,40.636326,-73.958401,11210
6,Brooklyn,Flatbush,40.636326,-73.958401,11225
7,Brooklyn,Flatbush,40.636326,-73.958401,11226
8,Brooklyn,Borough Park,40.633131,-73.990498,11204
9,Brooklyn,Borough Park,40.633131,-73.990498,11218


In [19]:
# Use the same 'Zip Code' label as the gas-consumption table so the two can be joined on it.
NYC = NYC.rename({'zip': 'Zip Code'}, axis='columns')
NYC.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Zip Code
0,Brooklyn,Sunset Park,40.645103,-74.010316,11220
1,Brooklyn,Sunset Park,40.645103,-74.010316,11232
2,Brooklyn,Greenpoint,40.730201,-73.954241,11211
3,Brooklyn,Greenpoint,40.730201,-73.954241,11222
4,Brooklyn,Flatbush,40.636326,-73.958401,11203


In [20]:
# Load the 2010 natural-gas consumption table (one row per zip code and
# building type) from a CSV file expected in the working directory.
csv_gas = 'Natural_Gas_Consumption_by_ZIP_Code_-_2010.csv'
df_gas = pd.read_csv(filepath_or_buffer=csv_gas)

In [21]:
df_gas.head(5)

Unnamed: 0,Zip Code,Building type (service class,Consumption (therms),Consumption,Utility/Data Source
0,10300,Commercial,470,50,National Grid
1,10335,Commercial,647,68,National Grid
2,10360,Large residential,33762,3562,National Grid
3,11200,Commercial,32125,3389,National Grid
4,11200,Institutional,3605,380,National Grid


In [22]:
df_gas.columns

Index(['Zip Code', 'Building type (service class', ' Consumption (therms) ',
       'Consumption', 'Utility/Data Source'],
      dtype='object')

In [23]:
df_gas

Unnamed: 0,Zip Code,Building type (service class,Consumption (therms),Consumption,Utility/Data Source
0,10300,Commercial,470.0,50.0,National Grid
1,10335,Commercial,647.0,68.0,National Grid
2,10360,Large residential,33762.0,3562.0,National Grid
3,11200,Commercial,32125.0,3389.0,National Grid
4,11200,Institutional,3605.0,380.0,National Grid
5,11200,Small residential,3960.0,418.0,National Grid
6,11254,Small residential,1896.0,200.0,National Grid
7,11274,Commercial,8364.0,882.0,National Grid
8,11279,Commercial,2579.0,272.0,National Grid
9,11279,Large residential,301.0,32.0,National Grid


In [24]:
# Keep only the first five characters of every zip code.
# The column is cast to str first because the .str accessor only works on
# string values; without the cast this cell fails when it is re-run after the
# column has been converted to integers further down.
df_gas['Zip Code'] = df_gas['Zip Code'].astype(str).str[:5]

In [25]:
df_gas

Unnamed: 0,Zip Code,Building type (service class,Consumption (therms),Consumption,Utility/Data Source
0,10300,Commercial,470.0,50.0,National Grid
1,10335,Commercial,647.0,68.0,National Grid
2,10360,Large residential,33762.0,3562.0,National Grid
3,11200,Commercial,32125.0,3389.0,National Grid
4,11200,Institutional,3605.0,380.0,National Grid
5,11200,Small residential,3960.0,418.0,National Grid
6,11254,Small residential,1896.0,200.0,National Grid
7,11274,Commercial,8364.0,882.0,National Grid
8,11279,Commercial,2579.0,272.0,National Grid
9,11279,Large residential,301.0,32.0,National Grid


In [26]:
df_gas['Zip Code'].isnull().values.any()

False

In [27]:
df_gas['Consumption'].isnull().values.any()

True

In [28]:
# Convert the zip codes to integers so they compare equal to the integer
# 'Zip Code' column of NYC when the two frames are merged.
zip_code_text = df_gas['Zip Code'].astype(str)
df_gas['Zip Code'] = zip_code_text.astype(int)

In [29]:
NYC.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36 entries, 0 to 35
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Borough       36 non-null     object 
 1   Neighborhood  36 non-null     object 
 2   Latitude      36 non-null     float64
 3   Longitude     36 non-null     float64
 4   Zip Code      36 non-null     int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 1.7+ KB


In [30]:
df_gas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1015 entries, 0 to 1014
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Zip Code                      1015 non-null   int32 
 1   Building type (service class  1015 non-null   object
 2    Consumption (therms)         1005 non-null   object
 3   Consumption                   1005 non-null   object
 4   Utility/Data Source           1015 non-null   object
dtypes: int32(1), object(4)
memory usage: 35.8+ KB


In [31]:
# Copy 'Consumption' into a new column in which every missing value is
# replaced by a fixed fallback value.
# NOTE(review): 2474.55 is not derived in the visible cells -- presumably a
# precomputed average of the column; confirm and record where it comes from.
missing_consumption_fill = 2474.55
df_gas[' Consumption (GJ) '] = df_gas['Consumption'].replace(np.nan, missing_consumption_fill)

In [32]:
df_gas[' Consumption (GJ) '].isnull().values.any()

False

In [33]:
NYC.columns

Index(['Borough', 'Neighborhood', 'Latitude', 'Longitude', 'Zip Code'], dtype='object')

In [34]:
NYC.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36 entries, 0 to 35
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Borough       36 non-null     object 
 1   Neighborhood  36 non-null     object 
 2   Latitude      36 non-null     float64
 3   Longitude     36 non-null     float64
 4   Zip Code      36 non-null     int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 1.7+ KB


In [35]:
df_gas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1015 entries, 0 to 1014
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Zip Code                      1015 non-null   int32 
 1   Building type (service class  1015 non-null   object
 2    Consumption (therms)         1005 non-null   object
 3   Consumption                   1005 non-null   object
 4   Utility/Data Source           1015 non-null   object
 5    Consumption (GJ)             1015 non-null   object
dtypes: int32(1), object(5)
memory usage: 43.7+ KB


In [36]:
# Attach gas consumption to every (neighborhood, zip code) row. df_gas holds
# several rows per zip code, so each NYC row is repeated once per matching
# consumption record.
# NOTE(review): this joins the raw 'Consumption' column; the NaN-filled
# ' Consumption (GJ) ' column created earlier is not carried into NYC_gas --
# confirm which of the two is intended.
gas_by_zip = df_gas[['Zip Code', 'Consumption']]
NYC_gas = NYC.merge(gas_by_zip, on=['Zip Code'])

In [37]:
NYC_gas

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Zip Code,Consumption
0,Brooklyn,Sunset Park,40.645103,-74.010316,11220,712816.0
1,Brooklyn,Sunset Park,40.645103,-74.010316,11220,235802.0
2,Brooklyn,Sunset Park,40.645103,-74.010316,11220,528353.0
3,Brooklyn,Sunset Park,40.645103,-74.010316,11220,1134741.0
4,Brooklyn,Sunset Park,40.645103,-74.010316,11220,66027.0
5,Brooklyn,Sunset Park,40.645103,-74.010316,11232,346629.0
6,Brooklyn,Sunset Park,40.645103,-74.010316,11232,194919.0
7,Brooklyn,Sunset Park,40.645103,-74.010316,11232,276420.0
8,Brooklyn,Sunset Park,40.645103,-74.010316,11232,216143.0
9,Brooklyn,Sunset Park,40.645103,-74.010316,11232,1376698.0


#### Create a map of New York with neighborhoods superimposed on top.


In [40]:
# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Borough'], neighborhoods['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_newyork)  
    
map_newyork