# Coursera Capstone Project

### Import Libraries

In [1]:
import numpy as np 
import pandas as pd 
import json 
import folium
import requests
import matplotlib.cm as cm
import matplotlib.colors as colors
from geopy.geocoders import Nominatim 
from pandas.io.json import json_normalize 
from sklearn.cluster import KMeans

### Get Manhattan Neighborhoods

In [2]:
# access new york city data from server
!wget -q -O 'newyork_data.json' https://cocl.us/new_york_dataset
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [3]:
# define neighborhood variable
neighborhoods_data = newyork_data['features']

# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)

# loop the data to fill the dataframe
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)
    
# get Manhattan data
manhattan_data = neighborhoods[neighborhoods['Borough'] == 'Manhattan'].reset_index(drop=True)
manhattan_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.91066
1,Manhattan,Chinatown,40.715618,-73.994279
2,Manhattan,Washington Heights,40.851903,-73.9369
3,Manhattan,Inwood,40.867684,-73.92121
4,Manhattan,Hamilton Heights,40.823604,-73.949688


### Explore Manhattan Venues with Foursquare

In [4]:
# Define Foursquare Credentials and Version
CLIENT_ID = 'TC0TWC1EU2U4TCOHLCXH5BSXMNYHV2O05PMSTBZLQF2EK2F0'
CLIENT_SECRET = '5UNL2A3UBA211U4AEB4GLCRNRL01W2EOJNPJGK4KWHHO2WUK'
VERSION = '20190601'
radius = 500
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: TC0TWC1EU2U4TCOHLCXH5BSXMNYHV2O05PMSTBZLQF2EK2F0
CLIENT_SECRET:5UNL2A3UBA211U4AEB4GLCRNRL01W2EOJNPJGK4KWHHO2WUK


In [5]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)


In [6]:
# run the above function on each neighborhood 
manhattan_venues = getNearbyVenues(names=manhattan_data['Neighborhood'],
                                   latitudes=manhattan_data['Latitude'],
                                   longitudes=manhattan_data['Longitude']
                                  )

# check the resulting dataframe
manhattan_venues.head()

Marble Hill
Chinatown
Washington Heights
Inwood
Hamilton Heights
Manhattanville
Central Harlem
East Harlem
Upper East Side
Yorkville
Lenox Hill
Roosevelt Island
Upper West Side
Lincoln Square
Clinton
Midtown
Murray Hill
Chelsea
Greenwich Village
East Village
Lower East Side
Tribeca
Little Italy
Soho
West Village
Manhattan Valley
Morningside Heights
Gramercy
Battery Park City
Financial District
Carnegie Hill
Noho
Civic Center
Midtown South
Sutton Place
Turtle Bay
Tudor City
Stuyvesant Town
Flatiron
Hudson Yards


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Marble Hill,40.876551,-73.91066,Arturo's,40.874412,-73.910271,Pizza Place
1,Marble Hill,40.876551,-73.91066,Bikram Yoga,40.876844,-73.906204,Yoga Studio
2,Marble Hill,40.876551,-73.91066,Tibbett Diner,40.880404,-73.908937,Diner
3,Marble Hill,40.876551,-73.91066,Starbucks,40.877531,-73.905582,Coffee Shop
4,Marble Hill,40.876551,-73.91066,Dunkin',40.877136,-73.906666,Donut Shop


### Explore Manhattan Apparel Stores

In [7]:
# get all unique categories
manhattan_venues['Venue Category'].unique()

array(['Pizza Place', 'Yoga Studio', 'Diner', 'Coffee Shop', 'Donut Shop',
       'Gym', 'Tennis Stadium', 'Seafood Restaurant', 'Department Store',
       'Pharmacy', 'Supplement Shop', 'Video Game Store', 'Steakhouse',
       'Discount Store', 'Bank', 'Ice Cream Shop', 'American Restaurant',
       'Bakery', 'Sandwich Place', 'Kids Store', 'Clothing Store',
       'Deli / Bodega', 'Spa', 'Greek Restaurant',
       'General Entertainment', 'Hotel', 'Cocktail Bar',
       'Chinese Restaurant', 'Museum', 'English Restaurant', 'Bike Shop',
       'Garden Center', 'Noodle House', 'Salon / Barbershop', 'Roof Deck',
       'Tea Room', 'New American Restaurant', 'Indie Movie Theater',
       'Korean Restaurant', 'Sake Bar', 'Bubble Tea Shop',
       'Spanish Restaurant', 'Hotpot Restaurant', 'Restaurant', 'Café',
       'Italian Restaurant', 'Burger Joint', 'Market', 'Ramen Restaurant',
       'Park', 'Breakfast Spot', 'Liquor Store', 'Frozen Yogurt Shop',
       'Wine Shop', 'Tapas Restaura

##### Per the above, we know that categories related to apparels include Department Store, Clothing Store, Women's Store, Men's Store, Kids Store, Lingerie Store, Accessories Store, Shoe Store, Boutique. Please note that the same list can also be obtained from Foursquare.com(https://developer.foursquare.com/docs/resources/categories).

In [9]:
category = ["Department Store", "Clothing Store",  "Women's Store", "Men's Store", "Kids Store", 
            "Lingerie Store", "Accessories Store", "Shoe Store", "Boutique"]
manhattan_clothing = manhattan_venues[manhattan_venues['Venue Category'].isin(category)]
manhattan_clothing.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
8,Marble Hill,40.876551,-73.91066,T.J. Maxx,40.877232,-73.905042,Department Store
21,Marble Hill,40.876551,-73.91066,The Children's Place,40.873672,-73.908156,Kids Store
22,Marble Hill,40.876551,-73.91066,Forever 21,40.87747,-73.90594,Clothing Store
201,Central Harlem,40.815976,-73.943211,Vault,40.815543,-73.947303,Boutique
219,East Harlem,40.792249,-73.944182,Goliath RF,40.791119,-73.945532,Clothing Store


In [10]:
manhattan_clothing.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Battery Park City,1,1,1,1,1,1
Carnegie Hill,1,1,1,1,1,1
Central Harlem,1,1,1,1,1,1
East Harlem,1,1,1,1,1,1
Financial District,2,2,2,2,2,2
Flatiron,1,1,1,1,1,1
Greenwich Village,3,3,3,3,3,3
Hudson Yards,1,1,1,1,1,1
Lenox Hill,2,2,2,2,2,2
Little Italy,3,3,3,3,3,3


##### Per the above, we know that Soho is the most popular neighborhood with the most number of apparel stores. As such, we will recommend our client to locate their new business in Soho.

### Analyze Each Neighborhood

In [11]:
# one hot encoding
manhattan_onehot = pd.get_dummies(manhattan_clothing[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
manhattan_onehot['Neighborhood'] = manhattan_clothing['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [manhattan_onehot.columns[-1]] + list(manhattan_onehot.columns[:-1])
manhattan_onehot = manhattan_onehot[fixed_columns]

# quickly examine results
manhattan_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Boutique,Clothing Store,Department Store,Kids Store,Lingerie Store,Men's Store,Shoe Store,Women's Store
8,Marble Hill,0,0,0,1,0,0,0,0,0
21,Marble Hill,0,0,0,0,1,0,0,0,0
22,Marble Hill,0,0,1,0,0,0,0,0,0
201,Central Harlem,0,1,0,0,0,0,0,0,0
219,East Harlem,0,0,1,0,0,0,0,0,0


In [12]:
# group rows by neighborhood and by taking the mean of the frequency of occurrence of each category
manhattan_grouped = manhattan_onehot.groupby('Neighborhood').mean().reset_index()
manhattan_grouped

Unnamed: 0,Neighborhood,Accessories Store,Boutique,Clothing Store,Department Store,Kids Store,Lingerie Store,Men's Store,Shoe Store,Women's Store
0,Battery Park City,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,Carnegie Hill,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,Central Harlem,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,East Harlem,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Financial District,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0
5,Flatiron,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
6,Greenwich Village,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Hudson Yards,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
8,Lenox Hill,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5
9,Little Italy,0.0,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.333333


In [22]:
# create the new dataframe and display the top 3 venues for each neighborhood
num_top_venues = 3

for hood in manhattan_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = manhattan_grouped[manhattan_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Battery Park City----
               venue  freq
0   Department Store   1.0
1  Accessories Store   0.0
2           Boutique   0.0


----Carnegie Hill----
               venue  freq
0         Shoe Store   1.0
1  Accessories Store   0.0
2           Boutique   0.0


----Central Harlem----
               venue  freq
0           Boutique   1.0
1  Accessories Store   0.0
2     Clothing Store   0.0


----East Harlem----
               venue  freq
0     Clothing Store   1.0
1  Accessories Store   0.0
2           Boutique   0.0


----Financial District----
               venue  freq
0  Accessories Store   0.5
1         Shoe Store   0.5
2           Boutique   0.0


----Flatiron----
               venue  freq
0      Women's Store   1.0
1  Accessories Store   0.0
2           Boutique   0.0


----Greenwich Village----
               venue  freq
0     Clothing Store   1.0
1  Accessories Store   0.0
2           Boutique   0.0


----Hudson Yards----
               venue  freq
0   Department Store 

In [23]:
# put the above into a pandas dataframe
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [24]:
# create the new dataframe and display the top 3 venues for each neighborhood
num_top_venues = 3

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = manhattan_grouped['Neighborhood']

for ind in np.arange(manhattan_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(manhattan_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head(20)

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue
0,Battery Park City,Department Store,Women's Store,Shoe Store
1,Carnegie Hill,Shoe Store,Women's Store,Men's Store
2,Central Harlem,Boutique,Women's Store,Shoe Store
3,East Harlem,Clothing Store,Women's Store,Shoe Store
4,Financial District,Shoe Store,Accessories Store,Women's Store
5,Flatiron,Women's Store,Shoe Store,Men's Store
6,Greenwich Village,Clothing Store,Women's Store,Shoe Store
7,Hudson Yards,Department Store,Women's Store,Shoe Store
8,Lenox Hill,Women's Store,Lingerie Store,Shoe Store
9,Little Italy,Clothing Store,Women's Store,Shoe Store
