# Nearest Neighborhood

### Required libraries

In [1]:
# Libraries & Imports
import pandas as pd
import json # library to handle JSON files
import requests # library to handle requests
import time

# import k-means from clustering stage
from sklearn.cluster import KMeans

### Read London neighborhood data from cache

In [2]:
# Read London neighborhood file and turn it into a DataFrame
# London csv file previously created and cached by notebook GetLondonData.ipynb
dfLondon = pd.read_csv('/resources/data/CentralLondon.csv')
dfLondon.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Camden,Belsize,51.547393,-0.168301
1,Camden,Bloomsbury,51.516845,-0.125741
2,Camden,Camden Town,51.541825,-0.139128
3,Camden,Cantelowes,51.546916,-0.133241
4,Camden,Fortune Green,51.554596,-0.197622


### Read New York neighborhood data from cache

In [3]:
# New York neighborhood data previously supplied in course lab
!wget -q -O 'newyork_data.json' https://ibm.box.com/shared/static/fbpwbovar7lf8p5sgddm06cgipa2rxpe.json

with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)  # loads as a dict
    # define the dataframe columns
    column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']

    # instantiate the dataframe
    dfNewYork = pd.DataFrame(columns=column_names)

### Parse the neighborhood data from the json file and add to New York DataFrame

In [4]:
neighborhoods_data = newyork_data['features']  # gives a list

for data in neighborhoods_data:
    borough = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    dfNewYork = dfNewYork.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)
    
dfNewYork.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


### Create a combined (London & NY) DataFrame

In [5]:
# Add city identifier column
# Added the check if columnn exists so that the cell won't throw an error if run more than once
if 'City' not in dfNewYork.columns:
    dfNewYork.insert(0, 'City', 'New York')
    
if 'City' not in dfLondon.columns:
    dfLondon.insert(0, 'City', 'London')

# Contatenate the 2 cities into one DataFrame
dfAll = pd.concat([dfNewYork, dfLondon])

# Check that the new DataFrame contains both cities
dfAll['City'].unique()

array(['New York', 'London'], dtype=object)

## Get venue data for each neighborhood

### Use FourSquare api with credentials

In [6]:
CLIENT_ID = 'DTICD0OUQRFBTN0RFEUYZ2F2AKCAVFNKKWLQFW04RPZOHBWP' # Foursquare ID
CLIENT_SECRET = 'POFBCSBAPCSFFGAYN2M1HDB4BJ033Q0OCZOBFAKL2BMRLFP1' # Foursquare Secret
VERSION = '20180605' # Foursquare API version

### Define functions for obtaining and extracting venue data from FourSquare

In [7]:
### Get neighborhood venue data
def getNearbyVenues(cities, names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for city, name, lat, lng in zip(cities, names, latitudes, longitudes):
        time.sleep(1)
        # print(city, name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            100)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            city,
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['City',
                  'Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Use FourSquare to get venues

In [9]:
all_venues = getNearbyVenues(cities=dfAll['City'],
                                   names=dfAll['Neighborhood'],
                                   latitudes=dfAll['Latitude'],
                                   longitudes=dfAll['Longitude']
                                  )

print(all_venues.shape)
all_venues.head()

(15953, 8)


Unnamed: 0,City,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,New York,Wakefield,40.894705,-73.847201,Lollipops Gelato,40.894123,-73.845892,Dessert Shop
1,New York,Wakefield,40.894705,-73.847201,Rite Aid,40.896521,-73.84468,Pharmacy
2,New York,Wakefield,40.894705,-73.847201,Cooler Runnings Jamaican Restaurant Inc,40.898276,-73.850381,Caribbean Restaurant
3,New York,Wakefield,40.894705,-73.847201,Carvel Ice Cream,40.890487,-73.848568,Ice Cream Shop
4,New York,Wakefield,40.894705,-73.847201,Dunkin Donuts,40.890631,-73.849027,Donut Shop


### Save to cache

In [10]:
# Cache the all_venues DataFrame
all_venues.to_csv('/resources/data/AllVenues.csv', columns=['City','Neighborhood','Neighborhood Latitude','Neighborhood Longitude','Venue','Venue Latitude','Venue Longitude','Venue Category'], index=False)

### Retrieve from cache

In [11]:
# This cell is useful to start from if you don't want to download data from scratch
all_venues = pd.read_csv('/resources/data/AllVenues.csv')
all_venues.head()

Unnamed: 0,City,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,New York,Wakefield,40.894705,-73.847201,Lollipops Gelato,40.894123,-73.845892,Dessert Shop
1,New York,Wakefield,40.894705,-73.847201,Rite Aid,40.896521,-73.84468,Pharmacy
2,New York,Wakefield,40.894705,-73.847201,Cooler Runnings Jamaican Restaurant Inc,40.898276,-73.850381,Caribbean Restaurant
3,New York,Wakefield,40.894705,-73.847201,Carvel Ice Cream,40.890487,-73.848568,Ice Cream Shop
4,New York,Wakefield,40.894705,-73.847201,Dunkin Donuts,40.890631,-73.849027,Donut Shop


### Convert all restaurant types e.g. 'Seafood Restauarant' to 'Restaurant'

In [12]:
### Categorise all restaurant types as 'Restaurant'
def convertRestaurant(category):
    if "Restaurant" in category:
        return 'Restaurant'
    else:
        return category

In [13]:
for index, row in all_venues.iterrows():
    all_venues.at[index,'Venue Category'] = convertRestaurant(row['Venue Category'])

In [14]:
# note that record 3 changes from 'Caribbean Restaurant' to 'Restaurant'
all_venues.head()

Unnamed: 0,City,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,New York,Wakefield,40.894705,-73.847201,Lollipops Gelato,40.894123,-73.845892,Dessert Shop
1,New York,Wakefield,40.894705,-73.847201,Rite Aid,40.896521,-73.84468,Pharmacy
2,New York,Wakefield,40.894705,-73.847201,Cooler Runnings Jamaican Restaurant Inc,40.898276,-73.850381,Restaurant
3,New York,Wakefield,40.894705,-73.847201,Carvel Ice Cream,40.890487,-73.848568,Ice Cream Shop
4,New York,Wakefield,40.894705,-73.847201,Dunkin Donuts,40.890631,-73.849027,Donut Shop


### Apply one-hot encoding to the categories

In [15]:
# one hot encoding
all_onehot = pd.get_dummies(all_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
all_onehot['Neighborhood'] = all_venues['Neighborhood'] 

# move Neighborhood column to the first column
fixed_columns = [all_onehot.columns[-1]] + list(all_onehot.columns[:-1])
all_onehot = all_onehot[fixed_columns]

# add City column back to dataframe
all_onehot['City'] = all_venues['City'] 

# move neighborhood column to the first column
fixed_columns = [all_onehot.columns[-1]] + list(all_onehot.columns[:-1])
all_onehot = all_onehot[fixed_columns]

all_onehot.head()

Unnamed: 0,City,Zoo Exhibit,Accessories Store,Adult Boutique,Airport Terminal,Airport Tram,Animal Shelter,Antique Shop,Arcade,Art Gallery,...,Watch Shop,Waterfront,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio
0,New York,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,New York,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,New York,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,New York,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,New York,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Calculate the mean number of each category type

In [16]:
all_grouped = all_onehot.groupby(['City','Neighborhood']).mean().reset_index()
all_grouped.head()

Unnamed: 0,City,Neighborhood,Zoo Exhibit,Accessories Store,Adult Boutique,Airport Terminal,Airport Tram,Animal Shelter,Antique Shop,Arcade,...,Watch Shop,Waterfront,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio
0,London,Abbey Wood,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,London,Aldersgate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.012346,0.0,0.0,0.0,0.0,0.0
2,London,Balham,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016393
3,London,Barnsbury,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.023256
4,London,Bellingham,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Cluster neighborhoods

In [17]:
# set number of clusters
kclusters = 10

all_grouped_clustering = all_grouped.drop(['City','Neighborhood'], 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(all_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

array([8, 0, 0, 1, 8, 0, 5, 0, 1, 0], dtype=int32)

In [18]:
all_merged = all_grouped[['City','Neighborhood']].copy()

# add clustering labels
all_merged['Cluster Labels'] = kmeans.labels_

all_merged

Unnamed: 0,City,Neighborhood,Cluster Labels
0,London,Abbey Wood,8
1,London,Aldersgate,0
2,London,Balham,0
3,London,Barnsbury,1
4,London,Bellingham,8
5,London,Belsize,0
6,London,Bethnal Green,5
7,London,Bishopsgate,0
8,London,Blackheath,1
9,London,Blackheath Westcombe,0


### Export results to a spreadsheet

In [19]:
# Save the clustered data to csv
all_merged.to_csv('/resources/data/NYLonClusters.csv', columns=['City','Neighborhood','Cluster Labels'], index=False)

## Results
Having clustered the neighborhoods, the way to find the nearest neighborhoods is as follows:
    1. Look up the New York neighborhood from the alphabetical list below and find its cluster number.
    2. Look at the London neighborhoods in the same cluster. Those are the 'nearest neighborhoods'.
    
#### For convenience, the exported data has been exported to csv and presented in spreadsheet form. See report for details.

### Alphabetical list of New York Neighborhoods with cluster number

In [20]:
all_merged.loc[all_merged['City'] == 'New York', ['Neighborhood','Cluster Labels']]

Unnamed: 0,Neighborhood,Cluster Labels
148,Allerton,4
149,Annadale,4
150,Arden Heights,8
151,Arlington,4
152,Arrochar,4
153,Arverne,8
154,Astoria,1
155,Astoria Heights,4
156,Auburndale,4
157,Bath Beach,1


### London neighborhoods in cluster 0

In [21]:
all_merged.loc[(all_merged['Cluster Labels'] == 0) & (all_merged['City'] == 'London'), ['City','Neighborhood']]

Unnamed: 0,City,Neighborhood
1,London,Aldersgate
2,London,Balham
5,London,Belsize
7,London,Bishopsgate
9,London,Blackheath Westcombe
10,London,Blackwall
11,London,Bloomsbury
15,London,Brockley
17,London,Bromley South
18,London,Brompton


### London neighborhoods in cluster 1

In [22]:
all_merged.loc[(all_merged['Cluster Labels'] == 1) & (all_merged['City'] == 'London'), ['City','Neighborhood']]

Unnamed: 0,City,Neighborhood
3,London,Barnsbury
8,London,Blackheath
14,London,Brixton Hill
72,London,Junction
78,London,Lansbury
100,London,Prince's
102,London,Queenhithe
104,London,Riverside
113,London,South Camberwell
118,London,St Katharine's


### London neighborhoods in cluster 2

In [23]:
all_merged.loc[(all_merged['Cluster Labels'] == 2) & (all_merged['City'] == 'London'), ['City','Neighborhood']]

Unnamed: 0,City,Neighborhood
40,London,Downham
79,London,Latchmere
128,London,Streatham Wells
133,London,Thamesmead Moorings


### London neighborhoods in cluster 3

In [24]:
all_merged.loc[(all_merged['Cluster Labels'] == 3) & (all_merged['City'] == 'London'), ['City','Neighborhood']]

Unnamed: 0,City,Neighborhood


### London neighborhoods in cluster 4

In [25]:
all_merged.loc[(all_merged['Cluster Labels'] == 4) & (all_merged['City'] == 'London'), ['City','Neighborhood']]

Unnamed: 0,City,Neighborhood
83,London,Livesey
108,London,Rushey Green
125,London,Stockwell
129,London,Surrey Docks
131,London,Sydenham
134,London,Thornton


### London neighborhoods in cluster 5

In [26]:
all_merged.loc[(all_merged['Cluster Labels'] == 5) & (all_merged['City'] == 'London'), ['City','Neighborhood']]

Unnamed: 0,City,Neighborhood
6,London,Bethnal Green
12,London,Bow East
13,London,Bow West
16,London,Bromley North
26,London,Canonbury
29,London,Charlton
30,London,Chaucer
32,London,Clapham Common
43,London,East Dulwich
47,London,Fairfield


### London neighborhoods in cluster 6

In [27]:
all_merged.loc[(all_merged['Cluster Labels'] == 6) & (all_merged['City'] == 'London'), ['City','Neighborhood']]

Unnamed: 0,City,Neighborhood
19,London,Brunswick Park
110,London,Shaftesbury
112,London,South Bermondsey
120,London,St Mary's
146,London,Woolwich Common


### London neighborhoods in cluster 7

In [28]:
all_merged.loc[(all_merged['Cluster Labels'] == 7) & (all_merged['City'] == 'London'), ['City','Neighborhood']]

Unnamed: 0,City,Neighborhood


### London neighborhoods in cluster 8

In [29]:
all_merged.loc[(all_merged['Cluster Labels'] == 8) & (all_merged['City'] == 'London'), ['City','Neighborhood']]

Unnamed: 0,City,Neighborhood
0,London,Abbey Wood
4,London,Bellingham
27,London,Cantelowes
28,London,Catford South
59,London,Graveney
74,London,Kidbrooke
97,London,Plumstead
141,London,Weavers


### London neighborhoods in cluster 9

In [30]:
all_merged.loc[(all_merged['Cluster Labels'] == 9) & (all_merged['City'] == 'London'), ['City','Neighborhood']]

Unnamed: 0,City,Neighborhood
