Coursera: Data Scientist specialist course
Final Assignment Capstone

# Finding the right neighborhood in a new city

### PART1: importing data

In [1]:
import numpy as np # library to handle data in a vectorized manner
import pandas as pd # library for data analsysis
import json # library to handle JSON files
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [2]:
#Download NewYork neighborhood data
!wget -q -O 'newyork_data.json' https://cocl.us/new_york_dataset
print('Data downloaded!')

Data downloaded!


In [3]:
#Load data
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [4]:
#extract relevant feature section from data
neighborhoods_data = newyork_data['features']

In [5]:
# define the dataframe columns
column_names = ['Neighborhood', 'Latitude', 'Longitude'] 

# retrieve relevant columns from data
NY_neigh = pd.DataFrame(columns=column_names)
NY_neigh.head()

Unnamed: 0,Neighborhood,Latitude,Longitude


In [6]:
# Populate table
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    NY_neigh = NY_neigh.append({'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)
NY_neigh.head()

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Wakefield,40.894705,-73.847201
1,Co-op City,40.874294,-73.829939
2,Eastchester,40.887556,-73.827806
3,Fieldston,40.895437,-73.905643
4,Riverdale,40.890834,-73.912585


## PART2: Retrieve venues for each neighborhood

In [7]:
# define Foursquare credentials
CLIENT_ID = 'LIBG3H520BZBJDM2PWAWAL0VOP3GM2HMBS14F0STFF0MSCC1' # your Foursquare ID
CLIENT_SECRET = 'VTMYNF3H5F1PLEQ4F15SBKR3TKFLYTGAZEHGCCATB22TI231' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [8]:
def getNearbyVenues(names, latitudes, longitudes, radius=2000):
    LIMIT=100
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [9]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']
    
# type your answer here
venues = getNearbyVenues(names=NY_neigh['Neighborhood'],
                                   latitudes=NY_neigh['Latitude'],
                                   longitudes=NY_neigh['Longitude']
                                  )

print("Retrieved venues for eah neighborhood")

Retrieved venues for eah neighborhood


In [10]:
#Get venues for 'Lange Munte'
# coordinaten: 50°48'40"NB, 3°17'56"OL -> 50.811111, 3.298889
lange_munte=getNearbyVenues(['Lange Munte'], ['50.811111'], ['3.298889'], radius=2000)

#add lange munte to venues
venues=venues.append(lange_munte)

In [11]:
# Analyze each neighborhood
# one hot encoding
onehot = pd.get_dummies(venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
onehot['Neighborhood'] = venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [onehot.columns[-1]] + list(onehot.columns[:-1])
onehot = onehot[fixed_columns]

onehot.head()

Unnamed: 0,Zoo Exhibit,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Animal Shelter,...,Weight Loss Center,Whisky Bar,Windmill,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
#group reoccuring venues, sum reoccuring venues

ny_grouped = onehot.groupby('Neighborhood').sum().reset_index()

#tranform the table to binairy (one pool or 2 in the neighborhood should not make a difference)
ny_grouped.loc[:, ny_grouped.columns != 'Neighborhood']=ny_grouped.loc[:, ny_grouped.columns != 'Neighborhood'].clip(upper=1)
ny_grouped.head()

Unnamed: 0,Neighborhood,Zoo Exhibit,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Weight Loss Center,Whisky Bar,Windmill,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo
0,Allerton,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,Annadale,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,Arden Heights,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,Arlington,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,1,0,0
4,Arrochar,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


## PART3: Create a content based recommender

In [13]:
#Retrieve and 'Lange Munte' row and retrieve index of this row
input_rc=ny_grouped.loc[ny_grouped['Neighborhood'] == 'Lange Munte']
input_index=input_rc.index

# create content matrix
content_rc=ny_grouped
#drop 'Lange Munte' row from matrix
content_rc=content_rc.drop(input_index)
#save neighborhood names
content_names=content_rc['Neighborhood']

#drop neighborhood columns from input and content
content_rc=content_rc.drop(columns='Neighborhood')
input_rc=input_rc.drop(columns='Neighborhood')


#create dot product between input and content
#Multiply the genres by the weights and then take the weighted average
input_rc=input_rc.transpose()
print(input_rc.shape)
print(content_rc.shape)
recommendationTable_df = (content_rc.dot(input_rc))
recommendationTable_df['Neighborhood']=content_names
recommendationTable_df=recommendationTable_df.sort_values(by=149,ascending=False)
recommendationTable_df.head(5)


(439, 1)
(302, 439)


Unnamed: 0,149,Neighborhood
107,24,Fresh Meadows
157,24,Little Neck
81,23,East Village
265,23,Sunnyside
50,23,Chinatown


### Visualization

In [29]:
top5=recommendationTable_df.iloc[0:6,1]
top5=list(top5)
address = 'New York, NY'

# create map
map_clusters = folium.Map(location=[40.730610, -73.935242], zoom_start=11)

# add markers to the map
markers_colors = []
for lat, lon, poi in zip(NY_neigh['Latitude'], NY_neigh['Longitude'], NY_neigh['Neighborhood']):
    color='grey'
    if poi in top5:
        hit=1
        color='red'
        label = folium.Popup(poi+" has a high resemblence to the input neighborhood", parse_html=True)
    else:
        hit=0
        label = folium.Popup(poi, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=color,
        fill=True,
        fill_color=color,
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters