## Week 5
### Code for the final project
### Opening a New Restaurant in Minneapolis, Minnesota, US

In [13]:
from pandas.io.json import json_normalize
from geopy.geocoders import Nominatim
from sklearn.cluster import KMeans
from urllib.request import urlopen
import matplotlib.colors as colors
import matplotlib.cm as cm
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd 
import json
import requests 
import geocoder
import folium 

# First Part: Scrap data from Wikipedia page into a DataFrame

In [14]:
html = urlopen('https://en.wikipedia.org/wiki/Hennepin_County,_Minnesota')
bs = BeautifulSoup(html, 'html.parser')

tabs = []
# find the li label
for i in bs.find_all("div", class_="div-col columns column-width")[1].findAll("li"):
    tabs.append(i.text)

MN_df = pd.DataFrame({"Suburbs": tabs})
MN_df.head()

Unnamed: 0,Suburbs
0,Bloomington
1,Brooklyn Center
2,Brooklyn Park
3,Champlin
4,Chanhassen (partial)


### Process the data_frame: 
1. Drop the nan rows in Borough column
2. Group neighborhoods in the same borough
3. For Neighborhood Not assigned, replace the value with Borough

In [15]:
# define a function to get coordinates
def get_latlng(suburb):
    # initialize your variable to None
    lat_lng_coords = None
    # loop until you get the coordinates
    while(lat_lng_coords is None):
        g = geocoder.arcgis('{}, Minneapolis, MN'.format(neighborhood))
        lat_lng_coords = g.latlng
    return lat_lng_coords

coords = [ get_latlng(Suburb) for Suburb in MN_df["Suburbs"].tolist() ]

In [27]:
address = 'Minneapolis'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The lat and lon of Minneapolis are {}, {}.'.format(latitude, longitude))

The lat and lon of Minneapolis are 44.9772995, -93.2654692.


In [29]:
# create map of Toronto using latitude and longitude values
map_mpls = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, suburb in zip(MN_df['Lat'], MN_df['Lon'], MN_df['Suburbs']):
    label = '{}'.format(suburb)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_mpls)  
    
map_mpls

## Use the Foursquare API to explore the neighborhoods

In [11]:
CLIENT_ID = '2TFRL2F2VNNMEZMRYXGAVGSS5IAUPLGR2NVAVX2JDP5FC1FS' # your Foursquare ID
CLIENT_SECRET = 'QAHJITKTZHQEUSYJP4K4U453XVOIP3QLTDOYTK50NWG1JS3A' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [31]:
radius = 2000
LIMIT = 100

venues = []

for lat, long, neighborhood in zip(MN_df['Lat'], MN_df['Lon'], MN_df['Suburbs']):
    
    # create the API request URL
    url = "https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}".format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        lat,
        long,
        radius, 
        LIMIT)
    
    # make the GET request
    results = requests.get(url).json()["response"]['groups'][0]['items']
    
    # return only relevant information for each nearby venue
    for venue in results:
        venues.append((
            neighborhood,
            lat, 
            long, 
            venue['venue']['name'], 
            venue['venue']['location']['lat'], 
            venue['venue']['location']['lng'],  
            venue['venue']['categories'][0]['name']))

In [75]:
# convert the venues list into a new DataFrame
venues = pd.DataFrame(venues)

# define the column names
venues.columns = ['Suburbs', 'Lat', 'Lon', 'VenueName', 'VenueLatitude', 'VenueLongitude', 'VenueCategory']

venues.head()

Unnamed: 0,Suburbs,Lat,Lon,VenueName,VenueLatitude,VenueLongitude,VenueCategory
0,Bloomington,44.935798,-93.252388,Powderhorn Park,44.937803,-93.256452,Park
1,Bloomington,44.935798,-93.252388,Tiny Diner,44.934437,-93.259101,American Restaurant
2,Bloomington,44.935798,-93.252388,May Day Cafe,44.939844,-93.252607,Bakery
3,Bloomington,44.935798,-93.252388,Matt's Bar,44.939533,-93.247376,Burger Joint
4,Bloomington,44.935798,-93.252388,Marla's Caribbean Cuisine,44.934169,-93.252303,Caribbean Restaurant


In [77]:
venues_process = pd.get_dummies(venues[['VenueCategory']], prefix="", prefix_sep="")
venues_process['Suburbs'] = venues['Suburbs']
venues_process.shape

(3481, 245)

In [78]:
venues_grouped = venues_process.groupby(["Suburbs"]).mean().reset_index()
restaurant_df = venues_grouped[venues_grouped.filter(like='Restaurant').columns]
restaurant_df['Restaurant Total']= restaurant_df.sum(axis=1)

restaurant_final= restaurant_df[['Restaurant Total']]
restaurant_final['Suburbs'] = venues_grouped[['Suburbs']]
restaurant_final

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Restaurant Total,Suburbs
0,0.25,Bloomington
1,0.205882,Brooklyn Center
2,0.223881,Brooklyn Park
3,0.157895,Champlin
4,0.24,Chanhassen (partial)
5,0.252632,Corcoran
6,0.173913,Crystal
7,0.24,Dayton (partial)
8,0.24,Deephaven
9,0.055556,Eden Prairie


## K-means cluster

In [80]:
nclusters = 3
restaurant_cluster=restaurant_final.drop(['Suburbs'], axis=1)
kmeans = KMeans(n_clusters=nclusters, random_state=0).fit(restaurant_cluster)

array([0, 2, 0, 2, 0, 0, 2, 0, 0, 1], dtype=int32)

In [87]:
# create a new dataframe that includes the cluster as well as the top 10 venues for each neighborhood.
result = restaurant_final.copy()

# add clustering labels
result["Cluster Labels"] = kmeans.labels_
result.head()

Unnamed: 0,Restaurant Total,Suburbs,Cluster Labels
0,0.25,Bloomington,0
1,0.205882,Brooklyn Center,2
2,0.223881,Brooklyn Park,0
3,0.157895,Champlin,2
4,0.24,Chanhassen (partial),0


In [88]:
result = result.join(MN_df.set_index("Suburbs"), on="Suburbs")
result.head()

Unnamed: 0,Restaurant Total,Suburbs,Cluster Labels,Lat,Lon
0,0.25,Bloomington,0,44.935798,-93.252388
1,0.205882,Brooklyn Center,2,45.094393,-93.367998
2,0.223881,Brooklyn Park,0,45.09448,-93.38835
3,0.157895,Champlin,2,45.17053,-93.392277
4,0.24,Chanhassen (partial),0,44.97902,-93.26494


In [90]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# set color scheme for the clusters
x = np.arange(nclusters)
ys = [i+x+(i*x)**2 for i in range(nclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(result['Lat'], result['Lon'], result['Suburbs'], result['Cluster Labels']):
    label = folium.Popup(str(poi) + ' - Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [91]:
result.loc[result['Cluster Labels'] == 0]

Unnamed: 0,Restaurant Total,Suburbs,Cluster Labels,Lat,Lon
0,0.25,Bloomington,0,44.935798,-93.252388
2,0.223881,Brooklyn Park,0,45.09448,-93.38835
4,0.24,Chanhassen (partial),0,44.97902,-93.26494
5,0.252632,Corcoran,0,44.94293,-93.24146
7,0.24,Dayton (partial),0,44.97902,-93.26494
8,0.24,Deephaven,0,44.97902,-93.26494
13,0.28,Greenfield,0,44.98223,-93.27644
15,0.24,Hanover (partial),0,44.97902,-93.26494
19,0.24,Loretto,0,44.97902,-93.26494
20,0.27,Maple Grove,0,44.972172,-93.285187


###So we can see most of the neighborhoods fall into label 0, and most areas are cafe, restaurants.

In [92]:
result.loc[result['Cluster Labels'] == 1]

Unnamed: 0,Restaurant Total,Suburbs,Cluster Labels,Lat,Lon
9,0.055556,Eden Prairie,1,44.900383,-93.379409
16,0.108696,Hopkins,1,44.811134,-93.286478
17,0.033333,Independence,1,45.060778,-93.399329
27,0.0,Minnetrista,1,44.957607,-93.650264
28,0.0,Mound,1,44.957607,-93.650264
37,0.076923,Shorewood,1,44.801397,-93.309462
40,0.0625,St. Bonifacius,1,44.90409,-93.74419
44,0.097561,Woodland,1,44.842149,-93.353159


In [93]:
result.loc[result['Cluster Labels'] == 2]

Unnamed: 0,Restaurant Total,Suburbs,Cluster Labels,Lat,Lon
1,0.205882,Brooklyn Center,2,45.094393,-93.367998
3,0.157895,Champlin,2,45.17053,-93.392277
6,0.173913,Crystal,2,45.026042,-93.323711
10,0.16,Edina,2,44.91645,-93.341182
11,0.125,Excelsior,2,44.927425,-93.370291
12,0.169492,Golden Valley,2,44.998758,-93.307808
14,0.157895,Greenwood,2,45.012919,-93.318908
18,0.206897,Long Lake,2,45.039022,-93.336887
23,0.164835,Medina,2,45.013158,-93.476999
31,0.2,Osseo,2,45.041994,-93.317944


## The suburbs in cluster 1 are the most preferred locations to open a new restaurant, because it has the lowest number in total restaurants and can aviod intensive competation. Property developers can also open new restaurants in suburbs within cluster 2 with moderate competition.