In [1]:
import requests
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import numpy as np
from bs4 import BeautifulSoup
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

### Processing Toronto data from wikipedia page.

In [2]:
Toronto_url = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
# Now that we have the url where our data from Toronto is, we use BeautifulSoup to extract our data.

soup = BeautifulSoup(Toronto_url,'lxml')

### Finding in Wikipedia Toronto's table with Boroughs and Neighborhoods starts

In [3]:
Toronto_BN = soup.find('table',{'class':'wikitable sortable'})

### Creating 3 lists with all Postal Codes, Borough and Neighborhoods

In [4]:
PostalCodes = []
Borough =[]
Neighborhood=[]
for row in Toronto_BN.findAll("tr"):
    cells = row.findAll("td")
    #For each "tr", assign each "td" to a variable.
    if len(cells) == 3:
        PostalCodes.append(cells[0].find(text=True))
        Borough.append(cells[1].find(text=True))
        Neighborhood.append(cells[2].find(text=True))
Toronto_df = pd.DataFrame({'PostalCode': PostalCodes,'Neighborhood':Neighborhood,'Borough': Borough})
# Dropping all rows where Borough is equal to 'Not assigned'
df_Toronto = Toronto_df[Toronto_df.Borough != 'Not assigned']
df_Toronto.reset_index(inplace=True)
df_Toronto = df_Toronto.replace('\n','', regex=True) # This removes all new lines ('\n') that I may have in all columns.
df_Toronto.drop('index',axis=1,inplace=True)

### Assigning the name of the Borough to neigborhoods with value 'Not assigned'

In [5]:
ind=[]
a = df_Toronto['Neighborhood'].str.find('Not assigned')
a = a.tolist()
for i in range(len(a)):
    if a[i] == 0:
        ind.append(i)
for id in ind:
    df_Toronto.loc[id,'Neighborhood']=df_Toronto.loc[id,'Borough']

In [6]:
# Creating one Postal Code per Borough and grouping in the Neighborhood column all Neighborhoods in the same Postal Code 
df_Torontonew = df_Toronto.groupby(df_Toronto['PostalCode'],as_index=False).aggregate({'Borough': 'first', 'Neighborhood': lambda x: ', '.join(x)},sort=False)


### Including LAT and LON (Using a CSV file)

In [7]:
LatLon_df = pd.read_csv(r'C:\Users\v.hernandez.byd\Documents\Data Science\Geospatial_Coordinates.csv')
LatLon_df = LatLon_df.rename(columns={'Postal Code': 'PostalCode'})
Toronto_data_df = pd.merge(df_Torontonew, LatLon_df, on='PostalCode')
Toronto_data_df.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


## Processing New York City Data (CSV file)

In [8]:
NewYork_df = pd.read_csv(r'C:\Users\v.hernandez.byd\Documents\Data Science\Coursera\NewYork_Data.csv')
NewYork_df.head(20)

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585
5,Bronx,Kingsbridge,40.881687,-73.902818
6,Manhattan,Marble Hill,40.876551,-73.91066
7,Bronx,Woodlawn,40.898273,-73.867315
8,Bronx,Norwood,40.877224,-73.879391
9,Bronx,Williamsbridge,40.881039,-73.857446


### Starting Exploring data using Foursquare data

In [9]:
CLIENT_ID ='SVDP1YJGJZ5EMR51DKHDVEITMXMDUSZ2AS4HCHNI4QSX1RX3'
CLIENT_SECRET= 'DTG3XJSF3XWPLC2KUVX0V4KSUEZNGBAIUHOLP3LQK0MX4WAL'
VERSION ='20180605'

### Creating a function to explore all neighborhoods in Toronto and New York

In [10]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    LIMIT=200
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Getting all venues in Toronto

In [13]:
Toronto_venues = getNearbyVenues(Toronto_data_df['Neighborhood'],
                                   latitudes=Toronto_data_df['Latitude'],
                                   longitudes=Toronto_data_df['Longitude'])
Toronto_venues.head(10)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Rouge, Malvern",43.806686,-79.194353,Wendy's,43.807448,-79.199056,Fast Food Restaurant
1,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,RIGHT WAY TO GOLF,43.785177,-79.161108,Golf Course
2,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Royal Canadian Legion,43.782533,-79.163085,Bar
3,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Swiss Chalet Rotisserie & Grill,43.767697,-79.189914,Pizza Place
4,"Guildwood, Morningside, West Hill",43.763573,-79.188711,G & G Electronics,43.765309,-79.191537,Electronics Store
5,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Big Bite Burrito,43.766299,-79.19072,Mexican Restaurant
6,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Enterprise Rent-A-Car,43.764042,-79.193371,Rental Car Location
7,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Woburn Medical Centre,43.766631,-79.192286,Medical Center
8,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Eggsmart,43.7678,-79.190466,Breakfast Spot
9,Woburn,43.770992,-79.216917,Starbucks,43.770037,-79.221156,Coffee Shop


### Getting all venues in New York

In [14]:
NewYork_venues = getNearbyVenues(names=NewYork_df['Neighborhood'],
                                   latitudes=NewYork_df['Latitude'],
                                   longitudes=NewYork_df['Longitude'])
NewYork_venues.head(10)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Wakefield,40.894705,-73.847201,Lollipops Gelato,40.894123,-73.845892,Dessert Shop
1,Wakefield,40.894705,-73.847201,Rite Aid,40.896521,-73.84468,Pharmacy
2,Wakefield,40.894705,-73.847201,Carvel Ice Cream,40.890487,-73.848568,Ice Cream Shop
3,Wakefield,40.894705,-73.847201,Cooler Runnings Jamaican Restaurant Inc,40.898276,-73.850381,Caribbean Restaurant
4,Wakefield,40.894705,-73.847201,Dunkin Donuts,40.890631,-73.849027,Donut Shop
5,Wakefield,40.894705,-73.847201,SUBWAY,40.890656,-73.849192,Sandwich Place
6,Wakefield,40.894705,-73.847201,Central Deli,40.896846,-73.844415,Deli / Bodega
7,Wakefield,40.894705,-73.847201,Baychester Avenue Food Truck,40.892293,-73.84323,Food Truck
8,Co-op City,40.874294,-73.829939,Dollar Tree,40.870125,-73.828989,Discount Store
9,Co-op City,40.874294,-73.829939,Capri II Pizza,40.876374,-73.82994,Pizza Place


### Creating a function to know number of venues per city

In [15]:
def city_venues_data(venue_data, venues):
    venues_data = pd.get_dummies(venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
    venues_data ['Neighborhood'] = venues['Neighborhood'] 

# move neighborhood column to the first column
    fixed_columns = [venues_data.columns[-1]] + list(venues_data.columns[:-1])
    venues_data = venues_data[fixed_columns]
    new_col_list=[]
    col_list= list(venues_data)
    for i in range(len(col_list)):
        if 'Neighborhood' in col_list[i] or 'Restaurant' in col_list[i] or 'Food' in col_list[i] or 'Cafe' in col_list[i] or 'Bar' in col_list[i] or 'Deli' in col_list[i] or 'Hotel' in col_list[i] or 'Motel' in col_list[i] or 'Park' in col_list[i] or 'Museum' in col_list[i] or 'Airport Terminal' in col_list[i] or 'Stadium' in col_list[i]:
            new_col_list.append(col_list[i])
    df= venues_data[new_col_list]
    return (df)

### Analyzing venues data for Toronto

In [16]:
Toronto_vdf = city_venues_data(Toronto_data_df, Toronto_venues)
Toronto_grouped = Toronto_vdf.groupby('Neighborhood').mean().reset_index()
Toronto_grouped.head()

Unnamed: 0,Neighborhood,Afghan Restaurant,Airport Food Court,Airport Terminal,American Restaurant,Arepa Restaurant,Art Museum,Asian Restaurant,Bar,Baseball Stadium,Basketball Stadium,Beer Bar,Belgian Restaurant,Brazilian Restaurant,Cafeteria,Cajun / Creole Restaurant,Caribbean Restaurant,Chinese Restaurant,Cocktail Bar,College Cafeteria,College Stadium,Comfort Food Restaurant,Cuban Restaurant,Deli / Bodega,Dim Sum Restaurant,Doner Restaurant,Dumpling Restaurant,Eastern European Restaurant,Empanada Restaurant,Ethiopian Restaurant,Falafel Restaurant,Fast Food Restaurant,Filipino Restaurant,Food,Food & Drink Shop,Food Court,Food Truck,French Restaurant,Gaming Cafe,Gay Bar,German Restaurant,Gluten-free Restaurant,Greek Restaurant,Hakka Restaurant,Health Food Store,History Museum,Hookah Bar,Hotel,Hotel Bar,Indian Restaurant,Indonesian Restaurant,Italian Restaurant,Japanese Restaurant,Jewish Restaurant,Juice Bar,Korean Restaurant,Latin American Restaurant,Malay Restaurant,Mediterranean Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Modern European Restaurant,Molecular Gastronomy Restaurant,Motel,Museum,New American Restaurant,Park,Piano Bar,Portuguese Restaurant,Ramen Restaurant,Restaurant,Sake Bar,Salon / Barbershop,Seafood Restaurant,Skate Park,Southern / Soul Food Restaurant,Sports Bar,Stadium,Sushi Restaurant,Taiwanese Restaurant,Tapas Restaurant,Thai Restaurant,Turkish Restaurant,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Wine Bar
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.04,0.0,0.01,0.03,0.03,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.03,0.0,0.01,0.0,0.0,0.02,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.03,0.0,0.01,0.01,0.0,0.0,0.0,0.0,0.02,0.0,0.0,0.04,0.0,0.01,0.0,0.01
1,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Analyzing venues data for New York

In [17]:
NewYork_vdf = city_venues_data(NewYork_df, NewYork_venues)
NewYork_grouped = NewYork_vdf.groupby('Neighborhood').mean().reset_index()
NewYork_grouped.head()

Unnamed: 0,Neighborhood,Afghan Restaurant,African Restaurant,American Restaurant,Arepa Restaurant,Argentinian Restaurant,Art Museum,Asian Restaurant,Australian Restaurant,Austrian Restaurant,Bar,Baseball Stadium,Beach Bar,Beer Bar,Belgian Restaurant,Brazilian Restaurant,Cafeteria,Cajun / Creole Restaurant,Cambodian Restaurant,Cantonese Restaurant,Caribbean Restaurant,Caucasian Restaurant,Chinese Restaurant,Cocktail Bar,College Cafeteria,Colombian Restaurant,Comfort Food Restaurant,Cuban Restaurant,Czech Restaurant,Deli / Bodega,Dim Sum Restaurant,Dive Bar,Dumpling Restaurant,Eastern European Restaurant,Egyptian Restaurant,Empanada Restaurant,English Restaurant,Ethiopian Restaurant,Falafel Restaurant,Fast Food Restaurant,Filipino Restaurant,Food,Food & Drink Shop,Food Court,Food Stand,Food Truck,French Restaurant,Gaming Cafe,Gay Bar,German Restaurant,Gluten-free Restaurant,Greek Restaurant,Halal Restaurant,Hawaiian Restaurant,Health Food Store,Himalayan Restaurant,History Museum,Hookah Bar,Hotel,Hotel Bar,Hotel Pool,Hotpot Restaurant,Hunan Restaurant,Indian Restaurant,Indonesian Restaurant,Israeli Restaurant,Italian Restaurant,Japanese Curry Restaurant,Japanese Restaurant,Jewish Restaurant,Juice Bar,Karaoke Bar,Kebab Restaurant,Korean Restaurant,Kosher Restaurant,Latin American Restaurant,Lebanese Restaurant,Malay Restaurant,Mediterranean Restaurant,Mexican Restaurant,Middle Eastern Restaurant,Modern European Restaurant,Molecular Gastronomy Restaurant,Moroccan Restaurant,Motel,Museum,New American Restaurant,Paella Restaurant,Pakistani Restaurant,Park,Persian Restaurant,Peruvian Restaurant,Piano Bar,Polish Restaurant,Portuguese Restaurant,Ramen Restaurant,Restaurant,Romanian Restaurant,Russian Restaurant,Sake Bar,Salon / Barbershop,Seafood Restaurant,Shabu-Shabu Restaurant,Shanghai Restaurant,Skate Park,Soba Restaurant,South American Restaurant,South Indian Restaurant,Southern / Soul Food Restaurant,Spanish Restaurant,Sports Bar,Sri Lankan Restaurant,Stadium,State / Provincial Park,Sushi Restaurant,Swiss Restaurant,Szechuan Restaurant,Taiwanese Restaurant,Tapas Restaurant,Tennis Stadium,Tex-Mex Restaurant,Thai Restaurant,Theme Park,Theme Park Ride / Attraction,Tibetan Restaurant,Tiki Bar,Turkish Restaurant,Udon Restaurant,Vegetarian / Vegan Restaurant,Venezuelan Restaurant,Vietnamese Restaurant,Whisky Bar,Wine Bar
0,Allerton,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Annadale,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Arden Heights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Arlington,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Arrochar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117647,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.117647,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058824,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Creating Clusters for Toronto and NY

In [18]:
# Creating a Clustering Function
def clustering(grouped):
# set number of clusters
    kclusters = 4
    grouped_clustering = grouped.drop('Neighborhood', 1)
# run k-means clustering
    kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(grouped_clustering)
# check cluster labels generated for each row in the dataframe
    return(kmeans.labels_) 

In [19]:
Toronto_cluster = clustering(Toronto_grouped)
Toronto_cluster

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 3, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1])

In [20]:
NewYork_cluster = clustering(NewYork_grouped)
NewYork_cluster

array([0, 0, 0, 3, 3, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 3, 3, 0,
       3, 0, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 0,
       0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 3, 3, 3, 0, 0,
       0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 3, 0,
       3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 3, 0, 3, 0, 0, 0, 0, 0, 3, 0, 3, 3, 0, 0, 0, 3, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 0, 0, 0, 3, 0, 0, 0,
       0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3,
       0, 0, 0, 0, 0, 3, 0, 0, 1, 0, 3, 0, 3, 0, 0, 1, 3, 0, 0, 0, 0, 2,
       0, 0, 0, 0, 3, 1, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 3, 0, 0,
       0, 0, 0, 0, 0, 3, 2, 0, 0, 0, 0, 0, 3, 0, 0,

### Creating new Data Frame with cluster labels and venues in every neighborhood for Toronto

In [21]:
Toronto_Top = Toronto_grouped
# add clustering labels
Toronto_Top['Cluster Labels'] = Toronto_cluster
# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
Toronto_Top = Toronto_Top.join(Toronto_venues.set_index('Neighborhood'), on='Neighborhood')
Toronto_CL=Toronto_Top[['Neighborhood', 'Cluster Labels','Neighborhood Latitude','Neighborhood Longitude','Venue','Venue Latitude','Venue Longitude','Venue Category']]
Toronto_CL=pd.DataFrame(Toronto_CL)
Toronto_CL.head(10)

Unnamed: 0,Neighborhood,Cluster Labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Adelaide, King, Richmond",0,43.650571,-79.384568,Four Seasons Centre for the Performing Arts,43.650609,-79.38628,Concert Hall
0,"Adelaide, King, Richmond",0,43.650571,-79.384568,Nathan Phillips Square,43.65227,-79.383516,Plaza
0,"Adelaide, King, Richmond",0,43.650571,-79.384568,The Keg Steakhouse & Bar,43.649937,-79.384196,Steakhouse
0,"Adelaide, King, Richmond",0,43.650571,-79.384568,Shangri-La Toronto,43.649129,-79.386557,Hotel
0,"Adelaide, King, Richmond",0,43.650571,-79.384568,Soho House Toronto,43.648734,-79.386541,Speakeasy
0,"Adelaide, King, Richmond",0,43.650571,-79.384568,Hy's Steakhouse,43.649505,-79.382919,Steakhouse
0,"Adelaide, King, Richmond",0,43.650571,-79.384568,Momofuku Noodle Bar,43.649366,-79.386217,Noodle House
0,"Adelaide, King, Richmond",0,43.650571,-79.384568,John & Sons Oyster House,43.650656,-79.381613,Seafood Restaurant
0,"Adelaide, King, Richmond",0,43.650571,-79.384568,Estiatorio Volos,43.650329,-79.384533,Greek Restaurant
0,"Adelaide, King, Richmond",0,43.650571,-79.384568,M Square Coffee Co,43.651218,-79.383555,Coffee Shop


### Creating new Data Frame with cluster labels and venues in every neighborhood for NY

In [22]:
NewYork_Top = NewYork_grouped
# add clustering labels
NewYork_Top['Cluster Labels'] = NewYork_cluster
# merge NY_grouped with toronto_data to add latitude/longitude for each neighborhood
NewYork_Top = NewYork_Top.join(NewYork_venues.set_index('Neighborhood'), on='Neighborhood')
NewYork_CL=NewYork_Top[['Neighborhood', 'Cluster Labels','Neighborhood Latitude','Neighborhood Longitude','Venue','Venue Latitude','Venue Longitude','Venue Category']]
NewYork_CL=pd.DataFrame(NewYork_CL)
NewYork_CL.head(10)

Unnamed: 0,Neighborhood,Cluster Labels,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Allerton,0,40.865788,-73.859319,Domenick's Pizzeria,40.865576,-73.858124,Pizza Place
0,Allerton,0,40.865788,-73.859319,White Castle,40.866065,-73.862307,Fast Food Restaurant
0,Allerton,0,40.865788,-73.859319,Bronx Martial Arts Academy,40.865721,-73.857529,Martial Arts Dojo
0,Allerton,0,40.865788,-73.859319,Sal & Doms Bakery,40.865377,-73.855236,Dessert Shop
0,Allerton,0,40.865788,-73.859319,Dunkin Donuts,40.865318,-73.858956,Donut Shop
0,Allerton,0,40.865788,-73.859319,IHOP,40.865728,-73.86246,Breakfast Spot
0,Allerton,0,40.865788,-73.859319,Li's Kitchen,40.863566,-73.858372,Chinese Restaurant
0,Allerton,0,40.865788,-73.859319,Dollar Tree,40.866239,-73.861546,Discount Store
0,Allerton,0,40.865788,-73.859319,Foodtown,40.86537,-73.858152,Supermarket
0,Allerton,0,40.865788,-73.859319,Popeye's,40.866786,-73.861208,Fried Chicken Joint


### Evaluating each cluster according to amount of venues we are evaluating to take the desicion

In [23]:
Toronto_Rest =Toronto_CL.loc[Toronto_CL['Venue Category'] == 'Restaurant', Toronto_CL.columns[[1] + list(range(4, Toronto_CL.shape[1]))]]
Toronto_Rest.head()

Unnamed: 0,Cluster Labels,Venue,Venue Latitude,Venue Longitude,Venue Category
0,0,Bannock,43.652101,-79.381178,Restaurant
0,0,Canoe,43.647452,-79.38132,Restaurant
0,0,Bymark,43.647217,-79.381252,Restaurant
5,0,Bagel Plus,43.755395,-79.440686,Restaurant
7,0,Darbar Persian Grill,43.735484,-79.420006,Restaurant


In [24]:
Toronto_Hotel =Toronto_CL.loc[Toronto_CL['Venue Category'] == 'Hotel', Toronto_CL.columns[[1] + list(range(4, Toronto_CL.shape[1]))]]
Toronto_Hotel.head()

Unnamed: 0,Cluster Labels,Venue,Venue Latitude,Venue Longitude,Venue Category
0,0,Shangri-La Toronto,43.649129,-79.386557,Hotel
0,0,Hilton Toronto,43.649852,-79.385576,Hotel
0,0,The Adelaide Hotel Toronto,43.649831,-79.380164,Hotel
8,0,"The Westin Harbour Castle, Toronto",43.641211,-79.375749,Hotel
17,0,Hilton Garden Inn,43.638519,-79.618721,Hotel


In [25]:
Toronto_Airport=Toronto_CL.loc[Toronto_CL['Venue Category'] == 'Airport', Toronto_CL.columns[[1] + list(range(4, Toronto_CL.shape[1]))]]
Toronto_Airport.head()

Unnamed: 0,Cluster Labels,Venue,Venue Latitude,Venue Longitude,Venue Category
13,1,Toronto Downsview Airport (YZD),43.738883,-79.470111,Airport
14,0,Billy Bishop Toronto City Airport (YTZ) (Billy...,43.631574,-79.396,Airport


In [26]:
Toronto_Stadium=Toronto_CL.loc[Toronto_CL['Venue Category'] == 'Stadium', Toronto_CL.columns[[1] + list(range(4, Toronto_CL.shape[1]))]]
Toronto_Stadium.head()

Unnamed: 0,Cluster Labels,Venue,Venue Latitude,Venue Longitude,Venue Category
11,0,Lamport Stadium,43.638778,-79.423534,Stadium


In [27]:
NY_Restaurant=NewYork_CL.loc[NewYork_CL['Venue Category'] == 'Restaurant', NewYork_CL.columns[[1] + list(range(4, NewYork_CL.shape[1]))]]
NY_Restaurant.head()

Unnamed: 0,Cluster Labels,Venue,Venue Latitude,Venue Longitude,Venue Category
1,0,Il Sogno,40.541286,-74.178489,Restaurant
9,0,Lazzat Restaurant,40.601619,-74.001519,Restaurant
12,0,Applebee's Grill + Bar,40.779199,-73.776437,Restaurant
22,3,Applebee's Grill + Bar,40.86014,-73.891113,Restaurant
22,3,Golden City,40.854018,-73.889059,Restaurant


In [28]:
NY_Hotel=NewYork_CL.loc[NewYork_CL['Venue Category'] == 'Hotel', NewYork_CL.columns[[1] + list(range(4, NewYork_CL.shape[1]))]]
NY_Hotel.head()

Unnamed: 0,Cluster Labels,Venue,Venue Latitude,Venue Longitude,Venue Category
4,3,Staten Island Navy Lodge,40.598734,-74.062288,Hotel
10,0,Conrad New York,40.714911,-74.015461,Hotel
10,0,W New York - Downtown,40.709196,-74.013596,Hotel
10,0,New York Marriott Downtown,40.709521,-74.014486,Hotel
10,0,Holiday Inn Manhattan-Financial District,40.708286,-74.014084,Hotel


In [29]:
NY_Airport=NewYork_CL.loc[NewYork_CL['Venue Category'] == 'Airport', NewYork_CL.columns[[1] + list(range(4, NewYork_CL.shape[1]))]]
NY_Airport.head()

Unnamed: 0,Cluster Labels,Venue,Venue Latitude,Venue Longitude,Venue Category


In [30]:
NY_Stadium=NewYork_CL.loc[NewYork_CL['Venue Category'] == 'Stadium', NewYork_CL.columns[[1] + list(range(4, NewYork_CL.shape[1]))]]
NY_Stadium.head()

Unnamed: 0,Cluster Labels,Venue,Venue Latitude,Venue Longitude,Venue Category
266,0,Cotto Vs Margarito,40.645668,-74.004526,Stadium


## The infrastructure criteria will be assessed based on the following weighting percentages: 
#### Food and Beverage = 20 %
#### Transport (Airport)  = 30%
#### Accomodation = 20%
####  Stadium = 30%

##### * Commercial Criteria  is not considered for the election (Estimated revenues, Marketing, etc)

#### Comparing all venues from Toronto and NY according to their categories and ppliying Criteria.

In [71]:
# Food and Beverage:
a= Toronto_Rest['Venue Category'].count()
b=NY_Restaurant['Venue Category'].count()
print('The Number of Restaurants in Toronto are:',a,'represeting a weight of:',a*0.2,'on the overall evaluation') 
print('The Number of Restaurants in New York are:',b,'represeting a weight of:',b*0.2,'on the overall evaluation') 

The Number of Restaurants in Toronto are: 65 represeting a weight of: 13.0 on the overall evaluation
The Number of Restaurants in New York are: 88 represeting a weight of: 17.6 on the overall evaluation


In [80]:
# Accomodation:
c=Toronto_Hotel['Venue Category'].count()
d= NY_Hotel['Venue Category'].count()
print('The Number of Hotels in Toronto are:',c,'represeting a weight of:',c*0.2,'on the overall evaluation') 
print('The Number of Hotels in New York are:',d,'represeting a weight of:',d*0.2,'on the overall evaluation') 

The Number of Hotels in Toronto are: 43 represeting a weight of: 8.6 on the overall evaluation
The Number of Hotels in New York are: 97 represeting a weight of: 19.400000000000002 on the overall evaluation


In [79]:
# Airports:
e= Toronto_Airport['Venue Category'].count()
f=NY_Airport['Venue Category'].count()
print('The Number of Hotels in Toronto are:',e,'represeting a weight of:',e*0.3,'on the overall evaluation') 
print('The Number of Hotels in New York are:',2,'represeting a weight of:',2*0.3,'on the overall evaluation') 

The Number of Hotels in Toronto are: 2 represeting a weight of: 0.6 on the overall evaluation
The Number of Hotels in New York are: 2 represeting a weight of: 0.6 on the overall evaluation


In [81]:
# Stadiums:
g=Toronto_Stadium['Venue Category'].count()
h=NY_Stadium['Venue Category'].count()
print('The Number of Hotels in Toronto are:',g,'represeting a weight of:',g*0.3,'on the overall evaluation') 
print('The Number of Hotels in New York are:',h,'represeting a weight of:',h*0.3,'on the overall evaluation')

The Number of Hotels in Toronto are: 1 represeting a weight of: 0.3 on the overall evaluation
The Number of Hotels in New York are: 1 represeting a weight of: 0.3 on the overall evaluation


In [83]:
def winner (F_B,Acc,Air,Stadium):
    win=F_B+Acc+Air,Stadium
    return win

In [87]:
if winner(a,c,e,g) > winner (b,d,f,h):
    print ('The winner is: TORONTO')
else:
    print ('The winner is: NEW YORK')

The winner is: NEW YORK
