### COURSERA | IBM DATA SCIENCE | CITY COMPARISON
The purpose of this program is to compare neighborhoods in Toronto with those in Manhattan based on the similarity of the types of venues located nearby. <br> This program uses Canadian postal code data from Wikipedia, location data from the Foursquare API, and geographical data from Coursera. <br> The Folium package was used to render all geographical data. <br><br>Venues were limited to 100 within a 500m radius of each neighborhood, and the top 5 venue types of each neighborhood were analyzed. <br> Similarity was calculated based on the frequency of the top unique venue types shared between the neighborhoods of Toronto and Manhattan.

In [1]:
# @hidden_cell
# Secrets are read from the environment instead of being stored in the
# notebook. Any key that was previously committed here should be treated as
# leaked and rotated.
import os


def _secret(env_name):
    """Return the value of environment variable ``env_name`` ('' if unset).

    A warning is printed when the variable is missing so that a later
    authentication failure can be traced back to this cell.
    """
    value = os.environ.get(env_name, '')
    if not value:
        print('WARNING: environment variable {} is not set'.format(env_name))
    return value


# IBM Cloud Object Storage settings consumed by download_file().
credentials = {
    'IAM_SERVICE_ID': _secret('IBM_IAM_SERVICE_ID'),
    'IBM_API_KEY_ID': _secret('IBM_API_KEY_ID'),
    'ENDPOINT': 'https://s3-api.us-geo.objectstorage.service.networklayer.com',
    'IBM_AUTH_ENDPOINT': 'https://iam.ng.bluemix.net/oidc/token',
    'BUCKET': 'project-donotdelete-pr-8ka5t6nq466jou',
    'FILE': 'canada_postal_codes.csv'
}

# Foursquare API settings used when building venue requests.
CLIENT_ID = _secret('FOURSQUARE_CLIENT_ID')
CLIENT_SECRET = _secret('FOURSQUARE_CLIENT_SECRET')
VERSION = '20190812'

In [2]:
from ibm_botocore.client import Config
import ibm_boto3

def download_file(credentials):
    """Download ``credentials['FILE']`` from IBM Cloud Object Storage.

    The object is saved in the working directory under the same name as its
    key in the bucket.

    Parameters
    ----------
    credentials : dict
        Must provide 'IBM_API_KEY_ID', 'IAM_SERVICE_ID', 'IBM_AUTH_ENDPOINT',
        'ENDPOINT', 'BUCKET' and 'FILE'.

    Returns
    -------
    bool
        True when the download succeeded, False otherwise. A failure is
        reported together with its cause instead of being silently dropped.
    """
    cos = ibm_boto3.client(
        service_name='s3',
        ibm_api_key_id=credentials['IBM_API_KEY_ID'],
        ibm_service_instance_id=credentials['IAM_SERVICE_ID'],
        ibm_auth_endpoint=credentials['IBM_AUTH_ENDPOINT'],
        config=Config(signature_version='oauth'),
        endpoint_url=credentials['ENDPOINT'])

    try:
        cos.download_file(Bucket=credentials['BUCKET'],
                          Key=credentials['FILE'],
                          Filename=credentials['FILE'])
    except Exception as error:
        # Keep the notebook running, but surface the reason for the failure.
        print('FAILED TO DOWNLOAD FILE: {}'.format(error))
        return False
    print('SUCCESSFULLY DOWNLOADED FILE')
    return True

#### PART 1: DOWNLOAD & CLEAN CANADA DATA

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Fetch the postal-code CSV from object storage, then load the local copy.
download_file(credentials)
tor_df = pd.read_csv(filepath_or_buffer=credentials['FILE'])
tor_df.head(n=5)

SUCCESSFULLY DOWNLOADED FILE


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [5]:
# Clean the postal-code table in one chain so no step mutates a filtered
# slice in place (the earlier in-place replace on a column of a slice is not
# guaranteed to write back to the frame).
tor_df = (
    tor_df
    .rename(columns={'Postcode': 'Postal Code', 'Neighbourhood': 'Neighborhood'})
    # Drop postal codes that have no borough.
    .loc[lambda frame: frame['Borough'] != 'Not assigned']
    # A neighborhood without a name takes the name of its borough.
    .assign(Neighborhood=lambda frame: frame['Neighborhood'].mask(
        frame['Neighborhood'] == 'Not assigned', frame['Borough']))
    # One row per postal code, neighborhoods joined into a single string.
    .groupby(['Postal Code', 'Borough'], sort=False)['Neighborhood']
    .apply(', '.join)
    .reset_index()
    .sort_values('Postal Code')
    .reset_index(drop=True)
)
tor_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [6]:
coord_path = 'https://cocl.us/Geospatial_data'
df_coord = pd.read_csv(coord_path)
# Attach coordinates by postal code rather than by row position, so the result
# does not depend on both tables being in the same order and of the same length.
# NOTE(review): assumes the coordinate file names its key column 'Postal Code'
# - confirm against the downloaded CSV.
tor_df = (
    tor_df
    # Makes the cell safe to re-run after the columns were already added.
    .drop(columns=['Latitude', 'Longitude'], errors='ignore')
    .merge(df_coord[['Postal Code', 'Latitude', 'Longitude']],
           on='Postal Code', how='left', validate='many_to_one')
)
tor_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [7]:
# Each row of tor_df is one postal code with its grouped neighborhoods.
print('NEIGHBORHOODS IN CANADA:', tor_df.shape[0])

NEIGHBORHOODS IN CANADA: 103


#### PART 2: DOWNLOAD & CLEAN NEW YORK DATA

In [8]:
import json
import requests

In [9]:
# Download with requests so that a failed transfer raises here instead of
# being hidden by a quiet shell command followed by an unconditional message.
ny_response = requests.get('https://cocl.us/new_york_dataset', timeout=60)
ny_response.raise_for_status()
with open('newyork_data.json', 'wb') as ny_file:
    ny_file.write(ny_response.content)
print('SUCCESSFULLY DOWNLOADED FILE')

SUCCESSFULLY DOWNLOADED FILE


In [10]:
# Load the GeoJSON-style feature list; only the features are needed afterwards.
with open('newyork_data.json') as ny_data:
    data = json.load(ny_data)['features']

columns = ['Borough', 'Neighborhood', 'Latitude', 'Longitude']
# Build every record first and create the frame once: appending row by row
# copies the whole frame on each iteration, and DataFrame.append no longer
# exists in pandas 2.x.
man_df = pd.DataFrame(
    [
        {
            'Borough': entry['properties']['borough'],
            'Neighborhood': entry['properties']['name'],
            # Coordinates are stored as [longitude, latitude].
            'Latitude': entry['geometry']['coordinates'][1],
            'Longitude': entry['geometry']['coordinates'][0],
        }
        for entry in data
    ],
    columns=columns,
)
man_df.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [11]:
# At this point man_df still holds every New York borough.
print('NEIGHBORHOODS IN NEW YORK:', man_df.shape[0])

NEIGHBORHOODS IN NEW YORK: 306


#### PART 3: VISUALIZE NEIGHBORHOODS IN TORONTO & MANHATTAN

In [12]:
!pip install folium
import folium
print('SUCCESSFULLY DOWNLOADED FOLIUM')

SUCCESSFULLY DOWNLOADED FOLIUM


In [13]:
# Keep only boroughs whose name mentions Toronto; the postal code is not used
# from here on.
tor_df = (
    tor_df
    .loc[tor_df['Borough'].str.contains('Toronto')]
    .reset_index(drop=True)
    .drop(columns=['Postal Code'])
)
tor_df.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,East Toronto,The Beaches,43.676357,-79.293031
1,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
2,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572
3,East Toronto,Studio District,43.659526,-79.340923
4,Central Toronto,Lawrence Park,43.72802,-79.38879


In [14]:
# Row count after restricting to the Toronto boroughs.
print('NEIGHBORHOODS IN TORONTO:', tor_df.shape[0])

NEIGHBORHOODS IN TORONTO: 38


In [15]:
# Restrict the New York table to the Manhattan borough.
man_df = man_df.loc[man_df['Borough'].eq('Manhattan')].reset_index(drop=True)
man_df.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.91066
1,Manhattan,Chinatown,40.715618,-73.994279
2,Manhattan,Washington Heights,40.851903,-73.9369
3,Manhattan,Inwood,40.867684,-73.92121
4,Manhattan,Hamilton Heights,40.823604,-73.949688


In [16]:
# Row count after restricting to Manhattan.
print('NEIGHBORHOODS IN MANHATTAN:', man_df.shape[0])

NEIGHBORHOODS IN MANHATTAN: 40


In [17]:
def create_map(df, latitude, longitude):
    """Return a folium map centred on (latitude, longitude) with one marker per row.

    ``df`` must provide 'Borough', 'Neighborhood', 'Latitude' and 'Longitude'
    columns. Each marker's popup reads "<neighborhood>, <borough>".
    """
    city_map = folium.Map(location=[latitude, longitude], zoom_start=12)
    places = df[['Borough', 'Neighborhood', 'Latitude', 'Longitude']]
    for borough, neighborhood, lat, lng in places.itertuples(index=False, name=None):
        marker = folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=folium.Popup('{}, {}'.format(neighborhood, borough)),
            color='blue',
            fill=True,
            fill_color='#3186CC',
            fill_opacity=0.8,
        )
        marker.add_to(city_map)
    return city_map

In [18]:
# Map of the Toronto neighborhoods, centred on the coordinates given here.
tor_map = create_map(df=tor_df, latitude=43.6532, longitude=-79.3832)
tor_map

In [19]:
# Map of the Manhattan neighborhoods, centred on the coordinates given here.
man_map = create_map(df=man_df, latitude=40.7831, longitude=-73.9712)
man_map

#### PART 4: EXPLORE NEARBY VENUES IN TORONTO & MANHATTAN

In [20]:
def get_venues(name, lat, lng):
    """Fetch venues around one neighborhood from the Foursquare explore endpoint.

    Up to 100 venues within a 500 m radius of (lat, lng) are requested.

    Parameters
    ----------
    name : str
        Neighborhood name, repeated on every returned row.
    lat, lng : float
        Coordinates of the neighborhood.

    Returns
    -------
    pandas.DataFrame
        Columns 'Neighborhood', 'Neighborhood Latitude',
        'Neighborhood Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Category'. The frame has these columns even
        when no venue is found.

    Raises
    ------
    requests.HTTPError
        When the API answers with an error status.

    Notes
    -----
    A later cell in this notebook defines a different function under the same
    name; re-run this cell before calling this three-argument form again.
    """
    columns = ['Neighborhood', 'Neighborhood Latitude', 'Neighborhood Longitude',
               'Venue', 'Venue Latitude', 'Venue Longitude', 'Category']
    # Let requests build and escape the query string.
    response = requests.get(
        'https://api.foursquare.com/v2/venues/explore',
        params={
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': 500,
            'limit': 100,
        },
        timeout=30,  # do not hang the notebook on a stalled connection
    )
    response.raise_for_status()
    results = response.json()['response']['groups'][0]['items']

    venues = []
    for item in results:
        venue = item['venue']
        categories = venue.get('categories') or []
        # A venue without any category is kept, with a missing category value.
        category = categories[0]['name'] if categories else None
        venues.append([name, lat, lng,
                       venue['name'], venue['location']['lat'], venue['location']['lng'],
                       category])
    # Passing the columns here keeps an empty result well-formed; assigning
    # column names to an empty frame afterwards raises a length mismatch.
    return pd.DataFrame(venues, columns=columns)

In [21]:
def get_city_venues(df):
    """Collect the nearby venues of every neighborhood in ``df`` into one frame.

    ``df`` must provide 'Neighborhood', 'Latitude' and 'Longitude' columns;
    ``get_venues`` is called once per row. The result has a fresh 0..n-1
    index. An empty ``df`` yields an empty DataFrame.

    Notes
    -----
    A later cell in this notebook defines a two-argument function under the
    same name; re-run this cell before calling this one-argument form again.
    """
    frames = [
        get_venues(neighborhood, lat, lng)
        for neighborhood, lat, lng in zip(df['Neighborhood'], df['Latitude'], df['Longitude'])
    ]
    if not frames:
        return pd.DataFrame()
    # Concatenate once; growing a frame inside the loop copies it every time.
    return pd.concat(frames, ignore_index=True)

In [22]:
# One API request per Toronto neighborhood.
tor_venue_df = get_city_venues(df=tor_df)
tor_venue_df.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Category
0,The Beaches,43.676357,-79.293031,Glen Manor Ravine,43.676821,-79.293942,Trail
1,The Beaches,43.676357,-79.293031,The Big Carrot Natural Food Market,43.678879,-79.297734,Health Food Store
2,The Beaches,43.676357,-79.293031,Grover Pub and Grub,43.679181,-79.297215,Pub
3,The Beaches,43.676357,-79.293031,Upper Beaches,43.680563,-79.292869,Neighborhood
4,"The Danforth West, Riverdale",43.679557,-79.352188,Pantheon,43.677621,-79.351434,Greek Restaurant


In [23]:
# Number of venues returned for each Toronto neighborhood.
tor_venue_df.groupby('Neighborhood')['Venue'].count()

Neighborhood
Adelaide, King, Richmond                                                                                      100
Berczy Park                                                                                                    56
Brockton, Exhibition Place, Parkdale Village                                                                   21
Business Reply Mail Processing Centre 969 Eastern                                                              19
CN Tower, Bathurst Quay, Island airport, Harbourfront West, King and Spadina, Railway Lands, South Niagara     15
Cabbagetown, St. James Town                                                                                    45
Central Bay Street                                                                                             84
Chinatown, Grange Park, Kensington Market                                                                     100
Christie                                                                   

In [24]:
# Total venue rows collected for Toronto.
print('VENUES IN TORONTO:', tor_venue_df.shape[0])

VENUES IN TORONTO: 1689


In [25]:
# Distinct venue categories seen in Toronto.
print('UNIQUE VENUE TYPES IN TORONTO:', len(pd.unique(tor_venue_df['Category'])))

UNIQUE VENUE TYPES IN TORONTO: 236


In [26]:
# One API request per Manhattan neighborhood.
man_venue_df = get_city_venues(df=man_df)
man_venue_df.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Category
0,Marble Hill,40.876551,-73.91066,Arturo's,40.874412,-73.910271,Pizza Place
1,Marble Hill,40.876551,-73.91066,Bikram Yoga,40.876844,-73.906204,Yoga Studio
2,Marble Hill,40.876551,-73.91066,Tibbett Diner,40.880404,-73.908937,Diner
3,Marble Hill,40.876551,-73.91066,Starbucks,40.877531,-73.905582,Coffee Shop
4,Marble Hill,40.876551,-73.91066,Dunkin',40.877136,-73.906666,Donut Shop


In [27]:
# Number of venues returned for each Manhattan neighborhood.
man_venue_df.groupby('Neighborhood')['Venue'].count()

Neighborhood
Battery Park City      100
Carnegie Hill          100
Central Harlem          43
Chelsea                100
Chinatown              100
Civic Center           100
Clinton                100
East Harlem             44
East Village           100
Financial District     100
Flatiron               100
Gramercy               100
Greenwich Village      100
Hamilton Heights        60
Hudson Yards            79
Inwood                  57
Lenox Hill             100
Lincoln Square         100
Little Italy           100
Lower East Side         60
Manhattan Valley        59
Manhattanville          40
Marble Hill             24
Midtown                100
Midtown South          100
Morningside Heights     40
Murray Hill            100
Noho                   100
Roosevelt Island        27
Soho                   100
Stuyvesant Town         18
Sutton Place           100
Tribeca                100
Tudor City              81
Turtle Bay             100
Upper East Side        100
Upper West Side

In [28]:
# Total venue rows collected for Manhattan.
print('VENUES IN MANHATTAN:', man_venue_df.shape[0])

VENUES IN MANHATTAN: 3317


In [29]:
# Distinct venue categories seen in Manhattan.
print('UNIQUE VENUE TYPES IN MANHATTAN:', len(pd.unique(man_venue_df['Category'])))

UNIQUE VENUE TYPES IN MANHATTAN: 337


#### PART 5: ANALYZE VENUE TYPE FREQUENCY IN TORONTO & MANHATTAN

In [30]:
def onehot_encode(venue_df):
    """One-hot encode the 'Category' column of ``venue_df``.

    Returns a frame with one indicator column per category, preceded by a
    'NEIGHBORHOOD' column copied from ``venue_df['Neighborhood']``.
    """
    encoded = pd.get_dummies(venue_df[['Category']], prefix='', prefix_sep='')
    encoded['NEIGHBORHOOD'] = venue_df['Neighborhood']
    # Move the last column (the neighborhood label) to the front.
    ordered = list(encoded.columns)
    ordered.insert(0, ordered.pop())
    return encoded[ordered]

In [31]:
# One indicator row per Toronto venue.
tor_onehot_df = onehot_encode(venue_df=tor_venue_df)
tor_onehot_df.head()

Unnamed: 0,NEIGHBORHOOD,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,The Beaches,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"The Danforth West, Riverdale",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# Share of each category among a Toronto neighborhood's venues.
tor_grouped_df = tor_onehot_df.groupby('NEIGHBORHOOD', as_index=False).mean()
tor_grouped_df.head()

Unnamed: 0,NEIGHBORHOOD,Afghan Restaurant,Airport,Airport Food Court,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Antique Shop,Aquarium,...,Thrift / Vintage Store,Toy / Game Store,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Women's Store,Yoga Studio
0,"Adelaide, King, Richmond",0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.01,0.01,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.017857,0.0,0.0,0.0,0.0,0.0
2,"Brockton, Exhibition Place, Parkdale Village",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052632
4,"CN Tower, Bathurst Quay, Island airport, Harbo...",0.0,0.066667,0.066667,0.133333,0.2,0.133333,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# One indicator row per Manhattan venue.
man_onehot_df = onehot_encode(venue_df=man_venue_df)
man_onehot_df.head()

Unnamed: 0,NEIGHBORHOOD,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,American Restaurant,Antique Shop,Arcade,Arepa Restaurant,Argentinian Restaurant,...,Video Store,Vietnamese Restaurant,Volleyball Court,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Marble Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Marble Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,Marble Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Marble Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Marble Hill,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
# Share of each category among a Manhattan neighborhood's venues.
man_grouped_df = man_onehot_df.groupby('NEIGHBORHOOD', as_index=False).mean()
man_grouped_df.head()

Unnamed: 0,NEIGHBORHOOD,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,American Restaurant,Antique Shop,Arcade,Arepa Restaurant,Argentinian Restaurant,...,Video Store,Vietnamese Restaurant,Volleyball Court,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Battery Park City,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.01,0.03,0.0,0.03,0.0
1,Carnegie Hill,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.01,...,0.0,0.02,0.0,0.0,0.0,0.01,0.03,0.0,0.01,0.03
2,Central Harlem,0.0,0.0,0.0,0.069767,0.046512,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Chelsea,0.0,0.0,0.0,0.0,0.03,0.01,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.01,0.0
4,Chinatown,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,...,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# How many of the most frequent categories are kept per neighborhood.
NUM_TOP_VENUES = 5


def get_top_venues(row):
    """Return the labels of the highest-valued entries of ``row``.

    The first element of ``row`` (the neighborhood name) is skipped; the
    remaining entries are ranked in descending order and the labels of the
    first NUM_TOP_VENUES are returned as an array.

    NOTE: the cut is purely positional, so entries whose value is zero are
    returned when fewer than NUM_TOP_VENUES entries are non-zero.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:NUM_TOP_VENUES]

In [36]:
def get_city_venues(df, grouped_df):
    """Attach each neighborhood's top venue categories to ``df``.

    ``grouped_df`` holds one row per neighborhood: a 'NEIGHBORHOOD' column
    followed by the category frequency columns. The returned frame is ``df``
    joined on 'Neighborhood' with the columns 'Top Venue 1' through
    'Top Venue <NUM_TOP_VENUES>'.

    NOTE: this definition replaces the one-argument function of the same name
    defined in an earlier cell.
    """
    top_columns = ['Top Venue {}'.format(rank) for rank in range(1, NUM_TOP_VENUES + 1)]
    top_rows = [get_top_venues(grouped_df.iloc[pos, :]) for pos in range(len(grouped_df))]
    venues_sorted_df = pd.DataFrame(top_rows, columns=top_columns,
                                    index=grouped_df['NEIGHBORHOOD'])
    return df.join(venues_sorted_df, on='Neighborhood')

In [37]:
# Toronto neighborhoods together with their top venue categories.
tor_merged_df = get_city_venues(df=tor_df, grouped_df=tor_grouped_df)
tor_merged_df.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Top Venue 1,Top Venue 2,Top Venue 3,Top Venue 4,Top Venue 5
0,East Toronto,The Beaches,43.676357,-79.293031,Health Food Store,Neighborhood,Trail,Pub,Filipino Restaurant
1,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,Greek Restaurant,Coffee Shop,Italian Restaurant,Ice Cream Shop,Furniture / Home Store
2,East Toronto,"The Beaches West, India Bazaar",43.668999,-79.315572,Park,Sandwich Place,Fish & Chips Shop,Pub,Fast Food Restaurant
3,East Toronto,Studio District,43.659526,-79.340923,Café,Coffee Shop,Italian Restaurant,American Restaurant,Bakery
4,Central Toronto,Lawrence Park,43.72802,-79.38879,Park,Bus Line,Swim School,Yoga Studio,Donut Shop


In [38]:
# Manhattan neighborhoods together with their top venue categories.
man_merged_df = get_city_venues(df=man_df, grouped_df=man_grouped_df)
man_merged_df.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Top Venue 1,Top Venue 2,Top Venue 3,Top Venue 4,Top Venue 5
0,Manhattan,Marble Hill,40.876551,-73.91066,Discount Store,Sandwich Place,Coffee Shop,Yoga Studio,Steakhouse
1,Manhattan,Chinatown,40.715618,-73.994279,Chinese Restaurant,Cocktail Bar,Salon / Barbershop,American Restaurant,Vietnamese Restaurant
2,Manhattan,Washington Heights,40.851903,-73.9369,Café,Bakery,Mobile Phone Shop,Grocery Store,Mexican Restaurant
3,Manhattan,Inwood,40.867684,-73.92121,Mexican Restaurant,Café,Lounge,Pizza Place,Restaurant
4,Manhattan,Hamilton Heights,40.823604,-73.949688,Pizza Place,Café,Mexican Restaurant,Coffee Shop,Park


#### PART 6: VENUE SIMILARITY ANALYSIS BETWEEN TORONTO & MANHATTAN

In [39]:
def get_venues(merged_df):
    """Stack the columns of ``merged_df`` from position 4 onward into one column.

    Returns a single-column frame named 'Venue' with a fresh 0..n-1 index;
    values appear column by column, in the original column order.

    NOTE: this definition replaces the three-argument function of the same
    name defined in an earlier cell.
    """
    top_columns = merged_df.columns[4:]
    stacked = pd.concat([merged_df[label] for label in top_columns]).reset_index(drop=True)
    return pd.DataFrame({'Venue': stacked})

In [40]:
def get_freq_venues(top_venues):
    """Count how often each venue type occurs in ``top_venues['Venue']``.

    Parameters
    ----------
    top_venues : pandas.DataFrame
        Must provide a 'Venue' column.

    Returns
    -------
    pandas.DataFrame
        Columns 'Venue', 'Count' and 'Frequency' (Count divided by the number
        of rows of ``top_venues``), ordered by descending frequency with a
        fresh 0..n-1 index. Venue types with equal frequency are ordered by
        name. Missing venue values are not counted.
    """
    # Names and counts come from the same Series, so they cannot drift apart
    # the way two separately sorted sequences can.
    counts = top_venues['Venue'].value_counts(sort=False).sort_index()
    df = pd.DataFrame({'Venue': list(counts.index), 'Count': counts.values})
    df['Frequency'] = df['Count'] / len(top_venues)
    # A stable sort keeps the by-name order among ties, making output repeatable.
    df = df.sort_values('Frequency', ascending=False, kind='mergesort')
    return df.reset_index(drop=True)

In [41]:
# All top-venue entries of Toronto stacked into one column.
tor_top_venues = get_venues(merged_df=tor_merged_df)
print('TOP VENUES IN TORONTO:', tor_top_venues.shape[0])

TOP VENUES IN TORONTO: 190


In [42]:
# Frequency of each venue type among Toronto's top-venue entries.
tor_freq_venues = get_freq_venues(top_venues=tor_top_venues)
tor_freq_venues.head()

Unnamed: 0,Venue,Count,Frequency
0,Coffee Shop,22,0.115789
1,Café,20,0.105263
2,Italian Restaurant,11,0.057895
3,Restaurant,10,0.052632
4,Park,9,0.047368


In [43]:
# Number of distinct venue types among Toronto's top-venue entries.
print('TOP UNIQUE VENUES IN TORONTO:', tor_freq_venues.shape[0])

TOP UNIQUE VENUES IN TORONTO: 81


In [44]:
# All top-venue entries of Manhattan stacked into one column.
man_top_venues = get_venues(merged_df=man_merged_df)
print('TOP VENUES IN MANHATTAN:', man_top_venues.shape[0])

TOP VENUES IN MANHATTAN: 200


In [45]:
# Frequency of each venue type among Manhattan's top-venue entries.
man_freq_venues = get_freq_venues(top_venues=man_top_venues)
man_freq_venues.head()

Unnamed: 0,Venue,Count,Frequency
0,Coffee Shop,18,0.09
1,Italian Restaurant,17,0.085
2,Pizza Place,11,0.055
3,Café,10,0.05
4,Bar,9,0.045


In [46]:
# Number of distinct venue types among Manhattan's top-venue entries.
print('TOP UNIQUE VENUES IN MANHATTAN:', man_freq_venues.shape[0])

TOP UNIQUE VENUES IN MANHATTAN: 65


In [47]:
# Build the union of venue types that appear in either city's frequency table.
# 'Average Frequency' starts out empty and is filled in by a later cell.
top_freq_venues = pd.DataFrame(columns=['Venue', 'Average Frequency'])
# Stack both 'Venue' columns; this must run before 'Venue' becomes the index
# of the two frequency tables (done in the next cell).
top_freq_venues['Venue'] = pd.concat([tor_freq_venues['Venue'], man_freq_venues['Venue']])
# Keep a single row for venue types present in both cities.
top_freq_venues.drop_duplicates(subset='Venue', inplace=True)
# Index by venue name so rows can be looked up by label.
top_freq_venues.set_index('Venue', inplace=True)

In [48]:
# Index both frequency tables by venue name for label-based lookups.
tor_freq_venues = tor_freq_venues.set_index('Venue')
man_freq_venues = man_freq_venues.set_index('Venue')

In [49]:
# Average the two cities' frequencies for every venue type; a type that is
# missing from one city contributes 0 for that city.
# This is computed column-wise: writing into the row objects yielded by
# iterrows() is not guaranteed to reach the underlying frame.
top_freq_venues['Average Frequency'] = (
    tor_freq_venues['Frequency'].reindex(top_freq_venues.index, fill_value=0)
    + man_freq_venues['Frequency'].reindex(top_freq_venues.index, fill_value=0)
) / 2
top_freq_venues = top_freq_venues.sort_values('Average Frequency', ascending=False)
top_freq_venues.head()

Unnamed: 0_level_0,Average Frequency
Venue,Unnamed: 1_level_1
Coffee Shop,0.102895
Café,0.0776316
Italian Restaurant,0.0714474
Park,0.0436842
Hotel,0.0332895


In [50]:
# Similarity is the sum, over venue types present in both cities, of the
# smaller of the two frequencies, expressed as a percentage.
similarity = sum(
    min(tor_freq, man_freq_venues.loc[venue, 'Frequency'])
    for venue, tor_freq in tor_freq_venues['Frequency'].items()
    if venue in man_freq_venues.index
)
similarity = round(similarity*100, 4)
print('SIMILARITY BETWEEN TORONTO & MANHATTAN: {}%'.format(similarity))

SIMILARITY BETWEEN TORONTO & MANHATTAN: 50.1842%
