### Imports 

In [1]:
import requests
import pandas as pd
from geopy.geocoders import Nominatim
import folium
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

### Getting latitude, longitude

In [2]:
# Landmarks to geocode, as free-text queries for the geocoder.
addresses = ["One World Trade Center", "Empire State Building", 
            "central park zoo new york", "Museum of the City of New York", 
            "Yankee Stadium"]


def get_lat_long(address):
    """Geocode a free-text address with Nominatim and return its coordinates.

    The address and the resolved coordinates are also printed.

    Parameters
    ----------
    address : str
        Free-text query passed to the geocoder.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` taken from the geocoder result.

    Raises
    ------
    ValueError
        If the geocoder returns no match for ``address``.
    """
    geolocator = Nominatim(user_agent="foursquare_agent")
    location = geolocator.geocode(address)
    # geocode() yields None when nothing matches; fail with a clear message
    # instead of an AttributeError on `.latitude`.
    if location is None:
        raise ValueError(f"Could not geocode address: {address!r}")
    latitude, longitude = location.latitude, location.longitude
    print(address, latitude, longitude)
    return latitude, longitude

In [3]:
# Geocode every landmark in turn; get_lat_long prints each result.
for address in addresses:
    get_lat_long(address)

One World Trade Center 40.7130186 -74.01317859995396
Empire State Building 40.748428399999995 -73.98565461987332
central park zoo new york 40.7676005 -73.97184547517396
Museum of the City of New York 40.792516250000006 -73.95180912958764
Yankee Stadium 40.82958275 -73.92652118491901


### Five points of NY

In [4]:
# Each landmark is a [point name, latitude, longitude] list. The coordinates are
# kept as strings and match the values printed by the geocoding cell above.
one_world_trade_center = ["One World Trade Center","40.7130186", "-74.01317859995396"]
empire_state_building = ["Empire State Building","40.748428399999995", "-73.98565461987332"]
central_park_zoo = ["central park zoo new york","40.7676005", "-73.97184547517396"]
museum_city_of_new_york = ["Museum of the City of New York","40.792516250000006", "-73.95180912958764"]
yankee_stadium = ["Yankee Stadium","40.82958275", "-73.92652118491901"]

In [5]:


# One row per landmark: its name plus the latitude/longitude strings.
new_york_points_with_lat_long = pd.DataFrame(
    data=[
        one_world_trade_center,
        empire_state_building,
        central_park_zoo,
        museum_city_of_new_york,
        yankee_stadium,
    ],
    columns=["point name", "lat", "lng"],
)
new_york_points_with_lat_long

Unnamed: 0,point name,lat,lng
0,One World Trade Center,40.7130186,-74.01317859995396
1,Empire State Building,40.7484284,-73.98565461987332
2,central park zoo new york,40.7676005,-73.97184547517396
3,Museum of the City of New York,40.792516250000006,-73.95180912958764
4,Yankee Stadium,40.82958275,-73.92652118491901


### API setup

In [6]:
def saving_data(latitude, longitude, area_name, radius=30000, limit=100):
    """Fetch venues around a coordinate from Foursquare and cache them as a pickle.

    Calls the Foursquare v2 ``venues/explore`` endpoint, flattens the returned
    items, keeps the venue name, its first category, every
    ``venue.location.*`` field and the venue id, strips the dotted prefixes
    from the column names, and writes the result to ``{area_name}.pkl``.

    Credentials are read from the ``FOURSQUARE_CLIENT_ID`` and
    ``FOURSQUARE_CLIENT_SECRET`` environment variables so that no secret is
    stored in the notebook. Keys that were previously committed in this cell
    should be treated as leaked and rotated.

    Parameters
    ----------
    latitude, longitude : str or float
        Coordinates sent as the ``ll`` query parameter.
    area_name : str
        Base name of the pickle file written next to the notebook.
    radius : int, default 30000
        Value sent as the ``radius`` query parameter.
    limit : int, default 100
        Value sent as the ``limit`` query parameter.

    Returns
    -------
    pandas.DataFrame
        The filtered venue table that was pickled.

    Raises
    ------
    RuntimeError
        If either credential environment variable is missing or empty.
    requests.HTTPError
        If the API answers with an error status.
    """
    import os  # local import: the notebook's import cell does not load os

    client_id = os.environ.get("FOURSQUARE_CLIENT_ID")
    client_secret = os.environ.get("FOURSQUARE_CLIENT_SECRET")
    if not client_id or not client_secret:
        raise RuntimeError(
            "Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET in the "
            "environment before calling saving_data()."
        )

    # Let requests build and encode the query string instead of an f-string URL.
    response = requests.get(
        "https://api.foursquare.com/v2/venues/explore",
        params={
            "client_id": client_id,
            "client_secret": client_secret,
            "ll": f"{latitude},{longitude}",
            "v": "20180604",
            "radius": radius,
            "limit": limit,
        },
        timeout=30,  # do not let a stalled connection hang the notebook
    )
    response.raise_for_status()
    results = response.json()

    dataframe = pd.json_normalize(results["response"]["groups"][0]["items"])
    filtered_columns = (
        ['venue.name', 'venue.categories']
        + [col for col in dataframe.columns if col.startswith('venue.location.')]
        + ['venue.id']
    )
    # Explicit copy so the column assignment below never targets a view of `dataframe`.
    dataframe_filtered = dataframe.loc[:, filtered_columns].copy()
    # Replace the list of category dicts with the first category's name, row by row.
    dataframe_filtered['venue.categories'] = dataframe_filtered.apply(get_category_type, axis=1)
    # Keep only the last dotted component, e.g. "venue.location.lat" -> "lat".
    dataframe_filtered.columns = [col.split('.')[-1] for col in dataframe_filtered.columns]

    # Cache to disk so the API does not have to be called over and over again.
    dataframe_filtered.to_pickle(f"{area_name}.pkl")
    return dataframe_filtered

### Get the Categories

In [7]:
def get_category_type(row):
    """Return the name of the first category attached to a venue row.

    Parameters
    ----------
    row : mapping-like
        Indexed by key; the category list is read from ``'categories'`` and,
        if that key is absent, from ``'venue.categories'``.

    Returns
    -------
    The ``'name'`` entry of the first category, or ``None`` when the
    category list is empty.

    Raises
    ------
    KeyError
        If neither key is present in ``row``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem and must not be silently swallowed.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

### Collect the data from the API and merge 

In [8]:
# Each landmark list is [name, lat, lng]; the [1:3] slice supplies the two
# coordinates, and the last argument names the pickle written by saving_data.
df_one_world_trade_center = saving_data(*one_world_trade_center[1:3], "one_world_trade_center")
df_empire_state_building = saving_data(*empire_state_building[1:3], "empire_state_building")
df_central_park_zoo = saving_data(*central_park_zoo[1:3], "central_park_zoo")
df_museum_city_of_new_york = saving_data(*museum_city_of_new_york[1:3], "museum_city_of_new_york")
df_yankee_stadium = saving_data(*yankee_stadium[1:3], "yankee_stadium")

### Add a column with the name of the NY point

In [9]:
# Tag every venue with the landmark it was fetched for. Assigning a scalar
# broadcasts it to all rows, so no per-row list has to be built.
df_one_world_trade_center["point name"] = "One World Trade Center"
df_empire_state_building["point name"] = "Empire State Building"
df_central_park_zoo["point name"] = "central park zoo new york"
df_museum_city_of_new_york["point name"] = "Museum of the City of New York"
df_yankee_stadium["point name"] = "Yankee Stadium"

In [10]:
# The five per-landmark venue tables, in the order they are stacked.
all_df_data = [df_one_world_trade_center, 
               df_empire_state_building, 
               df_central_park_zoo, 
               df_museum_city_of_new_york, 
               df_yankee_stadium]

# Reuse the list above instead of repeating it. ignore_index gives the combined
# table one unique running index rather than each frame's own labels repeated.
full_df = pd.concat(all_df_data, ignore_index=True)

In [11]:
# Persist the combined venue table next to the notebook.
full_df.to_csv("full_nyc_data.csv")

In [12]:
# Preview the first rows of the combined venue table.
full_df.head()

Unnamed: 0,name,categories,address,crossStreet,lat,lng,labeledLatLngs,distance,cc,city,state,country,formattedAddress,postalCode,neighborhood,id,point name
0,9/11 Memorial North Pool,Memorial Site,West St,at Fulton St,40.712077,-74.013187,"[{'label': 'display', 'lat': 40.71207726282092...",104,US,New York,NY,United States,"[West St (at Fulton St), New York, NY, United ...",,,58801864cc5b6a14dedce689,One World Trade Center
1,Battery Park City Esplanade,Park,Battery Park City,From Chambers St to Battery Park,40.711622,-74.017907,"[{'label': 'display', 'lat': 40.71162184551406...",428,US,New York,NY,United States,[Battery Park City (From Chambers St to Batter...,10280.0,,4c1164576e5dc9b69506b02d,One World Trade Center
2,Nelson A. Rockefeller Park,Park,North end of Battery Park City & West of River...,at Warren St,40.717501,-74.01648,"[{'label': 'display', 'lat': 40.71750136874259...",571,US,New York,NY,United States,[North end of Battery Park City & West of Rive...,10280.0,,4b929790f964a5209a0734e3,One World Trade Center
3,Los Tacos No. 1,Taco Place,136 Church St,,40.714267,-74.008756,"[{'label': 'display', 'lat': 40.714267, 'lng':...",398,US,New York,NY,United States,"[136 Church St, New York, NY 10007, United Sta...",10007.0,,5d5f24ec09484500079aee00,One World Trade Center
4,Washington Market Park,Playground,Greenwich St,at Reade St,40.717046,-74.011095,"[{'label': 'display', 'lat': 40.71704598853704...",481,US,New York,NY,United States,"[Greenwich St (at Reade St), New York, NY, Uni...",,,4a9bcc2ff964a5203b3520e3,One World Trade Center


#### Unique Categories

In [13]:
# Number of distinct values in the "categories" column across all points.
len(full_df["categories"].unique())

85

### One hot encoding

In [14]:
# One indicator column per venue category (no prefix on the column names),
# with the landmark name appended as the last column.
one_hot_ny = (
    pd.get_dummies(full_df[["categories"]], prefix="", prefix_sep="")
    .assign(**{"point name": full_df["point name"]})
)

In [15]:
# Preview the first rows of the one-hot encoded table.
one_hot_ny.head()

Unnamed: 0,American Restaurant,Art Gallery,Art Museum,Athletics & Sports,Bakery,Bar,Beach,Beer Store,Bookstore,Botanical Garden,...,Theater,Track,Trail,Udon Restaurant,Volleyball Court,Waterfront,Wine Bar,Wine Shop,Yoga Studio,point name
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,One World Trade Center
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,One World Trade Center
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,One World Trade Center
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,One World Trade Center
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,One World Trade Center


### Frequency of occurrence of each category

In [16]:
# Mean of each indicator column per landmark, i.e. the share of that
# landmark's venues in each category; "point name" stays a regular column.
one_hot_ny_grouped = one_hot_ny.groupby("point name", as_index=False).mean()
one_hot_ny_grouped

Unnamed: 0,point name,American Restaurant,Art Gallery,Art Museum,Athletics & Sports,Bakery,Bar,Beach,Beer Store,Bookstore,...,Thai Restaurant,Theater,Track,Trail,Udon Restaurant,Volleyball Court,Waterfront,Wine Bar,Wine Shop,Yoga Studio
0,Empire State Building,0.01,0.03,0.01,0.0,0.05,0.0,0.0,0.0,0.04,...,0.04,0.04,0.01,0.01,0.01,0.0,0.0,0.01,0.03,0.0
1,Museum of the City of New York,0.01,0.02,0.02,0.0,0.07,0.0,0.0,0.01,0.04,...,0.01,0.05,0.0,0.01,0.0,0.0,0.01,0.0,0.02,0.01
2,One World Trade Center,0.01,0.03,0.0,0.01,0.03,0.01,0.01,0.0,0.04,...,0.04,0.01,0.01,0.01,0.01,0.01,0.0,0.01,0.02,0.01
3,Yankee Stadium,0.01,0.02,0.02,0.0,0.05,0.0,0.0,0.01,0.03,...,0.01,0.04,0.0,0.01,0.0,0.0,0.01,0.0,0.02,0.01
4,central park zoo new york,0.01,0.02,0.02,0.0,0.07,0.0,0.0,0.01,0.04,...,0.01,0.05,0.01,0.0,0.0,0.0,0.01,0.0,0.03,0.01


In [17]:
# (rows, columns) of the grouped table.
one_hot_ny_grouped.shape

(5, 86)

### Top 10 venues

In [18]:
# How many of the most frequent categories to print per landmark.
number = 10

for point_name in one_hot_ny_grouped["point name"]:
    print(f"---------{point_name}---------")
    point_row = one_hot_ny_grouped[one_hot_ny_grouped["point name"] == point_name]
    # Transpose the single row into a two-column (venue, freq) table.
    freq_table = point_row.T.reset_index()
    freq_table.columns = ['venue', 'freq']
    freq_table = (
        freq_table.iloc[1:]  # first row holds the point name, not a frequency
        # assign() returns a new frame, so nothing is written into the slice
        .assign(freq=lambda frame: frame['freq'].astype(float))
        .round({'freq': 2})
        .sort_values('freq', ascending=False)
        .reset_index(drop=True)
    )
    print(freq_table.head(number))
    print('\n')

---------Empire State Building---------
                  venue  freq
0                  Park  0.11
1                   Gym  0.05
2                Bakery  0.05
3             Bookstore  0.04
4               Theater  0.04
5       Thai Restaurant  0.04
6  Gym / Fitness Center  0.03
7             Wine Shop  0.03
8           Art Gallery  0.03
9          Gourmet Shop  0.03


---------Museum of the City of New York---------
                  venue  freq
0                  Park  0.13
1                Bakery  0.07
2                   Gym  0.06
3               Theater  0.05
4             Bookstore  0.04
5  Gym / Fitness Center  0.03
6           Pizza Place  0.03
7                 Plaza  0.03
8        Ice Cream Shop  0.03
9            Taco Place  0.02


---------One World Trade Center---------
             venue  freq
0             Park  0.15
1   Ice Cream Shop  0.05
2   Scenic Lookout  0.05
3        Bookstore  0.04
4  Thai Restaurant  0.04
5      Art Gallery  0.03
6           Bakery  0.03
7     

### Sort the Venues 

In [19]:
def return_top_ten_venues(row, num_top_venues):
    """Return the labels of the highest-valued entries of a row.

    The first element of ``row`` is skipped; the remaining values are ordered
    from largest to smallest and the index labels of the first
    ``num_top_venues`` of them are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [20]:
num_top_venues = 10

# Ordinal suffixes for ranks ending in 1, 2 and 3; every other rank uses "th".
indicators = ['st', 'nd', 'rd']

# Column headers: the point name followed by one column per rank.
columns = ['point name']
for rank in range(1, num_top_venues + 1):
    last_digit = rank % 10
    # 11, 12 and 13 take "th" even though they end in 1, 2 and 3.
    if 1 <= last_digit <= len(indicators) and rank % 100 not in (11, 12, 13):
        suffix = indicators[last_digit - 1]
    else:
        suffix = 'th'
    columns.append('{}{} Most Common Venue'.format(rank, suffix))

# Build every row first and create the frame once, instead of filling an
# empty frame cell by cell.
point_name_venues_sorted = pd.DataFrame(
    [
        [row['point name'], *return_top_ten_venues(row, num_top_venues)]
        for _, row in one_hot_ny_grouped.iterrows()
    ],
    columns=columns,
    index=one_hot_ny_grouped.index,
)

point_name_venues_sorted.head()

Unnamed: 0,point name,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Empire State Building,Park,Gym,Bakery,Bookstore,Theater,Thai Restaurant,Gym / Fitness Center,Wine Shop,Art Gallery,Gourmet Shop
1,Museum of the City of New York,Park,Bakery,Gym,Theater,Bookstore,Gym / Fitness Center,Pizza Place,Plaza,Ice Cream Shop,Taco Place
2,One World Trade Center,Park,Ice Cream Shop,Scenic Lookout,Bookstore,Thai Restaurant,Art Gallery,Bakery,Pier,Music Venue,Deli / Bodega
3,Yankee Stadium,Park,Gym,Bakery,Theater,Pizza Place,Bookstore,Plaza,Ice Cream Shop,Scenic Lookout,Gym / Fitness Center
4,central park zoo new york,Park,Bakery,Gym,Theater,Bookstore,Gym / Fitness Center,Pizza Place,Plaza,Ice Cream Shop,Wine Shop


### Cluster Points 

In [21]:
# NOTE(review): the number of clusters equals the number of landmarks, and the
# labels shown below are all distinct, so each landmark ends up in its own cluster.
kclusters = 5
# KMeans needs numeric features only, so the name column is dropped.
one_hot_ny_grouped_cluster = one_hot_ny_grouped.drop("point name", axis=1)

# n_init is pinned explicitly so the result does not depend on which default
# the installed scikit-learn version uses.
kmean = KMeans(n_clusters=kclusters, n_init=10, random_state=0).fit(one_hot_ny_grouped_cluster)
kmean.labels_

array([2, 4, 1, 3, 0], dtype=int32)

### Merge dataframe and clusters 

In [22]:
# insert() raises if the column already exists, which made this cell fail on
# a second run; drop any column left over from a previous run first.
if "Cluster Labels" in point_name_venues_sorted.columns:
    point_name_venues_sorted = point_name_venues_sorted.drop(columns="Cluster Labels")
point_name_venues_sorted.insert(0, "Cluster Labels", kmean.labels_)

# Attach the cluster label and ranked venue columns to the landmark
# coordinates, matching rows on "point name".
new_york_merged = new_york_points_with_lat_long.join(
    point_name_venues_sorted.set_index("point name"), on="point name"
)



In [27]:
# Landmarks with their coordinates, cluster label and ranked venue categories.
new_york_merged

Unnamed: 0,point name,lat,lng,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,One World Trade Center,40.7130186,-74.01317859995396,1,Park,Ice Cream Shop,Scenic Lookout,Bookstore,Thai Restaurant,Art Gallery,Bakery,Pier,Music Venue,Deli / Bodega
1,Empire State Building,40.7484284,-73.98565461987332,2,Park,Gym,Bakery,Bookstore,Theater,Thai Restaurant,Gym / Fitness Center,Wine Shop,Art Gallery,Gourmet Shop
2,central park zoo new york,40.7676005,-73.97184547517396,0,Park,Bakery,Gym,Theater,Bookstore,Gym / Fitness Center,Pizza Place,Plaza,Ice Cream Shop,Wine Shop
3,Museum of the City of New York,40.792516250000006,-73.95180912958764,4,Park,Bakery,Gym,Theater,Bookstore,Gym / Fitness Center,Pizza Place,Plaza,Ice Cream Shop,Taco Place
4,Yankee Stadium,40.82958275,-73.92652118491901,3,Park,Gym,Bakery,Theater,Pizza Place,Bookstore,Plaza,Ice Cream Shop,Scenic Lookout,Gym / Fitness Center


### Map

In [24]:
# Map centre, stored as numbers rather than strings.
new_york_latitude = 40.77359725490544
new_york_longitude = -73.96332140842885
# Backward-compatible alias for the original, misspelled name.
new_work_longitude = new_york_longitude

In [25]:
map_clusters = folium.Map(location=[new_york_latitude, new_work_longitude], zoom_start=11)

# One colour per cluster, sampled evenly from the rainbow colormap.
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, kclusters))]

# Draw one marker per landmark, coloured by its cluster label.
for lat, lon, poi, cluster in zip(new_york_merged['lat'], new_york_merged['lng'], new_york_merged['point name'], new_york_merged['Cluster Labels']):
    # Labels are 0-based, so they index the palette directly; the previous
    # `cluster-1` sent label 0 to the last colour via negative indexing.
    cluster_color = rainbow[int(cluster)]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [26]:
# Cluster label of each landmark, in the row order of new_york_merged.
new_york_merged['Cluster Labels']

0    1
1    2
2    0
3    4
4    3
Name: Cluster Labels, dtype: int32

#### Cluster 1 (label 0)

In [33]:
# Rows with cluster label 0: the first column plus everything from the fourth
# column onward (the second and third columns, lat/lng, are left out).
new_york_merged[new_york_merged['Cluster Labels'] == 0].iloc[:, [0, *range(3, new_york_merged.shape[1])]]

Unnamed: 0,point name,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,central park zoo new york,0,Park,Bakery,Gym,Theater,Bookstore,Gym / Fitness Center,Pizza Place,Plaza,Ice Cream Shop,Wine Shop


#### Cluster 2 (label 1)

In [35]:
# Rows with cluster label 1: the first column plus everything from the fourth
# column onward (the second and third columns, lat/lng, are left out).
new_york_merged[new_york_merged['Cluster Labels'] == 1].iloc[:, [0, *range(3, new_york_merged.shape[1])]]

Unnamed: 0,point name,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,One World Trade Center,1,Park,Ice Cream Shop,Scenic Lookout,Bookstore,Thai Restaurant,Art Gallery,Bakery,Pier,Music Venue,Deli / Bodega


#### Cluster 3 (label 2)

In [37]:
# Rows with cluster label 2: the first column plus everything from the fourth
# column onward (the second and third columns, lat/lng, are left out).
new_york_merged[new_york_merged['Cluster Labels'] == 2].iloc[:, [0, *range(3, new_york_merged.shape[1])]]

Unnamed: 0,point name,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Empire State Building,2,Park,Gym,Bakery,Bookstore,Theater,Thai Restaurant,Gym / Fitness Center,Wine Shop,Art Gallery,Gourmet Shop


#### Cluster 4 (label 3)

In [38]:
# Rows with cluster label 3: the first column plus everything from the fourth
# column onward (the second and third columns, lat/lng, are left out).
new_york_merged[new_york_merged['Cluster Labels'] == 3].iloc[:, [0, *range(3, new_york_merged.shape[1])]]

Unnamed: 0,point name,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,Yankee Stadium,3,Park,Gym,Bakery,Theater,Pizza Place,Bookstore,Plaza,Ice Cream Shop,Scenic Lookout,Gym / Fitness Center


#### Cluster 5 (label 4)

In [39]:
# Rows with cluster label 4: the first column plus everything from the fourth
# column onward (the second and third columns, lat/lng, are left out).
new_york_merged[new_york_merged['Cluster Labels'] == 4].iloc[:, [0, *range(3, new_york_merged.shape[1])]]

Unnamed: 0,point name,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
3,Museum of the City of New York,4,Park,Bakery,Gym,Theater,Bookstore,Gym / Fitness Center,Pizza Place,Plaza,Ice Cream Shop,Taco Place
