# Seoul Restaurants Clustering

## Part 1 : Collecting data on restaurants

In [1]:
# Import libraries
import requests # library to handle requests
import pandas as pd # library for data analysis
import urllib.request # for fetching URLs
from pandas.io.json import json_normalize # to normalize semi-structured JSON data into a flat table.

Let's call the Foursquare API to retrieve as many restaurants as we can.

In [None]:
# Foursquare credentials
CLIENT_ID = '***'
CLIENT_SECRET = '***'

# Foursquare API parameters
limit = 50 # max search results (max value is 50)
radius = 500 # radius in meters to search around the coordinates
search_criteria = ['browse','checkin'] # Two types of searches provided by Foursquare
category = '4d4b7105d754a06374d81259' # Foursquare category for restaurants
version = '20191212' # Foursquare API version

# Query parameters shared by every Foursquare call below.
# They are handed to requests through `params=` so the query string is built and URL-encoded for us.
common_params = {
    'client_id': CLIENT_ID,
    'client_secret': CLIENT_SECRET,
    'v': version,
    'categoryId': category,
    'radius': radius,
    'limit': limit,
}

# Create a set in which we will store the venues retrieved from Foursquare API
venues_set = set()


def _venue_record(venue):
    """Flatten one Foursquare venue dict into an (Id, Name, Category, Address, Latitude, Longitude) tuple.

    Address is None when the venue has no 'address' field; Category is None when
    the venue carries an empty (or missing) 'categories' list.
    """
    location = venue['location']
    categories = venue.get('categories') or []
    return (
        venue['id'], # ID of the venue
        venue['name'], # Name of the venue
        categories[0]['name'] if categories else None, # Category (type of cuisine)
        location.get('address'), # Some venues don't have any address
        location['lat'], # Latitude
        location['lng'], # Longitude
    )


# Since the API limits the results to 50 restaurants, we will execute one search for every neighbourhood instead of one for the whole city
# Loop over the list of neighbourhoods
# NOTE(review): df_dong is not created in the cells shown here; it must already exist with 'Latitude' and 'Longitude' columns
for lat, lng in zip(df_dong['Latitude'], df_dong['Longitude']):

    ll = '{},{}'.format(lat, lng) # "latitude,longitude" as expected by the `ll` parameter

    # Execute two different kinds of search for each neighbourhood so we get as many results as possible
    for intent in search_criteria:

        url = 'https://api.foursquare.com/v2/venues/search'
        results = requests.get(url, params=dict(common_params, ll=ll, intent=intent)).json()

        # Browse through the list of venues and extract the information we need
        for resource in results['response']['venues']:
            # Since sets do not contain duplicates, a tuple identical to one added by a previous API call is skipped
            venues_set.add(_venue_record(resource))

    # Find even more venues via a call to another API function which returns a list of recommended venues near the given location
    # (same result cap as the search calls: the lowercase `limit` defined above)
    url = 'https://api.foursquare.com/v2/venues/explore'
    results = requests.get(url, params=dict(common_params, ll=ll)).json()

    # Browse through the list of venues and extract the data we need
    # (here each item wraps the venue under the 'venue' key)
    for resource in results['response']['groups'][0]['items']:
        venues_set.add(_venue_record(resource['venue']))

# Turn our set into a data frame (a set is unordered, so the row order is arbitrary)
df_venues = pd.DataFrame(list(venues_set), columns=['Id','Name','Category','Address','Latitude','Longitude'])

In [132]:
# Let's look at the first 10 restaurants
# (the bare last expression is rendered as the cell's output)
df_venues.head(10)

Unnamed: 0,Id,Name,Category,Address,Latitude,Longitude
0,5b83cf2cc0cacb002c0bd030,인기명,Seafood Restaurant,,37.538245,126.947835
1,5dc62e31e42b4c000735f531,"Nal,See (날,See)",Sandwich Place,증산로21길 16,37.596088,126.913109
2,5d7e19bed13342000856212d,핵도그,Hot Dog Joint,보문로 99 영광빌딩 1층,37.58389,127.019922
3,4ddca1eab0fba481fc8678e4,한우목장,Steakhouse,,37.606292,127.061433
4,4eaa7d8d49015844898445d0,풍년 닭도리탕,Korean Restaurant,중구 세종대로11길 30,37.56308,126.977783
5,4cb5255664998cfad9250ca2,레몬테라스,Pizza Place,,37.536486,126.894966
6,4c45a7788c1f20a1d30c3e99,김밥천국,Korean Restaurant,,37.52276,126.909374
7,4c68fbf3c946e21e1a06e98e,봉추(鳳雛)찜닭,Korean Restaurant,마포구 어울마당로 109,37.554639,126.92279
8,4e4e3e458877402b06b1e09d,쪼끼 군다리 치킨,Fried Chicken Joint,,37.573911,126.952432
9,4c7a3e3c2d3ba1437ac390d0,황프로 소머리국밥,Asian Restaurant,한국 서울특별시 중랑구 신내1동 492-5,37.600964,127.100696


In [133]:
# How many restaurants did we find?
# `.shape` is (number of rows, number of columns); each row is one distinct tuple from venues_set
df_venues.shape

(15576, 6)

The address information returned by Foursquare isn't consistent enough to be used for clustering and plotting our data on a map. Many fields are empty, and the addresses are not standardized, which makes them difficult to process.
However, we got the coordinates of every restaurant. We will use another API, provided by Kakao, which lets us retrieve a standardized address from coordinates.

In [135]:
# @hidden_cell
# Kakao credentials
# This value is sent verbatim as the HTTP `Authorization` header by search_address.
# NOTE(review): Kakao's REST API expects the form "KakaoAK <REST_API_KEY>" — confirm the stored value includes that prefix.
KakaoAK = '***'

# Define a function that will modify every row in our data frame to add the standardized address of the venue
def search_address(row):
    """Look up the venue's coordinates with Kakao and store the region names on ``row``.

    Parameters
    ----------
    row : mapping-like (e.g. one pandas.Series row of the venues data frame)
        Must provide 'Longitude' and 'Latitude'. It is modified in place.

    Returns
    -------
    The same ``row`` with three added entries:
    'District' (gu), 'Neighbourhood Legal' and 'Neighbourhood Admin'.
    An entry is None when Kakao returns no matching region document.

    Raises
    ------
    KeyError
        If the response has no 'documents' key (e.g. an error payload), so that
        authentication or quota problems are not silently turned into empty values.
    """

    # URL of the Kakao API to find address based on coordinates
    url = "https://dapi.kakao.com/v2/local/geo/coord2regioncode.json"
    coordinates = {'x': row['Longitude'], 'y': row['Latitude']}

    results = requests.get(url, params=coordinates, headers={"Authorization": KakaoAK}).json()

    # Index the returned documents by region type: 'B' = legal, 'H' = administrative.
    # When a document has no 'region_type', fall back to the position-based meaning
    # (first = legal, second = administrative).
    positional_types = ('B', 'H')
    regions = {}
    for position, document in enumerate(results['documents']):
        region_type = document.get('region_type')
        if region_type is None and position < len(positional_types):
            region_type = positional_types[position]
        regions.setdefault(region_type, document) # keep the first document of each type

    legal = regions.get('B')
    admin = regions.get('H')
    district_source = legal if legal is not None else admin

    # Browse through the results
    row['District'] = district_source['region_2depth_name'] if district_source is not None else None # Name of the district (gu) where the venue is located
    row['Neighbourhood Legal'] = legal['region_3depth_name'] if legal is not None else None # Name of the neighbourhood where the venue is located (legal name)
    row['Neighbourhood Admin'] = admin['region_3depth_name'] if admin is not None else None # Name of the neighbourhood where the venue is located (administrative name)

    return row

# Apply the function to the whole data frame along the column axis
# (axis=1 hands each row to search_address, which issues one Kakao request per row;
# the result is bound back to the same name, df_venues)
df_venues = df_venues.apply(search_address, axis=1)

In [136]:
# Now we know in which district and in which neighbourhood every restaurant is located
# (the bare frame is the cell's output; the rendered table shows it truncated to the first and last rows)
df_venues

Unnamed: 0,Id,Name,Category,Address,Latitude,Longitude,District,Neighbourhood Legal,Neighbourhood Admin
0,5b83cf2cc0cacb002c0bd030,인기명,Seafood Restaurant,,37.538245,126.947835,마포구,도화동,도화동
1,5dc62e31e42b4c000735f531,"Nal,See (날,See)",Sandwich Place,증산로21길 16,37.596088,126.913109,은평구,신사동,신사1동
2,5d7e19bed13342000856212d,핵도그,Hot Dog Joint,보문로 99 영광빌딩 1층,37.583890,127.019922,성북구,보문동5가,보문동
3,4ddca1eab0fba481fc8678e4,한우목장,Steakhouse,,37.606292,127.061433,성북구,석관동,석관동
4,4eaa7d8d49015844898445d0,풍년 닭도리탕,Korean Restaurant,중구 세종대로11길 30,37.563080,126.977783,중구,북창동,소공동
...,...,...,...,...,...,...,...,...,...
15571,5059bc2fe4b0616dc6acf84d,the JK kitchenbox,Italian Restaurant,금천구 가산동 371-50 에이스하이엔드 타워 3차 111호,37.529682,126.892634,영등포구,양평동3가,양평1동
15572,5ac2d7703fcee8366a8935fa,김가네 (상도점),Bunsik Restaurant,"상도로30길 40, 120호",37.504112,126.943580,동작구,상도동,상도2동
15573,4c10ca80ce57c928fe5682d2,오가와,Sushi Restaurant,종로구 새문안로5길 19,37.571841,126.974593,종로구,당주동,사직동
15574,4d89cfb876e1236ac0d00b43,부뚜막청국장,Korean Restaurant,,37.468699,126.902318,금천구,독산동,독산4동


In [137]:
# Save the data frame to a file so we don't have to call the API again
# (the path is relative to the current working directory; reload it with pd.read_pickle('restaurants_list.pkl'))
df_venues.to_pickle('restaurants_list.pkl')