# Seoul Restaurants Clustering

## Part 2 : Data preparation

In this part, we will clean the data we collected in the first part: we keep only venues located in Seoul's districts and drop venue categories that have too few venues.

In [18]:
# Import libraries
import pandas as pd # library for data analysis
import numpy as np # library to handle data in a vectorized manner
import folium # map rendering library
from folium.plugins import FastMarkerCluster, MarkerCluster
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer
import json

In [19]:
# Read the venue list saved at the end of the first part back into a data frame.
# NOTE: unpickling can execute arbitrary code, so only load a pickle file you produced yourself.
source_file = 'restaurants_list.pkl'
df_venues = pd.read_pickle(source_file)

In [20]:
# Preview the first five rows to check the columns that were loaded
df_venues.head(n=5)

Unnamed: 0,Id,Name,Category,Address,Latitude,Longitude,District,Neighbourhood Legal,Neighbourhood Admin
0,5b83cf2cc0cacb002c0bd030,인기명,Seafood Restaurant,,37.538245,126.947835,마포구,도화동,도화동
1,5dc62e31e42b4c000735f531,"Nal,See (날,See)",Sandwich Place,증산로21길 16,37.596088,126.913109,은평구,신사동,신사1동
2,5d7e19bed13342000856212d,핵도그,Hot Dog Joint,보문로 99 영광빌딩 1층,37.58389,127.019922,성북구,보문동5가,보문동
3,4ddca1eab0fba481fc8678e4,한우목장,Steakhouse,,37.606292,127.061433,성북구,석관동,석관동
4,4eaa7d8d49015844898445d0,풍년 닭도리탕,Korean Restaurant,중구 세종대로11길 30,37.56308,126.977783,중구,북창동,소공동


In [21]:
# List every distinct value found in the 'District' column
df_venues.loc[:, 'District'].unique()

array(['마포구', '은평구', '성북구', '중구', '영등포구', '서대문구', '중랑구', '종로구', '동대문구',
       '양천구', '서초구', '동작구', '용산구', '강동구', '성동구', '송파구', '노원구', '강서구',
       '강남구', '구로구', '강북구', '광진구', '금천구', '도봉구', '관악구', '과천시', '성남시 수정구',
       '부천시'], dtype=object)

In [22]:
# The last 3 values listed above are not districts of Seoul: they are towns
# around the capital. Drop their venues from the data set.
# Series.isin builds the boolean mask in a single vectorized call instead of
# testing every row against the list in a Python-level loop.
non_seoul_districts = ['과천시', '성남시 수정구', '부천시']
to_keep = ~df_venues['District'].isin(non_seoul_districts)  # True = row is not in an excluded town
df_venues = df_venues[to_keep]

In [23]:
# Sanity check: Seoul has 25 districts in total, so 25 distinct values should remain
df_venues['District'].nunique(dropna=True)

25

In [24]:
# Number of distinct categories (types of cuisine) present in the data frame
df_venues['Category'].nunique(dropna=True)

150

In [25]:
# Count venues per category and show the five most common types of cuisine
(
    df_venues
    .groupby('Category')[['Id']]
    .count()
    .sort_values(by='Id', ascending=False)
    .head()
)

Unnamed: 0_level_0,Id
Category,Unnamed: 1_level_1
Korean Restaurant,3362
Café,1903
Coffee Shop,1424
BBQ Joint,953
Bakery,674


In [26]:
# Count venues per category and show the five least common types of cuisine
(
    df_venues
    .groupby('Category')[['Id']]
    .count()
    .sort_values(by='Id')
    .head()
)

Unnamed: 0_level_0,Id
Category,Unnamed: 1_level_1
African Restaurant,1
Design Studio,1
Eastern European Restaurant,1
English Restaurant,1
Event Space,1


In [13]:
# It looks like the Foursquare API returned some garbage.
# Let's clean it up by removing venues that belong to categories for which we
# have only a few venues (fewer than MIN_VENUES_PER_CATEGORY).
MIN_VENUES_PER_CATEGORY = 3

venue_cat_count = df_venues['Category'].value_counts()

# Work out the rare categories once, up front, rather than rebuilding the
# lookup set for every row of the data frame.
rare_categories = venue_cat_count[venue_cat_count < MIN_VENUES_PER_CATEGORY].index

# Boolean keep-mask: True when the venue's category is NOT one of the rare ones.
# (The name `threshold` is kept from the original cell.)
threshold = ~df_venues['Category'].isin(rare_categories)
df_venues = df_venues[threshold]

In [14]:
# Number of distinct types of cuisine left after removing the rare categories
df_venues['Category'].nunique(dropna=True)

99

In [500]:
# The five least common types of cuisine that remain after the clean-up
(
    df_venues
    .groupby('Category')[['Id']]
    .count()
    .sort_values(by='Id')
    .head()
)

Unnamed: 0_level_0,Id
Category,Unnamed: 1_level_1
Indonesian Restaurant,3
Halal Restaurant,3
Mongolian Restaurant,3
Scandinavian Restaurant,3
Theme Restaurant,3


In [502]:
# Persist the prepared data frame to a pickle file
prepared_file = 'restaurants_list_prepared.pkl'
df_venues.to_pickle(prepared_file)