# Import libraries

In [1]:
# import the necessary Libraries 
import folium
import requests
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from bs4 import BeautifulSoup
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim
from folium.plugins import HeatMap

# Get ward's information in Tokyo

In [2]:
# get html table code
response_obj = requests.get('https://en.wikipedia.org/wiki/Special_wards_of_Tokyo').text
soup = BeautifulSoup(response_obj,'lxml')
tko_wards_table = soup.find('table', {'class':'wikitable sortable'})

In [3]:
# variables to store ward's information
stts = []
ward_names = []
ward_kanjis = []
ward_populations = []
ward_densities = []
ward_areas = []
ward_major_districts = []

# extract ward's information from html table code
for row in tko_wards_table('tr'):
    tds = row('td')
    if len(tds) == 8:
        stts.append(tds[0].text)
        for span in tds[2]('span'):
            span.decompose()
        ward_names.append(tds[2].text)
        ward_kanjis.append(tds[3].text[:- 1])
#         ward_populations.append(tds[4])
        for span in tds[4]('span'):
            span.decompose()
        ward_populations.append(tds[4].text)
        for span in tds[5]('span'):
            span.decompose()
        ward_densities.append(tds[5].text)
        for span in tds[6]('span'):
            span.decompose()
        ward_areas.append(tds[6].text[:-1])
        ward_major_districts.append(tds[7].text[:-1])

In [4]:
# create dataframe from extracted ward's information
df_tko = pd.DataFrame(ward_names, columns=['Ward Name'])
df_tko['Kanji'] = ward_kanjis
df_tko['Population'] = ward_populations
df_tko['Density (/Km2)'] = ward_densities
df_tko['Ward Area (Km2)'] = ward_areas
df_tko['Major District'] = ward_major_districts

df_tko.head()

Unnamed: 0,Ward Name,Kanji,Population,Density (/Km2),Ward Area (Km2),Major District
0,Chiyoda,千代田区,59441,5100,11.66,"Nagatachō, Kasumigaseki, Ōtemachi, Marunouchi,..."
1,Chūō,中央区,147620,14460,10.21,"Nihonbashi, Kayabachō, Ginza, Tsukiji, Hatchōb..."
2,Minato,港区,248071,12180,20.37,"Odaiba, Shinbashi, Hamamatsuchō, Mita, Roppong..."
3,Shinjuku,新宿区,339211,18620,18.22,"Shinjuku, Takadanobaba, Ōkubo, Kagurazaka, Ich..."
4,Bunkyō,文京区,223389,19790,11.29,"Hongō, Yayoi, Hakusan"


In [5]:
# rename some ward's name to word with map correctly
df_tko['Ward Name'] = df_tko['Ward Name'].map(lambda x: x.replace('Chūō', 'Chuo').replace('Bunkyō', 'Bunkyo').replace('Taitō', 'Taito').replace('Kōtō', 'Koto').replace('Ōta', 'Ota'))
df_tko['Population'] = df_tko['Population'].map(lambda x: x.replace(',', ''))
df_tko['Density (/Km2)'] = df_tko['Density (/Km2)'].map(lambda x: x.replace(',', ''))
df_tko['Ward Area (Km2)'] = df_tko['Ward Area (Km2)'].map(lambda x: x.replace('.', ''))
df_tko['Major District'] = df_tko['Major District'].map(lambda x: x.replace('Nagatachō', 'Nagatacho').replace('Hongō', 'Hongo').replace('Kinshichō', 'Kinshicho').replace('Ōmori', 'Omori').replace('Kōenji', 'Koenji'))
df_tko

Unnamed: 0,Ward Name,Kanji,Population,Density (/Km2),Ward Area (Km2),Major District
0,Chiyoda,千代田区,59441,5100,1166,"Nagatacho, Kasumigaseki, Ōtemachi, Marunouchi,..."
1,Chuo,中央区,147620,14460,1021,"Nihonbashi, Kayabachō, Ginza, Tsukiji, Hatchōb..."
2,Minato,港区,248071,12180,2037,"Odaiba, Shinbashi, Hamamatsuchō, Mita, Roppong..."
3,Shinjuku,新宿区,339211,18620,1822,"Shinjuku, Takadanobaba, Ōkubo, Kagurazaka, Ich..."
4,Bunkyo,文京区,223389,19790,1129,"Hongo, Yayoi, Hakusan"
5,Taito,台東区,200486,19830,1011,"Ueno, Asakusa"
6,Sumida,墨田区,260358,18910,1377,"Kinshicho, Morishita, Ryōgoku"
7,Koto,江東区,502579,12510,4016,"Kiba, Ariake, Kameido, Tōyōchō, Monzennakachō,..."
8,Shinagawa,品川区,392492,17180,2284,"Shinagawa, Gotanda, Ōsaki, Hatanodai, Ōimachi,..."
9,Meguro,目黒区,280283,19110,1467,"Meguro, Nakameguro, Jiyugaoka, Komaba, Aobadai"


In [6]:
# get latitude & longitude of major districts
geolocator = Nominatim(user_agent='Tokyo_Explorer')
df_tko['Ward Coord'] = df_tko['Ward Name'].apply(geolocator.geocode).apply(lambda x: (x.latitude, x.longitude))
# separate latitude & longtitude into 2 different columns
df_tko[['Ward Latitude', 'Ward Longtitude']] = df_tko['Ward Coord'].apply(pd.Series)
df_tko.drop(['Ward Coord'], axis=1, inplace=True)
df_tko

Unnamed: 0,Ward Name,Kanji,Population,Density (/Km2),Ward Area (Km2),Major District,Ward Latitude,Ward Longtitude
0,Chiyoda,千代田区,59441,5100,1166,"Nagatacho, Kasumigaseki, Ōtemachi, Marunouchi,...",35.69381,139.753216
1,Chuo,中央区,147620,14460,1021,"Nihonbashi, Kayabachō, Ginza, Tsukiji, Hatchōb...",35.666255,139.775565
2,Minato,港区,248071,12180,2037,"Odaiba, Shinbashi, Hamamatsuchō, Mita, Roppong...",35.643227,139.740055
3,Shinjuku,新宿区,339211,18620,1822,"Shinjuku, Takadanobaba, Ōkubo, Kagurazaka, Ich...",35.693763,139.703632
4,Bunkyo,文京区,223389,19790,1129,"Hongo, Yayoi, Hakusan",35.71881,139.744732
5,Taito,台東区,200486,19830,1011,"Ueno, Asakusa",35.71745,139.790859
6,Sumida,墨田区,260358,18910,1377,"Kinshicho, Morishita, Ryōgoku",35.700429,139.805017
7,Koto,江東区,502579,12510,4016,"Kiba, Ariake, Kameido, Tōyōchō, Monzennakachō,...",35.649154,139.81279
8,Shinagawa,品川区,392492,17180,2284,"Shinagawa, Gotanda, Ōsaki, Hatanodai, Ōimachi,...",35.599252,139.73891
9,Meguro,目黒区,280283,19110,1467,"Meguro, Nakameguro, Jiyugaoka, Komaba, Aobadai",35.62125,139.688014


In [7]:
# check data information
df_tko.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Ward Name        23 non-null     object 
 1   Kanji            23 non-null     object 
 2   Population       23 non-null     object 
 3   Density (/Km2)   23 non-null     object 
 4   Ward Area (Km2)  23 non-null     object 
 5   Major District   23 non-null     object 
 6   Ward Latitude    23 non-null     float64
 7   Ward Longtitude  23 non-null     float64
dtypes: float64(2), object(6)
memory usage: 1.6+ KB


In [8]:
# save ward's information to csv file
df_tko.to_csv('tokyo_wards_info.csv')

In [9]:
# check ward's information after save
df_tko = pd.read_csv('tokyo_wards_info.csv', index_col=[0])
df_tko.head()

Unnamed: 0,Ward Name,Kanji,Population,Density (/Km2),Ward Area (Km2),Major District,Ward Latitude,Ward Longtitude
0,Chiyoda,千代田区,59441,5100,1166,"Nagatacho, Kasumigaseki, Ōtemachi, Marunouchi,...",35.69381,139.753216
1,Chuo,中央区,147620,14460,1021,"Nihonbashi, Kayabachō, Ginza, Tsukiji, Hatchōb...",35.666255,139.775565
2,Minato,港区,248071,12180,2037,"Odaiba, Shinbashi, Hamamatsuchō, Mita, Roppong...",35.643227,139.740055
3,Shinjuku,新宿区,339211,18620,1822,"Shinjuku, Takadanobaba, Ōkubo, Kagurazaka, Ich...",35.693763,139.703632
4,Bunkyo,文京区,223389,19790,1129,"Hongo, Yayoi, Hakusan",35.71881,139.744732


# Get land's price in wards of Tokyo

In [10]:
# get html table code
response_obj = requests.get('https://utinokati.com/en/details/land-market-value/area/Tokyo/').text
soup = BeautifulSoup(response_obj, 'lxml')
tko_price_table = soup.tbody

In [11]:
# extract names and prices of wards
ward_names = []
price_avg = []

for row in tko_price_table('tr'):
    tds = row('td')
    ward_names.append(tds[0].text)
    price_avg.append(tds[2].text)

In [12]:
# create dataframe from extracted data
df_price = pd.DataFrame(ward_names, columns=['Ward Name'])
df_price['Ward Name'] = df_price['Ward Name'].map(lambda x: x[:-3])
df_price['Price Avg (JPY/Sq.M)'] = price_avg
df_price['Price Avg (JPY/Sq.M)'] = df_price['Price Avg (JPY/Sq.M)'].map(lambda x: x.rstrip('JPY/sq.m').rstrip().replace(',',''))
df_price.head()

Unnamed: 0,Ward Name,Price Avg (JPY/Sq.M)
0,Chiyoda,2737238
1,Chuo,1921309
2,Minato,2108296
3,Shinjuku,871944
4,Bunkyo,955379


In [13]:
# # create dataframe with price of 5 popular wards only
# ward_list = ['Chiyoda-Ku', 'Chuo-Ku', 'Shinjuku-Ku', 'Shibuya-Ku', 'Shinagawa-Ku']
# df_price_5w = df_price.loc[df_price['Ward Name'].isin(ward_list)]
# df_price_5w['Ward Name'] = df_price_5w['Ward Name'].map(lambda x: x[:-3])
# df_price_5w.reset_index(drop=True, inplace=True)
# df_price_5w.head()

In [14]:
# create dataframe with only 5 popular ward's information
# ward_list = df_price_5w['Ward Name'].to_list()
# df_tko_5w = df_tko.loc[df_tko['Ward Name'].isin(ward_list)]
# df_tko_5w.head()

In [15]:
# merge price information into ward's information
df_tko = df_tko.merge(df_price, on='Ward Name', how='left')
df_tko

Unnamed: 0,Ward Name,Kanji,Population,Density (/Km2),Ward Area (Km2),Major District,Ward Latitude,Ward Longtitude,Price Avg (JPY/Sq.M)
0,Chiyoda,千代田区,59441,5100,1166,"Nagatacho, Kasumigaseki, Ōtemachi, Marunouchi,...",35.69381,139.753216,2737238
1,Chuo,中央区,147620,14460,1021,"Nihonbashi, Kayabachō, Ginza, Tsukiji, Hatchōb...",35.666255,139.775565,1921309
2,Minato,港区,248071,12180,2037,"Odaiba, Shinbashi, Hamamatsuchō, Mita, Roppong...",35.643227,139.740055,2108296
3,Shinjuku,新宿区,339211,18620,1822,"Shinjuku, Takadanobaba, Ōkubo, Kagurazaka, Ich...",35.693763,139.703632,871944
4,Bunkyo,文京区,223389,19790,1129,"Hongo, Yayoi, Hakusan",35.71881,139.744732,955379
5,Taito,台東区,200486,19830,1011,"Ueno, Asakusa",35.71745,139.790859,978840
6,Sumida,墨田区,260358,18910,1377,"Kinshicho, Morishita, Ryōgoku",35.700429,139.805017,608004
7,Koto,江東区,502579,12510,4016,"Kiba, Ariake, Kameido, Tōyōchō, Monzennakachō,...",35.649154,139.81279,664259
8,Shinagawa,品川区,392492,17180,2284,"Shinagawa, Gotanda, Ōsaki, Hatanodai, Ōimachi,...",35.599252,139.73891,779498
9,Meguro,目黒区,280283,19110,1467,"Meguro, Nakameguro, Jiyugaoka, Komaba, Aobadai",35.62125,139.688014,871772


In [16]:
# save all information into csv
df_tko.to_csv('tokyo_info.csv')

In [17]:
# check saved data is correct or not
df_tko = pd.read_csv('tokyo_info.csv', index_col=[0])
df_tko.head()

Unnamed: 0,Ward Name,Kanji,Population,Density (/Km2),Ward Area (Km2),Major District,Ward Latitude,Ward Longtitude,Price Avg (JPY/Sq.M)
0,Chiyoda,千代田区,59441,5100,1166,"Nagatacho, Kasumigaseki, Ōtemachi, Marunouchi,...",35.69381,139.753216,2737238
1,Chuo,中央区,147620,14460,1021,"Nihonbashi, Kayabachō, Ginza, Tsukiji, Hatchōb...",35.666255,139.775565,1921309
2,Minato,港区,248071,12180,2037,"Odaiba, Shinbashi, Hamamatsuchō, Mita, Roppong...",35.643227,139.740055,2108296
3,Shinjuku,新宿区,339211,18620,1822,"Shinjuku, Takadanobaba, Ōkubo, Kagurazaka, Ich...",35.693763,139.703632,871944
4,Bunkyo,文京区,223389,19790,1129,"Hongo, Yayoi, Hakusan",35.71881,139.744732,955379


In [18]:
business_wards = ['Chiyoda', 'Chuo', 'Minato', 'Shinjuku', 'Shibuya', 'Shinagawa', 'Meguro']
df_tko_spe = df_tko.loc[df_tko['Ward Name'].isin(business_wards)].reset_index(drop=True)
df_tko_spe

Unnamed: 0,Ward Name,Kanji,Population,Density (/Km2),Ward Area (Km2),Major District,Ward Latitude,Ward Longtitude,Price Avg (JPY/Sq.M)
0,Chiyoda,千代田区,59441,5100,1166,"Nagatacho, Kasumigaseki, Ōtemachi, Marunouchi,...",35.69381,139.753216,2737238
1,Chuo,中央区,147620,14460,1021,"Nihonbashi, Kayabachō, Ginza, Tsukiji, Hatchōb...",35.666255,139.775565,1921309
2,Minato,港区,248071,12180,2037,"Odaiba, Shinbashi, Hamamatsuchō, Mita, Roppong...",35.643227,139.740055,2108296
3,Shinjuku,新宿区,339211,18620,1822,"Shinjuku, Takadanobaba, Ōkubo, Kagurazaka, Ich...",35.693763,139.703632,871944
4,Shinagawa,品川区,392492,17180,2284,"Shinagawa, Gotanda, Ōsaki, Hatanodai, Ōimachi,...",35.599252,139.73891,779498
5,Meguro,目黒区,280283,19110,1467,"Meguro, Nakameguro, Jiyugaoka, Komaba, Aobadai",35.62125,139.688014,871772
6,Shibuya,渋谷区,227850,15080,1511,"Shibuya, Ebisu, Harajuku, Daikanyama, Hiroo",35.664596,139.698711,1188676


# Get information about venues (restaurants, hotels, shops, bars, ...) of arround major districts

In [19]:
# declare foursquare's information to call foursquare API
CLIENT_ID = 'AKFKO1XSLFP30UAVFWM4EB3PC0YV5VMFYXOINXYHMKGYKXKF'
CLIENT_SECRET = 'JRATDLVWK5ZBC2ULRC2AFXIQZSE1GOCDO2DV5EK4UVFNWUSL'
VERSION = '20210824' # Foursquare API version

# variables to control how far, how many venues which you want to get from foursquare
radius = 1000
limit = 100

In [20]:
# function to get venues from foursquare through foursquare API
def getVenues(latitude, longitude, radius=1000):
    venues_100_list = []
    for lat, long in zip(latitude, longitude):
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, CLIENT_SECRET, VERSION, lat, long, radius, limit)

        result = requests.get(url).json()['response']['groups'][0]['items']

        venues_100_list.append(result)

    return venues_100_list


In [21]:
# call function to get venues information of 5 major districts
# venues_100_list = getVenues(latitude=df_tko['District Latitude'], longitude=df_tko['District Longtitude'])
venues_100_list = getVenues(latitude=df_tko_spe['Ward Latitude'], longitude=df_tko_spe['Ward Longtitude'])
# [print(venue_100) for venue_100 in venues_100_list]

In [22]:
# # create dataframe to hold venue information
# venues_list = []
# for district, dist_lat, dist_lng, venues_100 in zip(df_tko['Major District'], df_tko['District Latitude'], df_tko['District Longtitude'], venues_100_list):
#     if len(venues_100) > 0:
#         for rs in venues_100:
#             # if 'name' in rs['venue'] and 'name' in rs['venue']['categories'][0] and 'lat' in rs['venue']['location'] and 'lng' in rs['venue']['location'] 
#             # and 'address' in rs['venue']['location'] and 'distance' in rs['venue']['location']:
#             venues_list.append([
#                 district,
#                 dist_lat,
#                 dist_lng,
#                 rs['venue']['name'] if 'name' in rs['venue'] else '',
#                 rs['venue']['location']['lat'] if 'lat' in rs['venue']['location']  else '',
#                 rs['venue']['location']['lng'] if  'lng' in rs['venue']['location'] else '',
#                 rs['venue']['location']['address'] if 'address' in rs['venue']['location'] else '',
#                 rs['venue']['location']['distance'] if 'distance' in rs['venue']['location'] else '',
#                 rs['venue']['categories'][0]['name'] if 'name' in rs['venue']['categories'][0] else ''])

#     else:
#         print('Not found any venues for {}'.format(district))
# # create dataframe to hold venues information
# df_tko_venues = pd.DataFrame([item for item in venues_list])
# df_tko_venues.columns = ['District', 'District Latitude', 'District Longtitude', 'Venue Name', 'Venue Latitude', 'Venue Longtitude', 'Venue Address', 'Venue Distance', 'Venue Category']
# print(df_tko_venues.shape)
# df_tko_venues.head()

# create dataframe to hold venue information
venues_list = []
for ward, ward_lat, ward_lng, venues_100 in zip(df_tko_spe['Ward Name'], df_tko_spe['Ward Latitude'], df_tko_spe['Ward Longtitude'], venues_100_list):
    if len(venues_100) > 0:
        for rs in venues_100:
            # if 'name' in rs['venue'] and 'name' in rs['venue']['categories'][0] and 'lat' in rs['venue']['location'] and 'lng' in rs['venue']['location'] 
            # and 'address' in rs['venue']['location'] and 'distance' in rs['venue']['location']:
            venues_list.append([
                ward,
                ward_lat,
                ward_lng,
                rs['venue']['name'] if 'name' in rs['venue'] else '',
                rs['venue']['location']['lat'] if 'lat' in rs['venue']['location']  else '',
                rs['venue']['location']['lng'] if  'lng' in rs['venue']['location'] else '',
                rs['venue']['location']['address'] if 'address' in rs['venue']['location'] else '',
                rs['venue']['location']['distance'] if 'distance' in rs['venue']['location'] else '',
                rs['venue']['categories'][0]['name'] if 'name' in rs['venue']['categories'][0] else ''])

    else:
        print('Not found any venues for {}'.format(ward))
# create dataframe to hold venues information
df_tko_venues = pd.DataFrame([item for item in venues_list])
df_tko_venues.columns = ['Ward Name', 'Ward Latitude', 'Ward Longtitude', 'Venue Name', 'Venue Latitude', 'Venue Longtitude', 'Venue Address', 'Venue Distance', 'Venue Category']
print(df_tko_venues.shape)
df_tko_venues.head()

(700, 9)


Unnamed: 0,Ward Name,Ward Latitude,Ward Longtitude,Venue Name,Venue Latitude,Venue Longtitude,Venue Address,Venue Distance,Venue Category
0,Chiyoda,35.69381,139.753216,Nippon Budokan (日本武道館),35.693356,139.749865,北の丸公園2-3,307,Stadium
1,Chiyoda,35.69381,139.753216,Kanda Tendonya (神田天丼家),35.695765,139.754682,神田神保町3-1-14,254,Tempura Restaurant
2,Chiyoda,35.69381,139.753216,National Museum of Modern Art (東京国立近代美術館),35.690541,139.754694,北の丸公園3-1,387,Art Museum
3,Chiyoda,35.69381,139.753216,Kitanomaru Park (北の丸公園),35.691653,139.751201,北の丸公園1-1,301,Park
4,Chiyoda,35.69381,139.753216,Bondy (欧風カレー ボンディ),35.695544,139.757356,神田神保町2-3,421,Japanese Curry Restaurant


In [23]:
# df_tko_venues['District'].value_counts()

In [24]:
# save data to cvs file
df_tko_venues.to_csv('tokyo_venues_info.csv')

In [25]:
# load data from csv file
df_tko_venues = pd.read_csv('tokyo_venues_info.csv', index_col=[0])
df_tko_venues.head()

Unnamed: 0,Ward Name,Ward Latitude,Ward Longtitude,Venue Name,Venue Latitude,Venue Longtitude,Venue Address,Venue Distance,Venue Category
0,Chiyoda,35.69381,139.753216,Nippon Budokan (日本武道館),35.693356,139.749865,北の丸公園2-3,307,Stadium
1,Chiyoda,35.69381,139.753216,Kanda Tendonya (神田天丼家),35.695765,139.754682,神田神保町3-1-14,254,Tempura Restaurant
2,Chiyoda,35.69381,139.753216,National Museum of Modern Art (東京国立近代美術館),35.690541,139.754694,北の丸公園3-1,387,Art Museum
3,Chiyoda,35.69381,139.753216,Kitanomaru Park (北の丸公園),35.691653,139.751201,北の丸公園1-1,301,Park
4,Chiyoda,35.69381,139.753216,Bondy (欧風カレー ボンディ),35.695544,139.757356,神田神保町2-3,421,Japanese Curry Restaurant


In [26]:
# display data information
df_tko_venues.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 700 entries, 0 to 699
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Ward Name         700 non-null    object 
 1   Ward Latitude     700 non-null    float64
 2   Ward Longtitude   700 non-null    float64
 3   Venue Name        700 non-null    object 
 4   Venue Latitude    700 non-null    float64
 5   Venue Longtitude  700 non-null    float64
 6   Venue Address     698 non-null    object 
 7   Venue Distance    700 non-null    int64  
 8   Venue Category    700 non-null    object 
dtypes: float64(4), int64(1), object(4)
memory usage: 54.7+ KB


# Explore Data Analysis

## Compare average price

In [27]:
df_tko_spe.sort_values('Price Avg (JPY/Sq.M)', ascending=False, inplace=True)
# draw bar chart
fig = px.bar(df_tko_spe, 
             x='Ward Name', 
             y='Price Avg (JPY/Sq.M)',
             color='Price Avg (JPY/Sq.M)',
             title="Average of land price in Tokyo",
             labels={'Ward Name': 'Wards', 'Price Avg (JPY/Sq.M)': 'Average Land Price ((JPY/Sq.M)'},
             text='Price Avg (JPY/Sq.M)'
) 
fig.show()

## Compare ward_populations

In [28]:
# # draw bar chart
# fig = px.bar(df_tko_spe, 
#              x='Ward Name', 
#              y='Population',
#              color='Population',
#              title="Population in 23 wards of Tokyo",
#              labels={'Ward Name': 'Wards', 'Population': 'Population'},
#              text='Population'
# )   
# fig.show()

In [29]:
# draw pie chart
fig = px.pie(df_tko_spe, 
             values=df_tko_spe['Ward Area (Km2)'],
             names=df_tko_spe['Ward Name'],
             title='Percentage in area of 23 wards Tokyo',
             color_discrete_sequence=px.colors.sequential.Rainbow)
fig.update_layout(margin=dict(t=30, b=10, l=350, r=550))
fig.show()

In [30]:
# draw group bar chart
fig = go.Figure(
    data=[
        go.Bar(name="Density", x=df_tko_spe['Ward Name'], y=df_tko_spe['Density (/Km2)'], text=df_tko_spe['Density (/Km2)'], textposition='auto'),
        go.Bar(name="Population", x=df_tko_spe['Ward Name'], y=df_tko_spe['Population'], text=df_tko_spe['Population'], textposition='auto'),
    ],
    layout=go.Layout(title="Compare population and density of 7 wards",
                     yaxis_title="Value", xaxis_title="Wards")
)
fig.show()


## Create Tokyo map with 5 major districts of 5 wards

In [31]:
# get latitude & longitude of Tokyo
tko_geolocator = Nominatim(user_agent='Tokyo_Explorer')
tko_location = tko_geolocator.geocode(query='Tokyo')
tko_lat = tko_location.latitude
tko_lng = tko_location.longitude
print(tko_lat, tko_lng)

35.6828387 139.7594549


In [32]:
# create map of Tokyo with 5 major districts are displayed
tko_map = folium.Map(location=[tko_lat, tko_lng], zoom_start=12)

for lat, long, label in zip(df_tko_spe['Ward Latitude'], df_tko_spe['Ward Longtitude'], df_tko_spe['Ward Name']):
    label = folium.Popup(label, parser_html=True)
    folium.CircleMarker(
        [lat, long],
        radius=20,
        popup=label,
        color='magenta',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7
    ).add_to(tko_map)

# tko_map

## Count number of venue in each ward

In [33]:
# df_tko_venues_count = df_tko_venues.groupby('District')['Venue Category'].count().to_frame(name='Venue Count')
df_tko_venues_count = df_tko_venues.groupby('Ward Name')['Venue Category'].count().to_frame(name='Venue Count')
df_tko_venues_count = df_tko_venues_count.sort_values('Venue Count', ascending=False).reset_index()
df_tko_venues_count

Unnamed: 0,Ward Name,Venue Count
0,Chiyoda,100
1,Chuo,100
2,Meguro,100
3,Minato,100
4,Shibuya,100
5,Shinagawa,100
6,Shinjuku,100


In [34]:
# # draw bar chart
# fig = px.bar(df_tko_venues_count, 
#              x='Ward Name', 
#              y='Venue Count',
#              color='Venue Count',
#              title="Number of venues in business wards of Tokyo",
#              labels={'Venue Count': 'Number of venues'},
#              text='Venue Count'
# ) 
# fig.show()

## Statistic top 10 categories

In [35]:
# statistic top 10 categories
df_tko_venues_top10 = df_tko_venues['Venue Category'].value_counts()[0:10].to_frame(name='Frequency').reset_index()
df_tko_venues_top10.rename(index=str, columns={'index':'Venue Category'}, inplace=True)

# draw bar chart
fig = px.bar(df_tko_venues_top10, 
             x='Venue Category', 
             y='Frequency',
             color='Frequency',
             title="Top 10 Most Frequency Occuring Venues in 7 wards of Tokyo",
             labels={'Venue Category': 'Venues Category'},
             text='Frequency'
) 
fig.show()

## Statistic each category in each ward

In [36]:
# create new dataframe with feature 'Venue Category' is encoded to onehot encoding
df_tko_venues_onehot = pd.get_dummies(df_tko_venues['Venue Category'])
# df_tko_venues_onehot.insert(0, 'District', df_tko_venues['District'])
df_tko_venues_onehot.insert(0, 'Ward Name', df_tko_venues['Ward Name'])
df_tko_venues_onehot.head()

Unnamed: 0,Ward Name,Accessories Store,American Restaurant,Art Gallery,Art Museum,BBQ Joint,Bagel Shop,Bakery,Bar,Bath House,...,Unagi Restaurant,Used Bookstore,Vegetarian / Vegan Restaurant,Vehicle Inspection Station,Wagashi Place,Wine Bar,Wine Shop,Women's Store,Yakitori Restaurant,Yoshoku Restaurant
0,Chiyoda,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Chiyoda,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Chiyoda,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Chiyoda,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Chiyoda,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# calculate mean frequency of each category in each district
# df_tko_venues_onehot_mean = df_tko_venues_onehot.groupby('District').mean().reset_index()
df_tko_venues_onehot_mean = df_tko_venues_onehot.groupby('Ward Name').mean().reset_index()
df_tko_venues_onehot_mean.head()

Unnamed: 0,Ward Name,Accessories Store,American Restaurant,Art Gallery,Art Museum,BBQ Joint,Bagel Shop,Bakery,Bar,Bath House,...,Unagi Restaurant,Used Bookstore,Vegetarian / Vegan Restaurant,Vehicle Inspection Station,Wagashi Place,Wine Bar,Wine Shop,Women's Store,Yakitori Restaurant,Yoshoku Restaurant
0,Chiyoda,0.0,0.0,0.0,0.01,0.01,0.0,0.01,0.0,0.0,...,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.03
1,Chuo,0.0,0.0,0.0,0.0,0.01,0.0,0.03,0.0,0.0,...,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.01
2,Meguro,0.0,0.0,0.01,0.0,0.01,0.0,0.03,0.0,0.01,...,0.0,0.0,0.01,0.0,0.01,0.0,0.01,0.0,0.0,0.0
3,Minato,0.0,0.0,0.0,0.0,0.03,0.01,0.02,0.02,0.0,...,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.02,0.01
4,Shibuya,0.01,0.01,0.01,0.01,0.0,0.0,0.01,0.04,0.0,...,0.0,0.0,0.0,0.0,0.01,0.02,0.0,0.01,0.01,0.0


In [38]:
# calculate top 5 categories which are most commoned in each district
num_top_venues = 5
df_tko_venues_top5_list = []

# for place in df_tko_venues_onehot_mean['District']:
for place in df_tko_venues_onehot_mean['Ward Name']:
    # print(f'*****{place}*****')
    # df_top5 = df_tko_venues_onehot_mean[df_tko_venues_onehot_mean['District'] == place].T.reset_index()
    df_top5 = df_tko_venues_onehot_mean[df_tko_venues_onehot_mean['Ward Name'] == place].T.reset_index()
    df_top5.columns = ['Venue Category', 'Frequency Score']
    df_top5 = df_top5.iloc[1:]
    df_top5['Frequency Score'] = df_top5['Frequency Score'].astype(float).round(2)
    df_top5 = df_top5.sort_values('Frequency Score', ascending=False).reset_index(drop=True)
    df_top5 = df_top5.iloc[:5,:]
    df_tko_venues_top5_list.append([place, df_top5])
    # print(df_top5.head())

In [39]:
# display top 5 venues common of each ward in bar charts
for df_top5 in df_tko_venues_top5_list:
    # draw bar chart
    fig = px.bar(df_top5[1], 
                x='Venue Category', 
                y='Frequency Score',
                color='Frequency Score',
                title="Top 5 Most Frequency Occuring Venues in {} wards of Tokyo".format(df_top5[0]),
                labels={'Venue Category': 'Venues Category', 'Frequency Score':'Frequency Score'},
                text='Frequency Score'
    ) 
    fig.show()

In [40]:
# create a new dataframe with top 5 most common venue
# colum_names = ['District', '1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue', '4th Most Common Venue', '5th Most Common Venue']
colum_names = ['Ward Name', '1st Most Common Venue', '2nd Most Common Venue', '3rd Most Common Venue', '4th Most Common Venue', '5th Most Common Venue']
df_tko_venues_top5 = pd.DataFrame(columns=colum_names)

dist_list = []
most_com_vue_1 = []
most_com_vue_2 = []
most_com_vue_3 = []
most_com_vue_4 = []
most_com_vue_5 = []

for data in df_tko_venues_top5_list:
    dist_list.append(data[0])
    most_com_vue_1.append(data[1]['Venue Category'].iloc[0])
    most_com_vue_2.append(data[1]['Venue Category'].iloc[1])
    most_com_vue_3.append(data[1]['Venue Category'].iloc[2])
    most_com_vue_4.append(data[1]['Venue Category'].iloc[3])
    most_com_vue_5.append(data[1]['Venue Category'].iloc[4])

# df_tko_venues_top5['District'], \
df_tko_venues_top5['Ward Name'], \
df_tko_venues_top5['1st Most Common Venue'], \
df_tko_venues_top5['2nd Most Common Venue'], \
df_tko_venues_top5['3rd Most Common Venue'], \
df_tko_venues_top5['4th Most Common Venue'], \
df_tko_venues_top5['5th Most Common Venue'] = \
np.array(dist_list), most_com_vue_1, most_com_vue_2, most_com_vue_3, most_com_vue_4, most_com_vue_5

# df_tko_venues_top5['District'] = dist_list
df_tko_venues_top5['Ward Name'] = dist_list
df_tko_venues_top5['1st Most Common Venue'] = most_com_vue_1
df_tko_venues_top5.head()

Unnamed: 0,Ward Name,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,Chiyoda,Café,Ramen Restaurant,Japanese Curry Restaurant,Tempura Restaurant,Chinese Restaurant
1,Chuo,Sushi Restaurant,Japanese Restaurant,Monjayaki Restaurant,Italian Restaurant,Soba Restaurant
2,Meguro,Convenience Store,Coffee Shop,Café,Japanese Restaurant,Ramen Restaurant
3,Minato,Japanese Restaurant,Chinese Restaurant,Coffee Shop,Italian Restaurant,Soba Restaurant
4,Shibuya,Café,Coffee Shop,French Restaurant,Sake Bar,Bar


## Statistic number of restaurants in each ward

In [41]:
# create a new dataframe with restaurant only
df_tko_venues_rest = df_tko_venues[df_tko_venues['Venue Category'].str.contains('Restaurant')]
df_tko_venues_rest = df_tko_venues_rest.reset_index(drop=True)
print(df_tko_venues_rest.shape)
df_tko_venues_rest.head()

(289, 9)


Unnamed: 0,Ward Name,Ward Latitude,Ward Longtitude,Venue Name,Venue Latitude,Venue Longtitude,Venue Address,Venue Distance,Venue Category
0,Chiyoda,35.69381,139.753216,Kanda Tendonya (神田天丼家),35.695765,139.754682,神田神保町3-1-14,254,Tempura Restaurant
1,Chiyoda,35.69381,139.753216,Bondy (欧風カレー ボンディ),35.695544,139.757356,神田神保町2-3,421,Japanese Curry Restaurant
2,Chiyoda,35.69381,139.753216,Jimbocho Kurosu (神保町 黒須),35.695539,139.754851,神田神保町3-1-19,242,Ramen Restaurant
3,Chiyoda,35.69381,139.753216,Gavial (ガヴィアル),35.695738,139.758385,神田神保町1-9,514,Japanese Curry Restaurant
4,Chiyoda,35.69381,139.753216,Sushi Masa (九段下 寿司政),35.695234,139.752227,九段南1-4-1,182,Sushi Restaurant


In [42]:
# create map with restaurants
rest_map = folium.Map(location=[tko_lat, tko_lng], zoom_start=12, titles='OpenStreetMap', attr='<a href=https://github.com/python-visualization/folium/>Folium</a>')
# dist_list = df_tko_venues_rest['District'].unique().tolist()
dist_list = df_tko_venues_rest['Ward Name'].unique().tolist()
color_list = ['#FF0000', '#FFFF00','#00FF00','#008000' ,'#008080', '#0000FF', '#FF00FF']

for lat, lng, cat, ward in zip(df_tko_venues_rest['Venue Latitude'], df_tko_venues_rest['Venue Longtitude'], df_tko_venues_rest['Venue Category'], df_tko_venues_rest['Ward Name']):
    label = folium.Popup(str(cat) + ' ' + str(ward), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=7,
        popup=label,
        color=color_list[dist_list.index(ward)-1],
        fill=True,
        fill_color=color_list[dist_list.index(ward)-1],
        fill_oppacity=0.3
    ).add_to(rest_map)

# rest_map

In [43]:
# count number of restaurant in each district
# df_tko_venues_rest_count = df_tko_venues_rest.groupby(['District'])['Venue Category'].count().to_frame()
df_tko_venues_rest_count = df_tko_venues_rest.groupby(['Ward Name'])['Venue Category'].count().to_frame()
df_lat_lng_tem = df_tko_venues_rest.groupby(['Ward Name'])['Ward Latitude', 'Ward Longtitude'].first()
df_tko_venues_rest_count.reset_index(inplace=True)
df_tko_venues_rest_count.columns = ['Ward Name', 'Number of Rest']
df_tko_venues_rest_count = df_tko_venues_rest_count.merge(df_lat_lng_tem, on='Ward Name', how='left')
df_tko_venues_rest_count.sort_values('Number of Rest', ascending=False, inplace=True)
df_tko_venues_rest_count


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,Ward Name,Number of Rest,Ward Latitude,Ward Longtitude
1,Chuo,76,35.666255,139.775565
3,Minato,54,35.643227,139.740055
0,Chiyoda,45,35.69381,139.753216
4,Shibuya,31,35.664596,139.698711
5,Shinagawa,30,35.599252,139.73891
6,Shinjuku,29,35.693763,139.703632
2,Meguro,24,35.62125,139.688014


In [44]:
hm_data = df_tko_venues_rest_count[['Ward Latitude', 'Ward Longtitude', 'Number of Rest']]
hm_data = hm_data.values.tolist()
# hm_data[:5]
hmap = folium.Map(location=[tko_lat, tko_lng], control_scale=True, attr='USGS style', zoom_start=5)
HeatMap(hm_data, rasdius=10).add_to(hmap)
# hmap

<folium.plugins.heat_map.HeatMap at 0x7f8011035f10>

In [45]:
# draw bar chart
fig = px.bar(df_tko_venues_rest_count, 
            # x='District', 
            x='Ward Name', 
            y='Number of Rest',
            color='Number of Rest',
            title="Number of Restaurants in each Ward",
            labels={'Ward Name': 'Ward', 'Number of Rest':'Number of Restaurant'},
            text='Number of Rest'
) 
fig.show()

In [46]:
# set number of clusters
kclusters = 3

# df_tko_venues_clustering = df_tko_venues_onehot_mean.drop('District', axis=1)
df_tko_venues_clustering = df_tko_venues_onehot_mean.drop('Ward Name', axis=1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df_tko_venues_clustering)

# check cluster labels generated for each row in the dataframe
print ("Check the 5 Cluster labels :",  kmeans.labels_)

Check the 5 Cluster labels : [1 0 2 0 1 2 1]


In [47]:
df_tko_spe_all = df_tko_spe.copy()
df_tko_spe_all.insert(0, 'Cluster ID', kmeans.labels_)
df_tko_spe_all = df_tko_spe_all.join(df_tko_venues_top5.set_index('Ward Name'), on='Ward Name')
df_tko_spe_all.head()

Unnamed: 0,Cluster ID,Ward Name,Kanji,Population,Density (/Km2),Ward Area (Km2),Major District,Ward Latitude,Ward Longtitude,Price Avg (JPY/Sq.M),1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue
0,1,Chiyoda,千代田区,59441,5100,1166,"Nagatacho, Kasumigaseki, Ōtemachi, Marunouchi,...",35.69381,139.753216,2737238,Café,Ramen Restaurant,Japanese Curry Restaurant,Tempura Restaurant,Chinese Restaurant
2,0,Minato,港区,248071,12180,2037,"Odaiba, Shinbashi, Hamamatsuchō, Mita, Roppong...",35.643227,139.740055,2108296,Japanese Restaurant,Chinese Restaurant,Coffee Shop,Italian Restaurant,Soba Restaurant
1,2,Chuo,中央区,147620,14460,1021,"Nihonbashi, Kayabachō, Ginza, Tsukiji, Hatchōb...",35.666255,139.775565,1921309,Sushi Restaurant,Japanese Restaurant,Monjayaki Restaurant,Italian Restaurant,Soba Restaurant
6,0,Shibuya,渋谷区,227850,15080,1511,"Shibuya, Ebisu, Harajuku, Daikanyama, Hiroo",35.664596,139.698711,1188676,Café,Coffee Shop,French Restaurant,Sake Bar,Bar
3,1,Shinjuku,新宿区,339211,18620,1822,"Shinjuku, Takadanobaba, Ōkubo, Kagurazaka, Ich...",35.693763,139.703632,871944,Sake Bar,Bar,Ramen Restaurant,BBQ Joint,Japanese Restaurant


['Chuo', 'Minato', 'Chiyoda', 'Shibuya', 'Shinagawa', 'Shinjuku', 'Meguro']

In [48]:
# create map
map_clusters = folium.Map(location=[tko_lat, tko_lng], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
wards = df_tko_venues_rest_count['Ward Name'].tolist()
n_rest = df_tko_venues_rest_count['Number of Rest'].tolist()

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_tko_spe_all['Ward Latitude'], 
                                  df_tko_spe_all['Ward Longtitude'], 
                                  df_tko_spe_all['Ward Name'], 
                                  df_tko_spe_all['Cluster ID']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=df_tko_venues_rest_count['Number of Rest'].tolist(),
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters