# Japan Hostels: CPI Analysis

 ## Data Preparation/Pre-processing

In [23]:

import json
import os
from io import StringIO

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from geopy import OpenCage
from sklearn.cluster import KMeans

%matplotlib inline

SyntaxError: invalid syntax (<ipython-input-23-27de370687e5>, line 1)

#### Hostel Dataset:

In [None]:
hostel_df = pd.read_csv('data/Hostel.csv')
hostel_df.head()

We will drop the 1st column and rename other columns to make them more readable.

In [3]:
columns = ['Name', 'City', 'StartPrice', 'DistanceFromCityCentre', 'OverallScore', 
           'RatingCategory', 'Atmosphere', 'Cleanliness', 'Facilities', 'Location', 
           'Security', 'Staff', 'ValueForMoney', 'Longitude', 'Latitude']

In [4]:
# Drop the first column ('Unnamed: 0') and apply the readable column names.
# Rebinding instead of dropping in place, together with errors='ignore',
# keeps this cell re-runnable: a second run finds the column already gone
# instead of raising KeyError.
hostel_df = hostel_df.drop(columns=['Unnamed: 0'], errors='ignore')
hostel_df.columns = columns
hostel_df.head()

NameError: name 'hostel_df' is not defined

In [None]:
print("Number of Hostels: ", hostel_df.shape[0])

Let's check if any hostels are missing location data since that is essential for our analysis.

In [None]:
print("Num of hostels with missing Longitude: ", hostel_df.Longitude.isna().sum())

In [None]:
print("Num of hostels with missing Latitude: ", hostel_df.Latitude.isna().sum())

In [5]:
# Keep only the hostels for which both coordinates are present
has_coords = hostel_df.Longitude.notna() & hostel_df.Latitude.notna()
hostel_df = hostel_df[has_coords]
hostel_df.shape

NameError: name 'hostel_df' is not defined

If we see the DistanceFromCityCentre column, we see that it has distance as well as text, let's remove the unwanted text.

In [None]:
hostel_df.DistanceFromCityCentre = hostel_df.DistanceFromCityCentre.replace(r'[^\d\.]+', '', regex=True)
hostel_df.head()

Let's do a final check if any more values are missing and what we want to do about them

In [None]:
hostel_df.count()

In [None]:
hostel_df.dtypes

In [None]:
# Convert DistanceFromCityCentre to float64
hostel_df.DistanceFromCityCentre = hostel_df.DistanceFromCityCentre.astype('float')
hostel_df.dtypes

In [None]:
hostel_df.head()

In [None]:
# Take an explicit copy: new columns are assigned to the Tokyo subset later on,
# and doing that on a filtered view of hostel_df triggers SettingWithCopyWarning.
tokyo_hostels_df = hostel_df[hostel_df.City == "Tokyo"].copy()

<br>  

#### Neighborhood Data(Foursquare):

In [None]:
search_url = 'https://api.foursquare.com/v2/venues/search'
explore_url = 'https://api.foursquare.com/v2/venues/explore'

I've stored my Foursquare credentials in a file, which I am reading below.

In [None]:
with open("data/creds.json", 'r') as f:
    creds = json.load(f)
CLIENT_ID = creds['id'] # your Foursquare ID
CLIENT_SECRET = creds['secret'] # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

In [None]:
# Select the first remaining hostel by position. Rows with missing coordinates
# were filtered out earlier without resetting the index, so the label 0 is not
# guaranteed to exist and `.loc[0]` could raise KeyError.
sample_hotel = hostel_df.iloc[0]
sample_hotel.Name

In [6]:
params = dict(
  client_id=CLIENT_ID,
  client_secret=CLIENT_SECRET,
  v=VERSION,
  ll=f"{sample_hotel.Latitude}, {sample_hotel.Longitude}",
  radius=500,
  limit=100,
  sortByPopularity=1
)

NameError: name 'CLIENT_ID' is not defined

In [None]:
def get_recommended_venues(name, lattitude, longitude, limit=100, radius=500):
    """Fetch the venues returned by the Foursquare explore endpoint for one hostel.

    Parameters
    ----------
    name : str
        Hostel name; copied into every returned record as 'HostelName'.
    lattitude, longitude : float
        Coordinates sent as the `ll` query parameter.
    limit : int, default 100
        Value sent as the `limit` query parameter.
    radius : int, default 500
        Value sent as the `radius` query parameter
        (NOTE(review): presumably metres - confirm against the API docs).

    Returns
    -------
    list of dict
        One record per venue with the keys 'HostelName', 'VenueName',
        'Category', 'VenueLatitue' and 'VenueLongtitude'. The misspelled keys
        are kept on purpose because a later cell renames them. The list is
        empty when the response contains no venue groups.

    Raises
    ------
    requests.HTTPError
        If the API answers with a non-success status code.
    """
    params = dict(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET,
        v=VERSION,
        ll=f"{lattitude}, {longitude}",
        radius=radius,
        limit=limit,
        sortByPopularity=1
    )
    # A timeout keeps one stalled request from hanging the whole collection
    # loop; raise_for_status turns quota/credential problems into a clear
    # error instead of a KeyError on the missing 'groups' key.
    response = requests.get(explore_url, params=params, timeout=30)
    response.raise_for_status()

    groups = response.json().get('response', {}).get('groups') or []
    items = groups[0].get('items', []) if groups else []

    venues = []
    for item in items:
        venue = item['venue']
        # Category stays "" when the venue has no category flagged as primary.
        primary_cat = ""
        for category in venue.get('categories', []):
            if category.get('primary'):
                primary_cat = category['name']
        venues.append({
            'HostelName': name,
            'VenueName': venue['name'],
            'Category': primary_cat,
            'VenueLatitue': venue['location']['lat'],
            'VenueLongtitude': venue['location']['lng']
        })
    return venues

In [None]:
venue_list = []

In [None]:
# Start from an empty list so that re-running this cell does not append a
# second copy of every venue to the results of the previous run.
venue_list = []
for name, lat, long in zip(hostel_df.Name, hostel_df.Latitude, hostel_df.Longitude):
    venue_list.extend(get_recommended_venues(name, lat, long))
    # running total of collected venues, printed as a progress indicator
    print(len(venue_list))

In [None]:
venues_df = pd.DataFrame(venue_list)
venues_df = venues_df.rename(columns={'VenueLatitue': 'VenueLatitude', 'VenueLongtitude': 'VenueLongitude'})
venues_df.head()

Let's create a new dataframe for venues for hostels in tokyo.

In [None]:
tokyo_venues_df = venues_df[venues_df.HostelName.isin(tokyo_hostels_df.Name)]

In [None]:
tokyo_venues_df.head()

In [None]:
tokyo_venues_df.shape

<br>

#### Land Price Data (utinokati.com):

In [None]:
land_price_url = 'https://utinokati.com/en/details/land-market-value/area/Tokyo/'

# Fail fast on HTTP errors rather than parsing an error page, and name the
# parser explicitly: without it BeautifulSoup picks whichever parser happens
# to be installed (and warns), which can change the parsed tree between machines.
response = requests.get(land_price_url, timeout=30)
response.raise_for_status()
data = BeautifulSoup(response.content, 'html.parser')

In [None]:
# Locate the price table and hand it to pandas as a file-like object;
# passing a literal HTML string to read_html is deprecated in recent pandas.
price_table = data.find('table', {'class': 'table-condensed'})
if price_table is None:
    raise ValueError("Land price table ('table-condensed') not found on the page")
price_df = pd.read_html(StringIO(price_table.prettify()))[0]

In [None]:
# Drop the columns we do not need and give the remaining ones readable names.
price_df = (
    price_df
    .drop(columns=['#', 'Average Trading Price'])
    .rename(columns={'Average Unit Price': 'PricePerSqMeter', 'Area': 'Neighborhood'})
)
# Turn e.g. "1,028,047 JPY/sq.m" into the integer 1028047. The unit is removed
# as a literal suffix: str.strip(' JPY/sq.m') would treat the argument as a
# *set of characters* to trim, which only works by coincidence.
price_df['PricePerSqMeter'] = (
    price_df['PricePerSqMeter']
    .astype(str)
    .str.replace(' JPY/sq.m', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
    .astype(int)
)

In [None]:
price_df.head()

Let's see all the neighborhoods

In [None]:
print(price_df.Neighborhood.sort_values())

Let's remove -ku suffix from all the neighborhood names

In [None]:
# Remove "Ku" only where it is a trailing suffix (with any separating space or
# hyphen). The unanchored pattern "(Ku)" also deleted "Ku" inside names such as
# "Kunitachi". The result is assigned back to the column because an in-place
# replace on `price_df.Neighborhood` is a chained operation that newer pandas
# versions no longer propagate to the dataframe.
price_df['Neighborhood'] = price_df['Neighborhood'].str.replace(r'[\s-]*Ku\s*$', '', regex=True)

In [None]:
price_df.head()

The hostels dataset doesn't contain information on the locality of the hostel. We'll use reverse geocoding for this purpose. I'm using a free account on OpenCage for this purpose.

We'll try the API with one Hostel and then write function to do the same for all hostels in tokyo.

In [7]:
temp = tokyo_hostels_df.iloc[1]
temp.to_frame().T

NameError: name 'tokyo_hostels_df' is not defined

In [None]:
from opencage.geocoder import OpenCageGeocode

# Read the OpenCage API key from the OPENCAGE_API_KEY environment variable
# instead of hard-coding it in the notebook. A key that was ever committed in
# a notebook should be treated as leaked and rotated.
geo = OpenCageGeocode(os.environ["OPENCAGE_API_KEY"])

In [None]:
results = geo.reverse_geocode(temp.Latitude, temp.Longitude)

In [None]:
results[0]['formatted']

In [None]:
import time

# Reverse-geocode every Tokyo hostel and collect the formatted address.
address = []
for lat, long in zip(tokyo_hostels_df.Latitude, tokyo_hostels_df.Longitude):
    results = geo.reverse_geocode(lat, long)
    # Use an empty string when nothing comes back so that the list stays
    # aligned with the dataframe rows (an empty address simply matches no
    # neighborhood in the next step).
    address.append(results[0]['formatted'] if results else "")
    time.sleep(1)  # pause between calls to the geocoding API

# The neighborhood-matching cell reads tokyo_hostels_df.Address, so the
# collected addresses have to be attached to the dataframe here.
tokyo_hostels_df = tokyo_hostels_df.assign(Address=address)

In [None]:
# For each hostel address, take the first neighborhood name (in price_df
# order) that occurs in it, compared case-insensitively; NaN when none does.
nb_names = list(price_df.Neighborhood)
neighborhood = []
for addr in tokyo_hostels_df.Address:
    addr_lc = addr.lower()
    matched = next((nb for nb in nb_names if nb.lower() in addr_lc), None)
    if matched is None:
        print("No nb found for ", addr)
        matched = np.nan
    neighborhood.append(matched)


We will remove these two hostels from our dataset since we don't have the name of their neighborhood.

In [8]:
print("Current number of hostels: ", tokyo_hostels_df.shape[0])

NameError: name 'tokyo_hostels_df' is not defined

In [None]:
tokyo_hostels_df['Neighborhood'] = neighborhood

In [None]:
# keep only the hostels for which a neighborhood was found
tokyo_hostels_df = tokyo_hostels_df[tokyo_hostels_df.Neighborhood.notna()]

In [None]:
print("New number of hostels: ", tokyo_hostels_df.shape[0])

In [None]:
tokyo_hostels_df.head()

In [None]:
hostel_df.to_pickle('data/hostels_cleaned.pkl')
venues_df.to_pickle('data/hostel_venues.pkl')
price_df.to_pickle('data/tokyo_land_prices.pkl')
tokyo_hostels_df.to_pickle('data/tokyo_hostels.pkl')
tokyo_venues_df.to_pickle('data/tokyo_hostels_venues.pkl')

<hr><br>

## 5. Analysis

### Loading data

In [None]:
hostel_df = pd.read_pickle('data/hostels_cleaned.pkl')
price_df = pd.read_pickle('data/tokyo_land_prices.pkl')
tokyo_hostels_df = pd.read_pickle('data/tokyo_hostels.pkl')
tokyo_venues_df = pd.read_pickle('data/tokyo_hostels_venues.pkl')
venues_df = pd.read_pickle('data/hostel_venues.pkl')

### 5.1 Exploratory Data Analysis
Using EDA, we will try to answer the following questions:
- How does price vary with location?
- Which hostels are most secure and where are they located?
- Where are the <i>'value of money'</i> hostels located?
- How does proximity to transportation affect hostel rating?

Let's first start by analyzing which cities are covered in our dataset

In [None]:
hostel_df.City.value_counts()

In [None]:
plt.figure(figsize=(12, 6))
ax = hostel_df.City.value_counts().plot(kind="barh", color="#BC002D", fontsize=12)
ax.set_title("Hostel Count (Japan)", fontsize=16)
ax.set_xlabel("# of hostels", fontsize=12)

for p in ax.patches:
    ax.annotate(str(p.get_width()), (p.get_width() + 0.5, p.get_y() + 0.2))

<br>Tokyo has the largest number of hostels, which is understandable since it is one of the largest cities in the world and hence might attract a lot of visitors through its airport.

<br>Going ahead, our focus will only be tokyo hostels

In [None]:
tokyo_hostels_df.head()

In [None]:
tokyo_hostels_df.shape

<br>Let's visualize all these hostels on the map

In [None]:
tokyo_map = folium.Map(location=(35.689487, 139.691711), zoom_start=11)

# add markers to map
for lat, lng, label in zip(tokyo_hostels_df.Latitude, tokyo_hostels_df.Longitude, tokyo_hostels_df.Name):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='#BC002D',
        fill=True,
        fill_color='#FFF',
        fill_opacity=0.75,
        parse_html=False).add_to(tokyo_map)
    
tokyo_map

<hr><br>

#### Choropleth: Hostel Density

Let's visualize how many hostels are located in each neighborhood

In [None]:
# Number of hostels per neighborhood, as a two-column frame
hostels_density = (
    tokyo_hostels_df
    .groupby('Neighborhood')['Name']
    .count()
    .reset_index(name='Count')
)
hostels_density.sort_values(['Count'], ascending=False).head(5)

In [None]:
import geopandas

# The merge key gets a " Ku" suffix so that it can be matched against the
# `area` field of the GeoJSON. The suffix is added on a temporary copy:
# mutating hostels_density itself made this cell non-idempotent (a second run
# produced "... Ku Ku" and an empty merge).
density_with_suffix = hostels_density.assign(
    Neighborhood=hostels_density.Neighborhood + ' Ku'
)
gdf = geopandas.read_file('data/tokyo.json')
gdf = gdf.merge(density_with_suffix, left_on="area", right_on="Neighborhood")
gdf.head()

In [None]:
import branca

# Colour scale for the hostel-count choropleth: breakpoints are quantiles of
# the per-neighborhood counts, the range spans the smallest to largest count.
# NOTE(review): five colours are paired with only four breakpoints - confirm
# whether a fifth breakpoint (or one colour fewer) was intended.
colormap = branca.colormap.LinearColormap(
    colors=['#FFF9C4','#FFF176','#FFCA28','#FF9800','#D84315'],
    index=gdf['Count'].quantile([0.25, 0.5, 0.7, 0.9]),
    vmin=hostels_density.Count.min(),
    vmax=hostels_density.Count.max()
)

# Caption text stored on the colour scale
colormap.caption="Hostel Density in Tokyo"

In [None]:
tokyo_map = folium.Map(location=(35.689487, 139.691711), zoom_start=11)

gj = folium.GeoJson(
    gdf,
    style_function=lambda x: {'fillColor': colormap(x['properties']['Count']), 'color': '#000',
                                                    'weight':0.25, 'fillOpacity':0.75},
    tooltip=folium.GeoJsonTooltip(fields=['Neighborhood', 'Count'], 
                                  aliases=['Neighborhood', 'Num of Hostels'], 
                                  localize=True)
).add_to(tokyo_map)

tokyo_map

It is evident from the above map that __Taito__ and __Chuo__ are host to many hostels. Hostel density is highest in these neighborhoods

Let's checkout the most expensive neighborhoods and number of hostels in that region:

In [None]:
# Remove the trailing " Ku" so the names can be joined with price_df.
# str.strip(" Ku") is wrong here: it trims any of the characters ' ', 'K', 'u'
# from BOTH ends, turning "Kita Ku" into "ita" and "Shinjuku Ku" into
# "Shinjuk", so those neighborhoods silently dropped out of the merge.
# assign() builds a new frame instead of writing into a slice of gdf.
gdf_new = gdf[['Neighborhood', 'Count']].assign(
    Neighborhood=gdf.Neighborhood.str.replace(r'\s*Ku$', '', regex=True)
)
cnt_price_df = (
    gdf_new
    .merge(price_df, on="Neighborhood")
    .sort_values('PricePerSqMeter', ascending=False)
    .reset_index(drop=True)
)
cnt_price_df.head()

Let's checkout the least expensive neighborhoods and number of hostels in that region:

In [None]:
cnt_price_df.tail()

Now, let's take a look at all the neighborhoods

In [None]:
cnt_price_df.sort_values(['Count', 'PricePerSqMeter'])

In [None]:
# Relative difference between two hard-coded prices per square metre.
# NOTE(review): presumably the Taito (1028047) and Sumida (586675) values of
# PricePerSqMeter from cnt_price_df - confirm against the table above.
(1028047 - 586675) / 1028047

- Sumida-ku seems to be popular since it ranks third in the list of number of hostels and, at the same time, it is the 4th cheapest neighborhood in our list.
- The price of property (per sq. meter) in Sumida-ku is almost 43% less than that in Taito-ku, which, combined with the fact that it is close to Taito-ku and has fewer hostels than Taito-ku, makes it an exciting prospect for new investors.

<hr><br>

##### Q. How does price vary with distance from city center?
My hypothesis is that the closer a hostel is to the city center, the higher its price. Let's see if we can confirm that using data.

In [None]:
tokyo_hostels_df.StartPrice.value_counts()

We can see that one hostel has a very high starting price! Can a hostel charge 1003200 Yen? Doesn't seem likely. Hence, we will remove that row from the dataset.

In [None]:
tokyo_hostels_df = tokyo_hostels_df[tokyo_hostels_df.StartPrice < 1000000]

In [None]:
tokyo_hostels_df.plot(x='DistanceFromCityCentre', y='StartPrice', kind='scatter', figsize=(15, 8), color='#BC002D')
plt.xlabel("Distance from City Center (km)")
plt.ylabel("Starting Price")
plt.title("Hostel Starting Price w.r.t. proximity to city center", fontsize=16)
plt.show()

We cannot see any pattern that supports our assumption. If the assumption were correct, we should be seeing a curve that decreases as distance increases. However, there is no such curve visible.

Let's check the correlation score of StartPrice with DistanceFromCityCentre

In [None]:
tokyo_hostels_df[['StartPrice', 'DistanceFromCityCentre']].corr()

The correlation coefficient is very low. Hence, we can say that the starting price of hostels does not vary according to its distance from the city center.

> **The starting price of hostels does not vary much depending on its distance from the city center.**

<hr><br>

##### Q. Which hostels are most secure and where are they located?
I have no idea as to where the most secure hostels are located. Let's find out!

In [None]:
plt.figure(figsize=(15,8))
sns.countplot(x=tokyo_hostels_df.Security)

As we can see, most hostels are quite secure as rated by users, so that's some great news for travellers. Let's put hostels with a security rating above 9 on the map.

In [None]:
highsec_hostels = tokyo_hostels_df[tokyo_hostels_df.Security > 9]
lowsec_hostels = tokyo_hostels_df[tokyo_hostels_df.Security <= 9]


In [None]:
tokyo_map = folium.Map(location=(35.689487, 139.691711), zoom_start=11)

# add markers to map
for lat, lng, label in zip(highsec_hostels.Latitude, highsec_hostels.Longitude, highsec_hostels.Name):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='#33cc33',
        fill=True,
        fill_color='#FFF',
        fill_opacity=0.75,
        parse_html=False).add_to(tokyo_map)
    
# add markers to map
for lat, lng, label in zip(lowsec_hostels.Latitude, lowsec_hostels.Longitude, lowsec_hostels.Name):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='#BC002D',
        fill=True,
        fill_color='#FFF',
        fill_opacity=0.75,
        parse_html=False).add_to(tokyo_map)
    
tokyo_map

Let's try to visualize the share of high-security hostels in each neighborhood.

In [None]:
# Count of high-security hostels per neighborhood
highsec_hostel_areas = (
    highsec_hostels
    .groupby('Neighborhood')['Name']
    .count()
    .to_frame()
    .reset_index()
    .rename(columns={'Name': 'Count'})
)
# Total number of hostels in those same neighborhoods; both group-bys are
# ordered by neighborhood name, so the values line up positionally.
in_highsec_area = tokyo_hostels_df.Neighborhood.isin(highsec_hostel_areas.Neighborhood)
total_per_area = tokyo_hostels_df[in_highsec_area].groupby('Neighborhood')['Name'].count()
highsec_hostel_areas['Total'] = total_per_area.values

In [None]:
highsec_hostel_areas['% High Security'] = np.round(highsec_hostel_areas.Count/highsec_hostel_areas.Total, decimals=2)*100
highsec_hostel_areas.sort_values(['% High Security', 'Count'], ascending=False)

We can see that Katsushika, Kita, Meguro, Shibuya and Shinagawa have 100% safe hostels. Let's see them on a map and check if they are nearby.

In [None]:
highsec_gdf = gdf[gdf.Neighborhood.isin(highsec_hostel_areas[highsec_hostel_areas['% High Security']==100].Neighborhood + ' Ku')]
highsec_gdf

In [None]:
tokyo_map = folium.Map(location=(35.689487, 139.691711), zoom_start=11)

gj = folium.GeoJson(
    highsec_gdf,
    style_function=lambda x: {'fillColor': '#BC002D', 'color': '#000',
                                                    'weight':0.25, 'fillOpacity':0.5},
    tooltip=folium.GeoJsonTooltip(fields=['Neighborhood', 'Count'], 
                                  aliases=['Neighborhood', 'Num of Hostels'], 
                                  localize=True)
).add_to(tokyo_map)

tokyo_map

> **Hostels with the very high security score are in Katsushika, Kita, Meguro, Shibuya and Shinagawa**

<hr><br>

##### Q. Where are the 'value of money' hostels located?

In [12]:
tokyo_hostels_df.ValueForMoney.plot(kind="hist")
plt.title("Value for Money Histogram")
plt.show()

NameError: name 'tokyo_hostels_df' is not defined

Let's place hostels with highest 'value for money' rating

In [11]:
tokyo_map = folium.Map(location=(35.689487, 139.691711), zoom_start=11)
high_vfm = tokyo_hostels_df[tokyo_hostels_df.ValueForMoney == 10]
low_vfm = tokyo_hostels_df[tokyo_hostels_df.ValueForMoney <= 8]

# add markers to map
for lat, lng, label in zip(high_vfm.Latitude, high_vfm.Longitude, high_vfm.Name):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='#33cc33',
        fill=True,
        fill_color='#FFF',
        fill_opacity=0.75,
        parse_html=False).add_to(tokyo_map)
    
tokyo_map

NameError: name 'folium' is not defined

In [None]:
low_vfm.StartPrice.mean()

In [None]:
high_vfm.StartPrice.mean()

In [9]:
print("Distance from city center:")
print("Low Value for money: ", round(low_vfm.DistanceFromCityCentre.mean(), 2))
print("High Value for money: ", round(high_vfm.DistanceFromCityCentre.mean(), 2))

Distance from city center:


NameError: name 'low_vfm' is not defined

<br>
Let's visualise neighborhoods according to value for money score.

In [None]:
nbr_vfm = tokyo_hostels_df.groupby('Neighborhood')['ValueForMoney'].mean().reset_index()
nbr_vfm.Neighborhood =  nbr_vfm.Neighborhood + " Ku"
vfm_gdf = pd.merge(gdf, nbr_vfm, on='Neighborhood')
vfm_gdf.head()

In [None]:
# Colour scale for the value-for-money choropleth. The range must come from the
# ValueForMoney column itself; it was previously copied from the hostel-count
# scale, which put vmin/vmax on a different unit than the breakpoints.
colormap = branca.colormap.LinearColormap(
    colors=['#FFF9C4', '#F0F4C3', '#DCE775','#689F38'],
    index=vfm_gdf['ValueForMoney'].quantile([0.1, 0.2, 0.75, 0.9]),
    vmin=vfm_gdf['ValueForMoney'].min(),
    vmax=vfm_gdf['ValueForMoney'].max()
)


In [None]:
tokyo_map = folium.Map(location=(35.689487, 139.691711), zoom_start=11)

gj = folium.GeoJson(
    vfm_gdf,
    style_function=lambda x: {'fillColor': colormap(x['properties']['ValueForMoney']), 'color': '#000',
                                                    'weight':0.25, 'fillOpacity':0.75},
    tooltip=folium.GeoJsonTooltip(fields=['Neighborhood', 'ValueForMoney'], 
                                  aliases=['Neighborhood', 'Value for Money'], 
                                  localize=True)
).add_to(tokyo_map)

tokyo_map

- **High value for money hostels are comparatively cheaper and are located away from the city center** 
- **Hostels in Chiyoda are near city center and have high value for money rating as well**

<hr><br>

##### Q. Cheap hostels near metro stations?

We will remove rows with Empty VenueCategory

In [None]:
# Treat an empty category as missing and drop incomplete rows. The replacement
# is assigned back explicitly: an in-place replace on `df.Category` is a
# chained operation that newer pandas versions do not propagate to the frame.
# errors='ignore' on the column drop avoids a KeyError when the frame has no
# 'Unnamed: 0' column (it only exists if the data went through a CSV round trip).
tokyo_venues_df = (
    tokyo_venues_df
    .assign(Category=tokyo_venues_df.Category.replace('', np.nan))
    .dropna()
    .drop(columns=['Unnamed: 0'], errors='ignore')
)
tokyo_venues_df.shape

Let's check the frequency of each Venue Category

In [None]:
tokyo_venues_df.Category.value_counts()

<br> Let's check the average number of venues per Hostel

In [None]:
tokyo_venues_df.groupby('HostelName').count()

Let's extract the hostels which are close to Metro Stations.

In [None]:
metro_hostels = tokyo_venues_df[tokyo_venues_df.Category.str.contains('Station')]

metro_hostels

In [None]:
metro_hostels = tokyo_hostels_df[tokyo_hostels_df.Name.isin(metro_hostels.HostelName)]
metro_hostels.shape

In [None]:
metro_hostels.RatingCategory.value_counts()

In [None]:
tokyo_hostels_df.RatingCategory.value_counts()

In [None]:
print("Hostels located near a metro/train/bus station:")
print(f"{metro_hostels.RatingCategory.value_counts()['Fabulous']/metro_hostels.RatingCategory.value_counts().sum()*100}% were rated Fabulous")

In [None]:
print("In general:")
print(f"{tokyo_hostels_df.RatingCategory.value_counts()['Fabulous']/tokyo_hostels_df.RatingCategory.value_counts().sum()*100}% were rated Fabulous")

We can observe above that 50% of the hostels near metro/train/bus stations were rated Fabulous compared to only 33% in general.

This shows that there is some correlation between the proximity to mode of transportation and the overall rating of the hostel

> **The proportion of hostels rated fabulous is more near metro/bus/train stations than in general.**

<br>


##### Which neighborhood venues affect a user's rating for location of hostel?
Let's do a similar analysis for a few categories and see how it affects the user rating. 

We'll do similar analysis for *Convenience Store*, *Restaurant*, *Bar*, *Shopping Mall* categories

In [None]:
hostels_near_store = tokyo_hostels_df[
    tokyo_hostels_df.Name.isin(tokyo_venues_df.HostelName[tokyo_venues_df.Category.str.contains('store', case=False)])]

In [None]:
hostels_near_store.RatingCategory.value_counts()

In [None]:
print("Hostels located near a convenience store:")
print(f"{hostels_near_store.RatingCategory.value_counts()['Fabulous']/hostels_near_store.RatingCategory.value_counts().sum()*100}% were rated Fabulous")

> **Travellers do not seem to care much about proximity to a convenience store, since it doesn't seem to affect the overall hostel rating.**

Let's check for public parks

In [None]:
hostels_near_mall = tokyo_hostels_df[
    tokyo_hostels_df.Name.isin(tokyo_venues_df.HostelName[tokyo_venues_df.Category.str.contains('park', case=False)])]

In [None]:
hostels_near_mall.RatingCategory.value_counts()

In [None]:
# This selection matches venues whose category contains 'park', so the label
# must say so (it previously repeated the convenience-store text).
print("Hostels located near a park:")
# .get(..., 0) reports 0% instead of raising KeyError when none is 'Fabulous'.
fabulous_share = hostels_near_mall.RatingCategory.value_counts(normalize=True).get('Fabulous', 0) * 100
print(f"{fabulous_share}% were rated Fabulous")

> **Hostels close to a park are rated lower than others. It seems that travellers do not like public parks near hostels.**

In [10]:
hostels_near_site = tokyo_hostels_df[
    tokyo_hostels_df.Name.isin(tokyo_venues_df.HostelName[tokyo_venues_df.Category.str.contains('Historic Site', case=False)])]

NameError: name 'tokyo_hostels_df' is not defined

In [None]:
hostels_near_site.RatingCategory.value_counts()

In [None]:
print("Hostels located near a historic site:")
print(f"{hostels_near_site.RatingCategory.value_counts()['Fabulous']/hostels_near_site.RatingCategory.value_counts().sum()*100}% were rated Fabulous")

> **The proportion of hostels rated fabulous is more near a historic site than in general.**

In [None]:
hostels_near_museum = tokyo_hostels_df[
    tokyo_hostels_df.Name.isin(tokyo_venues_df.HostelName[tokyo_venues_df.Category.str.contains('Museum', case=False)])]

In [None]:
hostels_near_museum.RatingCategory.value_counts()

In [None]:
# This selection matches venues whose category contains 'Museum', so the label
# must say so (it previously repeated the historic-site text).
print("Hostels located near a museum:")
# .get(..., 0) reports 0% instead of raising KeyError when none is 'Fabulous'.
fabulous_share = hostels_near_museum.RatingCategory.value_counts(normalize=True).get('Fabulous', 0) * 100
print(f"{fabulous_share}% were rated Fabulous")

> **The proportion of lowly rated hostels is noticeably higher near a museum than in general.**

<hr>
<br>


### 5.2 Clustering

Now, we will cluster the hostels twice: once based on the different rating parameters and once based on their neighborhood.

Consider the following case, that a traveller has selected a hostel, but he finds that it is full! What options does he have? He/She can either go for a hostel which is similar in rating and price to the one he/she has selected. Or he/she can consider hostels which boast similar neighborhood to the selected one.

We will help traveller in both cases

#### 5.2.1 Clustering based on rating parameters

In [None]:
X = tokyo_hostels_df[['StartPrice', 'DistanceFromCityCentre', 'OverallScore', 
                      'Atmosphere', 'Cleanliness', 'Facilities', 'Location', 
                      'Security', 'Staff', 'ValueForMoney']]

In [None]:
def get_inertia(n_clusters):
    """Fit K-Means with `n_clusters` clusters on the notebook-level feature
    matrix `X` and return the fitted model's `inertia_`."""
    model = KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        max_iter=15,
        random_state=8,
    ).fit(X)
    return model.inertia_

In [None]:
scores = [get_inertia(x) for x in range(2, 21)]

In [None]:
plt.figure(figsize=[10, 8])
sns.lineplot(x=range(2, 21), y=scores)
plt.title("K vs Error")
plt.xticks(range(2, 21))
plt.xlabel("K")
plt.ylabel("Error")

It is evident from the above plot that we have 6 clusters in our data. Let's retrain the model and plot the hostels on the map.

In [None]:
# Fix the seed (same value as in get_inertia): the text below refers to
# clusters by number, and unseeded K-Means numbers them differently on each run.
kmeans = KMeans(n_clusters=6, random_state=8)
kmeans.fit(X)

In [None]:
tokyo_hostels_df['RatingCluster'] = kmeans.labels_

In [None]:
tokyo_map = folium.Map(location=(35.689487, 139.691711), zoom_start=11)

# Nine evenly spaced rainbow colours. A cluster label c is drawn with
# rainbow[c-1], so label 0 wraps around to the last colour.
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, 9))]

# add markers to map
# Labels are read from the RatingCluster column rather than kmeans.labels_,
# because the name `kmeans` is rebound by the neighborhood clustering later on.
for lat, lng, name, cluster in zip(tokyo_hostels_df.Latitude, tokyo_hostels_df.Longitude,
                                   tokyo_hostels_df.Name, tokyo_hostels_df.RatingCluster):
    label = folium.Popup(str(name) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=4,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color='#FFF',
        fill_opacity=0.75,
        parse_html=False).add_to(tokyo_map)

tokyo_map

_**Examining Clusters**_

Now, we can examine each cluster and determine the discriminating venue categories that distinguish each cluster. But we won't be doing it for all. We'll compare first two clusters for the sake of brevity

##### Cluster 0

In [None]:
tokyo_hostels_df[tokyo_hostels_df.RatingCluster == 0].describe()

**Traits: Very High Cost. Close to City center. Very high cleanliness and security ratings**

##### Cluster 4

In [None]:
tokyo_hostels_df[tokyo_hostels_df.RatingCluster == 4].describe()

**Traits: High Cost, On avg 7km of City center. All ratings moderately high**

Now suppose that I want to stay at _**Retrometro Backpackers**_. However, when I go to book, it shows that it doesn't have any spots available! Fortunately, I can now use the cluster result to find a hostel similar to Retrometro Backpackers. Let's see what options do I have!

In [None]:
tokyo_hostels_df[tokyo_hostels_df.Name.str.contains("Retrometro")]

The RatingCluster column in the output above shows which cluster Retrometro Backpackers belongs to. Let's list the top-rated hostels in that same cluster!

In [None]:
# Look up Retrometro Backpackers' cluster instead of hard-coding a label, so
# that the query always follows the actual clustering result.
retrometro_cluster = tokyo_hostels_df.loc[
    tokyo_hostels_df.Name.str.contains("Retrometro"), 'RatingCluster'
].iloc[0]
tokyo_hostels_df[tokyo_hostels_df.RatingCluster == retrometro_cluster].sort_values(['OverallScore'], ascending=False)[:5]

- Fortunately I can book a bed at Hostel bedgasm! It is similarly priced and rated as Retrometro Backpackers!

- We can do the same for the other 5 clusters as well, but we are not doing that now.

<hr><br>

#### 5.2.2 Clustering based on neighborhood

Now, we will cluster the same hostels. But this time, we will not use ratings or price instead we will use the neighborhood data and see which hostels have similar surroundings. Some travellers focus more on the location of a place rather than the price or ratings. This can be useful for them.

We will do one-hot encoding for the Category column to convert them to features

In [None]:
# One indicator column per venue category
category_1hot = pd.get_dummies(tokyo_venues_df.Category)

# put the hostel name in front of the category columns
category_1hot.insert(0, 'HostelName', tokyo_venues_df.HostelName)
category_1hot.head()

In [None]:
category_1hot.shape

Next, let's group rows by hostel, taking the mean of the frequency of occurrence of each category.

In [None]:
category_1hot = category_1hot.groupby('HostelName').mean().reset_index()
category_1hot.head()

Let's see few Hostels along with the top 5 most common venues

In [None]:
num_top_venues = 5

for nbrhood in category_1hot.HostelName[:5]:
    print("---- "+nbrhood+" ----")
    temp = category_1hot[category_1hot.HostelName == nbrhood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

##### Let's put that into a pandas dataframe

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the `num_top_venues` largest entries of `row`.

    The first element of `row` is skipped (the caller passes rows whose first
    entry is the hostel name); the remaining entries are ranked in descending
    order and the index labels of the leading ones are returned as an array.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [None]:
indicators = ['st', 'nd', 'rd']
num_top_venues = 10

# create columns according to number of top venues:
# "1st", "2nd", "3rd", then "<n>th" for every later rank. The suffix is chosen
# with an explicit bounds check; the previous bare `except:` used an IndexError
# for control flow and would also have hidden any unrelated error.
columns = ['HostelName']
for rank in range(1, num_top_venues + 1):
    suffix = indicators[rank - 1] if rank <= len(indicators) else 'th'
    columns.append(f'{rank}{suffix} Most Common Venue')

# create a new dataframe with one row per hostel
nbr_venues_sorted_df = pd.DataFrame(columns=columns)
nbr_venues_sorted_df['HostelName'] = category_1hot.HostelName

# fill every row with that hostel's most frequent venue categories
for ind in range(category_1hot.shape[0]):
    nbr_venues_sorted_df.iloc[ind, 1:] = return_most_common_venues(category_1hot.iloc[ind, :], num_top_venues)

nbr_venues_sorted_df.head()

#### Clustering Neighborhoods

Let's cluster neighborhoods into 4 types based on venue categories.

In [None]:
kmeans = KMeans(n_clusters=4, init='k-means++', max_iter=15, random_state=8)
X = category_1hot.drop(['HostelName'], axis=1)
X.head()

In [None]:
kmeans.fit(X)

In [None]:
nbr_venues_sorted_df['Cluster'] = kmeans.labels_
nbr_venues_sorted_df.rename(columns={'HostelName': 'Name'}, inplace=True)

In [None]:
tokyo_hostels_df_merged = pd.merge(tokyo_hostels_df, nbr_venues_sorted_df, on='Name')
tokyo_hostels_df_merged.head()

__*Finally, let's visualize the resulting clusters*__

In [None]:
# create map
tokyo_map = folium.Map(location=(35.689487, 139.691711), zoom_start=11)

# Nine evenly spaced rainbow colours. A cluster label c is drawn with
# rainbow[c-1], so label 0 wraps around to the last colour.
# (The unused helper variables x, ys and markers_colors were removed.)
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, 9))]

# add markers to the map
for lat, lon, poi, cluster in zip(tokyo_hostels_df_merged['Latitude'], tokyo_hostels_df_merged['Longitude'], tokyo_hostels_df_merged['Name'], tokyo_hostels_df_merged['Cluster']):
    label = folium.Popup(str(poi) + ' (Cluster ' + str(cluster) + ')', parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=4,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color='#FFF',
        fill_opacity=0.7).add_to(tokyo_map)

In [None]:
tokyo_map

##### Examine Clusters

Now, we can examine each cluster and determine the discriminating venue categories that distinguish each cluster. Based on the defining categories, we can then assign a name to each cluster.

**Cluster 0**

In [None]:
tokyo_hostels_df_merged.loc[tokyo_hostels_df_merged['Cluster'] == 0, tokyo_hostels_df_merged.columns[[0] + list(range(18, tokyo_hostels_df_merged.shape[1]))]][:5]

In [None]:
tokyo_hostels_df_merged.loc[tokyo_hostels_df_merged['Cluster'] == 0]['1st Most Common Venue'].value_counts()

> **1st Cluster Properties: Restaurants and Coffee Shops**

In [None]:
tokyo_hostels_df_merged.loc[tokyo_hostels_df_merged['Cluster'] == 1, tokyo_hostels_df_merged.columns[[0] + list(range(18, tokyo_hostels_df_merged.shape[1]))]][:5]

In [None]:
tokyo_hostels_df_merged.loc[tokyo_hostels_df_merged['Cluster'] == 1]['1st Most Common Venue'].value_counts()

> **2nd Cluster Properties: Convenience Stores**

In [None]:
tokyo_hostels_df_merged.loc[tokyo_hostels_df_merged['Cluster'] == 2, tokyo_hostels_df_merged.columns[[0] + list(range(18, tokyo_hostels_df_merged.shape[1]))]][:5]

> **3rd Cluster Neighborhood: Baseball Field and Gym Pool**

In [None]:
tokyo_hostels_df_merged.loc[tokyo_hostels_df_merged['Cluster'] == 3, tokyo_hostels_df_merged.columns[[0] + list(range(18, tokyo_hostels_df_merged.shape[1]))]][:5]

> **4th Cluster Neighborhood: Bar and Restaurants**

## 6. Conclusion

We got a glimpse of the hostel scene in Tokyo and were able to find out some interesting insights which might be useful to travellers as well as people with business interests. Let's summarize our findings:

- Most hostels are located in Taito-ku and Chuo-ku.
- Sumida-ku seems to be an interesting locality since it is close to Taito-ku and 43% cheaper than Taito-ku.
- The starting price of hostels does not vary much depending on its distance from the city center.
- Most of the hostels rated high for their security are in Katsushika, Kita, Meguro, Shibuya and Shinagawa
- Proximity to a mode of transportation or a historic site positively affects the hostel rating.
- Hostels rated highly for being value for money are comparatively cheaper and are located away from the city center

In addition, we also clustered the hostels, first based on the different rating parameters, and second based on the neighborhood of the hostels. It can be useful for travellers to identify an alternate hostel.

There are many things which I have assumed while making the above claims since we were working with limited data. I'll try to expand the dataset for a more comprehensive study.

Until then, sayonara!