# Find Similar Clusters - Expedia Hotel dataset

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
import warnings

import sys

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt

import scipy.sparse as sparse
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix
import implicit
warnings.filterwarnings("ignore")

  import pandas.util.testing as tm


In [2]:
# Load the first 1,000,000 rows of the training set and, in the same chain,
# rename 'hotel_cluster' to 'item_id' and 'is_booking' to 'rating'.
df = (
    pd.read_csv('../data/hotel_data/train.csv', sep=',', nrows=1000000)
    .rename(columns={'hotel_cluster': 'item_id', 'is_booking': 'rating'})
)

### For each (user_id, item_id) pair, keep only the rows with the highest rating, then drop exact duplicate rows

In [3]:
# For every (user_id, item_id) pair keep only the rows that carry that pair's
# highest rating, then drop rows that are exact duplicates across all columns.
# The aggregation is named by string ('max') instead of passing the Python
# builtin, which recent pandas releases warn about.
max_rating = df.groupby(['user_id', 'item_id'])['rating'].transform('max')
df = (
    df.loc[df['rating'] == max_rating]
    .drop_duplicates(keep='first')
    .reset_index(drop=True)  # fresh 0..n-1 index without keeping the old one
)
# Number of distinct item ids left after filtering.
df['item_id'].nunique()

100

## Assumptions:


#### user_location_countries

1 Italy   66 USA   205 Canada  215 Mexico

#### hotel_countries

8 Mexico  /  50 USA /  70 UK /   198 Canada /  204 France

#### user_location_regions

174 California  246 Hawaii  348 New York state

Ontario 354 / 155, 135 Quebec, BC / 385 Alberta

#### user_location_cities

24103 Los Angeles  26232 San Francisco   48862 New York city
25315 Toronto
#### hotel_markets
397 Toronto

19 - London  / 27 - Paris / 110 - Cancún (Mexico) / 212-214 - Hawaii/  365 - Los Angeles/

628 - Las Vegas/  637 - Chicago / 675 - New York city / 701 - Miami  /1230 - San Francisco

# Findings:
# ***** **** *** **
#### The most common hotel_cluster for New York (hotel_market=675 ) is 56 => we assign Apartment to it.
#### The second most common hotel_cluster for New York (hotel_market=675 ) is 70 => we assign business hotels to it.

#### The most common hotel_cluster for Toronto (hotel_market=397 ) is 55 => we assign condo to it.
#### The second most common hotel_cluster for  Toronto (hotel_market=397 ) is 21 => we assign Private Vacation Homes to it.
#### One of the least common hotel_cluster for Toronto (hotel_market=397 ) is 39 => we assign bed and breakfast to it.
#### Among the least common hotel_clusters for Toronto (hotel_market=397) are 20 and 60 => we assign hostels to them.

#### The most common hotel_cluster for Los Angeles (hotel_market=365) is 4 => we add it to Private Vacation Homes.

#### The most common hotel_cluster for Paris (hotel_market=27 ) is 5 => we add it to Apartment.

#### The most common hotel_cluster for  Hawaii (hotel_market=212 ) is 0 => we assign beach resort to it. 

#### The most common hotel_cluster for  Las Vegas (hotel_market=628 ) is 1 => we assign casino hotel to it.

#### The most common hotel_cluster for  Cancún  (hotel_market=110 ) is 65 => we assign hotel resort to it.

#### The most common hotel_cluster for  Chicago  (hotel_market=637 ) is 95 => we assign motel to it.

#### The most common hotel_clusters for Paris (hotel_market=27) are 5 and 25 => hotel type still to be assigned (TODO: the type is missing from this note).

## Find the exact user_location_city with the help of their corresponding hotel_market

In [4]:
# Per (user_location_city, hotel_market) pair, summarise
# orig_destination_distance by its min, mean, max and count.
distaggs = df.groupby(
    ['user_location_city', 'hotel_market']
)['orig_destination_distance'].agg(['min', 'mean', 'max', 'count'])
# Order the pairs by ascending minimum distance.
dff = distaggs.sort_values('min')

In [5]:
# Rebuild the flat table from `distaggs` rather than resetting `dff` in place:
# re-running this cell, or running it after `dff` has been reassigned, then
# yields the same table instead of picking up an extra 'index' column.
dff = distaggs.sort_values(by='min').reset_index()
# Rows for hotel_market 399, in ascending order of minimum distance.
dff[dff['hotel_market'] == 399]

Unnamed: 0,user_location_city,hotel_market,min,mean,max,count
41,23672,399,0.0458,12.089609,17.8056,11
106,14703,399,0.1391,3.671688,23.0365,103
207,8613,399,0.2957,9.218880,11.0891,15
226,53078,399,0.3478,183.680181,2390.4014,53
420,6802,399,1.2078,3.794541,11.6608,17
...,...,...,...,...,...,...
144920,55104,399,,,,0
144984,55226,399,,,,0
145197,55529,399,,,,0
145815,56268,399,,,,0


In [6]:
# Toronto is user_location_city 25315: show the country and region codes
# recorded on its rows.
df.loc[
    df['user_location_city'] == 25315,
    ['user_location_country', 'user_location_region'],
]

Unnamed: 0,user_location_country,user_location_region
727,205,354
728,205,354
729,205,354
730,205,354
731,205,354
...,...,...
758838,205,354
758839,205,354
758840,205,354
758841,205,354


### Find the name of the hotel_market from its distance to New York (user_location_city = 48862)

In [7]:
# Distance and hotel_market for rows with user_location_city 48862 and
# hotel_country 50.
dff = df.loc[
    (df['user_location_city'] == 48862) & (df['hotel_country'] == 50),
    ['orig_destination_distance', 'hotel_market'],
]

In [8]:
# Keep rows whose distance lies strictly between 2200 and 2400.
dff_f = dff.loc[
    dff['orig_destination_distance'].gt(2200)
    & dff['orig_destination_distance'].lt(2400)
]
dff_f

Unnamed: 0,orig_destination_distance,hotel_market
0,2234.2641,628
1665,2234.1470,628
6523,2233.3839,628
8316,2233.6722,628
28567,2233.6105,628
...,...,...
762283,2234.3283,628
762338,2234.5569,628
762339,2233.6105,628
762379,2234.0320,628


In [9]:
# One row per hotel_market: keep the last occurrence of each market.
dff_f.loc[~dff_f.duplicated(subset='hotel_market', keep='last')]

Unnamed: 0,orig_destination_distance,hotel_market
184321,2234.4758,629
215262,2346.1168,631
282370,2233.5022,517
585720,2366.5085,367
606723,2399.7186,475
606727,2398.0597,663
649773,2373.3065,970
709224,2395.0902,351
762380,2233.3495,628


## Find the most common item_id (hotel cluster) in a city (hotel_market)

In [10]:
# Five most frequent item_id values among rows with hotel_market 397.
df.loc[df['hotel_market'] == 397, 'item_id'].value_counts().head()

#[['posa_continent','hotel_continent' ,'hotel_country','user_location_country' ,'hotel_market' ,'item_id' ]]

55    329
21    315
9     291
95    261
33    220
Name: item_id, dtype: int64

### Check information of hotel_market 
Added so far: hotel_market = 212, 213, 214; item_id = (0, 26, 34, 73)

In [11]:
# Hotel location columns for every row whose item_id is 95.
df.loc[
    df['item_id'] == 95,
    ['hotel_continent', 'hotel_country', 'hotel_market', 'item_id'],
]

Unnamed: 0,hotel_continent,hotel_country,hotel_market,item_id
62,2,50,680,95
63,2,50,680,95
64,2,50,680,95
130,6,105,29,95
427,2,50,368,95
...,...,...,...,...
761862,2,50,696,95
762166,2,50,637,95
762222,2,50,637,95
762326,2,50,743,95


In [12]:
# Distinct hotel_market values seen on rows with item_id 1.
df.loc[df['item_id'] == 1, 'hotel_market'].unique()

array([628, 623, 633])

# Find Similar Hotel clusters to the chosen cluster 

In [13]:
# Build two sparse rating matrices from the same (rating, item_id, user_id)
# triplets: one with items as rows and users as columns, and one with users
# as rows and items as columns.
# NOTE(review): csr_matrix sums entries that share the same (row, col). `df`
# can still hold several rows per (user_id, item_id) pair when they differ in
# other columns -- confirm that summing their ratings is intended.
#csr_matrix((data, (row, col))
sparse_item_user = sparse.csr_matrix((df['rating'].astype(float),(df['item_id'], df['user_id'])))
# sparse_user_item is not used again in the cells visible here.
sparse_user_item = sparse.csr_matrix((df['rating'].astype(float),(df['user_id'], df['item_id'])))


# ALS model with 20 factors, regularization 0.1 and 20 iterations.
# NOTE(review): no random seed is passed, so results may vary between runs -- TODO confirm.
model = implicit.als.AlternatingLeastSquares(factors=20,regularization=0.1,iterations=20)
# Scale the item-by-user ratings by alpha_val before fitting (the name
# `data_conf` suggests these are used as confidence weights -- TODO confirm).
alpha_val = 15
data_conf = (sparse_item_user * alpha_val).astype('double')
# NOTE(review): fit() receives the item-by-user matrix and the result of
# similar_items() is consumed below as (item, score) pairs; check that this
# matches the API of the installed implicit version.
model.fit(data_conf)

# Cluster whose neighbours we want to look up.
item_id = 25

# Ask for the 5 closest items; the displayed result includes item 25 itself.
n_similar = 5
similar = model.similar_items(item_id,n_similar)
similar



HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




[(25, 0.5224644),
 (9, 0.21302693),
 (64, 0.20654067),
 (97, 0.19879587),
 (72, 0.18624814)]

## Extract Top 5

In [14]:
# Collect the first element (the item id) of every entry in `similar`.
top5 = [entry[0] for entry in similar]

In [15]:
# Show the collected item ids in ascending order; `top5` itself is left unsorted.
sorted(top5) 

[9, 25, 64, 72, 97]

# Create dataframe to store clusters

In [16]:
# Lookup table with one row per item id 0..99 and a 'hotel_type' column that
# starts out empty.
hotel_df = (
    pd.DataFrame(columns=['item_id', 'hotel_type'])
    .assign(item_id=list(range(100)))
)

In [17]:
# Manual mapping from hotel type to the item ids (hotel clusters) assigned to it.
# Only 61 of the 100 ids appear here; ids that are not listed get no hotel type.
# The key for hostels is spelled "hostel" (it was previously misspelled), since
# the keys become the labels stored in the hotel_type column.
cluster = {
    "apartment": [5, 11, 22, 28, 41, 56, 73],
    "business_hotels": [64, 69, 70, 97],
    "condo": [3, 8, 36, 37, 55],
    "private_vacation_homes": [4, 9, 21, 49, 75, 77],
    "motel": [2, 25, 27, 95, 98],
    "beach_resort": [0, 17, 26, 31, 34, 80, 84, 92],
    "casino_hotel": [1, 19, 45, 54, 79, 89, 93],
    "hotel_resort": [52, 65, 66, 87, 96],
    "bed_n_breakfast": [23, 39, 50, 51, 76],
    "hostel": [12, 20, 38, 53, 57, 60, 61, 85, 86],
}

## Store on dataframe

In [18]:
# Write each hotel type onto the rows of its item ids. hotel_df has one row per
# id on a default integer index, so each id list selects rows by label.
# A single .loc assignment replaces the chained `hotel_df['hotel_type'][ids] = ...`
# form, which pandas warns about and which does not update the frame under
# copy-on-write; the warning filter that preceded the loop is therefore not needed.
for hotel_type, item_ids in cluster.items():
    hotel_df.loc[item_ids, 'hotel_type'] = hotel_type

In [19]:
# Display the item_id -> hotel_type table.
hotel_df

Unnamed: 0,item_id,hotel_type
0,0,beach_resort
1,1,casino_hotel
2,2,motel
3,3,condo
4,4,private_vacation_homes
...,...,...
95,95,motel
96,96,hotel_resort
97,97,business_hotels
98,98,motel


## Resources:

https://www.kaggle.com/dvasyukova/the-locations-puzzle