# Expedia Hotel Recommendations
### — Which hotel type will an Expedia customer book?

## 1. Loading helpful packages

In [1]:
#data analysis and wrangling
import numpy as np
import pandas as pd
from subprocess import check_output

#visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## 2. Acquire data

In [2]:
# Stream the training file in 1,000,000-row chunks, loading only the three
# columns used below, so the whole file is never held in memory at once.
train = pd.read_csv(
    '../kaggle/train.csv',
    usecols=['srch_destination_id', 'is_booking', 'hotel_cluster'],
    dtype={
        'is_booking': bool,
        'srch_destination_id': np.int32,
        'hotel_cluster': np.int32,
    },
    chunksize=1000000,
)

aggs = []
print('-' * 38)  # 38-character ruler; the progress dots are printed on the next line
for chunk in train:
    # Per chunk and per (destination, cluster) pair:
    #   sum   -> number of rows with is_booking == True
    #   count -> number of rows of any kind
    agg = (
        chunk.groupby(['srch_destination_id', 'hotel_cluster'])['is_booking']
        .agg(['sum', 'count'])
        .reset_index()
    )
    aggs.append(agg)
    print('.', end='')  # one dot per processed chunk
print('')

# Stack the per-chunk summaries; a pair may still appear once per chunk here.
aggs = pd.concat(aggs, axis=0)
aggs.head()

--------------------------------------
......................................


Unnamed: 0,srch_destination_id,hotel_cluster,sum,count
0,1,20,0.0,2
1,1,30,0.0,1
2,1,60,0.0,2
3,4,22,1.0,2
4,4,25,1.0,2


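srch_destination_id : ID of the destination where the hotel search was performed<br>
hotel_cluster : ID of a hotel cluster<br>
sum : the number of bookings (rows with is_booking = 1) for this destination/cluster pair within a chunk<br>
count : the total number of rows (clicks and bookings) for this destination/cluster pair within a chunk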



In [3]:
# Weight of a single click relative to a single booking in the relevance score
CLICK_WEIGHT = 0.05

# Add up the per-chunk partial results so that each (destination, cluster)
# pair ends up on exactly one row.
agg = (
    aggs.groupby(['srch_destination_id', 'hotel_cluster'])
    .sum()
    .reset_index()
    .rename(columns={'sum': 'bookings', 'count': 'clicks'})
)

# At this point 'clicks' still holds the total row count (clicks + bookings);
# subtracting the bookings leaves the click-only rows.
agg['clicks'] = agg['clicks'] - agg['bookings']

# relevance = bookings + CLICK_WEIGHT * clicks
agg['relevance'] = agg['bookings'] + CLICK_WEIGHT * agg['clicks']
agg.head()

Unnamed: 0,srch_destination_id,hotel_cluster,bookings,clicks,relevance
0,0,3,0.0,2.0,0.1
1,1,20,4.0,22.0,5.1
2,1,30,2.0,20.0,3.0
3,1,57,0.0,1.0,0.05
4,1,60,0.0,17.0,0.85


## 3. Find most popular hotel clusters by destination

In [28]:
#  Define a function to get most popular hotels for a destination group
def most_popular(group, n_max=5):
    """Return the most relevant hotel clusters of one group as a string.

    Parameters
    ----------
    group : DataFrame
        Rows of a single destination; must provide the columns
        'relevance' and 'hotel_cluster'.
    n_max : int, default 5
        Maximum number of hotel clusters to return.

    Returns
    -------
    str
        Up to ``n_max`` hotel cluster ids ordered by descending relevance,
        separated by exactly one space (empty string for an empty group).
    """
    relevance = group['relevance'].values
    hotel_cluster = group['hotel_cluster'].values
    # argsort is ascending, so reverse it to get the highest relevance first.
    top_clusters = hotel_cluster[np.argsort(relevance)[::-1]][:n_max]
    # Join the ids explicitly instead of slicing np.array_str(): numpy pads
    # every element to a common width (leading/double spaces for ids with
    # different digit counts) and wraps long arrays with newlines.
    return ' '.join(str(cluster) for cluster in top_clusters.tolist())

In [29]:
# Build the top-cluster string for every destination; the result is a
# one-column frame ('hotel_cluster') indexed by srch_destination_id.
most_pop = (
    pd.DataFrame(agg.groupby(['srch_destination_id']).apply(most_popular))
    .rename(columns={0: 'hotel_cluster'})
)
most_pop.head()

Unnamed: 0_level_0,hotel_cluster
srch_destination_id,Unnamed: 1_level_1
0,3
1,20 30 60 57
2,20 30 53 46 41
3,53 60
4,82 25 32 58 78


## 4. Predict for test data

In [39]:
# Load the test set, keeping only the destination id of each row
test = pd.read_csv(
    '../kaggle/test.csv',
    usecols=['srch_destination_id'],
    dtype={'srch_destination_id': np.int32},
)

In [40]:
# Attach each destination's top-cluster string to the test rows.
# The left join keeps every test row; destinations that are missing from
# most_pop end up with NaN in 'hotel_cluster'.
test = pd.merge(
    test,
    most_pop,
    how='left',
    left_on='srch_destination_id',
    right_index=True,
)
test.head()

Unnamed: 0,srch_destination_id,hotel_cluster
0,12243,5 55 37 11 22
1,14474,5
2,11353,0 31 77 91 96
3,8250,1 45 79 24 54
4,11812,91 42 2 48 59


In [41]:
# Count the test rows that received no prediction from the merge
test['hotel_cluster'].isna().sum()

14036

About 14K test rows have a destination that never appears in the training data.<br>
These NAs need to be filled with the hotel clusters that are most popular overall.

In [42]:
# Make most popular hotel clusters over all destinations:
# sum the relevance per hotel cluster and keep the five largest totals.
# The ids are joined explicitly with single spaces rather than slicing
# np.array_str(), which pads elements to a common width (double spaces as
# soon as a one-digit cluster id is among the top five).
most_pop_all = ' '.join(
    str(cluster)
    for cluster in agg.groupby('hotel_cluster')['relevance'].sum().nlargest(5).index.tolist()
)
most_pop_all

'91 48 42 59 28'

In [44]:
#  Fill nas with most popular hotel clusters over all
# Assign the filled column back to the frame: calling fillna(inplace=True) on
# the attribute access `test.hotel_cluster` operates on an intermediate Series
# and does not reliably update `test` (it is a no-op under pandas Copy-on-Write).
test['hotel_cluster'] = test['hotel_cluster'].fillna(most_pop_all)
test.head()


Unnamed: 0,srch_destination_id,hotel_cluster
0,12243,5 55 37 11 22
1,14474,5
2,11353,0 31 77 91 96
3,8250,1 45 79 24 54
4,11812,91 42 2 48 59


##  5. Save submission

In [45]:
# Write the predictions to disk; the frame's row index becomes the 'id' column
test['hotel_cluster'].to_csv('submission_20170326.csv', index_label='id', header=True)