# Booking.com WSDM WebTour 2021 Challenge demo

In [1]:
import pandas as pd

## Load train set

In [3]:
# Read the training bookings (first CSV column becomes the index) and order
# them by trip, then by check-in, so each trip's legs are contiguous.
train_set = (
    pd.read_csv('booking_train_set.csv', index_col=[0])
    .sort_values(by=['utrip_id', 'checkin'])
)

In [4]:
# Preview the first rows of the sorted training data.
train_set.head()

Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id
542546,1000027,2016-08-13,2016-08-14,8183,desktop,7168,Elbonia,Gondal,1000027_1
542547,1000027,2016-08-14,2016-08-16,15626,desktop,7168,Elbonia,Gondal,1000027_1
542548,1000027,2016-08-16,2016-08-18,60902,desktop,7168,Elbonia,Gondal,1000027_1
542549,1000027,2016-08-18,2016-08-21,30628,desktop,253,Elbonia,Gondal,1000027_1
1061281,1000033,2016-04-09,2016-04-11,38677,mobile,359,Gondal,Cobra Island,1000033_1


## Load test set

In [5]:
# The test file stores check-in dates as DD/MM/YYYY text, so sorting the raw
# strings would order legs by day-of-month rather than chronologically.
# Sort on a parsed temporary column and drop it again, leaving the original
# columns (and their string dtype) untouched.
test_set = (
    pd.read_csv('sample_test_set.csv')
    .assign(_checkin_date=lambda df: pd.to_datetime(df['checkin'], format='%d/%m/%Y'))
    .sort_values(by=['utrip_id', '_checkin_date'])
    .drop(columns='_checkin_date')
)

In [6]:
# Preview the first rows of the sorted test data.
test_set.head()

Unnamed: 0,user_id,checkin,checkout,device_class,affiliate_id,booker_country,utrip_id,legs,row_num,total_rows,city_id,hotel_country
0,1000066,21/07/2016,23/07/2016,desktop,9924,Gondal,1000066_5,4,1,4,56430,Urkesh
1,1000066,23/07/2016,25/07/2016,desktop,9924,Gondal,1000066_5,4,2,4,11543,Urkesh
2,1000066,25/07/2016,28/07/2016,desktop,9924,Gondal,1000066_5,4,3,4,5797,Urkesh
3,1000066,28/07/2016,31/07/2016,mobile,2436,Gondal,1000066_5,4,4,4,0,
4,1000270,08/02/2016,09/02/2016,mobile,9452,The Devilfire Empire,1000270_5,4,1,4,50075,The Devilfire Empire


### Generate dummy predictions - use the top 4 cities in the train set as the benchmark recommendation

In [7]:
# The four most frequently booked cities in the training data, most frequent first.
city_counts = train_set['city_id'].value_counts()
topcities = city_counts.head(4).index
topcities

Int64Index([47499, 23921, 36063, 17013], dtype='int64')

In [8]:
# One row per distinct test trip, renumbered 0..n-1 so it lines up
# positionally with the prediction frame built next.
test_trips = (
    test_set[['utrip_id']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [9]:
# Give every test trip the same four benchmark cities, one city per column.
cities_prediction = pd.DataFrame(
    [topcities for _ in range(len(test_trips))],
    columns=['city_id_1', 'city_id_2', 'city_id_3', 'city_id_4'],
)

### Create Submission file according to the format

In [10]:
# Place the trip ids next to their four predicted cities (aligned on the row index).
submission = pd.concat(
    [test_trips, cities_prediction],
    axis='columns',
)
submission

Unnamed: 0,utrip_id,city_id_1,city_id_2,city_id_3,city_id_4
0,1000066_5,47499,23921,36063,17013
1,1000270_5,47499,23921,36063,17013
2,1000441_5,47499,23921,36063,17013
3,100048_5,47499,23921,36063,17013


In [11]:
# Write the submission without the row index so utrip_id is the first CSV column.
submission.to_csv(path_or_buf='submission.csv', index=False)

## Read submission file and ground truth

In [12]:
# Load both files with their first column as the index. For ground_truth this
# is utrip_id, which the evaluation join below matches against.
ground_truth = pd.read_csv(
    'sample_truth.csv',
    index_col=[0],
)
submission = pd.read_csv(
    'submission.csv',
    index_col=[0],
)

In [13]:
# Show the true final destination for each test trip.
ground_truth

Unnamed: 0_level_0,city_id,hotel_country
utrip_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1000066_5,41971,Urkesh
1000270_5,23921,Cobra Island
1000441_5,50457,Osterlich
100048_5,17013,Borginia


## Evaluate - use accuracy at 4 to evaluate the prediction

In [14]:
def evaluate_accuracy_at_4(submission, ground_truth):
    '''Return the share of submitted trips whose true city is among the four predictions.

    Parameters
    ----------
    submission : pandas.DataFrame
        One row per trip with columns ``city_id_1`` .. ``city_id_4``.
        ``utrip_id`` must be available either as a column or as the name
        of the index.
    ground_truth : pandas.DataFrame
        Indexed by ``utrip_id``, with the true destination in column ``city_id``.

    Returns
    -------
    float
        Fraction of rows in ``submission`` that are hits, or ``nan`` when
        ``submission`` has no rows.

    Notes
    -----
    The join is a left join from ``submission``: trips that appear only in
    ``ground_truth`` are not scored, and submitted trips without a ground
    truth row get a missing ``city_id`` and therefore count as misses.
    '''
    prediction_columns = ['city_id_1', 'city_id_2', 'city_id_3', 'city_id_4']
    data_to_eval = submission.join(ground_truth, on='utrip_id')

    # Nothing to score: return a scalar instead of averaging an empty result.
    if data_to_eval.empty:
        return float('nan')

    # Compare all four predictions against the truth in one vectorized step
    # rather than a Python-level row-wise apply.
    predicted = data_to_eval[prediction_columns].values
    actual = data_to_eval['city_id'].values
    hits = (predicted == actual[:, None]).any(axis=1)
    return hits.mean()

In [15]:
# Score the benchmark submission against the sample ground truth.
evaluate_accuracy_at_4(submission,ground_truth)

0.5