### imports

In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

import matplotlib.pyplot as plt
import pickle

def create_submission(test_data, property_predicted_score):
    !mv submission.csv.zip prev.submission.zip
    test_data['raiting'] = property_predicted_score
    submition_data = test_data[['srch_id','prop_id','raiting']]
    submition_data = submition_data.sort_values(by=['srch_id', 'raiting'], ascending=[True,  False])
    submition_data = submition_data.drop(columns="raiting")
    submition_data.to_csv('submission.csv', index=False)
    !zip submission.csv.zip  submission.csv
    !rm submission.csv

### load training and testing sets

In [2]:
#if loaded train file exists
try:
    with open('data/train-data.pickle', 'rb') as handle:
        train = pickle.load(handle)
#if not load and save
except:
    !unzip data/training_set_VU_DM.csv.zip
    train = pd.read_csv('training_set_VU_DM.csv')
    !rm training_set_VU_DM.csv
    with open('data/train-data.pickle', 'wb') as handle:
        pickle.dump(train, handle)

#if loaded test file exists
try:
    with open('data/test-data.pickle', 'rb') as handle:
        test = pickle.load(handle)
#if not load and save
except:
    !unzip data/test_set_VU_DM.csv.zip
    test = pd.read_csv('test_set_VU_DM.csv')
    !rm test_set_VU_DM.csv
    with open('data/test-data.pickle', 'wb') as handle:
        pickle.dump(test, handle)

### Functions

In [31]:
def discountedCumulativeGain(result, k=5):
    """
    Evaluated per query
    taken from 
    https://towardsdatascience.com/normalized-discounted-cumulative-gain-37e6f75090e9
    """
    dcg = []
    for idx, val in enumerate(result[0:k]): 
        numerator = val #2**val - 1
        # add 2 because python 0-index
        denominator =  np.log2(idx + 2) 
        score = numerator/denominator
        dcg.append(score)
    return sum(dcg)


def NDCG_at_k(X, ranking, ideal_ranking, k=5):
    #create df with a querry rating and ideal rating
    df = X[['srch_id']]
    df.loc[: ,('ranking')] = ranking
    df.loc[: ,('true_ranking')] = ideal_ranking
    
    df = df.sort_values(by=['srch_id', 'true_ranking'], ascending=[True,  False])
    INDCG  = df.groupby('srch_id').agg\
    (lambda x: discountedCumulativeGain(x, k))
    x_true = INDCG[INDCG['true_ranking']!=0]

    df = df.sort_values(by=['srch_id', 'ranking'], ascending=[True,  False])
    NDCG  = df.groupby('srch_id').agg\
    (lambda x: discountedCumulativeGain(x, k))
    x = NDCG[NDCG['true_ranking']!=0] # true ranking will be alwas zero if no booking/click was made

    x = x['true_ranking']/x_true['true_ranking'] # true ranking has the information regarding actual booking
    print(f'ndcg_@{k} {x.mean()}')

    return x.mean()
    
    
def target_function(data_frame):
    return data_frame['click_bool'] + data_frame['booking_bool']*4 #if booked then clicked booked = 5


def hotel_booking_likelihood(data_frame, df_out):
    lklhd = data_frame.groupby('prop_id').sum().reset_index()
    lklhd = lklhd[['prop_id','booking_bool']]
    lklhd = lklhd.rename(columns={"booking_bool": "lklhd"})
    lklhd['lklhd'] = lklhd['lklhd']/data_frame['booking_bool'].sum()
    data_frame = data_frame.merge(lklhd, left_on='prop_id', right_on='prop_id')
    df_out = df_out.merge(lklhd, left_on='prop_id', right_on='prop_id')
    df_out.loc[df_out['lklhd'].isnull(),'lklhd'] = 0

    return data_frame, df_out

def features_engeneering(data_frame):
    #fill missing property review score by median over whole data
    data_frame.loc[data_frame['prop_review_score'].isnull(),'prop_review_score'] = data_frame['prop_review_score'].median()

    #fill missing prop_log_historical_price  by country id median
    mask_log_0 = data_frame['prop_log_historical_price'] == 0
    data_frame[['historical_price']] = data_frame[['prop_log_historical_price']].applymap(np.exp)
    data_frame.loc[mask_log_0, 'historical_price'] = data_frame.loc[mask_log_0, 'price_usd']

    price_per_country_median = data_frame.groupby('prop_country_id').median().reset_index()
    price_per_country_median = price_per_country_median[['prop_country_id','price_usd']]
    price_per_country_median = price_per_country_median.rename(columns={"price_usd": "price_per_country_median"})
    data_frame = data_frame.merge(price_per_country_median, left_on='prop_country_id', right_on='prop_country_id')
    # apply median to missing values in price_usd
    mask = data_frame['price_usd']==0
    data_frame.loc[mask, 'price_usd'] = data_frame.loc[mask, 'price_per_country_median']
    mask = data_frame['historical_price']==0
    data_frame.loc[mask, 'historical_price'] = data_frame.loc[mask, 'price_per_country_median']

    # normalization
    #data_frame['price_usd'] = data_frame['price_usd']/data_frame['price_per_country_median']
    #data_frame['historical_price'] = data_frame['historical_price']/data_frame['price_per_country_median']

    # normalise by number of persons
    number_of_person = (data_frame['srch_adults_count'] + data_frame['srch_children_count']/2)
    data_frame['price_per_person_per_night'] = data_frame['price_usd']/number_of_person/data_frame['srch_length_of_stay']
        
    #
    #data standartization
    
    #for feature in ['prop_starrating','prop_location_score1']:
    #    x = data_frame[feature].values
    #    x = x.reshape(-1,1)
    #    data_frame[feature] = normalize(x)
    
    
    return data_frame

def undersample(data_frame):
    mask = data_frame['click_bool']==False
    average_non_booked = data_frame[mask].groupby('srch_id').mean().reset_index()
    
    data_frame = data_frame.drop(data_frame[mask].index)
    data_frame = data_frame.reset_index()

    data_frame = data_frame.append(average_non_booked)
    data_frame = data_frame.sort_values(by=['srch_id'], ascending=[True])
    
    #Fixing boolean values
    for feature in ['random_bool', 'prop_brand_bool','promotion_flag','srch_saturday_night_bool' ]:
        data_frame.loc[data_frame[feature]<0.5, feature] = False
        data_frame.loc[data_frame[feature]>=0.5, feature] = True
    return data_frame


### Total columns

In [4]:
len(train)

4958347

### Total null columns

In [5]:
print(train.isnull().sum())


srch_id                              0
date_time                            0
site_id                              0
visitor_location_country_id          0
visitor_hist_starrating        4706481
visitor_hist_adr_usd           4705359
prop_country_id                      0
prop_id                              0
prop_starrating                      0
prop_review_score                 7364
prop_brand_bool                      0
prop_location_score1                 0
prop_location_score2           1090348
prop_log_historical_price            0
position                             0
price_usd                            0
promotion_flag                       0
srch_destination_id                  0
srch_length_of_stay                  0
srch_booking_window                  0
srch_adults_count                    0
srch_children_count                  0
srch_room_count                      0
srch_saturday_night_bool             0
srch_query_affinity_score      4640941
orig_destination_distance

### Amount of hotels in data sets

In [6]:
x = test['prop_id']
print(f'Amount of hotels in test data {len(set(x))} ')
y = train['prop_id']
print(f'Amount of hotels in train data {len(set(y))}')
print(f'Amount of uniqe hotels in train and not in test {len(set(x) - set(y))}')

Amount of hotels in test data 129438 
Amount of hotels in train data 129113
Amount of uniqe hotels in train and not in test 7773


In [7]:
train = features_engeneering(train)
test = features_engeneering(test)
df = train.dropna(axis=1)
del train

a = set(df.columns)
b = set(test.columns)
print(f'columns not in both sets {a-b}')

columns not in both sets {'booking_bool', 'position', 'click_bool'}


### Choose features to fit

In [28]:
#  the columns after droping
#['srch_id', 'date_time', 'site_id', 'visitor_location_country_id',
#       'prop_country_id', 'prop_id', 'prop_starrating', 'prop_review_score',
#       'prop_brand_bool', 'prop_location_score1', 'prop_log_historical_price',
#       'position', 'price_usd', 'promotion_flag', 'srch_destination_id',
#       'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count',
#       'srch_children_count', 'srch_room_count', 'srch_saturday_night_bool',
#       'random_bool', 'click_bool', 'booking_bool', 'price_per_person_per_night']
 

features_to_choose = ['prop_id', 'prop_review_score',  
                      'prop_starrating', 
                      'prop_brand_bool', 
                      'prop_location_score1',
                      'srch_booking_window',
                      'prop_log_historical_price', #
                      'promotion_flag', 'srch_destination_id',
                      'srch_saturday_night_bool', 
                      'random_bool', 'lklhd', 
                      'price_per_person_per_night']

In [9]:
#from mlxtend.frequent_patterns import apriori
#from mlxtend.frequent_patterns import association_rules
#frequent_itemsets= apriori(df[['promotion_flag','srch_saturday_night_bool', 'random_bool', 'click_bool', 'booking_bool']], min_support=0.07, use_colnames=True)
#rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
#rules = rules.sort_values(by=[ 'support', 'confidence',], ascending=False)

#rules[['antecedents', 'consequents','support','confidence']].iloc[0:10]


### Test and Evaluate

In [32]:
from sklearn import neighbors, linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Fit regression model
#model = linear_model.LinearRegression()
#n_neighbors = 10
#model = neighbors.KNeighborsRegressor(n_neighbors)
#model= DecisionTreeRegressor(random_state=0)
model = GradientBoostingRegressor(random_state=0)

########        EVALUATION           ###############
ind = int(len(df)*0.1)
X_train = df.iloc[0:ind, :]
X_test = df.iloc[ind:ind*2, :]

X_train, X_test = hotel_booking_likelihood(X_train, X_test)

start = time.time()

#X_train = undersample(X_train)
model.fit(X_train[features_to_choose], target_function(X_train))
end = time.time()
print("The time of evaluation fit:", end-start)

Y_evaluation = model.predict(X_test[features_to_choose])
Y_ideal = target_function(X_test)

ndcg_score = NDCG_at_k(X_test, ranking=Y_evaluation, ideal_ranking=Y_ideal, k=5)
# in submissioned version was 0.25193  that was evaluated here as 0.6472178554938205
#test ndcg function
#ndcg_score = NDCG_at_k(X_test, ranking=Y_ideal, ideal_ranking=Y_ideal, k=5)
#assert ndcg_score == 1, "Houston we've got a problem, Ideal ranking is not 1"


The time of evaluation fit: 55.958049297332764


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value


ndcg_@5 0.6085351431789754


In [33]:
X_test

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,...,srch_children_count,srch_room_count,srch_saturday_night_bool,random_bool,click_bool,booking_bool,historical_price,price_per_country_median,price_per_person_per_night,lklhd
0,53855,2013-01-24 15:02:48,24,92,219,92698,4,4.0,0,4.63,...,0,1,0,1,0,0,278.662118,119.0,10.390667,0.000576
1,53874,2013-02-28 22:28:52,14,100,219,92698,4,4.0,0,4.63,...,2,1,0,0,0,0,278.662118,119.0,7.808571,0.000576
2,54062,2013-04-22 21:05:06,24,216,219,92698,4,4.0,0,4.63,...,0,1,0,0,0,0,273.144238,119.0,23.712500,0.000576
3,54315,2013-05-10 19:52:49,14,100,219,92698,4,4.0,0,4.63,...,0,1,0,0,0,0,273.144238,119.0,11.468333,0.000576
4,54421,2013-06-05 20:20:34,14,100,219,92698,4,4.0,0,4.63,...,1,1,0,0,0,0,173.030000,119.0,69.212000,0.000576
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
491854,107917,2013-04-22 20:29:22,5,219,219,62453,2,3.5,1,1.61,...,0,1,0,1,0,0,93.690800,119.0,71.000000,0.000000
491855,107917,2013-04-22 20:29:22,5,219,219,115531,3,5.0,1,1.10,...,0,1,0,1,0,0,149.904736,119.0,104.000000,0.000000
491856,107959,2013-01-03 09:16:23,5,219,219,11559,2,2.0,1,0.69,...,0,1,1,0,0,0,138.379512,119.0,40.000000,0.000000
491857,107992,2013-03-07 16:08:01,5,219,219,32064,3,5.0,0,0.00,...,2,1,0,1,0,0,92.758561,119.0,5.375000,0.000000


### Fit and submit

In [11]:
start = time.time()
model.fit(df[features_to_choose], target_function(df))
end = time.time()
print("The time of final fit:", end-start)
raitings = model.predict(test[features_to_choose])
create_submission(test, raitings)

KeyboardInterrupt: 