### imports

In [None]:
import numpy as np
import pandas as pd 
import seaborn as sns
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

import matplotlib.pyplot as plt
import pickle

def create_submission(test_data, property_predicted_score):
    """Write the ranked submission to ``submission.csv.zip``.

    The archive contains a single ``submission.csv`` with the columns
    ``srch_id`` and ``prop_id``, ordered by ``srch_id`` ascending and, within
    each search, by predicted score descending.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Must contain the columns ``srch_id`` and ``prop_id``. It is not
        modified.
    property_predicted_score : array-like
        One score per row of ``test_data``; a higher score ranks the property
        earlier within its search.

    Side effects
    ------------
    An existing ``submission.csv.zip`` is moved to ``prev.submission.zip``
    (replacing any older backup) before the new archive is written. The
    intermediate ``submission.csv`` is removed afterwards.
    """
    # Function-scope imports keep the cell self-contained; using os/zipfile
    # instead of `!mv`/`!zip`/`!rm` lets this run outside IPython and on
    # systems without those command-line tools.
    import os
    import zipfile

    csv_name = 'submission.csv'
    archive_name = 'submission.csv.zip'

    # Only back up when there is something to back up (a plain `mv` would
    # print an error on the very first run).
    if os.path.exists(archive_name):
        os.replace(archive_name, 'prev.submission.zip')

    # assign() works on a copy, so the caller's frame does not gain a
    # temporary score column.
    ranked = test_data[['srch_id', 'prop_id']].assign(raiting=property_predicted_score)
    ranked = ranked.sort_values(by=['srch_id', 'raiting'], ascending=[True, False])
    ranked[['srch_id', 'prop_id']].to_csv(csv_name, index=False)

    try:
        with zipfile.ZipFile(archive_name, 'w', compression=zipfile.ZIP_DEFLATED) as archive:
            archive.write(csv_name)
    finally:
        # Never leave the uncompressed CSV behind, even if zipping failed.
        os.remove(csv_name)

### load training and testing sets

In [None]:
#if loaded train file exists
try:
    with open('data/train-data.pickle', 'rb') as handle:
        train = pickle.load(handle)
#if not load and save
except:
    !unzip data/training_set_VU_DM.csv.zip
    train = pd.read_csv('training_set_VU_DM.csv')
    !rm training_set_VU_DM.csv
    with open('data/train-data.pickle', 'wb') as handle:
        pickle.dump(train, handle)

#if loaded test file exists
try:
    with open('data/test-data.pickle', 'rb') as handle:
        test = pickle.load(handle)
#if not load and save
except:
    !unzip data/test_set_VU_DM.csv.zip
    test = pd.read_csv('test_set_VU_DM.csv')
    !rm test_set_VU_DM.csv
    with open('data/test-data.pickle', 'wb') as handle:
        pickle.dump(test, handle)

### Functions

In [None]:
def discountedCumulativeGain(result, k=5):
    """
    Discounted cumulative gain of one query, truncated at position k.

    `result` holds the relevance grades in ranked order (best-ranked first).
    Each of the first `k` grades contributes (2**grade - 1) / log2(rank + 1),
    where rank starts at 1. An empty `result` yields 0.

    Formula taken from
    https://towardsdatascience.com/normalized-discounted-cumulative-gain-37e6f75090e9
    """
    top_k = result[0:k]
    # Enumerating from 2 gives log2(rank + 1) directly for 1-based ranks.
    return sum(
        (2**grade - 1) / np.log2(log_argument)
        for log_argument, grade in enumerate(top_k, start=2)
    )


def NDCG_at_k(X, ranking, ideal_ranking, k=5):
    """Print and return the mean NDCG@k over all queries with a relevant item.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the column ``srch_id``, which identifies the query each
        row belongs to.
    ranking : array-like
        Predicted score per row of ``X``; higher scores are ranked first.
    ideal_ranking : array-like
        True relevance grade per row of ``X``.
    k : int, default 5
        Cut-off position passed to ``discountedCumulativeGain``.

    Returns
    -------
    float
        Mean over queries of DCG@k (true grades in predicted order) divided
        by the ideal DCG@k (true grades in their best order). Queries whose
        ideal DCG is 0 are skipped because the ratio is undefined. A query
        whose relevant rows all fall outside the predicted top-k contributes
        0 rather than being silently dropped.
    """
    # Explicit copy: avoids pandas' SettingWithCopyWarning when adding columns.
    scored = X[['srch_id']].copy()
    scored['ranking'] = ranking
    scored['true_ranking'] = ideal_ranking

    def dcg_per_query(order_by):
        # Order the rows of every query by `order_by` (best first), then take
        # the DCG of the *true* grades in that order.
        ordered = scored.sort_values(by=['srch_id', order_by], ascending=[True, False])
        return ordered.groupby('srch_id')['true_ranking'].agg(
            lambda grades: discountedCumulativeGain(grades.to_numpy(), k)
        )

    ideal_dcg = dcg_per_query('true_ranking')
    achieved_dcg = dcg_per_query('ranking')

    # The ideal DCG is 0 exactly when every grade in the query's top-k is 0;
    # only those queries are excluded. The mask comes from the ideal DCG for
    # both series so numerator and denominator always cover the same queries.
    has_relevant_item = ideal_dcg != 0
    x = achieved_dcg[has_relevant_item] / ideal_dcg[has_relevant_item]
    print(f'ndcg_@{k} {x.mean()}')

    return x.mean()
    
    
def target_function(data_frame):
    """Return the relevance grade per row: click_bool + 4 * booking_bool.

    A row with both flags set (clicked and booked) therefore scores 5, a row
    with only click_bool set scores 1, and a row with neither scores 0.
    """
    clicked = data_frame['click_bool']
    booked = data_frame['booking_bool']
    booking_bonus = booked * 4
    return clicked + booking_bonus

def features_engeneering(data_frame):
    """Return a new frame with price features added and gaps imputed.

    Reads the columns ``prop_country_id``, ``price_usd``,
    ``prop_log_historical_price``, ``srch_adults_count``,
    ``srch_children_count``, ``srch_length_of_stay`` and
    ``prop_review_score``. The input frame is not modified.

    Added columns
    -------------
    historical_price
        exp(prop_log_historical_price). Where the stored log is exactly 0 the
        row's original ``price_usd`` is used instead (presumably 0 marks
        "no history" - confirm against the dataset description). Any value
        that is still 0 is replaced by the country median.
    price_per_country_median
        Median ``price_usd`` of all rows sharing the row's
        ``prop_country_id``, computed before zero prices are replaced.
    price_per_person_per_night
        ``price_usd`` / (adults + children / 2) / ``srch_length_of_stay``.

    Changed columns
    ---------------
    price_usd
        Values equal to 0 are replaced by the country median.
    prop_review_score
        Missing values are replaced by the column median.

    The result comes from an inner merge on ``prop_country_id``, so it has a
    fresh 0..n-1 index.
    """
    # Aggregate only the price column: taking the median of every column is
    # wasteful and fails on non-numeric columns (e.g. date_time) in
    # pandas >= 2.0.
    price_per_country_median = (
        data_frame.groupby('prop_country_id')['price_usd']
        .median()
        .rename('price_per_country_median')
        .reset_index()
    )
    # merge() returns a new frame, so every assignment below leaves the
    # caller's frame untouched.
    data_frame = data_frame.merge(price_per_country_median, on='prop_country_id')
    country_median = data_frame['price_per_country_median']

    # Undo the log transform; must use price_usd *before* its zeros are filled.
    log_price = data_frame['prop_log_historical_price']
    historical_price = np.exp(log_price).mask(log_price == 0, data_frame['price_usd'])

    # Replace zero prices with the country median.
    data_frame['price_usd'] = data_frame['price_usd'].mask(
        data_frame['price_usd'] == 0, country_median
    )
    historical_price = historical_price.mask(historical_price == 0, country_median)
    # Insert just before the median column to keep the historical column order.
    data_frame.insert(
        data_frame.columns.get_loc('price_per_country_median'),
        'historical_price',
        historical_price,
    )

    # normalization (disabled experiment)
    #data_frame['price_usd'] = data_frame['price_usd']/data_frame['price_per_country_median']
    #data_frame['historical_price'] = data_frame['historical_price']/data_frame['price_per_country_median']

    # Normalise the price by party size (a child counts as half a person)
    # and by the number of nights.
    number_of_person = data_frame['srch_adults_count'] + data_frame['srch_children_count'] / 2
    data_frame['price_per_person_per_night'] = (
        data_frame['price_usd'] / number_of_person / data_frame['srch_length_of_stay']
    )

    review_score = data_frame['prop_review_score']
    data_frame['prop_review_score'] = review_score.fillna(review_score.median())

    # data standardization (disabled experiment)
    #for feature in ['prop_starrating','prop_location_score1']:
    #    x = data_frame[feature].values
    #    x = x.reshape(-1,1)
    #    data_frame[feature] = normalize(x)

    return data_frame

def undersample(data_frame):
    """Collapse the non-clicked rows of every search into one averaged row.

    Rows whose ``click_bool`` is non-zero are kept as they are. For each
    ``srch_id`` the remaining rows are replaced by a single row holding the
    mean of their numeric columns; non-numeric columns are missing (NaN) on
    those averaged rows.

    The result is sorted by ``srch_id`` with kept rows ahead of the averaged
    row of the same search. As before, the kept rows carry their original row
    labels in an extra ``index`` column (NaN on averaged rows) and the result
    index is not unique. The input frame is not modified.

    The flag columns ``random_bool``, ``prop_brand_bool``, ``promotion_flag``
    and ``srch_saturday_night_bool`` are snapped back to 0/1 after averaging
    (values below 0.5 become 0, values of 0.5 or more become 1).
    """
    not_clicked = data_frame['click_bool'] == 0
    # numeric_only=True: a mean over text/date columns raises on pandas >= 2.0.
    averaged_not_clicked = (
        data_frame[not_clicked].groupby('srch_id').mean(numeric_only=True).reset_index()
    )

    #average_non_booked = input_data_frame[mask].sample(221879)

    clicked = data_frame[~not_clicked].reset_index()
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    data_frame = pd.concat([clicked, averaged_not_clicked])
    # A stable sort keeps the within-search order deterministic.
    data_frame = data_frame.sort_values(by=['srch_id'], ascending=[True], kind='mergesort')

    # Averaging turns 0/1 flags into fractions; round them back to 0/1.
    # Numeric 0.0/1.0 is written instead of False/True so the column dtype
    # stays numeric on every pandas version. NaN matches neither comparison
    # and is left as is.
    for feature in ['random_bool', 'prop_brand_bool', 'promotion_flag', 'srch_saturday_night_bool']:
        column = data_frame[feature]
        data_frame[feature] = column.mask(column < 0.5, 0.0).mask(column >= 0.5, 1.0)
    return data_frame


### Total rows

In [None]:
# Number of rows in the training set.
train.shape[0]

### Null values per column

In [None]:
# Count the missing values in every column of the training set.
missing_per_column = train.isna().sum()
print(missing_per_column)


In [None]:
# Fill missing review scores with the column median first, so that
# prop_review_score survives the column-wise dropna below.
train.loc[train['prop_review_score'].isnull(),'prop_review_score'] = train['prop_review_score'].median()

# Keep only the columns that have no missing values.
df = train.dropna(axis=1)
# (Removed: a bare `df[df['click_bool']==True]` expression that built a
# filtered copy of the frame and discarded it.)
del train
a = set(df.columns)
b = set(test.columns)
# a-b is the set of columns present in the training frame but absent from
# the test set, so the message says exactly that.
print(f'columns only in the training set {a-b}')

### split data to train and test

In [None]:
df = features_engeneering(df)
#df = undersample(df)

test = features_engeneering(test)

#  the columns after droping
#['srch_id', 'date_time', 'site_id', 'visitor_location_country_id',
#       'prop_country_id', 'prop_id', 'prop_starrating', 'prop_review_score',
#       'prop_brand_bool', 'prop_location_score1', 'prop_log_historical_price',
#       'position', 'price_usd', 'promotion_flag', 'srch_destination_id',
#       'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count',
#       'srch_children_count', 'srch_room_count', 'srch_saturday_night_bool',
#       'random_bool', 'click_bool', 'booking_bool', 'historical_price',
#       'price_per_country_median', 'price_per_person_per_night']


# Model inputs; every name must exist in both `df` and `test`.
features_to_choose = ['prop_review_score',
                      'prop_starrating',
                      'prop_brand_bool',
                      'prop_location_score1',
                      'srch_booking_window',
                      'historical_price', #'prop_log_historical_price', #
                      'promotion_flag',
                      'srch_saturday_night_bool',
                      'random_bool',
                      'price_per_person_per_night']

Y = target_function(df)

# Split by search, not by row: NDCG is computed per srch_id, so every search
# must sit entirely in one half. A row-wise split would score truncated
# result lists and let rows of the same search appear on both sides.
from sklearn.model_selection import GroupShuffleSplit

splitter = GroupShuffleSplit(n_splits=1, test_size=0.5, random_state=0)
train_rows, validation_rows = next(splitter.split(df, Y, groups=df['srch_id']))
X_train, X_test = df.iloc[train_rows], df.iloc[validation_rows]
Y_train, Y_test = Y.iloc[train_rows], Y.iloc[validation_rows]


### Hotels numbers

In [None]:
# Only the last expression of a cell is displayed, so the total is printed
# explicitly instead of being computed and thrown away.
print('distinct hotels in the test set:', len(set(test['prop_id'])))
# Hotels that occur in the test set but never in the training frame.
len(set(test['prop_id']) - set(df['prop_id']))

In [None]:
#from mlxtend.frequent_patterns import apriori
#from mlxtend.frequent_patterns import association_rules
#frequent_itemsets= apriori(df[['promotion_flag','srch_saturday_night_bool', 'random_bool', 'click_bool', 'booking_bool']], min_support=0.07, use_colnames=True)
#rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
#rules = rules.sort_values(by=[ 'support', 'confidence',], ascending=False)

#rules[['antecedents', 'consequents','support','confidence']].iloc[0:10]


### Test and validate

In [None]:
#from sklearn import neighbors, linear_model
#from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
#from sklearn.tree import DecisionTreeRegressor
#from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor, GradientBoostingRegressor
# Fit regression model

# Alternative models tried earlier:
#model = linear_model.LinearRegression()
#n_neighbors = 10
#model = neighbors.KNeighborsRegressor(n_neighbors)
#model= DecisionTreeRegressor(random_state=0)
#model = BaggingRegressor(base_estimator=DecisionTreeRegressor(), n_estimators=5, random_state=0)
model = GradientBoostingRegressor(random_state=0)

########        EVALUATION           ###############
start = time.time()
model.fit(X_train[features_to_choose], Y_train)
end = time.time()
print("The time of evaluation fit:", end-start)

Y_evaluation = model.predict(X_test[features_to_choose])
#mae = mean_absolute_error(Y_test, Y_evaluation)
#mse = mean_squared_error(Y_test, Y_evaluation, squared=False)
#print(f'mae: {mae} mse: {mse}')
ndcg_score = NDCG_at_k(X_test, ranking=Y_evaluation, ideal_ranking=Y_test, k=5)

# Sanity check of the metric: ranking by the true grades must score 1.
# The result gets its own name so `ndcg_score` keeps the model's score, and
# the comparison tolerates floating-point rounding.
ideal_ndcg_score = NDCG_at_k(X_test, ranking=Y_test,ideal_ranking=Y_test, k=5)
assert np.isclose(ideal_ndcg_score, 1), "Houston we've got a problem, Ideal ranking is not 1"


### Fit and submit

In [None]:
# Refit the model on the whole labelled frame and time the fit.
start = time.time()
model.fit(df[features_to_choose], Y)
end = time.time()
fit_seconds = end - start
print("The time of final fit:", fit_seconds)

# Score the test rows and write the ranked submission archive.
test_features = test[features_to_choose]
raitings = model.predict(test_features)
create_submission(test, raitings)