In [11]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split  
import xgboost as xgb
from sklearn.cluster import KMeans
import warnings
from xgboostextension import XGBRanker
warnings.filterwarnings('ignore')

In [2]:
# specifying some paths
path = r"C:\Users\janse\OneDrive\Bureaublad\Master\Data Mining Techniques\Assignment 2"
testpath = os.path.join(path, 'test_set_VU_DM.csv')
trainpath = os.path.join(path, 'training_set_VU_DM.csv')
samplepath = os.path.join(path, 'train_sample.csv')
resultpath = os.path.join(path, 'result.csv')


In [3]:
# load train_main
def reload_train():
    train_main = pd.read_csv(trainpath)
    train_main.date_time = pd.to_datetime(train_main.date_time)
    return train_main

In [4]:
train_main = reload_train()

In [5]:
train_main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4958347 entries, 0 to 4958346
Data columns (total 54 columns):
srch_id                        int64
date_time                      datetime64[ns]
site_id                        int64
visitor_location_country_id    int64
visitor_hist_starrating        float64
visitor_hist_adr_usd           float64
prop_country_id                int64
prop_id                        int64
prop_starrating                int64
prop_review_score              float64
prop_brand_bool                int64
prop_location_score1           float64
prop_location_score2           float64
prop_log_historical_price      float64
position                       int64
price_usd                      float64
promotion_flag                 int64
srch_destination_id            int64
srch_length_of_stay            int64
srch_booking_window            int64
srch_adults_count              int64
srch_children_count            int64
srch_room_count                int64
srch_saturday

# Feature extraction functions

In [61]:
def cumulate_comp_scores(train_df):    
    # Cumulative competitor scores

    cols = [col for col in train_df.columns if col.endswith('rate_percent_diff')]
    cols2 = [col for col in train_df.columns if col.endswith('_rate')]
    cols3 = [col for col in train_df.columns if col.endswith('_inv')]

    df = train_df[cols.extend(cols2).extend(cols3).extend(['prop_id'])]
    df['comp_rate_percent_diff_cumm'] = 0

    for col1, col2, col3 in zip(cols, cols2, cols3):
        df['comp_rate_percent_diff_cumm'] += df[col1].fillna(0) * df[col2].fillna(0) * (1 - df[col3].fillna(0))
        
    df = train2.drop(cols, axis=1)
    df = train2.drop(cols2, axis=1)
    df = train2.drop(cols3, axis=1)
    return df

def average_prop_score(train_df):
    cols = ['prop_id', 'price_usd', 'prop_starrating', 'prop_review_score', 'prop_brand_bool', 'prop_location_score1', 'prop_location_score2']
    df = train_df[cols]
    df['prop_location_score2'] = df['prop_location_score2'].fillna(df.prop_location_score2.mean())
    return df.groupby('prop_id').agg('mean').reset_index()

class search_clusters:
    def __init__(self, n_clusters, searches):
        kmean = KMeans(n_clusters=n_clusters)
        kmean.fit(searches)
        self.clusters = kmean
    def assign(self, search_terms):
        return self.clusters.predict(search_terms)
    
def cluster_srch_ids(train_main, one_hot = False, cluster_amount=30):    
    train_df = train_main
    
    cols = [x for x in train_df.columns if x.startswith('srch_')]
    df = train_df[cols].drop('srch_query_affinity_score', axis=1)
    
     
    groups = search_clusters(cluster_amount, df.drop_duplicates().drop(['srch_id','srch_destination_id'], axis=1))

    clusters = pd.DataFrame()
    clusters['search_group'] = groups.assign(train_df[cols].drop(['srch_id','srch_destination_id', 'srch_query_affinity_score'], axis=1))    
    clusters['srch_id'] = train_df['srch_id']
    clusters['prop_id'] = train_df['prop_id']
    
    if one_hot:        
        clusters = pd.get_dummies(clusters, columns=['search_group'])
        
    else:
        pass
    
    return clusters

def rate(row, mean_rate):
    if row['srch_id'] < 20:
        return mean_rate
    else:
        return row['click_bool']
    
def avg_click_rate(train_main):
    prop_click_rate = train_main[['prop_id','click_bool', 'srch_id']].groupby('prop_id').agg({'click_bool' : 'mean', 'srch_id': 'count'}).reset_index()
    mean_rate = prop_click_rate.click_bool.mean()
    prop_click_rate['click_rate'] = prop_click_rate.apply(lambda x: rate(x, mean_rate), axis=1)
    return prop_click_rate[['prop_id', 'click_rate']]

def split_sets(df, test_size = 0.2, shuffle=False, downsample=False):
    '''
        Returns: X_train, X_test, Y_train, Y_test
    '''
    df_train, df_test = train_test_split(df, test_size=test_size, shuffle=shuffle) 
    
    if downsample:
        df_train = downsample(df_train)
    
    X_train = df_train.drop(['click_bool','booking_bool'] , axis=1)
    X_train = X_train.fillna(X_train.mean())
    X_test = df_test.drop(['click_bool','booking_bool'] , axis=1)    
    X_test = X_test.fillna(X_test.mean())
    
    Y_train = df_train.click_rate
    Y_test = df_test.click_rate
    
    target_train = df_train.click_bool + 4 * df_train.booking_bool
    target_test = df_test.click_bool + 4 * df_test.booking_bool    
    
    return X_train, X_test, Y_train, Y_test, target_train, target_test


# Models

In [68]:
def train_gxboost_ranker(X_train, Y_train, lr = 0.1, n_estimators=50, max_depth=6, objective='rank:ndcg'):
    
    group_train = X_train.srch_id.values

    X = X_train.drop(['srch_id', 'prop_id'], axis=1).values
    X = np.concatenate([group_train[:,None], X], axis=1)

    Y = Y_train

    model = XGBRanker(n_estimators=n_estimators, learning_rate=lr, subsample=0.7, max_depth=max_depth, objective=objective)
    model.fit(X, Y, verbose=2)
    
    return model

def train_ranking_SVM():
    pass


# Evaluation functions

In [21]:
def evaluate(X_test, pred, target_test):
    prop_srch = X_test[['srch_id', 'prop_id']]
    prop_srch['value'] = pred
    prop_srch['actual'] = target_test.values
    eval_ndcg(prop_srch, 'srch_id', 'value','actual', n=5000)

def dcg_at_k(r, k):
    r = np.asfarray(r)[:k]
    if r.size:
        return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2)))
    return 0.


def ndcg_at_k(r, k):
    idcg = dcg_at_k(sorted(r, reverse=True), k)
    if not idcg:
        return 0.
    return dcg_at_k(r, k) / idcg

def eval_ndcg(df, srch_id_col, predict_col, value_col, n=9999999999999999):
    df = df.sort_values([srch_id_col, predict_col], ascending=False)
    k = 5
    ndcgs = []
    for i, srchid in enumerate(df[srch_id_col].unique()):
        if i == n:
            break
        if i % 10000 == 0 and i != 0:
            print(i)
            print(np.mean(ndcgs))
        r = df[df[srch_id_col] == srchid][value_col]
        ndcgs.append(ndcg_at_k(r,k))

    print(np.mean(ndcgs))


In [64]:
def feature_eng(train_main):
    # load initial dataframe
    # Select several features that don't need to be changed
    df = train_main[['prop_id', 'srch_id', 'position', 'promotion_flag', 'orig_destination_distance', 'random_bool', 'click_bool','booking_bool']]
    
    # cluster the type of search (one-hot encoded)
    df = df.merge(cluster_srch_ids(train_main, one_hot = True), on=['srch_id', 'prop_id'])
    
    # add average click rate of property
    df = df.merge(avg_click_rate(train_main), on='prop_id')
    
    df = df.merge(average_prop_score(train_main), on='prop_id')
    print('Features engineered done')
    return df

def train_and_test(df):
    X_train, X_test, Y_train, Y_test, target_train, target_test = split_sets(df)
    
    
    model = train_gxboost_ranker(X_train, Y_train)
    
    group_test = X_test.srch_id.values
    X_t = X_test.drop(['srch_id', 'prop_id'], axis=1).values
    X_t = np.concatenate([group_test[:,None], X_t], axis=1)
    
    pred = model.predict(X_t)
    print('Training done')
    
    evaluate(X_test, pred, target_test)
    print('Evaluated')






In [62]:
df = feature_eng(train_main)

Features engineered done


In [69]:
train_and_test(df)

Training done
0.10999612931361298
Evaluated


In [None]:
def rate(row, mean_rate):
    if row['srch_id'] < 20:
        return mean_rate
    else:
        return row['click_bool']
    
def avg_click_rate(train_main):
    prop_click_rate = train_main[['prop_id','click_bool', 'srch_id']].groupby('prop_id').agg({'click_bool' : 'mean', 'srch_id': 'count'}).reset_index()
    mean_rate = prop_click_rate.click_bool.mean()
    prop_click_rate['click_rate'] = prop_click_rate.apply(lambda x: rate(x, mean_rate), axis=1)
    return prop_click_rate[['prop_id', 'click_rate']]

#prop_srch.sort_values(['srch_id','value'], ascending=[True, False]).head(100)
avg_rates = avg_click_rate(train_main)
print(avg_rates.head())
prop_srch = prop_srch.merge(avg_rates, on='prop_id', how='outer')
prop_srch['value2'] = prop_srch['value'] * prop_srch['click_rate']
eval_ndcg(prop_srch, 'srch_id', 'value2','actual', n=5000)

In [60]:
train_main[train_main.prop_id == 893].price_usd.head()

0        104.77
6938      88.37
12605    126.00
21958    139.00
30512    109.00
Name: price_usd, dtype: float64