In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split  
import xgboost as xgb
from sklearn.cluster import KMeans
import warnings
from xgboost import XGBRanker, XGBRegressor
warnings.filterwarnings('ignore')

In [2]:
# Locations of the input data and of the generated artefacts.
# The data directory defaults to the original local folder, but it can be
# redirected without editing the notebook by setting the DMT_DATA_DIR
# environment variable before starting the kernel.
path = os.environ.get(
    'DMT_DATA_DIR',
    r"C:\Users\janse\OneDrive\Bureaublad\Master\Data Mining Techniques\Assignment 2",
)
testpath = os.path.join(path, 'test_set_VU_DM.csv')        # Kaggle test set
trainpath = os.path.join(path, 'training_set_VU_DM.csv')   # full training set
samplepath = os.path.join(path, 'train_sample.csv')
resultpath = os.path.join(path, 'result.csv')


In [3]:
# load train_main
def reload_train():
    """Read the training CSV located at ``trainpath``.

    Returns:
        pandas.DataFrame: the training data with its ``date_time`` column
        converted through ``pd.to_datetime``.
    """
    frame = pd.read_csv(trainpath)
    parsed_timestamps = pd.to_datetime(frame.date_time)
    frame['date_time'] = parsed_timestamps
    return frame

In [4]:
# Load the training set once; the feature-engineering and evaluation cells
# further down read from `train_main`.
train_main = reload_train()

# Feature extraction functions

In [6]:
def cumulate_comp_scores(train_df):
    """Collapse the per-competitor columns into one cumulative score per row.

    Columns are picked by suffix: ``*rate_percent_diff``, ``*_rate`` and
    ``*_inv``. They are paired positionally (first diff column with first rate
    column with first inv column, ...), so the three families must appear in
    the same order in ``train_df``. Missing values count as 0.

    For every row the score is the sum over the paired columns of
    ``percent_diff * rate * (1 - inv)``.

    Args:
        train_df: DataFrame containing ``prop_id`` and the competitor columns.
            It is not modified.

    Returns:
        pandas.DataFrame with the columns ``prop_id`` and
        ``comp_rate_percent_diff_cumm``, one row per input row.
    """
    diff_cols = [col for col in train_df.columns if col.endswith('rate_percent_diff')]
    rate_cols = [col for col in train_df.columns if col.endswith('_rate')]
    inv_cols = [col for col in train_df.columns if col.endswith('_inv')]
    comp_cols = diff_cols + rate_cols + inv_cols

    # list.extend() returns None and therefore cannot be chained; build the
    # selection by concatenation. The copy keeps the caller's frame untouched.
    df = train_df[comp_cols + ['prop_id']].copy()
    df['comp_rate_percent_diff_cumm'] = 0.0

    for diff_col, rate_col, inv_col in zip(diff_cols, rate_cols, inv_cols):
        df['comp_rate_percent_diff_cumm'] += (
            df[diff_col].fillna(0) * df[rate_col].fillna(0) * (1 - df[inv_col].fillna(0))
        )

    # Drop all raw competitor columns from the working frame in one go.
    return df.drop(comp_cols, axis=1)

def average_prop_score(train_df):
    """Average a fixed set of price/property columns per ``prop_id``.

    Missing ``prop_location_score2`` values are first replaced by the mean of
    that column over all rows; the selected columns are then averaged per
    property.

    Returns:
        pandas.DataFrame with one row per ``prop_id`` (``prop_id`` is a regular
        column) holding the per-property means.
    """
    selected = [
        'prop_id',
        'price_usd',
        'prop_starrating',
        'prop_review_score',
        'prop_brand_bool',
        'prop_location_score1',
        'prop_location_score2',
    ]
    filled = train_df[selected].assign(
        prop_location_score2=lambda d: d['prop_location_score2'].fillna(
            d.prop_location_score2.mean()
        )
    )
    per_property = filled.groupby('prop_id').agg('mean')
    return per_property.reset_index()

class search_clusters:
    """KMeans model fitted on search features, used to label searches."""

    def __init__(self, n_clusters, searches):
        """Fit a ``KMeans`` with ``n_clusters`` clusters on ``searches``.

        The fitted estimator is kept in ``self.clusters``.
        """
        # KMeans.fit returns the fitted estimator itself.
        self.clusters = KMeans(n_clusters=n_clusters).fit(searches)

    def assign(self, search_terms):
        """Return the predicted cluster index for every row of ``search_terms``."""
        labels = self.clusters.predict(search_terms)
        return labels
    
def cluster_srch_ids(train_main, one_hot = False, cluster_amount=30):
    """Assign every row of ``train_main`` to a KMeans cluster of similar searches.

    The clustering features are all ``srch_*`` columns except ``srch_id``,
    ``srch_destination_id`` and ``srch_query_affinity_score``. The model is
    fitted on the de-duplicated search rows and then applied to every row.

    Args:
        train_main: DataFrame with ``prop_id`` and the ``srch_*`` columns.
        one_hot: when True, ``search_group`` is expanded into indicator columns
            with ``pd.get_dummies``.
        cluster_amount: number of KMeans clusters.

    Returns:
        pandas.DataFrame with ``search_group`` (or its indicator columns),
        ``srch_id`` and ``prop_id``; rows are in the same order as
        ``train_main`` and carry a fresh 0..n-1 index.
    """
    srch_cols = [x for x in train_main.columns if x.startswith('srch_')]
    searches = train_main[srch_cols].drop('srch_query_affinity_score', axis=1)

    # De-duplicate while srch_id is still present, then drop the id columns so
    # that only search characteristics are clustered.
    id_cols = ['srch_id', 'srch_destination_id']
    groups = search_clusters(cluster_amount, searches.drop_duplicates().drop(id_cols, axis=1))

    # The predicted labels are a plain array in row order. Attach the ids by
    # position (.values) as well: assigning them as Series would align on the
    # index and yield NaN / wrong ids whenever train_main does not carry the
    # default RangeIndex (for example after filtering or sorting).
    clusters = pd.DataFrame({
        'search_group': groups.assign(searches.drop(id_cols, axis=1)),
        'srch_id': train_main['srch_id'].values,
        'prop_id': train_main['prop_id'].values,
    })

    if one_hot:
        clusters = pd.get_dummies(clusters, columns=['search_group'])

    return clusters

def rate(row, mean_rate):
    """Pick the click rate to use for one aggregated property row.

    ``row['srch_id']`` is compared against 20: below that threshold the
    fallback ``mean_rate`` is returned, otherwise the row's own
    ``click_bool`` value.
    """
    return mean_rate if row['srch_id'] < 20 else row['click_bool']
    
def avg_click_rate(train_main):
    """Compute a click rate per property.

    Per ``prop_id`` the mean of ``click_bool`` and the number of rows (stored
    under the ``srch_id`` column name) are aggregated. ``rate`` then decides,
    row by row, between the property's own mean and the mean of all
    per-property means.

    Returns:
        pandas.DataFrame with the columns ``prop_id`` and ``click_rate``.
    """
    per_prop = (
        train_main[['prop_id','click_bool', 'srch_id']]
        .groupby('prop_id')
        .agg({'click_bool' : 'mean', 'srch_id': 'count'})
        .reset_index()
    )
    overall_rate = per_prop.click_bool.mean()
    per_prop['click_rate'] = per_prop.apply(lambda r: rate(r, overall_rate), axis=1)
    return per_prop[['prop_id', 'click_rate']]

def split_sets(df, test_size = 0.2, shuffle=False, downsample=False):
    '''
    Split ``df`` into train/test features and relevance labels.

    The frame is sorted by ``srch_id`` before splitting, so with
    ``shuffle=False`` the rows of a search stay contiguous (at most the search
    at the cut is divided between the two parts).

    Args:
        df: DataFrame containing ``srch_id``, ``click_bool``, ``booking_bool``
            and the feature columns.
        test_size: forwarded to ``train_test_split``.
        shuffle: forwarded to ``train_test_split``.
        downsample: ``False`` (default) for no downsampling, or a callable that
            takes the training frame and returns the downsampled frame.

    Returns: X_train, X_test, Y_train, Y_test, target_train, target_test
        ``X_*`` are the features with NaNs replaced by that split's column
        means. ``Y_*`` and ``target_*`` both hold
        ``click_bool + 4 * booking_bool``.

    Raises:
        TypeError: if ``downsample`` is truthy but not callable.
    '''
    df_train, df_test = train_test_split(df.sort_values('srch_id'), test_size=test_size, shuffle=shuffle)

    # The parameter itself is the only `downsample` in scope here, so calling
    # it only makes sense when the caller passed a function.
    if callable(downsample):
        df_train = downsample(df_train)
    elif downsample:
        raise TypeError(
            "downsample must be False or a callable taking and returning the training DataFrame"
        )

    X_train = df_train.drop(['click_bool','booking_bool'] , axis=1)
    X_train = X_train.fillna(X_train.mean())
    X_test = df_test.drop(['click_bool','booking_bool'] , axis=1)
    X_test = X_test.fillna(X_test.mean())

    Y_train = df_train.click_bool + df_train.booking_bool * 4
    # Both terms must come from df_test; mixing in df_train.booking_bool
    # aligns on disjoint indexes and produces NaN labels.
    Y_test = df_test.click_bool + df_test.booking_bool * 4

    target_train = df_train.click_bool + 4 * df_train.booking_bool
    target_test = df_test.click_bool + 4 * df_test.booking_bool

    return X_train, X_test, Y_train, Y_test, target_train, target_test

def cumulate_comp_scores(train_df):
    """Collapse the per-competitor columns into one cumulative score per row.

    NOTE: this cell defines ``cumulate_comp_scores`` twice; this later
    definition is the one that stays bound after the cell has run.

    Columns are picked by suffix: ``*rate_percent_diff``, ``*_rate`` and
    ``*_inv``. They are paired positionally (first diff column with first rate
    column with first inv column, ...), so the three families must appear in
    the same order in ``train_df``. Missing values count as 0.

    For every row the score is the sum over the paired columns of
    ``percent_diff * rate * (1 - inv)``.

    Args:
        train_df: DataFrame containing ``prop_id`` and the competitor columns.
            It is not modified.

    Returns:
        pandas.DataFrame with the columns ``prop_id`` and
        ``comp_rate_percent_diff_cumm``, one row per input row.
    """
    diff_cols = [col for col in train_df.columns if col.endswith('rate_percent_diff')]
    rate_cols = [col for col in train_df.columns if col.endswith('_rate')]
    inv_cols = [col for col in train_df.columns if col.endswith('_inv')]
    comp_cols = diff_cols + rate_cols + inv_cols

    # list.extend() returns None and therefore cannot be chained; build the
    # selection by concatenation. The copy keeps the caller's frame untouched.
    df = train_df[comp_cols + ['prop_id']].copy()
    df['comp_rate_percent_diff_cumm'] = 0.0

    for diff_col, rate_col, inv_col in zip(diff_cols, rate_cols, inv_cols):
        df['comp_rate_percent_diff_cumm'] += (
            df[diff_col].fillna(0) * df[rate_col].fillna(0) * (1 - df[inv_col].fillna(0))
        )

    # Drop all raw competitor columns from the working frame in one go.
    return df.drop(comp_cols, axis=1)

def avg_prop_features(train_main):
    """Aggregate visitor, property and search columns per ``prop_id``.

    Returns:
        pandas.DataFrame indexed by ``prop_id`` whose columns form a two-level
        index ``(source column, statistic)`` with the statistics ``mean``,
        ``median`` and ``std``.
    """
    statistics = ['mean', 'median', 'std']
    selected = [
        'visitor_hist_starrating', 'visitor_hist_adr_usd',
        'prop_id',
        'prop_starrating', 'prop_review_score', 'prop_brand_bool',
        'prop_location_score1', 'prop_location_score2',
        'prop_log_historical_price', 'price_usd', 'promotion_flag',
        'srch_length_of_stay', 'srch_booking_window',
        'srch_adults_count', 'srch_children_count', 'srch_room_count',
        'srch_saturday_night_bool', 'srch_query_affinity_score',
        'orig_destination_distance', 'random_bool',
    ]
    grouped = train_main[selected].groupby('prop_id')
    return grouped.agg(statistics)


# Models

In [7]:
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBRegressor
from pyltr.models import LambdaMART
from pyltr.metrics.dcg import NDCG

def train_lambdaMART(X_train, Y_train, lr = 0.1, n_estimators=1000, max_depth=4):
    """Fit a pyltr ``LambdaMART`` ranker that optimises NDCG@5.

    Args:
        X_train: feature frame; ``srch_id`` supplies the query ids and is,
            together with ``prop_id``, excluded from the feature matrix.
        Y_train: relevance labels, positionally aligned with ``X_train``.
        lr: learning rate.
        n_estimators: number of boosting stages.
        max_depth: maximum tree depth.

    Returns:
        The fitted ``LambdaMART`` model.
    """
    query_ids = X_train.srch_id
    features = X_train.drop(['srch_id', 'prop_id'], axis=1).values
    labels = Y_train.values

    ranker = LambdaMART(
        metric=NDCG(k=5),
        n_estimators=n_estimators,
        learning_rate=lr,
        max_depth=max_depth,
    )
    ranker.fit(features, labels, query_ids)
    return ranker

def train_gxboost_ranker(X_train, Y_train, lr = 0.1, n_estimators=200, max_depth=4, objective='rank:ndcg'):
    """Fit an ``XGBRanker`` on per-search groups.

    Args:
        X_train: feature frame with ``srch_id`` and ``prop_id``; both are
            excluded from the feature matrix. Rows must be ordered by
            ``srch_id`` so that the group sizes (computed by a sorted
            ``groupby``) line up with the row order.
        Y_train: relevance labels, positionally aligned with ``X_train``.
        lr: learning rate.
        n_estimators: number of boosting rounds.
        max_depth: maximum tree depth.
        objective: XGBoost ranking objective.

    Returns:
        The fitted ``XGBRanker``.
    """
    # Number of rows per search, in ascending srch_id order.
    group_train = X_train[['srch_id','prop_id']].groupby('srch_id').agg('count').prop_id.values

    X = X_train.drop(['srch_id', 'prop_id'], axis=1).values

    Y = Y_train.values

    model = XGBRanker(n_estimators=n_estimators,
                      learning_rate=lr,
                      max_depth=max_depth,
                      objective=objective,
                      subsample = 1,  # XGBoost's parameter is `subsample`, not `sub_sample`
                      colsample_bytree = 0.7 #,
#                       predictor='gpu_predictor'
                     )
    # Pass the group sizes by keyword so they cannot end up in a different
    # positional slot of fit().
    model.fit(X, Y, group=group_train, verbose=2)

    return model

def train_gxboost_regr(X_train, Y_train, lr = 0.05, n_estimators=50, max_depth=4, objective='reg:linear'):
    """Fit an ``XGBRegressor`` that predicts the relevance label point-wise.

    Args:
        X_train: feature frame with ``srch_id`` and ``prop_id``; both are
            excluded from the feature matrix.
        Y_train: regression targets, positionally aligned with ``X_train``.
        lr: learning rate.
        n_estimators: number of boosting rounds.
        max_depth: maximum tree depth.
        objective: XGBoost regression objective.
            NOTE(review): recent XGBoost releases name the squared-error
            objective ``reg:squarederror``; confirm that the installed version
            still accepts ``reg:linear``.

    Returns:
        The fitted ``XGBRegressor``.
    """
    X = X_train.drop(['srch_id', 'prop_id'], axis=1).values

    Y = Y_train.values

    model = XGBRegressor(n_estimators=n_estimators,
                      learning_rate=lr,
                      base_score=1,
                      max_depth=max_depth,
                      objective=objective,
                      subsample = 1,  # XGBoost's parameter is `subsample`, not `sub_sample`
                      colsample_bytree = 0.6,
                      predictor='gpu_predictor'
                     )
    # A regressor has no notion of query groups. The third positional argument
    # of XGBRegressor.fit is sample_weight, so per-search group sizes must not
    # be passed there.
    model.fit(X, Y, verbose=2)

    return model

def train_KNN(X_train, Y_train):
    """Fit a 20-nearest-neighbours classifier on the features.

    ``srch_id`` and ``prop_id`` are excluded from the feature matrix.

    Returns:
        The fitted ``KNeighborsClassifier``.
    """
    features = X_train.drop(['srch_id','prop_id'], axis=1).values
    labels = Y_train.values
    classifier = KNeighborsClassifier(n_neighbors=20)
    return classifier.fit(features, labels)


# Evaluation functions

In [8]:
def evaluate(X_test, pred, target_test):
    """Pair predictions with the true relevance and print the NDCG.

    Args:
        X_test: frame with ``srch_id`` and ``prop_id``.
        pred: predicted scores, one per row of ``X_test``.
        target_test: true relevance; its ``.values`` are attached positionally.

    Returns:
        pandas.DataFrame with ``srch_id``, ``prop_id``, ``value`` (prediction)
        and ``actual`` (true relevance).
    """
    scored = X_test[['srch_id', 'prop_id']].assign(
        value=pred,
        actual=target_test.values,
    )
    ranked = scored.sort_values(['srch_id', 'value'], ascending=False)
    # eval_ndcg reports the score; at most the first 5000 searches are used.
    eval_ndcg(ranked, 'srch_id', 'value','actual', n=5000)
    return scored

def dcg_at_k(r, k):
    """Discounted cumulative gain of the first ``k`` relevance scores.

    Uses the exponential gain ``2**rel - 1`` and the discount
    ``log2(position + 1)`` with 1-based positions.

    Args:
        r: relevance scores in ranked order (list, array or Series).
        k: number of leading results to take into account.

    Returns:
        The DCG as a float; ``0.`` when there are no scores.
    """
    # np.asfarray was removed in NumPy 2.0; asarray with dtype=float is the
    # equivalent that works on old and new NumPy alike.
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2)))
    return 0.


def ndcg_at_k(r, k):
    """Normalised DCG@k: the DCG of ``r`` divided by the DCG of its ideal order.

    Returns ``0.`` when the ideal DCG is zero (for example when every
    relevance is 0 or ``r`` is empty).
    """
    ideal = dcg_at_k(sorted(r, reverse=True), k)
    return dcg_at_k(r, k) / ideal if ideal else 0.

def eval_ndcg(df, srch_id_col, predict_col, value_col, n=9999999999999999):
    """Print (and return) the mean NDCG@5 over the searches in ``df``.

    Rows are ordered by descending ``predict_col`` within each search; the
    relevance in ``value_col`` is then scored in that order.

    Args:
        df: frame with one row per (search, property).
        srch_id_col: name of the column identifying the search.
        predict_col: name of the column holding the predicted score.
        value_col: name of the column holding the true relevance.
        n: maximum number of searches to evaluate.

    Returns:
        Mean NDCG@5 over the evaluated searches (NaN when there are none).
        The same value is printed, as are running means every 10000 searches.
    """
    df = df.sort_values([srch_id_col, predict_col], ascending=False)
    k = 5
    ndcgs = []
    # groupby(sort=False) yields the searches in order of first appearance and
    # keeps the row order inside each group, in one pass over the frame instead
    # of one full boolean scan per search id.
    for i, (_, group) in enumerate(df.groupby(srch_id_col, sort=False)):
        if i == n:
            break
        if i % 10000 == 0 and i != 0:
            print(i)
            print(np.mean(ndcgs))
        ndcgs.append(ndcg_at_k(group[value_col], k))

    mean_ndcg = np.mean(ndcgs) if ndcgs else float('nan')
    print(mean_ndcg)
    return mean_ndcg


# Functions used to combine the above

In [9]:
def feature_eng(train_main):
    """Assemble the first feature set.

    Starts from a handful of raw columns and joins, in this order: the one-hot
    encoded search cluster (on ``srch_id`` and ``prop_id``), the per-property
    click rate and the per-property average scores (both on ``prop_id``).

    Returns:
        pandas.DataFrame with the combined features and both target columns.
    """
    # Columns taken over from the raw data unchanged.
    passthrough = ['prop_id', 'srch_id', 'date_time', 'position', 'promotion_flag',
                   'orig_destination_distance', 'random_bool', 'click_bool','booking_bool']

    search_groups = cluster_srch_ids(train_main, one_hot = True)
    click_rates = avg_click_rate(train_main)
    prop_scores = average_prop_score(train_main)

    combined = (
        train_main[passthrough]
        .merge(search_groups, on=['srch_id', 'prop_id'])
        .merge(click_rates, on='prop_id')
        .merge(prop_scores, on='prop_id')
    )
    print('Features engineering done')
    return combined

def feature_eng2(train_main):
    """Assemble the second feature set: ids, timestamp, targets and per-property statistics.

    The statistics from ``avg_prop_features`` are joined on ``prop_id`` under
    flat column names of the form ``<column>_<statistic>`` (for example
    ``price_usd_mean``). Remaining NaNs in numeric columns are replaced by the
    column mean.

    Returns:
        pandas.DataFrame sorted by ``prop_id`` ascending and ``srch_id``
        descending.
    """
    df = train_main[['prop_id', 'srch_id','date_time', 'click_bool', 'booking_bool']]

    prop_stats = avg_prop_features(train_main)
    # Flatten the two-level (column, statistic) header first: merging a flat
    # frame with a multi-level one is rejected by current pandas.
    prop_stats.columns = [
        col if isinstance(col, str) else '_'.join(col) for col in prop_stats.columns
    ]

    df = df.merge(prop_stats, on='prop_id')
    # Restrict the imputation values to numeric columns so the datetime column
    # cannot take part in (or break) the mean computation.
    df = df.fillna(df.mean(numeric_only=True))
    return df.sort_values(['prop_id', 'srch_id'], ascending=[True, False])

def test_eng(test_main, train_main):
    """Build the feature frame for the test set from training-set statistics.

    The per-property statistics from ``avg_prop_features(train_main)`` are
    joined onto ``test_main`` on ``prop_id`` under flat column names of the
    form ``<column>_<statistic>``.

    Every row of ``test_main`` is kept: properties that do not occur in
    ``train_main`` get NaN statistics, which are then replaced by the column
    means together with all other NaNs.

    Returns:
        pandas.DataFrame sorted by ``prop_id`` ascending and ``srch_id``
        descending.
    """
    prop_stats = avg_prop_features(train_main)
    # Flatten the two-level (column, statistic) header first: merging a flat
    # frame with a multi-level one is rejected by current pandas.
    prop_stats.columns = [
        col if isinstance(col, str) else '_'.join(col) for col in prop_stats.columns
    ]

    # how='left': an inner join would silently drop test rows whose prop_id was
    # never seen in training, leaving them without a prediction.
    df = test_main.merge(prop_stats, on='prop_id', how='left')
    df = df.fillna(df.mean())
    return df.sort_values(['prop_id', 'srch_id'], ascending=[True, False])

def train_and_test(df):
    """Split ``df``, train the XGBoost ranker and evaluate it on the held-out part.

    Args:
        df: feature frame as expected by ``split_sets`` (must contain
            ``srch_id``, ``prop_id``, ``click_bool`` and ``booking_bool``).

    Returns:
        The frame returned by ``evaluate``: ``srch_id``, ``prop_id``, predicted
        ``value`` and ``actual`` relevance for every test row.
    """
    X_train, X_test, Y_train, Y_test, target_train, target_test = split_sets(df)

    # Alternatives defined in this notebook: train_KNN, train_lambdaMART.
    model = train_gxboost_ranker(X_train, Y_train, n_estimators=50)

    print('Training done, generating predictions')

    # Score all test rows in a single call. Each row is scored on its own, so
    # predicting search by search gives the same numbers; one call keeps the
    # predictions aligned with the row order of X_test, also works for an empty
    # test set, and avoids repeatedly filtering the frame and re-allocating the
    # result array.
    predictions = model.predict(X_test.drop(['srch_id', 'prop_id'], axis=1).values)

    results = evaluate(X_test, predictions, target_test)
    print('Evaluated')
    return results



# Main stuff

In [13]:
# Build the modelling frame: ids, timestamp and targets joined with the
# per-property aggregate statistics.
df = feature_eng2(train_main)


In [None]:
# Train and evaluate on the first 1/50th of the rows only (presumably to keep
# the experiment fast). `results` is reused by a later cell.
size = int(df.shape[0] / 50)
results = train_and_test(df[:size])

In [None]:
# Show the time span covered by the data; the RankLib export cell splits
# train/test on a fixed date.
print(df.date_time.min(), df.date_time.max())

In [14]:
# Create datasets in the correct format for rankLib (java program)

# NOTE: to_ranklib is defined in a cell further down; that cell has to be run first.

def _ranklib_frame(subset):
    """Order by search, drop non-feature columns and merge the two targets into ``label``."""
    subset = subset.sort_values(['srch_id']).drop(['date_time', 'prop_id'], axis=1)
    subset['label'] = subset.click_bool + 4 * subset.booking_bool
    return subset.drop(['click_bool', 'booking_bool'], axis=1)


# Everything before 2013-05-30 is used for training ...
train = _ranklib_frame(df[df.date_time < '2013-05-30'])
to_ranklib(train,'label','srch_id',r'C:\Users\janse\OneDrive\Bureaublad\Master\Data Mining Techniques\Assignment 2\preprocessed\train.txt')

# ... and everything from that date onwards for testing.
test = _ranklib_frame(df[df.date_time >= '2013-05-30'])
to_ranklib(test,'label','srch_id',r'C:\Users\janse\OneDrive\Bureaublad\Master\Data Mining Techniques\Assignment 2\preprocessed\test.txt')


['visitor_hist_starrating_mean', 'visitor_hist_starrating_median', 'visitor_hist_starrating_std', 'visitor_hist_adr_usd_mean', 'visitor_hist_adr_usd_median', 'visitor_hist_adr_usd_std', 'prop_starrating_mean', 'prop_starrating_median', 'prop_starrating_std', 'prop_review_score_mean', 'prop_review_score_median', 'prop_review_score_std', 'prop_brand_bool_mean', 'prop_brand_bool_median', 'prop_brand_bool_std', 'prop_location_score1_mean', 'prop_location_score1_median', 'prop_location_score1_std', 'prop_location_score2_mean', 'prop_location_score2_median', 'prop_location_score2_std', 'prop_log_historical_price_mean', 'prop_log_historical_price_median', 'prop_log_historical_price_std', 'price_usd_mean', 'price_usd_median', 'price_usd_std', 'promotion_flag_mean', 'promotion_flag_median', 'promotion_flag_std', 'srch_length_of_stay_mean', 'srch_length_of_stay_median', 'srch_length_of_stay_std', 'srch_booking_window_mean', 'srch_booking_window_median', 'srch_booking_window_std', 'srch_adults_co

In [None]:
# Build the RankLib input for the Kaggle test set. Only the two id columns are
# read from the CSV; all features come from training-set statistics.
testing_main = pd.read_csv(testpath, usecols=['srch_id', 'prop_id'])
# NOTE: this rebinds `df`, replacing the training frame built earlier.
df = test_eng(testing_main, train_main)
# to_ranklib needs a label column; 0 is a placeholder (presumably because the
# test set carries no targets).
df['label'] = 0
df = df.sort_values('srch_id')
to_ranklib(df,'label','srch_id',r'C:\Users\janse\OneDrive\Bureaublad\Master\Data Mining Techniques\Assignment 2\preprocessed\kaggle_test.txt')


In [10]:
# Ranklib formatting
def f(x, label_col, query_col, feature_columns, cols2id):
    """Format one column (a Series) for a RankLib text file.

    Feature columns become ``<feature id>:<value>`` strings, the query column
    becomes ``qid:<value>`` strings, and any other column is returned
    unchanged. ``label_col`` is accepted but not used.
    """
    column = x.name
    if column in feature_columns:
        prefix = cols2id[column]
    elif column == query_col:
        prefix = 'qid'
    else:
        return x
    return prefix + ':' + x.astype(str)


def to_ranklib(df, label_col, query_col, target_file):
    """Write ``df`` as a RankLib text file.

    Every output line is ``<label> qid:<query> 1:<f1> 2:<f2> ...``, separated
    by single spaces. Features are numbered from 1 in column order, skipping
    the label and query columns. Tuple (multi-level) column labels are
    flattened by joining their parts with ``_``.

    Args:
        df: frame holding the label, query id and feature columns. It is not
            modified.
        label_col: name of the relevance label column.
        query_col: name of the query id column.
        target_file: path of the file to write.
    """
    flat_names = [
        '_'.join(str(part) for part in col) if isinstance(col, tuple) else col
        for col in df.columns
    ]
    # Rename on a shallow copy so the caller's frame keeps its own column labels.
    df = df.copy(deep=False)
    df.columns = flat_names

    feature_columns = list(df.drop([label_col, query_col], axis=1).columns)
    print(feature_columns, [label_col, query_col])
    cols2id = {col:str(i+1) for i,col in enumerate(feature_columns)}

    # Format column by column, then put label and qid in front of the features.
    (df.apply(lambda x: f(x, label_col, query_col, feature_columns, cols2id))[[label_col, query_col] + feature_columns]
      .to_csv(target_file, sep=' ', index=False, header=None)
    )

# Some other stuff

In [None]:
def rate(row, mean_rate):
    """Pick the click rate to use for one aggregated property row.

    NOTE: a function with the same name and body is already defined in the
    feature-extraction cell earlier in this notebook.

    ``row['srch_id']`` is compared against 20: below that threshold the
    fallback ``mean_rate`` is returned, otherwise the row's own
    ``click_bool`` value.
    """
    use_fallback = row['srch_id'] < 20
    return mean_rate if use_fallback else row['click_bool']
    
def avg_click_rate(train_main):
    """Compute a click rate per property.

    NOTE: a function with the same name and body is already defined in the
    feature-extraction cell earlier in this notebook.

    Per ``prop_id`` the mean of ``click_bool`` and the number of rows (stored
    under the ``srch_id`` column name) are aggregated. ``rate`` then decides,
    row by row, between the property's own mean and the mean of all
    per-property means.

    Returns:
        pandas.DataFrame with the columns ``prop_id`` and ``click_rate``.
    """
    subset = train_main[['prop_id','click_bool', 'srch_id']]
    stats = subset.groupby('prop_id').agg({'click_bool' : 'mean', 'srch_id': 'count'})
    stats = stats.reset_index()
    fallback = stats.click_bool.mean()
    stats['click_rate'] = stats.apply(lambda r: rate(r, fallback), axis=1)
    return stats[['prop_id', 'click_rate']]

#prop_srch.sort_values(['srch_id','value'], ascending=[True, False]).head(100)
# Blend the model score with the historical click rate of each property and
# re-evaluate. Requires `results` from the train_and_test cell.
avg_rates = avg_click_rate(train_main)
print(avg_rates.head())
prop_srch = results.merge(avg_rates, on='prop_id')

# Min-max scale both scores to [0, 1] so that they contribute equally to the
# average: subtract the minimum and divide by the range (max - min).
# A constant column has a zero range and therefore yields NaN.
value_min, value_max = prop_srch['value'].min(), prop_srch['value'].max()
prop_srch['value'] = (prop_srch['value'] - value_min) / (value_max - value_min)
rate_min, rate_max = prop_srch['click_rate'].min(), prop_srch['click_rate'].max()
prop_srch['click_rate'] = (prop_srch['click_rate'] - rate_min) / (rate_max - rate_min)

prop_srch['value2'] = (prop_srch['value'] + prop_srch['click_rate']) / 2
prop_srch = prop_srch.sort_values(['srch_id', 'value2'] , ascending=[True, False])
eval_ndcg(prop_srch, 'srch_id', 'value2','actual', n=5000)

In [None]:
# Spot check: the first few listed prices of property 893.
train_main[train_main.prop_id == 893].price_usd.head()

In [None]:
# Baseline: rank the properties of each search purely by their click rate.
# NOTE: this rebinds `df`.
df = (
    train_main[['prop_id','srch_id', 'click_bool', 'booking_bool']]
    .merge(avg_click_rate(train_main), on='prop_id')
)
# Same relevance encoding as elsewhere: click counts 1, booking adds 4.
df['target'] = df['click_bool'] + df['booking_bool'] * 4
eval_ndcg(df, 'srch_id','click_rate','target', n=5000)

In [15]:
# rankLib formatting:
# first value is the label (0/1/5), second value is srch_id, then there are features

# The file handle is deliberately not named `f`: `f` is the column formatter
# that to_ranklib calls, and rebinding the name here would break every later
# to_ranklib call in the same kernel.
with open(r'C:\Users\janse\OneDrive\Bureaublad\Master\Data Mining Techniques\Assignment 2\preprocessed\train.txt') as train_file:
    for i in range(20):
        print(train_file.readline())

0 qid:1 1:3.322413793103448 2:3.5 3:0.7259213612093934 4:182.71137931034482 5:170.74 6:94.44132932008021 7:4 8:4 9:0.0 10:4.0 11:4.0 12:0.0 13:1 14:1 15:0.0 16:2.200000000000024 17:2.2 18:0.0 19:0.025825547445255496 20:0.0149 21:0.03185471137541937 22:4.4000000000000155 23:5.03 24:1.6661063319243317 25:152.05408233276165 26:129.0 27:390.9285730663125 28:0.051457975986277875 29:0.0 30:0.22111969329662595 31:1.6569468267581475 32:1.0 33:1.1749017399520345 34:28.221269296740996 35:12.0 36:39.78547405102965 37:1.8936535162950257 38:2.0 39:0.9446962802262713 40:0.33962264150943394 41:0.0 42:0.7236252720378428 43:1.1492281303602059 44:1.0 45:0.49744562751568244 46:0.5420240137221269 47:1.0 48:0.4986587020604008 49:-20.22207173913044 50:-17.172150000000002 51:11.945580138308022 52:1760.0581861575174 53:1091.95 54:2245.9363960137625 55:0.20240137221269297 56:0.0 57:0.40213484872824407

0 qid:1 1:3.24 2:3.4400000000000004 3:0.6770632423535792 4:170.49142857142857 5:164.465 6:83.87312795545041 7