In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split  
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

In [8]:
# specifying some paths
path = r"C:\Users\Tommy\Documents\GitHub\dataminingtech\Assignment2\vu-dmt-2assignment"
testpath = os.path.join(path, 'test_set_VU_DM.csv')
trainpath = os.path.join(path, 'training_set_VU_DM.csv')
samplepath = os.path.join(path, 'train_sample.csv')
resultpath = os.path.join(path, 'result.csv')


In [9]:
# load train_main
# made a function for quick reusability during development
def reload_train():
    train_main = pd.read_csv(trainpath)
    train_main.date_time = pd.to_datetime(train_main.date_time)
    return train_main

In [10]:
train_main = reload_train()

# Feature Extraction

In [16]:
def average_prop_score(train_df):
    cols = ['prop_id', 'price_usd', 'prop_starrating', 'prop_review_score', 'prop_brand_bool', 'prop_location_score1', 'prop_location_score2']
    df = train_df[cols]
    df['prop_location_score2'] = df['prop_location_score2'].fillna(df.prop_location_score2.mean())
    return df.groupby('prop_id').agg('mean').reset_index()

def rate(row, mean_rate):
    if row['srch_id'] < 20:
        return mean_rate
    else:
        return row['click_bool']
    
def avg_click_rate(train_main):
    prop_click_rate = train_main[['prop_id','click_bool', 'srch_id']].groupby('prop_id').agg({'click_bool' : 'mean', 'srch_id': 'count'}).reset_index()
    mean_rate = prop_click_rate.click_bool.mean()
    prop_click_rate['click_rate'] = prop_click_rate.apply(lambda x: rate(x, mean_rate), axis=1)
    return prop_click_rate[['prop_id', 'click_rate']]

def split_sets(df, test_size = 0.2, shuffle=False, downsample=False):
    '''
        Returns: X_train, X_test, Y_train, Y_test
    '''
    df_train, df_test = train_test_split(df.sort_values('srch_id'), test_size=test_size, shuffle=shuffle) 
    
    if downsample:
        df_train = downsample(df_train)
    
    X_train = df_train.drop(['click_bool','booking_bool'] , axis=1)
    X_train = X_train.fillna(X_train.mean())
    X_test = df_test.drop(['click_bool','booking_bool'] , axis=1)    
    X_test = X_test.fillna(X_test.mean())
    
    Y_train = df_train.click_bool + df_train.booking_bool * 4
    Y_test = df_test.click_bool + df_train.booking_bool * 4
    
    target_train = df_train.click_bool + 4 * df_train.booking_bool
    target_test = df_test.click_bool + 4 * df_test.booking_bool    
    
    return X_train, X_test, Y_train, Y_test, target_train, target_test

def cumulate_comp_scores(train_df):    
    # Cumulative competitor scores

    cols = [col for col in train_df.columns if col.endswith('rate_percent_diff')]
    cols2 = [col for col in train_df.columns if col.endswith('_rate')]
    cols3 = [col for col in train_df.columns if col.endswith('_inv')]

    df = train_df[cols.extend(cols2).extend(cols3).extend(['prop_id'])]
    df['comp_rate_percent_diff_cumm'] = 0

    for col1, col2, col3 in zip(cols, cols2, cols3):
        df['comp_rate_percent_diff_cumm'] += df[col1].fillna(0) * df[col2].fillna(0) * (1 - df[col3].fillna(0))
        
    df = train2.drop(cols, axis=1)
    df = train2.drop(cols2, axis=1)
    df = train2.drop(cols3, axis=1)
    return df

def avg_prop_features(train_main, test_main = None, predict_phase=False):
    cols = [
       'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'price_usd', 'promotion_flag',
       'srch_length_of_stay', 'srch_booking_window',
       'srch_adults_count', 'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool', 'srch_query_affinity_score',
       'orig_destination_distance', 'random_bool']
    df = train_main[cols].groupby('prop_id').agg(['mean', 'std']) # median?
    
    if predict_phase:
        df = df.merge(test_main[cols + ['srch_id']], on='prop_id')
    else:
        df = df.merge(train_main[cols + ['srch_id']], on='prop_id')   
        
    return df


In [17]:
def feature_eng(train_main):

    df = train_main[['prop_id', 'srch_id','date_time', 'click_bool', 'booking_bool']]        
    df = df.merge(avg_prop_features(train_main), on=['prop_id', 'srch_id'])    
    df = df.fillna(df.mean())
    return df.sort_values(['prop_id', 'srch_id'], ascending=[True, False])

def test_eng(test_main, train_main):
    df = avg_prop_features(train_main, test_main=test_main, predict_phase=True)  
    df = df.fillna(df.mean())
    return df.sort_values(['srch_id', 'prop_id'], ascending=[True, False])


# Evaluation

In [None]:
# Create main dataset
df = feature_eng(train_main)
df_train = df[df.date_time < '2013-05-30']
df_test = df[df.date_time >= '2013-05-30']

In [11]:

def dcg_at_k(r, k):
    r = np.asfarray(r)[:k]
    if r.size:
        return np.sum(np.subtract(np.power(2, r), 1) / np.log2(np.arange(2, r.size + 2)))
    return 0.


def ndcg_at_k(r, k):
    idcg = dcg_at_k(sorted(r, reverse=True), k)
    if not idcg:
        return 0.
    return dcg_at_k(r, k) / idcg

def eval_ndcg(df, srch_id_col, predict_col, value_col, n=9999999999999999):
    df = df.sort_values([srch_id_col, predict_col], ascending=False)
    k = 5
    ndcgs = []
    for i, srchid in enumerate(df[srch_id_col].unique()):
        if i == n:
            break
        if i % 10000 == 0 and i != 0:
            print(i)
            print(np.mean(ndcgs))
        r = df[df[srch_id_col] == srchid][value_col]
        ndcgs.append(ndcg_at_k(r,k))

    print(np.mean(ndcgs))


In [12]:
# select columns
cols = [x for x in train_main.columns if x.startswith('srch_')]
cols.extend([x for x in train_main.columns if x.startswith('prop_')])
cols.extend(['promotion_flag', 'random_bool', 'click_bool', 'booking_bool'])
#cols.extend(['price_usd','orig_destination_distance', 'promotion_flag'])
print(cols)

X = df[cols].fillna(0)
y = df.click_bool + 4 * train_main.booking_bool
X = X.drop(['click_bool','booking_bool'], axis=1)

['srch_id', 'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count', 'srch_children_count', 'srch_room_count', 'srch_saturday_night_bool', 'srch_query_affinity_score', 'prop_country_id', 'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool', 'prop_location_score1', 'prop_location_score2', 'prop_log_historical_price', 'promotion_flag', 'random_bool', 'click_bool', 'booking_bool']


# Nearest Neighbour

In [13]:
from sklearn.model_selection import train_test_split  
from sklearn.neighbors import KNeighborsClassifier  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)  


classifier = KNeighborsClassifier(n_neighbors=40)  
classifier.fit(X_train, y_train)  

y_pred = classifier.predict(X_test)

In [14]:
# evaluation
results = pd.DataFrame({'srch_id': X_test.srch_id, 'predicted': y_pred, 'actual' : y_test})
eval_score = eval_ndcg(results, 'srch_id', 'predicted', 'actual')

10000
0.12050288069333226
20000
0.12015395844716527
30000
0.12060896001984102
40000
0.11985276404280648
50000
0.1194777128841086
60000
0.119613321957053
70000
0.11910450314758046
80000
0.11953357783234969
90000
0.12031632283599615
100000
0.12014730607336503
110000
0.12038081076240482
120000
0.12052037711512295
130000
0.12033482431127475
140000
0.12050585591252275
150000
0.12078569984371915
160000
0.12089067519883538
170000
0.1209608444750801
180000
0.1208355358198804
190000
0.12109868428569842
0.12112854531983085
