# Module

In [1]:
import pandas as pd
import os
import numpy as np

import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

# Config 

In [2]:
debug = False
# if debug:
debug_num = 100
# else:
#     sample_num = -1
data_dir = '../raw_data/'
result_dir = '../result/'

submission_cols =  ['SearchId','PropertyId']
search_id = 'srch_id'
prop_id = 'prop_id'
regression_label = 'label'


In [3]:
# train_df.columns

# Data info 

In [4]:
if debug:
    train_df = pd.read_csv(os.path.join(data_dir, 'train.csv'), nrows=debug_num)
    test_df = pd.read_csv(os.path.join(data_dir, 'test.csv'), nrows=debug_num)
else:
    train_df = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    test_df = pd.read_csv(os.path.join(data_dir, 'test.csv'))

In [5]:
print(f"train dimension: {train_df.shape}")
print(f"test dimension: {test_df.shape}")
print(f"Unique cols for train_df: {set(train_df.columns)-set(test_df.columns)}")
print(f"Unique cols for test_df: {set(test_df.columns)-set(train_df.columns)}")

train dimension: (9917530, 54)
test dimension: (6622629, 50)
Unique cols for train_df: {'click_bool', 'gross_bookings_usd', 'booking_bool', 'position'}
Unique cols for test_df: set()


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9917530 entries, 0 to 9917529
Data columns (total 54 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   srch_id                      int64  
 1   date_time                    object 
 2   site_id                      int64  
 3   visitor_location_country_id  int64  
 4   visitor_hist_starrating      float64
 5   visitor_hist_adr_usd         float64
 6   prop_country_id              int64  
 7   prop_id                      int64  
 8   prop_starrating              int64  
 9   prop_review_score            float64
 10  prop_brand_bool              int64  
 11  prop_location_score1         float64
 12  prop_location_score2         float64
 13  prop_log_historical_price    float64
 14  position                     int64  
 15  price_usd                    float64
 16  promotion_flag               int64  
 17  srch_destination_id          int64  
 18  srch_length_of_stay          int64  
 19  

In [7]:
test_df.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff
0,2,2012-11-05 21:28:38,15,55,,,98,3105,3,2.0,...,,,,,,,,,,
1,2,2012-11-05 21:28:38,15,55,,,98,6399,3,0.0,...,,,,,,,,,,
2,2,2012-11-05 21:28:38,15,55,,,98,7374,4,3.5,...,,,,,,,,,,
3,2,2012-11-05 21:28:38,15,55,,,98,7771,3,4.5,...,,,,,,,,,,
4,2,2012-11-05 21:28:38,15,55,,,98,12938,3,0.0,...,,,,,,,,,,


# Functions

In [25]:
def save_submission(rec_df, file_name: str):
    result_df = test_df[['srch_id', 'prop_id']].rename(columns=dict(zip(['srch_id', 'prop_id'], submission_cols)))
    result_df = result_df.merge(rec_df, how='left', on=submission_cols)
    assert result_df.shape[0] == rec_df.shape[0]
    result_df = result_df.sort_values(['SearchId', "predicted"], ascending=[True, False])
    file_name = os.path.join(result_dir, file_name)
    result_df[submission_cols].to_csv(file_name, index=False)
    print(f"shape: {result_df[submission_cols].shape}")
    print(result_df.head())
    print(result_df[submission_cols].head())
    

# Random Recommenation 

## Train 

In [7]:
random_rec = test_df[['srch_id', 'prop_id']].rename(columns=dict(zip(['srch_id', 'prop_id'], submission_cols)))
random_rec['predicted'] = np.random.rand(random_rec.shape[0])

In [8]:
random_rec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6622629 entries, 0 to 6622628
Data columns (total 3 columns):
 #   Column      Dtype  
---  ------      -----  
 0   SearchId    int64  
 1   PropertyId  int64  
 2   predicted   float64
dtypes: float64(1), int64(2)
memory usage: 151.6 MB


## Evaluation 

# XGBRegressor

## Config 

In [9]:
test_size = 0.2

In [10]:
# train_df['click_bool']

In [11]:
feature_col = test_df.columns

feature_col = feature_col.drop(search_id)
feature_col = feature_col.drop(prop_id)
feature_col = feature_col.drop('date_time')
feature_col

Index(['site_id', 'visitor_location_country_id', 'visitor_hist_starrating',
       'visitor_hist_adr_usd', 'prop_country_id', 'prop_starrating',
       'prop_review_score', 'prop_brand_bool', 'prop_location_score1',
       'prop_location_score2', 'prop_log_historical_price', 'price_usd',
       'promotion_flag', 'srch_destination_id', 'srch_length_of_stay',
       'srch_booking_window', 'srch_adults_count', 'srch_children_count',
       'srch_room_count', 'srch_saturday_night_bool',
       'srch_query_affinity_score', 'orig_destination_distance', 'random_bool',
       'comp1_rate', 'comp1_inv', 'comp1_rate_percent_diff', 'comp2_rate',
       'comp2_inv', 'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv',
       'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv',
       'comp4_rate_percent_diff', 'comp5_rate', 'comp5_inv',
       'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv',
       'comp6_rate_percent_diff', 'comp7_rate', 'comp7_inv',
       'comp7_rate_percent_diff', 'comp

In [12]:
def feature_engineering(df):
    return df

def get_label(row):
    if row['booking_bool'] == 1:
        return 5
    elif row['click_bool'] == 1:
        return 2
    else:
        return 0

In [13]:
train_df.columns


Index(['srch_id', 'date_time', 'site_id', 'visitor_location_country_id',
       'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id',
       'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'position', 'price_usd', 'promotion_flag',
       'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_adults_count', 'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool', 'srch_query_affinity_score',
       'orig_destination_distance', 'random_bool', 'comp1_rate', 'comp1_inv',
       'comp1_rate_percent_diff', 'comp2_rate', 'comp2_inv',
       'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv',
       'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv',
       'comp4_rate_percent_diff', 'comp5_rate', 'comp5_inv',
       'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv',
       'comp6_rate_percent_diff', 'comp7_rate'

In [14]:
train_df_y = train_df.apply(lambda row: get_label(row), axis=1)

In [15]:
cleaned_train_df = feature_engineering(train_df)
cleaned_test_df = feature_engineering(test_df)

In [16]:
cleaned_train_df[feature_col].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9917530 entries, 0 to 9917529
Data columns (total 47 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   site_id                      int64  
 1   visitor_location_country_id  int64  
 2   visitor_hist_starrating      float64
 3   visitor_hist_adr_usd         float64
 4   prop_country_id              int64  
 5   prop_starrating              int64  
 6   prop_review_score            float64
 7   prop_brand_bool              int64  
 8   prop_location_score1         float64
 9   prop_location_score2         float64
 10  prop_log_historical_price    float64
 11  price_usd                    float64
 12  promotion_flag               int64  
 13  srch_destination_id          int64  
 14  srch_length_of_stay          int64  
 15  srch_booking_window          int64  
 16  srch_adults_count            int64  
 17  srch_children_count          int64  
 18  srch_room_count              int64  
 19  

In [17]:
train_X, eval_X, train_y, eval_y = train_test_split(cleaned_train_df[feature_col], train_df_y)

In [18]:
assert train_X.shape[1] == eval_X.shape[1]

In [20]:
model_params = {
    'learning_rate':0.05
    , 'n_estimators': 1000
#     'learning_rate':0.01
#     , 'n_estimators': 1000
# , 'max_depth': 8
#     , 'min_child_weight': 0.5
#     , 'gamma':1
#     , 'colsample_bytree': 0.9
#     , 'subsample': 0.9
#     , 'reg_alpha': 1

}

In [27]:
model = XGBRegressor(**model_params)

In [29]:
model

XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None, gamma=None,
             gpu_id=None, importance_type='gain', interaction_constraints=None,
             learning_rate=0.05, max_delta_step=None, max_depth=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             n_estimators=1000, n_jobs=None, num_parallel_tree=None,
             random_state=None, reg_alpha=None, reg_lambda=None,
             scale_pos_weight=None, subsample=None, tree_method=None,
             validate_parameters=None, verbosity=None)

In [None]:
model.fit(X=train_X, y=train_y, verbose=True, eval_metric=['mae']
                         , eval_set=[[train_X, train_y], [eval_X, eval_y]], early_stopping_rounds=20)

[0]	validation_0-mae:0.62056	validation_1-mae:0.61228
[1]	validation_0-mae:0.59756	validation_1-mae:0.59925
[2]	validation_0-mae:0.58276	validation_1-mae:0.58393
[3]	validation_0-mae:0.57361	validation_1-mae:0.57210
[4]	validation_0-mae:0.56173	validation_1-mae:0.55943
[5]	validation_0-mae:0.54533	validation_1-mae:0.54754
[6]	validation_0-mae:0.53595	validation_1-mae:0.53616
[7]	validation_0-mae:0.52736	validation_1-mae:0.52563
[8]	validation_0-mae:0.51627	validation_1-mae:0.51538
[9]	validation_0-mae:0.50399	validation_1-mae:0.50610
[10]	validation_0-mae:0.49537	validation_1-mae:0.49650
[11]	validation_0-mae:0.48889	validation_1-mae:0.48791
[12]	validation_0-mae:0.48171	validation_1-mae:0.47961
[13]	validation_0-mae:0.47313	validation_1-mae:0.47176
[14]	validation_0-mae:0.46381	validation_1-mae:0.46435
[15]	validation_0-mae:0.45601	validation_1-mae:0.45737
[16]	validation_0-mae:0.44936	validation_1-mae:0.45044
[17]	validation_0-mae:0.44409	validation_1-mae:0.44401
[18]	validation_0-ma

In [None]:
xgb_rec = test_df[['srch_id', 'prop_id']].rename(columns=dict(zip(['srch_id', 'prop_id'], submission_cols)))
xgb_rec['predicted'] = model.predict(cleaned_test_df[feature_col])

In [None]:
xgb_rec.head()

## Save 

In [None]:
save_submission(rec_df=xgb_rec, file_name='0429_xgb.csv')