# Module

In [1]:
import pandas as pd
import os
import numpy as np

import xgboost as xgb
from xgboost import XGBRegressor

# Config 

In [2]:
# Toggle for quick iteration: when True, the loading cell reads only the
# first `debug_num` rows of each CSV.
debug = False
debug_num = 100

# Input and output locations, relative to the notebook's working directory.
data_dir, result_dir = '../raw_data/', '../result/'

# Column names used in the exported submission file.
submission_cols = ['SearchId', 'PropertyId']


# Data info 

In [3]:
# Read the train/test CSVs. In debug mode only the first `debug_num` rows are
# loaded; nrows=None (the read_csv default) reads the whole file.
row_limit = debug_num if debug else None
train_df = pd.read_csv(os.path.join(data_dir, 'train.csv'), nrows=row_limit)
test_df = pd.read_csv(os.path.join(data_dir, 'test.csv'), nrows=row_limit)

In [4]:
# Report both shapes and the columns that exist in only one of the two frames.
train_cols, test_cols = set(train_df.columns), set(test_df.columns)
print(f"train dimension: {train_df.shape}")
print(f"test dimension: {test_df.shape}")
print(f"Unique cols for train_df: {train_cols - test_cols}")
print(f"Unique cols for test_df: {test_cols - train_cols}")

train dimension: (9917530, 54)
test dimension: (6622629, 50)
Unique cols for train_df: {'gross_bookings_usd', 'position', 'click_bool', 'booking_bool'}
Unique cols for test_df: set()


In [5]:
# Print a summary of train_df: index range, column names and dtypes.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9917530 entries, 0 to 9917529
Data columns (total 54 columns):
 #   Column                       Dtype  
---  ------                       -----  
 0   srch_id                      int64  
 1   date_time                    object 
 2   site_id                      int64  
 3   visitor_location_country_id  int64  
 4   visitor_hist_starrating      float64
 5   visitor_hist_adr_usd         float64
 6   prop_country_id              int64  
 7   prop_id                      int64  
 8   prop_starrating              int64  
 9   prop_review_score            float64
 10  prop_brand_bool              int64  
 11  prop_location_score1         float64
 12  prop_location_score2         float64
 13  prop_log_historical_price    float64
 14  position                     int64  
 15  price_usd                    float64
 16  promotion_flag               int64  
 17  srch_destination_id          int64  
 18  srch_length_of_stay          int64  
 19  

In [6]:
# Preview the first rows of the test set.
test_df.head()

Unnamed: 0,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,...,comp5_rate_percent_diff,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff
0,2,2012-11-05 21:28:38,15,55,,,98,3105,3,2.0,...,,,,,,,,,,
1,2,2012-11-05 21:28:38,15,55,,,98,6399,3,0.0,...,,,,,,,,,,
2,2,2012-11-05 21:28:38,15,55,,,98,7374,4,3.5,...,,,,,,,,,,
3,2,2012-11-05 21:28:38,15,55,,,98,7771,3,4.5,...,,,,,,,,,,
4,2,2012-11-05 21:28:38,15,55,,,98,12938,3,0.0,...,,,,,,,,,,


# Random Recommendation

## Train 

In [7]:
# Baseline: give every (search, property) pair in the test set a uniform
# random score in [0, 1); sorting by this score later yields a random ranking.
id_cols = ['srch_id', 'prop_id']
random_rec = test_df[id_cols].rename(columns=dict(zip(id_cols, submission_cols)))

# Seed the generator so the baseline (and the saved submission) is
# reproducible under Restart Kernel -> Run All.
rng = np.random.default_rng(42)
random_rec['predicted'] = rng.random(len(random_rec))

In [8]:
# Print a summary of random_rec: columns, dtypes and memory usage.
random_rec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6622629 entries, 0 to 6622628
Data columns (total 3 columns):
 #   Column      Dtype  
---  ------      -----  
 0   SearchId    int64  
 1   PropertyId  int64  
 2   predicted   float64
dtypes: float64(1), int64(2)
memory usage: 151.6 MB


## Evaluation 

# Save result

In [9]:
# Start the submission frame from the test-set key columns, renamed to the
# submission column names.
key_cols = ['srch_id', 'prop_id']
rename_map = dict(zip(key_cols, submission_cols))
result_df = test_df[key_cols].rename(columns=rename_map)

In [10]:
# Attach the random score to each (SearchId, PropertyId) row.
# Selecting only the key columns first keeps this cell idempotent: re-running
# it would otherwise merge onto a frame that already has `predicted`, producing
# `predicted_x` / `predicted_y` columns. `validate` makes duplicate keys raise
# a MergeError instead of silently multiplying rows.
result_df = result_df[submission_cols].merge(
    random_rec, how='left', on=submission_cols, validate='one_to_one'
)

In [16]:
# Sanity check: the left merge must not have added or dropped rows.
assert len(result_df) == len(random_rec)

In [11]:
# Order rows by search id, and within each search by descending score.
result_df = result_df.sort_values(
    by=['SearchId', 'predicted'],
    ascending=[True, False],
)

In [12]:
# Preview the top-ranked rows after sorting.
result_df.head()

Unnamed: 0,SearchId,PropertyId,predicted
12,2,131173,0.957494
0,2,3105,0.945264
8,2,30434,0.880982
1,2,6399,0.803069
6,2,26540,0.756882


In [17]:
# Write the ranked (SearchId, PropertyId) pairs to disk; the score column is
# dropped and only the row order carries the ranking.
# Create the output directory first so to_csv does not fail when it is missing.
os.makedirs(result_dir, exist_ok=True)
file_name = os.path.join(result_dir, '0429_RandomRec.csv')
result_df[submission_cols].to_csv(file_name, index=False)

In [18]:
# Show the first lines of the written CSV (header plus top-ranked rows).
! head {file_name}

SearchId,PropertyId
2,131173
2,3105
2,30434
2,6399
2,26540
2,7374
2,91899
2,37331
2,12938


In [19]:
# Count lines in the written CSV; this includes the header line, so the
# expected value is the number of test rows plus one.
! cat {file_name} | wc -l

 6622630
