In [1]:

import numpy as np
import pandas as pd
import os
%matplotlib inline
import matplotlib.pyplot as plt 
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from tqdm.notebook import tqdm
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, RepeatedStratifiedKFold
from sklearn import metrics
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
# Never truncate long cell contents when displaying frames. `None` means
# "no limit"; the former `-1` spelling was deprecated in pandas 1.0 and is
# rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
# Register `progress_apply` on pandas objects.
tqdm.pandas()

# Seed shared by the train/validation split and the seeded estimators.
SEED=32
# CSV that supplies the "Uniq Id" column for the submission.
submission_file = "../Test.csv"
# CSV holding the engineered feature table loaded into `df`.
file = "../features.csv"
# NOTE(review): presumably a reference RMSLE to beat; it is not used in the
# visible cells -- confirm or remove.
BASELINE = 0.2364157534206991



In [2]:
# Load the engineered feature table from `file`. Rows whose target is NaN are
# later split off as the test set, so this frame holds labeled and unlabeled
# rows together.
df = pd.read_csv(file)
# df.set_index("Uniq Id", drop=True, inplace=True)
# Preview the first rows.
df.head()

Unnamed: 0,Uniq Id,Package Type,Start City,Flight Stops,Meals,Cancellation Rules,Per Person Price,destination_count,airlines_count,sightseeing_count,...,package_type_price_1000,start_city_price_100,start_city_price_1000,PT_AC,PT_DC,PT_SC,start_city_mean,start_city_sum,package_type_mean,package_type_sum
0,349e0d65d7c0cd0ff250060d9df3d085,3,0,0,5,0,38311.0,4,2,23,...,38311.0,38311.0,38311.0,21,24,6,20811.841682,151239653.5,23965.407006,133750936.5
1,179863780585016c62d8a7c274793076,4,1,2,3,5,9823.5,2,1,10,...,9823.5,9823.5,9823.5,26,31,9,19661.128996,270006284.5,16354.089081,92711331.0
2,a0b099bcbc4e86c5505dc69464676d42,1,1,1,3,9,2868.0,1,1,1,...,2868.0,6345.75,6345.75,4,5,3,19661.128996,270006284.5,18409.508825,110567510.0
3,3343f7bdcd11e2571c07ada0ca9ef968,3,1,0,5,9,21918.5,1,1,6,...,30114.75,11536.666667,11536.666667,20,21,7,19661.128996,270006284.5,23965.407006,133750936.5
4,f7d70956ff8c3b8f39d213958f89bf2d,3,0,0,5,0,13827.5,1,1,5,...,24685.666667,26069.25,26069.25,20,21,6,20811.841682,151239653.5,23965.407006,133750936.5


In [3]:
# Columns treated as categorical (passed to CatBoost as `cat_features`).
cat_cols = (
    ['Package Type', 'Start City', 'Cancellation Rules']
    + [f'airline_{idx}' for idx in range(5)]
    + ['PT_AC', 'PT_DC', 'PT_SC']
)

# Numeric columns (these are the ones rescaled by MinMaxScaler).
# Deliberately left out, as in the original hand-written list:
#   'hotel_details_8', 'hotel_details_9',
#   'package_type_price_100', 'package_type_price_1000',
#   'start_city_price_100', 'start_city_price_1000'
# NOTE(review): 'sight_seeing_places_covered_2' is also absent -- confirm this
# is intentional and not a typo.
reg_cols = (
    [
        'Flight Stops', 'Meals', 'destination_count',
        'airlines_count', 'sightseeing_count', 'hotel_details_count',
        'place_covered_count', 'itinerary_count', 'package_name_length',
    ]
    + [f'package_name_{idx}' for idx in range(10)]
    + [f'destination_{idx}' for idx in range(4)]
    + [f'places_covered_{idx}' for idx in range(4)]
    + [f'sight_seeing_places_covered_{idx}' for idx in (0, 1, 3, 4)]
    + [f'itinerary_{idx}' for idx in range(4)]
    + [f'hotel_details_{idx}' for idx in range(8)]
    + [
        'Itinerary_night_count', 'start_city_mean',
        'start_city_sum', 'package_type_mean', 'package_type_sum',
    ]
)

# Regression target.
target_col = 'Per Person Price'

# Sanity check: number of model columns plus the target.
len(cat_cols) + len(reg_cols) + 1

60

In [4]:
# Rescale every numeric feature column with a freshly fitted MinMaxScaler,
# overwriting the columns in `df`.
# NOTE(review): the scaler is fit on the whole frame -- before the
# train/validation split and while the unlabeled test rows are still part of
# `df` -- so validation/test statistics influence the scaling. Consider fitting
# on the training rows only.
df[reg_cols] = MinMaxScaler().fit_transform(df[reg_cols])
# df[reg_cols]

In [5]:
# Rows without a target form the test frame to be predicted.
tdf = df.loc[df[target_col].isna()].copy()
# The labeled rows are split 80/20 into training (`df`) and validation (`vdf`).
# Note that `df` is rebound to the training part here, so re-running this cell
# splits the already reduced frame again.
df, vdf = train_test_split(
    df.loc[df[target_col].notna()].copy(),
    test_size=.2,
    random_state=SEED,
)


In [6]:


class CatBoost(object):
    """CatBoost regressor tuned with CatBoost's built-in grid search.

    The estimator is created with GPU training (``task_type="GPU"``,
    ``devices='0:1'``), MSLE as evaluation metric, the module-level
    ``cat_cols`` as categorical features and the iteration-based overfitting
    detector (``od_type='Iter'``, ``od_wait=10``).
    """

    def __init__(self):
        self.model = CatBoostRegressor(random_state =SEED, task_type="GPU", devices='0:1', 
                           eval_metric='MSLE', thread_count=8, 
                           cat_features=cat_cols,
                           od_type='Iter', od_wait=10)
    
    def fit(self, dframe, parameters=None):
        """Grid-search the model on ``dframe`` and return the refitted estimator.

        Args:
            dframe: DataFrame containing ``cat_cols + reg_cols`` and ``target_col``.
            parameters: optional parameter grid (name -> list of values). When
                omitted or empty, the default grid below is used, so existing
                ``fit(dframe)`` callers behave as before. Mirrors the
                ``parameters`` argument of ``XGBoost.fit``.

        Returns:
            The underlying ``CatBoostRegressor`` (``refit=True`` is requested
            from the search).
        """
        grid = parameters if parameters else {
            'max_depth': [6, 8, 10, 12], 
            'iterations': [100, 200],
            'learning_rate': [.1, .2, .3]
        }
        grid_search_result= self.model.grid_search(
                                grid, 
                               X=dframe[cat_cols + reg_cols], 
                               y=dframe[target_col], 
                               shuffle=True, stratified=False, 
                               verbose=False, plot=False,  refit=True
                              )
        # Report the winning parameter combination.
        print(grid_search_result['params'])
        return self.model
    



In [7]:

def rmsle(true, preds):
    """Root mean squared logarithmic error of ``preds`` against ``true``.

    Computed as the square root of scikit-learn's
    ``mean_squared_log_error(true, preds)``.
    """
    msle = mean_squared_log_error(true, preds)
    return np.sqrt(msle)

# Wrap RMSLE as a scikit-learn scorer. With greater_is_better=False the scorer
# reports the negated RMSLE, so "larger is better" holds during model selection.
scorer = make_scorer(rmsle, greater_is_better=False)
# Unshuffled 3-fold splitter used by XGBoost.fit's grid search.
# `random_state` is intentionally not passed: it has no effect when
# shuffle=False, and scikit-learn >= 0.24 raises ValueError for that
# combination. The resulting splits are unchanged.
fold = KFold(n_splits=3, shuffle=False)

def evaluate_model(m, value=None):
    """Score a fitted model on the validation frame ``vdf`` with RMSLE.

    Args:
        m: fitted estimator exposing ``predict``.
        value: when truthy, the features are passed to ``predict`` as a NumPy
            array (``.values``); otherwise the DataFrame itself is passed.

    Returns:
        RMSLE between ``vdf[target_col]`` and the model's predictions.
    """
    features = vdf[cat_cols + reg_cols]
    model_input = features.values if value else features
    return rmsle(vdf[target_col], m.predict(model_input))




In [8]:
class RandomForest(object):
    """Random forest regressor tuned with scikit-learn's ``GridSearchCV``."""

    def __init__(self):
        self.model = RandomForestRegressor(random_state=SEED)
    
    def fit(self, dframe, parameters=None):
        """Grid-search the forest on ``dframe`` and return the fitted search.

        Args:
            dframe: DataFrame containing ``cat_cols + reg_cols`` and ``target_col``.
            parameters: optional parameter grid (name -> list of values). When
                omitted or empty, the default grid below is used, so existing
                ``fit(dframe)`` callers behave as before. Mirrors the
                ``parameters`` argument of ``XGBoost.fit``.

        Returns:
            The fitted ``GridSearchCV`` object (refit on the best candidate).
        """
        parameters = parameters if parameters else {
            'oob_score': [False],
            'n_estimators': [800, 900 ,1000]
        }
        clf = GridSearchCV(self.model, parameters, cv=3, refit=True, n_jobs=-1, verbose=3, scoring=scorer)
        # Features are passed as a plain array, so categorical codes are
        # treated as ordinary numeric inputs.
        clf.fit(dframe[cat_cols + reg_cols].values, dframe[target_col].values)
        print(self.__class__, clf.best_params_)
        return clf

class XGBoost(object):
    """XGBoost regressor tuned with scikit-learn's ``GridSearchCV``."""

    def __init__(self):
        self.model = xgb.XGBRegressor(colsample_bytree=0.8)

    def fit(self, dframe, parameters=None):
        """Grid-search the booster on ``dframe``.

        Args:
            dframe: DataFrame containing ``cat_cols + reg_cols`` and ``target_col``.
            parameters: optional parameter grid (name -> list of values); the
                default grid below is used when it is omitted or empty.

        Returns:
            Tuple ``(search, best_params)``: the fitted ``GridSearchCV`` object
            and its ``best_params_`` dict.
        """
        if not parameters:
            parameters = {
                'max_depth': [12, 13, 14],
                'min_child_weight': [4, 5, 6],
                'learning_rate': [.01],
                'n_estimators': [2000, 2200],
                'gamma': [0.],
                'subsample': [.8],
            }

        # The search is run on plain arrays rather than DataFrames.
        features = dframe[cat_cols + reg_cols].values
        target = dframe[target_col].values

        search = GridSearchCV(
            estimator=self.model,
            param_grid=parameters,
            scoring=scorer,
            cv=fold,
            refit=True,
            n_jobs=-1,
            verbose=10,
        )
        search.fit(features, target)
        print(self.__class__, search.best_params_)
        return search, search.best_params_

In [9]:
# Disabled alternative: CatBoost grid search, scored on the DataFrame input.
# model = CatBoost().fit(df)
# print(model.__class__, evaluate_model(model, value=False))
# model = XGBoost().fit(df)

# Active experiment: XGBoost grid search on the training split. fit() returns
# the fitted GridSearchCV (bound to `model`) and its best_params_ dict (`grid`).
# NOTE(review): the recorded output below reports 216 candidates, while the
# default grid in XGBoost.fit expands to 3*3*1*2*1*1 = 18 -- the saved output
# appears to come from an earlier, larger grid; re-run to refresh it.
model, grid = XGBoost().fit(df)

# RMSLE on the validation split; value=True passes the features as a NumPy
# array, matching how XGBoost.fit trains on `.values`.
print(evaluate_model(model, value=True))
# Disabled alternatives: fixed-parameter XGBoost runs and a RandomForest search.
# NOTE(review): XGBoost.fit returns a (search, best_params) tuple, so the
# XGBoost snippets need `model, _ = ...` before being re-enabled.
# model = XGBoost().fit(df, 
#                       {'learning_rate': [0.1], 'max_depth': [9], 'min_child_weight': [6], 'n_estimators': [1100]})

# print(1100, evaluate_model(model, value=True))
# model = XGBoost().fit(df, 
#                       {'learning_rate': [0.1], 'max_depth': [9], 'min_child_weight': [6], 'n_estimators': [1200]})

# print(1200, evaluate_model(model, value=True))

# model = RandomForest().fit(df)

# print(model.__class__, evaluate_model(model, value=True))


Fitting 3 folds for each of 216 candidates, totalling 648 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  9.8min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed: 13.1min
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed: 16.5min
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed: 19.7min
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed: 23.9min
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed: 27.1min
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed: 33.2min
[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed: 39.4min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed: 45.9min
[Parallel(n_jobs=-1)]: Done 157 tasks      | elapsed: 55.0min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed: 62.3min
[Parallel(n_jobs=-1)]: Done 197 tasks      | elapsed: 6

<class '__main__.XGBoost'> {'gamma': 0.0, 'learning_rate': 0.01, 'max_depth': 12, 'min_child_weight': 6, 'n_estimators': 2000, 'subsample': 0.8}
0.20904143144437182


In [10]:
# Tabulate every grid-search candidate, best rank first, and label each row
# with its hyperparameter values joined by "_". The index is named 'kernel'
# even though the labels are joined parameter values.
results_df = (
    pd.DataFrame(model.cv_results_)
    .sort_values(by=['rank_test_score'])
    .pipe(
        lambda frame: frame.set_index(
            frame["params"].map(lambda combo: "_".join(map(str, combo.values())))
        )
    )
    .rename_axis('kernel')
)
# Show only the columns needed to compare candidates.
results_df[['params', 'rank_test_score', 'mean_test_score', 'std_test_score']]

Unnamed: 0_level_0,params,rank_test_score,mean_test_score,std_test_score
kernel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.8_0.01_12_6_2000_0.8,"{'gamma': 0.8, 'learning_rate': 0.01, 'max_depth': 12, 'min_child_weight': 6, 'n_estimators': 2000, 'subsample': 0.8}",1,-0.215434,0.001333
0.0_0.01_12_6_2000_0.8,"{'gamma': 0.0, 'learning_rate': 0.01, 'max_depth': 12, 'min_child_weight': 6, 'n_estimators': 2000, 'subsample': 0.8}",1,-0.215434,0.001333
0.5_0.01_12_6_2000_0.8,"{'gamma': 0.5, 'learning_rate': 0.01, 'max_depth': 12, 'min_child_weight': 6, 'n_estimators': 2000, 'subsample': 0.8}",1,-0.215434,0.001333
0.0_0.01_12_8_2000_0.8,"{'gamma': 0.0, 'learning_rate': 0.01, 'max_depth': 12, 'min_child_weight': 8, 'n_estimators': 2000, 'subsample': 0.8}",4,-0.215713,0.000965
0.8_0.01_12_8_2000_0.8,"{'gamma': 0.8, 'learning_rate': 0.01, 'max_depth': 12, 'min_child_weight': 8, 'n_estimators': 2000, 'subsample': 0.8}",4,-0.215713,0.000965
...,...,...,...,...
0.0_0.01_9_12_900_0.5,"{'gamma': 0.0, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 12, 'n_estimators': 900, 'subsample': 0.5}",211,-0.229124,0.002003
0.8_0.01_9_12_900_0.5,"{'gamma': 0.8, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 12, 'n_estimators': 900, 'subsample': 0.5}",211,-0.229124,0.002003
0.8_0.01_9_10_900_0.5,"{'gamma': 0.8, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 10, 'n_estimators': 900, 'subsample': 0.5}",214,-0.229172,0.001801
0.5_0.01_9_10_900_0.5,"{'gamma': 0.5, 'learning_rate': 0.01, 'max_depth': 9, 'min_child_weight': 10, 'n_estimators': 900, 'subsample': 0.5}",214,-0.229172,0.001801


In [None]:
# Display the best hyperparameter dict returned by XGBoost.fit.
grid

In [14]:
# Read only the id column of the submission template; predictions are
# attached to this single-column frame later.
result = pd.read_csv(submission_file, usecols=['Uniq Id'])
result

Unnamed: 0,Uniq Id
0,12c47aefbcca65d6156ba0bc7946704b
1,156febf7d27315ae9f210ed13bca3ac2
2,a7f43314e3ee6464a4e35b874bddf2ea
3,d9cc4b407f3d5d290560c03d9ac71035
4,b3bbc8aa6429feb260e9ac0a91f252c4
...,...
8995,75171484ea20440a5c08fe040a9a01a7
8996,16f57cebfb653f2c532314322598ca3a
8997,477f8b3f07e1a6c027152e2cd5d67e4a
8998,c9ce916ec5e1b93134a2f84090284bf9


In [15]:
# Refit on training + validation rows using only the winning hyperparameters.
# GridSearchCV expects list-valued grids, so each best value is wrapped in a
# one-element list (a single candidate, still cross-validated on `fold`).
model, grid = XGBoost().fit(
    pd.concat([df, vdf]),
    {name: [best_value] for name, best_value in grid.items()},
)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  3.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  3.9min finished


<class '__main__.XGBoost'> {'gamma': 0.0, 'learning_rate': 0.01, 'max_depth': 12, 'min_child_weight': 6, 'n_estimators': 2000, 'subsample': 0.8}


In [16]:
# Predict all test rows in one batch instead of filtering `tdf` and calling
# `model.predict` once per id (a full scan of `tdf` for each of the ids).
# - drop_duplicates keeps the first row per id, matching the previous `[0]`
#   (first match) behaviour;
# - .loc[...] puts the feature rows in the same order as `result`, and raises
#   KeyError if an id from the submission file is missing in `tdf`.
test_features = (
    tdf.drop_duplicates(subset='Uniq Id')
    .set_index('Uniq Id')
    .loc[result['Uniq Id'], cat_cols + reg_cols]
)
result[target_col] = model.predict(test_features.values)
result.head()


HBox(children=(FloatProgress(value=0.0, max=9000.0), HTML(value='')))




Unnamed: 0,Uniq Id,Per Person Price
0,12c47aefbcca65d6156ba0bc7946704b,19357.810547
1,156febf7d27315ae9f210ed13bca3ac2,9841.235352
2,a7f43314e3ee6464a4e35b874bddf2ea,5316.912598
3,d9cc4b407f3d5d290560c03d9ac71035,7477.385254
4,b3bbc8aa6429feb260e9ac0a91f252c4,21995.652344


In [17]:
# Write the predictions to submission.csv without the row index. The
# "Uniq Id" column is dropped first, so only the remaining column(s) of
# `result` (the predicted target) are saved.
# NOTE(review): confirm the expected submission format really omits the id.
result.drop(columns=["Uniq Id"]).to_csv("submission.csv", index=False)

In [18]:
# Imported here rather than in the top import cell; FileLink is only used for
# the link below.
from IPython.display import FileLink

# Render a link to the generated submission file.
FileLink('submission.csv')

In [19]:
# Summary statistics of the non-missing target values in `df` (the training
# split at this point), for comparison with the predicted distribution.
df[target_col].dropna().describe()

count    16800.000000 
mean     20059.856905 
std      11608.712685 
min      791.000000   
25%      12482.500000 
50%      17762.250000 
75%      25127.375000 
max      171062.500000
Name: Per Person Price, dtype: float64

In [20]:
# Summary statistics of the predicted target values written to the submission.
result[target_col].describe()

count    9000.000000  
mean     19923.380155 
std      9798.718768  
min      2096.453857  
25%      13348.028564 
50%      18172.498047 
75%      24677.121094 
max      128227.007812
Name: Per Person Price, dtype: float64