In [1]:

import numpy as np
import pandas as pd
import os
%matplotlib inline
import matplotlib.pyplot as plt 
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from tqdm.notebook import tqdm
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, RepeatedStratifiedKFold
from sklearn import metrics
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer

tqdm.pandas()

SEED=32
submission_file = "../Test.csv"
file = "../features.csv"
BASELINE = 0.2364157534206991

In [2]:
df = pd.read_csv(file)
# df.set_index("Uniq Id", drop=True, inplace=True)
df.head()

Unnamed: 0,Uniq Id,Package Type,Start City,Flight Stops,Meals,Cancellation Rules,Per Person Price,destination_count,airlines_count,sightseeing_count,...,hotel_details_7,hotel_details_8,hotel_details_9,airline_0,airline_1,airline_2,airline_3,airline_4,package_type_price,start_city_price
0,349e0d65d7c0cd0ff250060d9df3d085,3,0,0,5,0,38311.0,4,2,23,...,0.029675,-0.027695,0.02309,37,37,37,26,26,38311.0,38311.0
1,179863780585016c62d8a7c274793076,4,1,2,3,5,9823.5,2,1,10,...,0.030878,0.048031,0.027848,37,37,37,37,35,9823.5,9823.5
2,a0b099bcbc4e86c5505dc69464676d42,1,1,1,3,9,2868.0,1,1,1,...,0.041032,0.064535,-0.143252,37,37,37,37,35,2868.0,6345.75
3,3343f7bdcd11e2571c07ada0ca9ef968,3,1,0,5,9,21918.5,1,1,6,...,-0.007607,0.058315,-0.075154,37,37,37,37,24,30114.75,11536.666667
4,f7d70956ff8c3b8f39d213958f89bf2d,3,0,0,5,0,13827.5,1,1,5,...,0.070179,0.022961,-0.11645,37,37,37,37,24,24685.666667,26069.25


In [3]:
cat_cols = ['Package Type', 'Start City', 'Cancellation Rules',
           'airline_0', 'airline_1', 'airline_2', 'airline_3',
       'airline_4']
reg_cols = ['Flight Stops', 'Meals', 'destination_count',
       'airlines_count', 'sightseeing_count', 'hotel_details_count',
       'place_covered_count', 'itinerary_count', 'package_name_length',
           'package_name_0', 'package_name_1', 'package_name_2', 'package_name_3',
       'package_name_4', 'package_name_5', 'package_name_6', 'package_name_7',
       'package_name_8', 'package_name_9', 'destination_0', 'destination_1',
       'destination_2', 'destination_3', 'places_covered_0',
       'places_covered_1', 'places_covered_2', 'places_covered_3',
       'sight_seeing_places_covered_0', 'sight_seeing_places_covered_1',
       'sight_seeing_places_covered_3', 'sight_seeing_places_covered_4',
       'itinerary_0', 'itinerary_1', 'itinerary_2', 'itinerary_3',
       'hotel_details_0', 'hotel_details_1', 'hotel_details_2',
       'hotel_details_3', 'hotel_details_4', 'hotel_details_5',
       'hotel_details_6', 'hotel_details_7', 'hotel_details_8',
       'hotel_details_9']
#             'package_type_price', 'start_city_price']

target_col = 'Per Person Price'

len(cat_cols) + len(reg_cols) + 1

54

In [4]:
tdf = df[df[target_col].isna()].copy()
df = df[~df[target_col].isna()].copy()
df, vdf = train_test_split(df, test_size=.2, random_state=SEED)


In [5]:


class CatBoost(object):
    def __init__(self):
        self.model = CatBoostRegressor(random_state=SEED, task_type="GPU", devices='0:1', 
                           eval_metric='MSLE', thread_count=8, 
                           cat_features=cat_cols,
                           od_type='Iter', od_wait=10, learning_rate=.2, max_depth=8, iterations=100)
    
    def fit(self, dframe):
        self.model.fit(X=dframe[cat_cols + reg_cols], y=dframe[target_col], silent=True)
        return self.model
    



In [6]:

def rmsle(true, preds):
    return np.sqrt(mean_squared_log_error(true, preds))

scorer = make_scorer(rmsle, greater_is_better=False)

def evaluate_model(m, value=None):
    if not value:
        preds = m.predict(vdf[cat_cols + reg_cols])
    else:
        preds = m.predict(vdf[cat_cols + reg_cols].values)
    true = vdf[target_col]
    return rmsle(true, preds)


In [16]:
class RandomForest(object):
    def __init__(self):
        self.model = RandomForestRegressor(random_state=SEED)
    
    def fit(self, dframe):
        parameters = {
            'oob_score': [False],
            'n_estimators': [800, 900 ,1000]
        }
        clf = GridSearchCV(self.model, parameters, cv=3, refit=True, n_jobs=-1, verbose=3, scoring=scorer)
        clf.fit(dframe[cat_cols + reg_cols].values, dframe[target_col].values)
        print(self.__class__, clf.best_params_)
        return clf

class XGBoost(object):
    def __init__(self):
        self.model = xgb.XGBRegressor(gamma=0, subsample=0.8, colsample_bytree=0.8)
        
    def fit(self, dframe, parameters=None):
        parameters = parameters if parameters else {
            'max_depth':[9],
            'min_child_weight':[6],
            'learning_rate': [.1],
            'n_estimators': [700, 800, 900, 1000]
        } 
        clf = GridSearchCV(self.model, parameters, cv=3, refit=True, n_jobs=-1, verbose=3, scoring=scorer)
        clf.fit(dframe[cat_cols + reg_cols].values, dframe[target_col].values)
        print(self.__class__, clf.best_params_)
        
        return clf

In [8]:
model = XGBoost().fit(df)
print(model.__class__, evaluate_model(model, value=True))
model = CatBoost().fit(df)
print(model.__class__, evaluate_model(model, value=False))
# model = RandomForest().fit(df)

# print(model.__class__, evaluate_model(model, value=True))


Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   9 | elapsed:  1.1min remaining:  3.7min
[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed:  1.4min remaining:   42.2s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:  1.8min finished


<class '__main__.XGBoost'> {'learning_rate': 0.1, 'max_depth': 9, 'min_child_weight': 6, 'n_estimators': 700}
<class 'sklearn.model_selection._search.GridSearchCV'> 0.2364157534206991


Metric MSLE is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


<class 'catboost.core.CatBoostRegressor'> 0.277973535437134


In [9]:
result = pd.read_csv(submission_file)[['Uniq Id']]
result

Unnamed: 0,Uniq Id
0,12c47aefbcca65d6156ba0bc7946704b
1,156febf7d27315ae9f210ed13bca3ac2
2,a7f43314e3ee6464a4e35b874bddf2ea
3,d9cc4b407f3d5d290560c03d9ac71035
4,b3bbc8aa6429feb260e9ac0a91f252c4
...,...
8995,75171484ea20440a5c08fe040a9a01a7
8996,16f57cebfb653f2c532314322598ca3a
8997,477f8b3f07e1a6c027152e2cd5d67e4a
8998,c9ce916ec5e1b93134a2f84090284bf9


In [10]:
model = XGBoost().fit(pd.concat([df, vdf]), 
                      {'learning_rate': [0.1], 'max_depth': [9], 'min_child_weight': [6], 'n_estimators': [500]})


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:   45.4s finished


<class '__main__.XGBoost'> {'learning_rate': 0.1, 'max_depth': 9, 'min_child_weight': 6, 'n_estimators': 500}


In [11]:
result[target_col] = result['Uniq Id'].progress_apply(lambda x: model.predict(tdf[tdf['Uniq Id'] == x][cat_cols + reg_cols].values)[0])
result.head()


  0%|          | 0/9000 [00:00<?, ?it/s]

Unnamed: 0,Uniq Id,Per Person Price
0,12c47aefbcca65d6156ba0bc7946704b,18677.355469
1,156febf7d27315ae9f210ed13bca3ac2,11811.688477
2,a7f43314e3ee6464a4e35b874bddf2ea,4947.875488
3,d9cc4b407f3d5d290560c03d9ac71035,6752.257812
4,b3bbc8aa6429feb260e9ac0a91f252c4,20533.675781


In [12]:
result.drop(columns=["Uniq Id"]).to_csv("submission.csv", index=False)

In [13]:
from IPython.display import FileLink

FileLink('submission.csv')

In [14]:
df[~df[target_col].isna()][target_col].describe()

count     16800.000000
mean      20059.856905
std       11608.712685
min         791.000000
25%       12482.500000
50%       17762.250000
75%       25127.375000
max      171062.500000
Name: Per Person Price, dtype: float64

In [15]:
result[target_col].describe()

count      9000.000000
mean      19936.074562
std        9794.218098
min        1189.534912
25%       13373.987793
50%       18232.416016
75%       24996.697754
max      133754.109375
Name: Per Person Price, dtype: float64