In [None]:

import numpy as np
import pandas as pd
import os
%matplotlib inline
import matplotlib.pyplot as plt 
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from tqdm.notebook import tqdm
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score, RepeatedStratifiedKFold
from sklearn import metrics
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer
# Show full cell contents when displaying frames. `None` is the supported
# "no limit" value; the old `-1` sentinel is deprecated in pandas 1.x and
# rejected by newer releases.
pd.set_option('display.max_colwidth', None)
# Register tqdm's pandas integration (adds `progress_apply` to pandas objects).
tqdm.pandas()

# Seed shared by the train/validation split, the CV folds and the seeded models.
SEED=32
# CSV that provides the 'Uniq Id' column for the submission.
submission_file = "../Test.csv"
# CSV with the engineered features (rows with a missing target are the test rows).
file = "../features.csv"


In [None]:
# Load the feature table from the path configured in `file`.
df = pd.read_csv(file)
# Disabled: using "Uniq Id" as the index (later cells read it as a column).
# df.set_index("Uniq Id", drop=True, inplace=True)
# Preview the first rows.
df.head()

In [None]:
# Columns handed to CatBoost as `cat_features`; every model uses
# `cat_cols + reg_cols` (in this order) as its feature matrix.
cat_cols = [
    'Package Type',
    'Start City',
    'Cancellation Rules',
    'airline_0',
    'airline_1',
    'airline_2',
    'airline_3',
    'airline_4',
    'PT_AC',
    'PT_DC',
    'PT_SC',
]
# Currently disabled categorical candidates:
#   'MN_WD', 'MN_YR', 'YR_WD',
#   'PT_NC', 'PT_IC', 'PT_FS', 'PT_CR',
#   'SC_AC', 'SC_DC', 'SC_NC', 'SC_IC', 'SC_FS', 'SC_CR',
#   'CR_AC', 'CR_DC', 'CR_SC', 'CR_NC', 'CR_IC', 'CR_FS'

# Numeric columns (these are the ones passed through MinMaxScaler).
reg_cols = [
    'Flight Stops',
    'Meals',
    'destination_count',
    'airlines_count',
    'sightseeing_count',
    'hotel_details_count',
    'place_covered_count',
    'itinerary_count',
    'package_name_length',
    # Disabled: 'package_name_0' ... 'package_name_9'
    'destination_0',
    'destination_1',
    'destination_2',
    'destination_3',
    'places_covered_0',
    'places_covered_1',
    'places_covered_2',
    'places_covered_3',
    # NOTE(review): 'sight_seeing_places_covered_2' is not listed -- confirm
    # whether that is intentional.
    'sight_seeing_places_covered_0',
    'sight_seeing_places_covered_1',
    'sight_seeing_places_covered_3',
    'sight_seeing_places_covered_4',
    'itinerary_0',
    'itinerary_1',
    'itinerary_2',
    'itinerary_3',
    # Disabled: 'hotel_details_0' ... 'hotel_details_7'
    'Itinerary_night_count',
    'start_city_mean',
    'start_city_sum',
    'package_type_mean',
    'package_type_sum',
    'month_type_mean',
    'monthtype_sum',
    # Disabled: 'day_of_week_type_mean', 'day_of_week_type_sum'
    'year_type_sum',
    'year_type_mean',
]
# Disabled: 'package_type_price_100', 'package_type_price_1000',
#           'start_city_price_100', 'start_city_price_1000'

# Regression target.
target_col = 'Per Person Price'

# Number of columns in use (features plus the target).
len(cat_cols) + len(reg_cols) + 1

In [None]:
# Rescale the numeric feature columns with a MinMaxScaler fitted on every
# row currently in `df`, overwriting the columns in place.
feature_scaler = MinMaxScaler()
scaled_values = feature_scaler.fit_transform(df[reg_cols])
df[reg_cols] = scaled_values

In [None]:
# Rows without a target become the prediction set (`tdf`); the remaining rows
# are split 80/20 into training (`df`) and validation (`vdf`) frames.
target_missing = df[target_col].isna()
tdf = df[target_missing].copy()
labelled_rows = df[~target_missing].copy()
df, vdf = train_test_split(labelled_rows, test_size=.2, random_state=SEED)


In [None]:


class CatBoost:
    """Thin wrapper that tunes a GPU ``CatBoostRegressor`` via its own grid search."""

    def __init__(self):
        # Estimator settings are collected first so they read as a table.
        settings = dict(
            random_state=SEED,
            task_type="GPU",
            devices='0:1',
            eval_metric='MSLE',
            thread_count=8,
            cat_features=cat_cols,
            od_type='Iter',
            od_wait=10,
        )
        self.model = CatBoostRegressor(**settings)

    def fit(self, dframe):
        """Grid-search the wrapped model on ``dframe`` and return it.

        ``dframe`` must contain the ``cat_cols``, ``reg_cols`` and
        ``target_col`` columns. The chosen parameters are printed, and the
        search is run with ``refit=True``.
        """
        search_space = {
            'max_depth': [6, 8, 10, 12],
            'iterations': [100, 200],
            'learning_rate': [.1, .2, .3]
        }
        features = dframe[cat_cols + reg_cols]
        target = dframe[target_col]
        outcome = self.model.grid_search(
            search_space,
            X=features,
            y=target,
            shuffle=True,
            stratified=False,
            verbose=False,
            plot=False,
            refit=True,
        )
        print(outcome['params'])
        return self.model
    



In [None]:

def rmsle(true, preds):
    """Return the root of sklearn's mean squared log error between ``true`` and ``preds``."""
    msle = mean_squared_log_error(true, preds)
    return np.sqrt(msle)

# RMSLE is a loss, so it is registered with greater_is_better=False for use
# as the `scoring` argument of GridSearchCV.
scorer = make_scorer(rmsle, greater_is_better=False)
# Shuffled, seeded 3-fold splitter (used as `cv` by the XGBoost search).
fold = KFold(n_splits=3, shuffle=True, random_state=SEED)

def evaluate_model(m, value=None):
    """Score model ``m`` on the validation frame ``vdf`` with RMSLE.

    When ``value`` is truthy the features are passed to ``m.predict`` as a
    NumPy array (``.values``); otherwise the DataFrame itself is passed.
    """
    validation_features = vdf[cat_cols + reg_cols]
    model_input = validation_features.values if value else validation_features
    preds = m.predict(model_input)
    return rmsle(vdf[target_col], preds)


In [None]:
class RandomForest:
    """Wrapper that tunes a seeded ``RandomForestRegressor`` with ``GridSearchCV``."""

    def __init__(self):
        self.model = RandomForestRegressor(random_state=SEED)

    def fit(self, dframe):
        """Run a 3-fold grid search on ``dframe`` and return the fitted search object.

        Features are ``cat_cols + reg_cols`` and the target is ``target_col``,
        both passed as NumPy arrays. The best parameters are printed.
        """
        search_space = {
            'oob_score': [False],
            'n_estimators': [800, 900 ,1000]
        }
        X = dframe[cat_cols + reg_cols].values
        y = dframe[target_col].values
        search = GridSearchCV(
            self.model,
            search_space,
            cv=3,
            refit=True,
            n_jobs=-1,
            verbose=3,
            scoring=scorer,
        )
        search.fit(X, y)
        print(self.__class__, search.best_params_)
        return search

class XGBoost:
    """Wrapper that tunes an ``xgb.XGBRegressor`` with ``GridSearchCV``."""

    def __init__(self):
        self.model = xgb.XGBRegressor()

    def fit(self, dframe, parameters=None):
        """Grid-search the regressor on ``dframe``.

        ``parameters`` is a GridSearchCV parameter grid; when it is missing or
        empty the default grid below is used. Returns a tuple of the fitted
        search object and its ``best_params_``, which are also printed.
        """
        if not parameters:
            parameters = {
                'max_depth':[12, 10, 8, 6, 14],
                'min_child_weight': [4 ,3 , 5],
                'learning_rate': [.01],
                'n_estimators': [500, 1000, 1500, 2000],
                'gamma': [0.],
                'subsample': [.8],
                'colsample_bytree':[0.8]
            }
        X = dframe[cat_cols + reg_cols].values
        y = dframe[target_col].values
        search = GridSearchCV(
            self.model,
            parameters,
            cv=fold,
            refit=True,
            n_jobs=-1,
            verbose=10,
            scoring=scorer,
        )
        search.fit(X, y)
        print(self.__class__, search.best_params_)
        return search, search.best_params_

In [None]:
# Disabled alternative experiments:
# model = CatBoost().fit(df)
# print(model.__class__, evaluate_model(model, value=False))
# model = RandomForest().fit(df)
# print(model.__class__, evaluate_model(model, value=True))

# Tune XGBoost on the training frame, then report RMSLE on the validation frame.
xgb_search = XGBoost().fit(df)
model, grid = xgb_search
validation_rmsle = evaluate_model(model, value=True)
print(validation_rmsle)

In [None]:
# Display the best parameter combination returned by XGBoost().fit.
grid

In [None]:
# Rank the grid-search candidates and index each row by its parameter values
# joined with underscores.
ranked_results = pd.DataFrame(model.cv_results_).sort_values(by=['rank_test_score'])
param_labels = ranked_results["params"].apply(
    lambda params: "_".join(map(str, params.values()))
)
results_df = ranked_results.set_index(param_labels).rename_axis('kernel')

summary_columns = ['params', 'rank_test_score', 'mean_test_score', 'std_test_score']
results_df[summary_columns]

In [None]:
# Start the submission frame from the 'Uniq Id' column of the submission file.
result = (
    pd.read_csv(submission_file)
    .loc[:, ['Uniq Id']]
)
result

In [None]:
# Refit on training + validation rows, pinning every hyper-parameter to the
# value chosen by the earlier search (a single-candidate grid).
all_labelled = pd.concat([df, vdf])
pinned_grid = {name: [value] for name, value in grid.items()}
model, grid = XGBoost().fit(all_labelled, pinned_grid)
# Earlier manual alternative:
#                       {'learning_rate': [0.01], 'max_depth': [9], 'min_child_weight': [6], 'n_estimators': [1600]})
# print(evaluate_model(model, value=True))

In [None]:
# Predict all unlabelled rows in one batched call instead of filtering `tdf`
# and calling `model.predict` once per 'Uniq Id' (which was quadratic in the
# number of rows).
test_predictions = pd.Series(
    model.predict(tdf[cat_cols + reg_cols].values),
    index=tdf['Uniq Id'].values,
)
# Keep the first prediction per id, matching the original "[0]" behaviour
# when an id occurs more than once in `tdf`.
test_predictions = test_predictions[~test_predictions.index.duplicated(keep='first')]

# Align predictions to the submission order by id.
result[target_col] = result['Uniq Id'].map(test_predictions)

# An id without a matching row used to fail during prediction; fail loudly
# here too rather than writing NaN into the submission.
unmatched = result[target_col].isna()
if unmatched.any():
    raise KeyError(
        f"{int(unmatched.sum())} 'Uniq Id' value(s) from the submission file have no row in tdf"
    )

result.head()


In [None]:
# Write the submission without the id column and without the index.
submission = result.drop(columns=["Uniq Id"])
submission.to_csv("submission.csv", index=False)

In [None]:
from IPython.display import FileLink

# Display a link to the generated submission file in the notebook output.
FileLink('submission.csv')

In [None]:
# Summary statistics of the non-missing target values in the training frame.
df[target_col].dropna().describe()

In [None]:
# Summary statistics of the predicted target values in the submission frame.
result[target_col].describe()

In [None]:
# Feature importances of the best estimator found by the grid search,
# paired with the feature names in the order used for training.
best_estimator = model.best_estimator_
importance = best_estimator.feature_importances_
features = df[cat_cols + reg_cols].columns.values

for feature_name, score in zip(features, importance):
    print('Feature: %s, Score: %.5f' % (feature_name, score))

# Bar chart of the same scores, with the feature names rotated for legibility.
plt.figure(figsize=(15,8))
plt.xticks(rotation=90)
bar_labels = list(features[:len(importance)])
sns.barplot(x=bar_labels, y=importance)