# Import 

In [70]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import catboost as cb
from catboost import CatBoostRegressor
from scipy.stats import randint
from sklearn.utils.fixes import loguniform

# Load data

In [71]:
train = pd.read_parquet('data/final_train.parquet')
test = pd.read_parquet('data/final_test.parquet')

# Encoder

- 삭제 : id, base_date,day_of_week,base_hour , end_latitude, end_longitude, start_latitude, start_longitude, start_node_name, end_node_name, road_name
  

- One-Hot Encoder  : rain_snow, start_turn_restricted, end_turn_restricted, day_of_week ,week,time 

In [72]:
del_col = ['id','base_date','day_of_week','end_latitude', 'end_longitude', 'start_latitude', 'start_longitude', 'start_node_name', 'end_node_name', 'road_name','day','month']
cat_col= ['rain_snow', 'start_turn_restricted', 'end_turn_restricted','vacation']

In [73]:
train_drop = train.drop(columns = del_col)
test_drop = test.drop(columns = del_col)

# Model

## Catboost

In [75]:
def cat_dtyes (df):
    cat_col = df.select_dtypes(include="object").columns
    df[cat_col] = df[cat_col].astype("category")

    return df

In [76]:
train_drop = cat_dtyes(train_drop)
test_drop = cat_dtyes(test_drop)


In [77]:
y_train = train_drop['target']
X_train = train_drop.drop(columns=['target'])

In [78]:
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train,
                                                    test_size=0.2, random_state=42)

In [79]:
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)

(3760973, 18) (3760973,)
(940244, 18) (940244,)


In [84]:
model_cat=cb.CatBoostRegressor(random_state=42,verbose=0,cat_features=cat_col)

In [85]:
param_grid = {
    'n_estimators': randint(100, 300),
    'depth': randint(1, 5),
    'learning_rate': loguniform(1e-3, 0.1),
    'min_child_samples': randint(10, 40),
    'grow_policy': ['SymmetricTree', 'Lossguide', 'Depthwise']
}


In [86]:
model_cat.randomized_search(param_grid,
                            X_train, y_train,
                            cv=3, n_iter=10)



bestTest = 9.850225541
bestIteration = 181

0:	loss: 9.8502255	best: 9.8502255 (0)	total: 12.2s	remaining: 1m 50s

bestTest = 33.64024624
bestIteration = 148

1:	loss: 33.6402462	best: 9.8502255 (0)	total: 22.1s	remaining: 1m 28s

bestTest = 16.576631
bestIteration = 243

2:	loss: 16.5766310	best: 9.8502255 (0)	total: 40.1s	remaining: 1m 33s

bestTest = 9.325589199
bestIteration = 175

3:	loss: 9.3255892	best: 9.3255892 (3)	total: 52.6s	remaining: 1m 18s

bestTest = 24.85612354
bestIteration = 212

4:	loss: 24.8561235	best: 9.3255892 (3)	total: 1m 10s	remaining: 1m 10s

bestTest = 27.73532505
bestIteration = 125

5:	loss: 27.7353251	best: 9.3255892 (3)	total: 1m 19s	remaining: 52.9s

bestTest = 31.70533372
bestIteration = 134

6:	loss: 31.7053337	best: 9.3255892 (3)	total: 1m 40s	remaining: 43.2s

bestTest = 18.37177531
bestIteration = 250

7:	loss: 18.3717753	best: 9.3255892 (3)	total: 1m 58s	remaining: 29.6s

bestTest = 31.23928944
bestIteration = 288

8:	loss: 31.2392894	best: 9.32

{'params': {'min_data_in_leaf': 26.0,
  'depth': 3.0,
  'learning_rate': 0.05586121261057126,
  'iterations': 176.0,
  'grow_policy': 'SymmetricTree'},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
             

In [87]:
model_cat.fit(X_train,y_train)

<catboost.core.CatBoostRegressor at 0x2c484ac40>

In [88]:
pred_cat= model_cat.predict(X_val)

In [89]:
mean_absolute_error(y_val, pred_cat)

7.336146338080428

In [95]:
model_cat1=cb.CatBoostRegressor(random_state=42,verbose=0,cat_features=cat_col)

In [96]:
param_grid1 = {
    'n_estimators': randint(100, 300),
    'depth': randint(1, 5),
    'learning_rate': loguniform(1e-3, 0.1),
    'min_child_samples': randint(10, 40), 
    'grow_policy': ['SymmetricTree', 'Lossguide', 'Depthwise']
}


In [97]:
model_cat1.randomized_search(param_grid1,
                            X_train, y_train,
                            cv=5, n_iter=30)


bestTest = 15.21632876
bestIteration = 159

0:	loss: 15.2163288	best: 15.2163288 (0)	total: 9.38s	remaining: 4m 32s

bestTest = 12.66454592
bestIteration = 171

1:	loss: 12.6645459	best: 12.6645459 (1)	total: 20.1s	remaining: 4m 41s

bestTest = 9.870454397
bestIteration = 176

2:	loss: 9.8704544	best: 9.8704544 (2)	total: 32.7s	remaining: 4m 54s

bestTest = 11.58953759
bestIteration = 138

3:	loss: 11.5895376	best: 9.8704544 (2)	total: 39.9s	remaining: 4m 19s

bestTest = 8.864510734
bestIteration = 198

4:	loss: 8.8645107	best: 8.8645107 (4)	total: 56s	remaining: 4m 39s

bestTest = 37.35941305
bestIteration = 125

5:	loss: 37.3594130	best: 8.8645107 (4)	total: 1m 2s	remaining: 4m 9s

bestTest = 29.13082038
bestIteration = 182

6:	loss: 29.1308204	best: 8.8645107 (4)	total: 1m 11s	remaining: 3m 55s

bestTest = 7.918166219
bestIteration = 268

7:	loss: 7.9181662	best: 7.9181662 (7)	total: 1m 33s	remaining: 4m 17s

bestTest = 23.59161032
bestIteration = 198

8:	loss: 23.5916103	best: 7.9

{'params': {'min_data_in_leaf': 18.0,
  'depth': 4.0,
  'learning_rate': 0.08785322408629322,
  'iterations': 269.0,
  'grow_policy': 'SymmetricTree'},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
             

In [98]:
model_cat1.fit(X_train,y_train)

<catboost.core.CatBoostRegressor at 0x2c675b160>

In [99]:
pred_cat1= model_cat1.predict(X_val)

In [100]:
mean_absolute_error(y_val, pred_cat1)

6.094100194566983