# Import 

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import catboost as cb
from catboost import CatBoostRegressor
from scipy.stats import randint
from sklearn.utils.fixes import loguniform

# Load data

In [2]:
train = pd.read_parquet('data/final_train.parquet')
test = pd.read_parquet('data/final_test.parquet')

# Encoder

- 삭제 : id, base_date,day_of_week,base_hour , end_latitude, end_longitude, start_latitude, start_longitude, start_node_name, end_node_name, road_name
  

- One-Hot Encoder  : rain_snow, start_turn_restricted, end_turn_restricted, day_of_week ,week,time 

In [3]:
del_col = ['id','base_date','day_of_week','end_latitude', 'end_longitude', 'start_latitude', 'start_longitude', 'start_node_name', 'end_node_name', 'road_name','day','month']
cat_col= ['rain_snow', 'start_turn_restricted', 'end_turn_restricted','vacation']

In [4]:
train_drop = train.drop(columns = del_col)
test_drop = test.drop(columns = del_col)

# Model

## Catboost

In [5]:
def cat_dtyes (df):
    cat_col = df.select_dtypes(include="object").columns
    df[cat_col] = df[cat_col].astype("category")

    return df

In [6]:
train_drop = cat_dtyes(train_drop)
test_drop = cat_dtyes(test_drop)


In [7]:
y_train = train_drop['target']
X_train = train_drop.drop(columns=['target'])

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train,
                                                    test_size=0.2, random_state=42)

In [9]:
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)

(3760973, 18) (3760973,)
(940244, 18) (940244,)


# Catboost Hyperparameter tuning

In [10]:
model_cat2=cb.CatBoostRegressor(loss_function='MAE', verbose=0,cat_features=cat_col)

In [11]:
distributions2 = {
    'depth': [10,20,30,50],
    'learning_rate':[0.01, 0.05, 0.1],
    'n_estimators':[1000,2000, 2500],    
    'random_seed':[42, 43, 44, 45, 100],
    'l2_leaf_reg':[0.1, 0.3, 0.4, 0.5, 1]
    
}


In [12]:
randomized_search_result2 = model_cat2.randomized_search(
    distributions2,
    X = X_train, y = y_train, 
    cv=5, n_iter=20,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


bestTest = 4.738556578
bestIteration = 1999

0:	loss: 4.7385566	best: 4.7385566 (0)	total: 8m 9s	remaining: 2h 34m 54s

bestTest = 4.380589619
bestIteration = 1999

1:	loss: 4.3805896	best: 4.3805896 (1)	total: 16m 37s	remaining: 2h 29m 33s

bestTest = 4.400444551
bestIteration = 1999

2:	loss: 4.4004446	best: 4.3805896 (1)	total: 25m 4s	remaining: 2h 22m 4s

bestTest = 5.15636085
bestIteration = 999

3:	loss: 5.1563609	best: 4.3805896 (1)	total: 29m 15s	remaining: 1h 57m 1s
Estimating final quality...
Training on fold [0/5]

bestTest = 4.373555603
bestIteration = 1997

Training on fold [1/5]

bestTest = 4.370971993
bestIteration = 1999

Training on fold [2/5]

bestTest = 4.377848842
bestIteration = 1999

Training on fold [3/5]

bestTest = 4.375581473
bestIteration = 1999

Training on fold [4/5]

bestTest = 4.382355158
bestIteration = 1999



In [13]:
randomized_search_result2

{'params': {'depth': 10,
  'random_seed': 44,
  'iterations': 2000,
  'learning_rate': 0.1,
  'l2_leaf_reg': 0.5},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
            

In [14]:
model_cat=cb.CatBoostRegressor(
    loss_function='MAE', verbose=0,cat_features=cat_col,
    depth=10, random_seed=44, iterations=2000, 
    learning_rate=0.1, l2_leaf_reg=0.5)

In [15]:
model_cat.fit(X_train,y_train)

<catboost.core.CatBoostRegressor at 0x2cd6e20a0>

In [16]:
pred_cat2= model_cat.predict(X_val)

In [17]:
mean_absolute_error(y_val, pred_cat2)

4.3710915792550304

In [18]:
pred = model_cat.predict(test_drop)

In [19]:
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['target'] = pred
sample_submission.to_csv("submit_cat2.csv", index = False)