# 물류 유통량 예측 경진대회 🚚

In [None]:
# Mount Google Drive in the Colab runtime so the CSV files under
# /content/drive can be read and the submission written back.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## IMPORT

In [None]:
# Install the two third-party packages imported below (CatBoost model, Optuna tuner).
!pip install catboost
!pip install optuna

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import optuna
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

plt.rcParams['font.family'] = 'NanumBarunGothic'
%config inlinebackend.figure_format = 'retina'
plt.rcParams['figure.dpi'] = 100

pd.set_option('display.max_row', 526)
pd.set_option('display.max_columns', 100)

## DATA

In [None]:
# Base directory on the mounted Drive; it is also reused when the submission is saved.
path = '/content/drive/MyDrive/delivery/'
# The train/test files are decoded as cp949 (Korean Windows code page).
train = pd.read_csv(path + 'train_new.csv', encoding='cp949')
test = pd.read_csv(path + 'test_new.csv', encoding='cp949')
# Submission template, read with the default encoding.
submission = pd.read_csv(path + 'sample_submission.csv')

In [None]:
# Stack train on top of test so the encoding steps below are applied to both at once.
# Row order (train first, then test) matters: the frames are split back by position later.
data = pd.concat([train, test])

In [None]:
# Replace each ID column by its integer category code, then store the code
# as a string so the column is treated as categorical (not numeric) downstream.
for id_col in ('SEND_SPG_INNB', 'REC_SPG_INNB'):
    codes = data[id_col].astype('category').cat.codes
    data[id_col] = codes.astype('str')

In [None]:
# One-hot encode the string-valued columns of the combined frame
# (this includes the two ID columns that were cast to str).
data = pd.get_dummies(data)

In [None]:
# Split the combined frame back by position (train rows come first).
# Slicing at len(train) instead of -len(test) stays correct when `test` is
# empty (`iloc[:-0]` would yield an empty train). The copies make both frames
# independent of `data`, so later column drops do not act on a slice.
train, test = data.iloc[:len(train)].copy(), data.iloc[len(train):].copy()

In [None]:
# The target column only exists for test rows as a by-product of the concat; remove it.
# Rebinding (instead of inplace=True) avoids mutating a slice of `data`.
test = test.drop(columns=['INVC_CONT'])

In [None]:
# Remove the `index` column from both frames so it is not used as a feature.
# Rebinding (instead of inplace=True) avoids mutating a slice of `data`.
train = train.drop(columns=['index'])
test = test.drop(columns=['index'])

In [None]:
# Target as a single-column DataFrame; every other column is a feature.
y = train.loc[:, ['INVC_CONT']]
X = train.drop('INVC_CONT', axis=1)

## OPTUNA

In [None]:
def objective_cat(trial):
    """
    Objective function to tune a `CatBoostRegressor` model.

    Samples one hyper-parameter configuration from `trial`, fits the model on
    an 80 % split of the module-level `X` / `y`, and scores it on the
    remaining 20 %.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        RMSE on the hold-out split (lower is better).
    """

    # Fixed seed: every trial is scored on the same hold-out rows.
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # `suggest_float` replaces the deprecated `suggest_uniform` (same uniform sampling).
    params = {
        'iterations': trial.suggest_int('iterations', 4000, 25000),
        # NOTE(review): `early_stopping_rounds` passed to fit() below appears to
        # take precedence over `od_wait` — confirm whether tuning it has any effect.
        'od_wait': trial.suggest_int('od_wait', 500, 2300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 100),
        # A sampling rate of 0 is not a valid value, so keep the lower bound above it.
        'subsample': trial.suggest_float('subsample', 0.05, 1),
        'random_strength': trial.suggest_float('random_strength', 10, 50),
        'depth': trial.suggest_int('depth', 1, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 30),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 15),
        'verbose': False,
    }

    model = CatBoostRegressor(
        loss_function="RMSE",
        random_state=42,
        **params,
    )
    # Early stopping needs a validation set to monitor; without `eval_set` the
    # model would always train for the full `iterations`. This matches how the
    # K-fold training cell fits its models.
    model.fit(
        x_train,
        y_train,
        eval_set=[(x_test, y_test)],
        early_stopping_rounds=100,
        verbose=False,
    )

    pred = model.predict(x_test)

    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn no longer accepts.
    rmse = float(np.sqrt(mean_squared_error(y_test, pred)))

    return rmse

In [None]:
# Search for the configuration with the lowest value returned by `objective_cat`
# (its hold-out RMSE), using 50 trials.
study_catboost = optuna.create_study(direction="minimize")
study_catboost.optimize(objective_cat, n_trials=50)

In [None]:
# Best trial's hyper-parameters plus the fixed (non-tuned) settings used during tuning.
params = {
    **study_catboost.best_params,
    'loss_function': 'RMSE',
    'random_state': 42,
}

In [None]:
# 5-fold training: fit one CatBoost model per fold with the tuned parameters
# and average the five models' predictions on the test set.

# Select features by name instead of `train.columns[1:]`, which silently
# assumes the target is the first column (otherwise the target would leak into
# the features and a real feature would be dropped).
feature_cols = [col for col in train.columns if col != 'INVC_CONT']
X_all = train[feature_cols]
y_all = train['INVC_CONT']
X_test = test[feature_cols]

preds = np.zeros(test.shape[0])
kf = KFold(n_splits=5, random_state=42, shuffle=True)
rmse = []  # validation RMSE of each fold
for n, (trn_idx, test_idx) in enumerate(kf.split(X_all, y_all), start=1):
    X_tr, X_val = X_all.iloc[trn_idx], X_all.iloc[test_idx]
    y_tr, y_val = y_all.iloc[trn_idx], y_all.iloc[test_idx]
    model = CatBoostRegressor(**params)
    # The fold's validation part is monitored for early stopping.
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], early_stopping_rounds=100, verbose=False)
    # Each fold contributes an equal share to the averaged test prediction.
    preds += model.predict(X_test) / kf.n_splits
    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn no longer accepts.
    rmse.append(float(np.sqrt(mean_squared_error(y_val, model.predict(X_val)))))
    print(n, rmse[-1])

In [None]:
# Write the fold-averaged test predictions into the submission template's target column.
submission['INVC_CONT'] = preds

In [None]:
# Save next to the input data; index=False keeps the DataFrame index out of the file.
submission.to_csv(path + 'catboost_kfold5_remove_outlier.csv', index=False)