In [None]:
# ========================================
# Library
# ========================================
import itertools
import math
import os
import pickle
import random
import warnings
from glob import glob
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
import japanize_matplotlib
# import jpholiday
import lightgbm as lgb
from scipy.optimize import minimize
from sklearn.metrics import f1_score, roc_auc_score, classification_report
from sklearn.model_selection import (
    TimeSeriesSplit,
    StratifiedKFold,
    KFold,
    GroupKFold,
    StratifiedGroupKFold,
)
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

In [None]:
# Input directory for the CSV files and an output location.
# Both keep their trailing slash because file names are appended to them
# by plain string concatenation.
i_path, o_path = '../../train/', '../model/'

# Name of the label column that the classifier predicts.
TARGET = 'is_congestion'

In [None]:
# Read the four input tables, in this order, from the input directory.
train_df, road_df, search_spec_df, search_unspec_df = (
    pd.read_csv(i_path + csv_name)
    for csv_name in (
        'train.csv',
        'road_local.csv',
        'search_specified.csv',
        'search_unspecified.csv',
    )
)

In [None]:
def expand_datetime(df):
    """Add calendar-part columns derived from the frame's timestamp columns.

    A `datetime` column yields `year`, `month`, `day` and `hour`; a `date`
    column yields `year`, `month` and `day`. When both are present, the
    values taken from `date` overwrite `year`/`month`/`day`.

    The frame is modified in place and the same object is returned.
    Both source columns must already have a datetime dtype (the `.dt`
    accessor is used).
    """
    # (source column, parts to extract), applied in this order.
    extraction_plan = (
        ('datetime', ('year', 'month', 'day', 'hour')),
        ('date', ('year', 'month', 'day')),
    )
    for source, parts in extraction_plan:
        if source not in df.columns:
            continue
        for part in parts:
            df[part] = getattr(df[source].dt, part)
    return df

In [None]:
def extract_dataset(train_df, search_spec_df, search_unspec_df, road_df=None):
    """Join the traffic records with the two search tables and the road table.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Records with at least `datetime`, `start_code` and `end_code`.
    search_spec_df : pandas.DataFrame
        Table joined on `datetime`, `start_code`, `end_code`.
    search_unspec_df : pandas.DataFrame
        Table with a `date` column, joined on the calendar day
        (`year`, `month`, `day`) plus `start_code`, `end_code`.
    road_df : pandas.DataFrame, optional
        Road table with `start_code`, `end_code`, `start_name` and
        `end_name`. When omitted, the notebook-level `road_df` is used,
        which is what this function always did before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        `train_df` left-joined with the three tables, plus a `dayofweek`
        column (Monday = 0) derived from `datetime`.

    Notes
    -----
    The three positional frames are modified in place: their `datetime` /
    `date` columns are converted to datetime dtype, and `train_df` and
    `search_unspec_df` additionally go through `expand_datetime`.
    All joins are left joins, so columns coming from the joined tables
    (including `date`) are missing for rows without a match.
    """
    if road_df is None:
        # Backward compatibility: earlier callers relied on a module-level
        # `road_df` instead of passing the road table explicitly.
        road_df = globals()['road_df']

    train_df['datetime'] = pd.to_datetime(train_df['datetime'])
    search_spec_df['datetime'] = pd.to_datetime(search_spec_df['datetime'])
    search_unspec_df['date'] = pd.to_datetime(search_unspec_df['date'])

    train_df = expand_datetime(train_df)
    search_unspec_df = expand_datetime(search_unspec_df)

    train_df = train_df.merge(search_spec_df, on=['datetime', 'start_code', 'end_code'], how='left')
    train_df = train_df.merge(search_unspec_df, on=['year', 'month', 'day', 'start_code', 'end_code'], how='left')
    # The name columns are dropped before joining; only the codes are used as keys.
    train_df = train_df.merge(road_df.drop(['start_name', 'end_name'], axis=1), on=['start_code', 'end_code'], how='left')

    train_df['dayofweek'] = train_df['datetime'].dt.weekday

    return train_df

In [None]:
# Build the modelling table from the raw frames loaded above.
train = extract_dataset(
    train_df=train_df,
    search_spec_df=search_spec_df,
    search_unspec_df=search_unspec_df,
)

In [None]:
# Preview the first five rows (five is the default for `head`).
train.head()

In [None]:
# Section identifier of the form "<start_code>_<KP>_<end_code>".
train['section'] = train['start_code'].astype(str).str.cat(
    [train['KP'].astype(str), train['end_code'].astype(str)],
    sep='_',
)

In [None]:
# Columns the author groups as categorical.
cat_cols = ['road_code', 'start_code', 'end_code', 'section', 'direction', 'hour', 'dayofweek']
# Columns the author groups as numeric.
num_cols = ['year', 'month', 'day', 'hour', 'search_specified', 'search_unspecified', 'KP', 'start_KP', 'end_KP', 'limit_speed', 'OCC']
# `hour` appears in both groups. Concatenating the lists as-is would select
# the same column twice, giving the model a redundant copy and two `hour`
# rows in the importance plot. Keep only the first occurrence of each name.
# NOTE(review): code that rebuilds the feature matrix for inference must use
# this same de-duplicated list so the feature count matches the saved models.
feature_cols = list(dict.fromkeys(cat_cols + num_cols))

In [None]:
# Preview the first five rows of the selected feature columns.
train[feature_cols].head()

In [None]:
def _take_rows(data, idx):
    """Select rows by integer position from a NumPy array or a pandas object."""
    return data.iloc[idx] if hasattr(data, 'iloc') else data[idx]


def train_lgbm(X, y, cv,
               model_path = None,
               params: dict=None,
               verbose: int=100
               ):
    """Train one LightGBM classifier per CV fold and collect out-of-fold predictions.

    Parameters
    ----------
    X : numpy.ndarray or pandas.DataFrame
        Feature matrix; rows are selected by integer position.
    y : numpy.ndarray or pandas.Series
        Binary labels aligned with the rows of `X`.
    cv : iterable of (train_idx, valid_idx)
        Positional index arrays for each fold.
    model_path : optional
        Currently unused; kept so existing calls keep working.
    params : dict, optional
        Keyword arguments forwarded to `lgb.LGBMClassifier`. Defaults to `{}`.
    verbose : int, default 100
        Passed to `lgb.log_evaluation` as the logging period. The
        early-stopping message is printed only when `verbose > 0`.

    Returns
    -------
    score : float
        ROC AUC of the out-of-fold predictions against `y`.
    oof_pred : numpy.ndarray (float32)
        Positive-class probability for every row, filled from the fold in
        which the row was used for validation. Rows that never appear in a
        validation fold keep the initial value 0.
    models : list
        The fitted classifiers, one per fold, in fold order.
    """
    # Fall back to an empty dict when no parameters are given.
    if params is None:
        params = {}

    models = []
    # One out-of-fold prediction slot per training row, initialised to zero.
    oof_pred = np.zeros(len(X), dtype=np.float32)

    for i, (idx_train, idx_valid) in enumerate(cv):
        x_train, y_train = _take_rows(X, idx_train), _take_rows(y, idx_train)
        x_valid, y_valid = _take_rows(X, idx_valid), _take_rows(y, idx_valid)

        clf = lgb.LGBMClassifier(**params)

        clf.fit(x_train, y_train,
                eval_set=[(x_valid, y_valid)],
                callbacks=[
                    lgb.early_stopping(stopping_rounds=50, verbose=verbose > 0),
                    lgb.log_evaluation(verbose)
                ]
                )

        # Column 1 of predict_proba is the positive-class probability.
        pred_i = clf.predict_proba(x_valid)[:,1]
        oof_pred[idx_valid] = pred_i
        models.append(clf)
        fold_score = roc_auc_score(y_valid, pred_i)
        print(f" - fold{i + 1} - {fold_score:.4f}")

    score = roc_auc_score(y, oof_pred)

    print("=" * 50)
    print(f"FINISHI: CV Score: {score:.4f}")
    return score, oof_pred, models

In [None]:
# ========================================
# LightGBM parameters
# ========================================
# n_estimators is deliberately huge: training is cut short by the
# early-stopping callback inside train_lgbm.
params = {
    "objective": "binary",
    "metric": "auc",
    "n_estimators": 100000,
    "learning_rate": 0.1,
    "importance_type": "gain",
    "random_state": 42,
    "verbose": -1,
}
# ========================================
# train-validation split
# ========================================
N_SPLIT = 5
kf = StratifiedGroupKFold(N_SPLIT)
# Group folds by calendar day so that records from one day never end up on
# both sides of a split. The day is derived from `datetime`, which every row
# has; the `date` column only exists through a left join and is missing for
# rows without a match in the joined table.
# NOTE(review): assumes `date` carries no time-of-day component, so the
# grouping is unchanged for matched rows - confirm.
cv_groups = train['datetime'].dt.normalize()
cv_list = list(kf.split(train, y=train[TARGET], groups=cv_groups))

# ========================================
# define variables
# ========================================
X = train[feature_cols].values
y = train[TARGET].values

print('train shape:', train.shape)
# ========================================
# training
# ========================================
score, oof_pred, models = train_lgbm(X, y=y, params=params, cv=cv_list)

In [None]:
# def objective(trial):
#   params = {
#       "objective": "regression",
#       "metric": "mae",
#       "boosting_type": "rf",
#       "verbosity": -1,
#       "boost_from_average": "false",
#       "random_seed": 42,
#       "feature_pre_filter": False,
#       "max_depth": trial.suggest_int('max_depth', 4, 8),
#       "num_leaves": trial.suggest_int("num_leaves", 2, 100),
#       "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.1),
#       "feature_fraction": trial.suggest_float("feature_fraction", 0.1, 1.0),
#       "bagging_fraction": trial.suggest_float("bagging_fraction", 0.1, 1.0),
#       "min_child_samples": trial.suggest_int("min_child_samples", 1, 25),
#       "min_data_in_leaf": trial.suggest_int('min_data_in_leaf', 1, 4)
#       }

#   model = lgb.train(params, train_set, valid_sets=[val_set])

#   pred_y = model.predict(val_x, num_iteration=model.best_iteration)
#   f1 = f1_score(val_y, pred_y)

#   return f1

In [None]:
# study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=100)

In [None]:
# Search for the best decision threshold

def func(x_list, df, oof):
    """Objective for the threshold search: negated F1 at cutoff `x_list[0]`.

    `oof` holds predicted scores; entries strictly greater than the cutoff
    count as positive and are compared with `df[TARGET]`. The F1 score is
    negated because the optimiser minimises.
    """
    cutoff = x_list[0]
    predicted_positive = oof > cutoff
    return -f1_score(df[TARGET], predicted_positive)

# Start the search at a cutoff of 0.5 and let Nelder-Mead minimise the
# negated F1 score of the out-of-fold predictions.
x0 = [0.5]
result = minimize(
    fun=func,
    x0=x0,
    args=(train, oof_pred),
    method="nelder-mead",
)
threshold = result.x[0]

# Hard 0/1 labels at the selected cutoff, stored next to the target.
train['pred'] = (oof_pred > threshold).astype(int)

print('threshold:', threshold)
print(classification_report(train[TARGET], train['pred']))

In [None]:
# Persist the list of per-fold models.
# NOTE(review): `o_path` ('../model/') is defined near the top of the notebook
# but this cell writes under 'submit/model/' - confirm which one is intended.
model_file = 'submit/model/model.pickle'
# Create the target directory first; otherwise a fresh checkout fails with
# FileNotFoundError only after the whole training run has finished.
os.makedirs(os.path.dirname(model_file), exist_ok=True)
with open(model_file, mode='wb') as f:
    # protocol=2 is kept unchanged; presumably required by the environment
    # that loads the file - verify before raising it.
    pickle.dump(models,f,protocol=2)

In [None]:
# ========================================
# feature importance
# ========================================
def visualize_importance(models, feat_train_df):
    """Draw a box plot of per-fold feature importances.

    Parameters
    ----------
    models : iterable
        Fitted models exposing `feature_importances_`, one per fold.
    feat_train_df : pandas.DataFrame
        Frame whose columns name the features, in the order the models
        were trained on.

    Returns
    -------
    fig, ax
        The Matplotlib figure and axes holding the plot.
    feature_importance_df : pandas.DataFrame
        Long-format table with `feature_importance`, `column` and `fold`
        (1-based), one row per feature and fold.

    Raises
    ------
    ValueError
        If `models` is empty.
    """
    models = list(models)
    if not models:
        raise ValueError("models must contain at least one fitted model")

    # One small frame per fold, concatenated once, rather than growing a
    # frame inside the loop starting from an empty DataFrame.
    per_fold = [
        pd.DataFrame({
            "feature_importance": model.feature_importances_,
            "column": feat_train_df.columns,
            "fold": fold,
        })
        for fold, model in enumerate(models, start=1)
    ]
    feature_importance_df = pd.concat(per_fold, axis=0, ignore_index=True)

    # Features sorted by their importance summed over folds, largest first.
    order = feature_importance_df.groupby("column")\
        .sum()[["feature_importance"]]\
        .sort_values("feature_importance", ascending=False).index

    # Height grows with the number of features so labels stay readable.
    fig, ax = plt.subplots(figsize=(12, max(6, len(order) * .25)))
    sns.boxplot(data=feature_importance_df,
                  x="feature_importance",
                  y="column",
                  order=order,
                  ax=ax,
                  palette="viridis",
                  orient="h")
    ax.tick_params(axis="x", rotation=90)
    ax.set_title("Importance")
    ax.grid()
    fig.tight_layout()
    return fig, ax, feature_importance_df

# Plot the importances of the fold models against the training feature columns.
fig, ax, feature_importance_df = visualize_importance(
    models=models,
    feat_train_df=train[feature_cols],
)