In [1]:
# ========================================
# Library
# ========================================
import math
import random
import pickle
import joblib
import itertools
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
import japanize_matplotlib
import jpholiday
from glob import glob
from tqdm.notebook import tqdm
from sklearn.model_selection import (
    TimeSeriesSplit,
    StratifiedKFold,
    KFold,
    GroupKFold,
    StratifiedGroupKFold,
)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import f1_score, roc_auc_score, classification_report
from scipy.optimize import minimize
import lightgbm as lgb
import datetime
import copy

from sklearn.multioutput import ClassifierChain

## データの読み込み・前加工

In [2]:
# Load the raw tables from ./train. How they are used is visible in
# extract_dataset below:
#   road_df          - joined on (start_code, end_code) after dropping
#                      start_name / end_name
#   search_spec_df   - joined on (datetime, start_code, end_code)
#   search_unspec_df - has a 'date' column; joined on
#                      (year, month, day, start_code, end_code)
#   train_df         - base table with a 'datetime' column
road_df = pd.read_csv('./train/road.csv')
search_spec_df = pd.read_csv('./train/search_data.csv')
search_unspec_df = pd.read_csv('./train/search_unspec_data.csv')
train_df = pd.read_csv('./train/train.csv')

In [3]:
def expand_datetime(df):
    """Add calendar-part columns derived from 'datetime' and/or 'date'.

    The frame is modified in place and also returned.

    * If a 'datetime' column exists: 'year', 'month', 'day', 'hour' are set
      from it.
    * If a 'date' column exists: 'year', 'month', 'day' are set from it.
      When both columns exist, the values taken from 'date' overwrite the
      ones taken from 'datetime' ('hour' is left untouched).

    Both source columns must be datetime-like (the ``.dt`` accessor is used).
    """
    # (source column, calendar parts to derive), processed in this order so
    # that 'date' wins over 'datetime' when both are present.
    derivations = (
        ('datetime', ('year', 'month', 'day', 'hour')),
        ('date', ('year', 'month', 'day')),
    )

    for source_col, parts in derivations:
        if source_col not in df.columns:
            continue
        stamps = df[source_col].dt
        for part in parts:
            df[part] = getattr(stamps, part)

    return df

def extract_dataset(train_df, search_spec_df, search_unspec_df):
    """Join the search and road tables onto the training table.

    Steps:
      1. Parse ``train_df['datetime']``, ``search_spec_df['datetime']`` and
         ``search_unspec_df['date']`` with ``pd.to_datetime`` (written back
         into the passed frames).
      2. Add year/month/day(/hour) columns to ``train_df`` and
         ``search_unspec_df`` via ``expand_datetime`` (also in place).
         ``search_spec_df`` is not expanded; it is joined on the full
         ``datetime`` value.
      3. Left-join, in order: ``search_spec_df`` on
         (datetime, start_code, end_code); ``search_unspec_df`` on
         (year, month, day, start_code, end_code); the module-level
         ``road_df`` without its start_name / end_name columns on
         (start_code, end_code).
      4. Add ``dayofweek`` (``Series.dt.weekday`` of ``datetime``).

    Returns the merged frame. Its 'date' column comes from
    ``search_unspec_df`` through the left join in step 3.
    """
    # Parse the timestamp column of each input in place.
    for frame, stamp_col in (
        (train_df, 'datetime'),
        (search_spec_df, 'datetime'),
        (search_unspec_df, 'date'),
    ):
        frame[stamp_col] = pd.to_datetime(frame[stamp_col])

    train_df = expand_datetime(train_df)
    search_unspec_df = expand_datetime(search_unspec_df)

    section_keys = ['start_code', 'end_code']
    merged = train_df.merge(search_spec_df, on=['datetime'] + section_keys, how='left')
    merged = merged.merge(search_unspec_df, on=['year', 'month', 'day'] + section_keys, how='left')

    road_attrs = road_df.drop(['start_name', 'end_name'], axis=1)
    merged = merged.merge(road_attrs, on=section_keys, how='left')

    merged['dayofweek'] = merged['datetime'].dt.weekday

    return merged

In [4]:
# Build the joined training table. Note that extract_dataset also writes the
# parsed timestamp / calendar columns back into the three input frames.
train = extract_dataset(train_df, search_spec_df, search_unspec_df)
# 'section' identifies a road segment as "<start_code>_<end_code>".
train['section'] = train['start_code'].astype(str)+'_'+train['end_code'].astype(str)

## データ加工

In [5]:
def holiday_add_a2(df, date_col="date", start_year=2021, end_year=2023):
    """Attach day-off flags and adjacent day-off run lengths to ``df``.

    A daily calendar covering ``start_year``-01-01 .. ``end_year``-12-31 is
    built, in which ``holiday`` is 1 when the day is

    * a public holiday returned by ``jpholiday.between``,
    * a Saturday or Sunday,
    * inside Golden Week (Apr 29 - May 5), Obon (Aug 12 - 16) or the
      New Year period (Dec 30 - Jan 3),

    and 0 otherwise. ``holiday_after`` / ``holiday_before`` are then added by
    ``add_holidays_from_tomorrow`` / ``add_holidays_to_yesterday``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is copied, not modified.
    date_col : str
        Name of the date column in ``df``; it is also used as the date column
        of the generated calendar and is forwarded to the two helpers.
    start_year, end_year : int
        First and last calendar year (inclusive) covered by the calendar.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``date_col`` converted by ``pd.to_datetime`` and
        the columns ``holiday``, ``holiday_before``, ``holiday_after``
        left-joined on ``date_col``. Rows whose date has no exact match in the
        calendar get NaN in those columns.

    Side effects
    ------------
    Writes the full calendar to ``./calendar.csv`` and ``./calendar.pickle``.
    """
    merged_t = df.copy()

    # One row per calendar day; every day starts as a working day.
    df_holiday = pd.DataFrame(
        pd.date_range(start=f"{start_year}-1-1", end=f"{end_year}-12-31"),
        columns=[date_col],
    ).set_index(date_col)
    df_holiday["holiday"] = 0.0

    # Public holidays. jpholiday yields (date, name) pairs; convert the dates
    # to Timestamps so the label lookup against the DatetimeIndex is explicit
    # instead of relying on datetime.date labels being coerced.
    holiday = jpholiday.between(datetime.date(start_year, 1, 1), datetime.date(end_year, 12, 31))
    holiday_date = pd.to_datetime([x[0] for x in holiday])
    df_holiday.loc[holiday_date, "holiday"] = 1

    df_holiday = df_holiday.reset_index()
    df_holiday["dayofweek"] = df_holiday[date_col].dt.weekday
    df_holiday["month"] = df_holiday[date_col].dt.month
    df_holiday["day"] = df_holiday[date_col].dt.day

    month = df_holiday["month"]
    day = df_holiday["day"]

    # Saturdays (5) and Sundays (6) count as days off.
    df_holiday.loc[df_holiday["dayofweek"].isin([5, 6]), "holiday"] = 1

    # Fixed long-vacation periods also count as days off.
    ## Golden Week
    df_holiday.loc[(month == 4) & (day >= 29), "holiday"] = 1
    df_holiday.loc[(month == 5) & (day <= 5), "holiday"] = 1
    ## Obon
    df_holiday.loc[(month == 8) & (day >= 12) & (day <= 16), "holiday"] = 1
    ## Year end / New Year
    df_holiday.loc[(month == 12) & (day >= 30), "holiday"] = 1
    df_holiday.loc[(month == 1) & (day <= 3), "holiday"] = 1

    # Run lengths of consecutive days off after / before each day.
    # date_col must be forwarded: the helpers default to "date".
    df_holiday = add_holidays_from_tomorrow(df_holiday, col_date=date_col)
    df_holiday = add_holidays_to_yesterday(df_holiday, col_date=date_col)

    # Join onto the input. Both key columns are normalised to datetime first.
    calendar_cols = ["holiday", "holiday_before", "holiday_after"]
    df_holiday[date_col] = pd.to_datetime(df_holiday[date_col])
    merged_t[date_col] = pd.to_datetime(merged_t[date_col])
    # Drop calendar columns left over from a previous call so that re-running
    # replaces them instead of producing holiday_x / holiday_y duplicates.
    merged_t = merged_t.drop(columns=calendar_cols, errors="ignore")
    merged_t = pd.merge(merged_t, df_holiday[[date_col] + calendar_cols], on=date_col, how="left")

    df_holiday.to_csv("./calendar.csv", index=False)
    df_holiday.to_pickle("./calendar.pickle")

    return merged_t

def _count_adjacent_holidays(df, col_date, col_holiday, col_new, step_days):
    """Write into ``df[col_new]`` the run length of days off next to each date.

    Starting one step (``step_days`` = +1 or -1 day) away from a row's date,
    consecutive dates are counted for as long as the date exists in
    ``df[col_date]`` and its ``col_holiday`` value is not 0. The walk stops at
    the first working day or at the first date missing from the frame.

    ``df`` is modified in place and returned. ``col_new`` is float; rows whose
    date is missing (NaT/NaN) get NaN.
    """
    step = datetime.timedelta(days=step_days)
    dates = df[col_date].tolist()

    # Date -> "is a day off" lookup, built once instead of rescanning the
    # whole column for every step. For duplicated dates the first row wins.
    is_day_off = {}
    for day, flag in zip(dates, df[col_holiday].tolist()):
        if not pd.isna(day):
            is_day_off.setdefault(day, flag != 0)

    run_by_date = {}
    counts = []
    for day in dates:
        if pd.isna(day):
            counts.append(float("nan"))
            continue
        if day not in run_by_date:
            run = 0
            cursor = day + step
            while is_day_off.get(cursor, False):
                run += 1
                cursor += step
            run_by_date[day] = run
        counts.append(run_by_date[day])

    # Assigning the whole column at once also creates it for an empty frame.
    df[col_new] = np.array(counts, dtype=float)
    return df


def add_holidays_from_tomorrow(df, col_date="date", col_holiday="holiday"):
    """Add 'holiday_after': number of consecutive days off starting tomorrow.

    Parameters
    ----------
    df : pandas.DataFrame
        Calendar-like frame; modified in place and returned.
    col_date : str
        Column holding the dates (values must support ``+ timedelta``).
    col_holiday : str
        Column that is non-zero on days off.

    A value of 0 means the next day is a working day or is not present in
    ``df``.
    """
    return _count_adjacent_holidays(df, col_date, col_holiday, 'holiday_after', 1)


def add_holidays_to_yesterday(df, col_date="date", col_holiday="holiday"):
    """Add 'holiday_before': number of consecutive days off ending yesterday.

    Parameters
    ----------
    df : pandas.DataFrame
        Calendar-like frame; modified in place and returned.
    col_date : str
        Column holding the dates (values must support ``+ timedelta``).
    col_holiday : str
        Column that is non-zero on days off.

    A value of 0 means the previous day is a working day or is not present in
    ``df``.
    """
    return _count_adjacent_holidays(df, col_date, col_holiday, 'holiday_before', -1)

In [6]:
# Adds holiday / holiday_before / holiday_after to train (left join on
# 'date'). As a side effect holiday_add_a2 writes ./calendar.csv and
# ./calendar.pickle.
train = holiday_add_a2(train)

In [7]:
# Reload the calendar written by holiday_add_a2 (for inspection).
# NOTE(review): df_tmp is not referenced by any later cell visible here.
df_tmp = pd.read_pickle("./calendar.pickle")

In [8]:
# train.info()

In [9]:
# train[train["holiday_before"]>=6]

In [10]:
# jpholiday.between(datetime.date(2021, 1, 1), datetime.date(2023, 12, 31))

In [11]:
# Column groups consumed by convert_df_for_rnn.
# Categorical day-level columns (label-encoded there).
cat_cols = ["start_code", "end_code", "start_pref_code", "end_pref_code", "road_code", "dayofweek", "holiday", "direction", "month"] # 9 columns
# Alternative without start_code / end_code (7 columns):
# cat_cols = ["start_pref_code", "end_pref_code", "road_code", "dayofweek", "holiday", "direction", "month"]
# Numeric day-level columns.
num_cols = ["day", "search_unspec_1d", "start_lat", "end_lat", "start_lng", "end_lng",
            "start_degree", "end_degree", "KP", "limit_speed", "start_KP", "end_KP",
           "holiday_before", "holiday_after"] # 14 columns
# Hourly numeric columns; pivoted into one column per hour.
num_1h_cols = ["OCC", "allCars", "speed", "search_1h",] # 4 columns
# Passed to convert_df_for_rnn but not used inside it.
rem_cols = ["datetime", "year", "hour"] # 3 columns
# Target column; pivoted per hour together with num_1h_cols.
tar_col = "is_congestion" # 1 column
# Keys identifying one output row (pivot index and merge keys).
key_cols = ["date", "section"] # 2 columns

In [12]:
def convert_df_for_rnn(df, cat_cols, num_cols, num_1h_cols, rem_cols, tar_col, key_cols):
    """Reshape the hourly table into one row per ``key_cols`` combination.

    Hour-specific fields in the input: OCC, allCars, speed, is_congestion
    (target), hour, search_1h.

    Processing:
      1. Missing ``search_1h`` / ``search_unspec_1d`` values become -1 (on a
         new frame; the caller's ``df`` is left untouched).
      2. Day-level columns (``key_cols + cat_cols + num_cols``) are taken from
         the rows with ``hour == 10``; key combinations without such a row do
         not appear in the result.
      3. ``num_1h_cols`` and ``tar_col`` are pivoted to ``<name>_<hour>``
         columns (mean per key / hour) and joined to the day-level columns.
      4. ``cat_cols`` are label-encoded; the fitted encoders are pickled to
         ``../src/features/le_dict.pkl``.
      5. The same-day target columns are renamed by replacing ``tar_col``
         with ``old`` and kept as features. The targets of the following day
         (matched by shifting 'date' back one day) become the new target
         columns. Rows with any missing target are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Hourly table containing 'hour', 'date' and all listed columns.
    cat_cols, num_cols, num_1h_cols : list of str
        Column groups, see above.
    rem_cols : list of str
        Accepted for interface compatibility; not used.
    tar_col : str
        Name of the hourly target column.
    key_cols : list of str
        Row keys. The day shift in step 5 operates on the column literally
        named 'date', so it is expected to be one of them.

    Returns
    -------
    (df_day, feature_cols, tar_cols)
        The reshaped frame with a fresh 0..n-1 index, the list of feature
        column names and the list of target column names.
    """
    import os  # local import: only needed to prepare the output directory

    # Fill missing search counts on a new frame instead of writing into the
    # caller's DataFrame.
    df = df.assign(
        search_1h=df["search_1h"].fillna(-1),
        search_unspec_1d=df["search_unspec_1d"].fillna(-1),
    )

    # Collapse to one row per day: hour 10 is an arbitrary representative row
    # for the day-level columns. Copy so the assignments below act on an
    # independent frame rather than on a slice of df.
    df_day = df.loc[df["hour"] == 10, key_cols + cat_cols + num_cols].copy()

    # Cast the categorical columns to category dtype.
    for col in cat_cols:
        df_day[col] = df_day[col].astype("category")

    # Hour-dependent columns: one column per (value, hour) pair.
    df_1h = df.pivot_table(index=key_cols, columns=["hour"], values=num_1h_cols + [tar_col], aggfunc="mean")
    df_1h.columns = [str(col[0]) + "_" + str(col[1]) for col in df_1h.columns]
    df_1h = df_1h.reset_index()

    df_day = pd.merge(df_day, df_1h, on=key_cols, how="left")

    # Label-encode the categorical columns and keep the fitted encoders.
    le_dict = {}
    for c in tqdm(cat_cols):
        le = LabelEncoder()
        le.fit(df_day[c])
        df_day[c] = le.transform(df_day[c])
        le_dict[c] = le

    le_path = "../src/features/le_dict.pkl"
    # Make sure the target directory exists so the dump cannot fail after all
    # the work above has been done.
    os.makedirs(os.path.dirname(le_path), exist_ok=True)
    with open(le_path, "wb") as f:
        pickle.dump(le_dict, f)

    df_day = df_day.sort_values(key_cols).reset_index(drop=True)

    # Split the columns: names containing tar_col are targets, keys are
    # neither, everything else is a feature.
    feature_cols = []
    tar_cols = []
    for col in df_day.columns:
        if tar_col in col:
            tar_cols.append(col)
        elif col not in key_cols:
            feature_cols.append(col)

    # Shift the targets by one day: after moving 'date' back by one day, the
    # merge below gives the row of day D the targets observed on day D+1.
    df_yesterday = df_day.copy(deep=True)
    df_yesterday["date"] = df_yesterday["date"] - datetime.timedelta(days=1)

    # Same-day targets stay in the table as "old_<hour>" features.
    # Derive the names from tar_col instead of a hard-coded "is_congestion".
    old_tra_cols = [i.replace(tar_col, "old") for i in tar_cols]
    feature_cols += old_tra_cols
    df_day = df_day.rename(dict(zip(tar_cols, old_tra_cols)), axis=1)

    df_yesterday = df_yesterday[key_cols + tar_cols]
    df_day = pd.merge(df_day, df_yesterday, on=key_cols, how="left")

    # Rows without next-day targets cannot be used. Reset the index so that
    # row labels equal row positions again (fold indices produced by
    # scikit-learn splitters are positional).
    df_day = df_day.dropna(subset=tar_cols).reset_index(drop=True)

    return df_day, feature_cols, tar_cols

In [13]:
# Build the modelling table (one row per date/section) together with the
# feature and target column lists. convert_df_for_rnn also pickles the fitted
# label encoders to ../src/features/le_dict.pkl.
df, feature_cols, tar_cols = convert_df_for_rnn(train, cat_cols, num_cols, num_1h_cols, rem_cols, tar_col, key_cols)

  0%|          | 0/9 [00:00<?, ?it/s]

In [14]:
# df["holiday_before"].value_counts()

In [15]:
# df[tar_cols].sum()

In [16]:
# (rows, columns) of the modelling table.
df.shape

(37841, 169)

# モデル作成

In [17]:
# Feature matrix and multi-label target matrix (one target column per hour).
X = df[feature_cols]
Y = df[tar_cols]

# 6-fold CV grouped by the (label-encoded) 'month' column, so all rows of a
# month fall into the same fold; stratified on whether the row has at least
# one congested hour. split() yields (train, validation) arrays of row
# positions, not index labels.
kf = StratifiedGroupKFold(n_splits=6)
cv_list = list(kf.split(X, y=Y.sum(axis=1)>0, groups=X['month'])) 

In [18]:
# len(cv_list)

In [19]:
# params = {
#     "objective": "binary",
#     "n_estimators": 100000,
#     "learning_rate": 0.05,
#     "importance_type": "gain",
#     "random_state": 42,
#     "verbose": -1,
# }

In [20]:
# help(lgb.LGBMClassifier)

In [21]:
%time
model_list = []

for i, cv in enumerate(tqdm(cv_list)):
    print(f"#####___Fold:{i:02}___#####")
    
    ind_train, ind_valid = cv
    X_train = X.loc[ind_train]
    Y_train = Y.loc[ind_train]
    X_valid = X.loc[ind_valid]
    Y_valid = Y.loc[ind_valid]

    base_lr = lgb.LGBMClassifier(objective="binary", n_estimators=1000, n_jobs=-1, class_weight="balanced", num_leaves=63, colsample_bytree=0.8)
    clf = ClassifierChain(base_lr, order=None, random_state=i, verbose=2, cv=5)
    clf.fit(X_train, Y_train)
    model_list.append(clf)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.01 µs


  0%|          | 0/6 [00:00<?, ?it/s]

#####___Fold:00___#####
[Chain] .................. (1 of 24) Processing order 0, total=   1.8s
[Chain] .................. (2 of 24) Processing order 1, total=   1.9s
[Chain] .................. (3 of 24) Processing order 2, total=   1.7s
[Chain] .................. (4 of 24) Processing order 3, total=   1.9s
[Chain] .................. (5 of 24) Processing order 4, total=   1.8s
[Chain] .................. (6 of 24) Processing order 5, total=   1.8s
[Chain] .................. (7 of 24) Processing order 6, total=   2.0s
[Chain] .................. (8 of 24) Processing order 7, total=   2.4s
[Chain] .................. (9 of 24) Processing order 8, total=   2.6s
[Chain] ................. (10 of 24) Processing order 9, total=   2.8s
[Chain] ................ (11 of 24) Processing order 10, total=   2.9s
[Chain] ................ (12 of 24) Processing order 11, total=   2.8s
[Chain] ................ (13 of 24) Processing order 12, total=   2.6s
[Chain] ................ (14 of 24) Processing order 

KeyboardInterrupt: 

In [None]:
df_score_list = []

# Out-of-fold predictions. Start from NaN rather than from a copy of Y:
# if model_list holds fewer models than cv_list (e.g. training was
# interrupted), the rows of the missing folds stay NaN instead of silently
# containing the ground truth, which would be scored as perfect predictions.
# Metrics computed from Y_pred then fail on the NaN rows.
Y_pred = pd.DataFrame(np.nan, index=Y.index, columns=Y.columns)
for i in tqdm(range(len(model_list))):
    print(f"#####___Fold:{i:02}___#####")
    clf = model_list[i]
    ind_train, ind_valid = cv_list[i]

    # Fold indices are row positions, hence .iloc (not .loc).
    X_train = X.iloc[ind_train]
    Y_train = Y.iloc[ind_train]
    X_valid = X.iloc[ind_valid]
    Y_valid = Y.iloc[ind_valid]

    Y_train_pred = clf.predict(X_train)
    Y_valid_pred = clf.predict(X_valid)

    Y_pred.iloc[ind_valid] = Y_valid_pred
    # Per-fold scores (disabled):
    # print(f1_score(Y_train.values.ravel(), Y_train_pred.ravel()))
    # print(f1_score(Y_valid.values.ravel(), Y_valid_pred.ravel()))

In [None]:
# Flatten the (rows x target columns) matrices to 1-D so that all target
# columns are scored together as one binary problem.
y_true = Y.values.ravel()
y_pred = Y_pred.values.ravel()

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Overall out-of-fold metrics on the flattened targets.
display(confusion_matrix(y_true, y_pred))
print("accuracy", accuracy_score(y_true, y_pred))
print("precision", precision_score(y_true, y_pred))  # label was misspelled "precison"
print("recall", recall_score(y_true, y_pred))
print("f1 score", f1_score(y_true, y_pred))

In [None]:
# Persist one pickle per fold. Each entry of model_list is the fitted
# ClassifierChain of that fold (the file name says "lgb", but the object
# stored is the whole chain).
# NOTE(review): ../model must already exist; open() does not create it.
for i, m in enumerate(model_list):
    with open(f'../model/lgb_fold{i}.pickle', mode="wb") as f:
        pickle.dump(m, f)