In [37]:
import pandas as pd

In [38]:
df_train = pd.read_csv('../../00_data/train.csv')

# 全部読み込むと遅いのでとりあえず冒頭5000データのみで分析する
df_train = df_train[:5000].copy()
print(len(df_train))
df_train.head()

5000


Unnamed: 0,datetime,start_code,end_code,KP,OCC,allCars,speed,is_congestion
0,2021-04-08 00:00:00,1110210,1800006,2.48,1.833333,507,94.208661,0
1,2021-04-08 01:00:00,1110210,1800006,2.48,1.75,444,94.469663,0
2,2021-04-08 02:00:00,1110210,1800006,2.48,1.5,363,92.593407,0
3,2021-04-08 03:00:00,1110210,1800006,2.48,1.583333,430,94.50116,0
4,2021-04-08 04:00:00,1110210,1800006,2.48,1.75,500,94.07984,0


In [39]:
df_train['datetime'] = pd.to_datetime(df_train['datetime'])
df_train['date'] = df_train['datetime'].dt.date
df_train['time'] = df_train['datetime'].dt.time
df_train['weekday'] = df_train['datetime'].dt.weekday

df_train.head()

Unnamed: 0,datetime,start_code,end_code,KP,OCC,allCars,speed,is_congestion,date,time,weekday
0,2021-04-08 00:00:00,1110210,1800006,2.48,1.833333,507,94.208661,0,2021-04-08,00:00:00,3
1,2021-04-08 01:00:00,1110210,1800006,2.48,1.75,444,94.469663,0,2021-04-08,01:00:00,3
2,2021-04-08 02:00:00,1110210,1800006,2.48,1.5,363,92.593407,0,2021-04-08,02:00:00,3
3,2021-04-08 03:00:00,1110210,1800006,2.48,1.583333,430,94.50116,0,2021-04-08,03:00:00,3
4,2021-04-08 04:00:00,1110210,1800006,2.48,1.75,500,94.07984,0,2021-04-08,04:00:00,3


In [40]:
#一日前の情報をもとに翌日の結果を予測するので、一日ずらす必要がある
# is_congestionは現在の道路の情報なので、実際の目的変数は一日ずらさないといけない
# つまり、一番古い時刻の正解データ（目的変数）は学習には使えない
# 同様に、一番新しい時刻の学習データ（説明変数）の正解データはない


# 目的変数の作成
# 目的変数はspeedの実数値を予測する回帰予測タスクとする
df_target = df_train[['datetime', 'speed', 'is_congestion']].copy()
# 一日前の予測結果の目的変数として使用するため一日ずらす
df_target['datetime'] -= pd.to_timedelta(1, 'd')
df_target.rename(columns={'speed':'target_speed', 'is_congestion':'target_flag'}, inplace=True)
df_target

Unnamed: 0,datetime,target_speed,target_flag
0,2021-04-07 00:00:00,94.208661,0
1,2021-04-07 01:00:00,94.469663,0
2,2021-04-07 02:00:00,92.593407,0
3,2021-04-07 03:00:00,94.501160,0
4,2021-04-07 04:00:00,94.079840,0
...,...,...,...
4995,2021-11-01 03:00:00,95.005587,0
4996,2021-11-01 04:00:00,94.252650,0
4997,2021-11-01 05:00:00,93.789439,0
4998,2021-11-01 06:00:00,88.867681,0


In [41]:
df_train

Unnamed: 0,datetime,start_code,end_code,KP,OCC,allCars,speed,is_congestion,date,time,weekday
0,2021-04-08 00:00:00,1110210,1800006,2.48,1.833333,507,94.208661,0,2021-04-08,00:00:00,3
1,2021-04-08 01:00:00,1110210,1800006,2.48,1.750000,444,94.469663,0,2021-04-08,01:00:00,3
2,2021-04-08 02:00:00,1110210,1800006,2.48,1.500000,363,92.593407,0,2021-04-08,02:00:00,3
3,2021-04-08 03:00:00,1110210,1800006,2.48,1.583333,430,94.501160,0,2021-04-08,03:00:00,3
4,2021-04-08 04:00:00,1110210,1800006,2.48,1.750000,500,94.079840,0,2021-04-08,04:00:00,3
...,...,...,...,...,...,...,...,...,...,...,...
4995,2021-11-02 03:00:00,1110210,1800006,2.48,1.916667,536,95.005587,0,2021-11-02,03:00:00,1
4996,2021-11-02 04:00:00,1110210,1800006,2.48,1.916667,565,94.252650,0,2021-11-02,04:00:00,1
4997,2021-11-02 05:00:00,1110210,1800006,2.48,4.583333,1495,93.789439,0,2021-11-02,05:00:00,1
4998,2021-11-02 06:00:00,1110210,1800006,2.48,8.833333,3415,88.867681,0,2021-11-02,06:00:00,1


In [42]:
# もともとのデータとマージする
df_dataset = pd.merge(df_train, df_target, on='datetime', how='inner')
df_dataset.head

<bound method NDFrame.head of                 datetime  start_code  end_code    KP       OCC  allCars  \
0    2021-04-08 00:00:00     1110210   1800006  2.48  1.833333      507   
1    2021-04-08 01:00:00     1110210   1800006  2.48  1.750000      444   
2    2021-04-08 02:00:00     1110210   1800006  2.48  1.500000      363   
3    2021-04-08 03:00:00     1110210   1800006  2.48  1.583333      430   
4    2021-04-08 04:00:00     1110210   1800006  2.48  1.750000      500   
...                  ...         ...       ...   ...       ...      ...   
4971 2021-11-01 03:00:00     1110210   1800006  2.48  1.416667      444   
4972 2021-11-01 04:00:00     1110210   1800006  2.48  1.916667      540   
4973 2021-11-01 05:00:00     1110210   1800006  2.48  4.166667     1487   
4974 2021-11-01 06:00:00     1110210   1800006  2.48  8.000000     3184   
4975 2021-11-01 07:00:00     1110210   1800006  2.48  8.333333     3342   

          speed  is_congestion        date      time  weekday  target

In [43]:
print(df_dataset.dtypes)
print(df_dataset.columns)

datetime         datetime64[ns]
start_code                int64
end_code                  int64
KP                      float64
OCC                     float64
allCars                   int64
speed                   float64
is_congestion             int64
date                     object
time                     object
weekday                   int64
target_speed            float64
target_flag               int64
dtype: object
Index(['datetime', 'start_code', 'end_code', 'KP', 'OCC', 'allCars', 'speed',
       'is_congestion', 'date', 'time', 'weekday', 'target_speed',
       'target_flag'],
      dtype='object')


In [44]:
# 特徴量
columns_input = ['start_code', 'end_code', 'KP', 'OCC', 'allCars', 'speed', 'weekday']
columns_output = ['target_speed']

In [45]:
df_input_data = df_dataset[columns_input]
df_target_data = df_dataset[columns_output]
df_input_data

Unnamed: 0,start_code,end_code,KP,OCC,allCars,speed,weekday
0,1110210,1800006,2.48,1.833333,507,94.208661,3
1,1110210,1800006,2.48,1.750000,444,94.469663,3
2,1110210,1800006,2.48,1.500000,363,92.593407,3
3,1110210,1800006,2.48,1.583333,430,94.501160,3
4,1110210,1800006,2.48,1.750000,500,94.079840,3
...,...,...,...,...,...,...,...
4971,1110210,1800006,2.48,1.416667,444,95.817978,0
4972,1110210,1800006,2.48,1.916667,540,97.170055,0
4973,1110210,1800006,2.48,4.166667,1487,95.511425,0
4974,1110210,1800006,2.48,8.000000,3184,90.380848,0


In [46]:
# optuna再現性確保のための設定
import numpy as np
import random as rn
np.random.seed(42)
rn.seed(42)

In [47]:
import lightgbm as lgb
# import optuna.integration.lightgbm as lgb

# lightgbmのパラメータ
# とりあえず最低限
param = {
    'objective': 'regression',
    'learning_rate':0.01, 
    'n_estimators': 10000,
    'importance_type':'gain',
    'deterministic':True,   # 同じ条件のパラメータの時に安定して同じ結果を返す
    'verbose': -1, #途中経過を表示させない
}

In [48]:
# # カテゴリ変数化
# df_input_data = pd.get_dummies(df_input_data, drop_first=True)

# df_input_data.columns

In [49]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

X_train, X_valid, y_train, y_valid = train_test_split(df_input_data, df_target_data, test_size=0.2, random_state=42)

# lightgbm用のデータセットを作成
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_valid = lgb.Dataset(data=X_valid, label=y_valid)

# model = lgb.LGBMRegressor(params = param)
model = lgb.train(params=param,
                    train_set=lgb_train,
                    valid_sets=[lgb_train, lgb_valid],
                    num_boost_round=100,
                    # optuna_seed=42,
                    )
# model.fit()
y_pred_trian = model.predict(X_train)
y_pred_valid = model.predict(X_valid)

# model.best_params
# optunaの最良モデルを取得
# tuned_model = model.get_best_booster()



In [50]:
y_train_pred = model.predict(X_train)
y_valid_pred = model.predict(X_valid)

y_train_pred

array([96.29495744, 88.20254962, 91.95589754, ..., 96.64076829,
       95.60291352, 94.2246112 ])

In [51]:
df_result_train = pd.DataFrame(y_train_pred, columns=['train_pred_speed'])
df_result_train['prediction'] = df_result_train.eval('train_pred_speed < 40')

df_result_valid = pd.DataFrame(y_valid_pred, columns=['train_valid_speed'])
df_result_valid['prediction'] = df_result_valid.eval('train_valid_speed < 40')

df_result_train.head()

Unnamed: 0,train_pred_speed,prediction
0,96.294957,False
1,88.20255,False
2,91.955898,False
3,94.12197,False
4,90.537355,False


In [52]:
print(df_result_train.shape)
df_result_train.sum()

(3980, 2)


train_pred_speed    367991.23139
prediction               9.00000
dtype: float64

In [56]:
import pickle
import os
model_name = 'model.pkl'
# モデルまでのパス
model_dir_path = os.path.join('submit\\model\\', model_name)

# モデル出力
with open(model_dir_path, 'wb') as f:
    pickle.dump(model, f)


In [57]:
#モデル読み込み確認
with open(model_dir_path, 'rb') as f:
    tmp = pickle.load(f)

tmp

<lightgbm.basic.Booster at 0x27591724080>