# ライブラリのImport, Seedの固定

In [60]:
import random
import os
import torch

import numpy as np
import pandas as pd
import warnings
import pickle
from sklearn.metrics import mean_squared_error

import lightgbm as lgb
from catboost import Pool, CatBoostRegressor
import xgboost as xgb

warnings.simplefilter('ignore')

def seed_torch(seed=42):
    # python の組み込み関数の seed を固定
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    # numpy の seed を固定
    np.random.seed(seed)
    # torch の seed を固定
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # 決定論的アルゴリズムを使用する
    torch.backends.cudnn.deterministic = True

SEED = 42
seed_torch(SEED)

# 教師データ読み込みと前処理

In [61]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')

df = pd.read_csv("~/Desktop/signate-main/data/train_add.csv") 
df_test = pd.read_csv("~/Desktop/signate-main/data/test_add.csv")


df

Unnamed: 0,id,year,month,day,Country,City,lat,lon,co_cnt,co_min,...,temperature_add,humidity_add,pressure_add,ws_add,dew_add,distance,manhattan,bearing,co_ci,near_city
0,1,2019,1,1,0,24,-27.46794,153.02809,38,0.749,...,21.1480,246.7772,638.3688,9.6826,12.7130,15818.210,20070.243,42.188,1,55
1,2,2019,1,1,0,54,-12.46113,130.84185,47,2.594,...,39.8486,146.6794,612.9314,14.0216,26.6758,14420.323,15934.564,10.566,2,1
2,3,2019,1,1,0,146,-37.81400,144.96332,17,1.190,...,60.5012,325.0358,1517.3096,30.1830,13.4438,14489.153,20323.911,41.167,3,1
3,4,2019,1,1,0,176,-32.92953,151.78010,63,4.586,...,32.0948,91.2904,639.7020,11.7728,31.3184,15311.013,20538.774,45.369,4,55
4,5,2019,1,1,0,193,-31.95224,115.86140,47,4.689,...,18.5614,172.8750,641.8532,21.7176,22.7498,12422.947,16436.127,14.388,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195936,195937,2021,12,31,28,103,30.33218,-81.65565,12,0.694,...,25.3270,143.9064,620.7708,7.3574,18.1526,9207.432,12452.479,175.764,288,9
195937,195938,2021,12,31,28,133,36.17497,-115.13722,14,0.528,...,32.3818,206.4498,624.9680,14.0382,12.0156,12237.383,16825.148,-164.519,289,17
195938,195939,2021,12,31,28,150,43.03890,-87.90647,171,1.975,...,47.4424,108.3046,654.9970,26.0542,8.7104,9837.417,14560.461,178.571,291,10
195939,195940,2021,12,31,29,88,21.02450,105.84117,31,2.613,...,67.8490,116.8230,678.7892,56.4306,14.5446,11648.967,14106.819,-5.813,299,19


In [62]:
def add_yearmonth(df):
    df1 = df[df['month'] < 10].copy()
    df2 = df[df['month'] >= 10].copy()
    df1['yearmonth'] = df1['year'].astype(str) + ('0' + df1['month'].astype(str))
    df2['yearmonth'] = df2['year'].astype(str) + df2['month'].astype(str)
    return pd.concat([df1, df2])

df = add_yearmonth(df)
df_test = add_yearmonth(df_test)

df['date'] = pd.to_datetime({'year': df['year'], 'month': df['month'], 'day': df['day']})
df_test['date'] = pd.to_datetime({'year': df_test['year'], 'month': df_test['month'], 'day': df_test['day']})


df

Unnamed: 0,id,year,month,day,Country,City,lat,lon,co_cnt,co_min,...,pressure_add,ws_add,dew_add,distance,manhattan,bearing,co_ci,near_city,yearmonth,date
0,1,2019,1,1,0,24,-27.46794,153.02809,38,0.749,...,638.3688,9.6826,12.7130,15818.210,20070.243,42.188,1,55,201901,2019-01-01
1,2,2019,1,1,0,54,-12.46113,130.84185,47,2.594,...,612.9314,14.0216,26.6758,14420.323,15934.564,10.566,2,1,201901,2019-01-01
2,3,2019,1,1,0,146,-37.81400,144.96332,17,1.190,...,1517.3096,30.1830,13.4438,14489.153,20323.911,41.167,3,1,201901,2019-01-01
3,4,2019,1,1,0,176,-32.92953,151.78010,63,4.586,...,639.7020,11.7728,31.3184,15311.013,20538.774,45.369,4,55,201901,2019-01-01
4,5,2019,1,1,0,193,-31.95224,115.86140,47,4.689,...,641.8532,21.7176,22.7498,12422.947,16436.127,14.388,5,1,201901,2019-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195936,195937,2021,12,31,28,103,30.33218,-81.65565,12,0.694,...,620.7708,7.3574,18.1526,9207.432,12452.479,175.764,288,9,202112,2021-12-31
195937,195938,2021,12,31,28,133,36.17497,-115.13722,14,0.528,...,624.9680,14.0382,12.0156,12237.383,16825.148,-164.519,289,17,202112,2021-12-31
195938,195939,2021,12,31,28,150,43.03890,-87.90647,171,1.975,...,654.9970,26.0542,8.7104,9837.417,14560.461,178.571,291,10,202112,2021-12-31
195939,195940,2021,12,31,29,88,21.02450,105.84117,31,2.613,...,678.7892,56.4306,14.5446,11648.967,14106.819,-5.813,299,19,202112,2021-12-31


# 学習用関数の定義

In [63]:
from dateutil.relativedelta import relativedelta

class TimeSeriesSplitGenerator:
    def __init__(self, n_split = 12, test_day_after = "2019-01-01", slide = False):
        self.test_day_after = pd.to_datetime(test_day_after)
        self.n_split = n_split
        self.test_month_period = 36
        self.month = relativedelta(months = 1)

        self.slide = slide

    def split(self, X):
        for m in range(self.test_month_period):
            test_month = self.test_day_after + relativedelta(months=m)
            print("test_month",test_month)
            if m == 0: # １月
                test_index = (test_month <= X.date) & (X.date < test_month + self.month)  # 2019-1-1＜X＜2019-2-1
                valid_index = ("2019-01-15 00:00:00" <= X.date) & (X.date < "2019-01-31 00:00:00") # 2019-1-1＜X＜2019-2-1
                train_index =("2019-01-01 00:00:00" <= X.date) & (X.date < "2019-01-15 00:00:00")  # 2019-1-1＜X＜2019-2-1
                if self.slide:
                    train_index = train_index &(
                        test_month - self.month - relativedelta(months =12) <= X.date
                    )
            elif m == 1: #　２月
                test_index = (test_month <= X.date) & (X.date < test_month + self.month)  # 2019-2-1＜X＜2019-3-1
                valid_index = ("2019-01-15 00:00:00" <= X.date) & (X.date < "2019-01-31 00:00:00") # 2019-1-1＜X＜2019-2-1
                train_index =("2019-01-01 00:00:00" <= X.date) & (X.date < "2019-01-15 00:00:00")  # 2019-9月よりも前
                if self.slide:
                    train_index = train_index &(
                        test_month - self.month - relativedelta(months =12) <= X.date
                    )
            else:
                test_index = (test_month <= X.date) & (X.date < test_month + self.month) # 2019-10   2019-01
                valid_index = (test_month - self.month <= X.date) & (X.date < test_month) # 2019-9
                train_index = (X.date < test_month - self.month)                          # 2019-9月よりも前
                if self.slide:
                    train_index = train_index &(
                        test_month - self.month - relativedelta(months =12) <= X.date
                    )
            yield train_index, valid_index, test_index

In [64]:
def lightgbm(X_train, Y_train, X_valid, Y_valid, X_test):
    bst_params = {
          "boosting_type": "gbdt",
          "metric": "rmse",
          "objective": "regression",
          "n_jobs": -1,
          "seed": SEED,
          'random_state': SEED,
          "learning_rate": 0.01,
          "bagging_fraction": 0.75,
          "bagging_freq": 10,
          "colsample_bytree": 0.75,
          "num_boost_round": 10000,
          "early_stopping_rounds": 10,
          "verbose_eval": 1000,
      }
    lgb_train = lgb.Dataset(X_train, Y_train)
    lgb_valid = lgb.Dataset(X_valid, Y_valid)

    model = lgb.train(bst_params, lgb_train,
                        valid_names=["train", "valid"], valid_sets=[lgb_train, lgb_valid],
                        verbose_eval=1000)

    # 検証データに対する予測値を求める
    va_pred = model.predict(X_valid, num_iteration=model.best_iteration)

    #テストデータに対する予測値を求める
    te_pred = np.array(model.predict(X_test, num_iteration=model.best_iteration))

    return va_pred, te_pred, model


def catboost(X_train, Y_train, X_valid, Y_valid, X_test):
    # objectの列番号を取得
    categorical_features_indices = np.where(X_train.dtypes==np.object)[0]
    lgb_train = Pool(X_train, Y_train, cat_features=categorical_features_indices)
    lgb_valid = Pool(X_valid, Y_valid, cat_features=categorical_features_indices)
    model = CatBoostRegressor(eval_metric='RMSE',
                            loss_function='RMSE',
                            num_boost_round=10000,
                            logging_level='Silent',
                            random_seed=SEED)
    model.fit(lgb_train, 
            eval_set=lgb_valid,
            early_stopping_rounds=10,
            verbose=True,
            use_best_model=True)

    # 検証データに対する予測値を求める
    va_pred = model.predict(X_valid)

    mse = mean_squared_error(Y_valid, va_pred)
    rmse = np.sqrt(mse) # RSME = √MSEの算出
    eval_metric = rmse

    print(f"eval's rmse: {eval_metric}")

    #テストデータに対する予測値を求める
    te_pred = np.array(model.predict(X_test))

    return va_pred, te_pred, model

def xgboost(X_train, Y_train, X_valid, Y_valid, X_test):

    xgb_params = {
      'objective': 'reg:linear',
      'eval_metric': 'rmse',
      # "verbosity": 0,
      "seed": SEED,
      "eta": 0.01,
      "num_boost_round": 10000,
      # "early_stopping_rounds": 10,
      # "verbose_eval": 100,
    }

    lgb_train = xgb.DMatrix(X_train, label=Y_train)
    lgb_valid = xgb.DMatrix(X_valid, label=Y_valid)
    lgb_test = xgb.DMatrix(X_test)
    evals = [(lgb_train, 'train'), (lgb_valid, 'eval')]
    evals_result = {}

    model = xgb.train(xgb_params,
                    lgb_train,
                    evals=evals,
                    evals_result=evals_result,
                    num_boost_round=10000,
                    early_stopping_rounds=10,
                    verbose_eval=1000,
                  )
  
    # 検証データに対する予測値を求める
    va_pred = model.predict(lgb_valid)

    #テストデータに対する予測値を求める
    te_pred = model.predict(lgb_test)

    return va_pred, te_pred, model

# 学習と予測

最初に学習済みモデル保存用にフォルダを作成しておく

In [65]:

main_df = df

scores = []
ids = []
submission = []

OUTPUT = "~/Desktop/sony_cup"

for i, (tr,va,te) in enumerate(TimeSeriesSplitGenerator(slide = False).split(main_df)):
    train_index, valid_index, test_index = tr,va,te

    print("-------------------------------------------")
    print("train:", main_df[train_index].date.min(), main_df[train_index].date.max())
    print("valid:", main_df[valid_index].date.min(), main_df[valid_index].date.max())

    train = main_df[train_index].dropna(subset=["pm25_mid"])
    valid = main_df[valid_index].dropna(subset=["pm25_mid"])
    
    
    if i == 0:
        year = main_df[valid_index].date.min().year
        month = main_df[valid_index].date.max().month
        test = df_test[df_test["year"] == year]
        test = test[test["month"] == month]
        print("test: ", test.date.min(), test.date.max())
        print(f'Fold : {i}')
    else:
        year = (main_df[valid_index].date.min()  + relativedelta(months=1)).year
        month = (main_df[valid_index].date.max() + relativedelta(months=1)).month
        test = df_test[df_test["year"] == year]
        test = test[test["month"] == month]
        print("test: ", test.date.min(), test.date.max())
        print(f'Fold : {i}')
    
    
    train_add = train

    X_train, Y_train = train_add.drop(columns=["pm25_mid","date","id","yearmonth"]), train_add["pm25_mid"]
    X_valid, Y_valid = valid.drop(columns=["pm25_mid","date","id","yearmonth"]), valid["pm25_mid"]
    ids += list(test["id"])
    X_test = test.drop(columns=["date","id","yearmonth"])

    print("LGBD")
    va_pred1, te_pred1, model = lightgbm(X_train, Y_train, X_valid, Y_valid, X_test)
    #pickle.dump(model, open(f"{OUTPUT}/lgbm/{start_date}.pkl", 'wb'))
    print()
    print("CABT")
    #va_pred2, te_pred2, model = catboost(X_train, Y_train, X_valid, Y_valid, X_test)
    #pickle.dump(model, open(f"{OUTPUT}/cat/{start_date}.pkl", 'wb'))
    print()
    print("XGBT")
    #va_pred3, te_pred3, model = xgboost(X_train, Y_train, X_valid, Y_valid, X_test)
    #pickle.dump(model, open(f"{OUTPUT}/xgb/{start_date}.pkl", 'wb'))

    #va_pred = (va_pred1 / 3) + (va_pred2 / 3) + (va_pred3 / 3)
    #te_pred = (te_pred1 / 3) + (te_pred2 / 3) + (te_pred3 / 3)

    va_pred = (va_pred1)
    te_pred = (te_pred1)

    # RSME = √MSEの算出
    mse = mean_squared_error(Y_valid, va_pred)
    rmse = np.sqrt(mse)

    scores.append(rmse)

    #テストデータに対する予測値を求める
    submission += list(te_pred)

    print('')
    print('################################')
    print(f"Fold: {i} RMSE:{rmse}")
    print("")

print() 
print(f"CV: {np.mean(scores)}")
print("Best CV: 21.150075570163597")

# CV: 3.4555098

test_month 2019-01-01 00:00:00
-------------------------------------------
train: 2019-01-01 00:00:00 2019-01-14 00:00:00
valid: 2019-01-15 00:00:00 2019-01-30 00:00:00
test:  2019-01-01 00:00:00 2019-01-31 00:00:00
Fold : 0
LGBD
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15344
[LightGBM] [Info] Number of data points in the train set: 2485, number of used features: 64
[LightGBM] [Info] Start training from score 74.758560
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[720]	train's rmse: 12.7066	valid's rmse: 26.82

CABT

XGBT

################################
Fold: 0 RMSE:26.819999746716686

test_month 2019-02-01 00:00:00
-------------------------------------------
train: 2019-01-01 00:00:00 2019-01-14 00:00:00
valid: 2019-01-15 00:00:00 2019-01-30 00:00:00
test:  2019-02-01 00:00:00 2019-02-28 00:00:00
Fold : 1
LGBD
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] To

[1000]	train's rmse: 19.0089	valid's rmse: 19.7349
Early stopping, best iteration is:
[1021]	train's rmse: 18.943	valid's rmse: 19.7251

CABT

XGBT

################################
Fold: 6 RMSE:19.725066198811447

test_month 2019-08-01 00:00:00
-------------------------------------------
train: 2019-01-01 00:00:00 2019-06-30 00:00:00
valid: 2019-07-01 00:00:00 2019-07-31 00:00:00
test:  2019-08-01 00:00:00 2019-08-31 00:00:00
Fold : 7
LGBD
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15541
[LightGBM] [Info] Number of data points in the train set: 34622, number of used features: 65
[LightGBM] [Info] Start training from score 60.659067
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[921]	train's rmse: 19.1142	valid's rmse: 18.5205

CABT

XGBT

################################
Fold: 7 RMSE:18.520485470077325

test_month 2019-09-01 00:00:00
-------------------------------------------
train: 2019-01-

[1000]	train's rmse: 19.39	valid's rmse: 28.2139
Early stopping, best iteration is:
[1118]	train's rmse: 19.1677	valid's rmse: 28.1387

CABT

XGBT

################################
Fold: 13 RMSE:28.138661497487607

test_month 2020-03-01 00:00:00
-------------------------------------------
train: 2019-01-01 00:00:00 2020-01-31 00:00:00
valid: 2020-02-01 00:00:00 2020-02-28 00:00:00
test:  2020-03-01 00:00:00 2020-03-31 00:00:00
Fold : 14
LGBD
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15568
[LightGBM] [Info] Number of data points in the train set: 72661, number of used features: 66
[LightGBM] [Info] Start training from score 60.216979
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[861]	train's rmse: 20.106	valid's rmse: 25.8293

CABT

XGBT

################################
Fold: 14 RMSE:25.829335883388048

test_month 2020-04-01 00:00:00
-------------------------------------------
train: 2019-01

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15566
[LightGBM] [Info] Number of data points in the train set: 103923, number of used features: 66
[LightGBM] [Info] Start training from score 58.068869
Training until validation scores don't improve for 10 rounds
[1000]	train's rmse: 19.7594	valid's rmse: 17.0727
Early stopping, best iteration is:
[1013]	train's rmse: 19.7368	valid's rmse: 17.0627

CABT

XGBT

################################
Fold: 20 RMSE:17.0627240275786

test_month 2020-10-01 00:00:00
-------------------------------------------
train: 2019-01-01 00:00:00 2020-08-31 00:00:00
valid: 2020-09-01 00:00:00 2020-09-30 00:00:00
test:  2020-10-01 00:00:00 2020-10-31 00:00:00
Fold : 21
LGBD
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15566
[LightGBM] [Info] Number of data points in the train set: 109831, number of used features: 66
[LightGBM] [Info] Start training from score 57.536676
Training un

[1000]	train's rmse: 19.9493	valid's rmse: 23.9959
Early stopping, best iteration is:
[1211]	train's rmse: 19.6466	valid's rmse: 23.8441

CABT

XGBT

################################
Fold: 26 RMSE:23.84412515288214

test_month 2021-04-01 00:00:00
-------------------------------------------
train: 2019-01-01 00:00:00 2021-02-28 00:00:00
valid: 2021-03-01 00:00:00 2021-03-31 00:00:00
test:  2021-04-01 00:00:00 2021-04-30 00:00:00
Fold : 27
LGBD
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15571
[LightGBM] [Info] Number of data points in the train set: 142317, number of used features: 66
[LightGBM] [Info] Start training from score 60.080137
Training until validation scores don't improve for 10 rounds
[1000]	train's rmse: 20.0462	valid's rmse: 24.7172
Early stopping, best iteration is:
[1942]	train's rmse: 18.9871	valid's rmse: 24.3659

CABT

XGBT

################################
Fold: 27 RMSE:24.365945342901867

test_month 2021-05-01 00:00:00
---

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15578
[LightGBM] [Info] Number of data points in the train set: 176177, number of used features: 66
[LightGBM] [Info] Start training from score 58.905488
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[411]	train's rmse: 21.3524	valid's rmse: 16.9715

CABT

XGBT

################################
Fold: 33 RMSE:16.971476413592473

test_month 2021-11-01 00:00:00
-------------------------------------------
train: 2019-01-01 00:00:00 2021-09-30 00:00:00
valid: 2021-10-01 00:00:00 2021-10-31 00:00:00
test:  2021-11-01 00:00:00 2021-11-30 00:00:00
Fold : 34
LGBD
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15578
[LightGBM] [Info] Number of data points in the train set: 181388, number of used features: 66
[LightGBM] [Info] Start training from score 58.637564
Training until validation scores don't improve for 10 rounds


In [66]:
sub = pd.Series(submission)
sub.index = ids
sub.to_csv(f"{OUTPUT}/submmission_ensenble(month_by_month).csv",header=False)
sub

195942     11.706840
195943     39.613038
195944     31.287131
195945     61.769683
195946    134.242223
             ...    
249446     72.697392
249447    124.563946
249448     68.295455
249449     35.044842
249450     39.084504
Length: 53509, dtype: float64

In [67]:
from tqdm import tqdm
all_data = pd.read_csv("~/Desktop/signate-main/submit_ensamble_safety.csv",header=None)
mon_data = pd.read_csv("~/Desktop/sony_cup/submmission_ensenble(month_by_month).csv",header=None)


for i in tqdm(range(2693)):
    mon_data[1][i] = all_data[1][i] 

mon_data.to_csv('~/Desktop/submit_month.csv', header=False, index=False)
mon_data

100%|██████████| 2693/2693 [00:00<00:00, 6122.80it/s]


Unnamed: 0,0,1
0,195942,22.206779
1,195943,38.103874
2,195944,27.759193
3,195945,66.442556
4,195946,135.993679
...,...,...
53504,249446,72.697392
53505,249447,124.563946
53506,249448,68.295455
53507,249449,35.044842
