# About this notebook

- Uses PyTorch Lightning for training and Weights & Biases (wandb) for experiment logging
- Designed for tabular data
- The model can be switched between a 1D-CNN and an MLP

## Version Info

- v1

## Get env

In [None]:
# Print the GPU(s) visible to this session (shell command; needs the NVIDIA driver tools).
!nvidia-smi

In [1]:
# Runtime-environment flags: later cells branch on these (installs, secrets, paths).
import os
import sys

# Each hosted platform is detected by a module it has already loaded into the kernel.
IN_KAGGLE = 'kaggle_web_client' in sys.modules
IN_COLAB = 'google.colab' in sys.modules
# Neither hosted platform detected -> treat the session as a local machine.
LOCAL = (not IN_KAGGLE) and (not IN_COLAB)
print(f'IN_COLAB:{IN_COLAB}, IN_KAGGLE:{IN_KAGGLE}, LOCAL:{LOCAL}')

IN_COLAB:False, IN_KAGGLE:True, LOCAL:False


In [2]:
# installation
# Only hosted runtimes (Kaggle / Colab) install packages here; on a local
# machine nothing in this cell is executed.
if IN_KAGGLE or IN_COLAB:
    # NOTE(review): presumably required because the Trainer config below sets
    # deterministic=True (cuBLAS needs this variable for reproducible results) - confirm.
    %env CUBLAS_WORKSPACE_CONFIG=:4096:8
    !pip install --upgrade -q wandb
    !pip install pytorch-lightning==1.4.2 -q 
    # Optional packages, currently disabled.
#     !pip install torch_optimizer==0.1.0
#     !pip install einops
#     !pip install timm
    !pip install setuptools==57.4.0 -q
    !pip install xfeat -q

env: CUBLAS_WORKSPACE_CONFIG=:4096:8
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
explainable-ai-sdk 1.3.2 requires xai-image-widget, which is not installed.
beatrix-jupyterlab 3.1.6 requires google-cloud-bigquery-storage, which is not installed.
gcsfs 2021.11.1 requires fsspec==2021.11.1, but you have fsspec 2022.1.0 which is incompatible.
cloud-tpu-client 0.10 requires google-api-python-client==1.8.0, but you have google-api-python-client 1.12.10 which is incompatible.[0m


## Import Libraries

In [8]:
# Hide Warning
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore')

# Python Libraries
import os
import math
import random
import glob
import pickle
import gc
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# Third party
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set(style="whitegrid")

import category_encoders as ce
from xfeat import *

# Utilities and Metrics
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupKFold, KFold
from sklearn.preprocessing import RobustScaler, normalize, QuantileTransformer, StandardScaler
# Pytorch 
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, CosineAnnealingLR, ReduceLROnPlateau
from torch.optim.optimizer import Optimizer, required
#import torch_optimizer as optim

# Pytorch Lightning
import pytorch_lightning as pl
from pytorch_lightning import Callback, seed_everything
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import WandbLogger, CSVLogger

# For Transformer Models
from transformers import AutoTokenizer, AutoModel, AutoConfig, AdamW, get_linear_schedule_with_warmup

# Record the library versions in the cell output for reproducibility.
print('torch version',torch.__version__)
print('pytorch lightning version',pl.__version__)

In [10]:
# Weights and Biases Tool
import wandb
if IN_KAGGLE:
    # On Kaggle the key is stored as a notebook secret named "wandb_api".
    from kaggle_secrets import UserSecretsClient
    wandb_api_key = UserSecretsClient().get_secret("wandb_api")
else:
    # Local and Colab sessions read the key from the environment. Previously
    # the Colab case left `wandb_api_key` undefined and the login call below
    # failed with NameError. If the variable is unset the key is None and
    # wandb.login falls back to its own credential lookup.
    wandb_api_key = os.getenv('WANDB_API_KEY')
wandb.login(key=wandb_api_key)

[34m[1mwandb[0m: Currently logged in as: [33mteyosan1229[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

## Config

In [11]:
class CFG:
    """Experiment configuration used as a plain attribute namespace.

    Several attributes are overwritten by later cells: the column lists and
    ``n_categories`` (data preparation), ``loader[...]["num_workers"]``
    (environment setup) and ``T_max`` / ``num_warmup_steps`` (computed per
    fold inside ``train``).
    """
    # ---- general ----
    debug = False
    competition = 'pm25'
    exp_name = "exp051"
    seed = [30, 31, 32]

    # ---- model ----
    model_name = 'CNN'

    # ---- data ----
    target_col = 'pm25_mid'  # name of the column holding the target value

    # ---- optimizer ----
    optimizer_name = 'AdamW'  # one of the names handled by get_optimizer
    lr = 1e-3
    weight_decay = 0.1
    amsgrad = False

    # ---- scheduler ----
    epochs = 15
    scheduler = 'get_linear_schedule_with_warmup'
    T_max = 500
    min_lr = 1e-4
    num_warmup_steps_rate = 0.1  # fraction of the total steps spent on warm-up
    num_warmup_steps = 1
    # Alternatives handled by get_scheduler need extra attributes that are not
    # defined here:
    #   'ReduceLROnPlateau'           -> factor, patience, eps (e.g. 0.5, 10, 1e-6; min_lr 1e-05)
    #   'CosineAnnealingWarmRestarts' -> T_0
    #   'CosineAnnealingLR'           -> uses T_max / min_lr above

    # ---- criterion ----
    criterion_name = 'mse'
    margin = 0.5

    # ---- training ----
    train = True
    inference = True
    n_fold = 4
    trn_fold = [0, 1, 2, 3]
    precision = 32  # one of 16, 32, 64
    grad_acc = 1

    # ---- DataLoader keyword arguments, per split ----
    loader = {
        "train": dict(batch_size=256, num_workers=0, shuffle=True,
                      pin_memory=True, drop_last=True),
        "valid": dict(batch_size=512, num_workers=0, shuffle=False,
                      pin_memory=True, drop_last=False),
    }

    # ---- extra keyword arguments forwarded to pl.Trainer ----
    trainer = dict(gpus=1,
                   progress_bar_refresh_rate=1,
                   benchmark=False,
                   deterministic=True)

    # ---- column lists (filled in by the data-preparation cells) ----
    cate_cols = []
    n_categories = []  # one embedding-table size per entry of cate_cols
    cont_cols = []
    feature_cols = []
    
COMPUTE_IMPORTANCE = True
# Hosted runtimes use worker processes for both loaders; local runs keep the
# default of 0 set in CFG.
if not LOCAL:
    for split in ("train", "valid"):
        CFG.loader[split]["num_workers"] = 4
# Seed with the first entry of the seed list.
seed_everything(CFG.seed[0])

30

## Directory & LoadData

In [12]:
# LINEに通知
import requests
def send_line_notification(message, timeout=10):
    """Send `message` through the LINE Notify HTTP endpoint (best effort).

    The message is prefixed with the runtime name (colab / kaggle / local).
    The access token is read from the ``LINE_API_KEY`` environment variable.

    Args:
        message: Text to send.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None. A missing token or a failed request is reported with ``print``
        instead of raising, so a notification problem cannot abort a run.
    """
    line_token = os.getenv('LINE_API_KEY')
    if not line_token:
        # Without a token the request would be sent as "Bearer None".
        print('LINE_API_KEY is not set; skipping LINE notification')
        return

    if IN_COLAB:
        env = "colab"
    elif IN_KAGGLE:
        env = "kaggle"
    elif LOCAL:
        env = "local"
    else:
        env = ""

    endpoint = 'https://notify-api.line.me/api/notify'
    payload = {'message': f"[{env}]{message}"}
    headers = {'Authorization': 'Bearer {}'.format(line_token)}
    try:
        # The timeout keeps an unreachable endpoint from blocking the notebook forever.
        requests.post(endpoint, data=payload, headers=headers, timeout=timeout)
    except requests.RequestException as err:
        print(f'LINE notification failed: {err}')
    
def simpleEDA1(df):
    """Print the shape of `df` and display a one-row-per-column summary.

    The displayed table has, for every column of `df`, its dtype, null
    count, number of unique values and the statistics produced by
    ``df.describe(include='all')``.
    """
    print(f'dataframe shape is {df.shape}')
    summary_rows = [
        df.dtypes.rename("dtypes").to_frame().T,
        df.isnull().sum().rename("isnull").to_frame().T,
        df.nunique().rename("nunique").to_frame().T,
        df.describe(include='all'),
    ]
    # Stack the statistics as rows, then transpose so each input column is a row.
    display(pd.concat(summary_rows).T)
    
def metric(true, pred):
    """Competition metric, also used for CV scoring: root mean squared error."""
    mse = mean_squared_error(true, pred)
    return np.sqrt(mse)

def save_pickle(filename, obj):
    """Serialise `obj` with pickle into the file at `filename` (overwrites it)."""
    with open(filename, 'wb') as handle:
        pickle.dump(obj, handle)
        
def load_pickle(filename):
    """Return the object stored in the pickle file at `filename`.

    NOTE: unpickling can execute arbitrary code; only load files you trust.
    """
    with open(filename, 'rb') as handle:
        return pickle.load(handle)

In [13]:
# Resolve the input / output / feature directories for the current runtime.
if IN_KAGGLE:
    INPUT_DIR = Path('../input/my-private/')
    OUTPUT_DIR = './'
    FEAT_DIR = Path('../input/sony-fe')
elif IN_COLAB:
    INPUT_DIR = Path('/content/input/')
    OUTPUT_DIR = f'/content/drive/MyDrive/kaggle/Ventilator Pressure/{CFG.exp_name}/'
    # FEAT_DIR used to be undefined on Colab, so the feature-loading cell
    # failed with NameError. Assumed to mirror the local layout
    # (<input>/features) - adjust if the feature pickles live elsewhere.
    FEAT_DIR = INPUT_DIR / 'features'
else:
    # Local machine.
    INPUT_DIR = Path("F:/Kaggle/sig_sony/data/input/")
    OUTPUT_DIR = f'F:/Kaggle/sig_sony/data/output/{CFG.exp_name}/'
    FEAT_DIR = Path("F:/Kaggle/sig_sony/data/input/features")

df_train = pd.read_csv(INPUT_DIR / "train.csv")
df_test = pd.read_csv(INPUT_DIR / "test.csv")
# The sample submission has no header row; name its two columns explicitly.
df_sub = pd.read_csv(INPUT_DIR / "submit_sample.csv", header=None, names=['id', 'pred'])
# NOTE(review): the copy is taken before the debug truncation below, so in
# debug mode df_oof keeps every row while df_train has 500 - confirm intended.
df_oof = df_train.copy()
display(df_train.head())

# Create the output directory (and parents) if needed; no error if it exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

if CFG.debug:
    # Quick smoke-test settings: fewer epochs, a small sample, a single fold.
    CFG.epochs = 5
    df_train = df_train.head(500)
    CFG.trn_fold = [0]
    #df_train  =df_train.head(2000)
    #CFG.inference = False

Unnamed: 0,id,year,month,day,Country,City,lat,lon,co_cnt,co_min,...,ws_min,ws_mid,ws_max,ws_var,dew_cnt,dew_min,dew_mid,dew_max,dew_var,pm25_mid
0,1,2019,1,1,Australia,Brisbane,-27.46794,153.02809,38,0.749,...,0.241,1.088,3.101,1.983,17,7.671,10.358,15.112,13.424,19.901
1,2,2019,1,1,Australia,Darwin,-12.46113,130.84185,47,2.594,...,0.828,3.473,7.396,10.411,62,21.324,23.813,24.221,2.021,13.741
2,3,2019,1,1,Australia,Melbourne,-37.814,144.96332,17,1.19,...,0.0,2.107,8.089,15.719,22,10.309,13.133,15.422,6.355,25.918
3,4,2019,1,1,Australia,Newcastle,-32.92953,151.7801,63,4.586,...,0.284,0.503,3.592,2.485,116,7.146,10.685,13.344,9.417,174.37
4,5,2019,1,1,Australia,Perth,-31.95224,115.8614,47,4.689,...,0.5,0.755,3.396,1.937,93,1.091,3.277,12.272,4.109,167.063


## Prepare data

In [14]:
CFG.cate_cols = []
# Start from every numeric column of the raw training frame, excluding the
# row id and the target. The target name comes from CFG so that changing
# CFG.target_col can never leave the target among the model inputs.
non_feature_cols = {'id', CFG.target_col}
CFG.cont_cols = [
    col for col in df_train.columns
    if pd.api.types.is_numeric_dtype(df_train[col]) and col not in non_feature_cols
]

In [15]:
%%time
# Names of the pre-computed feature pickles to load from FEAT_DIR.
feats = [
    'OE',
    'Country_TE',
    'near_city1',
    'near_city2',
    'near_city3',
    'near_city4',
    'agg_main_mean',
    'diff_mean_City_month',
    'min_max_diff',
    'lag1', # contains nulls
#     'lag2',
#     'lag3',
#     'lag4',
    'lag-1',
#     'lag-2',
#     'lag-3',
#     'lag-4',
    'near1_City_LE_fillna', # contains nulls
    'near2_City_LE_fillna',
    'near3_City_LE_fillna',
    'near4_City_LE_fillna',
#     'rolling_max_2',
#     'rolling_mean_2',
#     'rolling_mim_2',
#     'rolling_mstd_2',
#     'rolling_mean_4',
    'UMAP',
    'PCA',
    'near1_City_label',
    'near2_City_label',
    'near3_City_label',
    'near4_City_label',
    'near1_City_main_feature',
    'near2_City_main_feature',
    'near3_City_main_feature',
    'near4_City_main_feature',
]
# Each pickle holds a dict with "train" / "test" frames and the lists
# "cate_cols" / "cont_cols" naming the columns it contributes.
# NOTE: re-running this cell appends the same columns and names again.
train_parts, test_parts = [df_train], [df_test]
for f in feats:
    d = load_pickle(FEAT_DIR / f"{f}.pkl")
    # Align each feature frame to the rows currently in df_train / df_test.
    # With the full data this is a no-op; in debug mode (df_train truncated)
    # it stops the column-wise concat from re-expanding the frame to every
    # row with NaNs in the original columns.
    train_parts.append(d["train"].reset_index(drop=True).reindex(df_train.index))
    test_parts.append(d["test"].reset_index(drop=True).reindex(df_test.index))
    CFG.cate_cols += d["cate_cols"]
    CFG.cont_cols += d["cont_cols"]
    print(f'Load... {f} , {len(d["cate_cols"])+len(d["cont_cols"])}features')
# Concatenate once instead of copying the growing frame on every iteration.
df_train = pd.concat(train_parts, axis=1)
df_test = pd.concat(test_parts, axis=1)
print(f'{len(CFG.cate_cols)}cate_cols, {len(CFG.cont_cols)}cont_cols')
print(f"{df_train.shape},{df_test.shape}")
df_train.head()

Load... OE , 3features
Load... Country_TE , 6features
Load... near_city1 , 2features
Load... near_city2 , 2features
Load... near_city3 , 2features
Load... near_city4 , 2features
Load... agg_main_mean , 56features
Load... diff_mean_City_month , 7features
Load... min_max_diff , 9features
Load... lag1 , 7features
Load... lag-1 , 7features
Load... near1_City_LE_fillna , 4features
Load... near2_City_LE_fillna , 4features
Load... near3_City_LE_fillna , 4features
Load... near4_City_LE_fillna , 4features
Load... UMAP , 6features
Load... PCA , 6features
Load... near1_City_label , 3features
Load... near2_City_label , 3features
Load... near3_City_label , 3features
Load... near4_City_label , 3features
Load... near1_City_main_feature , 7features
Load... near2_City_main_feature , 7features
Load... near3_City_main_feature , 7features
Load... near4_City_main_feature , 7features
7cate_cols, 214cont_cols
(195941, 225),(53509, 224)
CPU times: user 2.34 s, sys: 1.58 s, total: 3.93 s
Wall time: 6.33 s


Unnamed: 0,id,year,month,day,Country,City,lat,lon,co_cnt,co_min,...,near3_o3_mid,near3_ws_mid,near3_temperature_mid,near4_co_mid,near4_co_max,near4_no2_mid,near4_so2_mid,near4_o3_mid,near4_ws_mid,near4_temperature_mid
0,1,2019,1,1,Australia,Brisbane,-27.46794,153.02809,38,0.749,...,12.527,2.107,0.0,3.181,4.828,0.301,0.102,7.572,3.473,30.125
1,2,2019,1,1,Australia,Darwin,-12.46113,130.84185,47,2.594,...,12.527,2.107,0.0,11.044,14.802,17.471,3.23,14.141,0.503,19.819
2,3,2019,1,1,Australia,Melbourne,-37.814,144.96332,17,1.19,...,4.295,1.088,14.038,3.181,4.828,0.301,0.102,7.572,3.473,30.125
3,4,2019,1,1,Australia,Newcastle,-32.92953,151.7801,63,4.586,...,12.527,2.107,0.0,3.181,4.828,0.301,0.102,7.572,3.473,30.125
4,5,2019,1,1,Australia,Perth,-31.95224,115.8614,47,4.689,...,11.869,0.498,13.92,11.044,14.802,17.471,3.23,14.141,0.503,19.819


In [16]:
# 使わない特徴量除去
remove_cate = []
remove_cont = []
for col in remove_cate:
    CFG.cate_cols.remove(col)
for col in remove_cont:
    CFG.cont_cols.remove(col)

In [17]:
# Rebuild the list from scratch (categorical first, then continuous) so that
# re-running this cell does not append the same columns a second time.
CFG.feature_cols = CFG.cate_cols + CFG.cont_cols
len(CFG.feature_cols), len(CFG.cate_cols), len(CFG.cont_cols)

(221, 7, 214)

In [18]:
%%time
# Impute missing feature values with the per-column mean of the training
# data; the same training means are applied to the test frame.
m = df_train[CFG.feature_cols].mean()
df_train, df_test = (frame.fillna(m) for frame in (df_train, df_test))

CPU times: user 1.04 s, sys: 744 ms, total: 1.78 s
Wall time: 1.78 s


In [19]:
# Scaling: map every continuous column to a normal distribution. A separate
# transformer is fitted per column on the training data and then applied to
# the test data.
for col in tqdm(CFG.cont_cols):
    qt = QuantileTransformer(output_distribution='normal', random_state=0)
    train_values = df_train[[col]].to_numpy()
    df_train.loc[:, col] = qt.fit_transform(train_values)
    test_values = df_test[[col]].to_numpy()
    df_test.loc[:, col] = qt.transform(test_values)

  0%|          | 0/214 [00:00<?, ?it/s]

In [20]:
# Embedding-table sizes: for each categorical column, the largest code seen
# in train or test plus one, so every code is a valid embedding index.
CFG.n_categories = []
for cat in CFG.cate_cols:
    # Concatenate only the column of interest; concatenating both full
    # frames on every iteration copied all columns once per category.
    largest_code = pd.concat([df_train[cat], df_test[cat]]).max()
    CFG.n_categories.append(int(largest_code + 1))
CFG.n_categories

[31, 91, 1081, 239, 239, 239, 239]

## Utils

## CV Split

In [21]:
# Split the CV folds so that no city appears in more than one fold.
cv_col = 'City'
print(df_train[cv_col].value_counts())
# Placeholder fold id; overwritten once the folds are assigned below.
df_train["fold"] = -1
"""
StratifiedKFold
"""
# Fold = StratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
# for n, (train_index, val_index) in enumerate(Fold.split(df_train, df_train[cv_col])):
#     df_train.loc[val_index, 'fold'] = int(n)
    
"""
GroupKFold
"""
# kfold = GroupKFold(n_splits=CFG.n_fold)
# for n, (train_index, val_index) in enumerate(kfold.split(df_train,df_train, df_train[cv_col].values)):
#     df_train.loc[val_index, 'fold'] = int(n)
# df_train['fold'] = df_train['fold'].astype(int)
# df_oof['fold'] = df_train['fold']
# # print(df_train.groupby(['fold', CFG.target_col]).size())
# print(df_train.groupby(['fold', cv_col]).size())

"""
SeedつきGroupKFold
"""
def get_fold(df_train, seed):
    """Assign a seeded, group-wise fold id to every row of `df_train`.

    The unique values of the grouping column (`cv_col`) are shuffled and
    split into ``CFG.n_fold`` parts with ``KFold``; every row gets the index
    of the part whose validation side contains its group, so one group never
    spans two folds.

    Args:
        df_train: Training frame containing the `cv_col` column. It is
            modified in place (the ``fold`` column is written).
        seed: Random state for the shuffled ``KFold``.

    Returns:
        The same frame, with an integer ``fold`` column.
    """
    unique_groups = df_train[cv_col].unique()
    kf = KFold(n_splits=CFG.n_fold, shuffle=True, random_state=seed)
    # Only the validation side of each split is needed to label the rows.
    for n, (_, va_group_idx) in enumerate(kf.split(unique_groups)):
        in_valid = df_train[cv_col].isin(unique_groups[va_group_idx])
        df_train.loc[in_valid, 'fold'] = int(n)
    df_train['fold'] = df_train['fold'].astype(int)
    return df_train
# Assign folds using the first seed of the seed list.
df_train = get_fold(df_train, CFG.seed[0])
# Mirror the fold ids onto the out-of-fold frame (pandas aligns on the row index).
df_oof['fold'] = df_train['fold']
# Sanity checks: rows per (fold, city) pair, and rows per fold.
print(df_train.groupby(['fold', cv_col]).size())
print(df_train.fold.value_counts())

Nanning      1086
Kunming      1081
Jieyang      1079
Shantou      1079
Budapest     1078
             ... 
Kielce         39
El Paso        39
Phoenix         6
Jerusalem       4
Denver          3
Name: City, Length: 239, dtype: int64
fold  City     
0     Akita         978
      Andong       1011
      Atlanta        93
      Bengaluru     980
      Busan        1013
                   ... 
3     Vancouver    1055
      Wrocław      1054
      Xinxiang      881
      Xi’an        1065
      İzmit        1013
Length: 239, dtype: int64
0    50053
3    49328
2    48475
1    48085
Name: fold, dtype: int64


## Transforms

## Dataset

In [22]:
class TabularDataset(Dataset):
    """Dataset over row-aligned numeric features, categorical codes and an optional target.

    Each item is ``(x_num, x_cat)`` when no target was given and
    ``(x_num, x_cat, y)`` otherwise, converted to Float / Long / Float
    tensors respectively.
    """

    def __init__(self, x_num: np.ndarray, x_cat: np.ndarray, y: Optional[np.ndarray]):
        super().__init__()
        self.x_num, self.x_cat, self.y = x_num, x_cat, y

    def __len__(self):
        # The number of rows is taken from the numeric feature array.
        return len(self.x_num)

    def __getitem__(self, index):
        sample = (torch.FloatTensor(self.x_num[index]),
                  torch.LongTensor(self.x_cat[index]))
        if self.y is not None:
            sample += (torch.FloatTensor(self.y[index]),)
        return sample

In [29]:
# # Check dataset
# ds = TabularDataset(df_train[CFG.cont_cols].to_numpy(),
#                     df_train[CFG.cate_cols].to_numpy(),
#                     df_train[[CFG.target_col]].to_numpy())
# ds[0]

## DataModule

In [26]:
class DataModule(pl.LightningDataModule):
    """Lightning data module wrapping the train / validation / test frames.

    Column names (``cont_cols``, ``cate_cols``, ``target_col``) and the
    DataLoader keyword arguments (``loader``) are read from `cfg`.
    """

    def __init__(self, df_train, df_val, df_test, cfg):
        super().__init__()
        self._df_train, self._df_val, self._df_test = df_train, df_val, df_test
        self._cfg = cfg

    def _build_dataset(self, frame, with_target):
        """Create a TabularDataset from `frame`; the target is omitted when `with_target` is False."""
        cfg = self._cfg
        target = frame[[cfg.target_col]].to_numpy() if with_target else None
        return TabularDataset(x_num=frame[cfg.cont_cols].to_numpy(),
                              x_cat=frame[cfg.cate_cols].to_numpy(),
                              y=target)

    def setup(self, stage=None):
        # All three datasets are built regardless of `stage`.
        self.train_dataset = self._build_dataset(self._df_train, with_target=True)
        self.valid_dataset = self._build_dataset(self._df_val, with_target=True)
        self.test_dataset = self._build_dataset(self._df_test, with_target=False)

    def train_dataloader(self):
        """Training loader (called during Trainer.fit())."""
        return DataLoader(self.train_dataset, **self._cfg.loader['train'])

    def val_dataloader(self):
        """Validation loader (called during Trainer.fit())."""
        return DataLoader(self.valid_dataset, **self._cfg.loader['valid'])

    def test_dataloader(self):
        """Test loader; reuses the validation loader settings."""
        return DataLoader(self.test_dataset, **self._cfg.loader['valid'])

In [33]:
# # Check datamodule
# _Data = DataModule(df_train, df_train, df_train, CFG)
# _Data.setup()
# _dl = _Data.train_dataloader()
# _data = iter(_dl).next()
# _data[2].squeeze(1).shape

## Pytorch Lightning Module

In [34]:
# ====================================================
# criterion
# ====================================================
class RMSELoss(nn.Module):
    """Root mean squared error: the square root of ``nn.MSELoss``."""

    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, yhat, y):
        mean_squared = self.mse(yhat, y)
        return mean_squared.sqrt()
    
def get_criterion(cfg):
    """Build the loss module selected by ``cfg.criterion_name``.

    Supported names: 'BCEWithLogitsLoss', 'CrossEntropyLoss',
    'MarginRankingLoss' (uses ``cfg.margin``) and 'mse'.

    Raises:
        NotImplementedError: for any other name.
    """
    name = cfg.criterion_name
    if name == 'BCEWithLogitsLoss':
        # No .to(device) needed: Lightning moves the module.
        return nn.BCEWithLogitsLoss(reduction="mean")
    if name == 'CrossEntropyLoss':
        return nn.CrossEntropyLoss()
    if name == 'MarginRankingLoss':
        return nn.MarginRankingLoss(margin=cfg.margin)
    if name == 'mse':
        return nn.MSELoss()
    raise NotImplementedError
# ====================================================
# optimizer
# ====================================================
def get_optimizer(model: nn.Module, config: dict):
    """Build the optimizer selected by ``config.optimizer_name``.

    Args:
        model: Module whose ``parameters()`` are optimised.
        config: Object exposing ``optimizer_name``, ``lr`` and, depending on
            the optimizer, ``weight_decay`` / ``amsgrad`` as attributes
            (attribute access is used, despite the ``dict`` annotation).

    Returns:
        The constructed optimizer.

    Raises:
        NotImplementedError: for an unknown name, or for 'RAdam' / 'Ranger'
            when no implementation is available in this environment.
    """
    def _extra_optimizer(name):
        # 'RAdam' / 'Ranger' used to be read from `optim`, which exists only
        # when `import torch_optimizer as optim` is enabled; with that import
        # commented out, selecting them failed with NameError. Prefer that
        # module when present, then torch.optim, else fail with a clear message.
        candidates = (globals().get('optim'), torch.optim)
        for module in candidates:
            optimizer_cls = getattr(module, name, None) if module is not None else None
            if optimizer_cls is not None:
                return optimizer_cls
        raise NotImplementedError(
            f"{name} is not available: enable `import torch_optimizer as optim` "
            "or use a torch version that provides it in torch.optim"
        )

    optimizer_name = config.optimizer_name
    if 'Adam' == optimizer_name:
        return Adam(model.parameters(),
                    lr=config.lr,
                    weight_decay=config.weight_decay,
                    amsgrad=config.amsgrad)
    elif 'RAdam' == optimizer_name:
        return _extra_optimizer('RAdam')(model.parameters(),
                                         lr=config.lr,
                                         weight_decay=config.weight_decay)
    elif 'AdamW' == optimizer_name:
        return AdamW(model.parameters(),
                     lr=config.lr,
                     weight_decay=config.weight_decay)
    elif 'Ranger' == optimizer_name:
        return _extra_optimizer('Ranger')(model.parameters(),
                                          lr=config.lr)
    elif 'sgd' == optimizer_name:
        return SGD(model.parameters(),
                   lr=config.lr,
                   momentum=0.9,
                   nesterov=True,
                   weight_decay=config.weight_decay,)
    else:
        raise NotImplementedError

# ====================================================
# scheduler
# ====================================================
def get_scheduler(cfg, optimizer):
    """Build the learning-rate scheduler selected by ``cfg.scheduler``.

    Supported names and the ``cfg`` attributes each one reads:
      - 'ReduceLROnPlateau': factor, patience, eps, min_lr
      - 'CosineAnnealingLR': T_max, min_lr
      - 'CosineAnnealingWarmRestarts': T_0, min_lr
      - 'get_linear_schedule_with_warmup': num_warmup_steps, T_max

    Raises:
        NotImplementedError: for any other name.
    """
    name = cfg.scheduler
    if name == 'ReduceLROnPlateau':
        # factor   : multiplicative decay applied to the learning rate
        # patience : number of non-improving steps tolerated before decaying
        # eps      : small constant guarding against nan / inf
        return ReduceLROnPlateau(optimizer, mode='min', factor=cfg.factor, patience=cfg.patience, verbose=True, eps=cfg.eps, min_lr=cfg.min_lr)
    if name == 'CosineAnnealingLR':
        # T_max   : number of steps in one half period
        # eta_min : minimum learning rate
        return CosineAnnealingLR(optimizer, T_max=cfg.T_max, eta_min=cfg.min_lr, last_epoch=-1)
    if name == 'CosineAnnealingWarmRestarts':
        # T_0    : number of iterations before the first restart
        # T_mult : growth factor of the cycle length
        return CosineAnnealingWarmRestarts(optimizer, T_0=cfg.T_0, T_mult=1, eta_min=cfg.min_lr, last_epoch=-1)
    if name == 'get_linear_schedule_with_warmup':
        return get_linear_schedule_with_warmup(optimizer,
                                               num_warmup_steps=cfg.num_warmup_steps,
                                               num_training_steps=cfg.T_max)
    raise NotImplementedError

def get_lightning_scheduler(cfg, optimizer):
    """Wrap the scheduler from ``get_scheduler`` in a Lightning scheduler dict.

    'ReduceLROnPlateau' is stepped once per epoch and monitors
    'val_loss_epoch'; every other scheduler is stepped once per step.
    """
    on_plateau = cfg.scheduler == 'ReduceLROnPlateau'
    scheduler_config = {'scheduler': get_scheduler(cfg, optimizer)}
    if on_plateau:
        scheduler_config['monitor'] = 'val_loss_epoch'
    scheduler_config['interval'] = 'epoch' if on_plateau else 'step'
    scheduler_config['frequency'] = 1
    return scheduler_config


In [47]:
# ====================================================
# model
# ====================================================
class MLP(nn.Module):
    """Fully connected regressor over numeric features and embedded categoricals.

    Every categorical column gets its own ``nn.Embedding``; the embeddings
    are concatenated, passed through dropout and joined with the numeric
    features before the dense stack.

    Args:
        src_num_dim: Number of numeric input features.
        n_categories: Embedding-table size for each categorical column.
        dropout: Dropout probability inside the dense stack.
        hidden: Width of the hidden layers.
        emb_dim: Embedding dimension used for every categorical column.
        dropout_cat: Dropout probability applied to the concatenated embeddings.
        bn: If True, use three hidden blocks with batch normalisation;
            otherwise two hidden blocks without it.
    """
    def __init__(self,
                 src_num_dim: int,
                 n_categories: List[int],
                 dropout: float = 0.0,
                 hidden: int = 50,
                 emb_dim: int = 10,
                 dropout_cat: float = 0.2,
                 bn: bool = False):
        super().__init__()
        
        self.embs = nn.ModuleList([
            nn.Embedding(x, emb_dim) for x in n_categories
        ])
        self.cat_dim = emb_dim * len(n_categories)
        self.dropout_cat = nn.Dropout(dropout_cat)
        
        if bn:
            # TODO: try other orderings of the layers
            self.sequence = nn.Sequential(
                nn.Linear(src_num_dim + self.cat_dim, hidden),
                nn.Dropout(dropout),
                nn.BatchNorm1d(hidden),
                nn.ReLU(),
                nn.Linear(hidden, hidden),
                nn.Dropout(dropout),
                nn.BatchNorm1d(hidden),
                nn.ReLU(),
                nn.Linear(hidden, hidden),
                nn.Dropout(dropout),
                nn.BatchNorm1d(hidden),
                nn.ReLU(),
                nn.Linear(hidden, 1)
            )
        else:
            self.sequence = nn.Sequential(
                nn.Linear(src_num_dim + self.cat_dim, hidden),
                nn.Dropout(dropout),
                nn.ReLU(),
                nn.Linear(hidden, hidden),
                nn.Dropout(dropout),
                nn.ReLU(),
                nn.Linear(hidden, 1)
            )

    def forward(self, x_num, x_cat):
        """Return one prediction per row, shape ``(batch,)``.

        Args:
            x_num: Float tensor, one row of numeric features per sample.
            x_cat: Long tensor of category codes, one column per embedding;
                a tensor with zero columns skips the embedding path.
        """
        if x_cat.shape[1] != 0:
            embs = [embedding(x_cat[:, i]) for i, embedding in enumerate(self.embs)]
            x_cat_emb = self.dropout_cat(torch.cat(embs, 1))
            x_all = torch.cat([x_num, x_cat_emb], 1)
        else:
            x_all = x_num
        x = self.sequence(x_all)
        # Drop only the trailing size-1 output dimension. A bare squeeze()
        # would also remove the batch dimension for a batch of one, giving a
        # 0-dim tensor that cannot be concatenated with other batches.
        return x.squeeze(-1)
    
class CNN(nn.Module):
    """1D-CNN regressor for tabular data.

    Numeric features and embedded categoricals are concatenated, expanded by
    a dense layer to ``hidden_size`` values, reshaped to
    ``(batch, channel_1, hidden_size // channel_1)`` and passed through two
    convolutional stages, max pooling and a final dense layer with a single
    output.

    Args:
        num_features: Number of numeric input features.
        hidden_size: Output width of the expansion layer; it is reshaped into
            ``channel_1`` channels, so it should be a multiple of ``channel_1``.
        n_categories: Embedding-table size for each categorical column.
        emb_dim: Embedding dimension used for every categorical column.
        dropout_cat: Dropout probability applied to the concatenated embeddings.
        channel_1: Number of channels after the reshape.
        channel_2: Number of channels inside the convolutional stages.
        channel_3: Number of output channels of the second stage.
        dropout_top: Dropout probability in the expansion layer and first stage.
        dropout_mid: Dropout probability at the start of the second stage.
        dropout_bottom: Dropout probability late in the second stage and in the head.
        weight_norm: Apply weight normalisation to the convolutions and the head.
        two_stage: If True, build and use the second convolutional stage.
        celu: Use CELU (alpha 0.06) after the expansion layer, otherwise ReLU.
        kernel1: Kernel size of the first convolution.
        leaky_relu: If True, append a LeakyReLU after the output layer.
    """
    def __init__(self,
                 num_features: int,
                 hidden_size: int,
                 n_categories: List[int],
                 emb_dim: int = 10,
                 dropout_cat: float = 0.2,
                 channel_1: int = 256,
                 channel_2: int = 512,
                 channel_3: int = 512,
                 dropout_top: float = 0.1,
                 dropout_mid: float = 0.3,
                 dropout_bottom: float = 0.2,
                 weight_norm: bool = True,
                 two_stage: bool = True,
                 celu: bool = True,
                 kernel1: int = 5,
                 leaky_relu: bool = False):
        super().__init__()

        num_targets = 1

        # Sequence lengths after the reshape and after the adaptive pooling,
        # and the flattened size that reaches the dense head.
        cha_1_reshape = int(hidden_size / channel_1)
        cha_po_1 = int(hidden_size / channel_1 / 2)
        cha_po_2 = int(hidden_size / channel_1 / 2 / 2) * channel_3

        self.cat_dim = emb_dim * len(n_categories)
        self.cha_1 = channel_1
        self.cha_2 = channel_2
        self.cha_3 = channel_3
        self.cha_1_reshape = cha_1_reshape
        self.cha_po_1 = cha_po_1
        self.cha_po_2 = cha_po_2
        self.two_stage = two_stage

        # NOTE: this layer is always weight-normalised, regardless of `weight_norm`.
        self.expand = nn.Sequential(
            nn.BatchNorm1d(num_features + self.cat_dim),
            nn.Dropout(dropout_top),
            nn.utils.weight_norm(nn.Linear(num_features + self.cat_dim, hidden_size), dim=None),
            nn.CELU(0.06) if celu else nn.ReLU()
        )

        def _norm(layer, dim=None):
            # Wrap `layer` in weight normalisation only when the flag is set.
            return nn.utils.weight_norm(layer, dim=dim) if weight_norm else layer

        self.conv1 = nn.Sequential(
            nn.BatchNorm1d(channel_1),
            nn.Dropout(dropout_top),
            _norm(nn.Conv1d(channel_1, channel_2, kernel_size=kernel1, stride=1, padding=kernel1 // 2, bias=False)),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(output_size=cha_po_1),
            nn.BatchNorm1d(channel_2),
            nn.Dropout(dropout_top),
            _norm(nn.Conv1d(channel_2, channel_2, kernel_size=3, stride=1, padding=1, bias=True)),
            nn.ReLU()
        )

        if self.two_stage:
            self.conv2 = nn.Sequential(
                nn.BatchNorm1d(channel_2),
                nn.Dropout(dropout_mid),
                _norm(nn.Conv1d(channel_2, channel_2, kernel_size=3, stride=1, padding=1, bias=True)),
                nn.ReLU(),
                nn.BatchNorm1d(channel_2),
                nn.Dropout(dropout_bottom),
                _norm(nn.Conv1d(channel_2, channel_3, kernel_size=5, stride=1, padding=2, bias=True)),
                nn.ReLU()
            )

        self.max_po_c2 = nn.MaxPool1d(kernel_size=4, stride=2, padding=1)

        self.flt = nn.Flatten()

        # Output head; the optional LeakyReLU is appended after the linear layer.
        dense_layers = [
            nn.BatchNorm1d(cha_po_2),
            nn.Dropout(dropout_bottom),
            _norm(nn.Linear(cha_po_2, num_targets), dim=0),
        ]
        if leaky_relu:
            dense_layers.append(nn.LeakyReLU())
        self.dense = nn.Sequential(*dense_layers)

        self.embs = nn.ModuleList([nn.Embedding(x, emb_dim) for x in n_categories])
        self.dropout_cat = nn.Dropout(dropout_cat)

    def forward(self, x_num, x_cat):
        """Return one prediction per row, shape ``(batch,)``.

        Args:
            x_num: Float tensor, one row of numeric features per sample.
            x_cat: Long tensor of category codes, one column per embedding;
                a tensor with zero columns skips the embedding path.
        """
        if x_cat.shape[1] != 0:
            embs = [embedding(x_cat[:, i]) for i, embedding in enumerate(self.embs)]
            x_cat_emb = self.dropout_cat(torch.cat(embs, 1))
            x = torch.cat([x_num, x_cat_emb], 1)
        else:
            x = x_num

        x = self.expand(x)

        # Treat the expanded vector as `cha_1` channels of a short sequence.
        x = x.reshape(x.shape[0], self.cha_1, self.cha_1_reshape)

        x = self.conv1(x)

        if self.two_stage:
            # The second stage multiplicatively gates the first-stage output.
            x = self.conv2(x) * x

        x = self.max_po_c2(x)
        x = self.flt(x)
        x = self.dense(x)

        # Drop only the trailing size-1 output dimension. A bare squeeze()
        # would also remove the batch dimension for a batch of one, giving a
        # 0-dim tensor that cannot be concatenated with other batches.
        return x.squeeze(-1)

    
def get_model(cfg):
    """Instantiate the network selected by ``cfg.model_name``.

    The name is matched by substring: anything containing 'MLP' builds the
    MLP, otherwise anything containing 'CNN' builds the CNN.

    Raises:
        NotImplementedError: if the name contains neither.
    """
    name = cfg.model_name
    if 'MLP' in name:
        return MLP(src_num_dim=len(cfg.cont_cols),
                   n_categories=cfg.n_categories,
                   hidden=256,
                   bn=True)
    if 'CNN' in name:
        return CNN(num_features=len(cfg.cont_cols),
                   n_categories=cfg.n_categories,
                   hidden_size=8 * 128,
                   emb_dim=10)
    raise NotImplementedError


In [45]:
# # Check model
# _Data = DataModule(df_train,df_train,df_train, CFG)
# _Data.setup()
# _dl = _Data.train_dataloader()
# _data = iter(_dl).next()
# _model = get_model(CFG)
# _output = _model(_data[0],_data[1])
# criterion = get_criterion(CFG)
# criterion(_output, _data[2].squeeze(0))

In [46]:
# # Check Scheduler
# model = get_model(CFG)
# optimizer = get_optimizer(model, CFG)
# scheduler = get_scheduler(CFG,optimizer)
# from pylab import rcParams
# lrs = []
# for epoch in range(1, CFG.epochs+1):
#     scheduler.step(epoch-1)
#     lrs.append(optimizer.param_groups[0]["lr"])
# rcParams['figure.figsize'] = 20,3
# print(lrs)
# plt.plot(lrs)

In [None]:
class Trainer(pl.LightningModule):
    """LightningModule wrapping the model, loss, optimizer and scheduler built from `cfg`.

    At the end of every training / validation epoch the competition metric
    is computed over all batches and logged as ``train_loss_epoch`` /
    ``val_loss_epoch``.
    """
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.model = get_model(cfg)
        self.criterion = get_criterion(cfg)
    
    def forward(self, x_num, x_cat):
        output = self.model(x_num, x_cat)
        return output
    
    def training_step(self, batch, batch_idx):
        loss, pred, labels = self.__share_step(batch, 'train')
        #self.log('train_loss', loss, on_step=True, prog_bar=True, logger=True)
        # The loss must stay a tensor here (no .item()). Predictions and
        # labels are only needed for the epoch-end metric, so they are
        # detached; otherwise every batch keeps its autograd graph alive
        # until the end of the epoch.
        return {'loss': loss, 'pred': pred.detach(), 'labels': labels.detach()}
    
    def validation_step(self, batch, batch_idx):
        loss, pred, labels = self.__share_step(batch, 'val')
        #self.log('val_loss', loss, on_step= True, prog_bar=True, logger=True)
        return {'loss': loss, 'pred': pred.detach(), 'labels': labels.detach()}

    def __share_step(self, batch, mode):
        """Forward pass and loss shared by the training and validation steps."""
        x_num, x_cat, labels = batch
        # reshape(-1) keeps predictions 1-D even if the model returns a
        # 0-dim tensor for a batch of one.
        preds = self.forward(x_num, x_cat).reshape(-1)
        labels = labels.squeeze(1)
        loss = self.criterion(preds, labels)
        return loss, preds, labels

    def training_epoch_end(self, outputs):
        self.__share_epoch_end(outputs, 'train') 
        self.log("lr", self.optimizer.param_groups[0]['lr'], prog_bar=True, logger=True)

    def validation_epoch_end(self, outputs):
        self.__share_epoch_end(outputs, 'val') 

    def __share_epoch_end(self, outputs, mode):
        """Compute the metric over all batches of the epoch and log it as '{mode}_loss_epoch'."""
        preds = torch.cat([output['pred'] for output in outputs]).to('cpu').detach().numpy()
        labels = torch.cat([output['labels'] for output in outputs]).to('cpu').detach().numpy()
        score = metric(labels, preds)
        self.log(f"{mode}_loss_epoch", score)
        
    def predict_step(self, batch, batch_idx, dataloader_idx=None):
        # Prediction batches carry no labels. `dataloader_idx` is accepted
        # (and ignored) so the hook also works when Lightning passes it.
        x_num, x_cat = batch
        preds = self.forward(x_num, x_cat).reshape(-1)
        return preds
    
    def configure_optimizers(self):
        self.optimizer = get_optimizer(self, self.cfg)
        self.scheduler = get_lightning_scheduler(self.cfg, self.optimizer)
        return {'optimizer': self.optimizer, 'lr_scheduler': self.scheduler}

## Train

In [None]:
def train(df_train, df_oof, seed) -> None:
    """Train one model per selected fold and write out-of-fold predictions.

    For every fold in ``CFG.trn_fold`` this fits a fresh ``Trainer`` module,
    reloads the checkpoint with the lowest ``val_loss_epoch``, exports that
    checkpoint's ``model`` weights to a ``.pth`` file and predicts the
    held-out fold with those weights.

    Args:
        df_train: Training frame; must contain a ``fold`` column.
        df_oof: Frame that receives the predictions in its ``pred`` column.
            Rows are selected through its own ``fold`` column. Mutated in place.
        seed: Seed identifier; inside this function it is only used to name
            runs and output files.

    Side effects:
        Overwrites ``CFG.T_max`` and ``CFG.num_warmup_steps``; writes
        checkpoints, logs, weight files and ``oof_{seed}.csv`` under
        ``OUTPUT_DIR``.
    """
    for fold in range(CFG.n_fold):
        if fold not in CFG.trn_fold:
            continue
        print(f"{'='*38} Fold: {fold} {'='*38}")
        # One path for both saving and re-loading the exported weights, so the
        # two can never disagree about the directory separator.
        weight_path = os.path.join(OUTPUT_DIR, f'{CFG.exp_name}_seed{seed}_fold{fold}_best.pth')
        # Logger
        #======================================================
        # NOTE: lr_monitor is created but currently not registered as a callback.
        lr_monitor = LearningRateMonitor(logging_interval='step')
        # Required to persist the trained weights.
        loss_checkpoint = ModelCheckpoint(
            dirpath=OUTPUT_DIR,
            filename=f"best_loss_seed{seed}_fold{fold}",
            monitor="val_loss_epoch",
            save_last=True,
            save_top_k=1,
            save_weights_only=True,
            mode="min",
        )
        csv_logger = CSVLogger(save_dir=str(OUTPUT_DIR), name=f"seed{seed}_fold_{fold}")
        wandb_logger = WandbLogger(
            project=f'{CFG.competition}',
            group= f'{CFG.exp_name}',
            name = f'seed{seed}_Fold{fold}',
            #name = f'{CFG.model_name}:{CFG.img_size}',
            save_dir=OUTPUT_DIR
        )
        # The held-out fold is passed as both the validation and the "test"
        # split, so predicting the test loader below yields the OOF predictions.
        data_module = DataModule(
            df_train[df_train['fold'] != fold],
            df_train[df_train['fold'] == fold],
            df_train[df_train['fold'] == fold],
            CFG
        )
        data_module.setup()

        # setting step param
        # ===================================================================================
        # Total number of optimizer steps: batches per epoch divided by the
        # gradient-accumulation factor (rounded up), times the number of epochs.
        CFG.T_max = int(math.ceil(len(data_module.train_dataloader())/CFG.grad_acc)*CFG.epochs)
        CFG.num_warmup_steps = int(CFG.T_max * CFG.num_warmup_steps_rate)
        print(f"set schedular T_max {CFG.T_max}")

        # NOTE: early stopping is created but currently not registered as a callback.
        early_stopping_callback = EarlyStopping(monitor='val_loss_epoch', mode="min", patience=20)
        trainer = pl.Trainer(
            logger=[csv_logger,wandb_logger],
            callbacks=[loss_checkpoint],#lr_monitor,early_stopping_callback
            default_root_dir=OUTPUT_DIR,
            accumulate_grad_batches=CFG.grad_acc,
            max_epochs=CFG.epochs,
            precision=CFG.precision,
            **CFG.trainer
        )
        # ================================
        # Train
        # ================================
        model = Trainer(CFG)
        trainer.fit(model, data_module)

        # Load the best-loss checkpoint and export only the inner model's weights.
        best_model = Trainer.load_from_checkpoint(cfg=CFG,checkpoint_path=loss_checkpoint.best_model_path)
        torch.save(best_model.model.state_dict(), weight_path)

        wandb.finish()

        # ================================
        # OOF
        # ================================
        predictions = inference(data_module, weight_path)
        df_oof.loc[df_oof["fold"] == fold, ['pred']] = predictions
    df_oof.to_csv(OUTPUT_DIR + f'oof_{seed}.csv',index=False)
        
def inference(data_module, weight_pass):
    """Predict the data module's test loader with weights loaded from disk.

    Args:
        data_module: Object exposing ``test_dataloader()``.
        weight_pass: Path to a ``state_dict`` file matching ``Trainer(CFG).model``.

    Returns:
        1-D numpy array holding all batch predictions, concatenated in loader
        order and flattened.
    """
    trainer = pl.Trainer(
        default_root_dir=OUTPUT_DIR,
        accumulate_grad_batches=CFG.grad_acc,
        max_epochs=CFG.epochs,
        precision=CFG.precision,
        **CFG.trainer
    )
    model = Trainer(CFG)
    # map_location makes loading independent of the device the file was saved
    # from; load_state_dict then copies the values into the model's own tensors.
    model.model.load_state_dict(torch.load(weight_pass, map_location='cpu'))
    predictions = trainer.predict(model, data_module.test_dataloader())
    # One concatenation along the batch axis instead of extending a Python
    # list row by row and stacking the rows afterwards (same values and order).
    return torch.cat(predictions).flatten().to('cpu').detach().numpy()

In [None]:
# Run the full cross-validation training once per configured seed.
for s in CFG.seed:
    # get_fold presumably (re)assigns the `fold` column for this seed -- confirm
    # against its definition. Note that df_train is rebound on every pass.
    df_train = get_fold(df_train, s)
    # Copy the fold assignment so train() can select the OOF rows by fold.
    df_oof['fold'] = df_train['fold']
    train(df_train,df_oof, s)
    # train() already calls wandb.finish() after each fold; this is an extra call.
    wandb.finish()
send_line_notification("finished")

In [None]:
topk = 3
scores = []
for s in CFG.seed:
    # Reload the out-of-fold predictions written by train() for this seed.
    df_oof = pd.read_csv(OUTPUT_DIR + f'oof_{s}.csv')
    # cv: score only the rows that actually received a prediction
    scored_rows = df_oof.dropna(subset=['pred'])
    score = metric(scored_rows['pm25_mid'], scored_rows['pred'])
    print(f'seed{s}:{score}')
    scores.append(score)
# Ascending order of score, so the first `topk` entries are the lowest scores.
A = np.argsort(np.array(scores))#[::-1]
topk_seeds = [CFG.seed[i] for i in A[:topk]]
topk_seeds

In [None]:
# Build one submission file per seed by averaging the per-fold test predictions.
for s in CFG.seed:
    # submission
    df_sub['pred'] = 0
    for fold in range(CFG.n_fold):
        if fold not in CFG.trn_fold:
            continue
        data_module = DataModule(
            df_train[df_train['fold'] != fold],
            df_train[df_train['fold'] == fold],
            df_test,
            CFG
        )
        data_module.setup()
        predictions = inference(data_module, OUTPUT_DIR  + f'{CFG.exp_name}_seed{s}_fold{fold}_best.pth')
        df_sub['pred'] += predictions
    # Mean over the trained folds, clipped at 0 from below.
    df_sub['pred'] = np.clip(df_sub['pred']/len(CFG.trn_fold), 0, None)
    df_sub.to_csv(OUTPUT_DIR + f'submission_{s}.csv', header=False, index=False)
# A bare expression inside a loop is never rendered by the notebook, so the
# preview is placed at the end of the cell (shows the last seed's submission).
df_sub.head()

In [None]:
if COMPUTE_IMPORTANCE:
    # Permutation feature importance on one validation fold: shuffle one
    # feature at a time and record how the score changes against the baseline.
    imp_fold = 0
    results = []
    print(' Computing NN feature importance...')
    df_valid = df_train[df_train["fold"] == imp_fold].copy()
    # Weights of the last configured seed for the inspected fold.
    imp_weight_path = OUTPUT_DIR + f'{CFG.exp_name}_seed{CFG.seed[-1]}_fold{imp_fold}_best.pth'
    data_module = DataModule(df_valid,df_valid, df_valid, CFG)
    data_module.setup()
    # COMPUTE BASELINE (NO SHUFFLE)
    predictions = inference(data_module, imp_weight_path)
    baseline_score = metric(df_valid["pm25_mid"].to_numpy(), predictions)
    results.append({'feature':'BASELINE','rmse':baseline_score})
    print(f' baseline {baseline_score}')
    for col in tqdm(CFG.feature_cols):
        # SHUFFLE ONE FEATURE
        # Work column by column: going through a single to_numpy() matrix of
        # all features would upcast mixed dtypes, and restoring from it would
        # silently change the column's dtype for the following iterations.
        save_col = df_valid[col].copy()
        df_valid[col] = np.random.permutation(save_col.to_numpy())
        # COMPUTE SCORE WITH THIS FEATURE SHUFFLED
        data_module = DataModule(df_valid,df_valid, df_valid, CFG)
        data_module.setup()
        predictions = inference(data_module, imp_weight_path)
        score = metric(df_valid["pm25_mid"].to_numpy(), predictions)
        results.append({'feature':col,'rmse':score})
        # Restore the original values (and dtype) before the next feature.
        df_valid[col] = save_col

    df = pd.DataFrame(results)
    df = df.sort_values('rmse')
    df.to_csv(OUTPUT_DIR + 'featureimportance.csv',index=False)

## Plot feature importance (runs only when COMPUTE_IMPORTANCE is set)

In [None]:
if COMPUTE_IMPORTANCE:
    # DISPLAY NN FEATURE IMPORTANCE
    # One bar per feature plus one for the BASELINE row.
    n_bars = len(CFG.feature_cols) + 1
    plt.figure(figsize=(10,20))
    plt.barh(np.arange(n_bars),df.rmse)
    plt.yticks(np.arange(n_bars),df.feature.values)
    plt.title('NN Feature Importance',size=16)
    plt.ylim((-1,n_bars))
    # Zoom the x-axis onto the observed score range so the differences between
    # bars are visible whatever the scale of the scores is.
    score_min, score_max = df.rmse.min(), df.rmse.max()
    x_pad = 0.05 * (score_max - score_min) or 0.5
    plt.xlim((score_min - x_pad, score_max + x_pad))
    plt.plot([baseline_score,baseline_score],[-1,n_bars], '--', color='orange',
             label=f'Baseline OOF\nRMSE={baseline_score:.3f}')
    plt.xlabel(f'Fold {imp_fold} OOF RMSE with feature permuted',size=14)
    plt.ylabel('Feature',size=14)
    plt.legend()
    plt.show()