# import

In [1]:
# embedding sizeをDeBERTa-v3に合わせてみる

In [2]:
# tabularとNLP 両方, それこそマルチモーダルにするか???
# SAINT + DeBERTa → 情報抽出 → 数層のMLP

# 順番的には
# 1. html contentを無視した lightgbm baseline
# 3. SAINTの実装
# 4. DeBERTa等, 自然言語モデルの実装
# 5. 3-4よりMultimodal化

In [3]:
# Mount Google Drive (Colab only) and change into the project folder so that
# the relative paths used later in this notebook (./data, ./outputs, ./src) resolve.
from google.colab import drive
drive.mount('/content/drive', force_remount=False)
%cd /content/drive/MyDrive/_MUFG_student

Mounted at /content/drive
/content/drive/MyDrive/_MUFG_student


In [4]:
%%capture
# einops is never imported directly in this notebook; presumably the local
# `saint` package depends on it -- TODO confirm. %%capture hides the pip log.
!pip install einops

In [5]:
# base
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import glob
import shutil

# others
import os
import warnings
warnings.simplefilter('ignore')

# main

import sys
ROOT_PATH = '/content/drive/My Drive/_MUFG_student'
sys.path.append(ROOT_PATH)
ROOT_PATH = '/content/drive/My Drive/_MUFG_student/saint'
sys.path.append(ROOT_PATH)
ROOT_PATH = '/content/drive/My Drive/_MUFG_student/saint/models'
sys.path.append(ROOT_PATH)

import torch
from torch import nn
import argparse
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from saint.utils import count_parameters, classification_scores, mean_sq_error
from saint.augmentations import embed_data_mask
from saint.augmentations import add_noise
from saint.models import SAINT
from saint.pretraining import SAINT_pretrain
import re
import gc

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder

In [6]:
# List the working directory to check that the %cd above landed in the project root.
!ls

data  figure  outputs  saint  src


# configuration

In [7]:
class SAINT_Config:
    """Static settings for one SAINT experiment.

    All values are plain class attributes. `setup()` later attaches the
    data/output paths to an instance, and `get_train_data()` attaches the
    category metadata. Attributes marked "for saint" are names the original
    author flagged as being read by the local `saint` package.
    """

    # ---- experiment identity (private) ----
    _exp_num = '003'
    model_name = "saint"

    # ---- general training settings ----
    seed = 0
    num_fold = 5
    drop_columns = ['id', 'html_content', 'goal']
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # ---- SAINT architecture ----
    task = 'binary'
    dtask = 'clf'
    cont_embeddings = 'MLP'
    embedding_size = 128          # alternative noted by the author: 768
    transformer_depth = 4         # alternative noted by the author: 6
    attention_heads = 8
    attention_dropout = 0.3       # alternative noted by the author: 0.1
    ff_dropout = 0.3              # alternative noted by the author: 0.1
    attentiontype = 'colrow'
    final_mlp_style = 'sep'

    # ---- optimisation ----
    optimizer = 'AdamW'
    scheduler = 'cosine'
    lr = 0.0001
    epochs = 60                   # alternative noted by the author: 100
    eval_epoch = 1
    batchsize = 4
    set_seed = seed               # for saint
    dset_seed = seed              # for saint

    # ---- dataset / logging switches ----
    vision_dset = False
    dset_id = None
    active_log = False

    # ---- self-supervised pretraining ----
    pretrain = True
    pretrain_epochs = 100
    pt_tasks = ['contrastive', 'denoising']
    pt_aug = []                   # alternative noted by the author: ['mixup','cutmix']
    pt_aug_lam = 0.1
    mixup_lam = 0.3
    train_mask_prob = 0
    mask_prob = 0
    ssl_avail_y = 0
    pt_projhead_style = 'diff'
    nce_temp = 0.7
    lam0 = 0.5
    lam1 = 10
    lam2 = 1
    lam3 = 10

    # ---- output location ----
    save_folder_name = f'Exp{_exp_num}_{model_name}'
    run_name = save_folder_name   # for saint
    
def set_seed(seed=0):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices),
    and switches cuDNN to deterministic algorithm selection.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so this only affects
    # Python processes launched after this point, not the running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all seeds every visible GPU (manual_seed only seeds the
    # current one) and is silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False

def setup(config):
    """Seed the RNGs, attach all data/output paths to `config`, create the folders.

    Attributes written on `config`: train_path, test_path, output_path,
    experiment_path, model_save_path, modelsave_path (same folder as
    model_save_path; the name marked "for saint"), figure_save_path and
    preds_save_path.

    Args:
        config: Object exposing `seed` and `save_folder_name`.

    Returns:
        The same `config` object, mutated in place.
    """
    print("### Configration Setup...")
    set_seed(config.seed)

    # input files and output root
    config.train_path = './data/train.csv'
    config.test_path = './data/test.csv'
    config.output_path = './outputs'

    experiment_dir = os.path.join(config.output_path, config.save_folder_name)
    config.experiment_path = experiment_dir
    print(f'    experiment_path  >> {config.experiment_path}')

    model_dir = os.path.join(experiment_dir, 'model')
    config.model_save_path = model_dir
    config.modelsave_path = model_dir  # for saint
    print(f'    model_save_path >> {config.model_save_path}')

    config.figure_save_path = os.path.join(experiment_dir, 'figure')
    print(f'    figure_save_path >> {config.figure_save_path}')

    config.preds_save_path = os.path.join(experiment_dir, 'preds')
    print(f'    preds_save_path >> {config.preds_save_path}')

    required_dirs = (
        config.output_path,
        config.experiment_path,
        config.model_save_path,
        config.figure_save_path,
        config.preds_save_path,
    )
    for folder in required_dirs:
        os.makedirs(folder, exist_ok=True)

    print("### Setup Complete. \n")
    return config

# Main

In [8]:
# 前処理系
# Train Test 共通の処理関数
def goal_split(x):
    """Return the integer made of the digits before the first '-' in `x`.

    For example '1001-2000' gives 1001 and '100000+' gives 100000.
    Raises ValueError (from `int`) when that part contains no digit.
    """
    head = x.split('-')[0]
    digits_only = re.sub('[^0-9]', '', head)
    return int(digits_only)

def singular_mask(df, column, threshold):
    """Replace values of `column` occurring fewer than `threshold` times by 'unknown'.

    The frame is modified in place and also returned.
    """
    frequency = df[column].value_counts()
    rare_values = frequency.loc[frequency < threshold].index
    is_rare = df[column].isin(rare_values)
    df.loc[is_rare, column] = 'unknown'
    return df

def test_cat_mask(df, column, unique_list):
    def cat_mask(x):
        if x not in unique_list[column]:
            x = 'unknown'
        return x
    df.loc[:,column] = df[column].map(cat_mask)
    return df
# ======================


def get_train_data(config):
    """Load and preprocess the training CSV and register category metadata on `config`.

    Steps:
      1. add `goal_min` parsed from `goal`;
      2. replace `category2` values seen fewer than 10 times by 'unknown';
      3. drop `config.drop_columns`;
      4. label-encode every object-dtype column.

    Attributes written on `config`: unique_cat_list, categories,
    label_encoders (only when at least one categorical column exists),
    cat_dims, con_idxs and cat_idxs.

    Returns:
        (X, y, cat_dims, cat_idxs, con_idxs) where `X` is the frame without
        'state', `y` is the 'state' column, and the index lists are column
        positions within `X`.
    """
    train_df = pd.read_csv(config.train_path)

    # preprocessing
    train_df['goal_min'] = train_df['goal'].map(goal_split)

    # Rare category2 values are collapsed into 'unknown'.
    threshold = 10
    train_df = singular_mask(train_df, 'category2', threshold)
    # Every value that survives the masking is label-encoded below, so every
    # one of them must stay allowed at test time. The previous `counts >
    # threshold` filter dropped values with exactly `threshold` rows (and a
    # small 'unknown' bucket), so the test set mapped them to 'unknown' although
    # the model had been trained on them.
    counts = train_df['category2'].value_counts()
    unique_cat_list = {'category2': counts.index.values}
    config.unique_cat_list = unique_cat_list

    if len(config.drop_columns) > 0:  # drop columns that are not features
        train_df = train_df.drop(config.drop_columns, axis=1)

    # label encoding + registration of the categorical columns
    config.categories = train_df.columns[train_df.dtypes == "object"].values
    cat_dims = []
    if len(config.categories) > 0:
        label_encoders = {}
        for c in config.categories:
            print(c)
            encoder = LabelEncoder()
            train_df[c] = encoder.fit_transform(train_df[c])
            label_encoders[c] = encoder
            cat_dims.append(len(encoder.classes_))
        config.label_encoders = label_encoders

    X = train_df.drop('state', axis=1)
    y = train_df['state']

    categories = set(config.categories)
    cat_idxs = [i for i, c in enumerate(X.columns) if c in categories]
    # Ascending order by construction: the per-column mean/std computed from
    # con_idxs must line up with the continuous columns of the dataset, so the
    # order must not depend on set iteration order.
    cat_idx_set = set(cat_idxs)
    con_idxs = [i for i in range(len(X.columns)) if i not in cat_idx_set]
    # Leading 1 is the cardinality of the constant-zero `cls` column that
    # DataSetCatCon prepends to the categorical features.
    cat_dims = np.append(np.array([1]), np.array(cat_dims)).astype(int)

    config.cat_dims = cat_dims
    config.con_idxs = con_idxs
    config.cat_idxs = cat_idxs

    return X, y, cat_dims, cat_idxs, con_idxs

def get_test_data(config):
    """Load the test CSV and apply the preprocessing fitted on the training data.

    Relies on `config.unique_cat_list`, `config.categories` and
    `config.label_encoders` having been filled in by `get_train_data`.

    Returns:
        The preprocessed test DataFrame.
    """
    frame = pd.read_csv(config.test_path)

    # preprocessing
    frame['goal_min'] = frame['goal'].map(goal_split)
    frame = test_cat_mask(frame, 'category2', config.unique_cat_list)

    if len(config.drop_columns) > 0:
        frame = frame.drop(columns=config.drop_columns)

    # label encoding with the encoders fitted on the training data
    for col in config.categories:
        print(col)
        fitted_encoder = config.label_encoders[col]
        frame[col] = fitted_encoder.transform(frame[col])

    return frame

In [9]:
class DataSetCatCon(Dataset):
    """Torch dataset that splits a frame into categorical and continuous parts.

    Each item is a 5-tuple:
        (cls + categorical values, continuous values, label,
         cls mask + categorical mask, continuous mask)
    where `cls` is a constant 0 prepended to the categorical values and the
    masks hold 1 for present values and 0 for missing ones.
    """

    def __init__(self, X, Y, cat_cols, task='clf', continuous_mean_std=None):
        """
        Args:
            X: pandas DataFrame of features.
            Y: pandas Series of labels (indexed positionally via `.iloc`).
            cat_cols: Positional indices of the categorical columns of `X`.
            task: Accepted for interface compatibility; not used here.
            continuous_mean_std: Optional `(mean, std)` pair applied to the
                continuous columns, one entry per continuous column in
                ascending column-position order.
        """
        cat_cols = list(cat_cols)

        # 1 where a value is present, 0 where it is missing
        temp = X.fillna("MissingValue")
        X_mask = temp.ne("MissingValue").astype(int)

        # Ascending positions by construction: `continuous_mean_std` is built by
        # the caller from its own ordered index list, so the column order here
        # must not depend on set iteration order.
        cat_set = set(cat_cols)
        con_cols = [i for i in range(X.shape[1]) if i not in cat_set]

        self.X1 = X.iloc[:, cat_cols].copy().astype(np.int64)  # categorical columns
        self.X2 = X.iloc[:, con_cols].copy().astype(np.float32)  # numerical columns
        self.X1_mask = X_mask.iloc[:, cat_cols].copy().astype(np.int64)  # categorical columns
        self.X2_mask = X_mask.iloc[:, con_cols].copy().astype(np.int64)  # numerical columns
        self.y = Y
        # constant token prepended to the categorical features, always unmasked
        self.cls = np.expand_dims(np.zeros_like(self.y, dtype=int), -1)
        self.cls_mask = np.expand_dims(np.ones_like(self.y, dtype=int), -1)
        if continuous_mean_std is not None:
            mean, std = continuous_mean_std
            self.X2 = (self.X2 - mean) / std

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        # X1 has categorical data, X2 has continuous
        categorical = np.concatenate((self.cls[idx], self.X1.iloc[idx]))
        continuous = np.array(self.X2.iloc[idx])
        label = np.array(self.y.iloc[idx])
        categorical_mask = np.concatenate((self.cls_mask[idx], self.X1_mask.iloc[idx]))
        continuous_mask = np.array(self.X2_mask.iloc[idx])
        return categorical, continuous, label, categorical_mask, continuous_mask


In [10]:
# Train
from sklearn.metrics import f1_score
def f1_metric(preds, train_data):
    """Custom evaluation callback returning ('f1', score, True).

    `preds` are rounded to the nearest integer before scoring against
    `train_data.get_label()`; the trailing True is the third element of the
    returned tuple (presumably LightGBM's "higher is better" flag -- TODO confirm).
    """
    y_true = train_data.get_label()
    y_hat = np.round(preds)
    score = f1_score(y_true, y_hat)
    return 'f1', score, True

def saint_training(config, X, y, cat_dims, cat_idxs, con_idxs, param_tuning=False):
    """Train SAINT with stratified K-fold cross-validation.

    For every fold this function
      * computes mean/std of the continuous training columns and saves them as
        `cms{fold}.npy` in `config.model_save_path`;
      * builds a fresh SAINT model, runs `SAINT_pretrain` on it, then fine-tunes
        it with cross-entropy for `config.epochs` epochs;
      * every `config.eval_epoch` epochs scores train and validation data with
        `classification_scores` and, when the validation F1 improves, saves
        the weights to `bestmodel{fold}.pth` and remembers the validation
        outputs.

    Args:
        config: Experiment configuration (see `SAINT_Config` / `setup`).
        X: Feature DataFrame.
        y: Label Series.
        cat_dims: Cardinalities of the categorical inputs (including the
            leading entry for the cls column).
        cat_idxs: Column positions of categorical features in `X`.
        con_idxs: Column positions of continuous features in `X`.
        param_tuning: Unused; kept for interface compatibility.

    Returns:
        (cv_score, oof_pred): F1 of the arg-maxed out-of-fold outputs and the
        `(len(X), 2)` float32 array of those outputs.

    Raises:
        RuntimeError: If a fold finishes without a single evaluation
            (for example `config.epochs == 0`).
    """
    folds = StratifiedKFold(n_splits=config.num_fold)
    splits = folds.split(np.zeros(len(X)), y)

    oof_pred = np.zeros((len(X), 2), dtype=np.float32)
    # os.cpu_count() may return None, but DataLoader needs an integer.
    num_workers = os.cpu_count() or 0

    for fold, (train_index, valid_index) in enumerate(splits):

        print(f'\nStart fold {fold} =====================================')
        X_train = X.iloc[train_index].reset_index(drop=True)
        y_train = y.iloc[train_index].reset_index(drop=True)

        # Normalisation statistics come from the training part only and are
        # saved so that inference can reuse them for this fold's model.
        con_train = np.array(X_train.iloc[:, con_idxs], dtype=np.float32)
        train_mean, train_std = con_train.mean(0), con_train.std(0)
        train_std = np.where(train_std < 1e-6, 1e-6, train_std)  # avoid division by ~0
        continuous_mean_std = np.array([train_mean, train_std]).astype(np.float32)
        np.save(os.path.join(config.model_save_path, f'cms{fold}.npy'), continuous_mean_std)
        print(f'continuous_mean_std: \n{continuous_mean_std}\n')
        X_valid = X.iloc[valid_index].reset_index(drop=True)
        y_valid = y.iloc[valid_index].reset_index(drop=True)

        print(f'X_train shape: {X_train.shape}')
        print(f'y_train shape: {y_train.shape}')
        print(f'X_valid shape: {X_valid.shape}')
        print(f'y_valid shape: {y_valid.shape}')

        # Dataset + Dataloader
        train_ds = DataSetCatCon(X_train, y_train, cat_idxs, task=config.dtask, continuous_mean_std=continuous_mean_std)
        trainloader = DataLoader(train_ds, batch_size=config.batchsize, shuffle=True, num_workers=num_workers)
        valid_ds = DataSetCatCon(X_valid, y_valid, cat_idxs, task=config.dtask, continuous_mean_std=continuous_mean_std)
        validloader = DataLoader(valid_ds, batch_size=config.batchsize, shuffle=False, num_workers=num_workers)

        # define model
        model = SAINT(
            categories=tuple(cat_dims),
            num_continuous=len(con_idxs),
            dim=config.embedding_size,
            dim_out=1,
            depth=config.transformer_depth,
            heads=config.attention_heads,
            attn_dropout=config.attention_dropout,
            ff_dropout=config.ff_dropout,
            mlp_hidden_mults=(4, 2),
            cont_embeddings=config.cont_embeddings,
            attentiontype=config.attentiontype,
            final_mlp_style=config.final_mlp_style,
            y_dim=2
        )
        criterion = nn.CrossEntropyLoss().to(config.device)
        model.to(config.device)
        optimizer = optim.AdamW(model.parameters(), lr=config.lr)

        # pretraining
        model = SAINT_pretrain(model, cat_idxs, X_train, y_train, continuous_mean_std, config, config.device)

        # training
        # Start below any reachable F1 so the first evaluation is always
        # recorded; with 0.0 a fold whose F1 never rose above 0 left
        # best_valid_preds as None and crashed after the loop.
        best_valid_f1 = -1.0
        best_valid_preds = None
        best_epoch = 0
        print('Training begins now.')
        for epoch in range(config.epochs):
            model.train()
            running_loss = 0.0
            for data in trainloader:
                optimizer.zero_grad()
                x_categ, x_cont, y_gts, cat_mask, con_mask = (t.to(config.device) for t in data)

                _, x_categ_enc, x_cont_enc = embed_data_mask(x_categ, x_cont, cat_mask, con_mask, model, config.vision_dset)
                reps = model.transformer(x_categ_enc, x_cont_enc)
                y_reps = reps[:, 0, :]  # representation at the cls position
                y_outs = model.mlpfory(y_reps)
                # reshape(-1) keeps the labels 1-D. squeeze() collapsed a
                # single-sample batch (the last batch whenever the fold size is
                # not a multiple of batchsize) into a 0-d tensor.
                loss = criterion(y_outs, y_gts.reshape(-1))
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
            print(f'epoch{epoch+1}: running_loss={running_loss}')
            if epoch % config.eval_epoch == 0:
                model.eval()
                with torch.no_grad():
                    train_accuracy, train_auroc, train_f1, _ = classification_scores(model, trainloader, config.device, 'binary', config.vision_dset)
                    print(f'[EPOCH {epoch+1}] TRAIN F1: {train_f1} TRAIN ACCURACY: {train_accuracy:.3f}, TRAIN AUROC: {train_auroc:.3f}')

                    valid_accuracy, valid_auroc, valid_f1, valid_pred = classification_scores(model, validloader, config.device, 'binary', config.vision_dset)
                    print(f'[EPOCH {epoch+1}] VALID F1: {valid_f1} VALID ACCURACY: {valid_accuracy:.3f}, VALID AUROC: {valid_auroc:.3f}')
                    if valid_f1 > best_valid_f1:
                        best_valid_f1 = valid_f1
                        best_valid_preds = valid_pred
                        best_epoch = epoch
                        torch.save(model.state_dict(), f'{config.modelsave_path}/bestmodel{fold}.pth')
                model.train()

        if best_valid_preds is None:
            raise RuntimeError(
                f'fold {fold}: no validation pass was run (config.epochs={config.epochs}), '
                'so there are no out-of-fold predictions to store'
            )

        print(f'F1 on best model: {best_valid_f1:.6f} (Epoch{best_epoch})')

        print(best_valid_preds.shape)
        oof_pred[valid_index] = best_valid_preds.astype(np.float32)
        del model; gc.collect()

    pred = np.argmax(oof_pred, axis=1)
    cv_score = f1_score(y, pred)
    print(f'\nFinal CV = {cv_score:.5f}')

    return cv_score, oof_pred

# util 入れ替え

In [11]:
def inferring(config, X_test):
    """Average the outputs of every saved fold model on the test set.

    For each `bestmodel{fold}.pth` found in `config.model_save_path`, the
    matching `cms{fold}.npy` normalisation statistics are loaded, the model is
    rebuilt with the same hyper-parameters as in training, and its outputs on
    `X_test` are accumulated. The mean over folds is saved to
    `sub_pred.npy` in `config.preds_save_path`.

    Side effect: `config.model_weights` is set to the list of weight files used
    (ordered by fold number).

    Args:
        config: Experiment configuration populated by `setup` and
            `get_train_data`.
        X_test: Preprocessed test DataFrame.

    Returns:
        `(len(X_test), 2)` float32 array with the fold-averaged raw outputs of
        `model.mlpfory` (the same values that are fed to CrossEntropyLoss during
        training; no softmax is applied here).

    Raises:
        FileNotFoundError: If no `bestmodel{fold}.pth` file exists.
    """
    # Pair each weight file with its fold number parsed from the file name.
    # Pairing by sorted position breaks for 10+ folds, because
    # 'bestmodel10.pth' sorts before 'bestmodel2.pth'.
    fold_weights = []
    for path in glob.glob(os.path.join(config.model_save_path, 'bestmodel*.pth')):
        matched = re.search(r'bestmodel(\d+)\.pth$', os.path.basename(path))
        if matched:
            fold_weights.append((int(matched.group(1)), path))
    fold_weights.sort()
    if not fold_weights:
        # Without this guard the function silently returned all zeros.
        raise FileNotFoundError(
            f'no bestmodel<fold>.pth found in {config.model_save_path}; run training first'
        )
    config.model_weights = [path for _, path in fold_weights]

    sub_pred = np.zeros((len(X_test), 2), dtype=np.float32)
    print(sub_pred.shape)
    # The dataset requires labels; these placeholders are never used.
    dummy_y = pd.Series([i for i in range(len(X_test))])
    # os.cpu_count() may return None, but DataLoader needs an integer.
    num_workers = os.cpu_count() or 0

    for fold, model_weight in fold_weights:

        continuous_mean_std = np.load(os.path.join(config.model_save_path, f'cms{fold}.npy'))
        test_ds = DataSetCatCon(X_test, dummy_y, config.cat_idxs, task=config.dtask, continuous_mean_std=continuous_mean_std)
        testloader = DataLoader(test_ds, batch_size=config.batchsize, shuffle=False, num_workers=num_workers)

        model = SAINT(
            categories=tuple(config.cat_dims),
            num_continuous=len(config.con_idxs),
            dim=config.embedding_size,
            dim_out=1,
            depth=config.transformer_depth,
            heads=config.attention_heads,
            attn_dropout=config.attention_dropout,
            ff_dropout=config.ff_dropout,
            mlp_hidden_mults=(4, 2),
            cont_embeddings=config.cont_embeddings,
            attentiontype=config.attentiontype,
            final_mlp_style=config.final_mlp_style,
            y_dim=2
        )
        # map_location lets weights saved on a GPU load on a CPU-only runtime.
        model.load_state_dict(torch.load(model_weight, map_location=config.device))
        model.to(config.device)

        model.eval()
        fold_outputs = []
        with torch.no_grad():
            for data in testloader:
                x_categ, x_cont, _, cat_mask, con_mask = (t.to(config.device) for t in data)
                _, x_categ_enc, x_cont_enc = embed_data_mask(x_categ, x_cont, cat_mask, con_mask, model, config.vision_dset)
                reps = model.transformer(x_categ_enc, x_cont_enc)
                y_reps = reps[:, 0, :]  # representation at the cls position
                y_outs = model.mlpfory(y_reps)
                fold_outputs.append(y_outs.float().cpu())

        sub_pred += torch.cat(fold_outputs, dim=0).numpy() / len(fold_weights)
        del model; gc.collect()

    np.save(os.path.join(config.preds_save_path, 'sub_pred.npy'), sub_pred)
    return sub_pred  # fold-averaged raw model outputs, not probabilities

In [12]:
def copy_scripts(config):
    """Copy every notebook under ./src into `<experiment_path>/scripts`.

    The destination folder is created when missing; each copied file is
    reported on stdout.
    """
    target_dir = os.path.join(config.experiment_path, 'scripts')
    os.makedirs(target_dir, exist_ok=True)

    notebooks = glob.glob('./src/*.ipynb')
    for source_file in notebooks:
        file_name = source_file.rsplit('/', 1)[-1]
        dst_file = os.path.join(target_dir, file_name)
        print(f'[save file] {dst_file}')
        shutil.copyfile(source_file, dst_file)

In [13]:
def main():
    """Run the whole experiment: setup, training, inference, leak fix, submission.

    Writes the submission CSV into `preds_save_path` (the CV score is embedded
    in the file name) and copies the notebooks from ./src next to the results.
    """
    saint_config = setup(SAINT_Config())
    train_path = './data/train.csv'
    test_path = './data/test.csv'
    submit_path = './data/sample_submit.csv'

    X, y, cat_dims, cat_idxs, con_idxs = get_train_data(saint_config)
    X_test = get_test_data(saint_config)
    score, oof_pred = saint_training(saint_config, X, y, cat_dims, cat_idxs, con_idxs, param_tuning=False)

    sub_pred = inferring(saint_config, X_test)
    sub = pd.read_csv(submit_path, header=None)
    sub[1] = np.argmax(sub_pred, axis=1).astype(int)

    def fix_leak(sub, train_path, test_path):
        """Overwrite predictions of test rows that also appear in the training data.

        A test row counts as a duplicate when all of `key_cols` match a
        training row; its prediction is replaced by that row's `state`.
        Assumes `sub` rows are in the same order as the rows of the test CSV.
        """
        print("===== fix_leak =====")
        key_cols = ['goal', 'country', 'duration', 'category1', 'category2', 'html_content']
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)
        # Keep one training row per key. Without this, a test row matching
        # several training rows is duplicated by the left merge, which shifts
        # the merged index away from the test/submission row positions and
        # makes the loop below overwrite the wrong rows.
        train_keys = train_df[key_cols + ['state']].drop_duplicates(subset=key_cols)
        duplicated = pd.merge(test_df, train_keys, on=key_cols, how="left", validate="many_to_one")
        duplicated = duplicated[~duplicated["state"].isnull()]
        for i in duplicated.index:
            print(f'Fix index{i}: {sub.loc[i,1]}')
            sub.loc[i, 1] = int(duplicated.loc[i, "state"])
            print(f'To {sub.loc[i,1]}')
        return sub
    sub = fix_leak(sub, train_path, test_path)

    # submission file
    sub.to_csv(os.path.join(saint_config.preds_save_path, f'Exp{saint_config._exp_num}_CV{int(score*(10**10))}_submission.csv'), index=False, header=False)

    # keep a copy of the scripts with the results
    copy_scripts(saint_config)

# Run the full pipeline (setup, training, inference, submission file).
main()

### Configration Setup...
    experiment_path  >> ./outputs/Exp003_saint
    model_save_path >> ./outputs/Exp003_saint/model
    figure_save_path >> ./outputs/Exp003_saint/figure
    preds_save_path >> ./outputs/Exp003_saint/preds
### Setup Complete. 

country
category1
category2
country
category1
category2

continuous_mean_std: 
[[3.26407051e+01 1.19044473e+04]
 [1.21512165e+01 2.16121484e+04]]

X_train shape: (7832, 5)
y_train shape: (7832,)
X_valid shape: (1959, 5)
y_valid shape: (1959,)
Pretraining begins!
Epoch: 0, Running Loss: 2933.033352551982
Epoch: 1, Running Loss: 458.09788082540035
Epoch: 2, Running Loss: 382.3175116367638
Epoch: 3, Running Loss: 309.2883503451012
Epoch: 4, Running Loss: 243.42846058821306
Epoch: 5, Running Loss: 175.0873505875934
Epoch: 6, Running Loss: 165.148962332285
Epoch: 7, Running Loss: 177.89241575053893
Epoch: 8, Running Loss: 153.03003870195244
Epoch: 9, Running Loss: 102.55047184810974
Epoch: 10, Running Loss: 103.15929240174592
Epoch: 11, Runni

ValueError: ignored

# Parameter tuning

# Debug