# Import

In [None]:
# Mount Google Drive (Colab only) and switch into the project directory so the
# relative paths used later in this notebook (./data, ./outputs) resolve.
from google.colab import drive
# force_remount=False: reuse an existing mount instead of re-authenticating.
drive.mount('/content/drive', force_remount=False)
%cd /content/drive/MyDrive/_MUFG_student

Mounted at /content/drive
/content/drive/MyDrive/_MUFG_student


In [None]:
# deberta
%%capture
!pip install transformers datasets sentencepiece torchmetrics

In [None]:
# saint
%%capture
!pip install einops

In [None]:
# base
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import glob
import random
import shutil

# others
import os
import warnings
warnings.simplefilter('ignore')

# deberta-main
import gc
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchmetrics import F1Score
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler

from datasets import load_dataset
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, confusion_matrix

import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup
# `transformers.AdamW` was deprecated and is no longer exported by recent
# transformers releases. Because the install cell above is unpinned, fall back
# to the PyTorch optimizer so a fresh run does not die on an ImportError.
# NOTE(review): torch.optim.AdamW has no `correct_bias` argument - confirm the
# training code does not pass it.
try:
    from transformers import AdamW
except ImportError:
    from torch.optim import AdamW

# base
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import glob
import shutil


# saint-main
import sys

# Register the project root, the local `saint` package directory and its
# `models` sub-directory on the import path (in that order). ROOT_PATH ends up
# holding the last entry, exactly as with the three explicit assignments.
for ROOT_PATH in (
    '/content/drive/My Drive/_MUFG_student',
    '/content/drive/My Drive/_MUFG_student/saint',
    '/content/drive/My Drive/_MUFG_student/saint/models',
):
    sys.path.append(ROOT_PATH)

import torch.optim as optim
from saint.utils import count_parameters, classification_scores, mean_sq_error
from saint.augmentations import embed_data_mask
from saint.augmentations import add_noise
from saint.models import SAINT
from saint.pretraining import SAINT_pretrain
from sklearn.preprocessing import LabelEncoder

In [None]:
# Sanity check: list the current working directory (expected: data, outputs, saint, ...).
!ls

data  figure  outputs  saint  src


# Config

In [None]:
class DeBERTa_Config:
    """Settings for the DeBERTa text model.

    The class body executes when the cell runs, so the tokenizer is loaded
    via `AutoTokenizer.from_pretrained` at definition time, not at
    instantiation time.
    """

    # Training parameters
    model_name = "microsoft/deberta-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    max_token_len = 256
    # Save destination. model_name contains a '/', so joining it onto the
    # experiment root in setup() produces a nested 'microsoft/deberta-base' directory.
    save_folder_name = f'{model_name}'

class SAINT_Config:
    """Settings for the SAINT tabular model.

    NOTE(review): the attribute names below presumably mirror the options
    expected by the local `saint` package - verify against that package before
    renaming or removing any of them.
    """

    # Training parameters
    model_name = "saint"
    drop_columns = ['id', 'html_content', 'goal']# columns dropped before feeding SAINT
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # SAINT parameters
    task = 'binary'
    dtask = 'clf'
    cont_embeddings = 'MLP'
    embedding_size = 32
    transformer_depth = 4
    attention_heads = 8
    attention_dropout = 0.3
    ff_dropout = 0.3
    attentiontype = 'colrow'
    optimizer = 'AdamW'
    scheduler = 'cosine'

    vision_dset = False
    dset_id = None
    active_log = False
    pretrain = True
    pretrain_epochs = 100

    pt_tasks = ['contrastive','denoising']
    pt_aug = []# alternative left by the author: ['mixup','cutmix']
    pt_aug_lam = 0.1
    mixup_lam = 0.3
    train_mask_prob = 0# 0
    mask_prob = 0
    ssl_avail_y = 0
    pt_projhead_style = 'diff'
    nce_temp = 0.7
    lam0 = 0.5
    lam1 = 10
    lam2 = 1
    lam3 = 10
    final_mlp_style = 'sep'

    # Save destination
    save_folder_name = f'{model_name}'
    run_name = save_folder_name

def set_seed(seed=0):
    """Seed every random number generator used in this notebook.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) and
    pins cuDNN to deterministic behaviour.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    # Only inherited by processes started after this point; the hash seed of
    # the running interpreter was fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (silently ignored without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels between runs; force it
    # off in case an earlier cell enabled it.
    torch.backends.cudnn.benchmark = False

def setup(ex_root, config):
    print("### Configration Setup...")

    config.output_path = ex_root
    config.experiment_path = os.path.join(config.output_path, config.save_folder_name)
    print(f'    experiment_path  >> {config.experiment_path}')
    config.model_save_path = os.path.join(config.experiment_path, 'model')
    config.modelsave_path = os.path.join(config.experiment_path, 'model')# saint用

    print(f'    model_save_path >> {config.model_save_path}')
    config.figure_save_path = os.path.join(config.experiment_path, 'figure')
    print(f'    figure_save_path >> {config.figure_save_path}')
    config.preds_save_path = os.path.join(config.experiment_path, 'preds')
    print(f'    preds_save_path >> {config.preds_save_path}')
    
    for d in [config.output_path, config.experiment_path, config.model_save_path, config.figure_save_path, config.preds_save_path]:
        os.makedirs(d, exist_ok=True)

    print("### Setup Complete. \n")
    return config

class Multi_Config:
    """Top-level experiment configuration combining the DeBERTa and SAINT configs.

    Everything in this class body runs once, when the cell is executed:
    the RNGs are seeded, the experiment root is printed and `setup()` creates
    the output directories for both sub-configs. Instantiating the class
    afterwards does not repeat those side effects.
    """

    # private
    model_name = 'DeBERTa_SAINT_Multimodal'

    train_path = './data/train.csv'
    test_path = './data/test.csv'

    # Parameters shared by both models
    _exp_num = '001'
    seed = 0
    set_seed(seed)
    num_fold = 5
    batch_size = 32
    n_epochs = 10

    lr = 2e-5
    weight_decay = 2e-5
    beta = (0.9, 0.98)
    num_warmup_steps_rate = 0.01
    gradient_accumulation_steps = 1
    num_eval = 1
    num_msd = 8
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Save destination
    save_folder_name = f'Exp{_exp_num}_{model_name}'
    output_path = './outputs'
    experiment_path = os.path.join(output_path, save_folder_name)
    print(f'    experiment_root  >> {experiment_path}')

    # deberta: sub-config whose folders live under the experiment root
    deberta_config = setup(experiment_path, DeBERTa_Config())

    # saint: sub-config whose folders live under the experiment root
    saint_config = setup(experiment_path, SAINT_Config())
    # These store the seed as plain attributes on the SAINT config; they do
    # not call the module-level set_seed() function.
    saint_config.set_seed = seed
    saint_config.dset_seed = seed

# multi_config = Multi_Config()

    experiment_root  >> ./outputs/Exp001_DeBERTa_SAINT_Multimodal
### Configration Setup...
    experiment_path  >> ./outputs/Exp001_DeBERTa_SAINT_Multimodal/microsoft/deberta-base
    model_save_path >> ./outputs/Exp001_DeBERTa_SAINT_Multimodal/microsoft/deberta-base/model
    figure_save_path >> ./outputs/Exp001_DeBERTa_SAINT_Multimodal/microsoft/deberta-base/figure
    preds_save_path >> ./outputs/Exp001_DeBERTa_SAINT_Multimodal/microsoft/deberta-base/preds
### Setup Complete. 

### Configration Setup...
    experiment_path  >> ./outputs/Exp001_DeBERTa_SAINT_Multimodal/saint
    model_save_path >> ./outputs/Exp001_DeBERTa_SAINT_Multimodal/saint/model
    figure_save_path >> ./outputs/Exp001_DeBERTa_SAINT_Multimodal/saint/figure
    preds_save_path >> ./outputs/Exp001_DeBERTa_SAINT_Multimodal/saint/preds
### Setup Complete. 



# Dataset

In [None]:
# Columns concatenated, in this order and joined by the tokenizer's separator
# token, into the text fed to DeBERTa (see get_train_data / get_test_data).
COL_NAMES = ['goal', 'duration', 'country', 'category1', 'category2', 'html_content']

def text_cleaning(text):
    """Turn an HTML-ish string into plain, period-terminated sentences.

    The text is split on the literal "</li>"; each piece has its tags and
    newlines removed, is stripped, and receives a trailing '.' if it lacks
    one. Pieces that are empty after cleaning (i.e. just '.') are dropped,
    and the remaining pieces are joined with single spaces.
    """
    sentences = []
    for fragment in text.split(r"</li>"):
        body = remove_tag(fragment).replace('\n', '').strip()
        sentence = body if body.endswith('.') else body + '.'
        if len(sentence) == 1:
            # Only the period is present, so the fragment held no text.
            continue
        sentences.append(sentence)

    return ' '.join(sentences)

def remove_tag(x):
    """Delete every `<...>` tag from `x` and return the remaining text."""
    return re.sub(r"<[^>]*?>", '', x)

# For SAINT
def goal_split(x):
    """Return the integer in front of the first '-' of a goal range string.

    Non-digit characters in that leading part are discarded before the
    conversion, e.g. '20001-21000' -> 20001.
    """
    leading_part = x.split('-', 1)[0]
    digits_only = re.sub('[^0-9]', '', leading_part)
    return int(digits_only)

def singular_mask(df, column, threshold):
    """Replace values of `column` seen fewer than `threshold` times by 'unknown'.

    `df` is modified in place and also returned.
    """
    frequency = df[column].value_counts()
    rare_values = frequency.loc[frequency.lt(threshold)].index
    is_rare = df[column].isin(rare_values)
    df.loc[is_rare, column] = 'unknown'
    return df

def test_cat_mask(df, column, unique_list):
    def cat_mask(x):
        if x not in unique_list[column]:
            x = 'unknown'
        return x
    df.loc[:,column] = df[column].map(cat_mask)
    return df

In [None]:
# ============ Reference: earlier SAINT-only loaders (redefined in the next cell) ============

def get_train_data(config):
    """Reference loader: read the training CSV and prepare SAINT inputs.

    Side effects on `config`: sets `unique_cat_list`, `categories`,
    `label_encoders` (only when categorical columns exist), `cat_dims`,
    `con_idxs` and `cat_idxs`.

    Args:
        config: object exposing `train_path` and `drop_columns`.

    Returns:
        (X, y, cat_dims, cat_idxs, con_idxs) where X is the feature frame
        without 'state', y is the 'state' column, `cat_dims` holds a leading 1
        followed by the class count of each categorical column, and the two
        index lists give the positions of categorical / remaining columns in X.
    """
    train_df = pd.read_csv(config.train_path)
    # Preprocessing
    train_df['goal_min'] = train_df['goal'].map(goal_split)
    # Categorical values that are too rare are collapsed into 'unknown'.
    threshold = 10
    train_df = singular_mask(train_df, 'category2', threshold)
    counts = train_df['category2'].value_counts()
    # singular_mask keeps every value with count >= threshold, so the list of
    # known categories must use >= as well. With a strict > a category seen
    # exactly `threshold` times stayed in train but became 'unknown' in test.
    unique_cat_list = {'category2': counts[counts >= threshold].index.values}
    config.unique_cat_list = unique_cat_list
    if len(config.drop_columns) > 0:  # drop columns the model must not see
        train_df = train_df.drop(config.drop_columns, axis=1)
    # Label encoding + registration of the categorical columns
    config.categories = train_df.columns[train_df.dtypes=="object"].values
    cat_dims = []
    if len(config.categories)>0:
        label_encoders = {}
        for c in config.categories:
            print(c)
            encoder = LabelEncoder()
            train_df[c] = encoder.fit_transform(train_df[c])
            label_encoders[c] = encoder
            cat_dims.append(len(encoder.classes_))
        config.label_encoders = label_encoders
    X = train_df.drop('state', axis=1)
    y = train_df['state']
    categories = list(config.categories)
    cat_idxs = [ i for i, c in enumerate(X.columns) if c in categories]
    # sorted(): do not rely on set iteration order for the column positions.
    con_idxs = sorted(set(range(len(X.columns))) - set(cat_idxs))
    cat_dims = np.append(np.array([1]),np.array(cat_dims)).astype(int)
    config.cat_dims = cat_dims
    config.con_idxs = con_idxs
    config.cat_idxs = cat_idxs
    return X, y, cat_dims, cat_idxs, con_idxs

def get_test_data(config):
    """Reference loader: read the test CSV and apply the train-time SAINT preprocessing.

    Relies on `unique_cat_list`, `drop_columns`, `categories` and (when there
    are categorical columns) `label_encoders` being present on `config`.

    Returns:
        The preprocessed test DataFrame.
    """
    frame = pd.read_csv(config.test_path)
    # Preprocessing
    frame['goal_min'] = frame['goal'].map(goal_split)
    frame = test_cat_mask(frame, 'category2', config.unique_cat_list)
    if len(config.drop_columns) > 0:
        frame = frame.drop(columns=config.drop_columns)
    # Label encoding with the encoders stored on the config
    for col in config.categories:
        print(col)
        encoder = config.label_encoders[col]
        frame[col] = encoder.transform(frame[col])
    return frame

# ================================ End of reference ================================

In [None]:
def get_train_data(config):
    """Read the training CSV and add the columns needed by DeBERTa and SAINT.

    Adds 'description' (COL_NAMES joined by the tokenizer's separator token),
    'clean_description' (tag-stripped text), and 'goal_min'; rare 'category2'
    values are replaced by 'unknown'.

    Side effect: stores the list of retained 'category2' values on
    `config.unique_cat_list` so the same mapping can be applied to test data.

    Args:
        config: object exposing `train_path` and `deberta_config.tokenizer`.

    Returns:
        The augmented training DataFrame.
    """
    train_df = pd.read_csv(config.train_path)

    # DeBERTa
    sep_token = config.deberta_config.tokenizer.sep_token
    first_col = train_df[COL_NAMES[0]].fillna('NAN').astype(str)
    other_cols = train_df[COL_NAMES[1:]].fillna('NAN').astype(str)
    # Original author's note on this line: needs revision.
    train_df['description'] = first_col.str.cat(other_cols, sep=sep_token)
    train_df['clean_description'] = train_df['description'].map(text_cleaning)

    # SAINT
    train_df['goal_min'] = train_df['goal'].map(goal_split)
    threshold = 10
    train_df = singular_mask(train_df, 'category2', threshold)  # creates 'unknown'
    counts = train_df['category2'].value_counts()
    # singular_mask keeps values with count >= threshold, so use >= here too;
    # a strict > dropped categories seen exactly `threshold` times.
    unique_cat_list = {'category2': counts[counts >= threshold].index.values}
    # Previously this dict was computed and then discarded; persist it.
    config.unique_cat_list = unique_cat_list

    return train_df

def get_test_data(config):
    """Read the test CSV and add the columns needed by DeBERTa and SAINT.

    Adds 'description', 'clean_description' and 'goal_min'. When `config`
    carries a `unique_cat_list` attribute, 'category2' values missing from it
    are replaced by 'unknown'; without it, 'category2' is left untouched.

    Args:
        config: object exposing `test_path` and `deberta_config.tokenizer`.

    Returns:
        The augmented test DataFrame.
    """
    test_df = pd.read_csv(config.test_path)

    # DeBERTa (the text is built before category2 is masked, as for train)
    sep_token = config.deberta_config.tokenizer.sep_token
    first_col = test_df[COL_NAMES[0]].fillna('NAN').astype(str)
    other_cols = test_df[COL_NAMES[1:]].fillna('NAN').astype(str)
    # Original author's note on this line: needs revision.
    test_df['description'] = first_col.str.cat(other_cols, sep=sep_token)
    test_df['clean_description'] = test_df['description'].map(text_cleaning)

    # SAINT: apply the same tabular preprocessing as the training frame so
    # both frames expose the same columns.
    test_df['goal_min'] = test_df['goal'].map(goal_split)
    unique_cat_list = getattr(config, 'unique_cat_list', None)
    if unique_cat_list is not None:
        test_df = test_cat_mask(test_df, 'category2', unique_cat_list)

    return test_df

# Instantiate the experiment config and load both frames. Train must be loaded
# first: the reference loaders show test-time masking depends on state
# captured from the training data.
multi_config = Multi_Config()
train_df = get_train_data(multi_config)
test_df = get_test_data(multi_config)

In [None]:
# Preview the training frame.
# NOTE(review): the saved output below has no 'goal_min' column although
# get_train_data adds one - the output looks stale; re-run this cell.
train_df.head(3)

Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state,description,clean_description
0,train_00000,20001-21000,US,45,art,mixed media,"<div class=""contents""><div><p><a href=""http://...",1,20001-21000[SEP]45[SEP]US[SEP]art[SEP]mixed me...,20001-21000[SEP]45[SEP]US[SEP]art[SEP]mixed me...
1,train_00001,19001-20000,US,59,food,restaurants,"<div class=""contents""><div><p>Cultural Pretzel...",0,19001-20000[SEP]59[SEP]US[SEP]food[SEP]restaur...,19001-20000[SEP]59[SEP]US[SEP]food[SEP]restaur...
2,train_00002,2001-3000,US,38,art,performance art,"<div class=""contents""><div><p>I want to perfor...",0,2001-3000[SEP]38[SEP]US[SEP]art[SEP]performanc...,2001-3000[SEP]38[SEP]US[SEP]art[SEP]performanc...


In [None]:
# Preview the test frame (no 'state' column, since labels are not provided).
test_df.head(3)

Unnamed: 0,id,goal,country,duration,category1,category2,html_content,description,clean_description
0,test_00000,5001-6000,FR,30,dance,performances,"<div class=""contents""><div><p>Bonjour ,</p><p>...",5001-6000[SEP]30[SEP]FR[SEP]dance[SEP]performa...,5001-6000[SEP]30[SEP]FR[SEP]dance[SEP]performa...
1,test_00001,6001-7000,GB,23,publishing,children's books,"<div class=""contents""><div><p><span class=""bol...",6001-7000[SEP]23[SEP]GB[SEP]publishing[SEP]chi...,6001-7000[SEP]23[SEP]GB[SEP]publishing[SEP]chi...
2,test_00002,6001-7000,GB,30,theater,plays,"<div class=""contents""><div><p>COW is a rural t...",6001-7000[SEP]30[SEP]GB[SEP]theater[SEP]plays[...,6001-7000[SEP]30[SEP]GB[SEP]theater[SEP]plays[...


# Training

# Run