# import

In [1]:
# tabularとNLP 両方, それこそマルチモーダルにするか???
# SAINT + DeBERTa → 情報抽出 → 数層のMLP

# 順番的には
# 1. html contentを無視した lightgbm baseline
# 2. html contentのみを用いたnaive baise baseline
# 3. SAINTの実装
# 4. DeBERTa等, 自然言語モデルの実装
# 5. 3-4よりMultimodal化

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=False)
%cd /content/drive/MyDrive/_MUFG_student

Mounted at /content/drive
/content/drive/MyDrive/_MUFG_student


In [3]:
%%capture
!pip install transformers datasets sentencepiece torchmetrics

In [4]:
# base
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import glob
import random
import shutil

# others
import os
import warnings
warnings.simplefilter('ignore')

# main
import gc
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchmetrics import F1Score
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler

from datasets import load_dataset
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, confusion_matrix

import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import AdamW, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup

In [5]:
!ls

data  figure  outputs  saint  src


# configration

In [6]:

class DeBERTa_Config:

    # private
    _exp_num = '003'

    # 学習param
    seed = 0
    model_name = "microsoft/deberta-v3-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # model_name = "microsoft/deberta-v2-xlarge"
    # model_name = "microsoft/deberta-v3-base"
    # model_name = 'microsoft/deberta-base'
    # model_name = 'roberta-base'
    # model_name = 'roberta-large'
    # model_name = 'roberta-large-mnli'
    # model_name = 'xlnet-large-cased'
    # model_name = 'albert-xxlarge-v2'
    # model_name = "microsoft/deberta-large"
    # model_name = "microsoft/deberta-v3-large" # bs=4
    # model_name = 'microsoft/deberta-v2-xlarge'
    # model_name = 'funnel-transformer/large'
    # model_name = 'funnel-transformer/medium'
    # model_name = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'
    # model_name = 'bert-base-uncased'
    # model_name = 'albert-base-v2'
    # model_name = 'albert-large-v2'
    # model_name = 'google/electra-large-discriminator'
    # model_name = 'google/electra-base-discriminator'
    # model_name = "facebook/bart-large-mnli"
    # model_name = "facebook/bart-large"
    # model_name = "facebook/bart-base"

    # deberta config
    num_fold = 5
    batch_size = 16# 32
    n_epochs = 7
    max_token_len = 512# 256
    lr = 2e-5
    weight_decay = 0.01
    beta = (0.9, 0.98)
    num_warmup_steps_rate = 0.01
    gradient_accumulation_steps = 2
    num_eval = 1
    num_msd = 8
    
    # 保存先
    save_folder_name = f'Exp{_exp_num}_{model_name}'
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
def setup(config):
    print("### Configration Setup...")

    config.output_path = './outputs'
    config.experiment_path = os.path.join(config.output_path, config.save_folder_name)
    print(f'    experiment_path  >> {config.experiment_path}')
    config.model_save_path = os.path.join(config.experiment_path, 'model')
    print(f'    model_save_path >> {config.model_save_path}')
    config.figure_save_path = os.path.join(config.experiment_path, 'figure')
    print(f'    figure_save_path >> {config.figure_save_path}')
    config.preds_save_path = os.path.join(config.experiment_path, 'preds')
    print(f'    preds_save_path >> {config.preds_save_path}')
    
    for d in [config.output_path, config.experiment_path, config.model_save_path, config.figure_save_path, config.preds_save_path]:
        os.makedirs(d, exist_ok=True)

    print("### Setup Complete. \n")
    return config

# lgb_config = setup(LightGBM_Config())
# deberta_config = setup(DeBERTa_Config())

Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Main

In [7]:
def text_cleaning(batch):
    clean_texts = []
    for text in batch["description"]:
        clean_lines = []
        lines = text.split(r"</li>")
        for line in lines:
            clean_line = remove_tag(line)
            
            clean_line = re.sub('\n', '', clean_line)
            '''clean_line = re.sub(r' &amp; ', ' and ', line)
            clean_line = re.sub(r'&amp;', '', clean_line)

            clean_line = re.sub(r'&lt;.*?style.*?&gt;', '', clean_line)
            clean_line = re.sub(r'&lt;', ' less than ', clean_line)
            clean_line = re.sub(r'&gt;', ' more than ', clean_line)
            clean_line = re.sub(r'\\u202f', '', clean_line)
            clean_line = re.sub(r'\\xa0', '', clean_line)
            clean_line = re.sub(r'\\', '', clean_line)

            clean_line = re.sub(r'http?://[\w/:%#\$&\?\(\)~\.=\+\-]+', '', clean_line)
            clean_line = re.sub(r'https?://[\w/:%#\$&\?\(\)~\.=\+\-]+', '', clean_line)'''

            clean_line = clean_line.strip()
            clean_line = clean_line + ('' if clean_line.endswith('.') else '.')
            if len(clean_line)!=1:
                clean_lines.append(clean_line)
        clean_texts.append(' '.join(clean_lines))
    return {"clean_description": clean_texts}

def remove_tag(x):
    p = re.compile(r"<[^>]*?>")
    return p.sub('',x)

In [8]:

# id
# goal duration
# country category1 category2 html_content
# state
COL_NAMES = ['goal', 'duration', 'country', 'category1', 'category2', 'html_content']

from datasets import Dataset
def get_train_data(train_path, config):
    train_df = pd.read_csv(train_path)
    train_df['description'] = train_df[COL_NAMES[0]].fillna('NAN').astype(str).str.cat(train_df[COL_NAMES[1:]].fillna('NAN').astype(str), sep=config.tokenizer.sep_token)# 要修正
    train_ds = Dataset.from_pandas(train_df)
    train_ds = train_ds.map(text_cleaning, batched=True, batch_size=None)
    return train_ds

def get_test_data(test_path, config):
    test_df = pd.read_csv(test_path)
    test_df['description'] = test_df[COL_NAMES[0]].fillna('NAN').astype(str).str.cat(test_df[COL_NAMES[1:]].fillna('NAN').astype(str), sep=config.tokenizer.sep_token)# 要修正
    test_ds = Dataset.from_pandas(test_df)
    test_ds = test_ds.map(text_cleaning, batched=True, batch_size=None)
    return test_ds

In [9]:
class CrowdFunding_Dataset(Dataset):
    def __init__(self, config, descriptions, labels=None, mode=None, aug_data=False):
        self.config = config
        self.tokenizer = config.tokenizer
        self.max_token_len = config.max_token_len
        
        self.descriptions = descriptions
        self.labels = labels

    def __len__(self):
        return len(self.descriptions)

    def __getitem__(self, index):
        description = self.descriptions[index]
        # print(description)
        encoded = self.tokenizer.encode_plus(description,
                                                add_special_tokens=True,
                                                return_tensors='pt',
                                                truncation=True,
                                                padding='max_length',
                                                max_length=self.max_token_len,
                                                return_attention_mask = True)
        if self.labels is not None:
            label = torch.tensor(self.labels[index], dtype=torch.int64)
            return encoded.input_ids.flatten(), encoded.attention_mask.flatten(), label
        else:
            return encoded.input_ids.flatten(), encoded.attention_mask.flatten()

class CrowdFunding_Model(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        cfg = AutoConfig.from_pretrained(
            config.model_name,
            output_hidden_states=True
        )
        self.pretrained_model = AutoModel.from_pretrained(config.model_name, config=cfg)

        self.hidden = torch.nn.Linear(self.pretrained_model.config.hidden_size, int(self.pretrained_model.config.hidden_size/2))
        torch.nn.init.xavier_uniform_(self.hidden.weight)

        self.classifier = torch.nn.Linear(int(self.pretrained_model.config.hidden_size/2), 1)
        torch.nn.init.xavier_uniform_(self.classifier.weight)
        
        self.loss_func = nn.BCEWithLogitsLoss()
        # self.f1_func = F1Score(num_classes=config.n_labels, average="macro")
        self.logits_dropouts = nn.ModuleList([nn.Dropout(p=0.1) for _ in range(config.num_msd)])
    

    def forward(self, input_ids, attention_mask, labels=None):

        output = self.pretrained_model(input_ids=input_ids, attention_mask=attention_mask)["last_hidden_state"]
        output = output[:, 0, :]
        print(output.size())
        pooled_output = sum([self.hidden(dropout(output)) for dropout in self.logits_dropouts]) / self.config.num_msd
        pooled_output = F.relu(pooled_output)
        logits = sum([self.classifier(dropout(pooled_output)) for dropout in self.logits_dropouts]) / self.config.num_msd

        if labels is not None:
            loss = self.loss_func(logits.view(-1), labels.view(-1).to(torch.float32))
            with torch.no_grad():
                # self.f1_func
                pass
                # F1の計算??
            return logits, loss
        else:
            return logits

In [10]:
def training(config, train_ds):

    folds = StratifiedKFold(n_splits=config.num_fold)
    splits = folds.split(np.zeros(train_ds.num_rows), train_ds["state"])

    oof_pred = np.zeros((len(train_ds), 1), dtype=np.float32)
    fold_num = np.zeros(len(train_ds), dtype=np.int32)

    for fold, (train_idxs, val_idxs) in enumerate(splits):

        print(f'\n# ==== start fold{fold+1} ==== #\n')
        train_fold_ds = train_ds.select(train_idxs)
        valid_fold_ds = train_ds.select(val_idxs)

        # debug(valid_fold_ds)# ========================

        train_dataset = CrowdFunding_Dataset(config, train_fold_ds["clean_description"], train_fold_ds["state"], aug_data=True)# mode="Train"
        valid_dataset = CrowdFunding_Dataset(config, valid_fold_ds["clean_description"], valid_fold_ds["state"])

        train_loader = DataLoader(
            dataset=train_dataset, 
            batch_size=config.batch_size, 
            shuffle=True,
            drop_last=True
        )
        valid_loader = DataLoader(
            dataset=valid_dataset,
            batch_size=config.batch_size,
            shuffle=False,
            drop_last=False
        )

        best_val_preds = None
        best_val_f1_score = -1

        model = CrowdFunding_Model(config)
        model.to(config.device)

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = []
        optimizer_grouped_parameters.append({
            'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 
            'weight_decay': config.weight_decay
        })
        optimizer_grouped_parameters.append({
            'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 
            'weight_decay': 0.0
        })
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=config.lr,
            betas=config.beta,
            weight_decay=config.weight_decay,
        )

        num_train_optimization_steps = int(
            len(train_loader) * config.n_epochs // config.gradient_accumulation_steps
        )
        num_warmup_steps = int(num_train_optimization_steps * config.num_warmup_steps_rate)
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_train_optimization_steps
        )
        num_eval_step = len(train_loader) // config.num_eval + config.num_eval

        for epoch in range(config.n_epochs):
            # training
            print(f"# ============ start epoch:{epoch+1} ============== #")
            model.train()
            train_preds = []
            train_labels = []
            val_losses_batch = []
            scaler = GradScaler()
            with tqdm(train_loader, total=len(train_loader)) as pbar:
                for step, (input_ids, attention_mask, labels) in enumerate(pbar):

                    input_ids = input_ids.to(config.device)
                    attention_mask = attention_mask.to(config.device)
                    labels = labels.to(config.device)

                    optimizer.zero_grad()
                    with autocast():
                        output, loss = model(input_ids, attention_mask, labels)
                    pbar.set_postfix({
                        'loss': loss.item(),
                        'lr': scheduler.get_lr()[0]
                    })

                    if config.gradient_accumulation_steps>1:
                        loss = loss/config.gradient_accumulation_steps
                    scaler.scale(loss).backward()

                    if (step+1)%config.gradient_accumulation_steps == 0:
                        scaler.step(optimizer)
                        scaler.update()
                        scheduler.step()
                    
                    output = output.sigmoid().detach().cpu().numpy()
                    train_preds.append(output)
                    train_labels.append(labels.detach().cpu().numpy())

            train_preds = np.concatenate(train_preds)
            train_labels = np.concatenate(train_labels)
            train_f1_score = f1_score(np.round(train_preds), train_labels)
            print(f'Training score: {train_f1_score}')

            # evaluating
            val_preds = []
            val_labels = []
            val_losses = []
            val_nums = []
            model.eval()
            with torch.no_grad():
                with tqdm(valid_loader, total=len(valid_loader)) as pbar:
                    for (input_ids, attention_mask, labels) in pbar:

                        input_ids = input_ids.to(config.device)
                        attention_mask = attention_mask.to(config.device)
                        labels = labels.to(config.device)
                        with autocast():
                            output, loss = model(input_ids, attention_mask, labels)
                        output = output.sigmoid().detach().cpu().numpy()
                        val_preds.append(output)
                        val_labels.append(labels.detach().cpu().numpy())
                        val_losses.append(loss.item() * len(labels))
                        val_nums.append(len(labels))
                        pbar.set_postfix({
                            'val_loss': loss.item()
                        })

            val_preds = np.concatenate(val_preds)
            val_labels = np.concatenate(val_labels)
            val_loss = sum(val_losses) / sum(val_nums)
            val_f1_score = f1_score(np.round(val_preds), val_labels)

            val_log = {
                'val_loss': val_loss,
                'val_f1_score': val_f1_score,
            }
            display(val_log)

            if best_val_f1_score < val_f1_score:
                print("\n [ save model weight ] \n")
                best_val_preds = val_preds
                best_val_f1_score = val_f1_score
                torch.save(
                    model.state_dict(), 
                    os.path.join(config.model_save_path, f"fold{fold}.pth")
                )
            
        oof_pred[val_idxs] = best_val_preds.astype(np.float32)
        fold_num[val_idxs] = fold+1
        del model; gc.collect()

    # scoring
    score = f1_score(np.round(oof_pred), train_ds['state'])
    print('CV:', round(score, 5))
    return score, oof_pred, fold_num

In [11]:
def inferring(config, test_ds):
    config.model_weights = [p for p in sorted(glob.glob(os.path.join(config.model_save_path, 'fold*.pth')))]
    sub_pred = np.zeros((len(test_ds), 1), dtype=np.float32)
    print(sub_pred.shape)
    for fold, model_weight in enumerate(config.model_weights):
        test_dataset = CrowdFunding_Dataset(config,  test_ds["clean_description"])
        test_loader = DataLoader(
            dataset=test_dataset, 
            batch_size=config.batch_size, 
            shuffle=False,
            drop_last=False
        )
        model = CrowdFunding_Model(config)
        model.load_state_dict(torch.load(model_weight))
        model.to(config.device)

        model.eval()
        fold_pred = []
        with torch.no_grad():
            with tqdm(test_loader, total=len(test_loader)) as pbar:
                for (input_ids, attention_mask) in pbar:

                    input_ids = input_ids.to(config.device)
                    attention_mask = attention_mask.to(config.device)
                    with autocast():
                        output = model(input_ids, attention_mask)
                    output = output.sigmoid().detach().cpu().numpy()
                    fold_pred.append(output)
        fold_pred = np.concatenate(fold_pred)
        print(fold_pred.shape)
        # np.save(os.path.join(cfg.EXP_PREDS, f'sub_pred_fold{fold}.npy'), fold_pred)
        sub_pred += fold_pred / len(config.model_weights)
        del model; gc.collect()

    np.save(os.path.join(config.preds_save_path, 'sub_pred.npy'), sub_pred)
    return sub_pred# 返すのはprobability

In [12]:
def copy_scripts(config):
    scripts_save_path = os.path.join(config.experiment_path, 'scripts')
    os.makedirs(scripts_save_path, exist_ok=True)
    for script in glob.glob('./src/*.ipynb'):
        dst_file = os.path.join(scripts_save_path, script.split('/')[-1])
        print(f'[save file] {dst_file}')
        shutil.copyfile(script, dst_file)

In [13]:
def main():

    # config
    deberta_config = setup(DeBERTa_Config())
    train_path = './data/train.csv'
    test_path = './data/test.csv'
    submit_path = './data/sample_submit.csv'

    train_ds = get_train_data(train_path, deberta_config)
    test_ds = get_test_data(test_path, deberta_config)

    score, oof_pred, fold_num = training(deberta_config, train_ds)

    sub_pred = inferring(deberta_config, test_ds)
    sub = pd.read_csv(submit_path, header=None)
    sub[1] = np.round(sub_pred).astype(int)

    def fix_leak(sub, train_path, test_path):
        print("===== fix_leak =====")
        train_df = pd.read_csv(train_path)
        test_df = pd.read_csv(test_path)
        duplicated = pd.merge(test_df, train_df[['goal', 'country', 'duration', 'category1', 'category2', 'html_content', 'state']], on=['goal', 'country', 'duration', 'category1', 'category2', 'html_content'], how="left")
        duplicated = duplicated[~duplicated["state"].isnull()]
        for i in duplicated.index:
            print(f'Fix index{i}: {sub.loc[i,1]}')
            sub.loc[i, 1] = int(duplicated.loc[i, "state"])
            print(f'To {sub.loc[i,1]}')
        return sub
    sub = fix_leak(sub, train_path, test_path)

    # 提出用ファイル
    sub.to_csv(os.path.join(deberta_config.preds_save_path, f'Exp{deberta_config._exp_num}_CV{int(score*(10**10))}_submission.csv'), index=False, header=False)

    # scriptの保存
    copy_scripts(deberta_config)

    return oof_pred, fold_num# debug

# run

In [None]:
oof_pred, fold_num = main()

### Configration Setup...
    experiment_path  >> ./outputs/Exp003_microsoft/deberta-v3-base
    model_save_path >> ./outputs/Exp003_microsoft/deberta-v3-base/model
    figure_save_path >> ./outputs/Exp003_microsoft/deberta-v3-base/figure
    preds_save_path >> ./outputs/Exp003_microsoft/deberta-v3-base/preds
### Setup Complete. 



  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]


# ==== start fold1 ==== #



Downloading pytorch_model.bin:   0%|          | 0.00/354M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/489 [00:00<?, ?it/s]

torch.Size([16, 768])
torch.Size([16, 768])
torch.Size([16, 768])
torch.Size([16, 768])
torch.Size([16, 768])
torch.Size([16, 768])
torch.Size([16, 768])
torch.Size([16, 768])
torch.Size([16, 768])
torch.Size([16, 768])
torch.Size([16, 768])
torch.Size([16, 768])
torch.Size([16, 768])
torch.Size([16, 768])
torch.Size([16, 768])
torch.Size([16, 768])
torch.Size([16, 768])
torch.Size([16, 768])
torch.Size([16, 768])
torch.Size([16, 768])
torch.Size([16, 768])
torch.Size([16, 768])
torch.Size([16, 768])
torch.Size([16, 768])
torch.Size([16, 768])
torch.Size([16, 768])


# others

In [None]:
def check_leak():
    train_path = './data/train.csv'
    test_path = './data/test.csv'
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    all_dup = pd.concat([train_df, test_df])
    all_dup = all_dup[all_dup[['goal', 'country', 'duration', 'category1', 'category2', 'html_content']].duplicated(keep=False)]
    duplicated = pd.merge(test_df, train_df[['goal', 'country', 'duration', 'category1', 'category2', 'html_content', 'state']], on=['goal', 'country', 'duration', 'category1', 'category2', 'html_content'], how="left")
    duplicated = duplicated[~duplicated["state"].isnull()]
    return all_dup, duplicated
all_dup, dup = check_leak()
all_dup.head(50)

In [None]:
dup

In [None]:
all_dup.loc[365, :]

In [None]:
all_dup.loc[8232, :]