In [None]:
!nvidia-smi

In [None]:
#!pip install -q kaggle
!mkdir -p .kaggle
!cp "./kaggle.json" .kaggle/
!chmod 600 .kaggle/kaggle.json
!cp -r .kaggle /root

!kaggle -v

In [None]:
!pip install iterative-stratification==0.1.7

In [None]:
#!pip install accelerator

In [None]:
#!kaggle datasets download -d takamichitoda/fb3-deberta-v3-pseudo
#!kaggle kernels output takamichitoda/fb3-pseudo-train-3rd-fold-1-only
#!mkdir -p pseudo_base
#!unzip fb3-deberta-v3-pseudo.zip
#!rm -rf fb3-deberta-v3-pseudo.zip microsoft-deberta-v3-base_seed0_fold1_best.pth fb3-pseudo-train-3rd-fold-1-only.log
#!mv microsoft-deberta-v3-base_seed* pseudo_base

In [None]:
#!kaggle competitions download -c feedback-prize-english-language-learning
#!unzip feedback-prize-english-language-learning.zip
#!mkdir feedback-prize-english-language-learning
#!mv sample_submission.csv test.csv train.csv feedback-prize-english-language-learning/
#!rm -rf feedback-prize-english-language-learning.zip

In [None]:
#!mkdir fb3-make-pseudo-label-4th
#!kaggle kernels output takamichitoda/fb3-make-pseudo-label-4th -p fb3-make-pseudo-label-4th/

In [None]:
import json
import gc
import os
import random
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.metrics import mean_squared_error

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW
from torch.utils.checkpoint import checkpoint

import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup

#from accelerate import notebook_launcher
#from accelerate.utils import set_seed
#from accelerate import Accelerator

# Global runtime handles shared by the training code below.
# `is_gpu` toggles mixed precision; `device` is where batches and the model are moved.
is_gpu = torch.cuda.is_available()
device = torch.device('cuda' if is_gpu else 'cpu')
# Gradient scaler for AMP; constructed disabled when no CUDA device is available.
# It is read as a global by train_fn and passed to AWP in train_loop.
scaler = torch.cuda.amp.GradScaler(enabled=is_gpu)

%env TOKENIZERS_PARALLELISM=true
print(device)
print(f"transformers.__version__: {transformers.__version__}")

In [None]:
class CFG:
    """Experiment configuration: paths, model choice and training hyper-parameters.

    All values are read as class attributes (``CFG.X``) by the cells below;
    the class is not instantiated anywhere in the visible notebook.
    """
    # Experiment tag; used to build OUTPUT and the Kaggle dataset title.
    EXP = "exp143"
    # Directory containing train.csv (absolute path on the training machine).
    INPUT = "/home/jupyter/feedback-prize-english-language-learning"
    #OUTPUT = "/home/jupyter/output/"
    # Directory that receives the tokenizer, model config, checkpoints and score CSVs.
    OUTPUT = f"/home/jupyter/{EXP}/"
    # Seed for the MultilabelStratifiedKFold split.
    SEED = 0
    # Number of cross-validation folds.
    N_FOLD = 4
    # Regression target columns read from the training dataframe.
    TARGETS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    
    #MODEL_NAME = "microsoft/deberta-v3-large"
    # Hugging Face model id used for the tokenizer, config and encoder weights.
    MODEL_NAME = "microsoft/deberta-v3-base"
    # Filled in later with the loaded tokenizer instance (see the tokenizer cell).
    TOKENIZER = None
    # Maximum token length passed to the tokenizer (pad / truncate target).
    MAX_LEN = 1428
    # When True, CustomModel enables gradient checkpointing on the encoder.
    GRAD_CHECKPOINT = True
    
    # Number of training epochs per fold.
    N_EPOCH = 4 # 2
    # DataLoader worker processes.
    N_WORKER = 2
    # Learning rate for the encoder parameter groups.
    ENCODER_LR = 5e-6 # 2e-5
    # Learning rate for the non-encoder (head) parameter group.
    DECODER_LR = 2e-5
    # AdamW epsilon and betas.
    EPS = 1e-6
    BETAS = (0.9, 0.999)
    # Weight decay handed to the decaying encoder parameter group.
    WEIGHT_DECAY = 0.01
    # Warm-up steps and number of cosine cycles for the LR scheduler.
    N_WARMUP = 0
    N_CYCLES = 0.5
    
    # Batch size for both the train and validation loaders.
    BS = 8 #2 #4
    # Gradient-accumulation factor (name is spelled this way throughout the notebook).
    ACCUMLATION = 1 # 4 #2
    
    # Max norm passed to clip_grad_norm_ in train_fn.
    GRAD_NORM = 0.1
    
    # AWP settings forwarded to the AWP constructor: clamp ratio, step size, first active epoch.
    ADV_EPS = 1e-4
    ADV_LR = 1e-4
    ADV_START = 2
    
    # FGM settings: perturbation size, and the epoch from which FGM is no longer applied.
    FGM_EPS = 1e-1
    FGM_END = float("inf")
    
    # Folds that main() skips.
    SKIP_FOLDS = [1,2,3]
    # Seed passed to main() by the entry-point cell.
    LOCAL_SEED = 0

In [None]:
# Load the competition training set from CFG.INPUT and preview the first rows.
train_df = pd.read_csv(f"{CFG.INPUT}/train.csv")
train_df.head()

In [None]:
# Assign every training row to a validation fold, stratifying on the target columns.
# NOTE: this writes a new 'fold' column into train_df in place.
cv = MultilabelStratifiedKFold(n_splits=CFG.N_FOLD, shuffle=True, random_state=CFG.SEED)
for n, (train_index, valid_index) in enumerate(cv.split(train_df, train_df[CFG.TARGETS])):
    # Rows in valid_index are held out when training fold n.
    train_df.loc[valid_index, 'fold'] = int(n)
# Make sure the fold id column has an integer dtype.
train_df['fold'] = train_df['fold'].astype(int)

In [None]:
# Load the tokenizer for CFG.MODEL_NAME, save a copy under the output directory,
# and expose it to the rest of the notebook through CFG.TOKENIZER.
TOKENIZER = AutoTokenizer.from_pretrained(CFG.MODEL_NAME)
TOKENIZER.save_pretrained(CFG.OUTPUT+'tokenizer/')
CFG.TOKENIZER = TOKENIZER
# Drop the module-level name so CFG.TOKENIZER is the only reference used later.
del TOKENIZER

In [None]:
class FB3Dataset(Dataset):
    """Dataset yielding ``(tokenized_inputs, label)`` pairs for one dataframe.

    Args:
        df: dataframe with a ``full_text`` column and the ``CFG.TARGETS`` columns.

    Each item is tokenized on the fly with ``CFG.TOKENIZER``, padded/truncated to
    ``CFG.MAX_LEN``, and returned as a mapping of ``torch.long`` tensors together
    with a ``torch.float`` label tensor built from the target columns.
    """

    def __init__(self, df):
        self.texts = df['full_text'].values
        self.labels = df[CFG.TARGETS].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = CFG.TOKENIZER.encode_plus(
            self.texts[item],
            return_tensors=None,
            add_special_tokens=True,
            max_length=CFG.MAX_LEN,
            # `pad_to_max_length=True` is the deprecated spelling of this option.
            padding='max_length',
            truncation=True
        )
        # Convert every tokenizer output (ids, masks, ...) into a long tensor.
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label
    

def collate(inputs):
    """Trim a padded batch to the longest real sequence it contains.

    The longest row of ``attention_mask`` (by sum) decides the new sequence
    length; every entry of ``inputs`` is sliced to that length along dim 1.
    The mapping is modified in place and also returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :longest]
    return inputs

In [None]:
class MeanPooling(nn.Module):
    """Attention-mask-weighted mean over the sequence dimension (dim 1)."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Average ``last_hidden_state`` over positions where the mask is non-zero.

        The mask is broadcast to the hidden-state shape; the denominator is
        clamped to at least 1e-9 so fully masked rows do not divide by zero.
        """
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = torch.sum(last_hidden_state * weights, 1)
        token_count = torch.clamp(weights.sum(1), min=1e-9)
        return masked_total / token_count

class CustomModel(nn.Module):
    """Pretrained encoder + mean pooling + a 6-output linear regression head.

    The encoder and its config are loaded from ``CFG.MODEL_NAME`` with all
    dropout settings forced to zero; gradient checkpointing is switched on when
    ``CFG.GRAD_CHECKPOINT`` is true.
    """

    def __init__(self):
        super().__init__()

        cfg = AutoConfig.from_pretrained(CFG.MODEL_NAME, output_hidden_states=True)
        # Disable every dropout knob on the config before building the encoder.
        for dropout_attr in (
            "hidden_dropout",
            "hidden_dropout_prob",
            "attention_dropout",
            "attention_probs_dropout_prob",
        ):
            setattr(cfg, dropout_attr, 0.)
        self.config = cfg

        self.model = AutoModel.from_pretrained(CFG.MODEL_NAME, config=self.config)
        if CFG.GRAD_CHECKPOINT:
            self.model.gradient_checkpointing_enable()

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Re-initialise a Linear / Embedding / LayerNorm module in place.

        Linear and Embedding weights are drawn from N(0, initializer_range);
        Linear biases and the Embedding padding row are zeroed; LayerNorm is
        reset to weight 1, bias 0. Other module types are left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, inputs):
        """Encode ``inputs``, mean-pool the first encoder output, apply the head."""
        encoder_out = self.model(**inputs)
        pooled = self.pool(encoder_out[0], inputs['attention_mask'])
        return self.fc(pooled)

In [None]:
class AWP:
    """Adversarial weight perturbation helper used inside the training step.

    After a normal backward pass has populated gradients, ``attack_backward``
    nudges the selected parameters along their gradient direction, runs another
    forward/backward on the perturbed weights, and then restores the originals.

    Args:
        model: module whose parameters are perturbed.
        optimizer: optimizer whose gradients are cleared before the adversarial backward.
        criterion: callable ``criterion(y_preds, labels)`` returning the loss.
        adv_param: substring a parameter name must contain to be perturbed.
        adv_lr: step size of the perturbation; 0 disables the attack.
        adv_eps: relative bound; each weight stays within
            ``original +/- adv_eps * |original|``.
        start_epoch: first epoch (inclusive) at which the attack runs.
        adv_step: number of perturb/forward/backward iterations per call.
        scaler: gradient scaler used for the adversarial backward pass.
    """
    def __init__(
        self,
        model,
        optimizer,
        criterion,
        adv_param="weight",
        adv_lr=1,
        adv_eps=0.2,
        start_epoch=0,
        adv_step=1,
        scaler=None
    ):
        self.model = model
        self.optimizer = optimizer
        self.criterion = criterion
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.start_epoch = start_epoch
        self.adv_step = adv_step
        # name -> clone of the unperturbed parameter data
        self.backup = {}
        # name -> (lower bound, upper bound) tensors used to clamp the perturbed data
        self.backup_eps = {}
        self.scaler = scaler

    def attack_backward(self, inputs, labels, epoch):
        """Run the adversarial forward/backward pass for one batch.

        Returns None without doing anything when ``adv_lr`` is 0 or ``epoch``
        is before ``start_epoch``. Expects gradients from a preceding backward
        pass to already be present on the parameters.
        """
        if (self.adv_lr == 0) or (epoch < self.start_epoch):
            return None

        self._save() 
        for i in range(self.adv_step):
            self._attack_step() 
            # NOTE(review): autocast is enabled unconditionally here, whereas train_fn
            # passes enabled=is_gpu for the clean forward pass.
            with torch.cuda.amp.autocast():
                y_preds = self.model(inputs)
                adv_loss = self.criterion(y_preds, labels)
                #adv_loss = hinge_loss(y_preds, labels, 0.025)
                
            #y_preds = self.model(inputs)
            #adv_loss = self.criterion(y_preds, labels)
            #accelerator.backward(adv_loss)
            
            # Gradients present so far (including those of the clean loss) are cleared,
            # so after this call the parameters carry only the adversarial gradients.
            self.optimizer.zero_grad()
            self.scaler.scale(adv_loss).backward()
            
        self._restore()

    def _attack_step(self):
        """Perturb each selected parameter along its gradient, then clamp to the saved bounds."""
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                norm1 = torch.norm(param.grad)
                norm2 = torch.norm(param.data.detach())
                if norm1 != 0 and not torch.isnan(norm1):
                    # Step = adv_lr * (grad / ||grad||) * ||param||; e keeps the ratio finite.
                    r_at = self.adv_lr * param.grad / (norm1 + e) * (norm2 + e)
                    param.data.add_(r_at)
                    # Element-wise clamp into [backup - eps, backup + eps] recorded by _save().
                    param.data = torch.min(
                        torch.max(param.data, self.backup_eps[name][0]), self.backup_eps[name][1]
                    )
                # param.data.clamp_(*self.backup_eps[name])

    def _save(self):
        """Snapshot the selected parameters and precompute their clamp bounds."""
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.data.clone()
                    grad_eps = self.adv_eps * param.abs().detach()
                    self.backup_eps[name] = (
                        self.backup[name] - grad_eps,
                        self.backup[name] + grad_eps,
                    )

    def _restore(self,):
        """Put the saved parameter data back and clear both backup dictionaries."""
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}
        self.backup_eps = {}

In [None]:
# reference: https://www.kaggle.com/c/tweet-sentiment-extraction/discussion/143764
class FGM():
    """Fast Gradient Method: adversarial perturbation of embedding weights.

    Usage per batch: run the clean backward pass, call ``attack`` to shift the
    embedding weights along their gradient, run a second forward/backward on
    the perturbed weights, then call ``restore`` to put the originals back.

    Args:
        model: module whose parameters are perturbed.
    """
    def __init__(self, model):
        self.model = model
        # name -> clone of the unperturbed parameter data
        self.backup = {}

    def attack(self, epsilon=1., emb_name='word_embeddings'):
        """Compute the adversarial perturbation and add it to the embedding layer.

        Every trainable parameter whose name contains ``emb_name`` and that has
        a gradient is backed up and moved by ``epsilon * grad / ||grad||``.
        Parameters without a gradient are skipped instead of crashing.
        """
        for name, param in self.model.named_parameters():
            if not (param.requires_grad and emb_name in name):
                continue
            if param.grad is None:
                # No backward pass has reached this parameter; nothing to perturb.
                continue
            self.backup[name] = param.data.clone()
            norm = torch.norm(param.grad)
            if norm != 0:
                r_at = epsilon * param.grad / norm
                param.data.add_(r_at)

    def restore(self, emb_name='word_embeddings'):
        """Restore the embedding parameters that ``attack`` modified.

        All backed-up parameters matching ``emb_name`` are restored before the
        backup is cleared, so models with several matching parameters are
        handled correctly.
        """
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name and name in self.backup:
                param.data = self.backup[name]
        # Clear only after every parameter has been restored.
        self.backup = {}

In [None]:
class AverageMeter(object):
    """Tracks the latest value and the weighted running average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the last value, running average, weighted sum and sample count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the running average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count
        
def MCRMSE(y_trues, y_preds):
    """Mean column-wise root mean squared error.

    Args:
        y_trues: 2-D array-like of ground-truth values, one column per target.
        y_preds: 2-D array-like of predictions with the same shape.

    Returns:
        ``(mcrmse_score, scores)`` where ``scores`` is the list of per-column
        RMSE values and ``mcrmse_score`` is their mean.

    Raises:
        ValueError: if the two inputs do not have the same shape.

    RMSE is computed with numpy rather than
    ``mean_squared_error(..., squared=False)``, whose ``squared`` argument is
    deprecated and removed in recent scikit-learn releases.
    """
    y_trues = np.asarray(y_trues, dtype=np.float64)
    y_preds = np.asarray(y_preds, dtype=np.float64)
    if y_trues.shape != y_preds.shape:
        raise ValueError(
            f"shape mismatch: y_trues {y_trues.shape} vs y_preds {y_preds.shape}"
        )

    scores = [
        float(np.sqrt(np.mean((y_trues[:, i] - y_preds[:, i]) ** 2)))  # RMSE of column i
        for i in range(y_trues.shape[1])
    ]
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    """Return ``(mcrmse_score, per_column_scores)`` as computed by ``MCRMSE``."""
    return MCRMSE(y_trues, y_preds)


def seed_everything(seed=42):
    """Seed Python, numpy and torch (CPU and CUDA) RNGs with ``seed``.

    Also exports ``PYTHONHASHSEED`` and turns on deterministic cuDNN kernels.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True

In [None]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    """Build the three optimizer parameter groups for ``model``.

    Groups, in order:
      1. encoder (``model.model``) parameters that receive ``weight_decay``;
      2. encoder parameters whose name contains a no-decay marker (decay 0.0);
      3. every parameter of ``model`` whose name does not contain "model",
         trained at ``decoder_lr`` with decay 0.0.
    """
    no_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")

    def _skips_decay(name):
        return any(marker in name for marker in no_decay)

    encoder_decay, encoder_no_decay = [], []
    for name, param in model.model.named_parameters():
        bucket = encoder_no_decay if _skips_decay(name) else encoder_decay
        bucket.append(param)

    head_params = [param for name, param in model.named_parameters() if "model" not in name]

    return [
        {'params': encoder_decay, 'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': encoder_no_decay, 'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': head_params, 'lr': decoder_lr, 'weight_decay': 0.0},
    ]

In [None]:
from torch.optim import Optimizer

class PriorWD(Optimizer):
    """Optimizer wrapper that decays weights toward their initial ("prior") values.

    On construction the wrapped optimizer's own ``weight_decay`` is recorded per
    group and then set to 0, and a copy of every parameter is stored as its
    prior. On ``step`` (when ``use_prior_wd`` is true) each parameter is moved
    by ``-lr * weight_decay * (p - prior)`` before the wrapped optimizer steps.

    Args:
        optim: the optimizer to wrap; its ``param_groups`` are shared, not copied.
        use_prior_wd: apply the prior-based decay. When False no decay is
            applied at all, because the wrapped optimizer's decay was zeroed.
        exclude_last_group: if True, the last parameter group is decayed
            toward zero (plain weight decay) instead of toward its prior.
    """
    def __init__(self, optim, use_prior_wd=False, exclude_last_group=True):
        super(PriorWD, self).__init__(optim.param_groups, optim.defaults)
        # Share the wrapped optimizer's groups so LR schedulers act on both.
        self.param_groups = optim.param_groups
        self.optim = optim
        self.use_prior_wd = use_prior_wd
        self.exclude_last_group = exclude_last_group

        # Take over weight decay from the wrapped optimizer.
        self.weight_decay_by_group = []
        for group in self.param_groups:
            self.weight_decay_by_group.append(group["weight_decay"])
            group["weight_decay"] = 0

        # Snapshot of every parameter at wrap time, keyed by object identity.
        self.prior_params = {}
        for group in self.param_groups:
            for p in group["params"]:
                self.prior_params[id(p)] = p.detach().clone()

    def step(self, closure=None):
        """Apply the prior-based decay (if enabled), then step the wrapped optimizer.

        Returns whatever the wrapped optimizer's ``step`` returns.
        """
        if self.use_prior_wd:
            # Index of the last group; the previous comparison against
            # len(param_groups) could never be true.
            last_group = len(self.param_groups) - 1
            for i, group in enumerate(self.param_groups):
                step_size = -group["lr"] * self.weight_decay_by_group[i]
                for p in group["params"]:
                    if self.exclude_last_group and i == last_group:
                        # Plain decay toward zero: p += step_size * p
                        p.data.mul_(1.0 + step_size)
                    else:
                        # Decay toward the stored prior: p += step_size * (p - prior)
                        p.data.add_(p.data - self.prior_params[id(p)], alpha=step_size)
        loss = self.optim.step(closure)

        return loss

    def compute_distance_to_prior(self, param):
        """Return the L2 distance between ``param`` and its stored prior."""
        assert id(param) in self.prior_params, "parameter not in PriorWD optimizer"
        return (param.data - self.prior_params[id(param)]).pow(2).sum().sqrt()

In [None]:
#def hinge_loss(y_pred, y_true):
#    return torch.mean(torch.clamp(1 - y_pred.t() * y_true, min=0))

#eps = 0.25
def hinge_loss(y_pred, y_true, eps):
    """Mean absolute error with a dead zone of width ``eps``.

    Element-wise absolute errors are reduced by ``eps`` and floored at 0
    before averaging.
    """
    abs_err = nn.functional.l1_loss(y_pred, y_true, reduction='none')
    return torch.clamp(abs_err - eps, min=0).mean()

def mask_loss(y_pred, y_true):
    """Smooth-L1 loss that only counts elements with absolute error below 0.5.

    Elements whose absolute error is 0.5 or more contribute 0, but still count
    in the denominator of the mean.
    """
    per_element = nn.functional.smooth_l1_loss(y_pred, y_true, reduction='none', beta=1.0)
    abs_err = nn.functional.l1_loss(y_pred, y_true, reduction='none')
    keep = (abs_err < 0.5).float()
    return torch.mean(per_element * keep)

In [None]:
def train_fn(epoch, train_loader, model, awp, fgm, criterion, optimizer, scheduler):
    """Train ``model`` for one epoch and return the average (unscaled) clean loss.

    Per batch: clean forward/backward under AMP, optional AWP pass, optional
    FGM pass, then, every ``CFG.ACCUMLATION`` batches, gradient unscale,
    clipping to ``CFG.GRAD_NORM``, optimizer step and scheduler step.

    Args:
        epoch: zero-based epoch index (gates AWP start and FGM end).
        train_loader: iterable of ``(inputs, labels)`` batches with ``__len__``.
        model: module called as ``model(inputs)``.
        awp: AWP helper or None.
        fgm: FGM helper or None.
        criterion: callable ``criterion(y_preds, labels)`` returning a scalar loss.
        optimizer: optimizer stepped through the global ``scaler``.
        scheduler: LR scheduler stepped once per optimizer step.

    Returns:
        The sample-weighted mean of the clean loss over the epoch.

    Relies on the notebook globals ``device``, ``is_gpu`` and ``scaler``.
    """
    model.train()

    losses = AverageMeter()
    for step, (inputs, labels) in tqdm(enumerate(train_loader), total=len(train_loader)):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)

        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=is_gpu):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)

        # Log the true per-batch loss; only the value that is back-propagated
        # is scaled for gradient accumulation.
        losses.update(loss.item(), batch_size)
        if CFG.ACCUMLATION > 1:
            loss = loss / CFG.ACCUMLATION
        scaler.scale(loss).backward()

        # AWP
        # NOTE(review): AWP.attack_backward calls optimizer.zero_grad(), which also
        # discards gradients accumulated from earlier micro-batches when
        # CFG.ACCUMLATION > 1.
        if awp is not None:
            awp.attack_backward(inputs, labels, epoch)

        # FGM
        if fgm is not None and epoch < CFG.FGM_END:
            fgm.attack(epsilon=CFG.FGM_EPS, emb_name='word_embeddings')
            # Same AMP setting as the clean pass (disabled on CPU).
            with torch.cuda.amp.autocast(enabled=is_gpu):
                y_preds = model(inputs)
                loss_adv = criterion(y_preds, labels)
            # Keep the adversarial term on the same scale as the clean term.
            if CFG.ACCUMLATION > 1:
                loss_adv = loss_adv / CFG.ACCUMLATION
            scaler.scale(loss_adv).backward()
            fgm.restore()

        if (step + 1) % CFG.ACCUMLATION == 0:
            # Unscale before clipping so the threshold applies to real gradient norms.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.GRAD_NORM)
            scaler.step(optimizer)
            scaler.update()

            optimizer.zero_grad()
            scheduler.step()

    return losses.avg


def valid_fn(valid_loader, model, criterion):
    """Evaluate ``model`` on ``valid_loader``.

    Args:
        valid_loader: iterable of ``(inputs, labels)`` batches.
        model: module called as ``model(inputs)``.
        criterion: callable ``criterion(y_preds, labels)`` returning a scalar loss.

    Returns:
        ``(avg_loss, predictions)``: the sample-weighted mean loss and a numpy
        array of all predictions concatenated in loader order.

    The loss is reported as-is. It is not divided by ``CFG.ACCUMLATION``, which
    is a training-only setting with no meaning during evaluation.
    """
    losses = AverageMeter()
    model.eval()
    preds = []
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)

        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())

    predictions = np.concatenate(preds)
    return losses.avg, predictions

In [None]:
# Load the per-fold pseudo-label CSVs into a list indexed by fold.
# The files are expected under ./fb3-make-pseudo-label-4th/ (the cell that
# downloads them is commented out above).
# NOTE(review): in the visible notebook, pseudo_dfs is referenced only from
# commented-out lines in train_loop.
pseudo_dfs = []
for fold in range(CFG.N_FOLD):
    _df = pd.read_csv(f"./fb3-make-pseudo-label-4th/pseudo_v3_f{fold}.csv")
    # Align the schema with train_df: rename the text column, drop three extra columns.
    _df = _df.rename(columns = {"text":"full_text"}).drop(["ARI", "predicted_grade", "tokenize_length"], axis=1)
    _df["fold"] = fold
    # Synthetic ids; they restart from pseudo_0 for every fold.
    _df["text_id"] = [f"pseudo_{i}" for i in range(len(_df))]
    pseudo_dfs.append(_df)

In [None]:
def train_loop(fold, seed):
    """Fine-tune one fold starting from a pseudo-label checkpoint.

    Builds train/validation loaders from ``train_df``, loads the checkpoint
    under ``pseudo_base/`` for this fold, trains for ``CFG.N_EPOCH`` epochs with
    AWP and FGM, and saves the best checkpoint (lowest validation MCRMSE)
    together with a per-epoch score CSV under ``CFG.OUTPUT``.

    Args:
        fold: index of the fold held out for validation.
        seed: used only in the name of the saved checkpoint file.

    Returns:
        The best (lowest) validation score seen across epochs.
    """
    #set_seed(CFG.SEED)
    #seed_everything(seed)
    #accelerator = Accelerator(mixed_precision="fp16")
    
    valid_df = train_df.query(f"fold=={fold}")
    valid_labels = valid_df[CFG.TARGETS].values
    train_dataset = FB3Dataset(train_df.query(f"fold!={fold}"))
    #train_dataset = FB3Dataset(pseudo_dfs[fold])
    #train_dataset = FB3Dataset(pd.concat([train_df.query(f"fold!={fold}"), pseudo_dfs[fold]], axis=0).reset_index(drop=True))
    valid_dataset = FB3Dataset(valid_df)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.BS,
                              shuffle=True,
                              num_workers=CFG.N_WORKER, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.BS,
                              shuffle=False,
                              num_workers=CFG.N_WORKER, pin_memory=True, drop_last=False)

    model = CustomModel()
    # The model config is written on every call; later folds overwrite the same file.
    torch.save(model.config, CFG.OUTPUT + 'config.pth')
    
    # Checkpoint seed in the file name: 2 for fold 1, 0 for every other fold.
    # NOTE(review): the reason for the fold-1 exception is not visible here - confirm.
    s = 2 if fold == 1 else 0
    path = f"pseudo_base/microsoft-deberta-v3-base_seed{s}_fold{fold}_best.pth"   
    #path = f"pseudo/microsoft-deberta-v3-large_seed0_fold{fold}_best.pth"   
    # NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
    state = torch.load(path, map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    model.to(device)
    
    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.ENCODER_LR, 
                                                decoder_lr=CFG.DECODER_LR,
                                                weight_decay=CFG.WEIGHT_DECAY)

    optimizer = AdamW(optimizer_parameters, lr=CFG.ENCODER_LR, eps=CFG.EPS, betas=CFG.BETAS, correct_bias=True)
    # Wrap AdamW so weight decay pulls parameters toward the loaded checkpoint values.
    optimizer = PriorWD(optimizer, use_prior_wd=True)
    
    # NOTE(review): the loader uses drop_last=True and train_fn steps the scheduler once
    # per CFG.ACCUMLATION batches, so the real number of scheduler steps can be smaller
    # than this estimate - confirm whether the schedule should end exactly at the last step.
    num_train_steps = int(len(train_dataset) / CFG.BS * CFG.N_EPOCH)
    scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=CFG.N_WARMUP, num_training_steps=num_train_steps, num_cycles=CFG.N_CYCLES
    )

    #model, optimizer, train_loader, valid_loader, scheduler = accelerator.prepare(
    #    model, optimizer, train_loader, valid_loader, scheduler
    #)
    
    #criterion = nn.SmoothL1Loss(reduction='mean', beta=1.0)
    criterion = mask_loss
    #scaler = None
    #"""
    # AWP shares the global GradScaler with train_fn.
    awp = AWP(model,
              optimizer,
              criterion,
              adv_lr=CFG.ADV_LR,
              adv_eps=CFG.ADV_EPS,
              start_epoch=CFG.ADV_START,
              scaler=scaler
    )
    #"""
    #awp = None
    fgm = FGM(model)
    #"""
    #awp, fgm = None, None

    best_score = float("inf")
    best_predictions = None
    for epoch in range(CFG.N_EPOCH):
        avg_loss = train_fn(epoch, train_loader, model, awp, fgm, criterion, optimizer, scheduler)
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion)
        score, _ = get_score(valid_labels, predictions)
        # Keep only the checkpoint with the lowest validation score so far.
        if best_score > score:
            best_score = score
            best_predictions = predictions
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                         f"{CFG.OUTPUT}/{CFG.MODEL_NAME.replace('/', '-')}_seed{seed}_fold{fold}_best.pth")
        print(f"[Fold-{fold}] epoch-{epoch}: score={score}")
        #accelerator.print(f"[Fold-{fold}] epoch-{epoch}: score={score}")
        # One small CSV per epoch so progress can be inspected from disk.
        pd.DataFrame([score], columns=["score"]).to_csv(f"{CFG.OUTPUT}/fold{fold}_epoch{epoch}.csv", index=None)
        torch.cuda.empty_cache()
        gc.collect()
        
    return best_score

In [None]:
def _train_loop(fold, seed):
    """Scratch helper that creates an Accelerator and prints the fold, seed and device.

    NOTE(review): ``set_seed`` and ``Accelerator`` are not imported in the visible
    notebook (the accelerate imports at the top are commented out), so calling
    this function as-is would raise NameError. It is not called anywhere in the
    visible code.
    """
    set_seed(CFG.SEED)
    accelerator = Accelerator(mixed_precision="fp16")
    accelerator.print("Fold", fold)
    accelerator.print("seed", seed)
    accelerator.print(accelerator.device)

#args = (0, 0)
#notebook_launcher(train_loop, args, num_processes=2)

In [None]:
def main(seed):
    """Train every fold not listed in ``CFG.SKIP_FOLDS`` and print the scores.

    Args:
        seed: passed to ``seed_everything`` and on to ``train_loop``.

    Prints the list of per-fold best scores followed by their mean. The mean is
    taken over the folds that actually ran, not over ``CFG.N_FOLD``, so skipped
    folds no longer drag the reported average toward zero.
    """
    seed_everything(seed)
    scores = [
        train_loop(fold, seed)
        for fold in range(CFG.N_FOLD)
        if fold not in CFG.SKIP_FOLDS
    ]
    print(scores)
    # Guard against every fold being skipped.
    print(sum(scores) / len(scores) if scores else float("nan"))

In [None]:
# Entry point: run training for the non-skipped folds with the configured seed.
if __name__ == '__main__':
    main(CFG.LOCAL_SEED)

# Upload

In [None]:
!kaggle datasets init -p {CFG.OUTPUT}

# Rewrite the dataset-metadata.json in CFG.OUTPUT with a title and id derived
# from the experiment name.
with open(f"{CFG.OUTPUT}/dataset-metadata.json", "r") as f:
    d = json.load(f)
    
t = f"FB3 {CFG.EXP} output"
d['title'] = t
# The id is "<account>/<title words joined with '-'>"; the account name is hard-coded.
d['id'] = "takamichitoda/"+"-".join(t.split())

with open(f"{CFG.OUTPUT}/dataset-metadata.json", "w") as f:
    json.dump(d, f)

In [None]:
#!kaggle datasets create -p {CFG.OUTPUT}
!kaggle datasets version -m "test" -p {CFG.OUTPUT}/

!kaggle datasets list -m --sort-by "updated"

In [None]:
!ls