In [None]:
# Report the GPU(s) visible to this runtime before doing any work.
!nvidia-smi

In [None]:
# Mount Google Drive at /content/drive; a later cell copies the Kaggle API token from it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the Kaggle CLI and place the API token (kept on Drive) under /root/.kaggle.
# NOTE(review): the relative ./drive path assumes the working directory is /content — confirm.
# NOTE(review): the final `mv` is not re-runnable once /root/.kaggle already exists.
!pip install -q kaggle
!mkdir -p .kaggle
!cp "./drive/MyDrive/Study/config/kaggle.json" .kaggle/
# Restrict the token file to owner read/write.
!chmod 600 .kaggle/kaggle.json
!mv .kaggle /root

In [None]:
# Competition data: download the archive, unpack it, delete the zip and
# gather the three CSV files under competition_data/.
!kaggle competitions download -c feedback-prize-english-language-learning
!unzip feedback-prize-english-language-learning.zip
!rm -rf feedback-prize-english-language-learning.zip
!mkdir -p competition_data
!mv sample_submission.csv test.csv train.csv competition_data/
!echo "### competition data ###"
!ls /content/competition_data/
!echo

# Pseudo labels: fetch the output files of the pseudo-labelling kernel
# into fb3-distillation-data-final/ (read by later cells).
!mkdir -p fb3-distillation-data-final
!kaggle kernels output takamichitoda/fb3-make-avg-distillation-pseudo -p ./fb3-distillation-data-final
!echo "### pseudo data ###"
!ls ./fb3-distillation-data-final
!echo


In [None]:
# Third-party dependencies used below (stratified K-fold, Hugging Face models, tokenizer backend).
# NOTE(review): transformers and sentencepiece are unpinned, so a fresh runtime may install a
# release whose API differs from what the cells below call (e.g. `from transformers import AdamW`,
# `pad_to_max_length`) — consider pinning to the versions the experiment was run with.
!pip install iterative-stratification==0.1.7
!pip install transformers
!pip install sentencepiece

## メイン処理

In [None]:
import gc
import json
import os
import random
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.metrics import mean_squared_error

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW
from torch.utils.checkpoint import checkpoint

import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup

# Pick the compute device. The shared GradScaler (used by train_fn / swa_fn) is only
# active when CUDA is available; on CPU it is constructed with enabled=False.
is_gpu = torch.cuda.is_available()
device = torch.device('cuda' if is_gpu else 'cpu')
scaler = torch.cuda.amp.GradScaler(enabled=is_gpu)

# Set the tokenizers parallelism flag for this kernel, then log the environment.
%env TOKENIZERS_PARALLELISM=true
print(device)
print(f"transformers.__version__: {transformers.__version__}")

In [None]:
class CFG:
    """Experiment configuration, read as class attributes by the cells below.

    The non-dunder attributes are dumped to ``setting.json`` right after this
    class is defined, so every value must be JSON-serialisable at that point.
    """
    EXP_NUM = 13
    MEMO = "final"  # included in the Kaggle dataset version message

    INPUT = "/content/competition_data"  # directory holding train.csv
    OUTPUT = f"/content/output/distribution/"  # checkpoints, tokenizer, result CSVs (trailing slash is relied on)
    SEED = 0  # random_state of the stratified K-fold split
    N_FOLD = 4
    TARGETS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    
    MODEL_NAME = "microsoft/deberta-v3-xsmall"
    TOKENIZER = None  # replaced by the loaded tokenizer after setting.json has been written
    MAX_LEN = 512  # every text is padded/truncated to this many tokens
    GRAD_CHECKPOINT = False  # when True, CustomModel enables gradient checkpointing
    
    N_EPOCH =16
    N_WORKER = 2  # DataLoader worker processes
    ENCODER_LR = 1e-4  # learning rate for the transformer backbone
    DECODER_LR = 5e-3  # learning rate for the pooling weights and regression head

    EPS = 1e-6  # AdamW epsilon
    BETAS = (0.9, 0.999)  # AdamW betas
    WEIGHT_DECAY = 0.1  # applied to backbone weights that are not bias/LayerNorm
    N_WARMUP = 0  # warm-up steps of the cosine schedule
    N_CYCLES = 0.5  # num_cycles of the cosine schedule
    
    BS = 64  # batch size for both loaders
    ACCUMLATION = 1  # gradient accumulation steps (identifier spelling kept as-is)
    
    GRAD_NORM = 0.1  # max norm passed to clip_grad_norm_
    
    SKIP_FOLDS = [None]  # fold indices that main() skips; [None] skips nothing
    LOCAL_SEED = 0  # seed passed to main()

# Create the output directory and let the Kaggle CLI write a template
# dataset-metadata.json into it.
!mkdir -p {CFG.OUTPUT}
!kaggle datasets init -p {CFG.OUTPUT}

# Fill in the title and id of the dataset the artefacts will be uploaded to.
with open(f'{CFG.OUTPUT}/dataset-metadata.json', 'r') as f:
    d = json.load(f)
t = f'FB3 distribution last'

d['title'] = t
d['id'] = f'takamichitoda/'+ t.replace(' ', '-')
print(d)
with open(f'{CFG.OUTPUT}/dataset-metadata.json', 'w') as f:
    json.dump(d, f)
del d

# Snapshot the configuration. This must run while CFG.TOKENIZER is still None,
# because a tokenizer object is not JSON-serialisable.
s = {k:v for k, v in vars(CFG).items() if "__" != k[:2]}
with open(f'{CFG.OUTPUT}/setting.json', 'w') as f:
    json.dump(s, f)
del s

# Export the tokenizer next to the checkpoints (directory plus a zip archive)
# and publish it to the rest of the notebook through CFG.TOKENIZER.
!rm -rf {CFG.OUTPUT}/tokenizer*
TOKENIZER = AutoTokenizer.from_pretrained(CFG.MODEL_NAME)
TOKENIZER.save_pretrained(CFG.OUTPUT+'tokenizer/')
CFG.TOKENIZER = TOKENIZER
del TOKENIZER
!zip -r tokenizer.zip {CFG.OUTPUT}/tokenizer
!mv tokenizer.zip {CFG.OUTPUT}

In [None]:
# Labelled competition data; used as the validation source in train_loop.
org_train_df = pd.read_csv(f"{CFG.INPUT}/train.csv")

# Assign each row the index of the fold in which it is held out, stratifying
# jointly on all target columns.
cv = MultilabelStratifiedKFold(n_splits=CFG.N_FOLD, shuffle=True, random_state=CFG.SEED)
for n, (train_index, valid_index) in enumerate(cv.split(org_train_df, org_train_df[CFG.TARGETS])):
    org_train_df.loc[valid_index, 'fold'] = int(n)
org_train_df['fold'] = org_train_df['fold'].astype(int)

display(org_train_df.head())

# Texts used as the distillation training set; train_loop reads its
# 'full_text' and 'fold' columns and overwrites its target columns.
text_for_distillation_df = pd.read_csv('fb3-distillation-data-final/AVG_pseudo.csv')

display(text_for_distillation_df.head())

# Sanity check: fraction of rows whose fold matches between the two frames.
# NOTE(review): presumably 'origin' flags the rows that come from train.csv and they share
# the index labels of org_train_df, so this should display 1.0 — confirm against the CSV.
(org_train_df['fold'] == text_for_distillation_df.query('origin')['fold']).mean()

In [None]:
# Load the stacked pseudo-label array and display its shape.
# NOTE(review): the next cell slices the leading axis as 10 consecutive groups of 4 and
# indexes two trailing axes — confirm the file really has that layout.
all_pseudo_vecs = np.load("fb3-distillation-data-final/all_pseudo.npy")
all_pseudo_vecs.shape

In [None]:
# Cut the leading axis into ten consecutive chunks of four entries and stack the
# chunks on a new leading axis; train_loop indexes the second axis by fold.
all_pseudo_vecs_fold = np.stack(
    [all_pseudo_vecs[start:start + 4, :, :] for start in range(0, 40, 4)]
)
all_pseudo_vecs_fold.shape

In [None]:
class FB3Dataset(Dataset):
    """Torch dataset yielding ``(tokenized_text, targets)`` pairs.

    Args:
        df: frame with a ``full_text`` column and the ``CFG.TARGETS`` columns.
        train: stored on the instance; not used by the methods of this class.
    """

    def __init__(self, df, train=False):
        self.texts = df['full_text'].values
        self.labels = df[CFG.TARGETS].values
        self.train = train

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize one text to exactly ``CFG.MAX_LEN`` tokens and pair it with its targets.

        Returns:
            ``(inputs, label)`` where ``inputs`` maps each tokenizer output name to a
            ``torch.long`` tensor and ``label`` is a ``torch.float`` tensor.
        """
        # `padding='max_length'` is the supported spelling of the deprecated
        # `pad_to_max_length=True`; calling the tokenizer directly replaces the
        # deprecated `encode_plus` entry point.
        inputs = CFG.TOKENIZER(
            self.texts[item],
            return_tensors=None,
            add_special_tokens=True,
            max_length=CFG.MAX_LEN,
            padding='max_length',
            truncation=True,
        )

        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label
    

def collate(inputs):
    """Trim a padded batch to its longest unpadded sequence.

    The width is the largest row sum of ``inputs["attention_mask"]``. Every
    entry of ``inputs`` is sliced to that width along dim 1, in place, and the
    same mapping is returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :longest]
    return inputs

In [None]:
class WeightedLayerPooling(nn.Module):
    """Weighted average of stacked hidden states from ``layer_start`` onwards.

    Args:
        num_hidden_layers: number of transformer layers; the stack is expected
            to hold ``num_hidden_layers + 1`` entries.
        layer_start: index of the first stacked entry that is pooled.
        layer_weights: optional weights, one per pooled entry. When omitted, a
            learnable all-ones parameter is created.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            n_pooled = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(torch.tensor([1] * n_pooled, dtype=torch.float))
        self.layer_weights = layer_weights

    def forward(self, all_hidden_states):
        """Pool a 4-D stack over its leading (layer) axis and return the weighted mean."""
        pooled_layers = all_hidden_states[self.layer_start:, :, :, :]
        # Broadcast the per-layer weights over the three trailing axes.
        weights = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
        weights = weights.expand(pooled_layers.size())
        weighted_sum = (weights * pooled_layers).sum(dim=0)
        return weighted_sum / self.layer_weights.sum()

class MeanPooling(nn.Module):
    """Attention-mask-aware mean over dim 1 of a hidden-state tensor."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Average the positions whose mask is non-zero.

        The mask is expanded to the shape of ``last_hidden_state``. The divisor
        is clamped to at least 1e-9 so a fully masked row yields zeros instead
        of a division by zero.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * mask).sum(1)
        token_count = torch.clamp(mask.sum(1), min=1e-9)
        return masked_total / token_count

class CustomModel(nn.Module):
    """Student regressor: truncated transformer backbone, pooling, and a two-layer head.

    The forward pass returns six values per sample (one per target column).
    """

    def __init__(self, model_name):
        """Build the backbone from ``model_name`` and attach the pooling layers and head."""
        super().__init__()

        # Request all hidden states, disable every dropout, and override the depth to 4.
        # NOTE(review): presumably only the first four pretrained layers end up in the model — confirm.
        self.config = AutoConfig.from_pretrained(model_name, output_hidden_states=True)
        self.config.hidden_dropout = 0.
        self.config.hidden_dropout_prob = 0.
        self.config.attention_dropout = 0.
        self.config.attention_probs_dropout_prob = 0.
        self.config.num_hidden_layers = 4

        self.model = AutoModel.from_pretrained(model_name, config=self.config)

        if CFG.GRAD_CHECKPOINT:
            self.model.gradient_checkpointing_enable()
            
        
        # layer_start equals the layer count, so the pooling weight vector has length
        # num_hidden_layers + 1 - layer_start = 1 and only the stack from that index on is pooled.
        layer_start = self.config.num_hidden_layers
        self.layer_pool = WeightedLayerPooling(
            self.config.num_hidden_layers, 
            layer_start=layer_start, layer_weights=None
        )
        self.pool = MeanPooling()
        self.fc = nn.Sequential(
            nn.Linear(self.config.hidden_size, self.config.hidden_size),
            nn.ReLU(),
            nn.Linear(self.config.hidden_size, 6),
        )
        
    def _init_weights(self, module):
        """Re-initialise one Linear / Embedding / LayerNorm module.

        Linear and Embedding weights are drawn from N(0, initializer_range);
        biases and the padding row are zeroed; LayerNorm is reset to weight 1, bias 0.
        Not called anywhere in the visible source, so the head keeps PyTorch's default init.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, inputs):
        """Run the backbone on ``inputs`` (a mapping of tokenizer tensors) and regress the targets."""
        outputs = self.model(**inputs)

        #pooling_embeddings = outputs[0]
    
        # assumes outputs[1] is the tuple of per-layer hidden states — TODO confirm for this backbone
        all_hidden_states = torch.stack(outputs[1])
        pooling_embeddings = self.layer_pool(all_hidden_states)

        # Mask-aware mean over the sequence, then the regression head.
        feature = self.pool(pooling_embeddings, inputs['attention_mask'])
        output = self.fc(feature)

        return output

In [None]:
class AverageMeter(object):
    """Running weighted mean that also remembers the most recent value."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the last value, the weighted sum, the count and the mean."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count
        
def MCRMSE(y_trues, y_preds):
    """Mean column-wise root mean squared error.

    Args:
        y_trues: 2-D array-like of shape (n_samples, n_targets) with reference values.
        y_preds: 2-D array-like of the same shape with predictions.

    Returns:
        ``(mcrmse_score, scores)``: the mean of the per-column RMSEs and the
        list of per-column RMSEs, in column order.

    Raises:
        ValueError: if the inputs are not 2-D or their shapes differ.
    """
    y_trues = np.asarray(y_trues, dtype=float)
    y_preds = np.asarray(y_preds, dtype=float)
    if y_trues.ndim != 2 or y_trues.shape != y_preds.shape:
        raise ValueError(
            f"expected two 2-D arrays of equal shape, got {y_trues.shape} and {y_preds.shape}"
        )
    # RMSE is computed with NumPy rather than
    # sklearn.metrics.mean_squared_error(..., squared=False): the `squared`
    # keyword was deprecated and later removed from scikit-learn.
    scores = np.sqrt(np.mean((y_trues - y_preds) ** 2, axis=0)).tolist()
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    """Score predictions with MCRMSE; returns ``(mean_score, per_column_scores)``."""
    overall, per_column = MCRMSE(y_trues, y_preds)
    return overall, per_column


def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) and request deterministic cuDNN.

    Also exports ``PYTHONHASHSEED`` so that child processes inherit it.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True

In [None]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    """Build three optimizer parameter groups for ``model``.

    1. Parameters of ``model.model`` whose name matches none of the no-decay
       markers: ``encoder_lr`` with ``weight_decay``.
    2. Parameters of ``model.model`` whose name matches a marker (biases,
       LayerNorm): ``encoder_lr`` without weight decay.
    3. Every parameter of ``model`` whose full name does not contain "model":
       ``decoder_lr`` without weight decay.
    """
    no_decay_markers = ("bias", "LayerNorm.bias", "LayerNorm.weight")

    def is_excluded(name):
        return any(marker in name for marker in no_decay_markers)

    decayed, undecayed = [], []
    for name, param in model.model.named_parameters():
        (undecayed if is_excluded(name) else decayed).append(param)

    head = [param for name, param in model.named_parameters() if "model" not in name]

    return [
        {'params': decayed, 'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': undecayed, 'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': head, 'lr': decoder_lr, 'weight_decay': 0.0},
    ]

In [None]:
def train_fn(epoch, train_loader, student, criterion, optimizer, scheduler):
    """Train ``student`` for one pass over ``train_loader``.

    Uses the notebook-level ``scaler`` for mixed precision and honours
    ``CFG.ACCUMLATION`` gradient accumulation.

    Args:
        epoch: current epoch index (unused; kept for call-site compatibility).
        train_loader: yields ``(inputs, labels)`` batches.
        student: model being optimised.
        criterion: loss called as ``criterion(predictions, labels)``.
        optimizer: optimizer stepped through the gradient scaler.
        scheduler: learning-rate scheduler, stepped once per optimizer step.

    Returns:
        Sample-weighted mean of the per-batch loss.
    """
    student.train()

    losses = AverageMeter()
    for step, (inputs, labels) in tqdm(enumerate(train_loader), total=len(train_loader)):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)

        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=is_gpu):
            y_preds = student(inputs)
            loss = criterion(y_preds, labels)

        # Log the true batch loss; only the value that is back-propagated is
        # divided by the accumulation factor.
        losses.update(loss.item(), batch_size)
        if CFG.ACCUMLATION > 1:
            loss = loss / CFG.ACCUMLATION
        scaler.scale(loss).backward()

        if (step + 1) % CFG.ACCUMLATION == 0:
            # Unscale and clip once per optimizer step, on the fully
            # accumulated gradient. GradScaler.unscale_ may only be called
            # once between two update() calls.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(student.parameters(), CFG.GRAD_NORM)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()

    return losses.avg

def valid_fn(valid_loader, model, criterion):
    """Evaluate ``model`` on ``valid_loader`` without gradient tracking.

    Args:
        valid_loader: yields ``(inputs, labels)`` batches in a fixed order.
        model: module called as ``model(inputs)``.
        criterion: loss called as ``criterion(predictions, labels)``.

    Returns:
        ``(avg_loss, predictions)``: the sample-weighted mean loss and the
        predictions of all batches concatenated into one NumPy array.
    """
    losses = AverageMeter()
    model.eval()
    preds = []
    for inputs, labels in valid_loader:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)

        # No division by CFG.ACCUMLATION here: gradient accumulation does not
        # apply to evaluation and would only distort the reported loss.
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())

    predictions = np.concatenate(preds)
    return losses.avg, predictions

In [None]:
from torch.optim.swa_utils import AveragedModel, SWALR

def swa_fn(epoch, train_loader, model, swa_model, criterion, optimizer, scheduler):
    """Train ``model`` for one pass while folding its weights into ``swa_model``.

    Same procedure as ``train_fn``, except that after every optimizer step the
    averaged model is updated from the current weights.

    Args:
        epoch: current epoch index (unused; kept for call-site compatibility).
        train_loader: yields ``(inputs, labels)`` batches.
        model: model being optimised.
        swa_model: averaged model exposing ``update_parameters(model)``.
        criterion: loss called as ``criterion(predictions, labels)``.
        optimizer: optimizer stepped through the notebook-level gradient scaler.
        scheduler: learning-rate scheduler, stepped once per optimizer step.

    Returns:
        Sample-weighted mean of the per-batch loss.
    """
    model.train()

    losses = AverageMeter()
    for step, (inputs, labels) in tqdm(enumerate(train_loader), total=len(train_loader)):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)

        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=is_gpu):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)

        # Log the true batch loss; only the value that is back-propagated is
        # divided by the accumulation factor.
        losses.update(loss.item(), batch_size)
        if CFG.ACCUMLATION > 1:
            loss = loss / CFG.ACCUMLATION
        scaler.scale(loss).backward()

        if (step + 1) % CFG.ACCUMLATION == 0:
            # Unscale and clip once per optimizer step, on the fully
            # accumulated gradient. GradScaler.unscale_ may only be called
            # once between two update() calls.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.GRAD_NORM)
            scaler.step(optimizer)
            scaler.update()
            swa_model.update_parameters(model)

            optimizer.zero_grad()
            scheduler.step()

    return losses.avg

In [None]:
class CustomLoss(nn.Module):
    """Mean squared error with a dead zone and an optional per-element cap.

    Each squared error is reduced by ``eps**2`` and floored at zero. When
    ``mask`` is not None it is additionally capped at ``mask**2``. The mean
    over all elements is returned.
    """

    def __init__(self, eps, mask):
        super().__init__()
        self.eps = eps
        self.mask = mask

    def forward(self, outputs, targets):
        squared_error = nn.functional.mse_loss(outputs, targets, reduction='none')
        shifted = squared_error - self.eps**2
        if self.mask is None:
            clipped = torch.clamp(shifted, min=0)
        else:
            clipped = torch.clamp(shifted, min=0, max=self.mask**2)
        return torch.mean(clipped)

In [None]:
from torch.optim import Optimizer

class PriorWD(Optimizer):
    """Optimizer wrapper that decays weights towards their initial values.

    On construction the wrapper snapshots every parameter (the "prior"),
    records each group's ``weight_decay`` and sets it to 0 on the wrapped
    optimizer, so the wrapped optimizer itself no longer applies any decay.
    On every ``step`` (when ``use_prior_wd`` is True) each parameter is pulled
    towards its snapshot by ``lr * weight_decay``, before the wrapped optimizer
    performs its own update.

    Args:
        optim: the optimizer to wrap; its ``param_groups`` are shared, not copied.
        use_prior_wd: enable the decay-towards-prior step. When False no weight
            decay is applied at all, because the groups' values were zeroed.
        exclude_last_group: when True, the last parameter group is decayed
            towards zero (ordinary weight decay) instead of towards its snapshot.
    """

    def __init__(self, optim, use_prior_wd=False, exclude_last_group=True):
        super(PriorWD, self).__init__(optim.param_groups, optim.defaults)
        # Share the wrapped optimizer's group list so that learning-rate
        # schedulers attached to this wrapper act on the real groups.
        self.param_groups = optim.param_groups
        self.optim = optim
        self.use_prior_wd = use_prior_wd
        self.exclude_last_group = exclude_last_group

        # Take over weight decay from the wrapped optimizer.
        self.weight_decay_by_group = []
        for group in self.param_groups:
            self.weight_decay_by_group.append(group["weight_decay"])
            group["weight_decay"] = 0

        # Snapshot of every parameter at wrap time, keyed by object identity.
        self.prior_params = {}
        for group in self.param_groups:
            for p in group["params"]:
                self.prior_params[id(p)] = p.detach().clone()

    def step(self, closure=None):
        """Apply the prior-anchored decay, then delegate to the wrapped optimizer.

        Returns:
            Whatever the wrapped optimizer's ``step(closure)`` returns.
        """
        if self.use_prior_wd:
            # Enumerate indices run from 0 to len - 1, so the last group has
            # index len - 1 (comparing against len never matched).
            last_group = len(self.param_groups) - 1
            for i, group in enumerate(self.param_groups):
                step_size = -group["lr"] * self.weight_decay_by_group[i]
                decay_to_zero = self.exclude_last_group and i == last_group
                for p in group["params"]:
                    if decay_to_zero:
                        # p <- p + step_size * p
                        p.data.mul_(1 + step_size)
                    else:
                        # p <- p + step_size * (p - prior)
                        p.data.add_(p.data - self.prior_params[id(p)], alpha=step_size)
        loss = self.optim.step(closure)

        return loss

    def compute_distance_to_prior(self, param):
        """Return the L2 distance between ``param`` and its snapshot."""
        assert id(param) in self.prior_params, "parameter not in PriorWD optimizer"
        return (param.data - self.prior_params[id(param)]).pow(2).sum().sqrt()

In [None]:
def train_loop(fold, seed):
    """Train one student model and keep the checkpoint with the best validation score.

    Args:
        fold: integer fold index, or the string "ALL" to train on every
            distillation row and validate on the full original training frame.
        seed: only used in checkpoint file names; seeding is done by the caller.

    Returns:
        ``(best_score, results)`` where ``results`` is a list of
        ``(fold, epoch, score, best_score)`` tuples, one per epoch.

    Side effects: for an integer fold, overwrites the target columns of the
    notebook-level ``text_for_distillation_df``; writes ``config.pth`` and
    checkpoint files under ``CFG.OUTPUT``.
    """
    if fold == "ALL":
        # NOTE(review): the targets of text_for_distillation_df are not assigned in this branch,
        # so training uses whatever the columns currently hold (the CSV values, or the values
        # left by the most recent per-fold call) — confirm this is intended.
        #text_for_distillation_df[CFG.TARGETS] = label_for_distillation.mean(0).mean(0)
        valid_df = org_train_df
        train_df = text_for_distillation_df
    else:
        #text_for_distillation_df[CFG.TARGETS] = all_pseudo_vecs_fold[[0,1,2,3,4,5,6,7,9], fold, :].mean(0)
        #text_for_distillation_df[CFG.TARGETS] = all_pseudo_vecs_fold[[0,1,2,3,4,5,6,7], fold, :].mean(0) # drop SVR
        #text_for_distillation_df[CFG.TARGETS] = all_pseudo_vecs_fold[[0,1,2,3,5,6,7,9], fold, :].mean(0) # drop distill
        # Targets = mean over the leading axis of the pseudo-label stack at this fold index.
        text_for_distillation_df[CFG.TARGETS] = all_pseudo_vecs_fold[:, fold, :, :].mean(0)
        valid_df = org_train_df.query(f"fold=={fold}")
        train_df = text_for_distillation_df.query(f"fold!={fold}")

    valid_labels = valid_df[CFG.TARGETS].values
    train_dataset = FB3Dataset(train_df, train=True)
    valid_dataset = FB3Dataset(valid_df, train=False)
    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.BS,
                              shuffle=True,
                              num_workers=CFG.N_WORKER, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.BS,
                              shuffle=False,
                              num_workers=CFG.N_WORKER, pin_memory=True, drop_last=False)

    student = CustomModel(CFG.MODEL_NAME)
    torch.save(student.config, CFG.OUTPUT + 'config.pth')
    student.to(device)

    optimizer_parameters = get_optimizer_params(student,
                                                encoder_lr=CFG.ENCODER_LR, 
                                                decoder_lr=CFG.DECODER_LR,
                                                weight_decay=CFG.WEIGHT_DECAY)

    # PriorWD takes over weight decay: it zeroes the groups' weight_decay on the
    # wrapped AdamW and decays parameters towards their values at this point.
    optimizer = AdamW(optimizer_parameters, lr=CFG.ENCODER_LR, eps=CFG.EPS, betas=CFG.BETAS, correct_bias=True)
    optimizer = PriorWD(optimizer, use_prior_wd=True)

    # NOTE(review): this count ignores drop_last=True and CFG.ACCUMLATION, so it can differ
    # slightly from the number of scheduler.step() calls actually made.
    num_train_steps = int(len(train_dataset) / CFG.BS * CFG.N_EPOCH)
    scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=CFG.N_WARMUP, num_training_steps=num_train_steps, num_cycles=CFG.N_CYCLES
    )
    #criterion = nn.SmoothL1Loss(reduction='mean', beta=1.0)
    #criterion = nn.HuberLoss(reduction='mean', delta=1.0)
    #criterion = svm_loss
    criterion = CustomLoss(eps=0.1, mask=None)

    swa_model = AveragedModel(student)
    swa_scheduler = SWALR(optimizer, swa_lr=1e-6)

    best_score = float("inf")
    best_predictions = None
    results = []
    for epoch in range(CFG.N_EPOCH):
        # From epoch 7 on, switch to the capped loss without a dead zone.
        if epoch == 7:
            mask = 2.0
            print(f'eps=0.0, mask={mask}')
            criterion = CustomLoss(eps=0.0, mask=mask)

        # With CFG.N_EPOCH = 16 the SWA branch (epoch >= 45) is never reached.
        if epoch < 45:
            avg_loss = train_fn(epoch, train_loader, student, criterion, optimizer, scheduler)
            avg_val_loss, predictions = valid_fn(valid_loader, student, criterion)
        else:
            avg_loss = swa_fn(epoch, train_loader, student, swa_model, criterion, optimizer, swa_scheduler)
            avg_val_loss, predictions = valid_fn(valid_loader, swa_model, criterion)

        score, _ = get_score(valid_labels, predictions)
        
        # Keep the best epoch. The checkpoint always stores the student's weights,
        # including in the SWA branch where the predictions come from swa_model.
        if best_score > score:
            best_score = score
            best_predictions = predictions
            torch.save({'model': student.state_dict(),
                        'predictions': predictions},
                         f"{CFG.OUTPUT}/{CFG.MODEL_NAME.replace('/', '-')}_seed{seed}_fold{fold}_best.pth")
        print(f"[Fold-{fold}] epoch-{epoch}: score={score}")
        results.append((fold, epoch, score, best_score))

        # Per-epoch snapshots for the "ALL" run; unreachable while CFG.N_EPOCH <= 36.
        if fold == "ALL" and epoch > 35:
            torch.save({'model': student.state_dict(),
                        'predictions': predictions},
                        f"{CFG.OUTPUT}/{CFG.MODEL_NAME.replace('/', '-')}_seed{seed}_fold{fold}_epoch{epoch}.pth")
        torch.cuda.empty_cache()
        gc.collect()
        
    return best_score, results

In [None]:
def main(seed):
    """Run cross-validation training over every fold not listed in ``CFG.SKIP_FOLDS``.

    Prints the per-fold best scores and their mean, and writes the per-epoch
    history of all trained folds to ``result.csv`` under ``CFG.OUTPUT``.

    Args:
        seed: seed applied before each fold and embedded in checkpoint names.
    """
    scores = []
    results_lst = []
    for fold in range(CFG.N_FOLD):
        if fold in CFG.SKIP_FOLDS:
            continue
        seed_everything(seed)
        score, results = train_loop(fold, seed)
        scores.append(score)
        results_lst += results
    print(scores)
    # Average over the folds that actually ran; dividing by CFG.N_FOLD would
    # understate the mean whenever folds are skipped.
    print(sum(scores) / len(scores) if scores else float("nan"))

    pd.DataFrame(results_lst, columns=['fold',  'epoch', 'score', 'best_score']).to_csv(f"{CFG.OUTPUT}/result.csv", index=False)

In [None]:
# Run the per-fold cross-validation training with the configured seed.
if __name__ == '__main__':
    main(CFG.LOCAL_SEED)

In [None]:
#!kaggle datasets create -p {CFG.OUTPUT}
#!kaggle datasets version -p {CFG.OUTPUT} -m '{CFG.MEMO}'

In [None]:
# Train on every distillation row ("ALL"), validating on the full original training frame.
# NOTE(review): train_loop("ALL") does not assign the distillation targets itself, so the
# result depends on which cells ran before this one — confirm after a fresh Run All.
seed_everything(0)
score, results = train_loop("ALL", 0)
print(score)

In [None]:
# Persist and display the per-epoch history of the "ALL" run.
# The column names are set once, in the constructor.
df = pd.DataFrame(results, columns=['fold',  'epoch', 'score', 'best_score'])
df.to_csv(f"{CFG.OUTPUT}/result_all.csv", index=False)
df

In [None]:
# Plot the per-epoch score and the running best score of the "ALL" run.
df[['score', 'best_score']].plot()

In [None]:
# Upload the output directory as a new version of the existing Kaggle dataset.
# (Use the commented `create` command instead for the very first upload.)
#!kaggle datasets create -p {CFG.OUTPUT}
!kaggle datasets version -p {CFG.OUTPUT} -m '{CFG.MEMO} swa45'

In [None]:
#!rm -rf {CFG.OUTPUT}