In [None]:
# Print the GPU status of this runtime (driver, device, memory) before doing any work.
!nvidia-smi

In [None]:
# Mount Google Drive at /content/drive; later cells read the Kaggle token from it
# and write training outputs below /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the Kaggle CLI and put the API token into /root/.kaggle.
# The token is copied directly to its final location so the cell can be re-run safely:
# the previous `mkdir .kaggle` + `mv .kaggle /root` sequence nested a second `.kaggle`
# directory inside an already existing /root/.kaggle and left the old token in place.
!pip install -q kaggle
!mkdir -p /root/.kaggle
!cp "./drive/MyDrive/Study/config/kaggle.json" /root/.kaggle/kaggle.json
!chmod 600 /root/.kaggle/kaggle.json

In [None]:
# Download the three inputs into the working directory:
#   - a Kaggle dataset (presumably the teacher checkpoints loaded in train_loop -- confirm),
#   - the competition data (train.csv is read from CFG.INPUT below),
#   - the output of the pseudo-label kernel (presumably the pseudo_v2_f{fold}.csv files -- confirm).
!kaggle datasets download -d takamichitoda/fb3-deberta-v3-awp
!kaggle competitions download -c feedback-prize-english-language-learning
!kaggle kernels output takamichitoda/fb3-make-pseudo-label-2nd

# Unpack both archives in place, then delete the zip files.
!unzip feedback-prize-english-language-learning.zip
!unzip fb3-deberta-v3-awp.zip
!rm -rf feedback-prize-english-language-learning.zip  fb3-deberta-v3-awp.zip

In [None]:
# Third-party dependencies: iterstrat (multilabel stratified folds), transformers and sentencepiece.
# NOTE(review): only iterative-stratification is version-pinned; transformers and sentencepiece
# resolve to whatever is current when the cell runs.
!pip install iterative-stratification==0.1.7
!pip install transformers
!pip install sentencepiece

## メイン処理

In [None]:
import gc
import os
import random
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.metrics import mean_squared_error

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from torch.utils.checkpoint import checkpoint

import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup


# Pick the device once; `is_gpu` is also the switch for mixed precision (autocast + GradScaler).
is_gpu = torch.cuda.is_available()
device = torch.device('cuda' if is_gpu else 'cpu')
# Module-level gradient scaler used by train_fn; constructed disabled when no GPU is present.
scaler = torch.cuda.amp.GradScaler(enabled=is_gpu)

# Set TOKENIZERS_PARALLELISM for the tokenizers library, then report the environment.
%env TOKENIZERS_PARALLELISM=true
print(device)
print(f"transformers.__version__: {transformers.__version__}")

In [None]:
class CFG:
    """Experiment settings read by every other cell of the notebook."""

    # ---- data / paths ----
    INPUT = "/content"
    OUTPUT = "/content/drive/MyDrive/Study/FB3/output"
    SEED = 0                      # random_state of the fold split
    N_FOLD = 4
    TARGETS = [
        'cohesion', 'syntax', 'vocabulary',
        'phraseology', 'grammar', 'conventions',
    ]

    # ---- backbone / tokenizer ----
    MODEL_NAME = "microsoft/deberta-v3-base"
    TOKENIZER = None              # assigned once the tokenizer has been loaded
    MAX_LEN = 1428                # written as 1429 - 1 originally
    GRAD_CHECKPOINT = True

    # ---- optimisation ----
    N_EPOCH = 4
    N_WORKER = 4                  # DataLoader worker processes
    ENCODER_LR = 2e-5
    DECODER_LR = 2e-5
    EPS = 1e-6
    BETAS = (0.9, 0.999)
    WEIGHT_DECAY = 0.01
    N_WARMUP = 0
    N_CYCLES = 0.5

    # ---- batching ----
    BS = 8
    ACCUMLATION = 1               # gradient-accumulation steps (spelling kept: referenced elsewhere)

    GRAD_NORM = 0.01              # max norm handed to clip_grad_norm_

    # ---- run selection ----
    SKIP_FOLDS = [0, 2, 3]        # folds that main() does not train
    LOCAL_SEED = 0                # seed passed to main()

In [None]:
# Read the competition training set from CFG.INPUT and preview its first rows.
train_csv_path = os.path.join(CFG.INPUT, "train.csv")
train_df = pd.read_csv(train_csv_path)
train_df.head()

In [None]:
# Give every training row the number of the fold in which it serves as validation data.
# The split is stratified jointly over all CFG.TARGETS columns.
cv = MultilabelStratifiedKFold(n_splits=CFG.N_FOLD, shuffle=True, random_state=CFG.SEED)
fold_ids = np.zeros(len(train_df), dtype=int)
for fold_no, (_, holdout_rows) in enumerate(cv.split(train_df, train_df[CFG.TARGETS])):
    fold_ids[holdout_rows] = fold_no
train_df['fold'] = fold_ids

In [None]:
# Load the tokenizer for CFG.MODEL_NAME, store a copy with the training outputs and
# publish it through CFG.TOKENIZER (the dataset reads it from there).
# os.path.join is needed because CFG.OUTPUT has no trailing slash: plain string
# concatenation wrote to ".../outputtokenizer/" instead of ".../output/tokenizer".
TOKENIZER = AutoTokenizer.from_pretrained(CFG.MODEL_NAME)
TOKENIZER.save_pretrained(os.path.join(CFG.OUTPUT, 'tokenizer'))
CFG.TOKENIZER = TOKENIZER
del TOKENIZER

In [None]:
class FB3Dataset(Dataset):
    """Dataset yielding ``(tokenized_inputs, label_vector)`` for one essay.

    Args:
        df: DataFrame with a ``full_text`` column and one column per entry of
            ``CFG.TARGETS``.

    Each text is tokenized on access and padded/truncated to ``CFG.MAX_LEN``;
    the ``collate`` helper later trims the padding shared by a whole batch.
    """

    def __init__(self, df):
        self.texts = df['full_text'].values
        self.labels = df[CFG.TARGETS].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        # Call the tokenizer directly with `padding='max_length'`: this is the
        # supported replacement for the deprecated `encode_plus(..., pad_to_max_length=True)`.
        inputs = CFG.TOKENIZER(
            self.texts[item],
            return_tensors=None,
            add_special_tokens=True,
            max_length=CFG.MAX_LEN,
            padding='max_length',
            truncation=True,
        )
        # Convert every returned field (ids, masks, ...) to a LongTensor in place.
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label
    

def collate(inputs):
    """Cut every tensor of a batch down to the longest attended sequence.

    ``inputs`` is a mapping of 2-D tensors that includes ``attention_mask``.
    The mapping is modified in place and also returned.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :longest]
    return inputs

In [None]:
class MeanPooling(nn.Module):
    """Average token embeddings over the positions selected by the attention mask."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so masked-out tokens contribute zero.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        token_sum = (last_hidden_state * mask).sum(dim=1)
        # Clamp the count so a fully masked row divides by 1e-9 instead of 0.
        token_count = mask.sum(dim=1).clamp(min=1e-9)
        return token_sum / token_count
        
class CustomModel(nn.Module):
    """Transformer backbone followed by masked mean pooling and a linear regression head.

    Args:
        model_name: model id or path handed to ``AutoConfig`` / ``AutoModel``.

    The head has one output per entry of ``CFG.TARGETS``. After the pretrained
    weights are loaded, the last encoder layer is re-initialised.
    """

    def __init__(self, model_name):
        super().__init__()

        self.config = AutoConfig.from_pretrained(model_name, output_hidden_states=True)
        # Switch off all dropout in the backbone configuration.
        self.config.hidden_dropout = 0.
        self.config.hidden_dropout_prob = 0.
        self.config.attention_dropout = 0.
        self.config.attention_probs_dropout_prob = 0.

        self.model = AutoModel.from_pretrained(model_name, config=self.config)

        if CFG.GRAD_CHECKPOINT:
            self.model.gradient_checkpointing_enable()

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, len(CFG.TARGETS))
        self._init_weights(self.fc)

        # Re-initialise the last `reinit_layer` encoder layers with the same rules
        # as the head. Modules are visited in `modules()` order so the sequence of
        # random draws is unchanged.
        reinit_layer = 1
        if reinit_layer > 0:
            for layer in self.model.encoder.layer[-reinit_layer:]:
                for module in layer.modules():
                    self._init_weights(module)

    def _init_weights(self, module):
        """Initialise one module: normal(0, initializer_range) weights, zero bias,
        zeroed padding row for embeddings, identity affine for LayerNorm.
        Modules of any other type are left untouched."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, inputs):
        """Return one prediction per target for every sequence in ``inputs``.

        ``inputs`` is a mapping of tensors accepted by the backbone and must
        contain ``attention_mask``.
        """
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        output = self.fc(feature)
        return output

In [None]:
class AverageMeter(object):
    """Tracks the latest value and the running weighted average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Forget everything recorded so far."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the average."""
        self.count += n
        self.sum += val * n
        self.val = val
        self.avg = self.sum / self.count
        
def MCRMSE(y_trues, y_preds):
    """Mean column-wise root mean squared error.

    Args:
        y_trues: array-like of shape (n_samples, n_targets) with the reference values.
        y_preds: array-like of the same shape with the predictions.

    Returns:
        ``(mcrmse_score, scores)``: the mean of the per-column RMSEs and the
        list of per-column RMSEs.

    Raises:
        ValueError: if the inputs are not 2-D, differ in shape, are empty, or
            contain NaN/inf.

    The RMSE is computed with NumPy rather than
    ``mean_squared_error(..., squared=False)``, because the ``squared`` argument
    is deprecated and no longer accepted by recent scikit-learn releases.
    """
    y_trues = np.asarray(y_trues, dtype=np.float64)
    y_preds = np.asarray(y_preds, dtype=np.float64)
    if y_trues.ndim != 2 or y_trues.shape != y_preds.shape:
        raise ValueError(
            f"expected two 2-D arrays of equal shape, got {y_trues.shape} and {y_preds.shape}"
        )
    if y_trues.shape[0] == 0:
        raise ValueError("at least one sample is required")
    # Fail loudly on diverged predictions instead of returning NaN scores.
    if not (np.isfinite(y_trues).all() and np.isfinite(y_preds).all()):
        raise ValueError("inputs contain NaN or infinity")

    # One RMSE per target column.
    scores = list(np.sqrt(np.mean((y_trues - y_preds) ** 2, axis=0)))
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    """Validation metric entry point: returns ``MCRMSE(y_trues, y_preds)`` unchanged,
    i.e. the tuple ``(mean score, per-target scores)``."""
    return MCRMSE(y_trues, y_preds)


def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators with
    ``seed``, export it as PYTHONHASHSEED and request deterministic cuDNN kernels."""
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True

In [None]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    """Build the optimizer parameter groups for a backbone + head model.

    Args:
        model: module whose backbone is stored in the attribute ``model``;
            every parameter outside that attribute is treated as head.
        encoder_lr: learning rate for the backbone parameters.
        decoder_lr: learning rate for the head parameters.
        weight_decay: decay applied to backbone weights only.

    Returns:
        Three groups, in this order: backbone parameters with weight decay,
        backbone biases/LayerNorm parameters without decay, head parameters
        without decay.
    """
    no_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")

    def _skips_decay(name):
        return any(nd in name for nd in no_decay)

    encoder_named = list(model.model.named_parameters())
    # Select head parameters by prefix: a substring test ("model" not in name)
    # would silently drop any head whose own name contains "model".
    head_params = [p for n, p in model.named_parameters() if not n.startswith("model.")]

    return [
        {'params': [p for n, p in encoder_named if not _skips_decay(n)],
         'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': [p for n, p in encoder_named if _skips_decay(n)],
         'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': head_params,
         'lr': decoder_lr, 'weight_decay': 0.0},
    ]

In [None]:
def train_fn(epoch, train_loader, teacher, student, criterion, kl_loss, optimizer, scheduler):
    """Train ``student`` for one epoch to reproduce the predictions of ``teacher``.

    Args:
        epoch: epoch number (accepted for interface compatibility, not used).
        train_loader: yields ``(inputs, labels)`` batches.
        teacher: model that provides the regression targets; run without gradients.
        student: model being optimised.
        criterion: loss between student and teacher predictions.
        kl_loss: accepted for interface compatibility, not used.
        optimizer, scheduler: stepped once per ``CFG.ACCUMLATION`` batches.

    Returns:
        Sample-weighted average of the per-batch loss (before the division
        used for gradient accumulation).

    The ground-truth ``labels`` are only used to read the batch size; the
    loss is computed against the teacher output alone.
    """
    student.train()
    losses = AverageMeter()
    # Start from clean gradients so leftovers of an incomplete accumulation
    # window at the end of the previous epoch cannot leak into this one.
    optimizer.zero_grad()

    for step, (inputs, labels) in tqdm(enumerate(train_loader), total=len(train_loader)):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        batch_size = labels.size(0)

        with torch.cuda.amp.autocast(enabled=is_gpu):
            y_preds_s = student(inputs)
            with torch.no_grad():
                y_preds_t = teacher(inputs)
            loss = criterion(y_preds_s, y_preds_t)

        losses.update(loss.item(), batch_size)
        # Scale each micro-batch so the accumulated gradient is an average.
        scaler.scale(loss / CFG.ACCUMLATION).backward()

        if (step + 1) % CFG.ACCUMLATION == 0:
            # Unscale and clip exactly once per optimizer step: calling
            # `unscale_` again before `update()` raises, and clipping partial
            # gradients would distort the accumulated update.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(student.parameters(), CFG.GRAD_NORM)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()

    return losses.avg

def valid_fn(valid_loader, model, criterion):
    """Evaluate ``model`` on ``valid_loader`` without computing gradients.

    Args:
        valid_loader: yields ``(inputs, labels)`` batches.
        model: model to evaluate; switched to eval mode.
        criterion: loss between predictions and ``labels``.

    Returns:
        ``(average loss, predictions)`` where the average is sample-weighted
        and ``predictions`` is the row-wise concatenation of all batch outputs
        as a NumPy array.

    The loss is reported as computed: it is no longer divided by
    ``CFG.ACCUMLATION``, a training-only setting.
    """
    losses = AverageMeter()
    model.eval()
    preds = []
    for inputs, labels in valid_loader:
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        losses.update(loss.item(), labels.size(0))
        preds.append(y_preds.to('cpu').numpy())

    predictions = np.concatenate(preds)
    return losses.avg, predictions

In [None]:
# One pseudo-labelled frame per fold, indexed by fold number. Each is brought to the
# train_df column layout: `text` becomes `full_text`, the helper columns are dropped,
# and `fold` plus a synthetic `text_id` are set.
pseudo_dfs = []
for fold in range(CFG.N_FOLD):
    pseudo_fold_df = (
        pd.read_csv(f"./pseudo_v2_f{fold}.csv")
        .rename(columns={"text": "full_text"})
        .drop(["ARI", "predicted_grade", "tokenize_length"], axis=1)
        .assign(
            fold=fold,
            text_id=lambda d: [f"pseudo_{i}" for i in range(len(d))],
        )
    )
    pseudo_dfs.append(pseudo_fold_df)

In [None]:
def train_loop(fold, seed):
    """Distil the teacher of one fold into a student and keep the best checkpoint.

    Args:
        fold: fold used for validation; the student trains on the remaining
            folds of ``train_df`` plus ``pseudo_dfs[fold]``.
        seed: only used in the checkpoint file name.

    Returns:
        The best (lowest) validation score over all epochs.
    """
    valid_df = train_df.query(f"fold=={fold}")
    valid_labels = valid_df[CFG.TARGETS].values
    train_dataset = FB3Dataset(
        pd.concat([train_df.query(f"fold!={fold}"), pseudo_dfs[fold]], axis=0).reset_index(drop=True)
    )
    valid_dataset = FB3Dataset(valid_df)
    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.BS,
                              shuffle=True,
                              num_workers=CFG.N_WORKER, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.BS * 2,
                              shuffle=False,
                              num_workers=CFG.N_WORKER, pin_memory=True, drop_last=False)

    student = CustomModel("microsoft/deberta-v3-xsmall")
    teacher = CustomModel(CFG.MODEL_NAME)

    # CFG.OUTPUT has no trailing slash, so paths must be joined (plain string
    # concatenation produced ".../outputconfig.pth"); make sure the directory exists.
    os.makedirs(CFG.OUTPUT, exist_ok=True)
    torch.save(student.config, os.path.join(CFG.OUTPUT, 'config.pth'))
    # NOTE(review): presumably this checkpoint has the same layout as the one
    # saved below ('model' + 'predictions'); newer PyTorch releases default to
    # weights_only loading, which may reject non-tensor payloads -- confirm.
    state = torch.load(f"./microsoft-deberta-v3-base_seed0_fold{fold}_best.pth",
                       map_location=torch.device('cpu'))
    teacher.load_state_dict(state['model'])
    student.to(device)
    teacher.to(device)
    teacher.eval()

    optimizer_parameters = get_optimizer_params(student,
                                                encoder_lr=CFG.ENCODER_LR,
                                                decoder_lr=CFG.DECODER_LR,
                                                weight_decay=CFG.WEIGHT_DECAY)

    optimizer = AdamW(optimizer_parameters, lr=CFG.ENCODER_LR, eps=CFG.EPS, betas=CFG.BETAS)
    # Number of scheduler steps actually taken: batches per epoch (the loader
    # drops the last partial batch) divided by the accumulation factor.
    num_train_steps = (len(train_loader) // CFG.ACCUMLATION) * CFG.N_EPOCH
    scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=CFG.N_WARMUP, num_training_steps=num_train_steps, num_cycles=CFG.N_CYCLES
    )
    criterion = nn.SmoothL1Loss(reduction='mean')
    kl_loss = nn.KLDivLoss(reduction="batchmean")

    best_score = float("inf")
    for epoch in range(CFG.N_EPOCH):
        avg_loss = train_fn(epoch, train_loader, teacher, student, criterion, kl_loss, optimizer, scheduler)
        avg_val_loss, predictions = valid_fn(valid_loader, student, criterion)
        score, _ = get_score(valid_labels, predictions)
        if best_score > score:
            best_score = score
            # NOTE(review): the file name is derived from CFG.MODEL_NAME (the
            # teacher backbone) although the saved weights belong to the student.
            torch.save({'model': student.state_dict(),
                        'predictions': predictions},
                         f"{CFG.OUTPUT}/{CFG.MODEL_NAME.replace('/', '-')}_seed{seed}_fold{fold}_best.pth")
        print(f"[Fold-{fold}] epoch-{epoch}: score={score}")
        torch.cuda.empty_cache()
        gc.collect()

    return best_score

In [None]:
def main(seed):
    """Train every fold not listed in ``CFG.SKIP_FOLDS`` and print the scores.

    Args:
        seed: seed applied through ``seed_everything`` and passed to ``train_loop``.

    Prints the list of best per-fold scores, then their mean. The mean is taken
    over the folds that were actually trained; dividing by ``CFG.N_FOLD``
    under-reported it whenever folds were skipped.
    """
    seed_everything(seed)
    scores = []
    for fold in range(CFG.N_FOLD):
        if fold in CFG.SKIP_FOLDS:
            continue
        scores.append(train_loop(fold, seed))
    print(scores)
    if scores:
        print(sum(scores) / len(scores))
    else:
        print("no fold was trained (all folds are listed in CFG.SKIP_FOLDS)")

In [None]:
# Entry point: run main() with the seed configured in CFG.LOCAL_SEED.
if __name__ == '__main__':
    main(CFG.LOCAL_SEED)

In [None]:
# plain distillation: 0.4586991475202113
# distillation together with the regular labels: 0.4610501424488637
# mask 0.2: 0.4920362723590115


# base; 0.4659324501443643, 0.4631171619592938
# ---------------------------------------

# 1.0; 0.48xxx
# 1e-1; 0.47074394338866504
# 1e-2; 0.4646448016043702
# 1e-3; 0.4629622086679886, 0.4631343361664209
# 1e-4; 0.46315064678772244


# 4 layer; 0.46618104604628985
# 1 layer; 0.4647979115503782

# label: 0.4633481539801743

```
[Fold-1] epoch-0: score=0.48272674656153364
100%
366/366 [06:13<00:00, 1.09s/it]
[Fold-1] epoch-1: score=0.47691091973256344
100%
366/366 [06:10<00:00, 1.17it/s]
[Fold-1] epoch-2: score=0.46621649542127847
100%
366/366 [06:14<00:00, 1.06s/it]
[Fold-1] epoch-3: score=0.4631343361664209
```

```
[Fold-1] epoch-0: score=0.5017394744811813
100%
366/366 [06:10<00:00, 1.07s/it]
[Fold-1] epoch-1: score=0.493347700873547
100%
366/366 [06:07<00:00, 1.18it/s]
[Fold-1] epoch-2: score=0.4827352103163725
100%
366/366 [06:11<00:00, 1.05s/it]
[Fold-1] epoch-3: score=0.4791843935000248
```

```
[Fold-1] epoch-0: score=0.634812305090922
100%
366/366 [06:10<00:00, 1.08s/it]
[Fold-1] epoch-1: score=0.5402783404093685
100%
366/366 [06:07<00:00, 1.19it/s]
[Fold-1] epoch-2: score=0.49520196558257107
100%
366/366 [06:11<00:00, 1.05s/it]
[Fold-1] epoch-3: score=0.4839299213206399
[0.4839299213206399]
0.12098248033015997
```

```
[Fold-1] epoch-0: score=0.7845522330439376
100%
366/366 [06:04<00:00, 1.08it/s]
[Fold-1] epoch-1: score=0.5016520683400798
100%
366/366 [06:05<00:00, 1.07s/it]
[Fold-1] epoch-2: score=0.4919881528497781
100%
366/366 [06:12<00:00, 1.05it/s]
[Fold-1] epoch-3: score=0.48170858445447107
```


### スコアメモ

#### basic lossのみ

|epoch|score|
|--|--|
|0|0.526931088418688|
|1||
|2||
|3||

#### basic loss + pseudo

|epoch|score|
|--|--|
|0|0.46634102132671007|
|1|0.4600812197041982|
|2|0.46426920637470737|
|3|0.46183874646246315|

#### basic loss + norm 0.01

|epoch|score|
|--|--|
|0|0.5363891288129078|
|1|0.4848848712197243|
|2|0.4729763663353857|
|3|0.4659324501443643|

#### basic loss + norm 1.0

|epoch|score|
|--|--|
|0|0.5261090139783512|
|1|0.491587607694613|
|2|0.4890059951163896|
|3||


#### 自己蒸留

#### 1層(KL loss)

|epoch|score|
|--|--|
|0|0.6494795403379258|
|1|0.6xxx|
|2||
|3||

#### 1層

|epoch|score|
|--|--|
|0|0.5031831317162344|
|1||
|2||
|3||

#### 2層

|epoch|score|
|--|--|
|0|0.49955065566735096|
|1|0.5446511805141959|
|2|0.4856458101870027|
|3|0.4808126223513056|

#### 2層 layer hint

|epoch|score|
|--|--|
|0|0.5030051283685592|
|1|0.5550490815798361|
|2|0.4870029197380179|
|3|0.4815796639136917|

#### 2層 layer hint KL

|epoch|score|
|--|--|
|0|0.6493452335084183|
|1|0.6484945935918974|
|2||
|3||


- AVG: 0.47679568644182346
- AVG / 2: 0.474152817510402
- AVG / 4: 0.47159031839216814

#### 4層

|epoch|score|
|--|--|
|0|0.51080632591456|
|1|0.4972932551819587|
|2|0.50240882382299|
|3||

#### ベースライン

|epoch|score|
|--|--|
|0|2.2438404386854334|
|1|1.4579084189192848|
|2|1.145673294975656|
|3||

#### ターゲットのKLは計算しない

|epoch|score|
|--|--|
|0|2.9955258771870557|
|1|2.7831887114903076|
|2||
|3||

#### L2 loss

|epoch|score|
|--|--|
|0|7.229032346756473|
|1|11.169361000883642|
|2|13.013728330459623|
|3|13.306884086590356|

#### cos loss

|epoch|score|
|--|--|
|0|7.219968155264872|
|1|11.167937634035303|
|2||
|3||

#### 各レイヤーのloss

##### 4層

|epoch|score|
|--|--|
|0|2.6595548899905315|
|1|2.207810980642428|
|2||
|3||

##### 2層

|epoch|score|
|--|--|
|0|2.5768557287045444|
|1|1.9825857271857064|
|2||
|3||

##### feature層

|epoch|score|
|--|--|
|0||
|1||
|2||
|3||

#### Layerリセット

##### 1層

|epoch|score|
|--|--|
|0|2.031728988555573|
|1|1.2354769068980633|
|2||
|3||

##### 1層+MSE

|epoch|score|
|--|--|
|0|8.05171028512562|
|1||
|2||
|3||

##### 2層

|epoch|score|
|--|--|
|0|2.051898461291952|
|1||
|2||
|3||

#### パラメータチューニング(1層リセット)

encoder = 5e-5

|epoch|score|
|--|--|
|0|2.0558456883621465|
|1|1.293154511884232|
|2|0.9928888108647462|
|3||

encoder = 2e-4

|epoch|score|
|--|--|
|0|2.053561709873904|
|1||
|2||
|3||

#### CLSトークン

|epoch|score|
|--|--|
|0||
|1||
|2||
|3||