In [None]:
# Show which GPU (if any) is attached to this runtime before doing anything else.
!nvidia-smi

In [None]:
# Mount Google Drive at /content/drive; later cells read the Kaggle token and
# pseudo labels from it and write the tokenizer, config and checkpoints back to it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install -q kaggle
!mkdir -p .kaggle
!cp "./drive/MyDrive/Study/config/kaggle.json" .kaggle/
!chmod 600 .kaggle/kaggle.json
!mv .kaggle /root

In [None]:
!kaggle competitions download -c feedback-prize-english-language-learning
!kaggle kernels output takamichitoda/fb3-make-pseudo-label-3rd

!unzip feedback-prize-english-language-learning.zip

!rm -rf feedback-prize-english-language-learning.zip

In [None]:
# Extra libraries used below: iterstrat (MultilabelStratifiedKFold) and transformers.
# sentencepiece is presumably required by the DeBERTa-v3 tokenizer — TODO confirm.
# NOTE(review): transformers and sentencepiece are not pinned, so the environment
# depends on the install date; consider pinning the versions that were actually used.
!pip install iterative-stratification==0.1.7
!pip install transformers
!pip install sentencepiece

## メイン処理

In [None]:
import gc
import os
import random
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from tqdm.auto import tqdm
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from sklearn.metrics import mean_squared_error

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from torch.utils.checkpoint import checkpoint

import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup


# Pick the compute device once; `is_gpu` also switches mixed precision on/off
# (GradScaler here, autocast in the training loop).
is_gpu = torch.cuda.is_available()
device = torch.device('cuda' if is_gpu else 'cpu')
# Shared gradient scaler for mixed-precision training; constructed disabled when there is no GPU.
scaler = torch.cuda.amp.GradScaler(enabled=is_gpu)

# Set TOKENIZERS_PARALLELISM for this kernel, then report the device and library version.
%env TOKENIZERS_PARALLELISM=true
print(device)
print(f"transformers.__version__: {transformers.__version__}")

In [None]:
class CFG:
    """Central configuration for this notebook: paths, CV setup, model choice and training hyper-parameters."""
    # --- paths / cross-validation ---
    INPUT = "/content"  # directory train.csv is read from
    OUTPUT = "/content/drive/MyDrive/Study/FB3/output/"  # tokenizer, config and checkpoints are written here
    SEED = 0  # random_state of the fold splitter
    N_FOLD = 4  # number of CV folds
    TARGETS = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']  # label columns
    
    # --- model / tokenizer ---
    MODEL_NAME = "microsoft/deberta-v3-xsmall"
    TOKENIZER = None  # filled in by a later cell once the tokenizer has been loaded
    # Tokenizer max_length (texts are truncated/padded to it); the trailing values are
    # presumably earlier settings that were tried — TODO confirm.
    MAX_LEN = 768 #512 #1429 - 1
    GRAD_CHECKPOINT = True  # enable gradient checkpointing on the backbone
    
    # --- optimisation ---
    N_EPOCH = 4 * 2
    N_WORKER = 2  # DataLoader worker processes
    ENCODER_LR = 5e-4  # learning rate for the backbone parameters
    DECODER_LR = 5e-3  # learning rate for everything outside the backbone
    EPS = 1e-6  # AdamW eps
    BETAS = (0.9, 0.999)  # AdamW betas
    WEIGHT_DECAY = 0.01  # applied to backbone weights only (not bias / LayerNorm)
    N_WARMUP = 0  # warm-up steps of the cosine schedule
    N_CYCLES = 0.5  # num_cycles of the cosine schedule
    
    BS = 8 * 16  # batch size for both train and validation loaders
    ACCUMLATION = 1  # gradient-accumulation steps (name is misspelled but used throughout the notebook)
    
    GRAD_NORM = 0.01  # max_norm passed to clip_grad_norm_
    
    SKIP_FOLDS = [0,2,3]  # folds that main() does not train
    LOCAL_SEED = 0  # seed passed to main(); also embedded in checkpoint file names

In [None]:
# Load the competition training table (essay text plus the target columns) and preview it.
train_df = pd.read_csv(f"{CFG.INPUT}/train.csv")
train_df.head()

In [None]:
# Assign every training row to a validation fold, stratifying on the target columns.
cv = MultilabelStratifiedKFold(n_splits=CFG.N_FOLD, shuffle=True, random_state=CFG.SEED)
for n, (train_index, valid_index) in enumerate(cv.split(train_df, train_df[CFG.TARGETS])):
    # split() yields positional indices; using them with .loc relies on train_df
    # still having the default 0..N-1 index it got from read_csv.
    train_df.loc[valid_index, 'fold'] = int(n)
# Cast the freshly created column to a plain integer dtype.
train_df['fold'] = train_df['fold'].astype(int)

In [None]:
# Load the tokenizer for the chosen backbone, save a copy next to the checkpoints,
# and expose it through CFG so the dataset class can reach it.
TOKENIZER = AutoTokenizer.from_pretrained(CFG.MODEL_NAME)
TOKENIZER.save_pretrained(CFG.OUTPUT+'tokenizer/')
CFG.TOKENIZER = TOKENIZER
# Drop the module-level name; CFG.TOKENIZER is the only handle used from here on.
del TOKENIZER

In [None]:
class FB3Dataset(Dataset):
    """Dataset yielding (tokenized essay, target vector) pairs.

    Args:
        df: DataFrame with a ``full_text`` column and one column per entry of
            ``CFG.TARGETS``.

    Each item is tokenized on the fly with ``CFG.TOKENIZER`` and padded/truncated to
    exactly ``CFG.MAX_LEN`` tokens, so the default DataLoader collation can stack items.
    """

    def __init__(self, df):
        self.texts = df['full_text'].values
        self.labels = df[CFG.TARGETS].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Return ``(inputs, label)``: a mapping of long tensors and a float tensor of targets."""
        # `padding="max_length"` is the supported spelling of the deprecated
        # `pad_to_max_length=True`; calling the tokenizer directly replaces `encode_plus`.
        inputs = CFG.TOKENIZER(
            self.texts[item],
            return_tensors=None,
            add_special_tokens=True,
            max_length=CFG.MAX_LEN,
            padding="max_length",
            truncation=True
        )
        # The tokenizer returned plain Python lists (return_tensors=None); convert each field.
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label
    

def collate(inputs):
    """Trim a padded batch to the longest real sequence it contains.

    Args:
        inputs: mapping of 2-D tensors that includes ``"attention_mask"``.

    Returns:
        The same mapping object, with every tensor cut (in place) along dim 1 to the
        largest per-row sum of the attention mask.
    """
    longest = int(inputs["attention_mask"].sum(axis=1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :longest]
    return inputs

In [None]:
class WeightedLayerPooling(nn.Module):
    """Weighted average over the hidden states of several layers.

    Args:
        num_hidden_layers: number of layers of the backbone.
        layer_start: index of the first entry of the stacked hidden states to pool.
        layer_weights: optional 1-D tensor with one weight per pooled layer; when
            omitted, a learnable parameter initialised to ones is created with
            ``num_hidden_layers + 1 - layer_start`` entries.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            n_pooled = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(torch.tensor([1] * n_pooled, dtype=torch.float))
        self.layer_weights = layer_weights

    def forward(self, all_hidden_states):
        """Pool a 4-D stack of hidden states over its first dimension.

        Returns the weighted sum of the entries from ``layer_start`` onwards, divided
        by the sum of the weights.
        """
        pooled_layers = all_hidden_states[self.layer_start:, :, :, :]
        # Append three trailing singleton dims, then broadcast to the stack's shape.
        per_layer = self.layer_weights[..., None, None, None].expand_as(pooled_layers)
        weighted_sum = (per_layer * pooled_layers).sum(dim=0)
        return weighted_sum / self.layer_weights.sum()

class MeanPooling(nn.Module):
    """Mask-aware mean over dim 1 of a hidden-state tensor."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Average ``last_hidden_state`` over the positions where ``attention_mask`` is non-zero.

        The mask gets a trailing dimension and is broadcast to the hidden-state shape;
        the denominator is clamped to 1e-9 so an all-zero mask yields zeros instead
        of a division by zero.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        token_sum = (last_hidden_state * mask).sum(1)
        token_count = mask.sum(1).clamp(min=1e-9)
        return token_sum / token_count
        
class CustomModel(nn.Module):
    """Transformer backbone + attention pooling + linear head producing 6 regression outputs.

    Args:
        model_name: Hugging Face model id or path handed to ``AutoConfig`` / ``AutoModel``.

    The backbone is loaded with all dropout probabilities set to 0 and its depth cut
    to 4 layers. Token representations are pooled with a learned attention over the
    sequence; padded positions are excluded from that attention.
    """

    def __init__(self, model_name):
        super().__init__()

        self.config = AutoConfig.from_pretrained(model_name, output_hidden_states=True)
        # Switch off every dropout knob; several spellings are set because the
        # attribute name differs between architectures.
        self.config.hidden_dropout = 0.
        self.config.hidden_dropout_prob = 0.
        self.config.attention_dropout = 0.
        self.config.attention_probs_dropout_prob = 0.
        self.config.num_hidden_layers = 4 # org 12

        self.model = AutoModel.from_pretrained(model_name, config=self.config)

        if CFG.GRAD_CHECKPOINT:
            self.model.gradient_checkpointing_enable()

        # Per-token scorer for attention pooling. The softmax is applied in forward()
        # after padded positions have been masked out. The Linear layers keep indices
        # 0 and 2, so state_dict keys are unchanged from the version that had a
        # trailing nn.Softmax module.
        self.att = nn.Sequential(
            nn.Linear(self.config.hidden_size, self.config.hidden_size),
            nn.Tanh(),
            nn.Linear(self.config.hidden_size, 1),
        )

        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)

        # Optional re-initialisation of the last `reinit_layer` encoder blocks
        # (disabled while the value is not positive).
        reinit_layer = -1
        if reinit_layer > 0:
            for layer in self.model.encoder.layer[-reinit_layer:]:
                for module in layer.modules():
                    self._init_weights(module)

    def _init_weights(self, module):
        """Re-initialise ``module`` in place: normal weights for Linear/Embedding
        (std = ``config.initializer_range``), zero bias, identity LayerNorm."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, inputs):
        """Run the backbone on ``inputs`` (a mapping of tokenizer tensors) and return
        a ``(batch, 6)`` tensor of predictions."""
        outputs = self.model(**inputs)
        last_hidden_state = outputs[0]

        # Softmax in float32 so masking with the dtype minimum is safe under autocast.
        att_scores = self.att(last_hidden_state).float()
        attention_mask = inputs.get('attention_mask')
        if attention_mask is not None:
            # Without this mask the softmax also spreads weight over padding tokens,
            # which makes a sample's prediction depend on how much padding its batch has.
            pad_positions = attention_mask.unsqueeze(-1) == 0
            att_scores = att_scores.masked_fill(pad_positions, torch.finfo(att_scores.dtype).min)
        att_weights = torch.softmax(att_scores, dim=1)
        feature = torch.sum(att_weights * last_hidden_state, dim=1)

        output = self.fc(feature)

        return output

In [None]:
class AverageMeter(object):
    """Running weighted average that also remembers the latest value."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero the last value, the average, the weighted sum and the sample count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the running average."""
        self.val = val
        self.sum, self.count = self.sum + val * n, self.count + n
        self.avg = self.sum / self.count
        
def MCRMSE(y_trues, y_preds):
    """Mean column-wise root mean squared error.

    Args:
        y_trues: 2-D array-like of ground-truth values, one column per target.
        y_preds: 2-D array-like of predictions with the same shape.

    Returns:
        ``(mcrmse_score, scores)``: the mean of the per-column RMSEs and the list of
        per-column RMSEs.

    Raises:
        ValueError: if the inputs are not 2-D or their shapes differ.
    """
    y_trues = np.asarray(y_trues, dtype=float)
    y_preds = np.asarray(y_preds, dtype=float)
    if y_trues.ndim != 2 or y_trues.shape != y_preds.shape:
        raise ValueError(
            f"expected two 2-D arrays of equal shape, got {y_trues.shape} and {y_preds.shape}"
        )
    # Computed with numpy rather than sklearn's mean_squared_error(..., squared=False):
    # that keyword is deprecated and no longer accepted by recent scikit-learn releases.
    scores = np.sqrt(np.mean((y_trues - y_preds) ** 2, axis=0)).tolist()
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    """Score predictions with MCRMSE; returns ``(mean score, per-column scores)``."""
    return MCRMSE(y_trues, y_preds)


def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch (CPU and CUDA) RNGs, export PYTHONHASHSEED,
    and request deterministic cuDNN kernels."""
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True

In [None]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    """Build optimizer parameter groups with separate learning rates for backbone and head.

    Args:
        model: module exposing its backbone as ``model.model``; everything else is
            treated as the head.
        encoder_lr: learning rate for backbone parameters.
        decoder_lr: learning rate for head parameters.
        weight_decay: decay applied to backbone parameters whose names do not match
            the no-decay patterns (bias / LayerNorm).

    Returns:
        A list of three dicts: backbone with decay, backbone without decay, head
        (without decay).
    """
    no_decay = ("bias", "LayerNorm.bias", "LayerNorm.weight")

    encoder_decay, encoder_no_decay = [], []
    for name, param in model.model.named_parameters():
        bucket = encoder_no_decay if any(nd in name for nd in no_decay) else encoder_decay
        bucket.append(param)

    # Select head parameters by prefix. The earlier substring test ("model" not in n)
    # silently left out any head parameter whose name merely contains "model".
    head_params = [p for n, p in model.named_parameters() if not n.startswith("model.")]

    return [
        {'params': encoder_decay, 'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': encoder_no_decay, 'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': head_params, 'lr': decoder_lr, 'weight_decay': 0.0},
    ]

In [None]:
def train_fn(epoch, train_loader, student, criterion, optimizer, scheduler):
    """Train ``student`` for one pass over ``train_loader`` with mixed precision.

    Args:
        epoch: current epoch index (accepted for call-site symmetry; not used here).
        train_loader: DataLoader yielding ``(inputs, labels)`` batches.
        student: model to train.
        criterion: loss function applied to ``(predictions, labels)``.
        optimizer: optimizer wrapped by the module-level ``scaler``.
        scheduler: LR scheduler, stepped once per optimizer step.

    Returns:
        The sample-weighted average training loss over the epoch.
    """
    student.train()

    losses = AverageMeter()
    # Start from clean gradients so nothing left over from an incomplete accumulation
    # window of a previous call leaks into this epoch.
    optimizer.zero_grad()
    for step, (inputs, labels) in tqdm(enumerate(train_loader), total=len(train_loader)):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)

        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=is_gpu):
            y_preds = student(inputs)
            loss = criterion(y_preds, labels)

        # Log the real per-batch loss, before it is scaled down for accumulation.
        losses.update(loss.item(), batch_size)

        if CFG.ACCUMLATION > 1:
            loss = loss / CFG.ACCUMLATION
        scaler.scale(loss).backward()

        if (step + 1) % CFG.ACCUMLATION == 0:
            # Unscale and clip only right before the optimizer step: GradScaler allows
            # unscale_() once per step, so doing it on every micro-batch raises as soon
            # as ACCUMLATION > 1 (and would clip partially accumulated gradients).
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(student.parameters(), CFG.GRAD_NORM)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            scheduler.step()

    return losses.avg

def valid_fn(valid_loader, model, criterion):
    """Evaluate ``model`` on ``valid_loader`` without gradient tracking.

    Args:
        valid_loader: DataLoader yielding ``(inputs, labels)`` batches.
        model: model to evaluate (switched to eval mode here).
        criterion: loss function applied to ``(predictions, labels)``.

    Returns:
        ``(average loss, predictions)`` where predictions is a numpy array with the
        per-batch outputs concatenated in loader order.
    """
    losses = AverageMeter()
    model.eval()
    preds = []
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)

        # The loss is recorded as-is: gradient accumulation is a training-only device,
        # so dividing the validation loss by CFG.ACCUMLATION would only distort it.
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())

    predictions = np.concatenate(preds)
    return losses.avg, predictions

In [None]:
# Load one pseudo-labelled table per fold from the current working directory and
# bring it to the same column layout as the training frame.
pseudo_dfs = []
for fold in range(CFG.N_FOLD):
    _df = pd.read_csv(f"./pseudo_v3_f{fold}.csv")
    # Rename the text column to match train_df and drop the auxiliary columns.
    _df = _df.rename(columns = {"text":"full_text"}).drop(["ARI", "predicted_grade", "tokenize_length"], axis=1)
    _df["fold"] = fold
    # Synthetic ids; note they repeat across the per-fold tables.
    _df["text_id"] = [f"pseudo_{i}" for i in range(len(_df))]
    pseudo_dfs.append(_df)

In [None]:
def train_loop(fold, seed):
    """Train and validate one fold, saving a checkpoint after every epoch.

    Args:
        fold: index of the fold held out for validation.
        seed: run seed; only used here to name the checkpoint files.

    Returns:
        The best (lowest) validation MCRMSE reached over the epochs.

    Side effects:
        Writes ``config.pth``, a ``*_best.pth`` checkpoint and one ``*_epoch{n}.pth``
        checkpoint per epoch under ``CFG.OUTPUT``.
    """
    valid_df = train_df.query(f"fold=={fold}")

    # Training targets are overwritten with the labels stored for this fold on Drive
    # (presumably pseudo labels from an earlier round — confirm); validation keeps
    # the labels from train_df.
    _train_df = train_df.copy()
    _train_df[CFG.TARGETS] = np.load(f'/content/drive/MyDrive/Study/FB3/pseudo_2nd/pseudo_{fold}.npy')

    valid_labels = valid_df[CFG.TARGETS].values
    train_dataset = FB3Dataset(pd.concat([_train_df.query(f"fold!={fold}"), pseudo_dfs[fold]], axis=0).reset_index(drop=True))
    valid_dataset = FB3Dataset(valid_df)
    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.BS,
                              shuffle=True,
                              num_workers=CFG.N_WORKER, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.BS,
                              shuffle=False,
                              num_workers=CFG.N_WORKER, pin_memory=True, drop_last=False)

    student = CustomModel(CFG.MODEL_NAME)
    torch.save(student.config, os.path.join(CFG.OUTPUT, 'config.pth'))
    student.to(device)

    optimizer_parameters = get_optimizer_params(student,
                                                encoder_lr=CFG.ENCODER_LR,
                                                decoder_lr=CFG.DECODER_LR,
                                                weight_decay=CFG.WEIGHT_DECAY)

    optimizer = AdamW(optimizer_parameters, lr=CFG.ENCODER_LR, eps=CFG.EPS, betas=CFG.BETAS)
    # The schedule horizon must equal the number of scheduler.step() calls: the loader
    # drops the last partial batch and the scheduler advances once per accumulation
    # window, so count optimizer steps from the loader rather than from len(dataset) / BS.
    steps_per_epoch = len(train_loader) // CFG.ACCUMLATION
    num_train_steps = steps_per_epoch * CFG.N_EPOCH
    scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=CFG.N_WARMUP, num_training_steps=num_train_steps, num_cycles=CFG.N_CYCLES
    )
    criterion = nn.SmoothL1Loss(reduction='mean', beta=1.0)

    # Common prefix of every checkpoint written for this (seed, fold) pair.
    ckpt_prefix = os.path.join(CFG.OUTPUT, f"{CFG.MODEL_NAME.replace('/', '-')}_seed{seed}_fold{fold}")

    best_score = float("inf")
    for epoch in range(CFG.N_EPOCH):
        avg_loss = train_fn(epoch, train_loader, student, criterion, optimizer, scheduler)
        avg_val_loss, predictions = valid_fn(valid_loader, student, criterion)
        score, _ = get_score(valid_labels, predictions)
        if best_score > score:
            best_score = score
            torch.save({'model': student.state_dict(),
                        'predictions': predictions},
                       f"{ckpt_prefix}_best.pth")
        print(f"[Fold-{fold}] epoch-{epoch}: score={score}")

        # Also keep a per-epoch snapshot, whether or not the score improved.
        torch.save({'model': student.state_dict(),
                    'predictions': predictions},
                   f"{ckpt_prefix}_epoch{epoch}.pth")
        torch.cuda.empty_cache()
        gc.collect()

    return best_score

In [None]:
def main(seed):
    """Seed all RNGs, train every fold not listed in ``CFG.SKIP_FOLDS``, and print
    the per-fold best scores followed by their mean."""
    seed_everything(seed)
    scores = []
    for fold in range(CFG.N_FOLD):
        if fold in CFG.SKIP_FOLDS:
            continue
        scores.append(train_loop(fold, seed))
    print(scores)
    # Average over the folds that actually ran; dividing by CFG.N_FOLD understates
    # the mean whenever some folds are skipped.
    if scores:
        print(sum(scores) / len(scores))
    else:
        print("no folds were trained (every fold is listed in CFG.SKIP_FOLDS)")

In [None]:
# Kick off training with the configured seed when the notebook (or exported script) is run directly.
if __name__ == '__main__':
    main(CFG.LOCAL_SEED)

```
[Fold-1] epoch-0: score=0.4697210373809723
[Fold-1] epoch-1: score=0.4590620076505552
[Fold-1] epoch-2: score=0.4556656215482453
```

In [None]:
# List the output directory to confirm which artifacts were written.
!ls {CFG.OUTPUT}