In [1]:
!pip install -q pyspellchecker textstat

[0m

In [2]:
import os

class Config:
    AUTHOR = "wanwan7123"

    NAME = "commonlit-exp014-deberta-v3-base"
    MODEL_PATH = "microsoft/deberta-v3-base"
    ROOT = "/notebooks"

    apex=True
    seed = 42
    num_fold = 4
    trn_fold = [0, 1, 2, 3]
    batch_size = 8
    n_epochs = 3
    max_len = 768
    target_list = ["content", "wording"]
    n_targets = len(target_list)
    
    weight_decay = 0.01
    scheduler='cosine'
    betas = (0.9, 0.999)
    encoder_lr = 2e-5
    decoder_lr = 2e-5
    lr_weight_decay = 1.00
    min_lr = 1e-6
    eps = 1e-6
    eval_step = 250
    num_cycles=0.5
    num_warmup_steps_rate=0.1
    clip_grad_norm = 1000
    gradient_accumulation_steps = 1
    
    # GPU Optimize Settings
    gpu_optimize_config= {
        "freezing": False,
        "gradient_checkpoint": True
    }

In [3]:
import os
import re
import gc
import sys
import json
import time
import shutil
import joblib
import random
import requests
import warnings
warnings.filterwarnings('ignore')
from ast import literal_eval
from tqdm.auto import tqdm
from pathlib import Path
from glob import glob

import numpy as np
import pandas as pd
import scipy 
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import (
    StratifiedKFold, 
    KFold, 
    GroupKFold,
    StratifiedGroupKFold
)
from sklearn.metrics import mean_squared_error

from spellchecker import SpellChecker
import textstat

! pip install torch==1.10.1+cu113 -f https://download.pytorch.org/whl/torch_stable.html
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.utils.checkpoint import checkpoint
from torch.cuda.amp import autocast, GradScaler
import torch.nn.functional as F

!pip install text-unidecode

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.10.1+cu113
  Downloading https://download.pytorch.org/whl/cu113/torch-1.10.1%2Bcu113-cp39-cp39-linux_x86_64.whl (1821.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 GB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:02[0mm
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 1.12.1+cu116
    Uninstalling torch-1.12.1+cu116:
      Successfully uninstalled torch-1.12.1+cu116
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.13.1+cu116 requires torch==1.12.1, but you have torch 1.10.1+cu113 which is incompatible.
torchaudio 0.12.1+cu116 requires torch==1.12.1, but you have torch 1.10.1+cu113 which is incompatible.[0m[31m
[0mSuccessfully installed torch-1.10.1+cu113


# utils

In [4]:
def setup(cfg):
    cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # pip install
    ! pip install -qq transformers==4.16.2
    ! pip install -qq tokenizers==0.11.6
    ! pip install -qq transformers[sentencepiece]

    # set dirs    
    cfg.INPUT = Path(f"{cfg.ROOT}/input")
    cfg.OUTPUT = Path(f"{cfg.ROOT}/output")
    cfg.EXP = cfg.OUTPUT / cfg.NAME

    cfg.api_path = f"{cfg.ROOT}/kaggle.json"

    cfg.EXP_MODEL = cfg.EXP / "model"
    cfg.EXP_PREDS = cfg.EXP / "preds"

    # make dirs
    for d in [cfg.EXP, cfg.EXP_MODEL, cfg.EXP_PREDS]:
        d.mkdir(exist_ok=True)
        
    # use kaggle api (need kaggle token)
    f = open(cfg.api_path, 'r')
    json_data = json.load(f) 
    os.environ['KAGGLE_USERNAME'] = json_data['username']
    os.environ['KAGGLE_KEY'] = json_data['key']
    
    return cfg

def dataset_create_new(dataset_name, upload_dir):
    dataset_metadata = {}
    dataset_metadata['id'] = f'{os.environ["KAGGLE_USERNAME"]}/{dataset_name}'
    dataset_metadata['licenses'] = [{'name': 'CC0-1.0'}]
    dataset_metadata['title'] = dataset_name
    with open(os.path.join(upload_dir, 'dataset-metadata.json'), 'w') as f:
        json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode='tar')

In [5]:
# =====================
# Utils
# =====================
# Seed
def set_seed(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

def get_commonlit_fold(train):
    id2fold = {
        "814d6b": 0,
        "39c16e": 1,
        "3b9047": 2,
        "ebad26": 3,
    }
    train["fold"] = train["prompt_id"].map(id2fold)
    return train["fold"]

In [6]:
def mcrmse(cfg, preds, df, verbose = True):
    all_score = 0
    for i, column in enumerate(cfg.target_list):
        score = np.sqrt(mean_squared_error(preds[:, i], df[column]))
        if verbose:
            print(f"{column} rmse:", score)
        all_score += score/len(cfg.target_list)
    return all_score

# preprocess

In [7]:
from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    return error.object[error.start : error.end].encode("utf-8"), error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    return error.object[error.start : error.end].decode("cp1252"), error.end

# Register the encoding and decoding error handlers for `utf-8` and `cp1252`.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Resolve the encoding problems and normalize the abnormal characters."""
    text = (
        text.encode("raw_unicode_escape")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
        .encode("cp1252", errors="replace_encoding_with_utf8")
        .decode("utf-8", errors="replace_decoding_with_cp1252")
    )
    text = unidecode(text)
    return text

In [8]:
def processing_features(df):
    df['processed_text'] = df['text'].apply(lambda x : resolve_encodings_and_normalize(x))
    df['full_text'] = df["prompt_title"] + " [SEP] " + df["prompt_question"] + " [SEP] " + df["processed_text"]
    return df

In [9]:
%load_ext Cython

In [10]:
%%cython

import numpy as np
cimport numpy as cnp
cimport cython

@cython.boundscheck(False)  # Arrayの境界チェックを無効化
@cython.wraparound(False)   # 負のインデックスを無効化
def longest_common_substring(str s1, str s2):
    cdef int m, n, i, j, longest
    m, n = len(s1), len(s2)
    
    # Using numpy to initialize the 2D array
    cdef cnp.ndarray[int, ndim=2] dp = np.zeros((m+1, n+1), dtype=np.int32)
    
    longest = 0
    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0 or j == 0:
                dp[i, j] = 0
            elif s1[i - 1] == s2[j - 1]:
                dp[i, j] = dp[i - 1, j - 1] + 1
                longest = max(longest, dp[i, j])
            else:
                dp[i, j] = 0
    return longest

In file included from /usr/local/lib/python3.9/dist-packages/numpy/core/include/numpy/ndarraytypes.h:1948,
                 from /usr/local/lib/python3.9/dist-packages/numpy/core/include/numpy/ndarrayobject.h:12,
                 from /usr/local/lib/python3.9/dist-packages/numpy/core/include/numpy/arrayobject.h:5,
                 from /root/.cache/ipython/cython/_cython_magic_0507ba6f7af5325357e70c61fd0cf3f5.c:769:
      |  ^~~~~~~


In [11]:
def quotes_count(row):
    text = row['text']
    prompt_text = row['prompt_text']
    quotes_from_text = re.findall(r'"([^"]*)"', text)
    if len(quotes_from_text)>0:
        return [quote in prompt_text for quote in quotes_from_text].count(True)
    else:
        return 0
    
def ngram_co_occurrence(row, n=3):
    text = row["text"]
    prompt_text = row["prompt_text"]

    text_ngram = set(zip(*[text[i:] for i in range(n)]))
    prompt_ngram = set(zip(*[prompt_text[i:] for i in range(n)]))
    return len(text_ngram & prompt_ngram)

def feature_engineering(input_df: pd.DataFrame) -> pd.DataFrame:
    output_df = pd.DataFrame()

    # basic
    output_df["n_words"] = input_df["text"].apply(lambda x: len(x.split()))
    output_df["n_unique_words"] = input_df["text"].apply(lambda x: len(set(x.split())))
    output_df["num_sentences"] = input_df["text"].apply(lambda x: len(x.split('.')))
    output_df["is_upper"] = input_df["text"].apply(lambda x: x[0].isupper())
    output_df["mean_num_words"] = input_df["text"].apply(lambda x: np.mean([len(e.split()) for e in x.split('.')]))
    output_df["mean_num_unique_words"] = input_df["text"].apply(lambda x: np.mean([len(set(e.split())) for e in x.split('.')]))
    output_df["num_slash"] = input_df["text"].apply(lambda x: x.count("\n"))
    output_df["paragraph_count"] = input_df["text"].apply(lambda x: x.count("\n\n"))
    output_df["upper_count"] = input_df["text"].apply(lambda x: np.sum([w.isupper() for w in x.split()])/len(x.split()))
    output_df["syntax_count"] = input_df["text"].apply(lambda x: x.count(",") + x.count("-") + x.count(";") + x.count(":"))
    output_df["vocab_strength"] = output_df["n_unique_words"] / output_df["n_words"]
    output_df["new_vocab"] = input_df.apply(lambda x: len(set(x["text"].split()) - set(x["prompt_text"].split())), axis=1)

    # compare
    # overwrap word
    output_df["n_overwrap_unique_word"] = input_df.apply(lambda x: len(set(x["text"].split()) & set(x["prompt_text"].split())), axis=1)
    # longest common substring
    output_df["longest_common_substring"] = input_df.apply(lambda x: longest_common_substring(x["text"], x["prompt_text"]), axis=1)
    # quote
    output_df["quote_count"] = input_df.apply(quotes_count, axis=1)
    # ngram co occurrence
    for n in [3, 4, 5, 6, 7, 8]:
        output_df[f"n_co_occurrence_{n}"] = input_df.apply(ngram_co_occurrence, n=n, axis=1) / output_df["n_words"]
    
    
    # misspell
    spell = SpellChecker()
    output_df["n_misspell"] = input_df["text"].apply(lambda x: len(spell.unknown(x.split())))


    output_df['automated_readability_index'] = input_df["text"].apply(lambda x: textstat.automated_readability_index(x))
    output_df['coleman_liau_index'] = input_df["text"].apply(lambda x: textstat.coleman_liau_index(x))
    output_df['smog_index'] = input_df["text"].apply(textstat.smog_index)
    output_df['dale_chall_readability_score'] = input_df["text"].apply(lambda x: textstat.dale_chall_readability_score(x))
    output_df['linsear_write_formula'] = input_df["text"].apply(lambda x: textstat.linsear_write_formula(x))
    output_df['gunning_fog'] = input_df["text"].apply(textstat.gunning_fog)
    output_df['text_standard_float'] = input_df["text"].apply(textstat.text_standard, float_output=True)
    output_df['spache_readability'] = input_df["text"].apply(textstat.spache_readability)
    output_df['rix'] = input_df["text"].apply(textstat.rix)
    output_df['lix'] = input_df["text"].apply(textstat.lix)


    return output_df

# dataset

In [12]:
class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.text = df['processed_text'].to_numpy()
        self.labels = df[cfg.target_list].to_numpy()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, index):
        inputs = self.prepare_input(self.cfg, self.text[index])
        label = torch.tensor(self.labels[index], dtype=torch.float)
        return inputs, label

    @staticmethod
    def prepare_input(cfg, text):
        inputs = cfg.tokenizer(text,
                               add_special_tokens=True,
                               max_length=cfg.max_len,
                               padding="max_length",
                               truncation=True,
                               return_offsets_mapping=False)
        inputs['input_ids'] = torch.tensor(
            inputs['input_ids'],
            dtype=torch.long
        )
        inputs['attention_mask'] = torch.tensor(
            inputs['attention_mask'],
            dtype=torch.long
        )
        inputs = {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
        }
        return inputs

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

# model

In [13]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False

class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class CustomModel(nn.Module):
    def __init__(self, cfg): 
        super().__init__()
        self.cfg = cfg
        self.gpu_optimize_config = cfg.gpu_optimize_config
        self.config = AutoConfig.from_pretrained(
            cfg.MODEL_PATH,
            output_hidden_states=True
        )
        self.config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout": 0.,
                "hidden_dropout_prob": 0.,
                "attention_dropout": 0.,
                "attention_probs_dropout_prob": 0,
            }
        )
        self.backbone = AutoModel.from_pretrained(
            cfg.MODEL_PATH,
            config=self.config
        )

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.n_targets)
        self._init_weights(self.fc)
        self.ln = nn.LayerNorm(self.config.hidden_size)
        self._init_weights(self.ln)

        # Freeze
        if self.gpu_optimize_config['freezing']:
            freeze(self.backbone.encoder.layer[:8])

        # Gradient Checkpointing
        if self.gpu_optimize_config['gradient_checkpoint']:
            self.backbone.gradient_checkpointing_enable()

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        outputs = self.backbone(**inputs)
        last_state = outputs[0]
        feature = self.pool(last_state, inputs['attention_mask'])
        return feature

    def forward(self, inputs, labels):
        # batch, hidden_size
        feature = self.feature(inputs)
        # batch, 2
        output = self.fc(self.ln(feature))
        if labels is not None:
            loss_fct = nn.SmoothL1Loss(reduction='mean')
            loss = loss_fct(output, labels)
            return loss, output
        else:
            return output

# optimizer, scheduler

In [14]:
def get_optimizer_grouped_parameters(cfg, model):
    model_type = 'backbone'
    no_decay = ["bias", "LayerNorm.weight", "LayerNorm.bias"]
    optimizer_grouped_parameters = [
            {'params': [p for n, p in model.named_parameters() if "backbone" not in n],
             'lr': cfg.decoder_lr, 'weight_decay': 0.0},
    ]
    num_layers = model.config.num_hidden_layers
    layers = [getattr(model, model_type).embeddings] + list(getattr(model, model_type).encoder.layer)
    layers.reverse()
    lr = cfg.encoder_lr
    for layer in layers:
        lr *= cfg.lr_weight_decay
        optimizer_grouped_parameters += [
            {
                "params": [p for n, p in layer.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": cfg.weight_decay,
                "lr": lr,
            },
            {
                "params": [p for n, p in layer.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
                "lr": lr,
            },
        ]
    return optimizer_grouped_parameters

In [15]:
def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=int(num_train_steps*cfg.num_warmup_steps_rate), num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=int(num_train_steps*cfg.num_warmup_steps_rate), num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler

# eval,train

In [16]:
def evaluating(cfg, valid_loader, model, valid_df, fold, best_val_preds, best_val_score):
    val_preds = []
    val_losses = []
    val_nums = []
    model.eval()
    with torch.no_grad():
        with tqdm(valid_loader, total=len(valid_loader)) as pbar:
            for (inputs, labels) in pbar:
                inputs = collate(inputs)
                for k, v in inputs.items():
                    inputs[k] = v.to(cfg.device)
                labels = labels.to(cfg.device)
                with autocast():
                    loss, output = model(inputs, labels)
                
                output = output.detach().cpu().numpy()
                val_preds.append(output)
                val_losses.append(loss.item() * len(labels))
                val_nums.append(len(labels))
                pbar.set_postfix({
                    'val_loss': loss.item()
                })

    val_preds = np.concatenate(val_preds)
    val_loss = sum(val_losses) / sum(val_nums)
    score = mcrmse(cfg, val_preds, valid_df)

    val_log = {
        'val_loss': val_loss,
        'mcrmse': score
    }
    display(val_log)

    if best_val_score > score:
        print('\033[31m'+'save model weight'+'\033[0m')
        best_val_preds = val_preds
        best_val_score = score
        torch.save(
            model.state_dict(), 
            cfg.EXP_MODEL / f"fold{fold}.pth"
        )
    
    return best_val_preds, best_val_score

def training(cfg, train):
    # =====================
    # Training
    # =====================
    set_seed(cfg.seed)
    oof_pred = np.zeros((len(train), 2), dtype=np.float32)
    fold_score = []

    for fold in cfg.trn_fold:
        # dataset, dataloader
        train_df = train.loc[cfg.folds!=fold]
        valid_df = train.loc[cfg.folds==fold]
        train_idx = list(train_df.index)
        valid_idx = list(valid_df.index)

        # Datasetの設定
        train_dataset = TrainDataset(cfg, train_df)
        valid_dataset = TrainDataset(cfg, valid_df)
        train_loader = DataLoader(
            dataset=train_dataset, 
            batch_size=cfg.batch_size,
            shuffle=True,
            pin_memory=True,
            drop_last=True,
        )
        valid_loader = DataLoader(
            dataset=valid_dataset,
            batch_size=cfg.batch_size,
            shuffle=False,
            pin_memory=True,
            drop_last=False,
        )

        # model
        model = CustomModel(cfg)
        torch.save(model.config, cfg.EXP_MODEL / 'config.pth')
        model = model.to(cfg.device)

        # optimizer, scheduler
        optimizer_grouped_parameters = get_optimizer_grouped_parameters(cfg, model)
        optimizer = AdamW(optimizer_grouped_parameters, lr=cfg.encoder_lr, eps=cfg.eps, betas=cfg.betas, weight_decay=cfg.weight_decay)

        num_train_steps = int(len(train_df) / cfg.batch_size * cfg.n_epochs)
        scheduler = get_scheduler(cfg, optimizer, num_train_steps)

        # enable FGM
        # fgm = FGM(model)

        # model-training
        best_val_preds = None
        best_val_score = 9999
        
        for epoch in range(cfg.n_epochs):
            # training
            print(f"# ============ start epoch:{epoch} ============== #")
            train_losses = []
            train_nums = []
            model.train() 
            scaler = GradScaler(enabled=cfg.apex)
            with tqdm(train_loader, total=len(train_loader)) as pbar:
                for step, (inputs, labels) in enumerate(pbar):
                    inputs = collate(inputs)
                    for k, v in inputs.items():
                        inputs[k] = v.to(cfg.device)
                    labels = labels.to(cfg.device)
                    with autocast(enabled=cfg.apex):
                        loss, output = model(inputs, labels)

                    pbar.set_postfix({
                        'loss': loss.item(),
                        'lr': scheduler.get_lr()[0]
                    })
                    train_losses.append(loss.item() * len(labels))
                    train_nums.append(len(labels))

                    if cfg.gradient_accumulation_steps > 1:
                        loss = loss / cfg.gradient_accumulation_steps

                    scaler.scale(loss).backward()

                    # FGM attack
                    # fgm.attack()
                    # with autocast(enabled=cfg.apex):
                    #     loss_adv, _ = model(inputs, labels)
                    # scaler.scale(loss_adv).backward()
                    # fgm.restore()
                    
                    if cfg.clip_grad_norm is not None:
                        # scaler.unscale_(optimizer)
                        torch.nn.utils.clip_grad_norm_(
                            model.parameters(),
                            cfg.clip_grad_norm
                        )
                        
                    if (step+1) % cfg.gradient_accumulation_steps == 0:
                        scaler.step(optimizer)
                        scaler.update()
                        optimizer.zero_grad()
                        scheduler.step()

                    if step % cfg.eval_step == 0 and step != 0:
                        print(f'fold: {fold}, epoch: {epoch}, step: {step}')
                        best_val_preds, best_val_score = evaluating(
                            cfg, valid_loader,
                            model,
                            valid_df,
                            fold,
                            best_val_preds,
                            best_val_score,
                        )
                        model.train()

            train_loss = sum(train_losses)/sum(train_nums)
            train_log = {
                'train_loss':train_loss
            }
            display(train_log)

            # evaluating(epoch)
            print(f'fold: {fold}, epoch: {epoch}, complete')
            best_val_preds, best_val_score = evaluating(
                cfg, valid_loader,
                model,
                valid_df,
                fold,
                best_val_preds,
                best_val_score,
            )

        oof_pred[valid_idx] = best_val_preds.astype(np.float32)
        np.save(cfg.EXP_PREDS / f'oof_pred_fold{fold}.npy', best_val_preds)
        fold_score.append(best_val_score)
        del model; gc.collect()

    np.save(cfg.EXP_PREDS / 'oof_pred.npy', oof_pred)

    # =====================
    # scoring
    # =====================
    score = mcrmse(cfg, oof_pred, train)
    print('fold score：', fold_score)
    print('CV:', round(score, 4))
    return score

# main

In [17]:
# setup
cfg = setup(Config)

import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
import tokenizers
import sentencepiece
%env TOKENIZERS_PARALLELISM=true
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")

# main
train_df = pd.read_csv(cfg.INPUT / "summaries_train.csv")
prompts_train_df = pd.read_csv(cfg.INPUT / "prompts_train.csv")


train_df = train_df.merge(prompts_train_df, on="prompt_id")
train_df = processing_features(train_df)
train_feat_df = feature_engineering(train_df)

cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.MODEL_PATH)
cfg.tokenizer.save_pretrained(cfg.EXP / 'tokenizer')
cfg.folds = get_commonlit_fold(train_df)
cfg.folds.to_csv(cfg.EXP_PREDS / 'folds.csv')
score = training(cfg, train_df)

env: TOKENIZERS_PARALLELISM=true
tokenizers.__version__: 0.11.6
transformers.__version__: 4.16.2


Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading:   0%|          | 0.00/354M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/757 [00:00<?, ?it/s]

fold: 0, epoch: 0, step: 250


  0%|          | 0/138 [00:00<?, ?it/s]

content rmse: 0.6372885843774461
wording rmse: 1.1097331279598293


{'val_loss': 0.3538652095379661, 'mcrmse': 0.8735108561686377}

[31msave model weight[0m
fold: 0, epoch: 0, step: 500


  0%|          | 0/138 [00:00<?, ?it/s]

content rmse: 0.6823698330200406
wording rmse: 0.8077294913839268


{'val_loss': 0.25309816564847853, 'mcrmse': 0.7450496622019838}

[31msave model weight[0m
fold: 0, epoch: 0, step: 750


  0%|          | 0/138 [00:00<?, ?it/s]

content rmse: 0.525139758118734
wording rmse: 0.7743063655095322


{'val_loss': 0.1985676421269653, 'mcrmse': 0.6497230618141332}

[31msave model weight[0m


{'train_loss': 0.18816954251304957}

fold: 0, epoch: 0, complete


  0%|          | 0/138 [00:00<?, ?it/s]

content rmse: 0.6160173667331398
wording rmse: 0.7113794054296501


{'val_loss': 0.202968161995586, 'mcrmse': 0.663698386081395}



  0%|          | 0/757 [00:00<?, ?it/s]

fold: 0, epoch: 1, step: 250


  0%|          | 0/138 [00:00<?, ?it/s]

content rmse: 0.6839287947464997
wording rmse: 0.7854921082255809


{'val_loss': 0.25032219979726283, 'mcrmse': 0.7347104514860403}

fold: 0, epoch: 1, step: 500


  0%|          | 0/138 [00:00<?, ?it/s]

content rmse: 0.6745092536890963
wording rmse: 0.7327395101762103


{'val_loss': 0.22790548027048085, 'mcrmse': 0.7036243819326533}

fold: 0, epoch: 1, step: 750


  0%|          | 0/138 [00:00<?, ?it/s]

content rmse: 0.6648671503003692
wording rmse: 0.7287846095922709


{'val_loss': 0.22487506648679267, 'mcrmse': 0.6968258799463201}

{'train_loss': 0.08704172467623432}

fold: 0, epoch: 1, complete


  0%|          | 0/138 [00:00<?, ?it/s]

content rmse: 0.6584404188858177
wording rmse: 0.7285152628052687


{'val_loss': 0.22270149619386936, 'mcrmse': 0.6934778408455432}



  0%|          | 0/757 [00:00<?, ?it/s]

fold: 0, epoch: 2, step: 250


  0%|          | 0/138 [00:00<?, ?it/s]

content rmse: 0.7183609339152519
wording rmse: 0.7422605729332548


{'val_loss': 0.24677163994755402, 'mcrmse': 0.7303107534242533}

fold: 0, epoch: 2, step: 500


  0%|          | 0/138 [00:00<?, ?it/s]

content rmse: 0.6718924404345888
wording rmse: 0.7098768814226324


{'val_loss': 0.2203630633983197, 'mcrmse': 0.6908846609286106}

fold: 0, epoch: 2, step: 750


  0%|          | 0/138 [00:00<?, ?it/s]

content rmse: 0.6767920261445747
wording rmse: 0.7051064582057417


{'val_loss': 0.22024293102033551, 'mcrmse': 0.6909492421751582}

{'train_loss': 0.0712686509245827}

fold: 0, epoch: 2, complete


  0%|          | 0/138 [00:00<?, ?it/s]

content rmse: 0.676793163615436
wording rmse: 0.7050952946498585


{'val_loss': 0.22023935468761033, 'mcrmse': 0.6909442291326473}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/638 [00:00<?, ?it/s]

fold: 1, epoch: 0, step: 250


  0%|          | 0/258 [00:00<?, ?it/s]

content rmse: 0.46200709387820926
wording rmse: 0.5782811294284023


{'val_loss': 0.132043556385847, 'mcrmse': 0.5201441116533058}

[31msave model weight[0m
fold: 1, epoch: 0, step: 500


  0%|          | 0/258 [00:00<?, ?it/s]

content rmse: 0.4707496421115547
wording rmse: 0.5605938729081836


{'val_loss': 0.13056922932171278, 'mcrmse': 0.5156717575098692}

[31msave model weight[0m


{'train_loss': 0.214672165003176}

fold: 1, epoch: 0, complete


  0%|          | 0/258 [00:00<?, ?it/s]

content rmse: 0.432843759313221
wording rmse: 0.6629677057457314


{'val_loss': 0.15278286763227386, 'mcrmse': 0.5479057325294762}



  0%|          | 0/638 [00:00<?, ?it/s]

fold: 1, epoch: 1, step: 250


  0%|          | 0/258 [00:00<?, ?it/s]

content rmse: 0.48795291268605306
wording rmse: 0.5408720564416675


{'val_loss': 0.1303936908393736, 'mcrmse': 0.5144124845638602}

[31msave model weight[0m
fold: 1, epoch: 1, step: 500


  0%|          | 0/258 [00:00<?, ?it/s]

content rmse: 0.47500561349669546
wording rmse: 0.523017372311028


{'val_loss': 0.12264920418333185, 'mcrmse': 0.4990114929038617}

[31msave model weight[0m


{'train_loss': 0.1026648276406676}

fold: 1, epoch: 1, complete


  0%|          | 0/258 [00:00<?, ?it/s]

content rmse: 0.4328235912891848
wording rmse: 0.553042242338874


{'val_loss': 0.1212737739781587, 'mcrmse': 0.4929329168140294}

[31msave model weight[0m


  0%|          | 0/638 [00:00<?, ?it/s]

fold: 1, epoch: 2, step: 250


  0%|          | 0/258 [00:00<?, ?it/s]

content rmse: 0.46320411070037293
wording rmse: 0.5653217118161443


{'val_loss': 0.13119302196704363, 'mcrmse': 0.5142629112582586}

fold: 1, epoch: 2, step: 500


  0%|          | 0/258 [00:00<?, ?it/s]

content rmse: 0.4788487273256214
wording rmse: 0.5426228006855629


{'val_loss': 0.12860420560505012, 'mcrmse': 0.5107357640055922}

{'train_loss': 0.0794205038449187}

fold: 1, epoch: 2, complete


  0%|          | 0/258 [00:00<?, ?it/s]

content rmse: 0.4771698701607572
wording rmse: 0.5423240509107097


{'val_loss': 0.1281399262410724, 'mcrmse': 0.5097469605357334}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/644 [00:00<?, ?it/s]

fold: 2, epoch: 0, step: 250


  0%|          | 0/252 [00:00<?, ?it/s]

content rmse: 0.5294715753231648
wording rmse: 0.858349437652531


{'val_loss': 0.22237494618679648, 'mcrmse': 0.6939105064878479}

[31msave model weight[0m
fold: 2, epoch: 0, step: 500


  0%|          | 0/252 [00:00<?, ?it/s]

content rmse: 0.5064048166509861
wording rmse: 0.9059995267009966


{'val_loss': 0.22603982603923783, 'mcrmse': 0.7062021716759914}

{'train_loss': 0.20006516476607192}

fold: 2, epoch: 0, complete


  0%|          | 0/252 [00:00<?, ?it/s]

content rmse: 0.5314162157100586
wording rmse: 0.9509038214216939


{'val_loss': 0.24968412371640658, 'mcrmse': 0.7411600185658762}



  0%|          | 0/644 [00:00<?, ?it/s]

fold: 2, epoch: 1, step: 250


  0%|          | 0/252 [00:00<?, ?it/s]

content rmse: 0.513380877105574
wording rmse: 0.8169390464332797


{'val_loss': 0.20086846603363412, 'mcrmse': 0.6651599617694268}

[31msave model weight[0m
fold: 2, epoch: 1, step: 500


  0%|          | 0/252 [00:00<?, ?it/s]

content rmse: 0.5059068271149753
wording rmse: 0.8387539968159675


{'val_loss': 0.20505425628068852, 'mcrmse': 0.6723304119654714}

{'train_loss': 0.09321750474584677}

fold: 2, epoch: 1, complete


  0%|          | 0/252 [00:00<?, ?it/s]

content rmse: 0.5190550345434779
wording rmse: 0.8283427939915059


{'val_loss': 0.20402798938878797, 'mcrmse': 0.6736989142674918}



  0%|          | 0/644 [00:00<?, ?it/s]

fold: 2, epoch: 2, step: 250


  0%|          | 0/252 [00:00<?, ?it/s]

content rmse: 0.513474699165647
wording rmse: 0.8349810449891792


{'val_loss': 0.2042796978103011, 'mcrmse': 0.6742278720774131}

fold: 2, epoch: 2, step: 500


  0%|          | 0/252 [00:00<?, ?it/s]

content rmse: 0.5102843827791623
wording rmse: 0.813533310088602


{'val_loss': 0.19788845189586987, 'mcrmse': 0.6619088464338821}

[31msave model weight[0m


{'train_loss': 0.08017681020757426}

fold: 2, epoch: 2, complete


  0%|          | 0/252 [00:00<?, ?it/s]

content rmse: 0.5103831574825417
wording rmse: 0.8164558686505056


{'val_loss': 0.19870357769655017, 'mcrmse': 0.6634195130665237}

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/646 [00:00<?, ?it/s]

fold: 3, epoch: 0, step: 250


  0%|          | 0/250 [00:00<?, ?it/s]

content rmse: 0.5348320408093356
wording rmse: 0.6158846020938894


{'val_loss': 0.16072988528049065, 'mcrmse': 0.5753583214516125}

[31msave model weight[0m
fold: 3, epoch: 0, step: 500


  0%|          | 0/250 [00:00<?, ?it/s]

content rmse: 0.4608181483220685
wording rmse: 0.6499260209009017


{'val_loss': 0.15162715315818787, 'mcrmse': 0.5553720846114851}

[31msave model weight[0m


{'train_loss': 0.20707111064656047}

fold: 3, epoch: 0, complete


  0%|          | 0/250 [00:00<?, ?it/s]

content rmse: 0.49533440921431354
wording rmse: 0.5813773925095953


{'val_loss': 0.14058857596290852, 'mcrmse': 0.5383559008619544}

[31msave model weight[0m


  0%|          | 0/646 [00:00<?, ?it/s]

fold: 3, epoch: 1, step: 250


  0%|          | 0/250 [00:00<?, ?it/s]

content rmse: 0.48266235638942256
wording rmse: 0.5667391459699646


{'val_loss': 0.13330112025141716, 'mcrmse': 0.5247007511796935}

[31msave model weight[0m
fold: 3, epoch: 1, step: 500


  0%|          | 0/250 [00:00<?, ?it/s]

content rmse: 0.48012781167365964
wording rmse: 0.5580516761751667


{'val_loss': 0.13077243615337508, 'mcrmse': 0.5190897439244132}

[31msave model weight[0m


{'train_loss': 0.1025908000496196}

fold: 3, epoch: 1, complete


  0%|          | 0/250 [00:00<?, ?it/s]

content rmse: 0.48271992049631773
wording rmse: 0.5446119342593468


{'val_loss': 0.12881735668989844, 'mcrmse': 0.5136659273778323}

[31msave model weight[0m


  0%|          | 0/646 [00:00<?, ?it/s]

fold: 3, epoch: 2, step: 250


  0%|          | 0/250 [00:00<?, ?it/s]

content rmse: 0.5585296885375799
wording rmse: 0.5531428459009462


{'val_loss': 0.14967355845805877, 'mcrmse': 0.555836267219263}

fold: 3, epoch: 2, step: 500


  0%|          | 0/250 [00:00<?, ?it/s]

content rmse: 0.4999716156850755
wording rmse: 0.538900107539181


{'val_loss': 0.13144308410749167, 'mcrmse': 0.5194358616121283}

{'train_loss': 0.07829516685795582}

fold: 3, epoch: 2, complete


  0%|          | 0/250 [00:00<?, ?it/s]

content rmse: 0.4996734346087254
wording rmse: 0.540631370979542


{'val_loss': 0.13179804746039167, 'mcrmse': 0.5201524027941337}

content rmse: 0.4839007196378545
wording rmse: 0.6695549118683936
fold score： [0.6497230618141332, 0.4929329168140294, 0.6619088464338821, 0.5136659273778323]
CV: 0.5767
