In [1]:
EXP_ID = "001"

In [2]:
!nvidia-smi

Tue Aug  8 15:31:51 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    On   | 00000000:05:00.0 Off |                  Off |
| 30%   53C    P0    50W / 300W |     17MiB / 49140MiB |     70%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
from pathlib import Path

EXP_DIR = Path(f"../data/exp{EXP_ID}")
EXP_DIR.mkdir(exist_ok=True, parents=True)

In [4]:
import os
import re
import time
import random
from pathlib import Path
from pprint import pprint
from collections import Counter, defaultdict

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error

import torch
from torch import nn
from torch.cuda import amp
from torch.optim.lr_scheduler import (
    CosineAnnealingWarmRestarts,
    CosineAnnealingLR,
    ReduceLROnPlateau,
    ExponentialLR
)
from torch.utils.data.dataset import Dataset
from torch.utils.data.dataloader import DataLoader

import transformers

pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

In [5]:
class CFG:
    seed = 2022

    n_folds = 4

    model_name = "roberta-base"

    input_dir = "../input/"

    num_workers = 4
    batch_size = 32
    n_epoch = 5
    lr = 5e-6
    verbose = True
    verbose_step = 1

    max_length = 512

    SchedulerClass = CosineAnnealingLR
    scheduler_params = dict(
        T_max=n_epoch,
        eta_min=1e-7
    )
    num_warmup_steps_rate = 0.1
    # warmup_epoch = 3
    #SchedulerClass = ExponentialLR
    #scheduler_params = dict(
    #    gamma=0.95
    #)

    n_cv_fold = 4
    use_fp16 = False

    fold_map = {
        "814d6b": 0,
        "ebad26": 1,
        "3b9047": 2,
        "39c16e": 3,
    }

In [6]:
def set_seed(seed):
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)

In [7]:
def read_data(csv_path: [Path, str], reset = False, *args, **kwargs) -> pd.DataFrame:
    pkl_path = csv_path.parent / (csv_path.stem + ".pkl")
    if pkl_path.exists() and not reset:
        print("read from pickle")
        df = pd.read_pickle(pkl_path)
    else:
        print("read from csv")
        df = pd.read_csv(csv_path, *args, **kwargs)
        df.to_pickle(pkl_path)
    return df

In [8]:
input_dir = Path(CFG.input_dir)
p_train_df = read_data(input_dir / "prompts_train.csv")
p_test_df = read_data(input_dir / "prompts_test.csv")
train_df = read_data(input_dir / "summaries_train.csv")
test_df = read_data(input_dir / "summaries_test.csv")

read from pickle
read from pickle
read from pickle
read from pickle


In [9]:
train_df["fold"] = train_df["prompt_id"].map(CFG.fold_map)
train_df["fold"] = train_df["fold"].astype(np.int8)

In [10]:
train_df.head(2)

Unnamed: 0,student_id,prompt_id,text,content,wording,fold
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,0
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,1


In [11]:
p_train_df.head(2)

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
1,3b9047,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...


In [12]:
class NLPDataset(Dataset):
    def __init__(self, df, prompt_df, tokenizer, is_train=True, max_len=128):
        self.student_ids = df["student_id"].tolist()
        self.text = df["text"].tolist()
        self.content = df["content"].tolist()
        self.wording = df["wording"].tolist()
        self.prompt_question = prompt_df.set_index("prompt_id")["prompt_question"].reindex(df["prompt_id"]).tolist()
        self.prompt_title  = prompt_df.set_index("prompt_id")["prompt_title"].reindex(df["prompt_id"]).tolist()
        self.prompt_text = prompt_df.set_index("prompt_id")["prompt_text"].reindex(df["prompt_id"]).tolist()
        
        self.tokenizer = tokenizer
        
        self.is_train = is_train
        self.max_len = max_len

    def __len__(self):
        return len(self.student_ids)
    
    def __getitem__(self, ix):
        prompt_question = str(self.prompt_question[ix])
        prompt_text = str(self.prompt_text[ix])
        text = str(self.text[ix])

        sentence = prompt_question + " [SEP] " + text  # + " [SEP] " + prompt_text
        target1 = self.content[ix]
        target2 = self.wording[ix]
        
        text_inputs = self.tokenizer(
            sentence, 
            max_length=self.max_len, 
            padding="max_length", 
            truncation=True,
            return_token_type_ids=True
        )

        data = {
            "input_ids": torch.tensor(text_inputs["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(text_inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(text_inputs["token_type_ids"], dtype=torch.long),
            "targets": torch.tensor([target1, target2], dtype=torch.float),
        }

        return data

In [13]:
class NLPModel(nn.Module):
    def __init__(self, mlm_model_path=None):
        super(NLPModel, self).__init__()
        if mlm_model_path is None:
            self.encoder = transformers.RobertaModel.from_pretrained(
                CFG.model_name,
                attention_probs_dropout_prob=0.0, hidden_dropout_prob=0.0,
                output_hidden_states=True
            )
        else:
            mlm_model = transformers.RobertaForMaskedLM.from_pretrained(
                CFG.model_name, output_hidden_states=True,
                attention_probs_dropout_prob=0.0, hidden_dropout_prob=0.0,
            )
            mlm_model.load_state_dict(
                torch.load(mlm_model_path, map_location=lambda storage,loc: storage)["model_state_dict"]
            )
            self.encoder = mlm_model.roberta

        self.fc = nn.Linear(768, 2)
        # torch.nn.init.normal_(self.fc.weight, std=0.02)
    
    def forward(self, input_ids, attention_mask, token_type_ids):
        last_hidden_state, pooler_output, hidden_state = self.encoder(input_ids=input_ids,
                                 attention_mask=attention_mask, 
                                 token_type_ids = token_type_ids, 
                                 return_dict=False)
        cls_token = last_hidden_state[:, 0, :]
        # cls_token = torch.cat([
        #            hidden_state[-1].mean(dim=1),
        #            hidden_state[-1][:, 0, :],
        #            hidden_state[-2][:, 0, :],
        #            hidden_state[-3][:, 0, :],
        #            hidden_state[-4][:, 0, :]
        # ], dim=1)
        output = self.fc(cls_token)
        return output

In [14]:
class AWP:
    def __init__(
        self,
        model,
        optimizer,
        adv_param="weight",
        adv_lr=1,
        adv_eps=0.2,
        start_epoch=0,
        adv_step=1,
        scaler=None
    ):
        self.model = model
        self.optimizer = optimizer
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.start_epoch = start_epoch
        self.adv_step = adv_step
        self.backup = {}
        self.backup_eps = {}
        self.scaler = scaler

    def attack_backward(self, ids, mask, token_type_ids, labels, criterion, epoch):
        if (self.adv_lr == 0) or (epoch < self.start_epoch):
            return None

        self._save() 
        for i in range(self.adv_step):
            self._attack_step() 
            with torch.cuda.amp.autocast(enabled=CFG.use_fp16):
                y_preds = self.model(ids, mask, token_type_ids)
            y_preds = y_preds.reshape(-1)
            adv_loss = criterion(y_preds, labels)
            self.optimizer.zero_grad()
            #adv_loss.backward()
            self.scaler.scale(adv_loss).backward()
        self._restore()

    def _attack_step(self):
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                norm1 = torch.norm(param.grad)
                norm2 = torch.norm(param.data.detach())
                if norm1 != 0 and not torch.isnan(norm1):
                    r_at = self.adv_lr * param.grad / (norm1 + e) * (norm2 + e)
                    param.data.add_(r_at)
                    param.data = torch.min(
                        torch.max(param.data, self.backup_eps[name][0]), self.backup_eps[name][1]
                    )

    def _save(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.data.clone()
                    grad_eps = self.adv_eps * param.abs().detach()
                    self.backup_eps[name] = (
                        self.backup[name] - grad_eps,
                        self.backup[name] + grad_eps,
                    )

    def _restore(self,):
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}
        self.backup_eps = {}

In [15]:
def train_one_epoch(model, loss_fn, data_loader, optimizer,
                    device, scheduler, epoch, scaler=None, awp=None):
    # get batch data loop
    epoch_loss = 0
    epoch_data_num = len(data_loader.dataset)

    model.train()

    bar = tqdm(enumerate(data_loader), total=len(data_loader))

    for iter_i, batch in bar:
        # input
        batch = {k : v.to(device) for k, v in batch.items()}
        text_inputs = {
            "input_ids": batch["input_ids"],
            "attention_mask": batch["attention_mask"],
            "token_type_ids": batch["token_type_ids"]
        }
        targets = batch["targets"]
        batch_size = len(targets)

        # zero grad
        optimizer.zero_grad()

        with torch.set_grad_enabled(True):
            with amp.autocast(enabled=CFG.use_fp16):
                preds = model(**text_inputs)
                preds = preds.view(-1)
                targets = targets.view(-1)
                loss = loss_fn(preds, targets)
            scaler.scale(loss).backward()
            if awp is not None:
                awp.attack_backward(
                    batch["input_ids"],
                    batch["attention_mask"],
                    batch["token_type_ids"],
                    targets, loss_fn, epoch
                ) 
            #torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad_norm)
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            epoch_loss += loss.item()
    
    epoch_loss_per_data = epoch_loss / epoch_data_num
    return epoch_loss_per_data

In [16]:
def valid_one_epoch(model, loss_fn, data_loader, device):
    # get batch data loop
    epoch_loss = 0
    epoch_data_num = len(data_loader.dataset)
    pred_list = []
    target_list = []
    bar = tqdm(enumerate(data_loader), total=len(data_loader))

    model.eval()
    for iter_i, batch in bar:
        # input
        batch = {k : v.to(device) for k, v in batch.items()}
        text_inputs = {
            "input_ids": batch["input_ids"],
            "attention_mask": batch["attention_mask"],
            "token_type_ids": batch["token_type_ids"]
        }
        targets = batch["targets"]
        batch_size = len(targets)

        with torch.no_grad():
            preds = model(**text_inputs)
            loss = loss_fn(preds.view(-1), targets.view(-1))
            epoch_loss += loss.item()

        pred_list.append(preds.detach().cpu().numpy())
        target_list.append(targets.detach().cpu().numpy())

    epoch_loss_per_data = epoch_loss / epoch_data_num
    val_preds = np.concatenate(pred_list, axis=0)
    val_targets = np.concatenate(target_list, axis=0)
    return epoch_loss_per_data, val_preds, val_targets

In [17]:
def calc_metric(targets: np.ndarray, preds: np.ndarray) -> float:
    score1 = np.sqrt(((targets[:, 0] - preds[:, 0]) ** 2).mean())
    score2 = np.sqrt(((targets[:, 1] - preds[:, 1]) ** 2).mean())
    score = (score1 + score2) / 2
    return score

In [18]:
def train_run(train_df, valid_df, prompt_df, model_prefix="", mlm_model_path=None):
    
    
    set_seed(CFG.seed)
    
    device = torch.device(
        "cuda:0" if torch.cuda.is_available() else "cpu"
    )
    print(f"train run device : {device}")
    
    ###################################
    # Model and Tokenizer
    ###################################
    model = NLPModel(mlm_model_path=mlm_model_path)
    model.to(device)

    tokenizer = transformers.RobertaTokenizer.from_pretrained(CFG.model_name)
    scaler = amp.GradScaler(enabled=CFG.use_fp16)
    
    ###################################
    # Make data
    ###################################
    
    train_dataset = NLPDataset(train_df, p_train_df, tokenizer, max_len=CFG.max_length, is_train=True)
    valid_dataset = NLPDataset(valid_df, p_train_df, tokenizer, max_len=CFG.max_length, is_train=False)

    # data loader
    train_loader = DataLoader(
        train_dataset,
        batch_size=CFG.batch_size,
        num_workers=CFG.num_workers,
        shuffle=True,
        drop_last=True
    )
    valid_loader = DataLoader(
        valid_dataset, 
        batch_size=CFG.batch_size,
        num_workers=CFG.num_workers,
        shuffle=False,
    )
    ##################
    # Optimiizer
    ##################
    optimizer = torch.optim.AdamW(model.parameters(), lr=CFG.lr)
 
    ##################
    # awp
    ##################   
    # awp = AWP(
    #     model,
    #     optimizer,
    #     adv_lr=1e-2,
    #     adv_eps=0.001,
    #     start_epoch=1,
    #     scaler=scaler
    # )
    awp = None
    
   
    ##################
    # lr scheduler
    ##################
    #scheduler = config.SchedulerClass(
    #    optimizer, **config.scheduler_params
    #)
    num_train_optimization_steps = int(len(train_loader) * CFG.n_epoch)
    num_warmup_steps = int(num_train_optimization_steps * CFG.num_warmup_steps_rate)

    scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_optimization_steps
    )

    ##################
    # loss function
    ##################
    loss_fn = nn.MSELoss()

    ###############################
    # train epoch loop
    ###############################
    # iteration and loss count
    iteration = 1
    epoch_train_loss = 0.0
    epoch_val_loss = 0.0
    valid_period = 1

    results_list = []
    val_preds_list = []
    old_model_save_path = None

    for epoch in range(CFG.n_epoch):
        
        t_epoch_start = time.time()

        # train loop
        train_epoch_loss = train_one_epoch(
            model, loss_fn, train_loader, optimizer, device, scheduler,
            epoch=epoch, scaler=scaler, awp=awp
        )

        # valid loop
        valid_epoch_loss, val_preds, val_targets = valid_one_epoch(
            model, loss_fn, valid_loader, device
        )

        # calc metric
        val_score = calc_metric(val_targets, val_preds)        
        t_epoch_finish = time.time()
        elapsed_time = t_epoch_finish - t_epoch_start

        # learning rate step
        lr = optimizer.param_groups[0]['lr']
        
        # save results
        results = {
            "epoch": epoch + 1,
            "lr": lr,
            "train_loss": train_epoch_loss,
            "valid_loss": valid_epoch_loss,
            "MCRMSE": val_score
        }
        print(results)
        results_list.append(results)

        if epoch == CFG.n_epoch -1:
            model_save_path = EXP_DIR / f"{model_prefix}last-checkpoint.bin"
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                'epoch': epoch,
            }, model_save_path)

    return val_score, results_list, val_preds

In [19]:
test_pred_list = []
val_scores = []

oof_preds = np.zeros((len(train_df), 2))

for fold_ix in range(CFG.n_folds):
    print("="*30)
    print(f"Fold{fold_ix}")
    print("="*30)
    
    _train_df = train_df.loc[train_df["fold"] != fold_ix].reset_index(drop=True)
    _valid_df = train_df.loc[train_df["fold"] == fold_ix].reset_index(drop=True)

    # mlm_model_path = CFG.mlm_models[fold_ix] if len(CFG.mlm_models) == CFG.n_cv_fold else CFG.mlm_models[0]
    mlm_model_path = None

    val_score, score_list, val_preds = train_run(
        _train_df, _valid_df, p_train_df,
        model_prefix=f"fold{fold_ix}",
        mlm_model_path=mlm_model_path,
    )
    val_scores.append(val_score)

    oof_preds[train_df["fold"] == fold_ix] = val_preds

cv_score = calc_metric(train_df[["content", "wording"]].values, oof_preds)
print(f"CV Average at best epoch {cv_score}")

Fold0
train run device : cuda:0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

{'epoch': 1, 'lr': 4.4418331374853116e-06, 'train_loss': 0.019405030144279602, 'valid_loss': 0.014146581170565414, 'MCRMSE': 0.6562964916229248}


  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

{'epoch': 2, 'lr': 3.331374853113984e-06, 'train_loss': 0.008180542889256086, 'valid_loss': 0.01327417589595722, 'MCRMSE': 0.6361340880393982}


  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

{'epoch': 3, 'lr': 2.2209165687426558e-06, 'train_loss': 0.00691617767199633, 'valid_loss': 0.013059806602171953, 'MCRMSE': 0.6342687606811523}


  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

{'epoch': 4, 'lr': 1.1104582843713279e-06, 'train_loss': 0.006218312619759752, 'valid_loss': 0.012321629069225418, 'MCRMSE': 0.6145899295806885}


  0%|          | 0/189 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

{'epoch': 5, 'lr': 0.0, 'train_loss': 0.005787357512194109, 'valid_loss': 0.012431372906117686, 'MCRMSE': 0.6177763938903809}
Fold1
train run device : cuda:0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/161 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

{'epoch': 1, 'lr': 4.441379310344828e-06, 'train_loss': 0.021397347323644043, 'valid_loss': 0.011797604332108774, 'MCRMSE': 0.6064535975456238}


  0%|          | 0/161 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

{'epoch': 2, 'lr': 3.3310344827586212e-06, 'train_loss': 0.009125948497118537, 'valid_loss': 0.011039181379134766, 'MCRMSE': 0.5906041264533997}


  0%|          | 0/161 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

{'epoch': 3, 'lr': 2.220689655172414e-06, 'train_loss': 0.007703804935482351, 'valid_loss': 0.011180635369970708, 'MCRMSE': 0.5945671200752258}


  0%|          | 0/161 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

{'epoch': 4, 'lr': 1.110344827586207e-06, 'train_loss': 0.00675711137491076, 'valid_loss': 0.00990811003024927, 'MCRMSE': 0.5594998598098755}


  0%|          | 0/161 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

{'epoch': 5, 'lr': 0.0, 'train_loss': 0.006243125500774033, 'valid_loss': 0.00949383754738586, 'MCRMSE': 0.5476597547531128}
Fold2
train run device : cuda:0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/161 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

{'epoch': 1, 'lr': 4.441379310344828e-06, 'train_loss': 0.019835829081648167, 'valid_loss': 0.021504242156737477, 'MCRMSE': 0.8228884935379028}


  0%|          | 0/161 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

{'epoch': 2, 'lr': 3.3310344827586212e-06, 'train_loss': 0.007847794452270303, 'valid_loss': 0.020680370038276052, 'MCRMSE': 0.8088476657867432}


  0%|          | 0/161 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

{'epoch': 3, 'lr': 2.220689655172414e-06, 'train_loss': 0.006792342736796874, 'valid_loss': 0.019226425720008825, 'MCRMSE': 0.7764480113983154}


  0%|          | 0/161 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

{'epoch': 4, 'lr': 1.110344827586207e-06, 'train_loss': 0.006094682681366304, 'valid_loss': 0.017597955263568846, 'MCRMSE': 0.7415740489959717}


  0%|          | 0/161 [00:00<?, ?it/s]

  0%|          | 0/63 [00:00<?, ?it/s]

{'epoch': 5, 'lr': 0.0, 'train_loss': 0.005649199300352412, 'valid_loss': 0.018043797324106076, 'MCRMSE': 0.7502073049545288}
Fold3
train run device : cuda:0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/65 [00:00<?, ?it/s]

{'epoch': 1, 'lr': 4.441340782122905e-06, 'train_loss': 0.02010440591579783, 'valid_loss': 0.010583805636578413, 'MCRMSE': 0.5760666131973267}


  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/65 [00:00<?, ?it/s]

{'epoch': 2, 'lr': 3.331005586592179e-06, 'train_loss': 0.009220182443491256, 'valid_loss': 0.0095383215425776, 'MCRMSE': 0.5487094521522522}


  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/65 [00:00<?, ?it/s]

{'epoch': 3, 'lr': 2.2206703910614527e-06, 'train_loss': 0.007814147549746273, 'valid_loss': 0.008779750672369602, 'MCRMSE': 0.5252559781074524}


  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/65 [00:00<?, ?it/s]

{'epoch': 4, 'lr': 1.1103351955307263e-06, 'train_loss': 0.00683394464180632, 'valid_loss': 0.008726616050933946, 'MCRMSE': 0.524841845035553}


  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/65 [00:00<?, ?it/s]

{'epoch': 5, 'lr': 0.0, 'train_loss': 0.006223952187664264, 'valid_loss': 0.008655524718917918, 'MCRMSE': 0.5226565599441528}
Fold4
train run device : cuda:0


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/223 [00:00<?, ?it/s]

0it [00:00, ?it/s]

ZeroDivisionError: division by zero

In [22]:
cv_score = calc_metric(train_df[["content", "wording"]].values, oof_preds)
print(f"CV Average at best epoch {cv_score}")

CV Average at best epoch 0.6158708946936794


In [28]:
!bash ../upload.sh ../data/exp{EXP_ID}/ commonlit2-{EXP_ID}

Data package template written to: ../data/exp001/dataset-metadata.json
Starting upload for file fold3last-checkpoint.bin
100%|██████████████████████████████████████| 1.39G/1.39G [08:24<00:00, 2.96MB/s]
Upload successful: fold3last-checkpoint.bin (1GB)
Starting upload for file dataset-metadata.jsone
100%|███████████████████████████████████████████| 129/129 [00:02<00:00, 43.9B/s]
Upload successful: dataset-metadata.jsone (129B)
Starting upload for file fold1last-checkpoint.bin
100%|██████████████████████████████████████| 1.39G/1.39G [06:23<00:00, 3.89MB/s]
Upload successful: fold1last-checkpoint.bin (1GB)
Starting upload for file fold2last-checkpoint.bin
100%|██████████████████████████████████████| 1.39G/1.39G [04:24<00:00, 5.64MB/s]
Upload successful: fold2last-checkpoint.bin (1GB)
Starting upload for file fold0last-checkpoint.bin
100%|██████████████████████████████████████| 1.39G/1.39G [04:25<00:00, 5.62MB/s]
Upload successful: fold0last-checkpoint.bin (1GB)
Your private Dataset is bei