# Overview
https://uku28motab.feishu.cn/docs/doccnUDbEhudHm2V440lcY87B1c    
This kernel is almost the same as [Lightweight Roberta solution in PyTorch](https://www.kaggle.com/andretugan/lightweight-roberta-solution-in-pytorch), but instead of starting from "roberta-base", it starts from [Maunish's pre-trained model](https://www.kaggle.com/maunish/clrp-roberta-base).

Acknowledgments: some ideas were taken from kernels by [Torch](https://www.kaggle.com/rhtsingh) and [Maunish](https://www.kaggle.com/maunish).

In [1]:
# Import the required libraries
import os
import math
import random
import time

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import AdamW
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoConfig
from transformers import get_cosine_schedule_with_warmup

from sklearn.model_selection import KFold

import gc
gc.enable()

In [2]:
# Hyper-parameters and paths
# Number of cross-validation folds (one model is trained per fold).
NUM_FOLDS = 10
# Default number of passes over the training split per fold.
NUM_EPOCHS = 5
# Mini-batch size shared by the training, validation and test loaders.
BATCH_SIZE = 32
# Every excerpt is padded/truncated to this many tokens by the tokenizer.
MAX_LEN = 248
# (rmse_threshold, eval_period) pairs scanned in order by train(): the first
# pair whose threshold is <= the latest validation RMSE sets how many steps to
# wait before the next evaluation. The final (-1., 1) entry always matches.
EVAL_SCHEDULE = [(0.50, 16), (0.49, 8), (0.48, 4), (0.47, 2), (-1., 1)]
# Location of the pre-trained model weights/config and of its tokenizer.
ROBERTA_PATH = "../input/clrp-roberta-base/clrp_roberta_base"
TOKENIZER_PATH = "../input/clrp-roberta-base/clrp_roberta_base"
# Run on the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
def set_random_seed(random_seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices) and configures cuDNN for reproducible behaviour.

    Args:
        random_seed: Integer seed applied to all generators.
    """
    random.seed(random_seed)
    np.random.seed(random_seed)
    # NOTE: this only influences hash randomisation of interpreters started
    # after this point (e.g. subprocesses); the running interpreter has
    # already fixed its hash seed.
    os.environ["PYTHONHASHSEED"] = str(random_seed)

    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    # Ask cuDNN for deterministic kernels and disable the autotuner, which
    # may otherwise select different algorithms from one run to the next.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [4]:
# Load the training data (excerpts and their target labels).
train_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/train.csv")

# Remove incomplete training entries, if any: rows whose target and
# standard_error are both 0 are dropped, then the index is renumbered.
train_df.drop(train_df[(train_df.target == 0) & (train_df.standard_error == 0)].index,
              inplace=True)
train_df.reset_index(drop=True, inplace=True)

# Test excerpts and the submission template that will receive the predictions.
test_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/test.csv")
submission_df = pd.read_csv("/kaggle/input/commonlitreadabilityprize/sample_submission.csv")

In [5]:
# Load the tokenizer from the configured path; LitDataset uses this
# module-level object to encode the excerpts.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

# Dataset

In [6]:
class LitDataset(Dataset):
    """Dataset that tokenizes excerpts up front and serves them as tensors.

    All excerpts in ``df.excerpt`` are encoded once, at construction time,
    with the module-level ``tokenizer`` (padded/truncated to ``MAX_LEN``).

    Args:
        df: Frame with an ``excerpt`` column and, unless ``inference_only``
            is set, a ``target`` column.
        inference_only: When True no targets are stored and items are
            ``(input_ids, attention_mask)``; otherwise items are
            ``(input_ids, attention_mask, target)``.
    """

    def __init__(self, df, inference_only=False):
        super().__init__()

        self.df = df
        self.inference_only = inference_only
        self.text = df.excerpt.tolist()

        # Targets only exist for labelled (training / validation) data.
        if not inference_only:
            self.target = torch.tensor(df.target.values, dtype=torch.float32)

        self.encoded = tokenizer.batch_encode_plus(
            self.text,
            padding="max_length",
            max_length=MAX_LEN,
            truncation=True,
            return_attention_mask=True,
        )

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        features = tuple(
            torch.tensor(self.encoded[field][index])
            for field in ("input_ids", "attention_mask")
        )

        if self.inference_only:
            return features

        return features + (self.target[index],)

# Model
The model is inspired by the one from [Maunish](https://www.kaggle.com/maunish/clrp-roberta-svm).

In [7]:
class LitModel(nn.Module):
    """Pre-trained encoder with attention pooling and a linear regression head.

    The encoder is loaded from ``ROBERTA_PATH`` with hidden-state output
    enabled, hidden dropout disabled and ``layer_norm_eps`` set to 1e-7.
    The head sizes follow ``config.hidden_size`` instead of a hard-coded 768,
    so checkpoints with a different hidden width also work.
    """

    def __init__(self):
        super().__init__()

        config = AutoConfig.from_pretrained(ROBERTA_PATH)
        config.update({"output_hidden_states": True,
                       "hidden_dropout_prob": 0.0,
                       "layer_norm_eps": 1e-7})

        self.roberta = AutoModel.from_pretrained(ROBERTA_PATH, config=config)

        # Width of each token's hidden state (768 for roberta-base).
        hidden_size = config.hidden_size

        # Scores every token position, then normalises the scores over the
        # sequence dimension (dim=1) so they sum to 1 per example.
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )

        self.regressor = nn.Sequential(
            nn.Linear(hidden_size, 1)
        )

    def forward(self, input_ids, attention_mask):
        """Return one prediction score per example.

        Args:
            input_ids: Token ids, batch x sequence length.
            attention_mask: Mask forwarded to the encoder.

        Returns:
            Tensor of shape batch x 1.
        """
        roberta_output = self.roberta(input_ids=input_ids,
                                      attention_mask=attention_mask)

        # hidden_states holds the embedding output followed by one entry per
        # encoder layer; the last entry is the final layer's output.
        last_layer_hidden_states = roberta_output.hidden_states[-1]

        # One weight per token position: batch x sequence length x 1.
        # NOTE: attention_mask is not applied here, so padded positions take
        # part in the softmax as well.
        weights = self.attention(last_layer_hidden_states)

        # Weighted average of the token states: batch x hidden_size.
        context_vector = torch.sum(weights * last_layer_hidden_states, dim=1)

        # Reduce the context vector to the prediction score.
        return self.regressor(context_vector)

In [8]:
def eval_mse(model, data_loader):
    """Return the mean squared error of |model| over all of |data_loader|.

    The model is switched to eval mode and gradients are disabled. Squared
    errors are summed per batch and divided by the dataset size at the end,
    so a smaller final batch is weighted correctly.
    """
    model.eval()
    squared_error_total = 0
    summed_squared_error = nn.MSELoss(reduction="sum")

    with torch.no_grad():
        for input_ids, attention_mask, target in data_loader:
            batch_pred = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))
            batch_error = summed_squared_error(batch_pred.flatten(),
                                               target.to(DEVICE))
            squared_error_total += batch_error.item()

    return squared_error_total / len(data_loader.dataset)

In [9]:
def predict(model, data_loader):
    """Return an np.array holding the |model| predictions for |data_loader|.

    The model is switched to eval mode and gradients are disabled. Results
    are written, batch after batch, into an array pre-sized to the dataset
    length, so they follow the loader's iteration order.
    """
    model.eval()

    predictions = np.zeros(len(data_loader.dataset))
    offset = 0

    with torch.no_grad():
        for input_ids, attention_mask in data_loader:
            batch_pred = model(input_ids.to(DEVICE), attention_mask.to(DEVICE))
            batch_len = batch_pred.shape[0]

            predictions[offset:offset + batch_len] = batch_pred.flatten().cpu().numpy()
            offset += batch_len

    return predictions

In [10]:
def train(model, model_path, train_loader, val_loader,
          optimizer, scheduler=None, num_epochs=NUM_EPOCHS):
    """Train |model| and checkpoint the weights with the best validation RMSE.

    The model is evaluated on |val_loader| every ``eval_period`` steps. After
    each evaluation the period is re-read from ``EVAL_SCHEDULE`` (first entry
    whose threshold is <= the current RMSE), so evaluation becomes more
    frequent as the RMSE drops. Whenever the RMSE improves, the state dict is
    saved to |model_path|.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        model_path: File the best state dict is written to.
        train_loader: Yields ``(input_ids, attention_mask, target)`` batches.
        val_loader: Loader passed to ``eval_mse``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Optional LR scheduler stepped once per batch.
        num_epochs: Number of passes over |train_loader|.

    Returns:
        The best validation RMSE seen, or None if training ended before the
        first evaluation was due (in which case nothing was saved).
    """
    best_val_rmse = None
    best_epoch = 0
    step = 0
    last_eval_step = 0
    eval_period = EVAL_SCHEDULE[0][1]

    start = time.time()

    for epoch in range(num_epochs):
        for batch_num, (input_ids, attention_mask, target) in enumerate(train_loader):
            input_ids = input_ids.to(DEVICE)
            attention_mask = attention_mask.to(DEVICE)
            target = target.to(DEVICE)

            optimizer.zero_grad()

            # eval_mse() leaves the model in eval mode, so training mode is
            # restored before every forward pass.
            model.train()

            pred = model(input_ids, attention_mask)

            mse = nn.MSELoss(reduction="mean")(pred.flatten(), target)

            mse.backward()

            optimizer.step()
            if scheduler is not None:
                scheduler.step()

            if step >= last_eval_step + eval_period:
                # Evaluate the model on val_loader.
                elapsed_seconds = time.time() - start
                num_steps = step - last_eval_step
                print(f"\n{num_steps} steps took {elapsed_seconds:0.3} seconds")
                last_eval_step = step

                val_rmse = math.sqrt(eval_mse(model, val_loader))

                print(f"Epoch: {epoch} batch_num: {batch_num}",
                      f"val_rmse: {val_rmse:0.4}")

                # Pick the evaluation period that matches the current RMSE.
                for rmse, period in EVAL_SCHEDULE:
                    if val_rmse >= rmse:
                        eval_period = period
                        break

                # Compare against None explicitly: a best RMSE of exactly 0.0
                # is falsy and must not be mistaken for "no best yet".
                if best_val_rmse is None or val_rmse < best_val_rmse:
                    best_val_rmse = val_rmse
                    best_epoch = epoch
                    torch.save(model.state_dict(), model_path)
                    print(f"New best_val_rmse: {best_val_rmse:0.4}")
                else:
                    print(f"Still best_val_rmse: {best_val_rmse:0.4}",
                          f"(from epoch {best_epoch})")

                # Restart the timer so evaluation time is not counted as
                # training time.
                start = time.time()

            step += 1

    return best_val_rmse

In [11]:
def create_optimizer(model):
    """Build an AdamW optimizer with per-parameter learning rates.

    Parameters are split by their position in ``model.named_parameters()``:
    the first 197 entries are treated as the encoder, entries 199-202 as the
    attention head and entries from 203 on as the regressor. Entries 197 and
    198 are left out of the optimizer (presumably the encoder's pooler
    weights, which the forward pass does not use - TODO confirm).

    Encoder parameters get a learning rate that grows with their position
    (3e-5, then 5e-5 from index 69, then 1e-4 from index 133) and a weight
    decay of 0.01, except parameters whose name contains "bias" (0.0). The
    two head groups use the optimizer's defaults.
    """
    all_named = list(model.named_parameters())

    encoder_named = all_named[:197]
    attention_named = all_named[199:203]
    regressor_named = all_named[203:]

    param_groups = [
        {"params": [tensor for _, tensor in attention_named]},
        {"params": [tensor for _, tensor in regressor_named]},
    ]

    for position, (name, tensor) in enumerate(encoder_named):
        if position >= 133:
            lr = 1e-4
        elif position >= 69:
            lr = 5e-5
        else:
            lr = 3e-5

        param_groups.append({
            "params": tensor,
            "weight_decay": 0.0 if "bias" in name else 0.01,
            "lr": lr,
        })

    return AdamW(param_groups)

In [12]:
gc.collect()
# Base random seed; fold number `fold` is seeded with SEED + fold.
SEED = 1000
# Best validation RMSE of every fold, in fold order.
list_val_rmse = []
# K-fold cross-validation: one model is trained per fold and validated on
# the part of the training data held out for that fold.
kfold = KFold(n_splits=NUM_FOLDS, random_state=SEED, shuffle=True)
# Train and validate one model per fold.
for fold, (train_indices, val_indices) in enumerate(kfold.split(train_df)):    
    print(f"\nFold {fold + 1}/{NUM_FOLDS}")
    model_path = f"model_{fold + 1}.pth"
        
    set_random_seed(SEED + fold)
    # Build the datasets for this fold (the excerpts are tokenized here).
    train_dataset = LitDataset(train_df.loc[train_indices])    
    val_dataset = LitDataset(train_df.loc[val_indices])    
    # Wrap the datasets in loaders; only the training loader shuffles and
    # drops the last incomplete batch.
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              drop_last=True, shuffle=True, num_workers=2)    
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                            drop_last=False, shuffle=False, num_workers=2)    
        
    # Re-seed right before the model is created.
    set_random_seed(SEED + fold)    
    
    model = LitModel().to(DEVICE)
    # Optimizer with per-parameter learning rates.
    optimizer = create_optimizer(model)
    # Learning-rate schedule: 50 warm-up steps followed by cosine decay over
    # the total number of training steps.
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_training_steps=NUM_EPOCHS * len(train_loader),
        num_warmup_steps=50)    
    # Train this fold and record its best validation RMSE.
    list_val_rmse.append(train(model, model_path, train_loader,
                               val_loader, optimizer, scheduler=scheduler))
    # Drop the model before the next fold is built.
    del model
    gc.collect()
    
    # Running summary, printed after every fold.
    print("\nPerformance estimates:")
    print(list_val_rmse)
    print("Mean:", np.array(list_val_rmse).mean())
    


Fold 1/10


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



16 steps took 14.0 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9813
New best_val_rmse: 0.9813

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7596
New best_val_rmse: 0.7596

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.612
New best_val_rmse: 0.612

16 steps took 12.2 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.554
New best_val_rmse: 0.554

16 steps took 12.2 seconds
Epoch: 1 batch_num: 1 val_rmse: 0.5655
Still best_val_rmse: 0.554 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 1 batch_num: 17 val_rmse: 0.5432
New best_val_rmse: 0.5432

16 steps took 12.1 seconds
Epoch: 1 batch_num: 33 val_rmse: 0.5311
New best_val_rmse: 0.5311

16 steps took 12.2 seconds
Epoch: 1 batch_num: 49 val_rmse: 0.5438
Still best_val_rmse: 0.5311 (from epoch 1)

16 steps took 12.1 seconds
Epoch: 1 batch_num: 65 val_rmse: 0.6269
Still best_val_rmse: 0.5311 (from epoch 1)

16 steps took 12.3 seconds
Epoch: 2 batch_num: 2 val_rmse: 0.5536
Still best_val_rmse: 0.5311 (from epoch 1)

Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



16 steps took 13.1 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8957
New best_val_rmse: 0.8957

16 steps took 12.2 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6906
New best_val_rmse: 0.6906

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.5685
New best_val_rmse: 0.5685

16 steps took 12.2 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.505
New best_val_rmse: 0.505

16 steps took 12.3 seconds
Epoch: 1 batch_num: 1 val_rmse: 0.5197
Still best_val_rmse: 0.505 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 1 batch_num: 17 val_rmse: 0.5742
Still best_val_rmse: 0.505 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 1 batch_num: 33 val_rmse: 0.5549
Still best_val_rmse: 0.505 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 1 batch_num: 49 val_rmse: 0.5062
Still best_val_rmse: 0.505 (from epoch 0)

16 steps took 12.2 seconds
Epoch: 1 batch_num: 65 val_rmse: 0.475
New best_val_rmse: 0.475

2 steps took 1.54 seconds
Epoch: 1 batch_num: 67 val_rmse: 0.4794
Still best_val_rmse: 0.475 (

Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



16 steps took 13.1 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9205
New best_val_rmse: 0.9205

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6926
New best_val_rmse: 0.6926

16 steps took 12.2 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.5766
New best_val_rmse: 0.5766

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5267
New best_val_rmse: 0.5267

16 steps took 12.6 seconds
Epoch: 1 batch_num: 1 val_rmse: 0.5577
Still best_val_rmse: 0.5267 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 1 batch_num: 17 val_rmse: 0.6186
Still best_val_rmse: 0.5267 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 1 batch_num: 33 val_rmse: 0.5156
New best_val_rmse: 0.5156

16 steps took 12.1 seconds
Epoch: 1 batch_num: 49 val_rmse: 0.5101
New best_val_rmse: 0.5101

16 steps took 12.1 seconds
Epoch: 1 batch_num: 65 val_rmse: 0.4672
New best_val_rmse: 0.4672

1 steps took 0.787 seconds
Epoch: 1 batch_num: 66 val_rmse: 0.4747
Still best_val_rmse: 0.4672 (from epoch 1)

2 steps t

Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



16 steps took 13.1 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9673
New best_val_rmse: 0.9673

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7109
New best_val_rmse: 0.7109

16 steps took 12.2 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6601
New best_val_rmse: 0.6601

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.6535
New best_val_rmse: 0.6535

16 steps took 12.4 seconds
Epoch: 1 batch_num: 1 val_rmse: 0.5276
New best_val_rmse: 0.5276

16 steps took 12.2 seconds
Epoch: 1 batch_num: 17 val_rmse: 0.512
New best_val_rmse: 0.512

16 steps took 12.2 seconds
Epoch: 1 batch_num: 33 val_rmse: 0.5373
Still best_val_rmse: 0.512 (from epoch 1)

16 steps took 12.1 seconds
Epoch: 1 batch_num: 49 val_rmse: 0.6822
Still best_val_rmse: 0.512 (from epoch 1)

16 steps took 12.2 seconds
Epoch: 1 batch_num: 65 val_rmse: 0.5029
New best_val_rmse: 0.5029

16 steps took 12.3 seconds
Epoch: 2 batch_num: 2 val_rmse: 0.5088
Still best_val_rmse: 0.5029 (from epoch 1)

16 steps took 

Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



16 steps took 13.0 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9932
New best_val_rmse: 0.9932

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6753
New best_val_rmse: 0.6753

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6982
Still best_val_rmse: 0.6753 (from epoch 0)

16 steps took 12.2 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.561
New best_val_rmse: 0.561

16 steps took 12.3 seconds
Epoch: 1 batch_num: 1 val_rmse: 0.5567
New best_val_rmse: 0.5567

16 steps took 12.2 seconds
Epoch: 1 batch_num: 17 val_rmse: 0.5607
Still best_val_rmse: 0.5567 (from epoch 1)

16 steps took 12.1 seconds
Epoch: 1 batch_num: 33 val_rmse: 0.5156
New best_val_rmse: 0.5156

16 steps took 12.2 seconds
Epoch: 1 batch_num: 49 val_rmse: 0.4877
New best_val_rmse: 0.4877

4 steps took 3.04 seconds
Epoch: 1 batch_num: 53 val_rmse: 0.4925
Still best_val_rmse: 0.4877 (from epoch 1)

8 steps took 6.08 seconds
Epoch: 1 batch_num: 61 val_rmse: 0.4851
New best_val_rmse: 0.4851

4 steps took 

Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



16 steps took 13.0 seconds
Epoch: 0 batch_num: 16 val_rmse: 1.009
New best_val_rmse: 1.009

16 steps took 12.2 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7212
New best_val_rmse: 0.7212

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6089
New best_val_rmse: 0.6089

16 steps took 12.2 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5598
New best_val_rmse: 0.5598

16 steps took 12.3 seconds
Epoch: 1 batch_num: 1 val_rmse: 0.5617
Still best_val_rmse: 0.5598 (from epoch 0)

16 steps took 12.2 seconds
Epoch: 1 batch_num: 17 val_rmse: 0.5406
New best_val_rmse: 0.5406

16 steps took 12.2 seconds
Epoch: 1 batch_num: 33 val_rmse: 0.5035
New best_val_rmse: 0.5035

16 steps took 12.3 seconds
Epoch: 1 batch_num: 49 val_rmse: 0.5239
Still best_val_rmse: 0.5035 (from epoch 1)

16 steps took 12.2 seconds
Epoch: 1 batch_num: 65 val_rmse: 0.5372
Still best_val_rmse: 0.5035 (from epoch 1)

16 steps took 12.4 seconds
Epoch: 2 batch_num: 2 val_rmse: 0.4965
New best_val_rmse: 0.4965

8 steps took

Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



16 steps took 13.0 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8956
New best_val_rmse: 0.8956

16 steps took 12.1 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6828
New best_val_rmse: 0.6828

16 steps took 12.2 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.5762
New best_val_rmse: 0.5762

16 steps took 12.1 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5601
New best_val_rmse: 0.5601

16 steps took 12.3 seconds
Epoch: 1 batch_num: 1 val_rmse: 0.54
New best_val_rmse: 0.54

16 steps took 12.2 seconds
Epoch: 1 batch_num: 17 val_rmse: 0.5483
Still best_val_rmse: 0.54 (from epoch 1)

16 steps took 12.1 seconds
Epoch: 1 batch_num: 33 val_rmse: 0.5229
New best_val_rmse: 0.5229

16 steps took 12.2 seconds
Epoch: 1 batch_num: 49 val_rmse: 0.5392
Still best_val_rmse: 0.5229 (from epoch 1)

16 steps took 12.2 seconds
Epoch: 1 batch_num: 65 val_rmse: 0.5214
New best_val_rmse: 0.5214

16 steps took 12.3 seconds
Epoch: 2 batch_num: 2 val_rmse: 0.5157
New best_val_rmse: 0.5157

16 steps took 12.2 seconds
Epoch:

Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



16 steps took 13.0 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9201
New best_val_rmse: 0.9201

16 steps took 12.3 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7405
New best_val_rmse: 0.7405

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6124
New best_val_rmse: 0.6124

16 steps took 12.2 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5926
New best_val_rmse: 0.5926

16 steps took 12.3 seconds
Epoch: 1 batch_num: 1 val_rmse: 0.6132
Still best_val_rmse: 0.5926 (from epoch 0)

16 steps took 12.1 seconds
Epoch: 1 batch_num: 17 val_rmse: 0.5168
New best_val_rmse: 0.5168

16 steps took 12.2 seconds
Epoch: 1 batch_num: 33 val_rmse: 0.4928
New best_val_rmse: 0.4928

8 steps took 6.11 seconds
Epoch: 1 batch_num: 41 val_rmse: 0.5293
Still best_val_rmse: 0.4928 (from epoch 1)

16 steps took 12.2 seconds
Epoch: 1 batch_num: 57 val_rmse: 0.5104
Still best_val_rmse: 0.4928 (from epoch 1)

16 steps took 12.1 seconds
Epoch: 1 batch_num: 73 val_rmse: 0.56
Still best_val_rmse: 0.4928 (from epoch

Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



16 steps took 13.0 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.8554
New best_val_rmse: 0.8554

16 steps took 12.3 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.6714
New best_val_rmse: 0.6714

16 steps took 12.1 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.5399
New best_val_rmse: 0.5399

16 steps took 12.2 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.5151
New best_val_rmse: 0.5151

16 steps took 12.3 seconds
Epoch: 1 batch_num: 1 val_rmse: 0.4973
New best_val_rmse: 0.4973

8 steps took 6.13 seconds
Epoch: 1 batch_num: 9 val_rmse: 0.5624
Still best_val_rmse: 0.4973 (from epoch 1)

16 steps took 12.2 seconds
Epoch: 1 batch_num: 25 val_rmse: 0.52
Still best_val_rmse: 0.4973 (from epoch 1)

16 steps took 12.1 seconds
Epoch: 1 batch_num: 41 val_rmse: 0.4766
New best_val_rmse: 0.4766

2 steps took 1.52 seconds
Epoch: 1 batch_num: 43 val_rmse: 0.4808
Still best_val_rmse: 0.4766 (from epoch 1)

4 steps took 3.1 seconds
Epoch: 1 batch_num: 47 val_rmse: 0.5185
Still best_val_rmse: 0.4766 (from epoch 1)


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



16 steps took 13.1 seconds
Epoch: 0 batch_num: 16 val_rmse: 0.9628
New best_val_rmse: 0.9628

16 steps took 12.2 seconds
Epoch: 0 batch_num: 32 val_rmse: 0.7367
New best_val_rmse: 0.7367

16 steps took 12.2 seconds
Epoch: 0 batch_num: 48 val_rmse: 0.6691
New best_val_rmse: 0.6691

16 steps took 12.2 seconds
Epoch: 0 batch_num: 64 val_rmse: 0.7469
Still best_val_rmse: 0.6691 (from epoch 0)

16 steps took 12.3 seconds
Epoch: 1 batch_num: 1 val_rmse: 0.5794
New best_val_rmse: 0.5794

16 steps took 12.1 seconds
Epoch: 1 batch_num: 17 val_rmse: 0.5706
New best_val_rmse: 0.5706

16 steps took 12.2 seconds
Epoch: 1 batch_num: 33 val_rmse: 0.5202
New best_val_rmse: 0.5202

16 steps took 12.1 seconds
Epoch: 1 batch_num: 49 val_rmse: 0.4967
New best_val_rmse: 0.4967

8 steps took 6.07 seconds
Epoch: 1 batch_num: 57 val_rmse: 0.5142
Still best_val_rmse: 0.4967 (from epoch 1)

16 steps took 12.1 seconds
Epoch: 1 batch_num: 73 val_rmse: 0.5189
Still best_val_rmse: 0.4967 (from epoch 1)

16 steps t

# Inference

In [13]:
# Test data: tokenize the test excerpts (no targets are available).
test_dataset = LitDataset(test_df, inference_only=True)

In [14]:
# One row of predictions per trained fold model, one column per test excerpt.
all_predictions = np.zeros((len(list_val_rmse), len(test_df)))
# Tokenize the test excerpts.
test_dataset = LitDataset(test_df, inference_only=True)
# Serve the test data in order so predictions line up with test_df rows.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                         drop_last=False, shuffle=False, num_workers=2)
# Predict with the checkpoint saved for each fold.
for index in range(len(list_val_rmse)):
    model_path = f"model_{index + 1}.pth"
    print(f"\nUsing {model_path}")
    # Build the model.
    model = LitModel()
    # Load the saved weights. map_location remaps the stored tensors onto the
    # current device, so a checkpoint written on a GPU can also be loaded in
    # a CPU-only session.
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    # Move the model to the target device.
    model.to(DEVICE)
    # Store this model's predictions in its row of the array.
    all_predictions[index] = predict(model, test_loader)
    # Drop the model before the next one is loaded.
    del model
    gc.collect()


Using model_1.pth


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Using model_2.pth


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Using model_3.pth


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Using model_4.pth


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Using model_5.pth


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Using model_6.pth


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Using model_7.pth


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Using model_8.pth


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Using model_9.pth


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Using model_10.pth


Some weights of RobertaModel were not initialized from the model checkpoint at ../input/clrp-roberta-base/clrp_roberta_base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Average the predictions of all fold models and write them to a CSV file.
predictions = all_predictions.mean(axis=0)
submission_df.target = predictions
print(submission_df)
submission_df.to_csv("submission.csv", index=False)

          id    target
0  c0f722661 -0.419028
1  f0953f0a5 -0.668754
2  0df072751 -0.375996
3  04caf4e0c -2.366732
4  0e63f8bea -1.768860
5  12537fe78 -1.326430
6  965e592c0  0.058625
