https://uku28motab.feishu.cn/docs/doccnUDbEhudHm2V440lcY87B1c

In [1]:
# 导入相关的库
import re
import os
import numpy as np
import pandas as pd
import random
import math
import tensorflow as tf
import logging
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from tensorflow.keras import backend as K
from transformers import RobertaTokenizer, TFRobertaModel
from kaggle_datasets import KaggleDatasets
tf.get_logger().setLevel(logging.ERROR)
from kaggle_datasets import KaggleDatasets

In [2]:
# ---- Configuration ----
# Number of epochs passed to model.fit for every fold
EPOCHS = 70
# Number of examples per batch
BATCH_SIZE = 24
# Random seed shared by the RNG seeding and the K-fold split
SEED = 123
# Learning rate handed to the optimizer
LR = 4e-5
# Verbosity level used for Keras fit/checkpoint logging
VERBOSE = 2
# Number of cross-validation folds
FOLDS = 5

# Maximum token sequence length (texts are padded/truncated to this size)
MAX_LEN = 250

# Name of the pretrained model to fine-tune
MODEL = 'roberta-base'

# Load the tokenizer that belongs to the pretrained model
tokenizer = RobertaTokenizer.from_pretrained(MODEL)

# Let tf.data choose prefetch parallelism automatically
AUTO = tf.data.experimental.AUTOTUNE

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [3]:
# Seed the random number generators used in this notebook
def seed_everything(seed):
    """Apply one seed to Python's ``random``, NumPy and TensorFlow.

    Also stores the seed (as a string) in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Seed value forwarded to each seeding function.
    """
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)

# Tokenize texts with the transformer model's tokenizer
def regular_encode(texts, tokenizer, maxlen = MAX_LEN):
    """Convert a batch of texts into a fixed-width array of token ids.

    Args:
        texts: Sequence of strings to tokenize.
        tokenizer: Tokenizer object exposing ``batch_encode_plus``.
        maxlen: Length every sequence is padded or truncated to.

    Returns:
        A NumPy array built from the ``'input_ids'`` entry of the tokenizer
        output; only the ids are returned, nothing else from the encoding.
    """
    encoded = tokenizer.batch_encode_plus(
        texts,
        max_length = maxlen,
        truncation = True,
        padding = 'max_length',
    )
    token_ids = encoded['input_ids']
    return np.array(token_ids)

# Encode the training and validation sentences
def encode_texts(x_train, x_val, MAX_LEN):
    """Tokenize the training and validation texts with the module-level tokenizer.

    Args:
        x_train: Training texts; must provide ``.tolist()``.
        x_val: Validation texts; must provide ``.tolist()``.
        MAX_LEN: Sequence length forwarded to ``regular_encode`` as ``maxlen``
            (this parameter shadows the module-level constant of the same name).

    Returns:
        A ``(train_ids, val_ids)`` tuple of encoded arrays.
    """
    return tuple(
        regular_encode(split.tolist(), tokenizer, maxlen = MAX_LEN)
        for split in (x_train, x_val)
    )

# Turn the arrays into tf.data datasets
def transform_to_tensors(x_train, x_val, y_train, y_val):
    """Wrap the training and validation arrays in batched ``tf.data`` pipelines.

    The training pipeline repeats without end and is shuffled through a
    2048-element buffer before batching, so the caller has to bound each epoch
    (e.g. with ``steps_per_epoch``). The validation pipeline is a single
    ordered pass over the data.

    Args:
        x_train: Training features.
        x_val: Validation features.
        y_train: Training targets, aligned with ``x_train``.
        y_val: Validation targets, aligned with ``x_val``.

    Returns:
        A ``(train_dataset, valid_dataset)`` tuple.
    """
    def _batch_and_prefetch(dataset):
        # Shared tail of both pipelines
        return dataset.batch(BATCH_SIZE).prefetch(AUTO)

    train_pairs = tf.data.Dataset.from_tensor_slices((x_train, y_train))
    train_dataset = _batch_and_prefetch(train_pairs.repeat().shuffle(2048))

    valid_pairs = tf.data.Dataset.from_tensor_slices((x_val, y_val))
    valid_dataset = _batch_and_prefetch(valid_pairs)

    return train_dataset, valid_dataset

# Build the roberta-base regression model
def build_roberta_base_model(max_len = MAX_LEN):
    """Build and compile a regression model on top of the pretrained transformer.

    The pretrained model named by ``MODEL`` receives a batch of token ids; the
    hidden state at sequence position 0 is fed into a single linear unit that
    produces the prediction.

    Args:
        max_len: Length of each input token-id sequence; fixes the shape of
            the Keras input layer.

    Returns:
        A compiled Keras model using mean squared error as the loss and
        reporting root mean squared error as a metric.
    """
    transformer = TFRobertaModel.from_pretrained(MODEL)
    input_word_ids = tf.keras.layers.Input(shape = (max_len, ), dtype = tf.int32, name = 'input_word_ids')
    # NOTE(review): only token ids are passed to the transformer; no attention
    # mask is supplied, so padded positions are not explicitly masked here.
    # Confirm this is intended.
    sequence_output = transformer(input_word_ids)[0]
    # Keep only the first token's hidden state, giving a 2-D (batch, hidden) tensor
    cls_token = sequence_output[:, 0, :]
    output = tf.keras.layers.Dense(1, activation = 'linear', dtype = 'float32')(cls_token)
    model = tf.keras.models.Model(inputs = [input_word_ids], outputs = [output])
    # `learning_rate` is the documented argument; the old `lr` alias is
    # deprecated and newer Keras releases warn about or reject it.
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = LR),
                  loss = [tf.keras.losses.MeanSquaredError()],
                  metrics = [tf.keras.metrics.RootMeanSquaredError()])
    return model

# Training and validation function
def train_and_evaluate():
    """Fine-tune one model per cross-validation fold and print the out-of-fold RMSE.

    For each of the ``FOLDS`` splits a fresh model is built and trained on the
    training indices. The weights with the lowest validation RMSE are saved to
    ``Roberta_Base_{SEED}_{fold}.h5`` in the working directory, reloaded after
    training, and used to predict the held-out indices.

    Returns:
        None. The RMSE over all out-of-fold predictions is printed.
    """
    # Read the training data
    df = pd.read_csv('../input/commonlitreadabilityprize/train.csv')
    # Seed every RNG before splitting and training
    seed_everything(SEED)

    # Shuffled K-fold split with a fixed seed
    kfold = KFold(n_splits = FOLDS, shuffle = True, random_state = SEED)
    # Out-of-fold array that collects the validation predictions of every fold
    oof_predictions = np.zeros(len(df))

    for fold, (trn_ind, val_ind) in enumerate(kfold.split(df)):
        print('\n')
        print('-'*50)
        print(f'Training fold {fold + 1}')
        K.clear_session()
        # Text features and targets for this fold
        x_train, x_val = df['excerpt'].iloc[trn_ind], df['excerpt'].iloc[val_ind]
        y_train, y_val = df['target'].iloc[trn_ind].values, df['target'].iloc[val_ind].values
        # Encode the texts with the RoBERTa tokenizer
        x_train, x_val = encode_texts(x_train, x_val, MAX_LEN)
        # Convert the NumPy arrays into tf.data datasets
        train_dataset, valid_dataset = transform_to_tensors(x_train, x_val, y_train, y_val)
        # Build the model
        model = build_roberta_base_model(max_len = MAX_LEN)
        # Keep only the weights of the epoch with the lowest validation RMSE
        weights_path = f'Roberta_Base_{SEED}_{fold + 1}.h5'
        checkpoint = tf.keras.callbacks.ModelCheckpoint(weights_path,
                                                        monitor = 'val_root_mean_squared_error',
                                                        verbose = VERBOSE,
                                                        save_best_only = True,
                                                        save_weights_only = True,
                                                        mode = 'min')
        # Each Keras "epoch" consumes about 1/16 of the training rows, so
        # validation and checkpointing happen 16 times per pass over the data.
        # Floor at 1 so a small fold never yields steps_per_epoch = 0.
        steps = max(1, x_train.shape[0] // (BATCH_SIZE * 16))
        # Training phase. The dataset is already batched, so `batch_size` is
        # not passed to fit (Keras requires it to be left unset for datasets).
        model.fit(train_dataset,
                  epochs = EPOCHS,
                  verbose = VERBOSE,
                  callbacks = [checkpoint],
                  validation_data = valid_dataset,
                  steps_per_epoch = steps)

        # Restore the best-epoch weights
        model.load_weights(weights_path)
        # Predict the validation set and store the result in the out-of-fold array
        val_pred = model.predict(valid_dataset)
        oof_predictions[val_ind] = val_pred.reshape(-1)

    print('\n')
    print('-'*50)
    # Score all out-of-fold predictions against the true targets
    oof_rmse = np.sqrt(mean_squared_error(df['target'], oof_predictions))
    print(f'Our out of folds RMSE is {oof_rmse}')
    
# Run the training and validation function
train_and_evaluate()



--------------------------------------------------
Training fold 1


Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/657M [00:00<?, ?B/s]

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/70
5/5 - 27s - loss: 1.0869 - root_mean_squared_error: 1.0425 - val_loss: 1.0498 - val_root_mean_squared_error: 1.0246

Epoch 00001: val_root_mean_squared_error improved from inf to 1.02460, saving model to Roberta_Base_123_1.h5
Epoch 2/70
5/5 - 9s - loss: 1.1106 - root_mean_squared_error: 1.0538 - val_loss: 1.0096 - val_root_mean_squared_error: 1.0048

Epoch 00002: val_root_mean_squared_error improved from 1.02460 to 1.00477, saving model to Roberta_Base_123_1.h5
Epoch 3/70
5/5 - 9s - loss: 1.1947 - root_mean_squared_error: 1.0930 - val_loss: 0.6229 - val_root_mean_squared_error: 0.7892

Epoch 00003: val_root_mean_squared_error improved from 1.00477 to 0.78922, saving model to Roberta_Base_123_1.h5
Epoch 4/70
5/5 - 9s - loss: 0.6912 - root_mean_squared_error: 0.8314 - val_loss: 0.4915 - val_root_mean_squared_error: 0.7011

Epoch 00004: val_root_mean_squared_error improved from 0.78922 to 0.70105, saving model to Roberta_Base_123_1.h5
Epoch 5/70
5/5 - 9s - loss: 0.9747 - root_m

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/70
5/5 - 26s - loss: 1.3435 - root_mean_squared_error: 1.1591 - val_loss: 1.1107 - val_root_mean_squared_error: 1.0539

Epoch 00001: val_root_mean_squared_error improved from inf to 1.05388, saving model to Roberta_Base_123_2.h5
Epoch 2/70
5/5 - 9s - loss: 0.9537 - root_mean_squared_error: 0.9766 - val_loss: 0.8807 - val_root_mean_squared_error: 0.9385

Epoch 00002: val_root_mean_squared_error improved from 1.05388 to 0.93848, saving model to Roberta_Base_123_2.h5
Epoch 3/70
5/5 - 9s - loss: 0.8168 - root_mean_squared_error: 0.9038 - val_loss: 1.0929 - val_root_mean_squared_error: 1.0454

Epoch 00003: val_root_mean_squared_error did not improve from 0.93848
Epoch 4/70
5/5 - 9s - loss: 0.7448 - root_mean_squared_error: 0.8630 - val_loss: 0.6829 - val_root_mean_squared_error: 0.8264

Epoch 00004: val_root_mean_squared_error improved from 0.93848 to 0.82637, saving model to Roberta_Base_123_2.h5
Epoch 5/70
5/5 - 9s - loss: 0.9309 - root_mean_squared_error: 0.9648 - val_loss: 0.979

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/70
5/5 - 27s - loss: 1.3198 - root_mean_squared_error: 1.1488 - val_loss: 1.0267 - val_root_mean_squared_error: 1.0133

Epoch 00001: val_root_mean_squared_error improved from inf to 1.01327, saving model to Roberta_Base_123_3.h5
Epoch 2/70
5/5 - 9s - loss: 1.1690 - root_mean_squared_error: 1.0812 - val_loss: 0.9240 - val_root_mean_squared_error: 0.9613

Epoch 00002: val_root_mean_squared_error improved from 1.01327 to 0.96127, saving model to Roberta_Base_123_3.h5
Epoch 3/70
5/5 - 9s - loss: 1.0497 - root_mean_squared_error: 1.0246 - val_loss: 0.8003 - val_root_mean_squared_error: 0.8946

Epoch 00003: val_root_mean_squared_error improved from 0.96127 to 0.89458, saving model to Roberta_Base_123_3.h5
Epoch 4/70
5/5 - 9s - loss: 0.6972 - root_mean_squared_error: 0.8350 - val_loss: 0.5675 - val_root_mean_squared_error: 0.7533

Epoch 00004: val_root_mean_squared_error improved from 0.89458 to 0.75332, saving model to Roberta_Base_123_3.h5
Epoch 5/70
5/5 - 9s - loss: 0.6441 - root_m

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/70
5/5 - 27s - loss: 1.3500 - root_mean_squared_error: 1.1619 - val_loss: 1.5019 - val_root_mean_squared_error: 1.2255

Epoch 00001: val_root_mean_squared_error improved from inf to 1.22550, saving model to Roberta_Base_123_4.h5
Epoch 2/70
5/5 - 9s - loss: 1.4675 - root_mean_squared_error: 1.2114 - val_loss: 0.9332 - val_root_mean_squared_error: 0.9660

Epoch 00002: val_root_mean_squared_error improved from 1.22550 to 0.96601, saving model to Roberta_Base_123_4.h5
Epoch 3/70
5/5 - 9s - loss: 1.2319 - root_mean_squared_error: 1.1099 - val_loss: 0.9981 - val_root_mean_squared_error: 0.9990

Epoch 00003: val_root_mean_squared_error did not improve from 0.96601
Epoch 4/70
5/5 - 9s - loss: 1.0294 - root_mean_squared_error: 1.0146 - val_loss: 1.0311 - val_root_mean_squared_error: 1.0154

Epoch 00004: val_root_mean_squared_error did not improve from 0.96601
Epoch 5/70
5/5 - 9s - loss: 0.9241 - root_mean_squared_error: 0.9613 - val_loss: 0.6042 - val_root_mean_squared_error: 0.7773

Ep

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Epoch 1/70
5/5 - 27s - loss: 2.0511 - root_mean_squared_error: 1.4322 - val_loss: 0.9599 - val_root_mean_squared_error: 0.9798

Epoch 00001: val_root_mean_squared_error improved from inf to 0.97975, saving model to Roberta_Base_123_5.h5
Epoch 2/70
5/5 - 9s - loss: 0.8735 - root_mean_squared_error: 0.9346 - val_loss: 0.9170 - val_root_mean_squared_error: 0.9576

Epoch 00002: val_root_mean_squared_error improved from 0.97975 to 0.95760, saving model to Roberta_Base_123_5.h5
Epoch 3/70
5/5 - 9s - loss: 1.0988 - root_mean_squared_error: 1.0482 - val_loss: 0.9492 - val_root_mean_squared_error: 0.9743

Epoch 00003: val_root_mean_squared_error did not improve from 0.95760
Epoch 4/70
5/5 - 9s - loss: 1.1854 - root_mean_squared_error: 1.0887 - val_loss: 0.8523 - val_root_mean_squared_error: 0.9232

Epoch 00004: val_root_mean_squared_error improved from 0.95760 to 0.92320, saving model to Roberta_Base_123_5.h5
Epoch 5/70
5/5 - 9s - loss: 0.8886 - root_mean_squared_error: 0.9426 - val_loss: 0.965