<a href="https://colab.research.google.com/github/yonseimath/datascience-biginner-2022-kaggle-competitions/blob/feature%2Fyenakim/yenakim/AI4Code_TF_TPU_CodeBert_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
import os
from typing import List

import numpy as np
import tensorflow as tf
import transformers
from kaggle_datasets import KaggleDatasets
from sklearn.model_selection import KFold

In [None]:
# Seed passed to the training shuffle so runs are repeatable.
RANDOM_STATE = 42
# Number of cross-validation folds; the TFRecords are read from one
# sub-directory per fold index under GCS_PATH/tfrec/.
N_SPLITS = 5
# Length of the input_ids / attention_mask vectors stored in every record.
TOTAL_MAX_LEN = 512
BASE_MODEL = "microsoft/codebert-base"
GCS_PATH = KaggleDatasets().get_gcs_path("ai4code-codebert-tokens")
EPOCHS = 5
LR = 3e-5  # peak learning rate (reached at the end of warmup)
WARMUP_RATE = 0.05  # fraction of all training steps spent in warmup
# Keras verbosity: 1 = progress bar (interactive sessions), 2 = one line per epoch.
# `.get` is used so a missing variable falls back to the terse mode instead of
# raising KeyError.
VERBOSE = 1 if os.environ.get("KAGGLE_KERNEL_RUN_TYPE") == "Interactive" else 2

# Use a TPU when one can be initialized; otherwise fall back to the default
# distribution strategy with a much smaller batch size.
try:
    TPU = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(TPU)
    tf.tpu.experimental.initialize_tpu_system(TPU)
    STRATEGY = tf.distribute.experimental.TPUStrategy(TPU)
    BATCH_SIZE = 64 * STRATEGY.num_replicas_in_sync  # 64 samples per replica
except Exception as exc:
    # The fallback is deliberately best-effort (any failure means "no TPU"),
    # but the reason is reported so a genuine TPU setup error is not mistaken
    # for a machine that simply has no TPU attached.
    print("TPU initialization failed; falling back to the default strategy:", repr(exc))
    TPU = None
    STRATEGY = tf.distribute.get_strategy()
    BATCH_SIZE = 4

print("TensorFlow", tf.__version__)

if TPU is not None:
    print("Using TPU v3-8")
else:
    print("Using GPU/CPU")

print("Batch size:", BATCH_SIZE)

In [None]:
# Count the samples stored in a collection of TFRecord shards.
def count_samples(filenames: List[str]) -> int:
    """Return the total sample count encoded in the given file names.

    For each path, the base name is cut at its first ``.`` and the text after
    the last ``-`` of what remains is parsed as an integer (for example
    ``train-00-128.tfrec`` contributes 128). The per-file counts are summed.

    Raises:
        ValueError: if that trailing piece of a file name is not an integer.
    """
    total = 0
    for path in filenames:
        stem = os.path.basename(path).partition(".")[0]
        total += int(stem.rpartition("-")[2])
    return total

# Parse one serialized record into the (inputs, label) pair the model consumes.
def read_tfrecord(example):
    """Decode a single serialized example.

    Args:
        example: one serialized record, as yielded by ``tf.data.TFRecordDataset``.

    Returns:
        A tuple ``(inputs, label)``. ``inputs`` is a dict with ``input_ids`` and
        ``attention_mask`` (length ``TOTAL_MAX_LEN``, cast from int64 to int32)
        and the scalar float32 ``feature``; ``label`` is a scalar float32.
    """
    # Both token-level fields are fixed-length int64 vectors; both scalars are float32.
    token_spec = tf.io.FixedLenFeature([TOTAL_MAX_LEN], tf.int64)
    scalar_spec = tf.io.FixedLenFeature([], tf.float32)
    schema = {
        "input_ids": token_spec,
        "attention_mask": token_spec,
        "feature": scalar_spec,
        "label": scalar_spec,
    }
    parsed = tf.io.parse_single_example(example, schema)

    # The model's Input layers are declared as int32, so the stored int64 ids are cast here.
    inputs = {
        "input_ids": tf.cast(parsed["input_ids"], tf.int32),
        "attention_mask": tf.cast(parsed["attention_mask"], tf.int32),
        "feature": parsed["feature"],
    }
    return inputs, parsed["label"]


def get_dataset(
    filenames: List[str],
    ordered: bool = False,
    repeated: bool = True,
    cached: bool = False,
) -> tf.data.Dataset:
    """Build the batched input pipeline for a set of TFRecord files.

    Args:
        filenames: paths of the TFRecord shards to read.
        ordered: when False, records may be read out of order and are shuffled
            through a 2048-element buffer seeded with ``RANDOM_STATE``; when
            True, file order is preserved and no shuffling is applied.
        repeated: when True, the dataset repeats indefinitely (the caller must
            then bound each epoch with ``steps_per_epoch``).
        cached: when True, the parsed records are cached after their first
            full pass.

    Returns:
        The dataset batched to ``BATCH_SIZE`` (incomplete final batch dropped),
        prefetched, and wrapped by ``STRATEGY.experimental_distribute_dataset``.
    """
    auto = tf.data.experimental.AUTOTUNE  # let tf.data choose the parallelism
    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=auto)
    if not ordered:
        # Allow elements to be produced out of order, trading determinism for throughput.
        ignore_order = tf.data.Options()
        ignore_order.experimental_deterministic = False
        dataset = dataset.with_options(ignore_order)
    dataset = dataset.map(read_tfrecord, num_parallel_calls=auto)
    if cached:
        # Cache the finite stream of parsed records *before* shuffle/repeat:
        # caching after `repeat()` would try to store an endless stream, and
        # caching after `shuffle()` would replay one fixed order every epoch.
        dataset = dataset.cache()
    if not ordered:
        dataset = dataset.shuffle(2048, seed=RANDOM_STATE)
    if repeated:
        dataset = dataset.repeat()
    # Drop the incomplete final batch so every batch has exactly BATCH_SIZE rows.
    dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
    # Overlap input preparation with consumption of the previous batch.
    dataset = dataset.prefetch(auto)
    return STRATEGY.experimental_distribute_dataset(dataset)

# Build the model: pretrained backbone plus one extra scalar feature feeding a linear head.
def get_model() -> tf.keras.Model:
    """Create the (uncompiled) Keras regression model.

    The model takes three inputs named ``input_ids``, ``attention_mask``
    (int32, length ``TOTAL_MAX_LEN``) and ``feature`` (float32, length 1).
    Position 0 of the backbone's first output is concatenated with ``feature``
    and passed through a single-unit linear Dense layer computed in float32.
    """

    def _token_input(name):
        # int32 vector input of length TOTAL_MAX_LEN, matching the parsed records
        return tf.keras.layers.Input(
            shape=(TOTAL_MAX_LEN,),
            dtype=tf.int32,
            name=name,
        )

    # Load the pretrained BASE_MODEL weights.
    encoder = transformers.TFAutoModel.from_pretrained(BASE_MODEL)

    input_ids = _token_input("input_ids")
    attention_mask = _token_input("attention_mask")
    feature = tf.keras.layers.Input(shape=(1,), dtype=tf.float32, name="feature")

    encoder_outputs = encoder(
        {"input_ids": input_ids, "attention_mask": attention_mask}
    )
    sequence_output = encoder_outputs[0]
    # Keep only position 0 along the sequence axis, then append the extra feature.
    first_position = sequence_output[:, 0, :]
    head_input = tf.concat([first_position, feature], axis=1)

    regression_head = tf.keras.layers.Dense(1, activation="linear", dtype="float32")
    prediction = regression_head(head_input)

    return tf.keras.Model(
        inputs=[input_ids, attention_mask, feature],
        outputs=prediction,
    )

# Learning-rate schedule: linear warmup followed by linear decay to zero.
class WarmupLinearDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Scale a base learning rate up linearly, then down linearly to zero.

    For ``step <= warmup_steps`` the rate is
    ``base_learning_rate * step / warmup_steps``; afterwards it falls linearly
    from ``base_learning_rate`` at ``warmup_steps`` to 0 at ``total_steps``.
    Past ``total_steps`` the rate stays at 0 instead of turning negative.

    Args:
        base_learning_rate: peak learning rate, reached at ``warmup_steps``.
        warmup_steps: number of steps over which the rate ramps up.
        total_steps: step at which the rate reaches zero.
    """

    def __init__(
        self,
        base_learning_rate: float,
        warmup_steps: int,
        total_steps: int,
    ) -> None:
        super().__init__()
        self._base_learning_rate = base_learning_rate
        self._warmup_steps = warmup_steps
        self._total_steps = total_steps

    def __call__(self, step):
        """Return the learning rate for the given optimizer step."""
        # Use the values stored on the instance; the schedule must not depend on
        # module-level variables that happen to share these names.
        # A zero-length warmup would otherwise evaluate 0 / 0 at step 0.
        warmup_divisor = max(self._warmup_steps, 1)
        factor = tf.cond(
            tf.math.less_equal(step, self._warmup_steps),
            lambda: step / warmup_divisor,
            lambda: (step - self._total_steps)
            / (self._warmup_steps - self._total_steps),
        )
        # Clamp so steps beyond total_steps give 0 rather than a negative rate.
        return self._base_learning_rate * tf.maximum(factor, 0.0)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized."""
        return {
            "base_learning_rate": self._base_learning_rate,
            "warmup_steps": self._warmup_steps,
            "total_steps": self._total_steps,
        }

# Training

In [None]:
def _glob_fold_files(fold_ids) -> List[str]:
    """Return every TFRecord path under the given fold directories as one flat list."""
    # A plain nested comprehension is used instead of np.ravel: folds may hold
    # different numbers of shards, and np.ravel cannot flatten such a ragged list.
    return [
        path
        for fold in fold_ids
        for path in tf.io.gfile.glob(
            os.path.join(GCS_PATH, "tfrec", str(fold), "*.tfrec")
        )
    ]


# KFold is applied to the fold *indices* (0..N_SPLITS-1): each iteration trains
# on the shards of N_SPLITS-1 fold directories and validates on the remaining one.
for i, (train_index, val_index) in enumerate(KFold(n_splits=N_SPLITS).split(range(N_SPLITS))):
    if TPU is not None:
        # Re-initialize the TPU system at the start of each fold.
        tf.tpu.experimental.initialize_tpu_system(TPU)

    train_filenames = _glob_fold_files(train_index)
    steps_per_epoch = count_samples(train_filenames) // BATCH_SIZE  # full batches per epoch
    train_dataset = get_dataset(train_filenames)

    val_filenames = _glob_fold_files(val_index)
    validation_steps = count_samples(val_filenames) // BATCH_SIZE
    val_dataset = get_dataset(val_filenames, ordered=True, repeated=False, cached=True)

    with STRATEGY.scope():  # model and optimizer must be created under `strategy.scope`
        model = get_model()

        total_steps = steps_per_epoch * EPOCHS
        warmup_steps = int(WARMUP_RATE * total_steps)

        # AdamW with warmup + linear decay; biases and LayerNorm parameters are
        # excluded from weight decay.
        optimizer = transformers.AdamWeightDecay(
            learning_rate=WarmupLinearDecay(
                base_learning_rate=LR,
                warmup_steps=warmup_steps,
                total_steps=total_steps,
            ),
            weight_decay_rate=0.01,
            exclude_from_weight_decay=[
                "bias",
                "LayerNorm.bias",
                "LayerNorm.weight",
            ],
        )
        model.compile(loss="mae", optimizer=optimizer)

    model.fit(
        train_dataset,
        steps_per_epoch=steps_per_epoch,
        validation_data=val_dataset,
        validation_steps=validation_steps,
        epochs=EPOCHS,
        verbose=VERBOSE,
    )

    model.save_weights(f"model_{i}.h5")
    # Only the first fold is trained in this run.
    break