# Pretraining

Pretrain a Geneformer-style transformer encoder on a masked-token prediction task.

In [1]:
# Environment setup for the `tf_gpu` conda environment.
#
# The kernel itself has to be launched from that environment, e.g. after
#   source /ix1/kyin/niandrew/custom_miniconda/bin/activate tf_gpu
# Running `!source ...` or `!export ...` in a cell only affects a short-lived
# subshell, so the variables are set on `os.environ` instead, which is what the
# kernel process (and anything it spawns) actually reads.
import os

# Library directory of the conda environment
# (/ihome/kyin/niandrew/.conda/envs/tf_gpu, installed via /ix1/kyin/niandrew/custom_miniconda).
_TF_GPU_LIB_DIR = "/ihome/kyin/niandrew/.conda/envs/tf_gpu/lib/"

# Point XLA at the CUDA data directory shipped with the environment.
os.environ["XLA_FLAGS"] = "--xla_gpu_cuda_data_dir=" + _TF_GPU_LIB_DIR

# Append the library directory to LD_LIBRARY_PATH exactly once, so re-running
# this cell does not keep growing the variable.
# NOTE(review): an already-running process may not re-read LD_LIBRARY_PATH for
# its own dynamic loading; exporting it before starting Jupyter is the safest option.
_ld_entries = [entry for entry in os.environ.get("LD_LIBRARY_PATH", "").split(":") if entry]
if _TF_GPU_LIB_DIR not in _ld_entries:
    _ld_entries.append(_TF_GPU_LIB_DIR)
os.environ["LD_LIBRARY_PATH"] = ":".join(_ld_entries)

# Alternative (disabled): take the cuDNN libraries from the `nvidia.cudnn` pip package.
#   CUDNN_PATH=$(dirname $(python -c "import nvidia.cudnn;print(nvidia.cudnn.__file__)"))
#   LD_LIBRARY_PATH=${CUDNN_PATH}/lib

## Load Packages

In [2]:
from datasets import load_dataset
from tqdm import tqdm
import pickle
import matplotlib.pyplot as plt
import datetime

import os
os.environ["KERAS_BACKEND"] = "jax"  # or "tensorflow" or "torch"

import keras_nlp
import tensorflow as tf
import keras

2024-05-05 22:57:15.878989: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-05-05 22:57:15.924666: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX512_FP16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Report how many GPUs TensorFlow can see on this machine.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


## Parameters

In [4]:
# ---- Data pipeline ----
PRETRAINING_BATCH_SIZE = 12  # sequences per batch
SEQ_LENGTH = 512             # tokens kept per sequence (longer inputs are truncated)
MASK_RATE = 0.125            # fraction of tokens selected for masking
# Number of masked positions predicted per sequence (0.125 * 512 = 64).
PREDICTIONS_PER_SEQ = int(MASK_RATE * SEQ_LENGTH)

# ---- Encoder architecture ----
NUM_LAYERS = 6               # stacked transformer encoder blocks
MODEL_DIM = 256              # embedding width
INTERMEDIATE_DIM = 512       # feed-forward width inside each encoder block
NUM_HEADS = 4                # attention heads per block
DROPOUT = 0.02
NORM_EPSILON = 1e-12         # epsilon used by every layer normalization

# ---- Optimization ----
PRETRAINING_LEARNING_RATE = 0.001
PRETRAINING_EPOCHS = 3
PRETRAINING_WEIGHT_DECAY = 0.001

# ---- Output naming ----
MODEL_NAME = "geneformer"

## Load and Preprocess Data

In [5]:
# Read the pickled token dictionary from the working directory.
# NOTE(review): unpickling can execute arbitrary code; only load a trusted file.
with open('token_dictionary.pkl', 'rb') as vocab_file:
    vocab_dict = pickle.load(vocab_file)

# The vocabulary is the dictionary's keys (in the dictionary's own order)
# followed by one extra '<unk>' entry.
vocab_list = [*vocab_dict.keys(), '<unk>']

VOCAB_SIZE = len(vocab_list)
VOCAB_SIZE

25427

In [6]:
# Pretraining

# WordPiece tokenizer built on the vocabulary list, with '<unk>' as the
# out-of-vocabulary token and outputs fixed to SEQ_LENGTH.
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    vocabulary=vocab_list,
    oov_token='<unk>',
    sequence_length=SEQ_LENGTH,
    lowercase=False,
    strip_accents=True,
)

# Packer that brings sequences to SEQ_LENGTH using pad id 0; no start or end
# values are added and no padding mask is returned.
padder = keras_nlp.layers.StartEndPacker(
    sequence_length=SEQ_LENGTH,
    pad_value=0,
    start_value=None,
    end_value=None,
    return_padding_mask=False,
)

# Mask generator for the masked-token objective: token id 1 is the mask token,
# id 0 is never selected, and at most PREDICTIONS_PER_SEQ positions are chosen
# per sequence.
masker = keras_nlp.layers.MaskedLMMaskGenerator(
    vocabulary_size=VOCAB_SIZE,
    mask_token_id=1,
    unselectable_token_ids=[0],
    mask_selection_rate=MASK_RATE,
    mask_selection_length=PREDICTIONS_PER_SEQ,
    mask_token_rate=0.8,
    random_token_rate=0.1,
)

# Preprocess
def preprocess(inputs):
    """Pad and mask a batch, returning `(features, labels, weights)`.

    The batch is passed through `padder` and then `masker`. The feature dict
    carries the masked `token_ids` and the `mask_positions`; the labels are
    the masker's `mask_ids` and the sample weights are its `mask_weights`.
    """
    masked = masker(padder(inputs))

    features = {key: masked[key] for key in ("token_ids", "mask_positions")}

    return features, masked["mask_ids"], masked["mask_weights"]

2024-05-05 22:57:19.436602: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:47] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2024-05-05 22:57:19.436794: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 8912 MB memory:  -> device: 0, name: NVIDIA L40S, pci bus id: 0000:4a:00.0, compute capability: 8.9


In [7]:
# Open the Genecorpus-30M arrow file in streaming mode, so samples are read
# lazily instead of loading the whole corpus up front.
dataset = load_dataset(
    'ctheodoris/Genecorpus-30M',
    data_files='genecorpus_30M_2048.dataset/dataset.arrow',
    streaming=True,
    split='train',
)


def data_generator():
    """Yield each sample's `input_ids`, truncated to at most SEQ_LENGTH tokens.

    Plain Python sequences are yielded; `from_generator` converts them to
    int32 tensors according to the declared output signature.
    """
    for sample in dataset:
        yield sample['input_ids'][:SEQ_LENGTH]


# Wrap the generator in a tf.data.Dataset of variable-length int32 vectors.
# `output_signature` replaces the deprecated `output_types` / `output_shapes`
# arguments.
tf_dataset = tf.data.Dataset.from_generator(
    data_generator,
    output_signature=tf.TensorSpec(shape=(None,), dtype=tf.int32),
)

# Batch the sequences, padding each one (default pad value 0) to SEQ_LENGTH.
tf_dataset = tf_dataset.padded_batch(
    batch_size=PRETRAINING_BATCH_SIZE,
    padded_shapes=[SEQ_LENGTH],
)

In [8]:
# Apply `preprocess` to every batch in parallel, then prefetch so that
# preprocessing overlaps with training.
preprocessed_ds = tf_dataset.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
pretrain_ds = preprocessed_ds.prefetch(tf.data.AUTOTUNE)

# Pull one batch to sanity-check the (features, labels, weights) structure.
first_batch = pretrain_ds.take(1).get_single_element()
print(first_batch)

({'token_ids': <tf.Tensor: shape=(12, 512), dtype=int32, numpy=
array([[11143, 17261,  5368, ..., 13399,  7733,  6285],
       [ 1007,  8776,  7992, ..., 17712, 16565,  1364],
       [ 8776,  1007,  7055, ...,  7027,     1,  5224],
       ...,
       [16725,  5368, 11143, ..., 15332,  8178,  3351],
       [    1,  7055,  4686, ...,  2423, 14556, 12410],
       [ 4703,  7055,  2061, ...,  2221, 13615,   220]], dtype=int32)>, 'mask_positions': <tf.Tensor: shape=(12, 64), dtype=int64, numpy=
array([[  8,   9,  21,  23,  26,  29,  58,  59,  64,  75,  79,  82,  86,
         88,  95,  96, 122, 128, 131, 164, 180, 181, 183, 184, 194, 202,
        217, 229, 237, 241, 268, 288, 305, 312, 315, 316, 319, 339, 385,
        387, 395, 398, 400, 403, 412, 413, 420, 423, 432, 433, 448, 450,
        451, 454, 455, 461, 463, 464, 467, 469, 482, 484, 495, 499],
       [  6,  14,  18,  23,  26,  51,  61,  70,  84,  86, 103, 107, 109,
        118, 124, 125, 128, 137, 171, 176, 180, 190, 208, 219, 227, 231,

## Pretraining

In [9]:
# Model input: one fixed-length sequence of int32 token ids per example.
inputs = keras.Input(shape=(SEQ_LENGTH,), dtype="int32")

# Token + position embedding; `mask_zero=True` marks id 0 as padding.
embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,
    sequence_length=SEQ_LENGTH,
    embedding_dim=MODEL_DIM,
    mask_zero=True,
)

# Embed, then normalize and apply dropout before the encoder stack.
embedding_norm = keras.layers.LayerNormalization(epsilon=NORM_EPSILON)
embedding_dropout = keras.layers.Dropout(rate=DROPOUT)
outputs = embedding_dropout(embedding_norm(embedding_layer(inputs)))

# Stack NUM_LAYERS identically configured transformer encoder blocks.
for layer_idx in range(NUM_LAYERS):
    encoder_block = keras_nlp.layers.TransformerEncoder(
        intermediate_dim=INTERMEDIATE_DIM,
        num_heads=NUM_HEADS,
        dropout=DROPOUT,
        layer_norm_epsilon=NORM_EPSILON,
    )
    outputs = encoder_block(outputs)

encoder_model = keras.Model(inputs, outputs)
encoder_model.summary()

In [10]:
#%load_ext tensorboard
# tensorboard --host 0.0.0.0 --logdir logs/fit --bind_all

In [11]:
# Write TensorBoard logs to a fresh, timestamped directory for every run so
# runs do not overwrite each other.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = "logs/fit/" + run_stamp

# Use the callback from the same `keras` package that builds and trains the
# model, rather than `tf.keras`, so it cannot resolve to a different Keras
# implementation than the model's.
tensorboard_callback = keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)

In [12]:
# Build the pretraining model by attaching a masked language model head to
# the encoder.
inputs = {
    # Token ids after masking, one SEQ_LENGTH-long row per example.
    "token_ids": keras.Input(shape=(SEQ_LENGTH,), dtype="int32", name="token_ids"),
    # Indices of the positions whose original token must be predicted.
    "mask_positions": keras.Input(
        shape=(PREDICTIONS_PER_SEQ,), dtype="int32", name="mask_positions"
    ),
}

# Encode the tokens.
encoded_tokens = encoder_model(inputs["token_ids"])

# Predict a token for each masked position. The head reuses the encoder's
# token embedding to project encoded vectors to the vocabulary, and applies a
# softmax so the outputs are probabilities (matching the loss below).
outputs = keras_nlp.layers.MaskedLMHead(
    token_embedding=embedding_layer.token_embedding,
    activation="softmax",
)(encoded_tokens, mask_positions=inputs["mask_positions"])

# Define and compile the pretraining model. The optimizer comes from the same
# `keras` package as the model (not `tf.keras`), so model and optimizer always
# belong to the same Keras implementation and backend.
pretraining_model = keras.Model(inputs, outputs)
pretraining_model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=keras.optimizers.AdamW(
        learning_rate=PRETRAINING_LEARNING_RATE,
        weight_decay=PRETRAINING_WEIGHT_DECAY,
    ),
    # Weighted so positions with zero mask weight do not count towards accuracy.
    weighted_metrics=["sparse_categorical_accuracy"],
    jit_compile=True,
)

pretraining_model.summary(expand_nested=True, show_trainable=True)

In [13]:
# Pretrain the model on the streamed Genecorpus-30M batches.
# `shuffle` is not passed: Keras ignores it when the input is a
# tf.data.Dataset, so batches arrive in the dataset's own order.
# NOTE(review): if shuffling is wanted, add it to the tf.data pipeline itself
# (e.g. a `.shuffle(buffer_size)` before batching).
history = pretraining_model.fit(
    pretrain_ds,
    epochs=PRETRAINING_EPOCHS,
    callbacks=[tensorboard_callback],
)

Epoch 1/3
  12849/Unknown [1m279s[0m 21ms/step - loss: 5.1817 - sparse_categorical_accuracy: 0.0025


KeyboardInterrupt



In [None]:

# Make sure the output directory exists; otherwise writing into "model/" fails
# on a fresh checkout.
os.makedirs("model", exist_ok=True)

# Save the bare encoder (without the masked-LM head) for later fine-tuning.
encoder_model.save("model/" + MODEL_NAME + ".keras")

# Save the training history returned by `fit`.
# NOTE(review): this pickles the whole `history` object; if only the metric
# curves are needed, dumping `history.history` (a plain dict) would give a
# smaller, more portable file. Confirm what downstream readers expect first.
with open("model/" + MODEL_NAME + "_history.pkl", 'wb') as file:
    pickle.dump(history, file)