
# DS6050 - Group 6
* Andrej Erkelens <wsw3fa@virginia.edu>
* Robert Knuuti <uqq5zz@virginia.edu>
* Khoi Tran <kt2np@virginia.edu>

## Abstract
English is a verbose language with over 69% redundancy in its construction, and as a result, individuals only need to identify important details to comprehend an intended message.
While there are strong efforts to quantify the various elements of language, the average individual can still comprehend a written message that has errors, either in spelling or in grammar.
The emulation of the effortless, yet obscure task of reading, writing, and understanding language is the perfect challenge for the biologically-inspired methods of deep learning.
Most language- and text-related problems rely upon finding high-quality latent representations to understand the task at hand. Unfortunately, efforts to overcome such problems are limited by the data and computational power available to individuals; data availability often presents the largest obstacle, with small, domain-specific tasks proving especially limiting.
Currently, these tasks are often aided or overcome by pre-trained large language models (LLMs), designed by large corporations and laboratories.
Fine-tuning language models on domain-specific vocabulary with small data sizes still presents a challenge to the language community, but the growing availability of LLMs to augment such models alleviates the challenge.
This paper explores different techniques that can be applied to existing language models (LMs), which are built on highly complex deep learning architectures, and investigates how to fine-tune these models so that a pre-trained model can be used to enrich a more domain-specific model that may be limited in textual data.

## Project Objective

We aim to use several small, domain-specific language tasks, particularly classification tasks.
We aim to take at least two models, probably BERT and DistilGPT2, as they are readily available on the Hugging Face and TensorFlow model hubs.
We will iterate through different variants of the layers we fine-tune and compare these results with fully trained models, and ideally find benchmarks already published in academic papers for all of the datasets.

We aim to optimize both the compute efficiency and the effectiveness of the model on the given dataset. Our goal is to find a high-performing and generalizable method for our fine-tuning process and share this in our paper.


In [None]:
%autosave 0
import sys
import os
from pathlib import Path

In [None]:
# Colab-only bootstrap: the body runs only when the `google.colab` module is
# already loaded, i.e. when the notebook is hosted on Google Colab.
if 'google.colab' in sys.modules:
    # Ask Colab for a TensorFlow 2 runtime.
    %tensorflow_version 2.0
    # Install the tokenization / model libraries imported by later cells.
    %pip install -q tensorflow-text tokenizers transformers
    from google.colab import drive
    # Mount Google Drive and switch into the project folder so that relative
    # paths used later (e.g. "data/...") resolve inside it.
    drive.mount('/content/drive')
    %cd /content/drive/MyDrive/ds6050/
    pass # needed for py:percent script

In [None]:
import tensorflow as tf
import tensorflow_text as tf_text

In [None]:
# Distribution strategy shared by the notebook: the model-loading, model-building
# and training cells below all run under `strategy.scope()`.
strategy = tf.distribute.MirroredStrategy()

In [None]:
#@title Hyperparameters

# --- reproducibility / data split ---
SEED = 42                # seed handed to the NumPy and TensorFlow RNGs
TRAIN_TEST_SPLIT = 0.8   # fraction of rows used for training

# --- optimisation ---
BATCH_SIZE = 4
EPOCHS = 10

# --- dataset columns ---
LABEL = 'topic'          # target column
FEATURES = 'content'     # input text column

# --- pretrained checkpoint ---
PRETRAINED_WEIGHTS = 'bert-base-uncased'

In [None]:
# Lower-case aliases of the column-name hyperparameters used by the cells below.
# Future work: support all of the text columns ['categories', 'summary', 'content'].
label = LABEL
features = FEATURES

In [None]:
import numpy as np
import pandas as pd

import tokenizers
import transformers

from tensorflow import keras


# Seed both random number generators up front so the sampling below (and any
# TensorFlow-side randomness) repeats from run to run.
tf.random.set_seed(SEED)
np.random.seed(SEED)

df = pd.read_feather("data/dataset.feather")

# Reduce each label to the text that precedes its first '.'.
df[label] = df[label].str.split('.').str.get(0)

# Number of distinct labels; used as the width of the classifier's softmax layer.
response_count = len(df[label].unique())

# Random split: TRAIN_TEST_SPLIT of the rows go to training, the rest are held out.
df_train = df.sample(frac=TRAIN_TEST_SPLIT)
df_test = df.drop(index=df_train.index)

In [None]:
# strategy = tf.distribute.MirroredStrategy()

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the label column of the full frame.
ohe = OneHotEncoder()

# The labels are reshaped into a single column before encoding, and the
# encoder's output is converted to a dense array to serve as the fit target.
y_ = ohe.fit_transform(
    df[label].values.reshape(-1, 1)
).toarray()

In [None]:
# Intended token sequence length.
# NOTE(review): `max_len` is not referenced by the visible cells that follow —
# the tokenizer call passes max_length=None and model_top hard-codes 512.
max_len = 512
# Fast tokenizer matching the pretrained checkpoint.
hf_bert_tokenizer = transformers.BertTokenizerFast.from_pretrained(PRETRAINED_WEIGHTS)
# Load the pretrained encoder under the distribution strategy.
with strategy.scope():
    hf_bert_model = transformers.TFBertModel.from_pretrained(PRETRAINED_WEIGHTS)
# Unused alternative: the checkpoint with a built-in sequence-classification head.
# hf_bert_model = transformers.TFBertForSequenceClassification.from_pretrained("bert-base-uncased")

In [None]:
# Tokenize every row's summary text into TensorFlow tensors.
# With max_length=None the tokenizer falls back to its own predefined maximum
# length for both padding and truncation.
# NOTE(review): this encodes `df.summary`, while FEATURES / `features` names the
# 'content' column — confirm which column is intended.
encodings = hf_bert_tokenizer.batch_encode_plus(
    list(df['summary'].values),
    truncation=True,
    padding='max_length',
    max_length=None,
    return_tensors='tf',
)

In [None]:
def model_top(pretr_model, seq_len=512, num_classes=None):
    """Build and compile a softmax classifier on top of a pretrained encoder.

    The network takes token ids and attention masks, runs them through
    ``pretr_model``, averages the first encoder output over the sequence axis,
    and feeds the result through a small dense head.

    Args:
        pretr_model: Callable pretrained model accepting
            ``[input_ids, attention_masks]`` (here a ``TFBertModel``).
        seq_len: Length of the padded token sequences. Defaults to 512, the
            value that was previously hard-coded.
        num_classes: Width of the softmax output. Defaults to the
            notebook-level ``response_count`` when omitted.

    Returns:
        A compiled ``tf.keras`` model (Adam, categorical cross-entropy,
        accuracy metric).
    """
    if num_classes is None:
        num_classes = response_count

    with strategy.scope():
        input_ids = tf.keras.Input(shape=(seq_len,), dtype='int32')
        attention_masks = tf.keras.Input(shape=(seq_len,), dtype='int32')

        encoder_output = pretr_model([input_ids, attention_masks])
        # Use the mean of the first output (presumably the per-token hidden
        # states) rather than the second, pooled output (encoder_output[1]).
        # NOTE(review): the pooling window spans all seq_len positions and the
        # attention mask is not applied here, so padding positions are averaged in.
        pooled = tf.keras.layers.AveragePooling1D(pool_size=seq_len)(encoder_output[0])
        flattened = tf.keras.layers.Flatten()(pooled)

        hidden = tf.keras.layers.Dense(32, activation='tanh')(flattened)
        hidden = tf.keras.layers.Dropout(0.2)(hidden)

        probabilities = tf.keras.layers.Dense(num_classes, activation='softmax')(hidden)
        model = tf.keras.models.Model(inputs=[input_ids, attention_masks],
                                      outputs=probabilities)
        model.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
    return model

In [None]:
# Assemble the classifier around the pretrained BERT encoder.
# NOTE(review): model_top already opens strategy.scope() itself, so this outer
# scope looks redundant.
with strategy.scope():
    model = model_top(hf_bert_model)

In [None]:
# Print the layer-by-layer summary of the freshly built classifier.
model.summary()

In [None]:
# Display the layer list; the following cell freezes a layer by its index here.
model.layers

In [None]:
# Freeze the layer at index 2 so that only the remaining layers are trained.
# NOTE(review): index 2 is presumably the pretrained BERT encoder (it follows
# the two Input layers) — verify against the `model.layers` listing.
model.layers[2].trainable = False

# model_top() already compiled the model. Keras documents that a change to
# `trainable` made after compile() needs a recompile to be taken into account,
# so recompile here with the same settings model_top used.
with strategy.scope():
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

In [None]:
# Print the summary again, after the freeze in the previous cell, to inspect the
# trainable / non-trainable parameter counts.
model.summary()

In [None]:
# Shell out to nvidia-smi to show the GPU status before training.
!nvidia-smi

In [None]:
with strategy.scope():
    checkpoint_filepath = './tmp/checkpoint'

    # Save weights only, and only for the epoch with the highest validation accuracy.
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        monitor='val_accuracy',
        mode='max',
        save_best_only=True,
        save_weights_only=True,
    )

    # Stop once the validation loss has gone 5 epochs without improving.
    early_stopping_callback = tf.keras.callbacks.EarlyStopping(
        monitor="val_loss", mode="auto", patience=5,
    )

    # Train on the tokenized text against the one-hot labels; the validation
    # share is the complement of TRAIN_TEST_SPLIT.
    history = model.fit(
        x=[encodings['input_ids'], encodings['attention_mask']],
        y=y_,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_split=1-TRAIN_TEST_SPLIT,
        callbacks=[model_checkpoint_callback, early_stopping_callback],
    )

In [None]:
# Label columns of the two splits.
train_labels, test_labels = df_train[LABEL], df_test[LABEL]

# Pair each split's encodings (as a plain dict) with its labels.
# NOTE(review): `train_encodings` / `test_encodings` are not defined in the
# visible cells — they must be created before this cell can run.
train_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(train_encodings), train_labels)
)
test_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(test_encodings), test_labels)
)

In [None]:
# NOTE(review): `training_args` is not defined in the visible cells, and the
# scope object returned here is not entered with `with`, so this line appears
# to have no effect beyond displaying it.
training_args.strategy.scope()

In [None]:
# Display the token-id entry of the training encodings.
# NOTE(review): `train_encodings` is not defined in the visible cells.
train_encodings['input_ids']

In [None]:
# Experiment: compile and fit the bare pretrained model directly.
# NOTE(review): fit() receives only the input ids and no targets, and
# `train_encodings` is not defined in the visible cells — this cell looks like
# an abandoned attempt; confirm before relying on it.
hf_bert_model.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['acc'])

hf_bert_model.fit(train_encodings['input_ids'])

In [None]:
# Experiment: fit the bare pretrained model on the tf.data datasets, validating
# on the held-out split.
# NOTE(review): the datasets carry the raw LABEL column rather than the one-hot
# matrix `y_`, while the loss is categorical cross-entropy — confirm the label
# format before relying on this cell.
hf_bert_model.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['acc'])

hf_bert_model.fit(train_dataset, epochs=EPOCHS, validation_data=test_dataset)

In [None]:
# Experiment: wrap the pretrained model in a Hugging Face trainer.
# NOTE(review): neither `training_args` nor `TFTrainer` is defined or imported
# in the visible cells (only the `transformers` module itself is imported).
with training_args.strategy.scope():
  model = hf_bert_model

# Train on `train_dataset`, evaluate on `test_dataset`.
trainer = TFTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

### Data Preview

In [None]:
# Print the first five (text, label) pairs of the training dataset.
# The loop variables are deliberately not named `label`: earlier cells keep the
# label column name in a notebook-level `label` variable (used as `df[label]`),
# and reusing that name here would overwrite it.
# NOTE(review): `ds_train` is not defined in the visible cells.
for sample_text, sample_label in ds_train.take(5):
    print('Text')
    print(sample_text)
    print('Label')
    print(sample_label)

In [None]:
## This is currently broken - still trying to get the TFBertModel to accept the token string in.
# NOTE(review): this cell rebinds `max_len` and `hf_bert_model`, which earlier
# cells also define; re-running earlier cells afterwards will see these values.
max_len = 384
hf_bert_tokenizer_bootstrapper = transformers.BertTokenizer.from_pretrained("bert-base-uncased")
hf_bert_model = transformers.TFBertModel.from_pretrained("bert-base-uncased")

# Write the tokenizer files and the model under data/models so that they can be
# reloaded from disk below. mkdir(exist_ok=True) already tolerates an existing
# directory, so no separate existence check is needed.
save_path = Path("data") / "models"
save_path.mkdir(parents=True, exist_ok=True)
hf_bert_tokenizer_bootstrapper.save_pretrained(save_path)
hf_bert_model.save_pretrained(save_path)

# Load the fast tokenizer from saved file
bert_tokenizer = tokenizers.BertWordPieceTokenizer(str(save_path/"vocab.txt"), lowercase=True)

def tf_hf_bertencode(features, label):
    """Tokenize one (text, label) pair into WordPiece token ids.

    Written to be the Python body of ``tf.py_function`` (see
    ``tf_hf_bertencodeds``), which declares two int64 outputs.

    Args:
        features: Text to encode, as ``str``/``bytes`` or as a scalar string
            tensor exposing ``.numpy()``.
        label: Label text to encode, in the same forms as ``features``.

    Returns:
        ``(x, y)``: the token-id lists for ``features`` and ``label``, special
        tokens included.

    NOTE(review): expects single (un-batched) strings; a batched dataset would
    need per-row handling.
    """
    def _as_text(value):
        # tf.py_function hands its inputs over as eager tensors; unwrap them,
        # because tf.compat.as_str only accepts str/bytes.
        if hasattr(value, "numpy"):
            value = value.numpy()
        return tf.compat.as_str(value)

    x = bert_tokenizer.encode(_as_text(features), add_special_tokens=True)
    y = bert_tokenizer.encode(_as_text(label), add_special_tokens=True)
    # Return the integer ids rather than the Encoding objects, so the result
    # can be converted to the int64 tensors the caller declares.
    return x.ids, y.ids

def tf_hf_bertencodeds(features, label):
    """Run ``tf_hf_bertencode`` on a dataset element via ``tf.py_function``.

    Args:
        features: Text component of the dataset element.
        label: Label component of the dataset element.

    Returns:
        The ``tf.py_function`` result: two int64 tensors (encoded features,
        encoded label).
    """
    return tf.py_function(
        func=tf_hf_bertencode,
        inp=[features, label],
        Tout=[tf.int64, tf.int64],
    )

# Encode the training dataset in batches of 256 and feed it to a BERT model
# built from the saved config (part of the cell marked as broken above).
# NOTE(review): `ds_train` is not defined in the visible cells; the mapped
# function receives whole batches, not single strings; and the model is called
# on a Dataset object rather than on tensors — confirm the intended pipeline.
encoded_input = ds_train.batch(256).map(tf_hf_bertencodeds)
# NOTE(review): the config is loaded through the generic PretrainedConfig
# class — confirm that a BERT-specific config is not required here.
output = transformers.TFBertModel(config=transformers.PretrainedConfig.from_json_file(str(save_path/"config.json")))
hf_bert = output(encoded_input)

In [None]:

# TODO: export the training dataset to separate text files and list them here.
files = []

# Lower-casing, accent-stripping WordPiece tokenizer to be trained from scratch.
tokenizer = tokenizers.BertWordPieceTokenizer(
    lowercase=True,
    strip_accents=True,
    handle_chinese_chars=True,
    clean_text=True,
)

# Train on `files` with a 10,000-entry vocabulary and the five BERT special tokens.
tokenizer.train(
    files,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    vocab_size=10000,
    min_frequency=2,
    limit_alphabet=1000,
    wordpieces_prefix="##",
    show_progress=True,
)

# Save the files
# NOTE(review): `args` is not defined in the visible cells.
tokenizer.save_model(args.out, args.name)

In [None]:

# TODO: export the training dataset to separate text files and list them here.
files = []

# Lower-casing, accent-stripping WordPiece tokenizer to be trained from scratch.
tokenizer = tokenizers.BertWordPieceTokenizer(
    lowercase=True,
    strip_accents=True,
    handle_chinese_chars=True,
    clean_text=True,
)

# Train on `files` with a 10,000-entry vocabulary and the five BERT special tokens.
tokenizer.train(
    files,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    vocab_size=10000,
    min_frequency=2,
    limit_alphabet=1000,
    wordpieces_prefix="##",
    show_progress=True,
)

# Save the files
# NOTE(review): `args` is not defined in the visible cells.
tokenizer.save_model(args.out, args.name)