
# DS6050 - Group 6
* Andrej Erkelens <wsw3fa@virginia.edu>
* Robert Knuuti <uqq5zz@virginia.edu>
* Khoi Tran <kt2np@virginia.edu>

## Abstract
English is a verbose language with over 69% redundancy in its construction, and as a result, individuals only need to identify important details to comprehend an intended message.
While there are strong efforts to quantify the various elements of language, the average individual can still comprehend a written message that has errors, either in spelling or in grammar.
The emulation of the effortless, yet obscure task of reading, writing, and understanding language is the perfect challenge for the biologically-inspired methods of deep learning.
Most language and text-related problems rely upon finding high-quality latent representations to understand the task at hand. Unfortunately, efforts to overcome such problems are limited by the data and computation power available to individuals; data availability often presents the largest problem, with small, domain-specific tasks often proving to be limiting.
Currently, these tasks are often aided or overcome by pre-trained large language models (LLMs), designed by large corporations and laboratories.
Fine-tuning language models on domain-specific vocabulary with small data sizes still presents a challenge to the language community, but the growing availability of LLMs to augment such models alleviates the challenge.
This paper explores different techniques that can be applied to existing language models (LMs), which are built on highly complex deep learning models, and investigates how to fine-tune these models, such that a pre-trained model is used to enrich a more domain-specific model that may be limited in textual data.

## Project Objective

We aim to use several small, domain-specific language tasks, particularly classification tasks.
We aim to take at least two models, probably BERT and DistilGPT2, as they are readily available on Hugging Face and TensorFlow's model hub.
We will iterate through different variants of the layers we fine-tune and compare these results with fully trained models, and ideally find benchmarks already published in academic papers for all of the datasets.

We aim to optimize both the compute efficiency and the effectiveness of the model on the given dataset. Our goal is to find a high-performing and generalizable method for our fine-tuning process and share this in our paper.


In [1]:
%autosave 0
import sys
import os
from pathlib import Path

Autosave disabled


In [2]:
# Only when running inside Google Colab: install the extra packages, mount
# Google Drive and change into the project folder on the drive.
if 'google.colab' in sys.modules:
    %pip install -q tensorflow-text tokenizers transformers
    from google.colab import drive
    drive.mount('/content/drive')
    %cd /content/drive/MyDrive/ds6050/
    pass # needed for py:percent script

In [3]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns
import string
import tokenizers
import tensorflow as tf
import tensorflow_addons as tfa
import transformers

from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix


from tensorflow import keras
from tokenizers import decoders, models, normalizers, \
                       pre_tokenizers, processors, trainers

2022-08-09 15:59:15.561003: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [4]:
#@title Hyperparameters

# Reproducibility and the fraction of rows sampled into the training split.
SEED = 42
TRAIN_TEST_SPLIT = 0.8

# Training schedule.
BATCH_SIZE = 4
EPOCHS = 10

# Dataset columns: target label and input text.
LABEL = 'topic'
FEATURES = 'content'

# Name of the pretrained checkpoint.
PRETRAINED_WEIGHTS = 'bert-base-uncased'

In [5]:
import tensorflow as tf

# tf.data options that select the DATA auto-shard policy.
# NOTE(review): `options` is not applied to any dataset in the visible cells.
options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.DATA

# Seed NumPy and TensorFlow from the SEED hyperparameter rather than a
# duplicated literal, so changing SEED updates every seeding site.
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [6]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

import tokenizers
import transformers

from tensorflow import keras


# Re-seed from the SEED hyperparameter (not a repeated literal) so all
# seeding sites stay in sync.
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [7]:
# strategy = tf.distribute.MirroredStrategy()

In [8]:
# Lower-case aliases for the input-text column and the target column.
# Future work: use all text columns, i.e. ['categories', 'summary', 'content'].
features, label = FEATURES, LABEL

In [9]:
import numpy as np
import pandas as pd

import tokenizers
import transformers

from tensorflow import keras


# Re-seed so the cell gives the same result when it is run on its own.
np.random.seed(SEED)
tf.random.set_seed(SEED)

df = pd.read_feather("data/dataset.feather")
# Keep only the part of each label that precedes the first '.'.
df[label] = df[label].str.split('.').str[0]

# Number of distinct target values after the label has been shortened.
response_count = len(df[label].unique())

# Pass the seed explicitly: relying on NumPy's global RNG makes the split
# depend on how many random draws happened in earlier cells.
df_train = df.sample(frac=TRAIN_TEST_SPLIT, random_state=SEED)
# Everything not sampled above becomes the test set.
# NOTE(review): assumes the frame's index values are unique - confirm.
df_test = df.drop(df_train.index)

In [10]:
# strategy = tf.distribute.MirroredStrategy()

In [11]:
from sklearn.preprocessing import OneHotEncoder

# Fit the encoder ONCE on the full label column and only transform the
# splits. Re-fitting per split lets each split derive its own column layout:
# if a class is missing from one split, its one-hot matrix gets fewer columns
# and the column indices no longer refer to the same classes.
ohe = OneHotEncoder()

y_ = ohe.fit_transform(df[label].values.reshape(-1, 1)).toarray()
y_train = ohe.transform(df_train[label].values.reshape(-1, 1)).toarray()
y_test = ohe.transform(df_test[label].values.reshape(-1, 1)).toarray()

In [12]:
# Token sequence length used for the model inputs.
max_len = 512
# Load tokenizer and encoder from the checkpoint named in the hyperparameters
# cell so the two can never drift apart from each other or from that setting.
hf_bert_tokenizer = transformers.BertTokenizerFast.from_pretrained(PRETRAINED_WEIGHTS)
hf_bert_model = transformers.TFBertModel.from_pretrained(PRETRAINED_WEIGHTS)
# hf_bert_model = transformers.TFBertForSequenceClassification.from_pretrained(PRETRAINED_WEIGHTS)

2022-08-09 15:59:23.668698: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-08-09 15:59:25.767779: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 30902 MB memory:  -> device: 0, name: Tesla V100-SXM2-32GB, pci bus id: 0000:18:00.0, compute capability: 7.0
2022-08-09 15:59:25.769472: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 30965 MB memory:  -> device: 1, name: Tesla V100-SXM2-32GB, pci bus id: 0000:3b:00.0, compute capability: 7.0
2022-08-09 15:59:25.770987: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/rep

In [13]:
def _encode_summaries(frame):
    """Tokenize the ``summary`` column of *frame* into fixed-length TF tensors.

    Every sequence is padded or truncated to ``max_len`` tokens, so the
    length is stated explicitly instead of relying on the tokenizer's
    implicit default when ``max_length`` is None.
    """
    # NOTE(review): the hyperparameters cell sets FEATURES='content', but the
    # column encoded here is 'summary' - confirm which one is intended.
    return hf_bert_tokenizer.batch_encode_plus(
        list(frame['summary'].values),
        return_tensors='tf',
        padding='max_length',
        max_length=max_len,
        truncation=True,
    )


encodings_train = _encode_summaries(df_train)
encodings_test = _encode_summaries(df_test)

In [14]:
def model_top(pretr_model, seq_len=512, num_classes=7):
    """Build and compile a classification head on top of a pretrained encoder.

    Args:
        pretr_model: Callable encoder that accepts ``[input_ids,
            attention_mask]`` and whose first output is used as the token
            sequence to pool over.
        seq_len: Length of the token-id and attention-mask inputs; also the
            window of the average pooling. Defaults to the previously
            hard-coded 512.
        num_classes: Width of the final softmax layer. Defaults to the
            previously hard-coded 7.

    Returns:
        A compiled ``tf.keras`` model (Adam, categorical cross-entropy,
        accuracy metric) taking ``[input_ids, attention_mask]``.
    """
    input_ids = tf.keras.Input(shape=(seq_len,), dtype='int32')
    attention_masks = tf.keras.Input(shape=(seq_len,), dtype='int32')

    encoder_output = pretr_model([input_ids, attention_masks])
    # The encoder's second output (encoder_output[1]) was tried and left
    # disabled; instead the first output is averaged over the whole sequence.
    # NOTE(review): the average is not weighted by the attention mask, so
    # padded positions contribute too - confirm this is intended.
    pooled = tf.keras.layers.AveragePooling1D(pool_size=seq_len)(encoder_output[0])
    flattened = tf.keras.layers.Flatten()(pooled)

    hidden = tf.keras.layers.Dense(32, activation='tanh')(flattened)
    hidden = tf.keras.layers.Dropout(0.2)(hidden)

    probabilities = tf.keras.layers.Dense(num_classes, activation='softmax')(hidden)
    model = tf.keras.models.Model(inputs=[input_ids, attention_masks], outputs=probabilities)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [15]:
# Build the compiled classifier around the pretrained BERT encoder.
model = model_top(hf_bert_model)

In [16]:
# Show layers, parameter counts and the per-layer trainable flag.
model.summary(line_length=120, show_trainable=True)

Model: "model"
___________________________________________________________________________________________________________________________________
 Layer (type)                          Output Shape               Param #       Connected to                            Trainable  
 input_1 (InputLayer)                  [(None, 512)]              0             []                                      Y          
                                                                                                                                   
 input_2 (InputLayer)                  [(None, 512)]              0             []                                      Y          
                                                                                                                                   
 tf_bert_model (TFBertModel)           TFBaseModelOutputWithPool  109482240     ['input_1[0][0]',                       Y          
                                       ingAndCrossAttentions(

In [17]:
# Display the layer objects to find the position of the BERT encoder.
model.layers

[<keras.engine.input_layer.InputLayer at 0x7fee5463efd0>,
 <keras.engine.input_layer.InputLayer at 0x7fee54521880>,
 <transformers.models.bert.modeling_tf_bert.TFBertModel at 0x7fee9402f580>,
 <keras.layers.pooling.average_pooling1d.AveragePooling1D at 0x7fee545112b0>,
 <keras.layers.reshaping.flatten.Flatten at 0x7fee54511100>,
 <keras.layers.core.dense.Dense at 0x7fef61946430>,
 <keras.layers.regularization.dropout.Dropout at 0x7fed8c33b2e0>,
 <keras.layers.core.dense.Dense at 0x7fee5466da60>]

In [18]:
# Freeze the pretrained encoder (layer index 2 in `model.layers`) so only the
# classification head is trained.
model.layers[2].trainable = False
# `trainable` was changed after model_top() compiled the model; Keras asks for
# a recompile so the change is guaranteed to be picked up by training.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [19]:
# Show the summary again to check the encoder's trainable flag after freezing.
model.summary(line_length=120, show_trainable=True)

Model: "model"
___________________________________________________________________________________________________________________________________
 Layer (type)                          Output Shape               Param #       Connected to                            Trainable  
 input_1 (InputLayer)                  [(None, 512)]              0             []                                      Y          
                                                                                                                                   
 input_2 (InputLayer)                  [(None, 512)]              0             []                                      Y          
                                                                                                                                   
 tf_bert_model (TFBertModel)           TFBaseModelOutputWithPool  109482240     ['input_1[0][0]',                       N          
                                       ingAndCrossAttentions(

In [20]:
# Shell escape: print GPU status and memory usage before training.
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Tue Aug  9 15:59:38 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.103.01   Driver Version: 470.103.01   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:18:00.0 Off |                    0 |
| N/A   39C    P0    55W / 300W |  31627MiB / 32510MiB |      0%      Default |
|                               |            

In [21]:
# Where the best weights (by validation accuracy) are written during training.
checkpoint_filepath = './tmp/checkpoint'

# Save weights only, and only when validation accuracy improves.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

# Stop after 5 epochs without validation-loss improvement. The evaluation
# cells use `model` directly and never reload the checkpoint, so restore the
# best weights on stop; otherwise the model is left several epochs past its
# best state.
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=5,
    mode="auto",
    restore_best_weights=True,
)

In [None]:
# Train on the tokenized training split, holding out 20% for validation.
# Epochs and batch size come from the hyperparameters cell instead of
# repeating the literals, so the settings declared there actually apply.
history = model.fit(
    [encodings_train['input_ids'], encodings_train['attention_mask']],
    y_train,
    validation_split=.2,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[model_checkpoint_callback, early_stopping_callback],
)

Epoch 1/10
   1/7205 [..............................] - ETA: 24:53:05 - loss: 2.0362 - accuracy: 0.0000e+00

2022-08-09 15:59:51.221544: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8204


 572/7205 [=>............................] - ETA: 19:38 - loss: 2.1508 - accuracy: 0.1482

In [None]:
# Model inputs per split, in the order the model expects: ids, then mask.
_input_keys = ('input_ids', 'attention_mask')
features_train = [encodings_train[key] for key in _input_keys]
features_test = [encodings_test[key] for key in _input_keys]

In [None]:
# Class probabilities for the training split, reduced to predicted class ids
# and compared with the true class ids taken from the one-hot labels.
predict_train_data = model.predict(features_train)
pred_train_data = predict_train_data.argmax(axis=1)
train_cm = confusion_matrix(y_train.argmax(axis=1), pred_train_data)

In [None]:
# Class probabilities for the test split, reduced to predicted class ids and
# compared with the true class ids taken from the one-hot labels.
predict_test_data = model.predict(features_test)
pred_test_data = predict_test_data.argmax(axis=1)
test_cm = confusion_matrix(y_test.argmax(axis=1), pred_test_data)

In [None]:
# plotting training history
# One row per epoch (1-based) with the training accuracy and loss, reshaped to
# long form so both curves can be drawn with a single hue-grouped call.
history_df = pd.DataFrame(
    {name: history.history[name] for name in ('accuracy', 'loss')}
)
history_df.insert(0, 'epoch', np.arange(1, len(history_df) + 1))
history_df = history_df.melt(id_vars='epoch', value_vars=['accuracy', 'loss'])

fig, ax = plt.subplots(figsize=(14, 8))
sns.lineplot(data=history_df, x='epoch', y='value', hue='variable', ax=ax)
# labels, title and ticks
ax.set_xlabel('Epoch', fontsize=12)
ax.set_ylabel('')
ax.set_title('Accuracy and Loss with Training, BERT', loc='left', fontsize=20)
#ax.xaxis.set_ticklabels(['','1','','','','2','','','','3']);
plt.tight_layout()
plt.show()

In [None]:
## creating confusion matrices
# Pin the class ids so every matrix has one row/column per class, even if a
# class never shows up in a particular split or prediction set.
class_ids = np.arange(y_train.shape[1])

predict_train_data = model.predict(features_train, batch_size=4)
pred_train_data = np.argmax(predict_train_data, axis = 1)
# Use y_train here: the original referenced an undefined `ds_y_train`.
train_cm = confusion_matrix(np.argmax(y_train, axis = 1), pred_train_data,
                            labels=class_ids)

predict_test_data = model.predict(features_test)
pred_test_data = np.argmax(predict_test_data, axis = 1)
test_cm = confusion_matrix(np.argmax(y_test, axis = 1), pred_test_data,
                           labels=class_ids)

# Construct untrained model performance
# A second head is built on the same encoder object and never fitted.
# NOTE(review): this baseline shares `hf_bert_model` with the trained model,
# so it is only "not fine-tuned" while the encoder weights stay frozen.
bat_size=32
model_untr = model_top(hf_bert_model)
untr_pred_train = model_untr.predict(features_train,
                                     batch_size=bat_size)
untr_train_cm = confusion_matrix(np.argmax(y_train, axis = 1),
                                 np.argmax(untr_pred_train, axis = 1),
                                 labels=class_ids)

# Use features_test here: the original referenced an undefined `ds_test`.
untr_pred_test = model_untr.predict(features_test,
                                    batch_size=bat_size)
untr_test_cm = confusion_matrix(np.argmax(y_test, axis = 1),
                                np.argmax(untr_pred_test, axis = 1),
                                labels=class_ids)

# Tick labels for both heatmap axes. Rows and columns of the matrices follow
# ascending class id, so both axes need the ascending label order. The
# original bound x_labs and y_labs to one list and then reverse-sorted it in
# place, which reversed the labels on BOTH axes.
labels = sorted(df['topic'].unique())
x_labs = list(labels)
y_labs = list(labels)

## function for visualizing confusion matrices
def plot_cm(cm, title = 'Confusion Matrix'):
    """Draw a confusion matrix as an annotated heatmap and show it.

    Each cell is shown as its share of the sum of all cells in ``cm``,
    formatted as a percentage. Tick labels are taken from the module-level
    ``x_labs`` and ``y_labs`` lists.

    Args:
        cm: Confusion matrix of counts.
        title: Title placed above the heatmap.
    """
    fig, ax = plt.subplots(figsize=(14, 8))
    proportions = cm / np.sum(cm)
    sns.heatmap(proportions, annot=True, fmt='.2%', cmap='Blues', ax=ax)

    # labels, title and ticks
    ax.set_xlabel('Predicted category', fontsize=12)
    ax.set_ylabel('Actual category', fontsize=12)
    ax.set_title(title, fontsize=20)
    ax.xaxis.set_ticklabels(x_labs, fontsize=8)
    ax.yaxis.set_ticklabels(y_labs, fontsize=8)

    # White backgrounds for both the axes and the figure.
    for surface in (ax, fig):
        surface.set_facecolor('w')

    plt.tight_layout()
    plt.show()

In [None]:
# Confusion matrix of the trained model on the training split.
plot_cm(train_cm, 'BERT Confusion Matrix, Training Data')

In [None]:
# Confusion matrix of the trained model on the test split.
plot_cm(test_cm, 'BERT Confusion Matrix, Testing Data')

In [None]:
# Confusion matrix of the unfitted baseline model on the training split.
plot_cm(untr_train_cm, 'BERT Confusion Matrix, Training Data (not fine-tuned)')

In [None]:
# Confusion matrix of the unfitted baseline model on the test split.
plot_cm(untr_test_cm, 'BERT Confusion Matrix, Testing Data (not fine-tuned)')

In [None]:
# see f1 scores
# threshold is just median/mean rounded up to the nearest 0.15
# NOTE(review): with a fixed 0.15 threshold more than one class can count as
# predicted for a sample - confirm this is the intended scoring.
def _per_class_f1(y_true, y_prob, split, class_names):
    """Return one row per class with its name, F1 score and split name."""
    # A fresh metric per split: update_state() accumulates, so reusing one
    # instance for train and then test would report train+test as "test".
    metric = tfa.metrics.F1Score(num_classes=len(class_names), threshold=0.15)
    metric.update_state(y_true, y_prob)
    return pd.DataFrame({'category': class_names,
                         'f1': metric.result().numpy(),
                         'type': split})


# Class names in ascending order, matching the one-hot column order.
f1_class_names = sorted(df['topic'].unique())
train_f1 = _per_class_f1(y_train, predict_train_data, 'train', f1_class_names)
test_f1 = _per_class_f1(y_test, predict_test_data, 'test', f1_class_names)

gpt2_f1 = pd.concat([train_f1, test_f1]).reset_index(drop = True)\
            .sort_values(by = ['category', 'type'], ascending = False)

# plotting
# Keep the figure handle: the original styled a `fig` left over from an
# earlier cell instead of this figure.
fig = plt.figure(figsize = (14,8))
# An explicit `order` gives alphabetical bars; the original ordered by set()
# (arbitrary) and then overwrote the tick labels with an unrelated list, which
# could attach the wrong class name to a bar.
ax = sns.barplot(x = 'category', y = 'f1', hue = 'type', data = gpt2_f1,
                 order = f1_class_names, hue_order = ['train', 'test'])
# labels, title and ticks
ax.set_xlabel('Category', fontsize = 12)
ax.set_ylabel('F1 Score')
ax.set_title('F1 Score in Training and Testing Data, BERT', fontsize = 20)
ax.set_ylim([0, 1])

ax.set_facecolor('w')
fig.set_facecolor('w')

plt.tight_layout()
plt.show()

In [None]:
# see f1 scores for non-fine tuned model
# threshold is just median/mean rounded up to the nearest 0.15
# NOTE(review): with a fixed 0.15 threshold more than one class can count as
# predicted for a sample - confirm this is the intended scoring.
def _untr_per_class_f1(y_true, y_prob, split, class_names):
    """Return one row per class with its name, F1 score and split name."""
    # A fresh metric per split: update_state() accumulates, so reusing one
    # instance for train and then test would report train+test as "test".
    metric = tfa.metrics.F1Score(num_classes=len(class_names), threshold=0.15)
    metric.update_state(y_true, y_prob)
    return pd.DataFrame({'category': class_names,
                         'f1': metric.result().numpy(),
                         'type': split})


# Class names in ascending order, matching the one-hot column order.
untr_class_names = sorted(df['topic'].unique())
untr_train_f1 = _untr_per_class_f1(y_train, untr_pred_train, 'train', untr_class_names)
untr_test_f1 = _untr_per_class_f1(y_test, untr_pred_test, 'test', untr_class_names)

untr_gpt2_f1 = pd.concat([untr_train_f1, untr_test_f1]).reset_index(drop = True)\
                 .sort_values(by = ['category', 'type'], ascending = False)

# plotting
# Keep the figure handle: the original styled a `fig` left over from an
# earlier cell instead of this figure.
fig = plt.figure(figsize = (14,8))
# An explicit `order` gives alphabetical bars; the original ordered by set()
# (arbitrary) and then overwrote the tick labels with an unrelated list, which
# could attach the wrong class name to a bar.
ax = sns.barplot(x = 'category', y = 'f1', hue = 'type', data = untr_gpt2_f1,
                 order = untr_class_names, hue_order = ['train', 'test'])
# labels, title and ticks
ax.set_xlabel('Category', fontsize = 12)
ax.set_ylabel('F1 Score')
ax.set_title('F1 Score in Training and Testing Data, BERT (not fine-tuned)', fontsize = 20)
ax.set_ylim([0, 1])

ax.set_facecolor('w')
fig.set_facecolor('w')

plt.tight_layout()
plt.show()

In [None]:
## visualizing model architecture
# Import from the public tf.keras API: the model was built with tf.keras, and
# `keras.utils.vis_utils` is a private module path of the standalone package.
from tensorflow.keras.utils import plot_model
# Write a diagram of the trained model, with shapes and layer names, to a PNG.
plot_model(model, to_file='model_bert_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
# Write the same kind of diagram for the unfitted baseline model.
plot_model(model_untr, to_file='model_untr_bert_plot.png', show_shapes=True, show_layer_names=True)