# Lab 3 - Pretrained language models and text classification

## Classification with language models

### Setup

In [1]:
import os

import numpy as np
import tensorflow_datasets as tfds
import tensorflow as tf
from tensorflow.keras import Model, Input 
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.metrics import roc_auc_score

tfds.disable_progress_bar()

2022-10-20 18:12:01.570678: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
%matplotlib inline

### Download the data
Find a suitable dataset in the [TensorFlow Datasets catalog](https://www.tensorflow.org/datasets/catalog/).
Then download it with `tensorflow_datasets`.

In [3]:
# TODO: Download the dataset with tensorflow dataset and obtain train and test datasets.

# as_supervised=True makes every element a (text, label) pair; with_info=True
# additionally returns the dataset metadata object.
dataset, info = tfds.load("yelp_polarity_reviews", with_info=True, as_supervised=True)
train_dataset = dataset["train"]
test_dataset = dataset["test"]

# Show the structure (dtype/shape) of a single training element.
print(train_dataset.element_spec)

(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))


2022-10-20 18:12:05.064364: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Prepare the model

In [4]:
# TODO: Choose your transformer model and specify your model_path

MODEL = "distilbert-base-uncased"

# Root directory for locally saved models. Set the LAB3_MODEL_DIR environment
# variable to override it; the default lives under the current user's home
# directory so the notebook does not depend on one machine's absolute path.
MODEL_ROOT = os.environ.get(
    "LAB3_MODEL_DIR",
    os.path.join(os.path.expanduser("~"), "Daten", "models", "lab3"),
)
model_path = os.path.join(MODEL_ROOT, MODEL)

from transformers import AutoTokenizer, TFAutoModel, logging
# Directory that the pretrained model and tokenizer are saved into.
folder = os.fspath(model_path)

In [5]:
# TODO: Load and save pretrained transformer model and tokenizer.

# Load the pretrained model by name and write a copy into `folder`.
transformer = TFAutoModel.from_pretrained(MODEL)
transformer.save_pretrained(folder)

# Do the same for the matching tokenizer. The save call is kept as the last
# expression so the cell displays the paths of the files it wrote.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
tokenizer.save_pretrained(folder)

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_layer_norm', 'vocab_projector', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


('/Users/timowang/Daten/models/lab3/distilbert-base-uncased/tokenizer_config.json',
 '/Users/timowang/Daten/models/lab3/distilbert-base-uncased/special_tokens_map.json',
 '/Users/timowang/Daten/models/lab3/distilbert-base-uncased/vocab.txt',
 '/Users/timowang/Daten/models/lab3/distilbert-base-uncased/added_tokens.json',
 '/Users/timowang/Daten/models/lab3/distilbert-base-uncased/tokenizer.json')

### Process the data

In [6]:
def encode_data(data, max_len):
    """Tokenize texts into fixed-width arrays for the transformer.

    Args:
        data: A text or list of texts, passed to the module-level `tokenizer`.
        max_len: Width of every encoded sequence. Longer texts are truncated
            and shorter ones are padded to exactly this length.

    Returns:
        A two-element list `[input_ids, attention_mask]` of NumPy arrays.
    """
    # padding="max_length" pads every sequence to `max_len`. With padding=True
    # the width would be the longest sequence in the batch, which can be
    # smaller than `max_len` and then no longer matches a model whose input
    # layers have a fixed sequence length.
    enc_data = tokenizer(data, max_length=max_len, return_token_type_ids=False,
                         padding="max_length", truncation=True)
    return [np.array(enc_data[k]) for k in ['input_ids', 'attention_mask']]

In [7]:
# TODO: Encode train and test texts with the function `encode_data`

# Materialise the first 10000 training examples as (decoded text, label) pairs,
# then split the pairs into two parallel lists.
train_pairs = [
    (text_tensor.numpy().decode("utf-8"), label_tensor.numpy())
    for text_tensor, label_tensor in train_dataset.take(10000)
]
train_texts = [review for review, _ in train_pairs]
train_labels = [target for _, target in train_pairs]

# Same for the first 1000 test examples.
test_pairs = [
    (text_tensor.numpy().decode("utf-8"), label_tensor.numpy())
    for text_tensor, label_tensor in test_dataset.take(1000)
]
test_texts = [review for review, _ in test_pairs]
test_labels = [target for _, target in test_pairs]

In [8]:
# Sequence length passed to the tokenizer for every encoded text.
MAX_LEN = 64

# Encode both splits with the same settings; each result is an
# [input_ids, attention_mask] list.
train_data, test_data = (
    encode_data(split_texts, MAX_LEN) for split_texts in (train_texts, test_texts)
)

In [9]:
# train_data[0] holds the input ids; report its dimensions.
encoded_shape = train_data[0].shape
print('Shape of the encoded data: ', encoded_shape)

Shape of the encoded data:  (10000, 64)


In [10]:
# TODO: Try encoding a few pieces of text and examine the output shape

# Run the bare transformer on the first two encoded training texts.
sample_train_data = encode_data(train_texts[:2], MAX_LEN)
output = transformer.predict(sample_train_data)

# Report the output container type, its number of entries and the shape of
# the first entry.
first_output_shape = output[0].shape
print(type(output), len(output), first_output_shape)

<class 'transformers.modeling_tf_outputs.TFBaseModelOutput'> 1 (2, 64, 768)


In [11]:
# Conventional feature/label names for the encoded splits; these are aliases
# of the existing objects, not copies.
X_train, X_test = train_data, test_data
y_train, y_test = train_labels, test_labels

### Build the model

In [12]:
# TODO: Define your model here. Depending on your dataset,
#   you may need to use a different activation function for the last layer.
#   You may also need to use a loss function other than binary_crossentropy.

def get_model(transformer):
    """Build and compile a binary classifier on top of a frozen transformer.

    The transformer is marked non-trainable (this mutates the object passed
    in), so only the final single-unit sigmoid layer is trained. The features
    fed to that layer are the transformer's first output taken at sequence
    position 0.

    Args:
        transformer: Model called with `[input_ids, attention_mask]`, both of
            length `MAX_LEN` (read from the notebook's global scope).

    Returns:
        A compiled Keras `Model` with inputs `input_word_ids` and
        `input_att_mask` and one sigmoid output.
    """
    # Freeze the backbone so that only the classification head is updated.
    transformer.trainable = False

    ids_in = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_word_ids")
    mask_in = Input(shape=(MAX_LEN,), dtype=tf.int8, name="input_att_mask")
    model_inputs = [ids_in, mask_in]

    # First output of the transformer, reduced to the vector at position 0.
    sequence_output = transformer(model_inputs)[0]
    first_token_vector = sequence_output[:, 0, :]

    probability = Dense(1, activation='sigmoid', name='custom_dense')(first_token_vector)

    classifier = Model(inputs=model_inputs, outputs=probability)
    classifier.compile(
        loss="binary_crossentropy",
        optimizer=Adam(learning_rate=1e-3),
        metrics=["acc"],
    )
    return classifier

### Train the model

In [13]:
best_model_path = 'best_model_head.h5'
BATCH_SIZE = 128

# save the best model to a file
chp = ModelCheckpoint(best_model_path,save_best_only=True,save_weights_only=True)
model = get_model(transformer)

# model.summary() prints the table itself and returns None, so it must not be
# passed to print(): that would emit the table first and then the header
# followed by "None".
print('Model: classifier head training only:')
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 64)]         0           []                               
                                                                                                  
 input_att_mask (InputLayer)    [(None, 64)]         0           []                               
                                                                                                  
 tf_distil_bert_model (TFDistil  TFBaseModelOutput(l  66362880   ['input_word_ids[0][0]',         
 BertModel)                     ast_hidden_state=(N               'input_att_mask[0][0]']         
                                one, 64, 768),                                                    
                                 hidden_states=None                                           

In [14]:
# Checkpoint that keeps only the best weights seen during training.
# NOTE(review): the file is named "whole" although the model built by
# get_model has a frozen transformer -- confirm the intended name.
best_model_path = 'best_model_whole.h5'
chp = ModelCheckpoint(best_model_path, save_best_only=True, save_weights_only=True)

# Labels are collected as a Python list; convert them to an array for Keras.
y_train_array = np.array(y_train)

# NOTE(review): batch_size is not passed here, so BATCH_SIZE is not used for
# training -- confirm this is intended.
hist = model.fit(
    X_train,
    y_train_array,
    epochs=3,
    validation_split=0.2,
    shuffle=True,
    callbacks=[chp],
    verbose=1,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [15]:
# Restore the weights written by the checkpoint callback, then score the
# encoded test split without progress output.
model.load_weights(best_model_path)
test_pred = model.predict(
    X_test,
    verbose=0,
    batch_size=BATCH_SIZE,
)

In [16]:
# ROC AUC of the predicted scores against the test labels.
test_auc = roc_auc_score(y_test, test_pred)
print('AUC after head finetuning', test_auc)

AUC after head finetuning 0.8964935023365982
