## Import libraries

In [109]:
import os
import pandas as pd
import tensorflow as tf
from official import nlp
from official.nlp import bert
import official.nlp.optimization
import official.nlp.bert.bert_models
import official.nlp.bert.configs

print("TensorFlow: {} installed".format(tf.__version__))

gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except:
        print("Couldn't set memory_growth")
        pass
    
    
def fix_random_seed(seed):
    """ Setting the random seed of various libraries """
    try:
        np.random.seed(seed)
    except NameError:
        print("Warning: Numpy is not imported. Setting the seed for Numpy failed.")
    try:
        tf.random.set_seed(seed)
    except NameError:
        print("Warning: TensorFlow is not imported. Setting the seed for TensorFlow failed.")
    try:
        random.seed(seed)
    except NameError:
        print("Warning: random module is not imported. Setting the seed for random failed.")

# Fixing the random seed
random_seed=4321
fix_random_seed(random_seed)

2.4.1


 The versions of TensorFlow you are currently using is 2.4.1 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


## Setting Environment variables for TFHUB downloads

In [82]:
%mkdir ${HOMEPATH}\Documents\code\manning_tf2_in_action\tf_hub
%env TFHUB_CACHE_DIR=${HOMEPATH}\Documents\code\manning_tf2_in_action\tf_hub

# Run %env to see if the variable has been fixed

env: TFHUB_CACHE_DIR=${HOMEPATH}\Documents\code\manning_tf2_in_action\tf_hub


## Download and read the data

In [48]:
# Inputs and labels will be stored in this
inputs = []
labels = []
# Total number of instances for spam and ham
n_ham, n_spam = 0,0
with open(os.path.join('data', 'SMSSpamCollection'), 'r') as f:
    for r in f:        
        # Ham input
        if r.startswith('ham'):
            label = 0
            txt = r[4:]
            n_ham += 1
        # Spam input
        elif r.startswith('spam'):
            label = 1
            txt = r[5:]
            n_spam += 1
        inputs.append(txt)
        labels.append(label)
        
print("Found {} ham and {} spam".format(n_ham, n_spam))
print(inputs[:5])

# Convert them to arrays
inputs = np.array(inputs).reshape(-1,1)
labels = np.array(labels)

Found 4827 ham and 747 spam
['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...\n', 'Ok lar... Joking wif u oni...\n', "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\n", 'U dun say so early hor... U c already then say...\n', "Nah I don't think he goes to usf, he lives around here though\n"]


## Splitting train/valid/test

Here we will split the data to three sets using `imbalanced-learn` library. Specifically we,

* Create a balanced test set with 100 spam and 100 ham (Random)
* Create a balanced validation set with 100 spam and 100 ham (Random)
* Create a balanced train set from the left over instances (undersampled using Near miss algorithm)

In [67]:
from imblearn.under_sampling import OneSidedSelection, NearMiss, RandomUnderSampler
import numpy as np

n=100 # Number of instances for each class for train/validation sets
rus = RandomUnderSampler(sampling_strategy={0:n, 1:n}, random_state=random_seed)
rus.fit_resample(inputs, labels)

# Get test indices
test_inds = rus.sample_indices_
test_x, test_y = inputs[test_inds], np.array(labels)[test_inds]
print("Test statistics")
print(pd.Series(test_y).value_counts())

# Get rest (train + valid)
rest_inds = [i for i in range(inputs.shape[0]) if i not in test_inds]
rest_x, rest_y = inputs[rest_inds], labels[rest_inds]

# Get valid indices
rus.fit_resample(rest_x, rest_y)
valid_inds = rus.sample_indices_
valid_x, valid_y = rest_x[valid_inds], rest_y[valid_inds]
print("Valid statistics")
print(pd.Series(valid_y).value_counts())

# Rest goes in training
train_inds = [i for i in range(rest_x.shape[0]) if i not in valid_inds]
train_x, train_y = rest_x[train_inds], rest_y[train_inds]
print("Train statistics")
print(pd.Series(train_y).value_counts())

Test statistics
1    100
0    100
dtype: int64
Valid statistics
1    100
0    100
dtype: int64
Train statistics
0    4627
1     547
dtype: int64


In [68]:
from sklearn.feature_extraction.text import CountVectorizer

# To use near miss algorithm, we need a numerical representation of the messages
# We will use the bag of words representation
countvec = CountVectorizer()
train_bow = countvec.fit_transform(train_x.reshape(-1).tolist())

# NearMiss is a common undersampling technique
oss = NearMiss()
X_res, y_res = oss.fit_resample(train_bow, train_y)
train_inds = oss.sample_indices_

train_x, train_y = train_x[train_inds], train_y[train_inds]

print(pd.Series(train_y).value_counts())

(5174, 8314)
1    547
0    547
dtype: int64


## Analyse some of the removed samples

## Downloading the BERT model

In [3]:
import tensorflow_hub as hub

model = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2")

## Defining the inputs for the BERT model

BERT model needs three inputs,

* Input word IDs - These are the input tokens generated from text and padded to `max_seq_length` with zeros
* Input mask - A matrix of the shape of `input_word_ids` that represents whether an element is a token of a padded value (0s and 1s)
* Segment IDs - A matrix of the shape of `input_word_ids` that represents which sentence/sequence each token belongs to (0s and 1s)

In [5]:
max_seq_length = 128  # Your choice here.
input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                       name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                   name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                    name="segment_ids")
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2",
                            trainable=True)

pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
print(pooled_output.shape)
print(sequence_output.shape)

(None, 768)
(None, 128, 768)


## Analysing the vocabulary of BERT

In [136]:
import official.nlp.bert.tokenization as tokenization

vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [137]:
print(vocab_file)

b'C:\\Users\\thush\\AppData\\Local\\Temp\\tfhub_modules\\ce53fe6769d2ac3a260e92555120c54e1aecbea6\\assets\\vocab.txt'


## Understanding tokenization in BERT

In [14]:
tokens = tokenizer.tokenize("She sells seashells by the seashore")
print(tokens)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

['she', 'sells', 'seas', '##hell', '##s', 'by', 'the', 'seas', '##hore']
[2016, 15187, 11915, 18223, 2015, 2011, 1996, 11915, 16892]


## Special tokens used by BERT

In [15]:
ids = tokenizer.convert_tokens_to_ids(['[CLS]', '[SEP]', '[MASK]', '[PAD]'])
print(ids)

[101, 102, 103, 0]


In [129]:
def encode_sentence(s):
   tokens = list(tokenizer.tokenize(s))
   tokens.append('[SEP]')
   return tokenizer.convert_tokens_to_ids(tokens)


def get_bert_inputs(docs,max_seq_len=None):
    """ Generate inputs for BERT using a set of documents """
    
    tokens = tf.ragged.constant([encode_sentence(str(s)) for s in docs])
    
    print("Shape of the ragged input: {}".format(tokens.shape))
    tokens_pad = tokens.to_tensor()
    
    if max_seq_len and max_seq_len - tokens_pad.shape[1]>0:
        more_pad = tf.zeros(shape=[tokens_pad.shape[0], max_seq_len - tokens_pad.shape[1]], dtype='int32')
        tokens_pad = tf.concat([tokens_pad, more_pad], axis=1)
    elif max_seq_len and max_seq_len - tokens_pad.shape[1]<0:
        tokens_pad = tokens_pad[:, :max_seq_len]
        
    # Which are actual words
    tokens_mask = tf.cast((tokens_pad != 0), 'float32')
    # Which sentence each token belongs to
    tokens_type = tf.zeros_like(tokens_pad)
    print("Shape of the transformed input: {}".format(tokens_pad.shape))
    
    return {
        'input_word_ids': tokens_pad,
        'input_mask': tokens_mask,
        'input_type_ids': tokens_type
    }

train_inputs = get_bert_inputs(train_x, max_seq_len=80)
valid_inputs = get_bert_inputs(valid_x, max_seq_len=80)
test_inputs = get_bert_inputs(test_x, max_seq_len=80)
    

(1094, None)
(1094, 80)
(200, None)
(200, 80)
(200, None)
(200, 80)


## Creating a downstream classifier from BERT

In [133]:
#bert_config = bert.configs.BertConfig.from_dict(config_dict)
import yaml

hub_path_bert = 'C:\\Users\\thush\\Documents\\code\\manning_tf2_in_action\\tf_hub\\ce53fe6769d2ac3a260e92555120c54e1aecbea6'

# https://github.com/tensorflow/models/blob/master/official/nlp/configs/models/bert_en_uncased_base.yaml
with open(os.path.join("data", "bert_en_uncased_base.yaml"), 'r') as stream:
    config_dict = yaml.safe_load(stream)['task']['model']['encoder']['bert']

bert_config = bert.configs.BertConfig.from_dict(config_dict)
print("BERT Config")
print(bert_config)

# Generating a classifier and the encoder
hub_classifier, hub_encoder = bert.bert_models.classifier_model(
    # Caution: Most of `bert_config` is ignored if you pass a hub url.
    bert_config=bert_config, hub_module_url=hub_path_bert, num_labels=2
)


{'attention_dropout_rate': 0.1, 'dropout_rate': 0.1, 'hidden_activation': 'gelu', 'hidden_size': 768, 'initializer_range': 0.02, 'intermediate_size': 3072, 'max_position_embeddings': 512, 'num_attention_heads': 12, 'num_layers': 12, 'type_vocab_size': 2, 'vocab_size': 30522}
<official.nlp.bert.configs.BertConfig object at 0x0000020D91F67160>


In [134]:
# Set up epochs and steps
epochs = 3
batch_size = 64
eval_batch_size = 32

train_data_size = train_seq_pad.shape[0]
steps_per_epoch = int(train_data_size / batch_size)
num_train_steps = steps_per_epoch * epochs
warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)

# creates an optimizer with learning rate schedule
optimizer = nlp.optimization.create_optimizer(
    2e-6, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)


In [135]:
metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32)]
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

hub_classifier.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics)


print()
# np.concatenate([(train_y==0).reshape(-1,1), (train_y==1).reshape(-1,1)], axis=1).astype('int32'),
hub_classifier.fit(
      train_inputs, 
      train_y,
      validation_data=(valid_inputs, valid_y),
    validation_batch_size=eval_batch_size,
      batch_size=batch_size,
      epochs=epochs)



Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x20df18e29b0>