## Import libraries

In [43]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
%matplotlib inline
import collections
import math
import numpy as np
import pandas as pd
import os
import random
import tensorflow as tf
import zipfile
from matplotlib import pylab
from six.moves import range
from six.moves.urllib.request import urlretrieve
import tensorflow as tf

seed = 54321

%env TF_FORCE_GPU_ALLOW_GROWTH=true

env: TF_FORCE_GPU_ALLOW_GROWTH=true


## Downloading the data

In [44]:
url = 'https://github.com/ZihanWangKi/CrossWeigh/raw/master/data/'
dir_name = 'data'
#https://github.com/ZihanWangKi/CrossWeigh/raw/master/data/conllpp_train.txt
def download_data(url, filename, download_dir, expected_bytes):
    """Download a file if not present, and make sure it's the right size."""
      
    # Create directories if doesn't exist
    os.makedirs(download_dir, exist_ok=True)
    
    # If file doesn't exist download
    if not os.path.exists(os.path.join(download_dir,filename)):
        filepath, _ = urlretrieve(url + filename, os.path.join(download_dir,filename))
    else:
        filepath = os.path.join(download_dir, filename)
    
    # Check the file size
    statinfo = os.stat(filepath)
    if statinfo.st_size == expected_bytes:
        print('Found and verified %s' % filepath)
    else:
        print(statinfo.st_size)
        raise Exception(
          'Failed to verify ' + filepath + '. Can you get to it with a browser?')
        
    return filepath

# Filepaths to train/valid/test data
train_filepath = download_data(url, 'conllpp_train.txt', dir_name, 3283420)
dev_filepath = download_data(url, 'conllpp_dev.txt', dir_name, 827443)
test_filepath = download_data(url, 'conllpp_test.txt', dir_name, 748737)

Found and verified data\conllpp_train.txt
Found and verified data\conllpp_dev.txt
Found and verified data\conllpp_test.txt


## Reading the data

In [45]:
def read_data(filename):
    '''
    Read data from a file with given filename
    Returns a list of sentences (each sentence a string), 
    and list of ner labels for each string
    '''

    print("Reading data ...")
    # master lists - Holds sentences (list of tokens), ner_labels (for each token an NER label)
    sentences, ner_labels = [], [] 
    
    # Open the file
    with open(filename,'r',encoding='latin-1') as f:        
        # Read each line
        is_sos = True # We record at each line if we are seeing the beginning of a sentence
        
        # Tokens and labels of a single sentence, flushed when encountered a new one
        sentence_tokens = []
        sentence_labels = []
        i = 0
        for row in f:
            # If we are seeing an empty line or -DOCSTART- that's a new line
            if len(row.strip()) == 0 or row.split(' ')[0] == '-DOCSTART-':
                is_sos = False
            # Otherwise keep capturing tokens and labels
            else:
                is_sos = True
                token, _, _, ner_label = row.split(' ')
                sentence_tokens.append(token)
                sentence_labels.append(ner_label.strip())
            
            # When we reach the end / or reach the beginning of next
            # add the data to the master lists, flush the temporary one
            if not is_sos and len(sentence_tokens)>0:
                sentences.append(' '.join(sentence_tokens))
                ner_labels.append(sentence_labels)
                sentence_tokens, sentence_labels = [], []
    
    print('\tDone')
    return sentences, ner_labels

# Train data
train_sentences, train_labels = read_data(train_filepath) 
# Validation data
valid_sentences, valid_labels = read_data(dev_filepath) 
# Test data
test_sentences, test_labels = read_data(test_filepath) 

# Print some stats
print('Train size: {}'.format(len(train_labels)))
print('Valid size: {}'.format(len(valid_labels)))
print('Test size: {}'.format(len(test_labels)))

# Print some data
print('\nSample data\n')
for v_sent, v_labels in zip(valid_sentences[:5], valid_labels[:5]):
    print("Sentence: {}".format(v_sent))
    print("Labels: {}".format(v_labels))
    print('\n')

Reading data ...
	Done
Reading data ...
	Done
Reading data ...
	Done
Train size: 14041
Valid size: 3250
Test size: 3452

Sample data

Sentence: CRICKET - LEICESTERSHIRE TAKE OVER AT TOP AFTER INNINGS VICTORY .
Labels: ['O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


Sentence: LONDON 1996-08-30
Labels: ['B-LOC', 'O']


Sentence: West Indian all-rounder Phil Simmons took four for 38 on Friday as Leicestershire beat Somerset by an innings and 39 runs in two days to take over at the head of the county championship .
Labels: ['B-MISC', 'I-MISC', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


Sentence: Their stay on top , though , may be short-lived as title rivals Essex , Derbyshire and Surrey all closed in on victory while Kent made up for lost time in their rain-affected match against Nottinghamshire .
Labels: ['O', 'O', 'O', 'O', 'O', 'O', '

## Checking the balance of labels

In [46]:
from itertools import chain

# Print the value count for each label
print("Training data label counts")
print(pd.Series(chain(*train_labels)).value_counts())

print("\nValidation data label counts")
print(pd.Series(chain(*valid_labels)).value_counts())

print("\nTest data label counts")
print(pd.Series(chain(*test_labels)).value_counts())

Training data label counts
O         169578
B-LOC       7140
B-PER       6600
B-ORG       6321
I-PER       4528
I-ORG       3704
B-MISC      3438
I-LOC       1157
I-MISC      1155
dtype: int64

Validation data label counts
O         42759
B-PER      1842
B-LOC      1837
B-ORG      1341
I-PER      1307
B-MISC      922
I-ORG       751
I-MISC      346
I-LOC       257
dtype: int64

Test data label counts
O         38143
B-ORG      1714
B-LOC      1645
B-PER      1617
I-PER      1161
I-ORG       881
B-MISC      722
I-LOC       259
I-MISC      252
dtype: int64


## Analysing the sequence length

In [47]:
pd.Series(train_sentences).str.split().str.len().describe(percentiles=[0.05, 0.95])

count    14041.000000
mean        14.501887
std         11.602756
min          1.000000
5%           2.000000
50%         10.000000
95%         37.000000
max        113.000000
dtype: float64

## Padding/Truncating sentences to create arrays

In [49]:
from functools import partial

max_seq_length = 40

def get_label_id_map(train_labels):
    # Get the unique list of labels
    unique_train_labels = pd.Series(chain(*train_labels)).unique()
    # Create a class label -> class ID mapping
    labels_map = dict(zip(unique_train_labels, np.arange(unique_train_labels.shape[0])))
    print("labels_map: {}".format(labels_map))
    return labels_map


def get_class_weights(train_labels):
    
    label_count_ser = pd.Series(chain(*train_labels)).value_counts()
    label_count_ser = label_count_ser.sum()/label_count_ser
    label_count_ser /= label_count_ser.max()
    
    label_id_map = get_label_id_map(train_labels)
    label_count_ser.index = label_count_ser.index.map(label_id_map)
    return label_count_ser.to_dict()


def get_padded_int_labels(labels, labels_map, max_seq_length, return_mask=True):
    
    # Create a partial function with many of arguments fixed
    # Pad to/Truncate at max_seq_length
    ner_pad_sequence_func = partial(
        tf.keras.preprocessing.sequence.pad_sequences, maxlen=max_seq_length,
        padding='post', truncating='post', 
    )

    # Convert string labels to integers 
    int_labels = [[labels_map[x] for x in one_seq] for one_seq in labels]
    
    
    # Pad sequences
    if return_mask:
        # If we return mask, we first pad with a special value (-1) and 
        # use that to create the mask and later replace -1 with 'O'
        padded_labels = np.array(ner_pad_sequence_func(int_labels, value=-1))
        
        # mask filter
        mask_filter = (padded_labels != -1)
        # replace -1 with 'O' s ID
        padded_labels[~mask_filter] = labels_map['O']        
        return padded_labels, mask_filter.astype('int')
    
    else:
        padded_labels = np.array(ner_pad_sequence_func(int_labels, value=labels_map['O']))
        return padded_labels
    
train_class_weights = get_class_weights(train_labels)
print(train_class_weights)

labels_map = get_label_id_map(train_labels)

# Convert string labels to integers for all train/validation/test data
# Pad train/validation/test data
padded_train_labels, train_mask = get_padded_int_labels(
    train_labels, labels_map, max_seq_length, return_mask=True
)
padded_valid_labels, valid_mask = get_padded_int_labels(
    valid_labels, labels_map, max_seq_length, return_mask=True
)
padded_test_labels, test_mask  = get_padded_int_labels(
    test_labels, labels_map, max_seq_length, return_mask=True
)


# Print some labels IDs
print(padded_train_labels[:2])
print(train_mask[:2])

print(padded_train_labels.max())

labels_map: {'B-ORG': 0, 'O': 1, 'B-MISC': 2, 'B-PER': 3, 'I-PER': 4, 'B-LOC': 5, 'I-ORG': 6, 'I-MISC': 7, 'I-LOC': 8}
{1: 0.006811025015037328, 5: 0.16176470588235295, 3: 0.17500000000000002, 0: 0.18272425249169436, 4: 0.25507950530035334, 6: 0.31182505399568033, 2: 0.33595113438045376, 8: 0.9982713915298186, 7: 1.0}
labels_map: {'B-ORG': 0, 'O': 1, 'B-MISC': 2, 'B-PER': 3, 'I-PER': 4, 'B-LOC': 5, 'I-ORG': 6, 'I-MISC': 7, 'I-LOC': 8}
[[0 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1]
 [3 4 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1]]
[[1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]
 [1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0]]
8


## Implement a recurrent neural network

In [50]:
import tensorflow.keras.layers as layers
import tensorflow.keras.backend as K
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

K.clear_session()

def get_fitted_token_vectorization_layer(corpus, vocabulary_size=None):
    """ Fit a TextVectorization layer on given data """
    
    # Define a text vectorization layer
    vectorization_layer = TextVectorization(
        max_tokens=vocabulary_size, standardize=None,        
        output_sequence_length=max_seq_length, 
    )
    # Fit it on a corpus of data
    vectorization_layer.adapt(corpus)
    
    # Get the vocabulary size
    n_vocab = len(vectorization_layer.get_vocabulary())

    return vectorization_layer, n_vocab


# Input layer
word_input = tf.keras.layers.Input(shape=(1,), dtype=tf.string)

# Text vectorize layer
vectorize_layer, n_vocab = get_fitted_token_vectorization_layer(train_sentences)

# Vectorized output (each word mapped to an int ID)
vectorized_out = vectorize_layer(word_input)

# Look up embeddings for the returned IDs
embedding_layer = layers.Embedding(input_dim=n_vocab, output_dim=64, mask_zero=True)(vectorized_out)

# Define a simple RNN layer, it returns an output at each position
rnn_layer = layers.SimpleRNN(
    units=64, activation='tanh', use_bias=True, return_sequences=True
)

rnn_out = rnn_layer(embedding_layer)

dense_layer = layers.Dense(9, activation='softmax')
dense_out = dense_layer(rnn_out)

model = tf.keras.Model(inputs=word_input, outputs=dense_out)



## Defining a custom metric and compiling the model

In [51]:
def macro_accuracy(y_true, y_pred):
    
    # [batch size * time]
    y_true = tf.cast(tf.reshape(y_true, [-1]), 'int32')
    y_pred = tf.cast(tf.reshape(tf.argmax(y_pred, axis=-1), [-1]), 'int32')
    
    sorted_y_true = tf.sort(y_true)
    sorted_inds = tf.argsort(y_true)
    
    sorted_y_pred = tf.gather(y_pred, sorted_inds)
    
    sorted_correct = tf.cast(tf.math.equal(sorted_y_true, sorted_y_pred), 'int32')
    
    # We are adding one to make sure ther eare no division by zero
    correct_for_each_label = tf.cast(tf.math.segment_sum(sorted_correct, sorted_y_true), 'float32') + 1
    all_for_each_label = tf.cast(tf.math.segment_sum(tf.ones_like(sorted_y_true), sorted_y_true), 'float32') + 1
    
    mean_accuracy = tf.reduce_mean(correct_for_each_label/all_for_each_label)
    
    return mean_accuracy
        
mean_accuracy_metric = tf.keras.metrics.MeanMetricWrapper(fn=macro_accuracy, name='macro_accuracy')

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=[mean_accuracy_metric])

model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 1)]               0         
_________________________________________________________________
text_vectorization (TextVect (None, 40)                0         
_________________________________________________________________
embedding (Embedding)        (None, 40, 64)            1512000   
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 40, 64)            8256      
_________________________________________________________________
dense (Dense)                (None, 40, 9)             585       
Total params: 1,520,841
Trainable params: 1,520,841
Non-trainable params: 0
_________________________________________________________________


## Training the model

When training the model we will use `sample_weight` to counteract class-imbalance. We will not use `class_weight`. When using `class_weight` as follows,

```
model.fit(
        train_sentences[i:i+1], padded_train_labels[i:i+1], 
        class_weight=train_class_weights,
        batch_size=64,
        epochs=3, 
        validation_data=(np.array(valid_sentences), padded_valid_labels)
)
```

it leads to the error below,

```
InvalidArgumentError: 2 root error(s) found.
  (0) Invalid argument:  indices[0] = 11 is not in [0, 9)
	 [[{{node GatherV2}}]]
	 [[IteratorGetNext]]
  (1) Invalid argument:  indices[0] = 11 is not in [0, 9)
	 [[{{node GatherV2}}]]
	 [[IteratorGetNext]]
	 [[model/text_vectorization/cond/then/_0/model/text_vectorization/cond/Pad/_56]]
0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_7453]

Function call stack:
train_function -> train_function
```

In [52]:

def get_sample_weights_from_class_weights(labels, class_weights):
    """ From the class weights generate sample weights """
    return np.vectorize(class_weights.get)(labels)


# Make train_sequences an array
train_sentences = np.array(train_sentences)
# Get sample weights (we cannot use class_weight with TextVectorization layer)
train_sample_weights = get_sample_weights_from_class_weights(padded_train_labels, train_class_weights)

# Training the model
model.fit(
        train_sentences, padded_train_labels, 
        sample_weight=train_sample_weights,
        batch_size=64,
        epochs=3, 
        validation_data=(np.array(valid_sentences), padded_valid_labels)
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x24cbdfebcc0>

In [53]:
model.evaluate(np.array(test_sentences), padded_test_labels)



[0.11428217589855194, 0.7732274532318115]

## Defining an advance RNN model

* Token embeddings + Char embeddings
* Bidirectional RNN

### Statistics about token lenghts (for char embeddings)

In [55]:
vocab_ser = pd.Series(pd.Series(train_sentences).str.split().explode().unique())
vocab_ser.str.len().describe(percentiles=[0.05, 0.95])

count    23623.000000
mean         6.832705
std          2.749288
min          1.000000
5%           3.000000
50%          7.000000
95%         12.000000
max         61.000000
dtype: float64

## Testing `TextVectorization` for char level

In [56]:
def split_char(token):
    """ Instead of splitting word by word, split each char"""
    return tf.strings.bytes_split(token)


# Define a vectorization layer that splits chars
vectorization_layer = TextVectorization(
        standardize=None,      
        split=split_char,
)


def prepare_corpus_for_char_embeddings(tokenized_sentences, max_seq_length):
    """ Pads each sequence to a maximum length """
    proc_sentences = []
    for tokens in tokenized_sentences:
        if len(tokens) >= max_seq_length:
            proc_sentences.append([[t] for t in tokens[:max_seq_length]])
        else:
            proc_sentences.append([[t] for t in tokens+['']*(max_seq_length-len(tokens))])
            
    return proc_sentences

# Define sample data
data = ['aaaa bb c', 'd eee']
# Pad sequences
tokenized_sentences = prepare_corpus_for_char_embeddings([d.split() for d in data], 3)
print("Padded sequence: {}".format(tokenized_sentences))

# Fit it on a corpus of data
vectorization_layer.adapt(tokenized_sentences)

# Print data
print("Vectorized output: {}".format(vectorization_layer(tokenized_sentences)))
print("Vocabulary: {}".format(vectorization_layer.get_vocabulary()))

Padded sequence: [[['aaaa'], ['bb'], ['c']], [['d'], ['eee'], ['']]]
Vectorized output: [[[2 2 2 2]
  [4 4 0 0]
  [6 0 0 0]]

 [[5 0 0 0]
  [3 3 3 0]
  [0 0 0 0]]]
Vocabulary: ['', '[UNK]', 'a', 'e', 'b', 'd', 'c']


## Defining the model

In [57]:
import tensorflow.keras.layers as layers
import tensorflow.keras.backend as K
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

K.clear_session()
max_seq_length = 40
max_token_length = 12

def get_fitted_token_vectorization_layer(corpus, max_seq_length, vocabulary_size=None):
    """ Fit a TextVectorization layer on given data """
    
    # Define a text vectorization layer
    vectorization_layer = TextVectorization(
        max_tokens=vocabulary_size, standardize=None,        
        output_sequence_length=max_seq_length, 
    )
    # Fit it on a corpus of data
    vectorization_layer.adapt(corpus)
    
    # Get the vocabulary size
    n_vocab = len(vectorization_layer.get_vocabulary())

    return vectorization_layer, n_vocab


def get_fitted_char_vectorization_layer(corpus, max_seq_length, max_token_length, vocabulary_size=None):
    """ Fit a TextVectorization layer on given data """
    
    def split_char(token):
        return tf.strings.bytes_split(token)

    # Define a text vectorization layer
    vectorization_layer = TextVectorization(
        standardize=None,      
        split=split_char,
        output_sequence_length=max_token_length, 
    )

    tokenized_sentences = [sent.split() for sent in corpus]
    padded_tokenized_sentences = prepare_corpus_for_char_embeddings(tokenized_sentences, max_seq_length)
    
    # Fit it on a corpus of data
    vectorization_layer.adapt(padded_tokenized_sentences)
    
    # Get the vocabulary size
    n_vocab = len(vectorization_layer.get_vocabulary())

    return vectorization_layer, n_vocab


# Input layer (tokens)
word_input = tf.keras.layers.Input(shape=(1,), dtype=tf.string)

# Text vectorize layer (token)
token_vectorize_layer, n_token_vocab = get_fitted_token_vectorization_layer(train_sentences, max_seq_length)
# Text vectorize layer (char)
char_vectorize_layer, n_char_vocab = get_fitted_char_vectorization_layer(train_sentences, max_seq_length, max_token_length)

# Vectorized output (each word mapped to an int ID)
token_vectorized_out = token_vectorize_layer(word_input)


# Vectorized output of each token
# Need a [batch size, seq len, 1]
# strings.split() returns a RaggedTensor. It needs to be converted to a Tensor. Otherwise the following error will be raised
# InvalidArgumentError:  assertion failed: [the given axis (axis = 2) is not squeezable!]
#	 [[node model/text_vectorization_1/RaggedSqueeze/Assert/Assert (defined at <ipython-input-26-a2f55ee22434>:17) ]] [Op:__inference_train_function_72435]
tokenized_word_input = layers.Lambda(
    lambda x: tf.strings.split(x).to_tensor(default_value='', shape=[None, max_seq_length, 1])
)(word_input)
char_vectorized_out = char_vectorize_layer(tokenized_word_input)

# Look up embeddings for the returned IDs
token_embedding_out = layers.Embedding(input_dim=n_token_vocab, output_dim=64, mask_zero=True)(token_vectorized_out)

# Produces a [batch size, seq length, token_length, emb size]
char_embedding_layer = layers.Embedding(input_dim=n_char_vocab, output_dim=32, mask_zero=True)(char_vectorized_out)

# A 1D convolutional layer that will generate token embeddings by shifting a convolutional kernel over 
# the sequence of chars in each token (padded)
cnn_output = layers.Conv1D(filters=1, kernel_size=5, strides=1, padding='same', activation='relu')(char_embedding_layer)
# There is an additional dimension of size 1 (out channel dimension) that we need to remove
cnn_output = layers.Lambda(lambda x: x[:, :, :, 0])(cnn_output)

# Concatenate the token and char embeddings
concat_embedding_out = layers.Concatenate()([token_embedding_out, cnn_output])

# Define a simple bidirectional RNN layer, it returns an output at each position
rnn_layer_1 = layers.Bidirectional(layers.SimpleRNN(
    units=64, activation='tanh', use_bias=True, return_sequences=True
))

rnn_out_1 = rnn_layer_1(concat_embedding_out)

# Defines the final prediction layer
dense_layer = layers.Dense(9, activation='softmax')
dense_out = dense_layer(rnn_out_1)

# Defines the model
char_token_embedding_rnn = tf.keras.Model(inputs=word_input, outputs=dense_out)
 
# Define a macro accuracy measure
mean_accuracy_metric = tf.keras.metrics.MeanMetricWrapper(fn=macro_accuracy, name='macro_accuracy')

# Compile the model with a loss optimizer and metrics
char_token_embedding_rnn.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=[mean_accuracy_metric])

# Summary of the model
char_token_embedding_rnn.summary()


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
lambda (Lambda)                 (None, 40, 1)        0           input_1[0][0]                    
__________________________________________________________________________________________________
text_vectorization_1 (TextVecto (None, None, 12)     0           lambda[0][0]                     
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 12, 32) 2752        text_vectorization_1[0][0]       
______________________________________________________________________________________________

## Training the model

In [58]:
def get_sample_weights_from_class_weights(labels, class_weights):
    """ From the class weights generate sample weights """
    return np.vectorize(class_weights.get)(labels)


# Make train_sequences an array
train_sentences = np.array(train_sentences)
# Get sample weights (we cannot use class_weight with TextVectorization layer)
train_sample_weights = get_sample_weights_from_class_weights(padded_train_labels, train_class_weights)

# Training the model
char_token_embedding_rnn.fit(
    train_sentences, padded_train_labels,
    sample_weight=train_sample_weights,
    batch_size=64,
    epochs=3, 
    validation_data=(np.array(valid_sentences), padded_valid_labels)
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x24cc713ca90>

## Evaluate the model on test data

In [59]:
char_token_embedding_rnn.evaluate(np.array(test_sentences), padded_test_labels)



[0.1049954742193222, 0.8327016830444336]