# Stock Entity Recognition Unmasked

In [1]:
import pandas
import re
import json
import math
import numpy
import os
import tensorflow as tf

from itertools import chain
from multiprocessing import Pool
from functools import partial
from transformers import TFBertForTokenClassification, BertTokenizerFast

from sklearn.model_selection import train_test_split

## Config

In [2]:
# Locations of the NER artifacts: the tokenised/labelled train and test
# splits, and the checkpoint path the training callback writes to.
TOKENS_AND_LABELS_TRAIN_LOC = 'data/ner/tokens_and_labels_train.parquet'
TOKENS_AND_LABELS_TEST_LOC = 'data/ner/tokens_and_labels_test.parquet'
MODEL_LOC = 'data/ner/unmasked/best.ckpt'

# TensorFlow device string used for the sample inference call.
DEVICE = '/cpu:0'

# Size of the worker pool (and number of chunks) used when encoding labels.
THREADS = 48

## Training

In [3]:
# Read the training split. As the preview shows, each row carries a `tokens`
# string (space-separated words) and a `labels` string (space-separated
# integer labels, one per word).
tokens_and_labels_train = pandas.read_parquet(TOKENS_AND_LABELS_TRAIN_LOC)
# Bare last expression so the notebook renders a preview of the frame.
tokens_and_labels_train

Unnamed: 0,tokens,labels
29383,pretty sure USO is the closest to 1:1 with raw...,0 0 0 0 0 0 0 0 0 0 1
109062,Everyone knows there’s 2 rules to follow: neve...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
117238,Ford has been mismanaged for years . They cost...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 ...
36584,I own TERP . They will likely be bought out by...,0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 ...
125747,Just did a MMM 162.5c 3/6 . More masks please!,0 0 0 1 0 0 0 0 0 0
...,...,...
147873,"Watch a deal happens , we moon , then JP gets ...",0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
161762,Look up [TEMPEST](https://youtu.be/APBSaJ5AA_c...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
72042,"I had a bunch of SNDL I got cheap so I , too ,...",0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
138661,Taking financial advice from a convicted felon...,0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0


In [4]:
# Pretrained uncased BERT tokenizer (fast variant), shared as a global by the
# encoding helpers; encode_df asks it for offset mappings.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

def encode_labels(max_length, chunk):
    """Build the padded label matrix for one chunk of observations.

    Parameters
    ----------
    max_length : int
        Width of every output row.
    chunk : sequence of (offset_mapping, labels_raw) pairs
        ``offset_mapping`` is iterated as one (start, end) pair per token
        position; ``labels_raw`` is a space-separated string of integer
        labels.

    Returns
    -------
    numpy.ndarray
        int8 array of shape ``(len(chunk), max_length)``. Each position whose
        offset starts at 0 with a non-zero end (treated as the first word
        piece of a word) receives the next unused label, in order. All other
        positions keep the filler -100, the value the loss function is
        expected to skip.

    Raises
    ------
    IndexError
        If an observation has more first-word-piece positions than labels.
    """
    ignored_label = -100
    encoded = numpy.full((len(chunk), max_length), ignored_label, dtype=numpy.int8)

    for row, (spans, raw) in enumerate(chunk):
        word_labels = numpy.array(raw.split(' ')).astype(int)

        # token positions that open a word: start offset 0, non-empty span
        first_piece_positions = [
            position
            for position, span in enumerate(spans)
            if span[0] == 0 and span[1] != 0
        ]

        # the n-th opening position takes the n-th label
        for nth_word, position in enumerate(first_piece_positions):
            encoded[row][position] = word_labels[nth_word]

    return encoded

def encode_df(df, max_length=256):
    """Tokenize ``df['tokens']`` and build the matching padded label matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``tokens`` column (space-separated words) and a
        ``labels`` column (space-separated integer labels).
    max_length : int, optional
        Length every encoding is padded or truncated to.

    Returns
    -------
    tuple
        ``(inputs_encoded, labels)``: the tokenizer output (TensorFlow
        tensors, offset mapping included) and the stacked label array
        produced by ``encode_labels``, one row per observation.
    """
    # split on single spaces ourselves so the words line up with the labels
    words_per_row = [sentence.split(' ') for sentence in df['tokens']]

    inputs_encoded = tokenizer(
        words_per_row,
        is_split_into_words=True,
        return_tensors="tf",
        # offsets are what encode_labels uses to find each word's first piece
        return_offsets_mapping=True,
        # identical length for every encoding
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )

    offsets = inputs_encoded.offset_mapping.numpy()
    paired = list(zip(offsets, df['labels']))

    # cut the work into THREADS contiguous slices of near-equal size
    per_worker = len(paired) / THREADS
    boundaries = [round(per_worker * k) for k in range(THREADS + 1)]
    work = [paired[lo:hi] for lo, hi in zip(boundaries[:-1], boundaries[1:])]

    label_encoder = partial(encode_labels, max_length)
    with Pool(THREADS) as pool:
        per_chunk_labels = pool.map(label_encoder, work)

    # flatten the per-chunk matrices back into one row per observation
    all_rows = list(chain.from_iterable(per_chunk_labels))
    return inputs_encoded, numpy.stack(all_rows)

In [5]:
# Encode the training frame; `encoded` is the (tokenizer output, labels) pair.
encoded = encode_df(tokens_and_labels_train)
train_inputs, train_labels = encoded

# Only these tokenizer outputs are handed to the model as features.
train_features = {
    key: train_inputs[key]
    for key in ('input_ids', 'token_type_ids', 'attention_mask')
}

# Pair features with labels and serve them in batches of 32.
tf_train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((train_features, train_labels))
    .batch(32)
)

In [6]:
%%time
# Start from pretrained BERT with a fresh 2-label token-classification head.
model = TFBertForTokenClassification.from_pretrained('bert-base-uncased', num_labels = 2)

optimizer = tf.optimizers.Adam(learning_rate=5e-5)
# The model's own loss is used; per the note in encode_labels it is expected
# to skip positions labelled -100.
# NOTE(review): the 'accuracy' metric may still count those -100 positions - confirm.
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])
model.summary()

# Save weights only, to the same path after every epoch (see the log below).
# NOTE(review): no monitor/save_best_only and no validation data are given, so
# despite the file name the checkpoint appears to hold the last epoch's
# weights - confirm this is intended.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=MODEL_LOC,
    save_weights_only=True,
    verbose=1
)

# Last expression of the cell, so the returned History object is displayed.
model.fit(tf_train_dataset, epochs=5, callbacks=[cp_callback])

All model checkpoint layers were used when initializing TFBertForTokenClassification.

Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_token_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108891648 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 108,893,186
Trainable params: 108,893,186
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug,

  return py_builtins.overload_of(f)(*args)



Epoch 00001: saving model to data/ner/unmasked/best.ckpt
Epoch 2/5

Epoch 00002: saving model to data/ner/unmasked/best.ckpt
Epoch 3/5

Epoch 00003: saving model to data/ner/unmasked/best.ckpt
Epoch 4/5

Epoch 00004: saving model to data/ner/unmasked/best.ckpt
Epoch 5/5

Epoch 00005: saving model to data/ner/unmasked/best.ckpt
CPU times: user 1h 24min 27s, sys: 10min 15s, total: 1h 34min 43s
Wall time: 2h 6min 38s


<tensorflow.python.keras.callbacks.History at 0x7f2190887bb0>

## Inference

In [7]:
# load up the tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = TFBertForTokenClassification.from_pretrained('bert-base-uncased', num_labels = 2)
model.load_weights(MODEL_LOC)

All model checkpoint layers were used when initializing TFBertForTokenClassification.

Some layers of TFBertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f23fa078f40>

In [8]:
def predict(sentence, max_length=256):
    """Flag which words of ``sentence`` the model labels as class 1.

    The sentence is split on single spaces, tokenized as pre-split words with
    the global ``tokenizer`` and run through the global ``model``. The
    prediction made at the first word piece of each word is reported for the
    whole word.

    Parameters
    ----------
    sentence : str
        Text to label; words are separated by single spaces.
    max_length : int, optional
        Length the encoding is padded or truncated to. Words cut off by
        truncation do not appear in the result.

    Returns
    -------
    list of [str, bool]
        One ``[word, is_class_1]`` pair per word that produced at least one
        word piece, in sentence order. Words that produce no word pieces
        (e.g. the empty strings created by consecutive spaces) are skipped
        instead of shifting the pairing of the words that follow.
    """
    tokens = sentence.split(' ')

    test_encoding = tokenizer(
        tokens,
        return_tensors="tf",
        is_split_into_words=True,
        max_length=max_length,
        padding='max_length',
        truncation=True
    )

    # Index into `tokens` for every token position; None marks special and
    # padding positions, so no separate attention-mask count is needed.
    word_ids = test_encoding.word_ids(0)

    # Most likely label per position (works for any number of labels).
    logits = model(test_encoding).logits[0]
    predicted_labels = tf.argmax(logits, axis=-1).numpy()

    token_predictions = []
    previous_word_id = None

    for position, word_id in enumerate(word_ids):
        # A change of word id marks the first word piece of a new word.
        if word_id is not None and word_id != previous_word_id:
            token_predictions.append(
                [tokens[word_id], bool(predicted_labels[position] == 1)]
            )
        previous_word_id = word_id

    return token_predictions

In [26]:
# Run one sample sentence through the model on the configured device, then
# render the per-word predictions.
with tf.device(DEVICE):
    sample_predictions = predict('my msft is crm')

display(sample_predictions)

[['my', False], ['msft', False], ['is', False], ['crm', True]]