# Stock Entity Recognition Masked

In [1]:
import pandas
import re
import json
import math
import numpy
import os
import tensorflow as tf
import concurrent.futures
import random

from itertools import chain
from multiprocessing import Pool
from functools import partial
from transformers import TFBertForSequenceClassification, BertTokenizerFast

from sklearn.model_selection import train_test_split

## Config

In [2]:
# number of worker processes, and of dataframe chunks, used when building the masked observations
THREADS = 48

# TensorFlow device string
# NOTE(review): not referenced by any cell visible in this notebook — confirm it is still needed
DEVICE = '/gpu:0'

# parquet file with submissions that contain no stock symbols (single 'content' column)
SUBMISSIONS_WITHOUT_SYMBOLS_LOC = 'data/ner/submissions_without_symbols.parquet'

# parquet files with space separated tokens and their 0/1 symbol labels, for the train and test splits
TOKENS_AND_LABELS_TRAIN_LOC = 'data/ner/tokens_and_labels_train.parquet'
TOKENS_AND_LABELS_TEST_LOC = 'data/ner/tokens_and_labels_test.parquet'

# checkpoint path the model weights are saved to during training and loaded from for inference
MODEL_LOC = 'data/ner/masked/best.ckpt'

## Data

In [3]:
# training split: one row per submission, with space separated tokens and a matching 0/1 label per token
tokens_and_labels_train = pandas.read_parquet(TOKENS_AND_LABELS_TRAIN_LOC)
tokens_and_labels_train

Unnamed: 0,tokens,labels
29383,pretty sure USO is the closest to 1:1 with raw...,0 0 0 0 0 0 0 0 0 0 1
109062,Everyone knows there’s 2 rules to follow: neve...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
117238,Ford has been mismanaged for years . They cost...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 ...
36584,I own TERP . They will likely be bought out by...,0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 ...
125747,Just did a MMM 162.5c 3/6 . More masks please!,0 0 0 1 0 0 0 0 0 0
...,...,...
147873,"Watch a deal happens , we moon , then JP gets ...",0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
161762,Look up [TEMPEST](https://youtu.be/APBSaJ5AA_c...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
72042,"I had a bunch of SNDL I got cheap so I , too ,...",0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
138661,Taking financial advice from a convicted felon...,0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0


In [4]:
# load submissions without symbols; they are shuffled and masked further down to augment
# the masked observations with label-0 examples
submissions_without_symbols = pandas.read_parquet(SUBMISSIONS_WITHOUT_SYMBOLS_LOC)
submissions_without_symbols

Unnamed: 0,content
3,Thank you.
4,why do companies buy back shares?
5,Share buybacks should not change the value of ...
11,Directional iron condors using carefully place...
13,No You would have had to have earned income la...
...,...
31250,Blowing versus sucking
31251,Not punny
31252,VWDRY is up 3.5% today
31254,Ticker: $FAG


In [5]:
def load_symbols():
    """Load the ticker symbols of all supported exchanges into one set.

    Reads ``data/<exchange>_symbols.tsv`` for amex, nasdaq and nyse and collects the
    values of the ``Symbol`` column.

    Returns:
        set of str: every non-empty symbol found across the exchange files.

    Raises:
        FileNotFoundError: if one of the exchange files is missing.
    """
    exchanges = ['amex', 'nasdaq', 'nyse']

    # create a set of symbols across all exchanges
    symbols = set()
    for exchange in exchanges:
        exchange_symbols = pandas.read_csv(
            f'data/{exchange}_symbols.tsv',
            sep='\t',
            usecols=['Symbol'],
            # keep every symbol as text; by default pandas parses strings such as
            # 'NA', 'NULL' or 'NaN' as missing values, which would drop real tickers
            dtype=str,
            keep_default_na=False,
        )
        # add the whole column at once instead of iterating row by row, skipping blank cells
        symbols.update(symbol for symbol in exchange_symbols['Symbol'] if symbol)

    return symbols

# load a set of all symbols; we're going to use this just to double check masking
symbols = load_symbols()
len(symbols)

9738

In [6]:
def chunk_df(df, chunks):
    """Split `df` into `chunks` consecutive, independent slices.

    The chunk boundaries are the rounded multiples of ``len(df) / chunks``, so the slices
    cover every row exactly once; some slices may be empty when there are fewer rows
    than chunks.

    Returns:
        list of deep copies of the slices, in row order.
    """
    rows_per_chunk = len(df.index) / chunks

    pieces = []
    for part in range(chunks):
        start = round(rows_per_chunk * part)
        stop = round(rows_per_chunk * (part + 1))
        pieces.append(df[start:stop].copy(deep=True))

    return pieces

## Masking

We want to mask one token at a time and predict whether the masked token is a stock symbol or not. We also want an even distribution of symbols and non-symbols, so for every symbol in a sentence we create the same number of observations with a masked non-symbol.

In [7]:
def mask_submission_with_symbols(row, observations=None):
    """Build masked observations from one labeled submission.

    Every token labeled as a symbol produces one observation (label 1) in which that token
    is replaced by '[MASK]'. Up to the same number of non-symbol tokens are picked at random
    and each produces an observation with label 0.

    Args:
        row: mapping / Series with 'tokens' (space separated tokens) and 'labels'
            (space separated integer flags, one per token, non-zero meaning symbol).
        observations: list the ``[masked_sentence, label]`` pairs are appended to;
            a new list is created when omitted.

    Returns:
        The list the observations were appended to.
    """
    # create the list per call; a mutable default would be shared between calls and
    # silently accumulate the observations of every earlier call
    if observations is None:
        observations = []

    tokens = row['tokens'].split(' ')
    labels = numpy.array(row['labels'].split(' ')).astype(int)

    # get all the token indexes that are labeled as symbols (taken before the pass below,
    # so only labeled symbols become positive examples)
    symbols_idxs = numpy.nonzero(labels)[0]

    # since we only labeled less frequently used symbols, there would be more popular symbols that
    # were not labeled, so do one more pass to flag those and keep them out of the negative examples
    for i, token in enumerate(tokens):
        if token.upper() in symbols:
            labels[i] = 1

    # get all the token indexes that are not symbols
    all_non_symbol_idxs = numpy.nonzero(labels == 0)[0]

    # pick same number of random non symbol tokens, up to half of the available ones
    sample_size = min(len(symbols_idxs), round(len(all_non_symbol_idxs) / 2))
    if sample_size > 0:
        non_symbol_idxs = all_non_symbol_idxs[numpy.random.choice(
            len(all_non_symbol_idxs), sample_size, replace=False
        )]
    else:
        # nothing to sample (e.g. the submission consists only of symbols)
        non_symbol_idxs = all_non_symbol_idxs[:0]

    for idxs, label in [(symbols_idxs, 1), (non_symbol_idxs, 0)]:
        for idx in idxs:
            new_observation = tokens.copy()
            new_observation[idx] = '[MASK]'
            observations.append([' '.join(new_observation), label])

    return observations

def mask_submissions_with_symbols(df):
    """Collect the masked observations of every labeled submission in `df`.

    Each row is handed to `mask_submission_with_symbols`, which appends its
    observations to one shared list.

    Returns:
        list of ``[masked_sentence, label]`` pairs for the whole frame.
    """
    collected = []

    records = (record for _, record in df.iterrows())
    for record in records:
        mask_submission_with_symbols(record, collected)

    return collected

def mask_submission_without_symbols(row, observations=None):
    """Build one negative (label 0) observation by masking a random token of a submission.

    Args:
        row: mapping / Series with a 'content' entry holding the submission text.
        observations: list the ``[masked_sentence, 0]`` pair is appended to;
            a new list is created when omitted.

    Returns:
        The list the observation was appended to; it is returned unchanged when
        the content is null.
    """
    # create the list per call; a mutable default would be shared between calls and
    # silently accumulate the observations of every earlier call
    if observations is None:
        observations = []

    if pandas.isnull(row['content']):
        return observations

    tokens = row['content'].split(' ')

    # mask a random token (split always yields at least one token, so the range is never empty)
    tokens[random.randrange(len(tokens))] = '[MASK]'

    observations.append([' '.join(tokens), 0])

    return observations

def mask_submissions_without_symbols(df):
    """Collect one masked, label-0 observation per submission in `df`.

    Each row is handed to `mask_submission_without_symbols`, which appends to one
    shared list.

    Returns:
        list of ``[masked_sentence, label]`` pairs for the whole frame.
    """
    masked = []

    for indexed_row in df.iterrows():
        # iterrows yields (index, row); only the row is needed
        mask_submission_without_symbols(indexed_row[1], masked)

    return masked

In [8]:
def _mask_in_parallel(frame, mask_chunk):
    """Split `frame` into THREADS chunks, run `mask_chunk` on each chunk in its own
    worker process and merge the returned observations in completion order."""
    frame_chunks = chunk_df(frame, THREADS)

    merged = []
    with concurrent.futures.ProcessPoolExecutor(max_workers=THREADS) as pool:
        pending = [pool.submit(mask_chunk, piece) for piece in frame_chunks]
        for finished in concurrent.futures.as_completed(pending):
            merged.extend(finished.result())

    return merged

# process training set which are submissions with symbols
training_observations = _mask_in_parallel(tokens_and_labels_train, mask_submissions_with_symbols)

# add some submissions without symbols so not biased to language of a specific type;
# shuffle them and keep at most as many as there are submissions with symbols
training_observations.extend(_mask_in_parallel(
    submissions_without_symbols.sample(frac=1)[0:len(tokens_and_labels_train)],
    mask_submissions_without_symbols
))

training_observations = pandas.DataFrame(training_observations, columns=['tokens', 'label'])
training_observations

Unnamed: 0,tokens,label
0,I bought [MASK] & PACB so yea,1
1,I [MASK] CRSP & PACB so yea,0
2,DD: my [MASK] pilot buddy is predicting that t...,1
3,DD: my AA pilot buddy is predicting that they ...,0
4,too late . In reality though the next service ...,1
...,...,...
660193,[MASK],0
660194,Poor Michael [MASK],0
660195,[MASK],0
660196,[MASK],0


In [10]:
# tokenizer for the 'bert-base-uncased' checkpoint (the same checkpoint the model is created from); used by encode_df
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

def encode_df(df, max_length=256, batch_size=32):
    """Tokenize the masked sentences and pair them with their labels in a batched dataset.

    Args:
        df: frame with a 'tokens' column (sentences) and a 'label' column.
        max_length: every encoding is padded / truncated to exactly this many token ids.
        batch_size: number of observations per batch; defaults to 32, the value that
            used to be hard-coded.

    Returns:
        tf.data.Dataset yielding ``(features, labels)`` batches, where features is a dict
        with 'input_ids', 'token_type_ids' and 'attention_mask'.
    """
    # encode everything
    inputs_encoded = tokenizer(
        df['tokens'].tolist(),
        return_tensors="tf",
        # make sure the same length across all encodings
        max_length=max_length,
        padding='max_length',
        truncation=True
    )

    # one label per row, as a column vector
    labels_encoded = numpy.array(df['label']).reshape((df.shape[0], 1))

    feature_names = ('input_ids', 'token_type_ids', 'attention_mask')
    features = {name: inputs_encoded[name] for name in feature_names}

    return tf.data.Dataset.from_tensor_slices((features, labels_encoded)).batch(batch_size)

In [11]:
# shuffle the rows (sample(frac=1) returns all of them in random order) and encode them
# into a batched dataset
encoded_dataset = encode_df(training_observations.sample(frac=1))

In [None]:
%%time
# start from the pretrained 'bert-base-uncased' weights with a 2-class classification head
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

optimizer = tf.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])
model.summary()

# save only the weights to MODEL_LOC while training
# NOTE(review): neither save_best_only nor validation data is configured, so despite the
# 'best.ckpt' file name this presumably keeps the latest save rather than the best one — confirm
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=MODEL_LOC,
    save_weights_only=True,
    verbose=1
)

model.fit(encoded_dataset, epochs=5, callbacks=[cp_callback])

## Inference

In [13]:
# rebuild the tokenizer and the model architecture, then restore the weights saved at MODEL_LOC
# so inference can run without re-running the training cell
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.load_weights(MODEL_LOC)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fa27d7d8b80>

In [39]:
def predict(sentence, max_length=256):
    """Classify one sentence with the loaded model.

    Args:
        sentence: text to classify, passed to the tokenizer as is.
        max_length: the encoding is padded / truncated to exactly this many token ids.

    Returns:
        Scalar tensor holding the index of the largest logit of the first encoded
        sequence.
    """
    tokenizer_options = dict(
        return_tensors="tf",
        # make sure the same length across all encodings
        max_length=max_length,
        padding='max_length',
        truncation=True
    )
    encoded_sentence = tokenizer(sentence, **tokenizer_options)

    # only the logits of the first sequence are considered
    first_logits = model(encoded_sentence).logits[0]

    return tf.argmax(first_logits)

In [35]:
# sanity check: classify a sentence that consists of nothing but the mask token
predict('[MASK]')

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([ 1.3705293, -1.370667 ], dtype=float32)>

In [55]:
# test split, with the same tokens / labels columns as the training split
# NOTE(review): loaded here but not used by any cell visible in this notebook
tokens_and_labels_test = pandas.read_parquet(TOKENS_AND_LABELS_TEST_LOC)
tokens_and_labels_test

Unnamed: 0,tokens,labels
56037,"MSFT , V , BRK.B , AAPL , JNJ",0 0 0 0 0 0 0 0 1
38806,Yes . Look at oil company Whiting Petroleum . ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
115771,INO and ten cent plays?,1 0 0 0 0
30274,SE,1
52170,one is the MSI index.,0 0 0 1 0
...,...,...
45078,EOG is at a pretty good price right now IMO . ...,1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
166128,HPQ To the mooooooooon,1 0 0 0
52986,"Ah , didn't realize Yahoo only updated the NAV...",0 0 0 0 0 0 0 0 1 0 0 0
162531,"Sears BK , Mattress Firm BK What is the best w...",0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 ...


In [50]:
# classify a single submission that contains no symbols; `iloc[]` without a position is a
# syntax error, so select one row explicitly (the first one here — pick any row of interest)
predict(submissions_without_symbols.iloc[0]['content'])

<tf.Tensor: shape=(), dtype=int64, numpy=0>