## Multiclass Classifier for Announcements issued by Listed Companies on the Hong Kong Stock Exchange

In [18]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Activation, Dense, Dropout, Input, Embedding, BatchNormalization, MultiHeadAttention, LayerNormalization, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly

In [42]:
# Get data from scraped CSVs
CCT = '/content/drive/My Drive/Data/HKEX/ch14A_cct.csv'
NT = '/content/drive/My Drive/Data/HKEX/notifiable_transactions.csv'
TK = '/content/drive/My Drive/Data/HKEX/takeovers_code_3_7.csv'
AGM = '/content/drive/My Drive/Data/HKEX/notice_of_agm.csv'
TH = r'/content/drive/My Drive/Data/HKEX/trading halt.csv'
AR = '/content/drive/My Drive/Data/HKEX/annual_results_annt.csv'

ct = pd.read_csv(CCT)
nt = pd.read_csv(NT)
tk = pd.read_csv(TK)
agm = pd.read_csv(AGM)
th = pd.read_csv(TH)
ar = pd.read_csv(AR)

In [43]:
# Trim data sources such that the amount of training examples from different categories are not vastly different
agm = agm.sample(frac=1)
agm = agm[:3000]
ar = ar.sample(frac=1)
ar = ar[:3000]

In [44]:
# Concat all data sources
df = ct.append(nt).append(tk).append(agm).append(th).append(ar)
df

Unnamed: 0,URL,Text,Label
0,https://www1.hkexnews.hk/listedco/listconews/s...,Hong Kong Exchanges and Clearing Limited and T...,Chapter 14A Connected Transaction Announcement
1,https://www1.hkexnews.hk/listedco/listconews/s...,Hong Kong Exchanges and Clearing Limited and T...,Chapter 14A Connected Transaction Announcement
2,https://www1.hkexnews.hk/listedco/listconews/s...,Hong Kong Exchanges and Clearing Limited and T...,Chapter 14A Connected Transaction Announcement
3,https://www1.hkexnews.hk/listedco/listconews/s...,Hong Kong Exchanges and Clearing Limited ...,Chapter 14A Connected Transaction Announcement
4,https://www1.hkexnews.hk/listedco/listconews/s...,Hong Kong Exchanges and Clearing Limited and T...,Chapter 14A Connected Transaction Announcement
...,...,...,...
814,https://www1.hkexnews.hk/listedco/listconews/s...,Hong Kong Exchanges and Clearing Limited and T...,Annual Results Announcement
2774,https://www1.hkexnews.hk/listedco/listconews/s...,Hong Kong Exchanges and Clearing Limited and T...,Annual Results Announcement
1921,https://www1.hkexnews.hk/listedco/listconews/s...,Hong Kong Exchanges and Clearing Limited and T...,Annual Results Announcement
81,https://www1.hkexnews.hk/listedco/listconews/g...,Hong Kong Exchanges and Clearing Limited and T...,Annual Results Announcement


In [45]:
# Shuffle dataframe
df = df.sample(frac = 1)

In [46]:
# Create matrix of labels, precursor to one-hot encodings
df['Notifiable Transaction'] = df.Label.apply(lambda x: 1 if x == "Notifiable Transactions" else 0)
df['Takeover Offer'] = df.Label.apply(lambda x: 1 if x == "Takeovers Code 3.7 Announcement" else 0)
df['Connected Transaction'] = df.Label.apply(lambda x:1 if x == "Chapter 14A Connected Transaction Announcement" else 0)
df['Notice of AGM'] = df.Label.apply(lambda x:1 if x == "Notice of AGM" else 0)
df['Trading Halt'] = df.Label.apply(lambda x:1 if x == 'Trading Halt' else 0)
df['Annual Results'] = df.Label.apply(lambda x:1 if x == 'Annual Results Announcement' else 0)
df.head()

Unnamed: 0,URL,Text,Label,Notifiable Transaction,Takeover Offer,Connected Transaction,Notice of AGM,Trading Halt,Annual Results
7441,https://www1.hkexnews.hk/listedco/listconews/s...,Hong Kong Exchanges and Clearing Limited and T...,Notice of AGM,0,0,0,1,0,0
2064,https://www1.hkexnews.hk/listedco/listconews/s...,Hong Kong Exchanges and Clearing Limited and T...,Annual Results Announcement,0,0,0,0,0,1
1791,https://www1.hkexnews.hk/listedco/listconews/s...,Hong Kong Exchanges and Clearing Limited and T...,Annual Results Announcement,0,0,0,0,0,1
202,https://www1.hkexnews.hk/listedco/listconews/s...,Hong Kong Exchanges and Clearing Limited ...,Takeovers Code 3.7 Announcement,0,1,0,0,0,0
2350,https://www1.hkexnews.hk/listedco/listconews/s...,Hong Kong Exchanges and Clearing Limited and T...,Notifiable Transactions,1,0,0,0,0,0


In [47]:
# Extract matrix of labels
labels = df[['Notifiable Transaction', 'Takeover Offer', 'Connected Transaction', 'Notice of AGM', 'Trading Halt', 'Annual Results']]
labels.head()

Unnamed: 0,Notifiable Transaction,Takeover Offer,Connected Transaction,Notice of AGM,Trading Halt,Annual Results
7441,0,0,0,1,0,0
2064,0,0,0,0,0,1
1791,0,0,0,0,0,1
202,0,1,0,0,0,0
2350,1,0,0,0,0,0


## Text Preprocessing

Text preprocessing is an important step of any NLP application. My preliminary testing on the model with fewer labels and fewer training examples indicated that removing stop words leads to a drastically decreased accuracy on the classifier. This may be because a bidirectional LSTM would be more able to capture the semantic meanings of the corpus with the stop words intact. 

In [48]:
# Text Preprocessing, removing stop words vastly decreased training accuracy
def preprocess(corpus):
    corpus = corpus.lower()
    cleaner = re.compile(r'[^a-z 0-9]')
    corpus = re.sub('\n', ' ', corpus)
    corpus = re.sub(cleaner, ' ', corpus)
    corpus = re.sub(r'\s+', ' ', corpus)
  #  corpus = ' '.join([word for word in corpus.split() if word not in stopwords.words('english')])
    return corpus

In [49]:
# Extract X Y values
corpus_list = list(df.Text)
X = [preprocess(corpus) for corpus in corpus_list]

Y = labels.values

In [50]:
# Create train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

# Tokenization and Padding

Each text corpus in the training examples X_train are converted into an array of tokens, where each token will be word-vectors representating the token. 

Each of the inputs will be normalized in terms of length, with a max length on each input chosen to be 1500. This number came about after extensive testing, in addition to some intuitive feature engineering, such as knowing that the most important content of any announcement is located towards the beginning of the announcement. 

For announcements with less than 1500 words, null tokens will be used to pad the announcement until the 1500 word count is reached.


In [51]:
# Process X data into tokenized and padded input
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

vocab_size = len(tokenizer.word_index) + 1
max_len = 3000
X_train = pad_sequences(X_train, padding = 'post', maxlen = max_len)
X_test = pad_sequences(X_test, padding = 'post', maxlen = max_len)

print(f'vocab_size = {vocab_size}')

vocab_size = 74794


In [52]:
import keras
(x_train, y_train), (x_val, y_val) = keras.datasets.imdb.load_data(num_words=vocab_size)



Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray



In [21]:
x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=max_len)
x_val = keras.preprocessing.sequence.pad_sequences(x_val, maxlen=max_len)

In [22]:
X_train.shape, x_train.shape

((9546, 2000), (25000, 2000))

In [31]:
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential([Dense(ff_dim, activation='relu'), Dense(embed_dim),])
        self.layernorm1 = LayerNormalization() 
        self.layernorm2 = LayerNormalization() 
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)


In [61]:
class TokenPositionEmbedding(tf.keras.layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super(TokenPositionEmbedding, self).__init__()
        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)
    
    def call(self,x):
        maxlen=tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions 


In [62]:
def Transformers():
    inputs = Input(shape = (max_len,))
    embedding_layer = TokenPositionEmbedding(max_len, vocab_size, embed_dim)
    layer = embedding_layer(inputs)
    transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
    layer = transformer_block(layer)
    layer = GlobalAveragePooling1D()(layer)
    layer = Dropout(0.2)(layer)
    layer = Dense(128, activation='relu')(layer)
    layer = Dropout(0.2)(layer)
    outputs = Dense(6, activation='softmax')(layer)
    model = Model(inputs=inputs, outputs=outputs)
    return model

In [66]:
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

model = Transformers()
opt = tf.keras.optimizers.Adam(lr=1e-3)
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

In [67]:
model.summary()
#tf.keras.utils.plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=False)

Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        [(None, 3000)]            0         
_________________________________________________________________
token_position_embedding_8 ( (None, 3000, 32)          2489408   
_________________________________________________________________
transformer_block_8 (Transfo (None, 3000, 32)          10656     
_________________________________________________________________
global_average_pooling1d_8 ( (None, 32)                0         
_________________________________________________________________
dropout_34 (Dropout)         (None, 32)                0         
_________________________________________________________________
dense_33 (Dense)             (None, 128)               4224      
_________________________________________________________________
dropout_35 (Dropout)         (None, 128)               0   

In [68]:
history = model.fit(X_train, Y_train,
                    epochs=10,
                    validation_data=(X_test, Y_test),
                    batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [69]:
acc = model.evaluate(X_test, Y_test)
print(f'Validation set results \nLoss: {acc[0]:.4f} \nAccuracy: {acc[1]:.4f}')

Validation set results 
Loss: 0.1745 
Accuracy: 0.9560


In [71]:
# Plot losses and accuracy with plotly
h = history.history
epochs = [i+1 for i in range(len(h['loss']))]
cols = plotly.colors.DEFAULT_PLOTLY_COLORS

fig = make_subplots(rows=1, cols=2, subplot_titles=('Training and Validation Accuracy', 'Training and Validation Loss'))

fig.add_trace(go.Scatter(x=epochs, y=h['accuracy'], 
                         line=dict(width=1.5, color=cols[0]),
                         name='Training Accuracy'), 1,1)
fig.add_trace(go.Scatter(x=epochs, y=h['val_accuracy'], 
                         line=dict(width=1.5, color=cols[3]),
                         name='Validation Accuracy'), 1,1)

fig.add_trace(go.Scatter(x=epochs, y=h['loss'], 
                         line=dict(width=1.5, color=cols[0]),
                         name='Training Loss'), 1,2)
fig.add_trace(go.Scatter(x=epochs, y=h['val_loss'], 
                         line=dict(width=1.5, color=cols[3]),
                         name='Validation Loss'), 1,2)

fig.update_layout(height = 400, width=1000)
fig.update_xaxes(title_text='Epochs')
fig.update_yaxes(title_text='Accuracy', row=1,col=1)
fig.update_yaxes(title_text='Loss', row=1,col=2)

fig.show()

In [None]:
model.save('/content/drive/My Drive/Data/HKEX/transformers_model.h5')

## Testing on New Announcements 

Let's see how our classifier does on new data that it has not seen! 

I picked some recent announcements for testing. 

For the testing, it is important that the testing announcements are not in the training set or validation set. The classify_pdf function written below takes this into account.


In [72]:
import requests
!pip install PyPDF2 
import PyPDF2
import io
import warnings

Collecting PyPDF2
[?25l  Downloading https://files.pythonhosted.org/packages/b4/01/68fcc0d43daf4c6bdbc6b33cc3f77bda531c86b174cac56ef0ffdb96faab/PyPDF2-1.26.0.tar.gz (77kB)
[K     |████▎                           | 10kB 27.6MB/s eta 0:00:01[K     |████████▌                       | 20kB 16.0MB/s eta 0:00:01[K     |████████████▊                   | 30kB 13.9MB/s eta 0:00:01[K     |█████████████████               | 40kB 13.3MB/s eta 0:00:01[K     |█████████████████████▏          | 51kB 9.3MB/s eta 0:00:01[K     |█████████████████████████▍      | 61kB 8.6MB/s eta 0:00:01[K     |█████████████████████████████▋  | 71kB 9.7MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 6.5MB/s 
[?25hBuilding wheels for collected packages: PyPDF2
  Building wheel for PyPDF2 (setup.py) ... [?25l[?25hdone
  Created wheel for PyPDF2: filename=PyPDF2-1.26.0-cp36-none-any.whl size=61087 sha256=86f9dd230bcc5d389c66d499d736e14717619ad65caffa6dc57efbc983f456f7
  Stored in directory: /

In [73]:
def classify_pdf(url):
    # if url for testing is in the existing data set, do not go ahead with testing
    if len(df.loc[df['URL'] == url]) != 0:
        return f'{url} in Training or Validation, try another url'
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        response = requests.get(url)
        raw_data = response.content

        pdf_content = io.BytesIO(raw_data)
        pdf_reader = PyPDF2.PdfFileReader(pdf_content)

        text_string = ''

        if pdf_reader.isEncrypted:
            pdf_reader.decrypt("")

        for i in range(pdf_reader.numPages):
            text_string += pdf_reader.getPage(i).extractText()

        text = [text_string]
        seq = tokenizer.texts_to_sequences(text)
        padded = pad_sequences(seq, maxlen=max_len)
        pred = model.predict(padded)
        labels_pred = ['Notifiable Transaction', 'Takeover Offer', 'Connected Transaction', 'Notice of AGM', 'Trading Halt', 'Annual Results']
        normalised = pred/np.sum(pred)*100
        confidence = float(normalised.astype(float).reshape(len(labels_pred),-1)[np.argmax(pred)])
        print(f'Prediction: {labels_pred[np.argmax(pred)]} with {confidence:.2f}% confidence')

In [74]:
classify_pdf('https://www1.hkexnews.hk/listedco/listconews/sehk/2020/0925/2020092501332.pdf')


Prediction: Notifiable Transaction with 99.97% confidence


Great! The classifier was able to correctly identify the announcement as a Notifiable Transaction, which is also known as a Discloseable Transaction. Let's try a few more announcements.

In [75]:
classify_pdf('https://www1.hkexnews.hk/listedco/listconews/sehk/2020/0925/2020092501350.pdf')

Prediction: Annual Results with 99.99% confidence


The classifier is very confident about this result, which turned out to be correct.

In [76]:
classify_pdf('https://www1.hkexnews.hk/listedco/listconews/sehk/2020/0925/2020092501035.pdf')

Prediction: Trading Halt with 100.00% confidence


As for this sample, the classifier is not too confident about it, but it turned out to be correct.