# Topic modelling risk narratives in mutual fund prospectuses
---

In [None]:
import sys
sys.path.append('..')

import multiprocessing
import numpy as np
import pandas as pd
import spacy
import tempfile
import warnings

from getdera import dera
from gensim import corpora
from gensim import models
from tqdm import tqdm

from bokeh.io import output_notebook
from bokeh.layouts import column
from bokeh.palettes import all_palettes
from bokeh.plotting import figure
from bokeh.plotting import show
from bokeh.models import HoverTool
from bokeh.models import CustomJS
from bokeh.models import ColumnDataSource
from bokeh.models import Slider

from getdera.scrapper import client
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Suppress every warning for the rest of the session.
# NOTE(review): presumably to keep notebook output free of library noise;
# this also hides warnings that may matter when debugging.
warnings.filterwarnings('ignore')

# Load bokeh into Jupyter
output_notebook()

In [None]:
# # Download and install spacy pretrained model
# !python -m spacy download en_core_web_sm

## 1. Data prep
---

In [None]:
# GLOBAL VARIABLES

# -- Download settings --------------------------------------------------
DATASET = "risk"
DIR = tempfile.gettempdir()  # System temp directory (parent of the scratch dir)
START_DATE = "01/01/2019"  # From start of 2019
# NOTE(review): the intent is "to end of 2019", but 30/12 is one day short
# of year end -- confirm whether 31/12/2019 was meant.
END_DATE = "30/12/2019"

# -- Columns kept from each table ---------------------------------------
# SUB table
SELECTED_SUB_FIELDS = [
    "name",    # Name of registrant
    "cityba",  # City of registrant's business address
    "pdate",   # Prospectus date
]
# TXT table
SELECTED_TXT_FIELDS = [
    "adsh",   # Accession number
    "tag",    # Standard taxonomy tag
    "value",  # Text
]

# -- Text processing ----------------------------------------------------
# Only tokens whose coarse part-of-speech tag is listed here are kept
PARTS_OF_SPEECH = ["NOUN", "VERB", "ADJ", "ADV"]

DATA = dict()  # Data dictionary

nlp = spacy.load("en_core_web_sm")  # Pretrained NLP model

# Keyword arguments forwarded to nlp.pipe: one process per CPU core,
# documents handed over in batches of 50
PIPE_PARAMS = dict(n_process=multiprocessing.cpu_count(), batch_size=50)

In [None]:
# Extract data from sec.gov

# Everything is downloaded into a scratch directory that is removed as soon
# as the `with` block exits; only the parsed tables are kept in DATA.
with tempfile.TemporaryDirectory(dir=DIR) as tmpdir:
    # Download data and save in the scratch directory
    client.get_DERA(DATASET, tmpdir, START_DATE, END_DATE)

    # SUB table: process, then keep only the selected columns
    sub_table = dera.process(
        tmpdir, DATASET, 'sub', START_DATE, END_DATE,
        dtype={'pdate': str},
    )[SELECTED_SUB_FIELDS]

    # TXT table: process, then keep only the selected columns
    txt_table = dera.process(
        tmpdir, DATASET, 'txt', START_DATE, END_DATE,
        dtype={'document': str, 'txtlen': int},
    )[SELECTED_TXT_FIELDS]

    # Register both tables in the data dictionary
    DATA.update(sub=sub_table, txt=txt_table)

In [None]:
# Keep only the TXT rows tagged RiskNarrativeTextBlock, indexed by
# accession number (adsh)

DATA['risk'] = (
    DATA['txt']
    .loc[lambda frame: frame['tag'] == "RiskNarrativeTextBlock"]
    .set_index('adsh')
)

# LEFT OUTER JOIN of the SUB data with the risk narratives on adsh:
# every SUB row is kept, with missing values where no narrative exists

data = pd.merge(DATA['sub'], DATA['risk'], how='left', on='adsh')

# Parse the prospectus date strings into datetimes

data['pdate'] = data['pdate'].pipe(pd.to_datetime)

## 2. Data processing

In [None]:
# Language Processing Pipeline

# Missing narratives are replaced by the literal string 'N/A' so that every
# row of `data` yields exactly one document.
texts = data['value'].fillna('N/A').tolist()

# One list of lower-cased lemmas per document, restricted to the parts of
# speech listed in PARTS_OF_SPEECH. The "ner" and "parser" pipeline
# components are switched off for this run.
docs = [
    [token.lemma_.lower() for token in parsed if token.pos_ in PARTS_OF_SPEECH]
    for parsed in tqdm(
        nlp.pipe(texts, disable=["ner", "parser"], **PIPE_PARAMS),
        total=len(data),
    )
]

## 2. Topic Modelling and Visualisation
---

In [None]:
# GLOBAL VARIABLES

# Number of LDA topics; used as the default `num_topics` of get_lda_model.
# Assumption: there are 6 types of risk profiles, following
# https://www.citibank.com.hk/english/investment/pdf/IRPQ_ICPQ_Eng.pdf

N_TOPICS = 6

### 2.1 LDA

In [None]:
# Set random seed
# Seeds NumPy's global random number generator.
# NOTE(review): presumably meant to make the LDA fit reproducible; confirm
# the seed actually reaches gensim, since get_lda_model passes no random_state.
np.random.seed(42)


# Get dictionary and corpus

def get_dictionary_corpus(docs, no_below=5, no_above=0.5):
    """Build a gensim dictionary and bag-of-words corpus from token lists.

    Args:
        docs: Sequence of documents, each a list of string tokens. It is
            iterated twice (once to build the dictionary, once to build
            the corpus).
        no_below: Passed to ``Dictionary.filter_extremes`` as ``no_below``.
        no_above: Passed to ``Dictionary.filter_extremes`` as ``no_above``.

    Returns:
        Tuple ``(dictionary, corpus)``: the filtered dictionary and a list
        holding one ``doc2bow`` result per document, in input order.
    """
    vocabulary = corpora.Dictionary(docs)
    # Prune the vocabulary before encoding, so pruned tokens never make it
    # into the bag-of-words vectors
    vocabulary.filter_extremes(no_below=no_below, no_above=no_above)
    bags_of_words = list(map(vocabulary.doc2bow, docs))
    return vocabulary, bags_of_words


# Get LDA model
# Note: Must set minimum_probability to 0 in order to perform
# dimensionality reduction downstream

def get_lda_model(corpus, dictionary, num_topics=N_TOPICS):
    """Fit and return a gensim ``LdaMulticore`` topic model.

    Args:
        corpus: Bag-of-words corpus to train on.
        dictionary: Dictionary passed to the model as ``id2word``.
        num_topics: Number of topics to fit (defaults to ``N_TOPICS``).

    Returns:
        The trained ``LdaMulticore`` model.
    """
    settings = {
        'id2word': dictionary,
        'num_topics': num_topics,
        # Must be 0 so that the per-document topic vectors are not
        # thresholded before the dimensionality reduction downstream
        'minimum_probability': 0,
    }
    return models.LdaMulticore(corpus, **settings)


# Get LDA topics df

def get_lda_topics_df(lda):
    """Tabulate the topics of a fitted LDA model.

    Args:
        lda: Model exposing ``print_topics(n)``, which returns
            ``(topic_id, description)`` pairs where the description is a
            string of weighted terms joined by ``' + '``.

    Returns:
        ``pandas.DataFrame`` with one column per topic, named
        ``'topic <id>'``, and one weighted term per row.
    """
    topics_dict = {}
    for topic_id, description in lda.print_topics(-1):
        # Split on the full ' + ' separator and strip, so the cells do not
        # carry the stray whitespace that a bare split('+') leaves behind.
        terms = [term.strip() for term in description.split(' + ')]
        topics_dict['topic {}'.format(topic_id)] = terms
    return pd.DataFrame(topics_dict)

In [None]:
# Check number of unique tokens

# Build the dictionary (and corpus) here: no earlier cell defines
# `dictionary`, so printing it straight away raises NameError when the
# notebook is run from top to bottom.
dictionary, corpus = get_dictionary_corpus(docs)

print(dictionary)

In [None]:
# Vocabulary and bag-of-words corpus built from the lemmatised documents

dictionary, corpus = get_dictionary_corpus(docs=docs)

# Fit the LDA model with the default number of topics

lda = get_lda_model(corpus=corpus, dictionary=dictionary)

# Show the topics as a table (last expression, so the notebook renders it)

get_lda_topics_df(lda=lda)

In [None]:
# Refactor results from LDA into
# numpy matrix (number of prospectuses by number of topics)

# Fill the matrix by topic id rather than by position: the model may leave
# out topics whose probability falls below its internal floor, which would
# make a plain nested list ragged and misalign the columns. Any topic left
# out for a document keeps probability 0.
results = np.zeros((len(corpus), lda.num_topics))
for row, bow in enumerate(corpus):
    for topic_id, probability in lda[bow]:
        results[row, topic_id] = probability

### 2.2 PCA Dimensionality Reduction

In [None]:
# PCA model projecting the document-topic matrix onto 2 components

pca = PCA(n_components=2)

# 2-D embeddings, one row per document, plus the index of each document's
# highest-probability topic ('hue', used to colour the plot)

embeddings_pca = pd.DataFrame(pca.fit_transform(results), columns=['x', 'y'])
embeddings_pca['hue'] = np.argmax(results, axis=1)

In [None]:
# Bokeh plot

# Data source: one row per document, combining the PCA coordinates with
# the registrant metadata shown in the tooltips

source = ColumnDataSource(data=dict(
    x=embeddings_pca.x,
    y=embeddings_pca.y,
    colors=[all_palettes['Spectral'][11][topic] for topic in embeddings_pca.hue],
    name=data['name'],
    city=data['cityba'],
    pdate=data['pdate'],
    alpha=[0.5] * len(embeddings_pca),
    size=[15] * len(embeddings_pca),
))

# Figure and scatter glyphs

title = 'Topic modelling risk narratives (LDA with PCA dimensionality reduction)'
pca_plot = figure(
    title=title,
    tools=['hover', 'pan', 'wheel_zoom', 'reset'],
    plot_width=800,
    plot_height=800,
)
pca_plot.circle(
    x='x',
    y='y',
    source=source,
    size='size',  # Size according to 'size' attribute in source
    fill_color='colors',  # Color according to 'colors' attribute in source
    alpha='alpha',  # Alpha according to 'alpha' attribute in source
    line_alpha=0,
    line_width=0.01,
)

# Hover tool: configure the tooltips of the hover tool created by `figure`

hover = pca_plot.select({'type': HoverTool})
hover.tooltips = [
    ('index', '$index'),
    ('name', '@name'),
    ('city', '@city'),
    ('prospectus date', '@pdate{%F}'),
]
hover.formatters = {'@pdate': 'datetime'}

show(pca_plot)

### 2.3 t-SNE Dimensionality Reduction

In [None]:
# t-SNE model (fixed random_state, PCA initialisation, perplexity 30)

tsne = TSNE(perplexity=30, init='pca', random_state=0)

# 2-D embeddings, one row per document, plus the index of each document's
# highest-probability topic ('hue', used to colour the plot)

embeddings_tsne = pd.DataFrame(tsne.fit_transform(results), columns=['x', 'y'])
embeddings_tsne['hue'] = np.argmax(results, axis=1)

In [None]:
# Bokeh plot

# Bokeh data source: one row per document, combining the t-SNE coordinates
# with the registrant metadata shown in the tooltips

source = ColumnDataSource(data={
    'x': embeddings_tsne.x,
    'y': embeddings_tsne.y,
    'colors': [all_palettes['Spectral'][11][i] for i in embeddings_tsne.hue],
    'name': data['name'],
    'city': data['cityba'],
    'pdate': data['pdate'],
    'alpha': [0.5]*embeddings_tsne.shape[0],
    'size': [15]*embeddings_tsne.shape[0]})

# Plot

title = 'Topic modelling risk narratives (LDA with t-SNE dimensionality reduction)'
# Request the hover tool by name, as the PCA plot does; the previous
# `hover_tool` variable was never defined and raised NameError.
tsne_plot = figure(plot_width=800,
                   plot_height=800,
                   tools=['hover', 'pan', 'wheel_zoom', 'reset'],
                   title=title)
tsne_plot.circle('x', 'y',
                 size='size',  # Size according to 'size' attribute in source
                 fill_color='colors',  # Color according to 'colors' attribute in source
                 alpha='alpha',  # Alpha according to 'alpha' attribute in source
                 line_alpha=0,
                 line_width=0,
                 source=source)

# Hover tool
# Select the hover tool from tsne_plot itself; selecting it from pca_plot
# left the t-SNE tooltips unconfigured.

hover = tsne_plot.select(dict(type=HoverTool))
hover.tooltips = [('index', '$index'),
                  ('name', '@name'),
                  ('city', '@city'),
                  ('prospectus date', '@pdate{%F}')]
hover.formatters = {'@pdate': 'datetime'}

show(tsne_plot)