# Analysis of risk narratives in mutual fund prospectuses
---

In [1]:
import sys
# Add the parent directory to the module search path
# (presumably so the local `getdera` package imports without being installed — TODO confirm).
sys.path.append('..')

import multiprocessing
import pandas as pd
import spacy
import tempfile
import warnings

from getdera import dera

from getdera.scrapper import client
from tqdm import tqdm

# NOTE(review): this silences every warning category for the whole kernel session,
# including deprecation notices from pandas/spaCy; narrow the filter if warnings matter.
warnings.filterwarnings('ignore')

In [2]:
# GLOBAL VARIABLES

# Dataset name and date window passed to the getdera download/processing calls
DATASET = "risk"
START_DATE = "01/01/2018"
END_DATE = "30/12/2019"

# Parent folder in which the scratch download directory is created
DIR = tempfile.gettempdir()

# Data dictionary holding the processed tables
DATA = dict()

# Pretrained NLP model
nlp = spacy.load("en_core_web_md")

# Spacy pipeline parameters (forwarded as keyword arguments to nlp.pipe)
PIPE_PARAMS = dict(n_process=multiprocessing.cpu_count(), batch_size=50)

In [3]:
# Extract data from sec.gov

# Column types requested when the TXT table is processed
txt_dtypes = {'document': str, 'txtlen': int}

# Everything is downloaded into a scratch directory that is removed when the block exits
with tempfile.TemporaryDirectory(dir=DIR) as workdir:
    # Step 1: download the raw dataset files into the scratch directory
    client.get_DERA(DATASET, workdir, START_DATE, END_DATE)

    # Step 2: process the SUB table from the scratch directory
    DATA['sub'] = dera.process(workdir, 'sub', START_DATE, END_DATE)

    # Step 3: process the TXT table from the scratch directory
    DATA['txt'] = dera.process(workdir, 'txt', START_DATE, END_DATE, dtype=txt_dtypes)

100%|██████████| 8/8 [00:00<00:00, 116.60it/s]
100%|██████████| 8/8 [00:07<00:00,  1.06it/s]


In [4]:
# Keep only the TXT rows tagged RiskNarrativeTextBlock, indexed by adsh
DATA['risk'] = (
    DATA['txt']
    .query('tag == "RiskNarrativeTextBlock"')
    .set_index('adsh')
)

# Left outer join: every SUB row is kept, matched to its risk narrative on adsh
data = pd.merge(DATA['sub'], DATA['risk'], how='left', on='adsh')

# Drop the data dictionary so its tables can be garbage-collected.
# Note: after this statement the cell cannot be re-run without re-running the extraction cell.
del DATA

In [5]:
# Language Processing Pipeline

# Rows without a narrative are replaced by the literal 'N/A' so each row yields one Doc
texts = data['value'].fillna('N/A').tolist()

# Lazily stream the texts through spaCy with the NER and parser components disabled
doc_stream = nlp.pipe(texts, disable=["ner", "parser"], **PIPE_PARAMS)

# Collect the Docs one at a time behind a progress bar; appending inside the loop
# keeps the Docs processed so far if the run is interrupted
docs = []
for doc in tqdm(doc_stream, total=len(data)):
    docs.append(doc)

 35%|███▌      | 14600/41593 [19:45<36:32, 12.31it/s]  


KeyboardInterrupt: 