# 10-K Business Description

_UNDER CONSTRUCTION_

- spaCy
- Syntactic analysis, POS tags, named entity recognition
- Logistic regression, Perceptron, stochastic gradient descent
- Growth and Value stocks


In [None]:
import re
import json
import gzip
import requests
import time
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from wordcloud import WordCloud
import spacy
from tqdm import tqdm
from finds.database.sql import SQL
from finds.database.redisdb import Redis
from finds.database.mongodb import MongoDB
from finds.structured.crsp import CRSP
from finds.structured.pstat import PSTAT
from finds.structured.benchmarks import Benchmarks
from finds.structured.signals import Signals
from finds.backtesting.backtest import BackTest
from finds.busday import BusDay
from finds.unstructured import Unstructured
from finds.unstructured.store import Store
from finds.readers.sectoring import Sectoring
from finds.readers.edgar import Edgar
from finds.misc.show import Show
from finds.plots import plot_date
from secret import credentials, paths
# %matplotlib qt
# Verbosity level handed to the data-access objects constructed in this notebook
VERBOSE = 0
# Display helper for tables, configured once here
show = Show(latex=None, ndigits=4)

In [None]:
# SQL connections, configured from the private `credentials` mapping
sql = SQL(**credentials['sql'], verbose=VERBOSE)
user = SQL(**credentials['user'], verbose=VERBOSE)

# Business-day calendar and Redis handle, both needed to construct CRSP below
bd = BusDay(sql)
rdb = Redis(**credentials['redis'])

# Structured data sets
crsp = CRSP(sql, bd, rdb, verbose=VERBOSE)
pstat = PSTAT(sql, bd, verbose=VERBOSE)
bench = Benchmarks(sql, bd, verbose=VERBOSE)

# Zipped 10-X archive reader, image output folder and pickle scratch store
ed = Edgar(paths['10X'], zipped=True, verbose=VERBOSE)
imgdir = paths['images'] / 'edgar'
store = Store(paths['scratch'], ext='pkl')

# Which section (`item`) of which filing type (`form`) to read
item = 'bus10K'
form = '10-K'

# 10-K Business Descriptions

In [None]:
# Retrieve universe of stocks (20181231 is presumably a YYYYMMDD date -- confirm
# against CRSP.get_universe).
# NOTE(review): the two notes below are carried over from the original cell;
# nothing in the visible code implements them -- TODO confirm whether planned:
#   - 5-year growth and book-to-price, by 2022, 1997, 1972
#   - NYSE top-half market cap
univ = crsp.get_universe(20181231)

In [None]:
# Attach the company name to every permno in the universe; the lookup is built
# with fillna="" (presumably the value returned when no name is found).
lookup = crsp.build_lookup('permno', 'comnam', fillna="")
univ['comnam'] = comnam = lookup(univ.index)

In [None]:
# SIC code from PSTAT, looked up by permno with date=20181231 (fillna=0)
lookup_sic = pstat.build_lookup('lpermno', 'sic', fillna=0)
sic_ = Series(lookup_sic(univ.index, date=20181231), univ.index)

# Keep the existing `siccd` only where the PSTAT value is 0 or 9999;
# everywhere else the PSTAT code replaces it.
pstat_sic_missing = sic_.isin([0, 9999])
univ['siccd'] = univ['siccd'].where(pstat_sic_missing, other=sic_)

In [None]:
# NAICS code from PSTAT, looked up by permno with date=20181231 (fillna=0)
lookup_naics = pstat.build_lookup('lpermno', 'naics', fillna=0)
naics_ = Series(lookup_naics(univ.index, date=20181231), univ.index)

# Keep the existing `naics` only where PSTAT has no NAICS (the lookup's 0
# fill); otherwise take the PSTAT code.  The mask tests the NAICS lookup
# itself: testing the SIC lookup instead lets a missing PSTAT NAICS (0)
# overwrite a valid existing code whenever the PSTAT SIC happens to be present,
# and discards a valid PSTAT NAICS whenever the PSTAT SIC is missing.
# NOTE(review): confirm that keying the NAICS choice on SIC availability was
# not intentional.
univ['naics'] = univ['naics'].where(naics_.eq(0), naics_)

In [None]:
# Retrieve business descriptions text; extract nouns from POS tags
nlp = spacy.load("en_core_web_lg")   # Load a spaCy language pipeline
if 'bus' not in store:   # build and store the processed text only if absent
    rows = DataFrame(ed.open(form=form, item=item))  # open bus10K archive

    # Loop-invariant work, done once instead of once per permno: restrict to
    # filings dated 20190101..20190331 and keep the first one listed for each
    # permno (the same row that `found.iloc[0]` selected before).
    if rows.empty:
        pathnames = {}
    else:
        in_window = rows[rows['date'].between(20190101, 20190331)]
        first = in_window.drop_duplicates('permno', keep='first')
        pathnames = dict(zip(first['permno'], first['pathname']))
    non_alpha = re.compile("[^a-zA-Z]+")

    bus = {}
    restart = 0   # positions in univ.index below this value are skipped
    # tqdm wraps the index itself (not the enumerate iterator) so that the
    # progress bar knows the total number of permnos
    for i, permno in enumerate(tqdm(univ.index)):
        if i < restart or permno not in pathnames:
            continue
        # text is truncated to the pipeline's max_length and lower-cased
        doc = nlp(ed[pathnames[permno]][:nlp.max_length].lower())
        # keep lemmas of NOUN tokens whose lemma is longer than two characters,
        # then strip every non-letter character from each kept lemma
        bus[permno] = " ".join(non_alpha.sub("", token.lemma_)
                               for token in doc
                               if token.pos_ == 'NOUN'
                               and len(token.lemma_) > 2)
    store.dump(bus, 'bus')   # serialize
bus = store.load('bus')
keys = list(bus.keys())       # permnos that have a processed description
corpus = list(bus.values())   # matching documents, in the same order as keys

# Bag-of-words Tf-Idf

In [None]:
# Tf-idf features over the noun corpus, with document-frequency cut-offs
# min_df=10 and max_df=0.5
vectorizer = TfidfVectorizer(min_df=10, max_df=0.5)
X = tfidf = vectorizer.fit_transform(corpus)   # X: features for the classifier

__Retrieve Fama-French sector scheme__

In [None]:
# Populate univ['industry'] with the codes49 industry of each stock
codes = Sectoring(sql, scheme='codes49', fillna="")     # codes49 industry
sic = Sectoring(sql, scheme='sic', fillna=0)

# Rows whose SIC code is 0 or 9999 get their industry from naics instead
replace = univ['siccd'].isin([0, 9999]).values
codes49 = Series(codes[univ['siccd']])
naics_of_replaced = univ.loc[replace, 'naics']
codes49[replace] = codes[sic[naics_of_replaced]]   # naics -> sic -> codes49

# assigned positionally: codes49 carries a default index, not univ's
univ['industry'] = codes49.values

In [None]:
# Legacy sector (codes12 scheme) of each stock; these become the class labels y
codes12 = Sectoring(sql, scheme='codes12', fillna="")  # [5,10,12,17,30,38,48,49]
sic = Sectoring(sql, scheme='sic', fillna=0)    # cross-walk naics to sic

legacy = Series(codes12[univ['siccd']])         # convert sic to legacy sector

# Fall back to naics where the sic code gave no sector ("") or is 0/9999
no_sector = legacy.eq("").values
no_sic = univ['siccd'].isin([0, 9999]).values
replace = no_sector | no_sic
naics_of_replaced = univ.loc[replace, 'naics']
legacy[replace] = codes12[sic[naics_of_replaced]]   # convert naics

univ['legacy'] = legacy.tolist()

# Labels aligned with the documents in the corpus, then the size of each class
y = univ['legacy'].reindex(keys)
class_counts = y.groupby(y).count()
print(class_counts.to_string())

## Logistic Regression

The logistic regression update, a stochastic gradient step with learning rate $\alpha$, is:
- $P(y=1 | x) \leftarrow 1/(1 + e^{-w x})$

- $w \leftarrow w + \alpha ~ x ~(1 - P(y=1|x))$ if $y = 1$

- $w \leftarrow w - \alpha ~ x ~(1 - P(y=0|x))$ if $y = 0$

$\Rightarrow w \leftarrow w + \alpha ~ x ~(y - P(y=1|x))$ where $y \in \{0,~1\}$



## Perceptron

The perceptron update is:

- $\hat{y} \leftarrow \mathrm{sign}(w x)$

- $w \leftarrow w + \alpha x$ if $y = +1$ and $y \ne \hat{y}$
- $w \leftarrow w - \alpha x$ if $y = -1$  and $y \ne \hat{y}$

$\Rightarrow w \leftarrow w + \alpha~x~(y - \hat{y})/2$  where $y \in \{-1,~+1\}$

## Accuracy

- Confusion matrix, precision, recall
- ROC curve and AUC (area under the ROC curve)

In [None]:
## Confusion Matrix
# NOTE(review): `res` (with a fitted classifier at `res.clf`) is not defined in
# any visible cell -- TODO confirm where it is created before running this.
predicted = res.clf.predict(X)
cm = confusion_matrix(y, predicted)
print(cm)

In [None]:
# Draw the confusion matrix of the classifier's predictions on X and save it.
# NOTE(review): `res` is not defined in any visible cell -- TODO confirm.
fig, ax = plt.subplots(figsize=(10, 6))
predicted = res.clf.predict(X)
ConfusionMatrixDisplay.from_predictions(y, predicted, ax=ax)
fig.tight_layout()
outfile = imgdir / 'logistic_cf.jpg'
plt.savefig(outfile)

In [None]:
# Accuracy as a function of the regularization parameter C (log-scaled x-axis).
# NOTE(review): `res` (with Cs, valid_accuracy, train_accuracy) is not defined
# in any visible cell -- TODO confirm where it is created.
fig, ax = plt.subplots(figsize=(5, 3))
for scores in (res.valid_accuracy, res.train_accuracy):   # legend order below
    ax.semilogx(res.Cs, scores)

# Mark and label the C with the highest validation accuracy
argmax = np.argmax(res.valid_accuracy)
best_c = res.Cs[argmax]
best_accuracy = res.valid_accuracy[argmax]
ax.annotate(f"{best_accuracy:.4f}", xy=(best_c, best_accuracy))
ax.plot(best_c, best_accuracy, "o")

ax.set_xlabel("Regularization parameter (C)")
ax.set_ylabel("accuracy")
ax.set_title("Softmax Regression: Accuracy vs Complexity")
ax.legend(['Cross-Validation Accuracy', 'Training Accuracy'])
plt.tight_layout()
plt.savefig(imgdir / 'logistic.jpg')

## Feature importances


In [None]:
# For every class, list the top_n features with the largest coefficients and
# draw them as a word cloud; finally tabulate the words, one column per class.
# NOTE(review): `res` is not defined in any visible cell -- TODO confirm.
top_n = 20
words = {}
feature_names = vectorizer.get_feature_names_out()
for topic, lab in enumerate(res.clf.classes_):
    importance = res.clf.coef_[topic, :]
    # indices of the top_n largest coefficients, largest first (computed once)
    top = importance.argsort()[:-top_n-1:-1]
    words[lab] = [feature_names[i] for i in top]
    freqs = {feature_names[i]: importance[i] for i in top}
    fig, ax = plt.subplots(figsize=(3.5, 3), clear=True)
    wc = WordCloud(height=500, width=500, colormap='cool')
    ax.imshow(wc.generate_from_frequencies(freqs))
    ax.axis("off")
    ax.set_title(lab)
    plt.tight_layout()
    plt.savefig(imgdir / f"logistic_wc{topic}.jpg")
out = DataFrame.from_dict(words, orient='columns')
# `show` was already configured with ndigits=4, latex=None when it was built;
# the former `**SHOW` referred to a name that is never defined (NameError).
show(out, index=False)