In [21]:
import pandas as pd

df = pd.read_csv('with_hours.csv', compression = 'gzip')

df.columns

Index(['upper_quantile_price_change', 'lower_quantile_price_change',
       'percise_date', 'id', 'symbol', 'marketcap', 'industry',
       'yfinance_symbol_id', 'item_types', 'clean_content', 'split_sents',
       'split_sent_length', 'industry_code', 'pct_change_close',
       'pct_change_open', 'category'],
      dtype='object')

first we'll load the csv that contains 13,000 filings since 2016, here's a quick explination of the columns:

upper_quantile_price_change: highest 75% of overnight price movement for the equity  
lower_quantile_price_change: lowest 25% of overnight price movement for the equity  
percise_date: the datetime the filing was published  
id: filing text id in my db @  
symbol: the trading symbol for the equity @  
marketcap: the market cap *  
industry: the industry * @  
yfinance_symbol_id: id of the symbol in my db   
item_types: the item types attached to each filing @  
clean_content: the 'other events' section text only   
split_sents: clean_content split into sentences with the shape [( sentence, filing_text_id ), ...]  
split_sent_length: the number of tuples in each split_sents list   
industry_code: the categorical numerical representation of the industry  
pct_change_close: the percent change since last close the day of or after the filing publish @  
pct_change_open: the percent change difference between last close and the next morning @  
category: categorical representation of pct_change_open  
            ```conditions = [
                (df['pct_change_open'] <= df['lower_quantile_price_change']),
                (df['pct_change_open'] >=  df['upper_quantile_price_change'])
            ]
            values = [1, 2]
            df['category'] = np.select(conditions, values, default=0)```

* = provided by yahoo finance  
@ = isn't used in the model input

In [32]:
from spacy.lang.en import English

nlp = English()
sentencizer = self.nlp.create_pipe("sentencizer")
nlp.add_pipe(self.sentencizer)

def split_text(row):
    return_content = []
    doc = nlp(row['clean_content'])
    for s in doc.sents:
        if len(s.text.split()) > 5:
            return_content.append(( correct_spaces(s.text), row['id'] ))
    return return_conten

ModuleNotFoundError: No module named 'spacy'

after processing we want to use https://github.com/UKPLab/sentence-transformers to create bert embeddings for each sentence in our split_sents tuples. Bert embeddings are vector representation of sentences. Interestingly enough cosine distance between the embeddings also indicate semantic similarity. We can use this property to make clusters of sentences with similar meaning without supervision. The embedding process can take a while, I recommend selecting a subset of the dataframe to start on.

In [22]:
# This is optional
df = df[df['industry'] == 'Biotechnology']
df = df[df['category'] != 0] # this is cheating, we're technically looking ahead

In [26]:
df = df.loc[14:120]

In [27]:
df.shape

(4, 16)

In [28]:
from sentence_transformers import SentenceTransformer

full_sentence_vectors = []
for i, row in df.iterrows():
    full_sentence_vectors.extend(row['split_sents'])
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens') # automatically download this model

embedded = model.encode([x[0] for x in full_sentence_vectors])



In [30]:
len(embedded)

1424

Now that we have our embeddings we're going to want to reduce the dimensionality with umap learn and cluster with hdbscan, we'll then create a column for each cluster that is a boolean representation of if the filing text contains a sentence belonging to that cluster. These params are the ones that have worked best for me but feel free to try new ones out.

In [None]:
umap_model = umap.UMAP(n_neighbors=15,
                    n_components=100,
                    min_dist=0.1,
                    low_memory=True,
                    angular_rp_forest=False,
                    metric='cosine')
umap_model.fit(embedded)

cluster = hdbscan.HDBSCAN(min_cluster_size=50,
                        metric='euclidean',
                        prediction_data=True,
                        cluster_selection_method='eom')
cluster.fit(umap_model.embedding_)

filing_text_ids_cluster_labels_zip = zip([x[1] for x in full_sentence_vectors], cluster.labels_)

before we go further let's take a quick detour and visualize with umap plot, we'll need to reduce our embeddings to 2 dimensions

In [None]:
umap_model = umap.UMAP(n_neighbors=15,
                    n_components=2,
                    min_dist=0.1,
                    low_memory=True,
                    angular_rp_forest=False,
                    metric='cosine')
umap_model.fit(embedded)

The `filing_text_ids_cluster_labels_zip` contains tuples with the values ( dataframe id, cluster id ) for each sentence throughout the filing texts. Now we'll iterate through it and add the cluster columns for our decision tree to make predictions on.

In [None]:

for filing_text_id, cluster_label in filing_text_ids_cluster_labels_zip:
    if f'cluster_id_{cluster_label}' not in df:
        df[f'cluster_id_{cluster_label}'] = 0
    idx = df.index[df['id'] == filing_text_id][0]
    df.loc[idx, f'cluster_id_{cluster_label}'] = 1
    
y = df['category'].values # create our target

x = df.drop(columns=['id', 'pct_change_open', 'clean_content', 'category', 'symbol', 'industry', 'split_sents', 'item_types', 'cluster_id_-1', 'pct_change_close'])



Now we're ready to train our decision tree. First we'll split into training and testing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.02, random_state=42)

model = RandomForestClassifier(n_estimators=1250,
                                min_samples_split=2,
                                min_samples_leaf=2,
                                max_features= 'auto',
                                max_depth=None,
                                bootstrap=True)
model.fit(x_train, y_train)