In [2]:
import numpy
import pandas as pd
import datetime
import pickle
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from gensim import matutils, models, utils, corpora
from gensim.models.coherencemodel import CoherenceModel
from collections import Counter
import scipy.sparse
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize, pos_tag

In [249]:
# Build Function to Measure LDA & Word2Vec Model Runtimes
import time

_start_time = time.time()

def tic():
    global _start_time 
    _start_time = time.time()

def tac():
    t_sec = round(time.time() - _start_time)
    (t_min, t_sec) = divmod(t_sec,60)
    (t_hour,t_min) = divmod(t_min,60) 
    print('Time passed: {}hour:{}min:{}sec'.format(t_hour,t_min,t_sec))

In [261]:
# Tokenizer Function
def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

In [275]:
# Main Preprocessor for LDA model input features
def LDA_Prep(Pickle):    

    # Read in the cleaned data, before the CountVectorizer step
    data_clean = pd.read_pickle(Pickle)
    data_clean

    # Apply the nouns function to the transcripts to filter only on nouns
    data_nouns = pd.DataFrame(data_clean.lemmatized.apply(nouns))
    data_nouns

    # Re-add the additional stop words since we are recreating the document-term matrix (CURRENTLY NOTHING)
    add_stop_words = []
    add_stop_words = [word for word, count in Counter(words).most_common() if count > int(len(data_nouns) * 0.5)]
    stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

    # Recreate a document-term matrix with only nouns
    cv = CountVectorizer(stop_words=stop_words) #also can change to 'english' for automated stop words
    data_cv = cv.fit_transform(data_nouns.lemmatized)
    data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
    data_dtm.index = data_nouns.index
    data_dtm

    # Create the gensim corpus
    corpus = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtm.transpose()))

    # Create the vocabulary dictionary
    id2word = dict((v, k) for k, v in cv.vocabulary_.items())
    
    return corpus, id2word, data_dtm, data_clean

In [276]:
tic()
corpus1, id2word1, data_dtm1 , df1 = LDA_Prep('Minutes_Clean.pkl')
tac()

Time passed: 0hour:2min:50sec




In [279]:
# We run this to calculate the Perplexity & UMass Topic Coherence score
def scorer(model, corpus):
    perplexity = model.log_perplexity(corpus)
    cmvec = CoherenceModel(model=model, corpus=corpus, coherence='u_mass')
    coherence = cmvec.get_coherence()
    print(f'Perplexity : {round(perplexity,4)}\nUMass Topic Coherence : {round(coherence,4)}')

In [299]:
# We run this to assign each document their respective, highest probability topic model
def topic_mapper(model, corpus, dtm, df):    
    corpus_transformed = model[corpus]
    x = zip(corpus_transformed, dtm.index)
    Y = list(x)
    topic_list = []
    for i in range(len(Y)):
        topic_list.append(max(Y[i][0], key=lambda x:x[1]))

    topic_models = list(zip([a for (a,b) in topic_list],[b for (a,b) in topic_list], dtm.index))
    
    df['topic_model'] = 0
    df['topic_probability'] = 0.1

    for i in range(len(df)):
        df['topic_model'][i] = topic_models[i][0]
        df['topic_probability'][i] = topic_models[i][1]
    return df

In [305]:
# We run this to generate binary signals and pass over the dataframe to backtester ipynb. 
# SIGNAL IS HIGHLY DEPENDENT ON TOPIC MODELS SO PLEASE UPDATE BELOW FUNCTION ACCORDING TO YOUR LDA MODEL RESULTS

def binary_signal(df):
    df['binary'] = np.nan

    df['binary'] = np.where((df['topic_model'] == 0),1,np.nan)
    df['binary'] = np.where((df['topic_model'] == 1),1,df['binary'])
    df['binary'] = np.where((df['topic_model'] == 2),2,df['binary'])
    df['binary'] = np.where((df['topic_model'] == 3),np.nan,df['binary'])

    df['binary'] = np.where((df['topic_probability'] > 0.5),df['binary'],np.nan)
    df['binary'] = df['binary'].ffill()
    df = df[~df.index.duplicated(keep='last')]
    df.to_pickle('signal.pkl')
    
    return df

In [44]:
# Print the top 15 words mentioned in each transcripts
for timestamp, top_words in top_dict.items():
    print(timestamp)
    print(', '.join([word for word, count in top_words[0:14]]))
    print('---')

0
bank, risk, market, supervisor, management, banking, financial, failure, supervisory, institution, role, important, wa, leverage
---
1
payment, new, electronic, banking, product, federal, reserve, service, money, policy, value, consumer, monetary, card
---
2
inflation, policy, rate, run, monetary, year, long, price, objective, growth, strategy, employment, stability, trend
---
3
bank, banking, payment, government, note, electronic, currency, period, market, private, new, state, money, product
---
4
business, small, sector, bank, market, credit, loan, economy, ha, job, percent, year, service, cost
---
5
risk, bank, market, capital, technology, new, regulator, banking, financial, regulatory, management, ha, increasingly, regulation
---
6
debt, hamilton, treasury, security, government, wa, state, market, time, public, ha, new, secretary, united
---
7
wa, percent, mortgage, housing, ha, home, cycle, income, economic, urban, inflation, development, year, low
---
8
growth, year, policy, ec

# Single LDA Model Execution

In [219]:
tic()
ldan = models.LdaModel(corpus=corpus1, num_topics=4, id2word=id2word1, passes=50)
tac()
ldan.print_topics()

2022-12-13 09:17:03,671 : INFO : using symmetric alpha at 0.25
2022-12-13 09:17:03,672 : INFO : using symmetric eta at 0.25
2022-12-13 09:17:03,674 : INFO : using serial LDA version on this node
2022-12-13 09:17:03,680 : INFO : running online (multi-pass) LDA training, 4 topics, 50 passes over the supplied corpus of 1601 documents, updating model once every 1601 documents, evaluating perplexity every 1601 documents, iterating 50x with a convergence threshold of 0.001000
2022-12-13 09:17:06,139 : INFO : -10.554 per-word bound, 1503.5 perplexity estimate based on a held-out corpus of 1601 documents with 1398834 words
2022-12-13 09:17:06,139 : INFO : PROGRESS: pass 0, at document #1601/1601
2022-12-13 09:17:06,884 : INFO : topic #0 (0.250): 0.019*"bank" + 0.014*"rate" + 0.014*"policy" + 0.012*"price" + 0.011*"market" + 0.010*"reserve" + 0.009*"inflation" + 0.009*"economy" + 0.008*"year" + 0.008*"growth"
2022-12-13 09:17:06,885 : INFO : topic #1 (0.250): 0.022*"policy" + 0.021*"rate" + 0.0

2022-12-13 09:17:25,361 : INFO : topic #3 (0.250): 0.020*"community" + 0.014*"mortgage" + 0.013*"loan" + 0.013*"bank" + 0.012*"credit" + 0.012*"business" + 0.012*"market" + 0.011*"reserve" + 0.010*"year" + 0.009*"income"
2022-12-13 09:17:25,361 : INFO : topic diff=0.090283, rho=0.353553
2022-12-13 09:17:27,748 : INFO : -6.718 per-word bound, 105.3 perplexity estimate based on a held-out corpus of 1601 documents with 1398834 words
2022-12-13 09:17:27,748 : INFO : PROGRESS: pass 7, at document #1601/1601
2022-12-13 09:17:28,414 : INFO : topic #0 (0.250): 0.015*"growth" + 0.014*"economy" + 0.012*"market" + 0.012*"rate" + 0.011*"year" + 0.011*"ha" + 0.011*"price" + 0.011*"productivity" + 0.010*"country" + 0.010*"investment"
2022-12-13 09:17:28,415 : INFO : topic #1 (0.250): 0.036*"policy" + 0.034*"inflation" + 0.032*"rate" + 0.018*"price" + 0.017*"market" + 0.014*"economy" + 0.013*"year" + 0.012*"reserve" + 0.012*"term" + 0.011*"bank"
2022-12-13 09:17:28,415 : INFO : topic #2 (0.250): 0.03

2022-12-13 09:17:46,759 : INFO : topic #3 (0.250): 0.024*"community" + 0.015*"mortgage" + 0.015*"loan" + 0.014*"business" + 0.013*"credit" + 0.013*"bank" + 0.012*"reserve" + 0.010*"market" + 0.010*"income" + 0.010*"consumer"
2022-12-13 09:17:46,759 : INFO : topic diff=0.021063, rho=0.258199
2022-12-13 09:17:49,171 : INFO : -6.709 per-word bound, 104.6 perplexity estimate based on a held-out corpus of 1601 documents with 1398834 words
2022-12-13 09:17:49,172 : INFO : PROGRESS: pass 14, at document #1601/1601
2022-12-13 09:17:49,843 : INFO : topic #0 (0.250): 0.015*"growth" + 0.015*"economy" + 0.012*"market" + 0.012*"ha" + 0.012*"year" + 0.011*"rate" + 0.011*"productivity" + 0.011*"country" + 0.011*"price" + 0.010*"investment"
2022-12-13 09:17:49,844 : INFO : topic #1 (0.250): 0.037*"policy" + 0.035*"inflation" + 0.034*"rate" + 0.019*"price" + 0.017*"market" + 0.015*"economy" + 0.013*"year" + 0.012*"term" + 0.012*"reserve" + 0.010*"ha"
2022-12-13 09:17:49,845 : INFO : topic #2 (0.250): 0

2022-12-13 09:18:07,881 : INFO : topic #3 (0.250): 0.026*"community" + 0.016*"mortgage" + 0.016*"loan" + 0.015*"business" + 0.014*"credit" + 0.013*"bank" + 0.012*"reserve" + 0.010*"consumer" + 0.010*"income" + 0.009*"market"
2022-12-13 09:18:07,881 : INFO : topic diff=0.009605, rho=0.213201
2022-12-13 09:18:10,230 : INFO : -6.707 per-word bound, 104.5 perplexity estimate based on a held-out corpus of 1601 documents with 1398834 words
2022-12-13 09:18:10,230 : INFO : PROGRESS: pass 21, at document #1601/1601
2022-12-13 09:18:10,901 : INFO : topic #0 (0.250): 0.015*"economy" + 0.015*"growth" + 0.013*"year" + 0.013*"market" + 0.013*"ha" + 0.011*"country" + 0.011*"productivity" + 0.011*"rate" + 0.010*"price" + 0.010*"investment"
2022-12-13 09:18:10,902 : INFO : topic #1 (0.250): 0.038*"policy" + 0.035*"inflation" + 0.034*"rate" + 0.020*"price" + 0.017*"market" + 0.015*"economy" + 0.013*"year" + 0.012*"term" + 0.012*"reserve" + 0.010*"ha"
2022-12-13 09:18:10,903 : INFO : topic #2 (0.250): 0

2022-12-13 09:18:28,776 : INFO : topic #3 (0.250): 0.027*"community" + 0.016*"mortgage" + 0.016*"loan" + 0.015*"business" + 0.014*"credit" + 0.014*"bank" + 0.013*"reserve" + 0.010*"consumer" + 0.010*"income" + 0.009*"market"
2022-12-13 09:18:28,776 : INFO : topic diff=0.005754, rho=0.185695
2022-12-13 09:18:31,130 : INFO : -6.706 per-word bound, 104.4 perplexity estimate based on a held-out corpus of 1601 documents with 1398834 words
2022-12-13 09:18:31,131 : INFO : PROGRESS: pass 28, at document #1601/1601
2022-12-13 09:18:31,772 : INFO : topic #0 (0.250): 0.016*"economy" + 0.015*"growth" + 0.013*"year" + 0.013*"ha" + 0.013*"market" + 0.011*"country" + 0.011*"productivity" + 0.011*"rate" + 0.011*"investment" + 0.010*"price"
2022-12-13 09:18:31,773 : INFO : topic #1 (0.250): 0.038*"policy" + 0.036*"inflation" + 0.035*"rate" + 0.020*"price" + 0.017*"market" + 0.015*"economy" + 0.013*"year" + 0.012*"term" + 0.012*"reserve" + 0.010*"ha"
2022-12-13 09:18:31,774 : INFO : topic #2 (0.250): 0

2022-12-13 09:18:49,878 : INFO : topic #3 (0.250): 0.027*"community" + 0.016*"mortgage" + 0.016*"loan" + 0.015*"business" + 0.014*"credit" + 0.014*"bank" + 0.013*"reserve" + 0.011*"consumer" + 0.010*"income" + 0.009*"housing"
2022-12-13 09:18:49,878 : INFO : topic diff=0.003893, rho=0.166667
2022-12-13 09:18:52,257 : INFO : -6.706 per-word bound, 104.4 perplexity estimate based on a held-out corpus of 1601 documents with 1398834 words
2022-12-13 09:18:52,258 : INFO : PROGRESS: pass 35, at document #1601/1601
2022-12-13 09:18:52,902 : INFO : topic #0 (0.250): 0.016*"economy" + 0.015*"growth" + 0.013*"year" + 0.013*"ha" + 0.013*"market" + 0.011*"country" + 0.011*"productivity" + 0.011*"rate" + 0.011*"investment" + 0.010*"price"
2022-12-13 09:18:52,902 : INFO : topic #1 (0.250): 0.038*"policy" + 0.036*"inflation" + 0.035*"rate" + 0.020*"price" + 0.017*"market" + 0.015*"economy" + 0.013*"year" + 0.012*"term" + 0.012*"reserve" + 0.010*"ha"
2022-12-13 09:18:52,903 : INFO : topic #2 (0.250): 

2022-12-13 09:19:10,841 : INFO : topic #3 (0.250): 0.027*"community" + 0.016*"mortgage" + 0.016*"loan" + 0.015*"business" + 0.014*"credit" + 0.014*"bank" + 0.013*"reserve" + 0.011*"consumer" + 0.011*"income" + 0.009*"housing"
2022-12-13 09:19:10,841 : INFO : topic diff=0.002793, rho=0.152499
2022-12-13 09:19:13,145 : INFO : -6.705 per-word bound, 104.4 perplexity estimate based on a held-out corpus of 1601 documents with 1398834 words
2022-12-13 09:19:13,145 : INFO : PROGRESS: pass 42, at document #1601/1601
2022-12-13 09:19:13,786 : INFO : topic #0 (0.250): 0.016*"economy" + 0.015*"growth" + 0.013*"year" + 0.013*"ha" + 0.013*"market" + 0.012*"country" + 0.011*"productivity" + 0.011*"rate" + 0.011*"investment" + 0.010*"capital"
2022-12-13 09:19:13,787 : INFO : topic #1 (0.250): 0.038*"policy" + 0.036*"inflation" + 0.035*"rate" + 0.020*"price" + 0.017*"market" + 0.015*"economy" + 0.013*"year" + 0.013*"term" + 0.012*"reserve" + 0.010*"growth"
2022-12-13 09:19:13,788 : INFO : topic #2 (0.

2022-12-13 09:19:31,733 : INFO : topic #3 (0.250): 0.028*"community" + 0.017*"mortgage" + 0.016*"loan" + 0.015*"business" + 0.014*"credit" + 0.014*"bank" + 0.013*"reserve" + 0.011*"consumer" + 0.011*"income" + 0.009*"housing"
2022-12-13 09:19:31,734 : INFO : topic diff=0.002099, rho=0.141421
2022-12-13 09:19:34,051 : INFO : -6.705 per-word bound, 104.3 perplexity estimate based on a held-out corpus of 1601 documents with 1398834 words
2022-12-13 09:19:34,052 : INFO : PROGRESS: pass 49, at document #1601/1601
2022-12-13 09:19:34,697 : INFO : topic #0 (0.250): 0.016*"economy" + 0.015*"growth" + 0.013*"year" + 0.013*"ha" + 0.013*"market" + 0.012*"country" + 0.011*"productivity" + 0.011*"rate" + 0.011*"investment" + 0.010*"capital"
2022-12-13 09:19:34,698 : INFO : topic #1 (0.250): 0.038*"policy" + 0.036*"inflation" + 0.035*"rate" + 0.020*"price" + 0.017*"market" + 0.015*"economy" + 0.013*"year" + 0.013*"term" + 0.012*"reserve" + 0.010*"growth"
2022-12-13 09:19:34,699 : INFO : topic #2 (0.

[(0,
  '0.016*"economy" + 0.015*"growth" + 0.013*"year" + 0.013*"ha" + 0.013*"market" + 0.012*"country" + 0.011*"productivity" + 0.011*"rate" + 0.011*"investment" + 0.010*"capital"'),
 (1,
  '0.038*"policy" + 0.036*"inflation" + 0.035*"rate" + 0.020*"price" + 0.017*"market" + 0.015*"economy" + 0.013*"year" + 0.013*"term" + 0.012*"reserve" + 0.010*"growth"'),
 (2,
  '0.042*"bank" + 0.033*"risk" + 0.023*"market" + 0.016*"capital" + 0.013*"banking" + 0.013*"institution" + 0.011*"firm" + 0.010*"reserve" + 0.010*"credit" + 0.009*"management"'),
 (3,
  '0.028*"community" + 0.017*"mortgage" + 0.016*"loan" + 0.015*"business" + 0.014*"credit" + 0.014*"bank" + 0.013*"reserve" + 0.011*"consumer" + 0.011*"income" + 0.009*"housing"')]

In [220]:
ldan.save("SLDA_Minutes.model")

2022-12-13 09:19:34,732 : INFO : LdaState lifecycle event {'fname_or_handle': 'SLDA_lemmatized_full.model.state', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-12-13T09:19:34.732638', 'gensim': '4.1.2', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'saving'}
2022-12-13 09:19:34,734 : INFO : saved SLDA_lemmatized_full.model.state
2022-12-13 09:19:34,738 : INFO : LdaModel lifecycle event {'fname_or_handle': 'SLDA_lemmatized_full.model', 'separately': "['expElogbeta', 'sstats']", 'sep_limit': 10485760, 'ignore': ['state', 'dispatcher', 'id2word'], 'datetime': '2022-12-13T09:19:34.738639', 'gensim': '4.1.2', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'saving'}
2022-12-13 09:19:34,739 : INFO : storing np array 'expElogbeta' to SLDA_lemmatized_full.model.expElogbeta.npy
2022-12-13 09:19:3

# Ensemble LDA Model Execution

In [310]:
elda = models.EnsembleLda(corpus=corpus1, id2word=id2word1, num_topics=3, num_models=8, passes=10)
elda.print_topics()

2022-12-15 04:48:36,706 : INFO : generating 8 topic models using 1 workers
2022-12-15 04:48:36,706 : INFO : using symmetric alpha at 0.3333333333333333
2022-12-15 04:48:36,707 : INFO : using symmetric eta at 0.3333333333333333
2022-12-15 04:48:36,707 : INFO : using serial LDA version on this node
2022-12-15 04:48:36,709 : INFO : running online LDA training, 3 topics, 10 passes over the supplied corpus of 206 documents, updating every 46000 documents, evaluating every ~206 documents, iterating 50x with a convergence threshold of 0.001000
2022-12-15 04:48:36,709 : INFO : training LDA model using 23 processes
2022-12-15 04:48:54,980 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #206/206, outstanding queue size 1
2022-12-15 04:48:55,407 : INFO : topic #0 (0.333): 0.026*"member" + 0.022*"quarter" + 0.017*"meeting" + 0.015*"period" + 0.014*"condition" + 0.013*"spending" + 0.011*"outlook" + 0.011*"range" + 0.011*"expansion" + 0.010*"month"
2022-12-15 04:48:55,407 : INFO : t

2022-12-15 04:49:01,304 : INFO : topic #2 (0.333): 0.029*"member" + 0.014*"meeting" + 0.014*"spending" + 0.013*"quarter" + 0.013*"condition" + 0.012*"period" + 0.011*"month" + 0.011*"sale" + 0.011*"currency" + 0.010*"action"
2022-12-15 04:49:01,304 : INFO : topic diff=0.106498, rho=0.351299
2022-12-15 04:49:01,718 : INFO : -6.114 per-word bound, 69.3 perplexity estimate based on a held-out corpus of 206 documents with 1517233 words
2022-12-15 04:49:01,729 : INFO : PROGRESS: pass 8, dispatched chunk #0 = documents up to #206/206, outstanding queue size 1
2022-12-15 04:49:02,145 : INFO : topic #0 (0.333): 0.031*"member" + 0.020*"quarter" + 0.018*"meeting" + 0.017*"period" + 0.016*"condition" + 0.016*"spending" + 0.015*"expansion" + 0.012*"month" + 0.012*"outlook" + 0.012*"pressure"
2022-12-15 04:49:02,145 : INFO : topic #1 (0.333): 0.023*"range" + 0.023*"member" + 0.021*"quarter" + 0.017*"period" + 0.014*"meeting" + 0.014*"month" + 0.013*"condition" + 0.013*"currency" + 0.010*"expansion"

2022-12-15 04:49:25,110 : INFO : topic diff=0.194463, rho=0.442677
2022-12-15 04:49:25,507 : INFO : -6.094 per-word bound, 68.3 perplexity estimate based on a held-out corpus of 206 documents with 1517233 words
2022-12-15 04:49:25,518 : INFO : PROGRESS: pass 5, dispatched chunk #0 = documents up to #206/206, outstanding queue size 1
2022-12-15 04:49:25,907 : INFO : topic #0 (0.333): 0.032*"member" + 0.022*"quarter" + 0.019*"period" + 0.017*"meeting" + 0.016*"expansion" + 0.016*"condition" + 0.016*"month" + 0.011*"spending" + 0.011*"pressure" + 0.011*"range"
2022-12-15 04:49:25,908 : INFO : topic #1 (0.333): 0.021*"member" + 0.015*"spending" + 0.015*"quarter" + 0.014*"condition" + 0.014*"period" + 0.013*"participant" + 0.013*"meeting" + 0.011*"outlook" + 0.010*"month" + 0.010*"governor"
2022-12-15 04:49:25,908 : INFO : topic #2 (0.333): 0.027*"member" + 0.019*"currency" + 0.019*"quarter" + 0.018*"range" + 0.016*"meeting" + 0.014*"period" + 0.011*"operation" + 0.011*"condition" + 0.011*"

2022-12-15 04:49:49,426 : INFO : PROGRESS: pass 2, dispatched chunk #0 = documents up to #206/206, outstanding queue size 1
2022-12-15 04:49:49,815 : INFO : topic #0 (0.333): 0.018*"member" + 0.016*"period" + 0.016*"quarter" + 0.015*"condition" + 0.014*"meeting" + 0.014*"participant" + 0.014*"month" + 0.011*"range" + 0.010*"pace" + 0.009*"governor"
2022-12-15 04:49:49,816 : INFO : topic #1 (0.333): 0.029*"member" + 0.021*"quarter" + 0.016*"meeting" + 0.016*"period" + 0.014*"range" + 0.014*"expansion" + 0.014*"currency" + 0.013*"condition" + 0.012*"spending" + 0.011*"month"
2022-12-15 04:49:49,816 : INFO : topic #2 (0.333): 0.031*"member" + 0.019*"quarter" + 0.016*"meeting" + 0.016*"condition" + 0.015*"period" + 0.014*"spending" + 0.013*"month" + 0.012*"expansion" + 0.011*"outlook" + 0.010*"sale"
2022-12-15 04:49:49,816 : INFO : topic diff=0.451253, rho=0.567687
2022-12-15 04:49:50,216 : INFO : -6.116 per-word bound, 69.4 perplexity estimate based on a held-out corpus of 206 documents w

2022-12-15 04:49:55,981 : INFO : using symmetric alpha at 0.3333333333333333
2022-12-15 04:49:55,982 : INFO : using symmetric eta at 0.3333333333333333
2022-12-15 04:49:55,983 : INFO : using serial LDA version on this node
2022-12-15 04:49:55,984 : INFO : running online LDA training, 3 topics, 10 passes over the supplied corpus of 206 documents, updating every 46000 documents, evaluating every ~206 documents, iterating 50x with a convergence threshold of 0.001000
2022-12-15 04:49:55,985 : INFO : training LDA model using 23 processes
2022-12-15 04:50:14,047 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #206/206, outstanding queue size 1
2022-12-15 04:50:14,447 : INFO : topic #0 (0.333): 0.025*"member" + 0.023*"quarter" + 0.015*"period" + 0.015*"month" + 0.014*"meeting" + 0.012*"condition" + 0.012*"spending" + 0.011*"sale" + 0.011*"expansion" + 0.009*"range"
2022-12-15 04:50:14,448 : INFO : topic #1 (0.333): 0.029*"member" + 0.019*"quarter" + 0.016*"meeting" + 0.015*"p

2022-12-15 04:50:20,111 : INFO : topic #2 (0.333): 0.021*"participant" + 0.016*"quarter" + 0.015*"condition" + 0.014*"period" + 0.013*"meeting" + 0.012*"governor" + 0.012*"member" + 0.012*"month" + 0.010*"pace" + 0.010*"spending"
2022-12-15 04:50:20,112 : INFO : topic diff=0.113454, rho=0.351299
2022-12-15 04:50:20,514 : INFO : -6.071 per-word bound, 67.2 perplexity estimate based on a held-out corpus of 206 documents with 1517233 words
2022-12-15 04:50:20,525 : INFO : PROGRESS: pass 8, dispatched chunk #0 = documents up to #206/206, outstanding queue size 1
2022-12-15 04:50:20,919 : INFO : topic #0 (0.333): 0.034*"member" + 0.018*"spending" + 0.016*"quarter" + 0.015*"meeting" + 0.013*"period" + 0.013*"condition" + 0.013*"sale" + 0.012*"outlook" + 0.011*"inventory" + 0.011*"expansion"
2022-12-15 04:50:20,920 : INFO : topic #1 (0.333): 0.029*"member" + 0.022*"quarter" + 0.018*"period" + 0.018*"range" + 0.017*"meeting" + 0.015*"condition" + 0.014*"expansion" + 0.014*"month" + 0.011*"curr

2022-12-15 04:50:43,636 : INFO : topic diff=0.241838, rho=0.442677
2022-12-15 04:50:44,035 : INFO : -6.104 per-word bound, 68.8 perplexity estimate based on a held-out corpus of 206 documents with 1517233 words
2022-12-15 04:50:44,046 : INFO : PROGRESS: pass 5, dispatched chunk #0 = documents up to #206/206, outstanding queue size 1
2022-12-15 04:50:44,450 : INFO : topic #0 (0.333): 0.038*"member" + 0.019*"meeting" + 0.016*"quarter" + 0.016*"spending" + 0.015*"period" + 0.013*"condition" + 0.013*"expansion" + 0.011*"sale" + 0.011*"pressure" + 0.010*"month"
2022-12-15 04:50:44,450 : INFO : topic #1 (0.333): 0.018*"member" + 0.016*"quarter" + 0.015*"condition" + 0.013*"spending" + 0.013*"meeting" + 0.012*"period" + 0.012*"outlook" + 0.011*"participant" + 0.011*"month" + 0.009*"pace"
2022-12-15 04:50:44,450 : INFO : topic #2 (0.333): 0.027*"member" + 0.024*"quarter" + 0.020*"range" + 0.019*"period" + 0.016*"month" + 0.015*"meeting" + 0.015*"condition" + 0.014*"expansion" + 0.013*"currency

2022-12-15 04:51:07,773 : INFO : PROGRESS: pass 2, dispatched chunk #0 = documents up to #206/206, outstanding queue size 1
2022-12-15 04:51:08,155 : INFO : topic #0 (0.333): 0.034*"member" + 0.016*"quarter" + 0.016*"period" + 0.015*"meeting" + 0.014*"spending" + 0.013*"condition" + 0.012*"month" + 0.012*"expansion" + 0.010*"outlook" + 0.010*"sale"
2022-12-15 04:51:08,156 : INFO : topic #1 (0.333): 0.029*"member" + 0.021*"quarter" + 0.018*"meeting" + 0.017*"period" + 0.015*"condition" + 0.014*"expansion" + 0.014*"range" + 0.013*"month" + 0.012*"spending" + 0.011*"currency"
2022-12-15 04:51:08,156 : INFO : topic #2 (0.333): 0.018*"quarter" + 0.016*"participant" + 0.015*"member" + 0.014*"period" + 0.014*"condition" + 0.013*"meeting" + 0.012*"month" + 0.010*"outlook" + 0.010*"spending" + 0.010*"governor"
2022-12-15 04:51:08,156 : INFO : topic diff=0.397085, rho=0.567687
2022-12-15 04:51:08,549 : INFO : -6.116 per-word bound, 69.4 perplexity estimate based on a held-out corpus of 206 docum

2022-12-15 04:51:14,314 : INFO : using symmetric alpha at 0.3333333333333333
2022-12-15 04:51:14,314 : INFO : using symmetric eta at 0.3333333333333333
2022-12-15 04:51:14,315 : INFO : using serial LDA version on this node
2022-12-15 04:51:14,316 : INFO : running online LDA training, 3 topics, 10 passes over the supplied corpus of 206 documents, updating every 46000 documents, evaluating every ~206 documents, iterating 50x with a convergence threshold of 0.001000
2022-12-15 04:51:14,317 : INFO : training LDA model using 23 processes
2022-12-15 04:51:32,442 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #206/206, outstanding queue size 1
2022-12-15 04:51:32,836 : INFO : topic #0 (0.333): 0.033*"member" + 0.022*"quarter" + 0.019*"condition" + 0.015*"meeting" + 0.014*"month" + 0.013*"period" + 0.012*"expansion" + 0.011*"spending" + 0.009*"outlook" + 0.009*"sale"
2022-12-15 04:51:32,837 : INFO : topic #1 (0.333): 0.026*"member" + 0.017*"quarter" + 0.016*"meeting" + 0.015*

2022-12-15 04:51:38,513 : INFO : topic #2 (0.333): 0.019*"participant" + 0.018*"quarter" + 0.017*"period" + 0.014*"condition" + 0.013*"month" + 0.013*"meeting" + 0.012*"member" + 0.011*"pace" + 0.010*"governor" + 0.010*"outlook"
2022-12-15 04:51:38,513 : INFO : topic diff=0.138024, rho=0.351299
2022-12-15 04:51:38,912 : INFO : -6.072 per-word bound, 67.3 perplexity estimate based on a held-out corpus of 206 documents with 1517233 words
2022-12-15 04:51:38,923 : INFO : PROGRESS: pass 8, dispatched chunk #0 = documents up to #206/206, outstanding queue size 1
2022-12-15 04:51:39,316 : INFO : topic #0 (0.333): 0.034*"member" + 0.017*"quarter" + 0.017*"meeting" + 0.016*"condition" + 0.016*"spending" + 0.015*"period" + 0.013*"expansion" + 0.012*"month" + 0.012*"sale" + 0.011*"outlook"
2022-12-15 04:51:39,317 : INFO : topic #1 (0.333): 0.027*"member" + 0.024*"range" + 0.022*"quarter" + 0.018*"currency" + 0.016*"period" + 0.016*"meeting" + 0.013*"month" + 0.013*"expansion" + 0.012*"condition"

2022-12-15 04:52:02,131 : INFO : topic diff=0.190750, rho=0.442677
2022-12-15 04:52:02,536 : INFO : -6.067 per-word bound, 67.0 perplexity estimate based on a held-out corpus of 206 documents with 1517233 words
2022-12-15 04:52:02,546 : INFO : PROGRESS: pass 5, dispatched chunk #0 = documents up to #206/206, outstanding queue size 1
2022-12-15 04:52:02,947 : INFO : topic #0 (0.333): 0.033*"member" + 0.017*"spending" + 0.017*"meeting" + 0.017*"quarter" + 0.015*"condition" + 0.015*"period" + 0.013*"expansion" + 0.012*"outlook" + 0.012*"sale" + 0.011*"month"
2022-12-15 04:52:02,947 : INFO : topic #1 (0.333): 0.028*"member" + 0.023*"quarter" + 0.020*"range" + 0.017*"period" + 0.016*"meeting" + 0.014*"month" + 0.014*"currency" + 0.013*"expansion" + 0.013*"condition" + 0.009*"debt"
2022-12-15 04:52:02,948 : INFO : topic #2 (0.333): 0.024*"participant" + 0.016*"quarter" + 0.015*"condition" + 0.013*"period" + 0.013*"governor" + 0.012*"meeting" + 0.011*"month" + 0.010*"division" + 0.010*"outloo

[(0,
  '0.030*"member" + 0.019*"quarter" + 0.016*"meeting" + 0.016*"period" + 0.014*"condition" + 0.013*"expansion" + 0.013*"spending" + 0.012*"month" + 0.012*"range" + 0.010*"sale"'),
 (1,
  '0.021*"participant" + 0.016*"quarter" + 0.015*"condition" + 0.014*"period" + 0.012*"governor" + 0.012*"meeting" + 0.011*"month" + 0.011*"member" + 0.010*"outlook" + 0.010*"spending"')]

In [311]:
elda.save("ELDA_FEDminutes.model")

2022-12-15 04:52:25,412 : INFO : EnsembleLda lifecycle event {'fname_or_handle': 'ELDA_FEDminutes.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset({'topic_model_class'}), 'datetime': '2022-12-15T04:52:25.412655', 'gensim': '4.1.2', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'saving'}
2022-12-15 04:52:25,412 : INFO : not storing attribute topic_model_class
2022-12-15 04:52:25,415 : INFO : saved ELDA_FEDminutes.model


In [312]:
# Test Load the saved models
elda_load = models.EnsembleLda.load("ELDA_FEDminutes.model")

2022-12-15 04:52:25,428 : INFO : loading EnsembleLda object from ELDA_FEDminutes.model
2022-12-15 04:52:25,434 : INFO : loading classic_model_representation recursively from ELDA_FEDminutes.model.classic_model_representation.* with mmap=None
2022-12-15 04:52:25,435 : INFO : loading state recursively from ELDA_FEDminutes.model.classic_model_representation.state.* with mmap=None
2022-12-15 04:52:25,435 : INFO : setting ignored attribute topic_model_class to None
2022-12-15 04:52:25,436 : INFO : EnsembleLda lifecycle event {'fname': 'ELDA_FEDminutes.model', 'datetime': '2022-12-15T04:52:25.436660', 'gensim': '4.1.2', 'python': '3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'loaded'}


In [313]:
elda_load.print_topics()

2022-12-15 04:52:25,460 : INFO : topic #0 (0.500): 0.030*"member" + 0.019*"quarter" + 0.016*"meeting" + 0.016*"period" + 0.014*"condition" + 0.013*"expansion" + 0.013*"spending" + 0.012*"month" + 0.012*"range" + 0.010*"sale"
2022-12-15 04:52:25,461 : INFO : topic #1 (0.500): 0.021*"participant" + 0.016*"quarter" + 0.015*"condition" + 0.014*"period" + 0.012*"governor" + 0.012*"meeting" + 0.011*"month" + 0.011*"member" + 0.010*"outlook" + 0.010*"spending"


[(0,
  '0.030*"member" + 0.019*"quarter" + 0.016*"meeting" + 0.016*"period" + 0.014*"condition" + 0.013*"expansion" + 0.013*"spending" + 0.012*"month" + 0.012*"range" + 0.010*"sale"'),
 (1,
  '0.021*"participant" + 0.016*"quarter" + 0.015*"condition" + 0.014*"period" + 0.012*"governor" + 0.012*"meeting" + 0.011*"month" + 0.011*"member" + 0.010*"outlook" + 0.010*"spending"')]

## Using topic_mapper & binary_signal function to generate signals

In [314]:
topic_mapper(elda, corpus1, data_dtm1, df1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['topic_model'][i] = topic_models[i][0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['topic_probability'][i] = topic_models[i][1]


Unnamed: 0,text,source,lemmatized,topic_model,topic_probability
1995-02-01,starting on tuesday january at p m ...,Minutes,starting on tuesday january at p m and continu...,0,0.871737
1995-03-28,on tuesday march at a m present...,Minutes,on tuesday march at a m present mr greenspan c...,0,0.974870
1995-05-23,on tuesday may at a m present mr...,Minutes,on tuesday may at a m present mr greenspan cha...,0,0.969855
1995-07-06,on wednesday july at p m and ...,Minutes,on wednesday july at p m and continued on thur...,0,0.965195
1995-08-22,on tuesday august at a m present ...,Minutes,on tuesday august at a m present mr greenspan ...,0,0.995538
...,...,...,...,...,...
2022-05-04,may – on tuesday may at a m and...,Minutes,may – on tuesday may at a m and continued on w...,1,0.999532
2022-06-15,june – on tuesday june at a m a...,Minutes,june – on tuesday june at a m and continued on...,1,0.999545
2022-07-27,july on tuesday july at a m a...,Minutes,july on tuesday july at a m and continued on w...,1,0.999532
2022-09-21,september on tuesday september a...,Minutes,september on tuesday september at p m and cont...,1,0.999539


In [303]:
# We run this to generate binary signals and pass over the dataframe to backtester ipynb. 
# SIGNAL IS HIGHLY DEPENDENT ON TOPIC MODELS SO PLEASE UPDATE BELOW FUNCTION ACCORDING TO YOUR LDA MODEL RESULTS

def binary_signal(df):
    df['binary'] = np.nan

    df['binary'] = np.where((df['topic_model'] == 0),2,np.nan)
    df['binary'] = np.where((df['topic_model'] == 1),1,df['binary'])
#     df['binary'] = np.where((df['topic_model'] == 2),2,df['binary'])
#     df['binary'] = np.where((df['topic_model'] == 3),np.nan,df['binary'])

    df['binary'] = np.where((df['topic_probability'] > 0.5),df['binary'],np.nan)
    df['binary'] = df['binary'].ffill()
    df = df[~df.index.duplicated(keep='last')]
    df.to_pickle('signal.pkl')
    
    return df

In [304]:
binary_signal(data_nouns)

Unnamed: 0,lemmatized,topic_model,topic_probability,binary
1995-02-01,january m mr greenspan chairman mcdonough vice...,1,0.885254,1.0
1995-03-28,march mr greenspan chairman mcdonough vice cha...,1,0.942649,1.0
1995-05-23,tuesday mr greenspan chairman mcdonough vice c...,1,0.960876,1.0
1995-07-06,july m july mr greenspan chairman mcdonough vi...,1,0.957036,1.0
1995-08-22,august mr greenspan chairman mcdonough vice ch...,1,0.990376,1.0
...,...,...,...,...
2022-05-04,tuesday m wednesday m attendance jerome h powe...,0,0.991349,2.0
2022-06-15,june – june m wednesday june m attendance jero...,0,0.988901,2.0
2022-07-27,july july m july m attendance jerome h powell ...,0,0.977105,2.0
2022-09-21,september september m september m attendance j...,0,0.992050,2.0


# Using Scorer function to measure perplexity & UMass topic coherence

In [315]:
scorer(elda,corpus1)

2022-12-15 04:52:26,084 : INFO : -6.137 per-word bound, 70.4 perplexity estimate based on a held-out corpus of 206 documents with 1517233 words


Perplexity : -6.1366
UMass Topic Coherence : -0.0265
