# Introduction
This notebook will run a process to find out the best number of topics in terms of the highest coherence score. The coherence score is a measure of how similar the words that make up the topics are. A higher coherence score indicates that the topics are composed of words that are more similar to each other, hence the topics themselves are more interpretable and meaningful.

In [4]:
import nltk
nltk.download('stopwords')
nltk.download('words')

import pandas as pd
# fine tuning to find the best number of topics
from gensim.models import CoherenceModel
import time
import string
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import words
from gensim import corpora
from gensim.models import CoherenceModel, LdaModel

stop = set(stopwords.words('english'))
stop.add('also')
stop.add('could')


english_words = set(words.words())

exclude = set(string.punctuation) 
lemma = WordNetLemmatizer()

df = pd.read_csv('all_filings_and_sections.csv')[['Section1', 'Section1A', 'Section7']]

def clean(doc: str) -> str:
    """
    This function cleans the input document by removing specific strings, 
    stop words, punctuation, non-English words, and lemmatizing the words.
    
    Parameters:
    doc (str): The input document to be cleaned.
    
    Returns:
    str: The cleaned document.
    """
    # Check if the input is a string
    if not isinstance(doc, str):
        return None
    
    # Remove specific strings from the document
    doc = doc.replace('Item 1.', '')
    doc = doc.replace('Item 1A.', '')
    doc = doc.replace('Item 7.', '')
    
    # Remove stop words from the document
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    
    # Remove punctuation from the document
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    
    # Keep only English words in the document
    english = " ".join(word for word in punc_free.split() if word in english_words)
    
    # Lemmatize the words in the document and remove words with length less than or equal to 3
    normalized = " ".join(lemma.lemmatize(word) for word in english.split() if len(word) > 3)
    
    return normalized

df['Section1_clean'] = df['Section1'].apply(clean)
df['Section1A_clean'] = df['Section1A'].apply(clean)
df['Section7_clean'] = df['Section7'].apply(clean)

df = df.dropna()

documents1, documents2, documents3 = df['Section1_clean'], df['Section1A_clean'], df['Section7_clean']

start_time = time.time()

doc_clean1 = [doc.split() for doc in documents1] 
doc_clean2 = [doc.split() for doc in documents2] 
doc_clean3 = [doc.split() for doc in documents3] 

execution_time = time.time() - start_time
print(f"The code executed in {execution_time} seconds")

doc_clean = doc_clean1 + doc_clean2 + doc_clean3

# Creating the term dictionary of our corpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

start_time = time.time()
model_list, coherence_values = compute_coherence_values(dictionary=dictionary, corpus=doc_term_matrix, texts=doc_clean, start=2, limit=40, step=2)
execution_time = time.time() - start_time
print(f"The code executed in {execution_time} seconds")

# Print the coherence scores
for m, cv in zip(range(2, 40, 2), coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zhaoyiliang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/zhaoyiliang/nltk_data...
[nltk_data]   Package words is already up-to-date!


The code executed in 2.709465980529785 seconds
The code executed in 3632.189742088318 seconds
Num Topics = 2  has Coherence Value of 0.4241
Num Topics = 4  has Coherence Value of 0.4112
Num Topics = 6  has Coherence Value of 0.3922
Num Topics = 8  has Coherence Value of 0.3833
Num Topics = 10  has Coherence Value of 0.3805
Num Topics = 12  has Coherence Value of 0.4038
Num Topics = 14  has Coherence Value of 0.3943
Num Topics = 16  has Coherence Value of 0.3635
Num Topics = 18  has Coherence Value of 0.3828
Num Topics = 20  has Coherence Value of 0.3809
Num Topics = 22  has Coherence Value of 0.3862
Num Topics = 24  has Coherence Value of 0.3572
Num Topics = 26  has Coherence Value of 0.372
Num Topics = 28  has Coherence Value of 0.3866
Num Topics = 30  has Coherence Value of 0.3867
Num Topics = 32  has Coherence Value of 0.3897
Num Topics = 34  has Coherence Value of 0.3754
Num Topics = 36  has Coherence Value of 0.372
Num Topics = 38  has Coherence Value of 0.3855
