# Imports

In [None]:
import pandas as pd
import nltk
import re
import gensim
import matplotlib.pyplot as plt 
import seaborn as sns
import pyLDAvis.gensim 
import numpy as np

# Part 1: Inspect Data for Missing or Duplicated Data

In [None]:
# df = pd.read_csv("data/stocks_raw_454_posts.csv")

# # check for missing values
# print("missing values per column:\\n", df.isnull().sum())
# # drop rows where 'Text' is empty
# df = df[df["Text"].str.strip() != ""]
# # check for duplicates (based on title + text)
# df.drop_duplicates(subset=["Title", "Text"], inplace=True)

# # save cleaned data
# df.to_csv("data/stocks_cleaned.csv", index=False)
# print(f"cleaned dataset saved: {len(df)} posts")

# Combine The Datasets Into a File

In [None]:
# Raw scrape exports to merge; add more paths here to combine several pulls.
csv_files = ["data/stocks_raw_879_posts.csv"]

# Load every export, stack them with a fresh 0..n-1 index, and discard posts
# whose title and body are both repeated.
df_list = list(map(pd.read_csv, csv_files))
df = pd.concat(df_list, ignore_index=True).drop_duplicates(subset=["Title", "Text"])

# Persist the merged result and report how many posts are left.
df.to_csv("data/stocks_combined.csv", index=False)
print(f"combined dataset saved: {len(df)} posts")

# Part 2: Preprocessing

In [None]:
# Fetch the NLTK data used by the preprocessing step: the stop-word lists and
# the Punkt tokenizer models behind word_tokenize. Recent NLTK releases load
# the tokenizer from the "punkt_tab" resource instead of "punkt", so both are
# downloaded to keep the notebook working across NLTK versions.
for resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(resource)

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Reload the combined dataset from disk (the file written by the combining
# step above).
df = pd.read_csv("data/stocks_combined.csv")

# preprocessing function
def preprocess_text(text):
    """Normalise one post body into a space-joined string of kept tokens.

    Steps: lower-case, strip URLs, drop every character that is not a
    lowercase ASCII letter or whitespace, tokenize with NLTK's
    ``word_tokenize``, and remove English stop words.

    Args:
        text: The raw post text, or a missing value (``None``/NaN).

    Returns:
        The cleaned tokens joined by single spaces; ``""`` when ``text`` is
        missing.
    """
    if pd.isnull(text):  # handle missing values
        return ""

    text = text.lower()
    # Single backslashes inside the raw strings: `\S` / `\s` must reach the
    # regex engine as character classes. With a doubled backslash the URL
    # pattern never matches and the second pattern deletes all whitespace.
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)

    # Load the stop-word list once per call and use a set for O(1) lookups,
    # instead of re-reading the list for every token.
    stop_words = set(stopwords.words('english'))
    return " ".join(word for word in tokens if word not in stop_words)

# Apply preprocessing to the "Text" column. Missing bodies are replaced with
# an empty string *before* the cast to str; casting first would turn NaN into
# the literal text "nan", which would then survive cleaning as a real token.
df["Cleaned_Text"] = df["Text"].fillna("").astype(str).apply(preprocess_text)

# Save the dataset with the new "Cleaned_Text" column alongside the originals.
df.to_csv("data/stocks_preprocessed.csv", index=False)
print(f"preprocessing complete! cleaned dataset saved: {len(df)} posts")

# Part 3: Model

In [None]:
from gensim import corpora
from gensim.models import LdaModel

# Split each cleaned post back into its list of tokens.
df["Tokenized"] = df["Cleaned_Text"].map(lambda cleaned: cleaned.split())

# Build the gensim vocabulary, then encode every post with doc2bow.
dictionary = corpora.Dictionary(df["Tokenized"])
corpus = [dictionary.doc2bow(tokens) for tokens in df["Tokenized"]]

# Train the LDA model; change num_topics to get a different number of topics.
num_topics = 10
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    alpha='auto',
    eta='auto',
    passes=10,
)

# Print each topic id together with the summary string gensim returns for it.
for topic_id, topic_terms in lda_model.print_topics():
    print(f"💡 Topic {topic_id}: {topic_terms}")

# Part 4: Visualisation

In [None]:
# # visualize w/ pyLDAvis
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
# pyLDAvis.display(vis)

# Get The Topic Distribution For Each Document

In [None]:
def get_topic_distribution(lda_model, corpus):
    """Return a documents-by-topics table of topic probabilities.

    Args:
        lda_model: Trained model exposing ``num_topics`` and
            ``get_document_topics(doc, minimum_probability=...)``, which
            yields ``(topic_id, probability)`` pairs.
        corpus: Iterable of documents in the form the model accepts.

    Returns:
        A ``pandas.DataFrame`` with one row per document and one column per
        topic, named ``"Topic 0"`` .. ``"Topic {num_topics - 1}"``.
    """
    num_topics = lda_model.num_topics
    rows = []
    for doc in corpus:
        # Place each probability by its topic id rather than by position:
        # the model may leave out topics whose probability is negligible,
        # which would otherwise shift values into the wrong columns. Topics
        # that are not reported stay at 0.0.
        row = [0.0] * num_topics
        for topic_id, prob in lda_model.get_document_topics(doc, minimum_probability=0):
            row[topic_id] = float(prob)
        rows.append(row)

    return pd.DataFrame(rows, columns=[f"Topic {i}" for i in range(num_topics)])

# Table of topic probabilities: one row per document, one column per topic.
topic_df = get_topic_distribution(lda_model, corpus)

# Heatmap of that table, drawn on a 10x6 figure.
plt.figure(figsize=(10, 6))
heatmap_ax = sns.heatmap(topic_df, cmap="coolwarm", annot=False, cbar=True)
heatmap_ax.set_title("Topic Distribution Across Documents")
heatmap_ax.set_xlabel("Topics")
heatmap_ax.set_ylabel("Documents")
plt.show()

# Frequency Distribution of Word Counts in Documents

In [None]:
# import matplotlib.colors as mcolors

# cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]

# fig, axes = plt.subplots(2, 2, figsize=(16, 14), dpi=160, sharex=True, sharey=True)

# for i, ax in enumerate(axes.flatten()):
#     df_dominant_topic_sub = df_dominant_topic.loc[df_dominant_topic.Dominant_Topic == i, :]
    
#     # Check that 'Text' is a string and then calculate document lengths
#     doc_lens = [len(d.split()) if isinstance(d, str) else 0 for d in df_dominant_topic_sub.Text]
    
#     ax.hist(doc_lens, bins=1000, color=cols[i])
#     ax.tick_params(axis='y', labelcolor=cols[i], color=cols[i])
    
#     # Seaborn warning fix: update `shade=False` to `fill=False`
#     sns.kdeplot(doc_lens, color="black", fill=False, ax=ax.twinx())
    
#     ax.set(xlim=(0, 1000), xlabel='Document Word Count')
#     ax.set_ylabel('Number of Documents', color=cols[i])
#     ax.set_title('Topic: ' + str(i), fontdict=dict(size=16, color=cols[i]))

# fig.tight_layout()
# fig.subplots_adjust(top=0.90)
# plt.xticks(np.linspace(0, 1000, 9))
# fig.suptitle('Distribution of Document Word Counts by Dominant Topic', fontsize=22)
# plt.show()
