# In [None]:
import os
import pandas as pd
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

# Ensure the NLTK resources used by this script are downloaded:
#   - 'stopwords' backs stopwords.words('english') below.
#   - 'punkt' / 'punkt_tab' back word_tokenize(). NLTK 3.9+ looks up
#     'punkt_tab' instead of 'punkt', so both are requested to keep the script
#     working across NLTK versions. On releases that do not know an id,
#     nltk.download() reports the problem and returns False instead of raising.
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# English stop words, kept in a set so membership tests in preprocess() are O(1)
stop_words = set(stopwords.words('english'))

# Folder that holds the input CSV files
data_folder = 'data'

# LDA results keyed by year: year -> (lda_model, corpus, dictionary)
lda_results_by_year = {}

def preprocess(text):
    """Tokenize *text* and return the list of tokens worth keeping.

    The text is lower-cased and split with NLTK's ``word_tokenize``. A token
    is kept only when it is purely alphabetic and is not in the module-level
    ``stop_words`` set. The original token order is preserved.
    """
    lowered = text.lower()
    return [
        token
        for token in word_tokenize(lowered)
        if token.isalpha() and token not in stop_words
    ]

# Number of topics fitted per file; shared by training and printing so the two
# cannot drift apart.
NUM_TOPICS = 5

# Fit one LDA model per CSV file in the data folder. Files are visited in
# sorted order because os.listdir() returns entries in arbitrary order, which
# would otherwise make the printed output (and the order of
# lda_results_by_year) vary between runs and machines.
for filename in sorted(os.listdir(data_folder)):
    if not filename.endswith(".csv"):
        continue

    # The text before the first dot is the year key (e.g. "2020.csv" -> "2020")
    year = filename.split('.')[0]

    # Load the CSV file
    file_path = os.path.join(data_folder, filename)
    df = pd.read_csv(file_path)

    # A file without the expected column cannot be processed; skip it instead
    # of aborting the whole batch with a KeyError.
    if 'Content' not in df.columns:
        print(f"Skipping {filename}: no 'Content' column")
        continue

    # Ensure all values in the 'Content' column are strings (NaN -> '')
    df['Content'] = df['Content'].fillna('').astype(str)

    # Clean and preprocess the 'Content' column into token lists
    df['tokens'] = df['Content'].apply(preprocess)

    # Create a dictionary (token <-> id mapping) and bag-of-words corpus for LDA
    dictionary = corpora.Dictionary(df['tokens'])

    # LdaModel raises ValueError when the vocabulary is empty (e.g. an empty
    # file, or one whose text is entirely stop words / non-alphabetic tokens).
    if len(dictionary) == 0:
        print(f"Skipping {filename}: no tokens left after preprocessing")
        continue

    corpus = [dictionary.doc2bow(text) for text in df['tokens']]

    # Train the LDA model
    lda_model = gensim.models.LdaModel(
        corpus,
        num_topics=NUM_TOPICS,
        id2word=dictionary,
        passes=15,
    )

    # Store the LDA model, corpus and dictionary for this year
    lda_results_by_year[year] = (lda_model, corpus, dictionary)

    # Print the topics for each year
    print(f"Topics for the year {year}:")
    for idx, topic in lda_model.print_topics(num_topics=NUM_TOPICS):
        print(f"Topic {idx}: {topic}")

# Visualize the LDA results for each year.
# The HTML files are written next to the input CSVs, so the output folder is
# taken from data_folder instead of repeating the literal path.
os.makedirs(data_folder, exist_ok=True)

# Prepare and save the LDA visualization for each year
for year, (lda_model, corpus, dictionary) in lda_results_by_year.items():
    # Prepare the LDA visualization
    vis_data = gensimvis.prepare(lda_model, corpus, dictionary)

    # Save the visualization to an HTML file in the data folder
    output_path = os.path.join(data_folder, f'lda_visualization_{year}.html')
    pyLDAvis.save_html(vis_data, output_path)

    # NOTE(review): the value returned by pyLDAvis.display() is discarded here.
    # In a notebook only a cell's last expression is rendered automatically, so
    # this call presumably shows nothing inside the loop - confirm, and open
    # the saved HTML files instead if so.
    pyLDAvis.display(vis_data)
