# Imports and Loading

In [14]:
# Loading the required libraries
%pip install nbimporter
import nbimporter
import feature_extraction
from feature_extraction import EmpathFeatureExtractor
from feature_extraction import NGramFeatureExtractor
from feature_extraction import EmpathFeatureAnalyzer
from feature_extraction import LDAFeatureExtractor
import os
from sklearn.manifold import TSNE
from wordcloud import WordCloud
from collections import Counter
from sklearn.manifold import TSNE
from wordcloud import WordCloud
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
%store -r selected_categories

Note: you may need to restart the kernel to use updated packages.


In [10]:
# Load documents and labels
# Each folder holds one preprocessed post per file; every post in a folder
# receives that folder's label.
folders = {
    "depression": {"path": "data/preprocessed_posts/depression", "label": 1},
    "breastcancer": {"path": "data/preprocessed_posts/breastcancer", "label": 0},
}
documents, labels = [], []
for category, data in folders.items():
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document/label order identical across runs and machines.
    for file_name in sorted(os.listdir(data["path"])):
        file_path = os.path.join(data["path"], file_name)
        # Skip hidden entries and sub-directories (e.g. .ipynb_checkpoints,
        # .DS_Store), which are not posts and would make open()/read() fail.
        if file_name.startswith(".") or not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            documents.append(file.read())
        labels.append(data["label"])

# Every document must have exactly one label.
assert len(documents) == len(labels), "documents and labels are misaligned"
print(f"Loaded {len(documents)} documents.")
print(f"Labels: {set(labels)}")

Loaded 1878 documents.
Labels: {0, 1}


In [15]:
# Defining the LDAFeatureAnalyzer class

class LDAFeatureAnalyzer:
    """
    Visualize and summarize a trained LDA topic model.

    Provides per-topic word clouds, a 2D t-SNE projection of the
    document-topic matrix coloured by label, and a table of the top words
    of every topic.
    """

    def __init__(self, lda_model, corpus, topic_matrix, dictionary, labels, num_topics):
        """
        Initialize the LDAFeatureAnalyzer class.

        Parameters:
        lda_model: Trained Gensim LDA model.
        corpus: The Bag-of-Words corpus used for LDA.
        topic_matrix: Document-topic matrix (output of topic_distribution_to_matrix).
        dictionary: Gensim Dictionary object used in LDA training.
        labels: Labels corresponding to the documents.
        num_topics: Number of topics in the LDA model.
        """
        self.lda_model = lda_model
        self.corpus = corpus
        self.topic_matrix = topic_matrix
        self.dictionary = dictionary
        self.labels = labels
        self.num_topics = num_topics

    def get_top_words_per_topic(self, top_n=10):
        """
        Get the top N words for each topic in the LDA model.

        Parameters:
        top_n (int): Number of top words to return for each topic.

        Returns:
        A dictionary where keys are topic IDs and values are lists of top N words.
        """
        return {
            topic_id: [word for word, _ in self.lda_model.show_topic(topic_id, topn=top_n)]
            for topic_id in range(self.num_topics)
        }

    def generate_wordclouds(self, top_n=10):
        """
        Generate and show one word cloud per topic.

        Words are sized by the weight that show_topic reports for them.

        Parameters:
        top_n (int): Number of top words to include in the word cloud.
        """
        for topic_id in range(self.num_topics):
            # One show_topic call per topic yields both the words and their weights.
            word_freqs = dict(self.lda_model.show_topic(topic_id, topn=top_n))
            wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freqs)
            plt.figure(figsize=(8, 4))
            plt.imshow(wordcloud, interpolation="bilinear")
            plt.axis("off")
            plt.title(f"Topic {topic_id} Word Cloud")
            plt.show()

    def perform_tsne(self, perplexity=50, n_iter=500):
        """
        Perform t-SNE to reduce topic matrix to 2D for visualization.

        Parameters:
        perplexity (int): Perplexity parameter for t-SNE.
        n_iter (int): Number of iterations for t-SNE.

        Returns:
        tsne_results: A 2D array of t-SNE coordinates for visualization.
        """
        import inspect

        # scikit-learn renamed TSNE's `n_iter` argument to `max_iter` (the old
        # name was deprecated and later removed). Pick whichever keyword the
        # installed version accepts so this works on old and new releases.
        tsne_params = inspect.signature(TSNE.__init__).parameters
        iter_kwarg = "max_iter" if "max_iter" in tsne_params else "n_iter"
        tsne = TSNE(
            n_components=2,
            perplexity=perplexity,
            random_state=42,  # fixed seed so the projection is reproducible
            **{iter_kwarg: n_iter},
        )
        return tsne.fit_transform(self.topic_matrix)

    def visualize_tsne(self, tsne_results):
        """
        Visualize the t-SNE results as a scatter plot coloured by label.

        Parameters:
        tsne_results: A 2D array of t-SNE coordinates for visualization.
        """
        plt.figure(figsize=(10, 7))
        scatter = plt.scatter(
            tsne_results[:, 0],
            tsne_results[:, 1],
            c=self.labels,
            cmap='viridis',
            s=10,
            alpha=0.7
        )
        plt.colorbar(scatter, label="Labels")
        plt.title("t-SNE Visualization of LDA Topic Distributions")
        plt.xlabel("t-SNE Dimension 1")
        plt.ylabel("t-SNE Dimension 2")
        plt.show()

    def generate_topics_summary(self, top_n=10):
        """
        Generate a summary table of topics, showing top N words per topic.

        Parameters:
        top_n (int): Number of top words to include in the summary.

        Returns:
        A pandas DataFrame with columns "Topic ID" and "Top Words"
        (the top words joined by ", ").
        """
        topic_summaries = [
            {"Topic ID": topic_id, "Top Words": ", ".join(words)}
            for topic_id, words in self.get_top_words_per_topic(top_n=top_n).items()
        ]
        return pd.DataFrame(topic_summaries)

    def run_analysis(self):
        """
        Run the complete analysis pipeline: generate word clouds, perform t-SNE,
        and build the topic summary table.

        Returns:
        The pandas DataFrame produced by generate_topics_summary, so callers can
        display or save it.
        """
        print("Generating Word Clouds...")
        self.generate_wordclouds()

        print("Performing t-SNE...")
        tsne_results = self.perform_tsne()
        self.visualize_tsne(tsne_results)

        print("Generating Topic Summary Table...")
        summary_df = self.generate_topics_summary()
        # Previously the table was built and discarded, leaving callers with None.
        return summary_df


# Frequency and Predictive Power of N-gram Features

In [None]:
# Extracting n-gram features
ngram_extractor = NGramFeatureExtractor(documents, labels)
ngram_extractor.extract_features()

# Per-class frequency tables, first for unigrams and then for bigrams.
(depression_unigram_freqs,
 non_depression_unigram_freqs) = ngram_extractor.compute_frequencies(feature_type="unigram")
(depression_bigram_freqs,
 non_depression_bigram_freqs) = ngram_extractor.compute_frequencies(feature_type="bigram")

# Frequency tables paired with their word-cloud titles, in reporting order.
ngram_frequency_tables = [
    (depression_unigram_freqs, "Depression Unigram Word Cloud"),
    (non_depression_unigram_freqs, "Non-Depression Unigram Word Cloud"),
    (depression_bigram_freqs, "Depression Bigram Word Cloud"),
    (non_depression_bigram_freqs, "Non-Depression Bigram Word Cloud"),
]

# Top 100 features of every table, printed as each one is computed.
top_100_per_table = []
for ngram_freqs, _ in ngram_frequency_tables:
    top_100_features = ngram_extractor.get_top_n_features(ngram_freqs, top_n=100)
    print(top_100_features)
    top_100_per_table.append(top_100_features)

# Keep the individual names available to later cells.
(top_100_depression_unigrams,
 top_100_non_depression_unigrams,
 top_100_depression_bigrams,
 top_100_non_depression_bigrams) = top_100_per_table

# One word cloud per frequency table.
for ngram_freqs, cloud_title in ngram_frequency_tables:
    ngram_extractor.visualize_wordcloud(ngram_freqs, cloud_title)

# Predictive Power of Empath Features

In [3]:
# Analyzing the Empath model
empath_analyzer = EmpathFeatureAnalyzer(documents, labels, selected_categories)

# Run the pipeline stages in order: feature extraction, correlation analysis,
# then grouping of the correlations by subcategory.
for pipeline_stage in (
    "extract_empath_features",
    "analyze_correlation",
    "group_correlations_by_subcategory",
):
    getattr(empath_analyzer, pipeline_stage)()

# Generate and visualize the summary table
summary_table = empath_analyzer.generate_summary_table()
empath_analyzer.visualize_summary_table()


Extracted Empath features with shape: (293, 237)
Removed constant columns: ['articles', 'auxiliary_verbs', 'adverbs', 'conjunctions', 'personal_pronouns', 'impersonal_pronouns', 'negations', 'prepositions', 'verbs', 'nouns', 'adjectives', 'comparatives', 'superlatives', 'modifiers', 'function_words', 'filler_words', 'verb_tense', 'slang', 'jargon', 'formal_language', 'casual_language', 'exclamations', 'contractions', 'word_complexity', 'sentiment_words', 'anxiety', 'hope', 'excitement', 'relief', 'gratitude', 'guilt', 'boredom', 'embarrassment', 'longing', 'nostalgia', 'frustration', 'melancholy', 'illness', 'fitness', 'nutrition', 'ingestion', 'physical_state', 'medicine', 'aging', 'disease', 'hospital', 'recovery', 'dieting', 'mental_health', 'drug_use', 'headache', 'fatigue', 'hormones', 'appetite', 'relationships', 'group_behavior', 'teamwork', 'community', 'peer_pressure', 'leadership', 'parenting', 'mentorship', 'marriage', 'divorce', 'gender_roles', 'social_identity', 'cultural_

# TODO: Predictive Power of LDA Features

Objective: Use Latent Dirichlet Allocation (LDA) to discover hidden topics in the posts, identifying themes associated with depression.

Process:
Train LDA models on the text data.
Generate topic distributions for each post (probability of belonging to each topic).
Select the top 20 topics with the largest proportions in the data.
Use t-SNE for dimensionality reduction to visualize topic clusters in 2D space.

Findings:
Topics indicative of depression include:
Themes like "Depression," "Broke," "Tired," "Pain," reflecting suffering, self-preoccupation, and low self-esteem.
Words associated with disclosure, loneliness, hostility, and interpersonal issues.

Significance: LDA captures latent patterns that are not explicitly defined (unlike LIWC), enabling nuanced analysis of depression-related themes.


In [16]:
# The following are read from a fitted LDAFeatureExtractor.
# NOTE(review): `lda_extractor` is not created anywhere in the visible notebook,
# so this cell raises NameError on a fresh kernel. An earlier cell must build
# and fit an LDAFeatureExtractor and bind it to `lda_extractor` — TODO add it.
lda_model = lda_extractor.lda_model
corpus = lda_extractor.corpus
topic_matrix = lda_extractor.topic_distribution_to_matrix()
dictionary = lda_extractor.dictionary
num_topics = lda_extractor.num_topics

# Initialize the analyzer (`labels` comes from the document-loading cell)
lda_analyzer = LDAFeatureAnalyzer(lda_model, corpus, topic_matrix, dictionary, labels, num_topics)

# Run analysis
lda_summary = lda_analyzer.run_analysis()
# run_analysis() may not return the summary table; build it explicitly in that
# case so the export below never operates on None.
if lda_summary is None:
    lda_summary = lda_analyzer.generate_topics_summary()

# Save the topic summary table if needed
lda_summary.to_csv("lda_topics_summary.csv", index=False)


NameError: name 'lda_extractor' is not defined