# Imports and Loading

In [1]:
# Loading the required libraries
%pip install nbimporter
import nbimporter
import feature_extraction
from feature_extraction import EmpathFeatureExtractor
from feature_extraction import NGramFeatureExtractor
from feature_extraction import EmpathFeatureAnalyzer
import os
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
%store -r selected_categories

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Load documents and labels
# Each entry maps a corpus name to the folder holding its preprocessed posts
# and the binary class label given to every post read from that folder
# (1 for the depression folder, 0 for the breast-cancer folder).
folders = {
    "depression": {"path": "data/preprocessed/preprocessed_depression_posts", "label": 1},
    "breastcancer": {"path": "data/preprocessed/preprocessed_breastcancer_posts", "label": 0},
}
documents, labels = [], []
for category, data in folders.items():
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document/label order identical across machines and re-runs.
    for file_name in sorted(os.listdir(data["path"])):
        file_path = os.path.join(data["path"], file_name)
        # Skip anything that is not a regular file (e.g. a Jupyter
        # ".ipynb_checkpoints" directory), which open() could not read.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            documents.append(file.read())
        # documents and labels stay index-aligned: one label per file read.
        labels.append(data["label"])
print(f"Loaded {len(documents)} documents.")
print(f"Labels: {set(labels)}")

Loaded 293 documents.
Labels: {0, 1}


# Frequency and Predictive Power of N-gram Features

In [None]:
# Extracting n-gram features
ngram_extractor = NGramFeatureExtractor(documents, labels)
ngram_extractor.extract_features()

# compute_frequencies() returns a pair per n-gram type, unpacked here as
# (depression, non-depression) frequency tables.
depression_unigram_freqs, non_depression_unigram_freqs = ngram_extractor.compute_frequencies(
    feature_type="unigram"
)
depression_bigram_freqs, non_depression_bigram_freqs = ngram_extractor.compute_frequencies(
    feature_type="bigram"
)

# Frequency tables paired with the title used for their word cloud, in the
# order in which they are reported.
ngram_freq_tables = [
    (depression_unigram_freqs, "Depression Unigram Word Cloud"),
    (non_depression_unigram_freqs, "Non-Depression Unigram Word Cloud"),
    (depression_bigram_freqs, "Depression Bigram Word Cloud"),
    (non_depression_bigram_freqs, "Non-Depression Bigram Word Cloud"),
]

# Top-100 features of every table, printed as soon as each one is computed.
top_100_per_table = []
for freq_table, _ in ngram_freq_tables:
    top_features = ngram_extractor.get_top_n_features(freq_table, top_n=100)
    print(top_features)
    top_100_per_table.append(top_features)

# Expose the individual results under their own names.
(
    top_100_depression_unigrams,
    top_100_non_depression_unigrams,
    top_100_depression_bigrams,
    top_100_non_depression_bigrams,
) = top_100_per_table

# One word cloud per frequency table.
for freq_table, cloud_title in ngram_freq_tables:
    ngram_extractor.visualize_wordcloud(freq_table, cloud_title)

# Predictive Power of Empath Features

In [3]:
# Analyzing the Empath model
# `documents` and `labels` come from the loading cell; `selected_categories`
# is restored from IPython's store by `%store -r selected_categories` in the
# imports cell, so it must have been stored by an earlier session/notebook.
empath_analyzer = EmpathFeatureAnalyzer(documents, labels, selected_categories)

# Run the pipeline
# The three steps are called in this fixed order; presumably each one relies
# on state produced by the previous call (verify in feature_extraction).
empath_analyzer.extract_empath_features()
empath_analyzer.analyze_correlation()
empath_analyzer.group_correlations_by_subcategory()

# Generate and visualize the summary table
# The returned table is kept in `summary_table` for later inspection;
# visualize_summary_table() is called without arguments, so it presumably
# renders the table held on the analyzer itself (TODO confirm).
summary_table = empath_analyzer.generate_summary_table()
empath_analyzer.visualize_summary_table()


Extracted Empath features with shape: (293, 237)
Removed constant columns: ['articles', 'auxiliary_verbs', 'adverbs', 'conjunctions', 'personal_pronouns', 'impersonal_pronouns', 'negations', 'prepositions', 'verbs', 'nouns', 'adjectives', 'comparatives', 'superlatives', 'modifiers', 'function_words', 'filler_words', 'verb_tense', 'slang', 'jargon', 'formal_language', 'casual_language', 'exclamations', 'contractions', 'word_complexity', 'sentiment_words', 'anxiety', 'hope', 'excitement', 'relief', 'gratitude', 'guilt', 'boredom', 'embarrassment', 'longing', 'nostalgia', 'frustration', 'melancholy', 'illness', 'fitness', 'nutrition', 'ingestion', 'physical_state', 'medicine', 'aging', 'disease', 'hospital', 'recovery', 'dieting', 'mental_health', 'drug_use', 'headache', 'fatigue', 'hormones', 'appetite', 'relationships', 'group_behavior', 'teamwork', 'community', 'peer_pressure', 'leadership', 'parenting', 'mentorship', 'marriage', 'divorce', 'gender_roles', 'social_identity', 'cultural_

# TODO: Predictive Power of LDA Features

Objective: Use Latent Dirichlet Allocation (LDA) to discover hidden topics in the posts, identifying themes associated with depression.

Process:
1. Train LDA models on the text data.
2. Generate topic distributions for each post (the probability of the post belonging to each topic).
3. Select the top 20 topics with the largest proportions in the data.
4. Use t-SNE for dimensionality reduction to visualize topic clusters in 2D space.

Findings:
Topics indicative of depression include:
- Themes such as "Depression," "Broke," "Tired," and "Pain," reflecting suffering, self-preoccupation, and low self-esteem.
- Words associated with disclosure, loneliness, hostility, and interpersonal issues.

Significance: LDA captures latent patterns that are not explicitly defined (unlike LIWC), enabling nuanced analysis of depression-related themes.
