In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import time

import warnings
warnings.filterwarnings('ignore')

# Load both corpus CSVs; the paths are relative to the notebook's working directory.
# NOTE(review): cleaned_mhc.csv is presumably a preprocessed version of
# mental_health.csv -- confirm its provenance.
original_corpus = pd.read_csv('data/mental_health.csv')
cleaned_corpus = pd.read_csv('data/cleaned_mhc.csv')

# Print the (rows, columns) shape of each frame so the two can be compared.
print("Shape of Original Corpus", original_corpus.shape)
print("Shape of Cleaned Corpus", cleaned_corpus.shape)

ModuleNotFoundError: No module named 'pandas'

In [None]:
from sklearn.decomposition import TruncatedSVD

# Words that display_topics prints in upper case so they stand out in the
# long per-topic listings below.
highlight_terms = [
    # distress-related vocabulary
    'suicidal', 'depression', 'suicide', 'kill', 'myself', 'die', 'died', 'pain',
    'sad', 'help', 'sorry', 'anxiety', 'therapy', 'suffering', 'killing', 'pill',
    # film-review vocabulary
    'movie', 'film', 'character', 'story', 'actor', 'performance', 'show', 'plot',
    'acting',
]

# Number of SVD components to extract, and how many terms to list for each.
n_topics, n_top_words = 3, 200

# Seeded decomposition plus a TF-IDF vocabulary capped at 3500 features.
svd_model = TruncatedSVD(n_components=n_topics, random_state=42)
vectorizer = TfidfVectorizer(max_features=3500)

def display_topics(model, feature_names, num_top_words, highlight_terms):
    """Print the highest-weighted terms of every component of a fitted model.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight vectors that support ``argsort()``.
        feature_names: Indexable collection mapping a feature index to its term.
        num_top_words: How many of the largest-weight terms to list per topic.
        highlight_terms: Terms that are printed in upper case when they appear.

    Each topic is printed on its own line (numbered from 1) and is followed
    by a blank line.
    """
    for topic_no, weights in enumerate(model.components_, start=1):
        # Indices ordered from largest to smallest weight, cut to the top N.
        ranked_indices = weights.argsort()[::-1][:num_top_words]

        labels = []
        for feature_idx in ranked_indices:
            token = feature_names[feature_idx]
            labels.append(token.upper() if token in highlight_terms else token)

        print(f"Topic {topic_no}: {', '.join(labels)} \n")

# Fit the shared vectorizer and SVD model on the original corpus, then print
# its topics. The feature names are read right after fitting because the same
# vectorizer object is re-fitted below.
# NOTE(review): assumes the 'text' column has no missing values -- confirm,
# since TfidfVectorizer is expected to reject NaN documents.
X_original_tfidf = vectorizer.fit_transform(original_corpus['text'])
svd_model.fit(X_original_tfidf)
print("Top topics in Original Corpus:")
display_topics(svd_model, vectorizer.get_feature_names_out(), n_top_words, highlight_terms)

# Re-fit the SAME vectorizer and svd_model objects on the cleaned corpus.
# After this point both objects hold the cleaned-corpus fit only; the
# original-corpus vocabulary and components are overwritten.
X_cleaned_tfidf = vectorizer.fit_transform(cleaned_corpus['text'])
svd_model.fit(X_cleaned_tfidf)
print("\nTop topics in Cleaned Corpus:")
display_topics(svd_model, vectorizer.get_feature_names_out(), n_top_words, highlight_terms)

In [None]:
from sklearn.decomposition import TruncatedSVD

# Second pass: ten components with the ten strongest terms listed for each.
n_topics, n_top_words = 10, 10

# Fresh, unfitted estimators: a seeded truncated SVD and a TF-IDF vocabulary
# capped at 3500 features.
svd_model = TruncatedSVD(n_components=n_topics, random_state=42)
vectorizer = TfidfVectorizer(max_features=3500)

def display_topics(model, feature_names, num_top_words, highlight_terms=None):
    """Print the highest-weighted terms of every component, one line per topic.

    This notebook defines ``display_topics`` more than once, and the later
    definition replaces the earlier one in the kernel. ``highlight_terms`` is
    therefore accepted as an optional fourth argument so that four-argument
    calls written against the other definition still work after this cell runs.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight vectors that support ``argsort()``.
        feature_names: Indexable collection mapping a feature index to its term.
        num_top_words: How many of the largest-weight terms to list per topic.
        highlight_terms: Optional iterable of terms to print in upper case.
            When omitted or empty, terms are printed unchanged.
    """
    # Build the lookup once rather than scanning the list for every word.
    highlighted = frozenset(highlight_terms or ())

    for idx, topic in enumerate(model.components_):
        # argsort is ascending, so the reversed tail slice yields the top N
        # indices from largest to smallest weight.
        top_words = [feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]
        if highlighted:
            top_words = [word.upper() if word in highlighted else word for word in top_words]
        print(f"Topic {idx + 1}: {', '.join(top_words)}")

# Fit the shared vectorizer and SVD model on the original corpus, then print
# the top terms of each component. The feature names are read right after
# fitting because the same vectorizer object is re-fitted below.
# NOTE(review): assumes the 'text' column has no missing values -- confirm,
# since TfidfVectorizer is expected to reject NaN documents.
X_original_tfidf = vectorizer.fit_transform(original_corpus['text'])
svd_model.fit(X_original_tfidf)
print("Top topics in Original Corpus:")
display_topics(svd_model, vectorizer.get_feature_names_out(), n_top_words)

# Re-fit the SAME vectorizer and svd_model objects on the cleaned corpus.
# After this point both objects hold the cleaned-corpus fit only.
X_cleaned_tfidf = vectorizer.fit_transform(cleaned_corpus['text'])
svd_model.fit(X_cleaned_tfidf)
print("\nTop topics in Cleaned Corpus:")
display_topics(svd_model, vectorizer.get_feature_names_out(), n_top_words)

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from IPython.display import display, HTML

def style_df_with_highlights(df, highlight_terms):
    """Return a Styler for ``df`` that shades cells matching a highlight term.

    Matching is case-insensitive: a cell is shaded when its upper-cased text
    equals the upper-cased form of any entry in ``highlight_terms``. Cells
    that are not strings, or that match nothing, receive no styling.

    Args:
        df: DataFrame whose cells are inspected individually.
        highlight_terms: Iterable of strings to highlight.

    Returns:
        The object produced by applying the element-wise style function to
        ``df.style``.
    """
    # Normalise the terms once; the style function runs for every cell.
    highlighted_upper = {term.upper() for term in highlight_terms}

    def highlight_words(val):
        if isinstance(val, str) and val.upper() in highlighted_upper:
            return 'background-color: #800080'
        return ''

    styler = df.style
    # pandas 2.1 renamed Styler.applymap to Styler.map and deprecated the old
    # name; prefer the new one and fall back for older pandas releases.
    apply_elementwise = getattr(styler, 'map', None) or styler.applymap
    return apply_elementwise(highlight_words)

def display_topics_tabular(model, feature_names, num_top_words, highlight_terms, words_per_row=15):
    """Display the top terms of each topic as a styled table, one table per topic.

    For every component of ``model`` the ``num_top_words`` highest-weighted
    terms are laid out left-to-right, top-to-bottom in a grid that is
    ``words_per_row`` columns wide. Terms found in ``highlight_terms`` are
    upper-cased, and the grid is passed through ``style_df_with_highlights``
    before being displayed.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight vectors that support ``argsort()``.
        feature_names: Indexable collection mapping a feature index to its term.
        num_top_words: How many of the largest-weight terms to show per topic.
        highlight_terms: Iterable of terms to emphasise.
        words_per_row: Number of columns in each table; must be at least 1.

    Raises:
        ValueError: If ``words_per_row`` is smaller than 1.
    """
    # Fail early with a clear message: zero would otherwise surface as a
    # ZeroDivisionError, and negative values would silently yield empty tables.
    if words_per_row < 1:
        raise ValueError(f"words_per_row must be at least 1, got {words_per_row}")

    # Both of these are the same for every topic, so build them once.
    highlighted = frozenset(highlight_terms)
    column_labels = [f'Token_{i+1}' for i in range(words_per_row)]

    topic_dfs = []
    for idx, topic in enumerate(model.components_):
        # argsort is ascending, so the reversed tail slice yields the top N
        # indices from largest to smallest weight.
        top_words = [feature_names[i] for i in topic.argsort()[:-num_top_words - 1:-1]]
        # Upper-case the highlighted terms so they also stand out as plain text.
        top_words = [word.upper() if word in highlighted else word for word in top_words]

        # Split the ranked words into fixed-width rows; the last row is padded
        # with empty strings so every row has the same number of columns.
        rows = []
        for start_idx in range(0, len(top_words), words_per_row):
            row_words = top_words[start_idx:start_idx + words_per_row]
            row_words.extend([''] * (words_per_row - len(row_words)))
            rows.append(row_words)

        df = pd.DataFrame(rows, columns=column_labels)

        # Label the rows Row_1, Row_2, ... instead of the default integer index.
        df.index = [f'Row_{i+1}' for i in range(len(df))]

        topic_dfs.append((idx + 1, df))

    # Render every topic's table only after all of them have been built.
    for topic_num, df in topic_dfs:
        print(f"\nTopic {topic_num}")
        display(style_df_with_highlights(df, highlight_terms))
        print("\n")

# Example usage: show the strongest terms of the cleaned corpus as styled tables.
# Terms to emphasise: distress-related vocabulary first, then film-review
# vocabulary. ('dialogue' was previously misspelled 'dialouge', so it could
# never match a correctly spelled token.)
highlight_terms = ['suicidal', 'depression', 'suicide', 'kill', 'myself', 'die',
                   'died', 'pain', 'sad', 'help', 'sorry', 'anxiety', 'therapy',
                   'suffering', 'killing', 'pill', 'depressed', 'death', 'cry', 'redflag', 'tired',
                   'movie', 'film', 'character', 'story', 'actor', 'performance', 'show',
                   'plot', 'acting', 'scene', 'watch', 'seen', 'funny', 'dvd', 'drama',
                   'dialogue', 'entertaining', 'hollywood', 'humor', 'star', 'horror', 'series',
                   'genre', 'production', 'cinema', 'effect', 'audience', 'picture', 'actress']

n_topics = 3
# 195 = 13 * 15, so with the default words_per_row=15 every table row is full.
n_top_words = 195
vectorizer = TfidfVectorizer(max_features=3500)
svd_model = TruncatedSVD(n_components=n_topics, random_state=42)

# For cleaned corpus
X_cleaned_tfidf = vectorizer.fit_transform(cleaned_corpus['text'])
svd_model.fit(X_cleaned_tfidf)
print("\nTop topics in Cleaned Corpus:")
display_topics_tabular(svd_model, vectorizer.get_feature_names_out(), n_top_words, highlight_terms)