In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import time

import warnings
warnings.filterwarnings('ignore')

# Read the raw corpus first and its cleaned counterpart second; both paths are
# relative to the notebook's working directory.
original_corpus, cleaned_corpus = map(
    pd.read_csv,
    ('data/mental_health.csv', 'data/cleaned_mhc.csv'),
)

# Report (rows, columns) for each frame so the effect of cleaning is visible.
print("Shape of Original Corpus", original_corpus.shape)
print("Shape of Cleaned Corpus", cleaned_corpus.shape)

Shape of Original Corpus (27977, 2)
Shape of Cleaned Corpus (23240, 2)


In [20]:
from sklearn.decomposition import TruncatedSVD

# Tokens that display_topics prints in upper case so they stand out in the
# output: distress-related words first, then film-related words.
highlight_terms = [
    'suicidal', 'depression', 'suicide', 'kill', 'myself', 'die', 'died', 'pain',
    'sad', 'help', 'sorry', 'anxiety', 'therapy', 'suffering', 'killing', 'pill',
    'movie', 'film', 'character', 'story', 'actor', 'performance', 'show', 'plot',
    'acting',
]

# Number of SVD components to extract, and how many top-weighted terms to
# print for each component.
n_topics, n_top_words = 3, 200

# TF-IDF vocabulary is capped at 3500 features; the SVD seed is fixed.
vectorizer = TfidfVectorizer(max_features=3500)
svd_model = TruncatedSVD(random_state=42, n_components=n_topics)

def display_topics(model, feature_names, num_top_words, highlight_terms):
    """Print the highest-weighted terms of every component in ``model``.

    For each row of ``model.components_`` the ``num_top_words`` entries with
    the largest weights are looked up in ``feature_names`` and printed in
    descending weight order. Terms that appear in ``highlight_terms`` are
    printed in upper case.
    """
    for idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        ranked = weights.argsort()[::-1][:num_top_words]
        top_words = []
        for feature_idx in ranked:
            term = feature_names[feature_idx]
            top_words.append(term.upper() if term in highlight_terms else term)
        print(f"Topic {idx + 1}: {', '.join(top_words)} \n")

# Build the TF-IDF matrix for the original corpus and fit the SVD on it.
X_original_tfidf = vectorizer.fit_transform(original_corpus['text'])
svd_model.fit(X_original_tfidf)
print("Top topics in Original Corpus:")
# The feature names are read here, before the vectorizer is fitted again below.
display_topics(svd_model, vectorizer.get_feature_names_out(), n_top_words, highlight_terms)

# The same vectorizer and svd_model objects are fitted again on the cleaned
# corpus, so from here on they hold the cleaned-corpus fit rather than the
# original-corpus one.
X_cleaned_tfidf = vectorizer.fit_transform(cleaned_corpus['text'])
svd_model.fit(X_cleaned_tfidf)
print("\nTop topics in Cleaned Corpus:")
display_topics(svd_model, vectorizer.get_feature_names_out(), n_top_words, highlight_terms)

Top topics in Original Corpus:
Topic 1: im, like, want, feel, know, life, ive, me, cant, get, dont, people, even, really, one, would, time, it, think, going, never, go, friends, much, HELP, day, anymore, fucking, years, good, things, someone, anything, make, DIE, way, need, talk, better, end, could, everything, KILL, got, nothing, anyone, see, still, live, family, always, back, something, love, school, hate, every, MYSELF, ill, right, redflag, work, shit, ever, everyone, bad, say, year, this, last, now, tired, care, feeling, happy, friend, take, point, thing, since, person, parents, keep, tell, try, tried, job, thought, getting, told, also, alone, PAIN, long, around, fuck, thats, trying, find, wish, made, stop, do, away, well, done, DEPRESSION, enough, first, else, lot, hard, best, maybe, said, living, SUICIDAL, world, worse, many, give, thoughts, please, br, wanted, reason, that, sure, days, thinking, actually, depressed, ago, felt, mom, scared, hope, makes, months, up, started, them,

In [15]:
from sklearn.decomposition import TruncatedSVD

# Ten SVD components, with the ten highest-weighted terms printed for each.
n_topics = n_top_words = 10

# TF-IDF vocabulary is capped at 3500 features; the SVD seed is fixed.
vectorizer = TfidfVectorizer(max_features=3500)
svd_model = TruncatedSVD(random_state=42, n_components=n_topics)

def display_topics(model, feature_names, num_top_words, highlight_terms=None):
    """Print the highest-weighted terms of every component in ``model``.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Each row of ``components_`` holds one weight per entry of
        ``feature_names``.
    feature_names : sequence of str
        Vocabulary terms, indexed by column of ``components_``.
    num_top_words : int
        How many of the largest-weight terms to print per component.
    highlight_terms : iterable of str, optional
        Terms to print in upper case. Optional so that this definition also
        accepts the four-argument call style used elsewhere in the notebook;
        when omitted the output is unchanged.
    """
    highlighted = set(highlight_terms) if highlight_terms else set()
    for idx, topic in enumerate(model.components_):
        # argsort is ascending; the negative-step slice takes the last
        # num_top_words indices in descending weight order.
        top_indices = topic.argsort()[:-num_top_words - 1:-1]
        top_words = [feature_names[i] for i in top_indices]
        if highlighted:
            top_words = [word.upper() if word in highlighted else word for word in top_words]
        print(f"Topic {idx + 1}: {', '.join(top_words)}")

# Build the TF-IDF matrix for the original corpus and fit the SVD on it.
X_original_tfidf = vectorizer.fit_transform(original_corpus['text'])
svd_model.fit(X_original_tfidf)
print("Top topics in Original Corpus:")
# The feature names are read here, before the vectorizer is fitted again below.
display_topics(svd_model, vectorizer.get_feature_names_out(), n_top_words)

# The same vectorizer and svd_model objects are fitted again on the cleaned
# corpus, so from here on they hold the cleaned-corpus fit rather than the
# original-corpus one.
X_cleaned_tfidf = vectorizer.fit_transform(cleaned_corpus['text'])
svd_model.fit(X_cleaned_tfidf)
print("\nTop topics in Cleaned Corpus:")
display_topics(svd_model, vectorizer.get_feature_names_out(), n_top_words)

Top topics in Original Corpus:
Topic 1: im, like, want, feel, know, life, ive, me, cant, get
Topic 2: br, movie, film, one, the, great, good, story, films, movies
Topic 3: filler, wanna, text, bored, anyone, im, dm, talk, need, someone
Topic 4: im, br, movie, film, gonna, bored, going, tired, the, ive
Topic 5: want, dont, br, die, film, movie, fucking, kill, anymore, cant
Topic 6: dont, talk, like, need, someone, wanna, help, please, anyone, know
Topic 7: like, feel, dont, im, feels, feeling, movie, makes, br, people
Topic 8: fucking, fuck, dont, shit, hate, people, wanna, guys, like, got
Topic 9: dont, know, got, didnt, think, said, ive, told, mom, day
Topic 10: want, day, wanna, school, got, die, guys, bored, im, girl

Top topics in Cleaned Corpus:
Topic 1: im, want, like, feel, know, life, ive, get, cant, dont
Topic 2: movie, film, one, character, great, story, good, scene, well, see
Topic 3: im, movie, film, gonna, ive, going, tired, cant, sorry, bored
Topic 4: want, movie, dont, f

In [34]:
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from IPython.display import display, HTML

def style_df_with_highlights(df, highlight_terms):
    """Return a Styler for ``df`` that shades cells matching a highlight term.

    A cell is shaded when it is a string whose upper-cased value equals the
    upper-cased form of any entry in ``highlight_terms`` (case-insensitive
    match). All other cells are left unstyled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose cells should be checked.
    highlight_terms : iterable of str
        Terms to highlight.

    Returns
    -------
    pandas.io.formats.style.Styler
        Styler with a purple background applied to matching cells.
    """
    # Upper-case the terms once instead of rebuilding the list for every cell.
    highlight_upper = {term.upper() for term in highlight_terms}

    def highlight_words(val):
        if isinstance(val, str) and val.upper() in highlight_upper:
            return 'background-color: #800080'
        return ''

    styler = df.style
    # Styler.applymap was renamed to Styler.map in pandas 2.1 and the old name
    # is deprecated; use the new name when present, else fall back so older
    # pandas versions keep working.
    elementwise = styler.map if hasattr(styler, 'map') else styler.applymap
    return elementwise(highlight_words)

def display_topics_tabular(model, feature_names, num_top_words, highlight_terms, words_per_row=15):
    """Render the top terms of each component as a styled grid.

    For every row of ``model.components_`` the ``num_top_words`` largest-weight
    terms are taken in descending order, terms found in ``highlight_terms`` are
    upper-cased, and the result is laid out ``words_per_row`` terms per row in
    a DataFrame (columns ``Token_1..``, index ``Row_1..``). A short final row
    is padded with empty strings. Once every table is built, each one is
    printed under a "Topic N" heading and displayed through
    ``style_df_with_highlights``.
    """
    tables = []

    for topic_num, weights in enumerate(model.components_, start=1):
        # Highest-weight terms first; upper-case the ones to highlight.
        ranked = weights.argsort()[:-num_top_words - 1:-1]
        tokens = []
        for feature_idx in ranked:
            token = feature_names[feature_idx]
            tokens.append(token.upper() if token in highlight_terms else token)

        # Ceiling division: number of grid rows needed to hold every token.
        num_rows = (len(tokens) + words_per_row - 1) // words_per_row

        grid = []
        for row in range(num_rows):
            chunk = tokens[row * words_per_row:(row + 1) * words_per_row]
            # Pad a short final row so every row has words_per_row cells.
            grid.append(chunk + [''] * (words_per_row - len(chunk)))

        table = pd.DataFrame(
            grid,
            columns=[f'Token_{i+1}' for i in range(words_per_row)]
        )
        # Label the grid rows Row_1, Row_2, ...
        table.index = [f'Row_{i+1}' for i in range(len(table))]

        tables.append((topic_num, table))

    # Second pass: show each topic's heading followed by its styled table.
    for topic_num, table in tables:
        print(f"\nTopic {topic_num}")
        display(style_df_with_highlights(table, highlight_terms))
        print("\n")

# Example usage:
# Tokens to highlight in the topic tables: distress-related words first, then
# film-related words. 'dialogue' was previously misspelled 'dialouge', which
# could never match that token.
highlight_terms = ['suicidal', 'depression', 'suicide', 'kill', 'myself', 'die',
                   'died', 'pain', 'sad', 'help', 'sorry', 'anxiety', 'therapy',
                   'suffering', 'killing', 'pill', 'depressed', 'death', 'cry', 'redflag', 'tired',
                   'movie', 'film', 'character', 'story', 'actor', 'performance', 'show',
                   'plot', 'acting', 'scene', 'watch', 'seen', 'funny', 'dvd', 'drama',
                   'dialogue', 'entertaining', 'hollywood', 'humor', 'star', 'horror', 'series',
                   'genre', 'production', 'cinema', 'effect', 'audience', 'picture', 'actress']

# Three SVD components; 195 terms per topic fills 13 rows at the default
# 15 tokens per row.
n_topics = 3
n_top_words = 195
vectorizer = TfidfVectorizer(max_features=3500)
svd_model = TruncatedSVD(n_components=n_topics, random_state=42)

# For cleaned corpus
X_cleaned_tfidf = vectorizer.fit_transform(cleaned_corpus['text'])
svd_model.fit(X_cleaned_tfidf)
print("\nTop topics in Cleaned Corpus:")
display_topics_tabular(svd_model, vectorizer.get_feature_names_out(), n_top_words, highlight_terms)


Top topics in Cleaned Corpus:

Topic 1


Unnamed: 0,Token_1,Token_2,Token_3,Token_4,Token_5,Token_6,Token_7,Token_8,Token_9,Token_10,Token_11,Token_12,Token_13,Token_14,Token_15
Row_1,im,want,like,feel,know,life,ive,get,cant,dont,friend,time,people,even,one
Row_2,really,year,would,think,thing,day,going,go,HELP,never,make,much,anymore,fucking,thought
Row_3,way,good,someone,anything,DIE,need,end,talk,better,everything,hate,love,nothing,KILL,family
Row_4,live,see,could,always,anyone,feeling,school,care,still,say,something,every,got,back,work
Row_5,REDFLAG,right,ill,shit,keep,everyone,ever,last,take,bad,TIRED,tell,job,point,happy
Row_6,parent,try,person,since,tried,month,getting,alone,PAIN,reason,also,told,find,fuck,long
Row_7,wish,trying,give,lot,around,stop,DEPRESSION,away,done,girl,thats,SUICIDAL,hurt,week,hard
Row_8,made,living,enough,best,else,well,worse,guy,MOVIE,maybe,world,first,said,many,come
Row_9,please,hope,sure,ago,mom,DEPRESSED,thinking,wanted,scared,post,actually,look,today,felt,without
Row_10,lost,night,let,problem,CRY,probably,old,FILM,SAD,cannot,started,left,though,home,past





Topic 2


Unnamed: 0,Token_1,Token_2,Token_3,Token_4,Token_5,Token_6,Token_7,Token_8,Token_9,Token_10,Token_11,Token_12,Token_13,Token_14,Token_15
Row_1,MOVIE,FILM,one,CHARACTER,great,STORY,good,SCENE,well,see,also,first,time,SEEN,WATCH
Row_2,SHOW,best,would,ACTOR,PERFORMANCE,love,many,PLOT,FUNNY,ACTING,made,little,cast,role,saw
Row_3,man,two,make,comedy,watching,part,play,lot,music,director,new,woman,real,quite,action
Row_4,excellent,fun,girl,look,fan,could,come,interesting,young,really,guy,DVD,like,must,beautiful
Row_5,year,people,HORROR,wonderful,played,may,STAR,big,got,war,however,way,say,still,original
Row_6,although,perfect,recommend,old,find,american,liked,song,though,especially,SERIES,ever,bit,watched,classic
Row_7,true,moment,fact,said,line,favorite,enjoyed,another,thing,kid,work,different,found,AUDIENCE,pretty
Row_8,give,amazing,viewer,child,screen,back,shot,definitely,main,remember,CINEMA,EFFECT,john,book,course
Row_9,HOLLYWOOD,game,script,later,seeing,set,nice,highly,version,HUMOR,enjoy,men,video,something,along
Row_10,together,PICTURE,father,ENTERTAINING,DRAMA,kind,three,black,take,much,special,boy,simply,based,name





Topic 3


Unnamed: 0,Token_1,Token_2,Token_3,Token_4,Token_5,Token_6,Token_7,Token_8,Token_9,Token_10,Token_11,Token_12,Token_13,Token_14,Token_15
Row_1,im,MOVIE,FILM,gonna,ive,going,TIRED,cant,SORRY,bored,ill,CHARACTER,fucking,great,done
Row_2,STORY,guy,wanna,school,SCENE,well,scared,girl,good,thats,WATCH,pretty,play,sure,class
Row_3,SEEN,ACTOR,first,got,sick,tonight,probably,PERFORMANCE,FUNNY,fuck,game,kinda,PLOT,man,reddit
Row_4,also,looking,watching,post,cast,ready,ACTING,chat,job,SHOW,music,one,best,saw,new
Row_5,comedy,bit,lol,director,fan,role,old,hour,posting,boy,two,excellent,hey,DVD,whats
Row_6,favorite,goodbye,dad,fun,HORROR,test,yeah,yall,set,interesting,high,grade,STAR,song,gay
Row_7,played,trying,american,little,bye,next,minute,cause,soon,teacher,original,wait,luck,nervous,tomorrow
Row_8,young,war,playing,classic,recommend,guess,although,video,quite,must,screen,watched,ENTERTAINING,wonderful,see
Row_9,lazy,finally,version,book,cool,glad,loser,final,script,shit,ugly,john,highly,viewer,piece
Row_10,action,buy,failing,today,joke,comment,CINEMA,AUDIENCE,age,HOLLYWOOD,house,writing,line,failure,hilarious




