In [1]:
from collections import Counter, defaultdict
from itertools import chain

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

import nltk
from nltk import pos_tag, ne_chunk, Tree
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer

from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.cluster import KMeans



  from .autonotebook import tqdm as notebook_tqdm


## Data Preparation

In [2]:
# Load the semicolon-delimited, Latin-9 encoded export and keep only the three
# columns this notebook works with.
# NOTE(review): `parse_dates` is requested for `date`, yet the recorded `info()`
# output below still reports `date` as `object` — confirm the date format.
columns_of_interest = ['date', 'author', 'post']
bipolar_df = pd.read_csv(
  '../data/RedditBipolar.csv',
  encoding='latin9',
  delimiter=';',
  parse_dates=['date'],
)[columns_of_interest]

In [3]:
# Quick sanity check of the loaded frame: dimensions, number of distinct posts,
# number of distinct authors, then the per-column summary.
print(bipolar_df.shape)
print(bipolar_df['post'].nunique())
print(bipolar_df['author'].nunique())
# `DataFrame.info()` prints its report itself and returns None, so it must not be
# wrapped in `print()` — doing so appends a stray "None" line to the output.
bipolar_df.info()

(1692, 3)
1692
1618
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1692 entries, 0 to 1691
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    1692 non-null   object
 1   author  1692 non-null   object
 2   post    1692 non-null   object
dtypes: object(3)
memory usage: 39.8+ KB
None


## Text Preprocessing

### Tokenization

In [None]:
# Tokenize every post with nltk, keeping the original casing of each token
# (the cased tokens are what the POS-tagging step consumes).
bipolar_df['post_tokens_uppercase'] = bipolar_df['post'].apply(nltk.word_tokenize)

### Stop word removal

In [None]:
# NLTK's English stop words plus corpus-specific filler terms.
stop_words = stopwords.words('english')
stop_words = stop_words + ['make', 'feel', 'like', 'going', 'thing', 'something', 'get', 'http', 'else', 'anyone', 'bipolar', 'disorder']

# Testing membership against a list scans every entry for every token; a set
# makes each lookup constant-time. `stop_words` itself stays a list because it is
# also passed to the scikit-learn vectorizers further down.
stop_word_lookup = set(stop_words)

# Keep only purely alphabetic, non-stop-word tokens, in lower case.
bipolar_df['post_tokens_lowercase'] = bipolar_df['post_tokens_uppercase'].apply(
  lambda tokens: [
    token
    for token in map(str.lower, tokens)  # lower-case each token exactly once
    if token.isalpha() and token not in stop_word_lookup
  ]
)

### POS tagging

In [None]:
# Run nltk's `pos_tag` over the cased tokens of each post; downstream cells
# unpack every element of the result as a (token, tag) pair.
cased_tokens = bipolar_df['post_tokens_uppercase']
bipolar_df['post_tokens_uppercase_with_pos_tags'] = cased_tokens.apply(pos_tag)

### Named entity recognition

In [None]:
def extract_named_entities(tree):
  """Collect the named-entity chunks found directly under `tree`.

  Every direct child of `tree` that is an nltk `Tree` is treated as a named
  entity; all other children are skipped.

  Args:
    tree: iterable of nodes (here: the result of `ne_chunk`).

  Returns:
    A list of `(entity_text, entity_type)` tuples in order of appearance, where
    `entity_text` is the chunk's leaf words joined by single spaces and
    `entity_type` is the chunk's label.
  """
  return [
    (" ".join(token for token, _ in chunk.leaves()), chunk.label())
    for chunk in tree
    if isinstance(chunk, Tree)  # only subtrees represent named entities
  ]

In [None]:
# Chunk each POS-tagged post with nltk's `ne_chunk` and keep only the
# (entity text, entity type) pairs returned by `extract_named_entities`.
tagged_posts = bipolar_df['post_tokens_uppercase_with_pos_tags']
bipolar_df['named_entities'] = tagged_posts.apply(
  lambda tagged_tokens: extract_named_entities(ne_chunk(tagged_tokens))
)


### Lemmatization

In [None]:
def map_pos_tags(tag: str):
  """Translate a POS tag into the single-letter code given to the lemmatizer.

  The decision is made on the tag's first character only.

  Args:
    tag: POS tag string (e.g. 'JJ', 'VBD', 'RB', 'NN').

  Returns:
    'a' for tags starting with 'J', 'v' for 'V', 'r' for 'R', and 'n' for
    everything else (including an empty tag).
  """
  code_by_prefix = {'J': 'a', 'V': 'v', 'R': 'r'}
  return code_by_prefix.get(tag[:1], 'n')

In [None]:
wnl = WordNetLemmatizer()

# Lower-cased, filtered tokens: lemmatized with the lemmatizer's default POS,
# since no tag is passed.
bipolar_df['post_lemmas_lowercase'] = bipolar_df['post_tokens_lowercase'].apply(
  lambda tokens: list(map(wnl.lemmatize, tokens))
)

# Cased tokens: lemmatized with the POS code derived from each token's tag.
bipolar_df['post_lemmas_uppercase'] = bipolar_df['post_tokens_uppercase_with_pos_tags'].apply(
  lambda tagged_tokens: [
    wnl.lemmatize(word=token, pos=map_pos_tags(pos_label))
    for token, pos_label in tagged_tokens
  ]
)

## Exploratory Analysis

### Word frequency analysis

In [None]:
# Flatten the per-post lemma lists into one corpus-wide list, then count
# how often each lemma occurs.
all_lemmas = [
  lemma
  for post_lemmas in bipolar_df['post_lemmas_lowercase']
  for lemma in post_lemmas
]
word_count = Counter(all_lemmas)

In [None]:
# The 20 most frequent lemmas as (word, count) pairs.
top_words = word_count.most_common(20)

# Split the pairs into two parallel tuples for plotting.
words, counts = zip(*top_words)

# Horizontal bar chart of the counts, one bar per word.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(words, counts)
ax.set_title('Top Word Counts')
ax.set_xlabel('Counts')
ax.set_ylabel('Words')
fig.tight_layout()
plt.show()

### Word clouds

In [None]:
# Word cloud sized by lemma frequency.
# A fixed `random_state` makes the word placement repeatable across runs, in line
# with the seeded models used elsewhere in this notebook.
wc = WordCloud(
  width=800,
  height=400,
  background_color='white',
  random_state=42,
).generate_from_frequencies(word_count)

plt.figure(figsize=(12, 6))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')  # the axes carry no information for an image
plt.title("Word Cloud of Most Frequent Lemmas", fontsize=16)
plt.tight_layout()
plt.show()

### Top entities overall

In [None]:
# Flatten the per-post (entity text, entity type) pairs, keep only the text,
# and count how often each entity string occurs across all posts.
all_entities = [
  entity_text
  for post_entities in bipolar_df['named_entities']
  for entity_text, _ in post_entities
]
entity_counts = Counter(all_entities)

In [None]:
# Word cloud sized by named-entity frequency.
# A fixed `random_state` makes the word placement repeatable across runs, in line
# with the seeded models used elsewhere in this notebook.
wc = WordCloud(
  width=800,
  height=400,
  background_color='white',
  random_state=42,
).generate_from_frequencies(entity_counts)

plt.figure(figsize=(12, 6))
plt.imshow(wc, interpolation='bilinear')
plt.title('Named Entity Word Cloud')
plt.axis('off')  # the axes carry no information for an image
plt.tight_layout()
plt.show()

### Sentiment analysis

In [None]:
# Optional reset before re-scoring: removes sentiment score columns left over from an earlier run.
# bipolar_df = bipolar_df.drop(['compound', 'pos', 'neg', 'neu'], axis=1)

In [None]:
sia = SentimentIntensityAnalyzer()

# One dictionary of normalized scores (neg, neu, pos, compound) per post.
bipolar_df['sentiment'] = bipolar_df['post'].apply(sia.polarity_scores)

# Expand the score dictionaries into one column per score and give the result
# the same index as bipolar_df so the rows stay aligned.
sentiment_df = pd.json_normalize(bipolar_df['sentiment'].tolist())
sentiment_df.index = bipolar_df.index

# Write the score columns by name instead of concatenating the two frames:
# re-running this cell then simply overwrites them, whereas
# `pd.concat([...], axis=1)` appended another set of identically named columns
# on every re-run.
for score_name in sentiment_df.columns:
  bipolar_df[score_name] = sentiment_df[score_name]

In [None]:
# Histogram (50 bins) of the per-post compound sentiment score.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(bipolar_df['compound'], bins=50)
ax.set_title('Distribution of Compound Sentiment Scores')
ax.set_xlabel('Compound Sentiment Score')
ax.set_ylabel('Post Count')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
def label_sentiment(x: float):
  """Bucket a compound sentiment score into a coarse label.

  Args:
    x: compound sentiment score.

  Returns:
    'Positive' when `x >= 0.05`, 'Negative' when `x <= -0.05`, and 'Neutral'
    for anything in between (or any value satisfying neither comparison).
  """
  if x >= 0.05:
    return 'Positive'
  return 'Negative' if x <= -0.05 else 'Neutral'

# Label every post from its compound score, then display each label's share
# of all posts.
compound_scores = bipolar_df['compound']
bipolar_df['sentiment_label'] = compound_scores.apply(label_sentiment)
bipolar_df['sentiment_label'].value_counts(normalize=True)

## Topic Modeling

### TF-IDF matrix

In [None]:
def lemmas_to_string(lemmas: list[str]):
  """Join a post's lemmas into one string separated by single spaces.

  Args:
    lemmas: the lemma strings of one post, in order.

  Returns:
    The lemmas joined by ' '; an empty string when `lemmas` is empty.
  """
  separator = ' '
  return separator.join(lemmas)

# Re-join each post's lemmas into a single space-separated string, the input
# the vectorizer is fitted on.
lemma_lists = bipolar_df['post_lemmas_lowercase']
bipolar_df['post_lemma_string_lowercase'] = lemma_lists.apply(lemmas_to_string)

# TF-IDF over bigrams and trigrams only; a term must occur in at least 5 posts
# and in at most 95% of the posts, and stop words are removed again.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 3), min_df=5, max_df=0.95, stop_words=stop_words)

# Sparse post-by-term TF-IDF matrix.
X_tfidf = tfidf_vectorizer.fit_transform(bipolar_df['post_lemma_string_lowercase'])

### Non-Negative Matrix Factorization (NMF)

In [None]:
n_components = 10  # number of topics to extract

# Randomly initialised NMF with a fixed seed and up to 1000 iterations.
model = NMF(n_components=n_components, init='random', random_state=42, max_iter=1000)

# W: document-topic matrix — each row shows how much each topic contributes to a post.
W = model.fit_transform(X_tfidf)
# H: topic-term matrix — each row is a topic, each column a term.
H = model.components_

In [None]:
# Terms (n-grams) of the TF-IDF vocabulary, indexed like the columns of H.
feature_names = tfidf_vectorizer.get_feature_names_out()

# For every topic row of H, print its ten highest-weighted terms.
for topic_idx, topic in enumerate(H):
  descending_indices = topic.argsort()[::-1]  # column indices, largest weight first
  top_indices = descending_indices[:10]
  top_words = [feature_names[i] for i in top_indices]
  print(f'Topic {topic_idx+1}: {", ".join(top_words)}')

### LDA

In [None]:
# Raw bigram/trigram counts for LDA; a term must occur in at least 5 posts and
# in at most 95% of the posts, and stop words are removed.
count_vectorizer = CountVectorizer(ngram_range=(2, 3), min_df=5, max_df=0.95, stop_words=stop_words)

# Sparse post-by-term count matrix.
X_count = count_vectorizer.fit_transform(bipolar_df['post_lemma_string_lowercase'])

In [None]:
n_components = 10  # number of topics to extract

# Seeded LDA limited to 50 iterations.
lda = LatentDirichletAllocation(n_components=n_components, random_state=42, max_iter=50)

# Fit on the count matrix; as the cell's last expression, the fitted estimator is displayed.
lda.fit(X_count)

In [None]:
# Terms (n-grams) of the count vocabulary, indexed like the columns of lda.components_.
feature_names = count_vectorizer.get_feature_names_out()

# For every LDA topic, print its ten highest-weighted terms.
for topic_idx, topic in enumerate(lda.components_):
  ranked = topic.argsort()[::-1]  # column indices, largest weight first
  top_indices = ranked[:10]
  top_words = [feature_names[i] for i in top_indices]
  joined_terms = ", ".join(top_words)
  print(f'Topic {topic_idx + 1}: {joined_terms}')

### Top Posts and Sentiment per Topic

In [None]:
def get_top_posts(topic_idx: int, n_posts: int):
  """Return the posts weighted most heavily on one topic of `W`.

  Rows are ranked by column `topic_idx` of the global `W`, highest weight first,
  and selected by position from `bipolar_df` and `sentiment_df`.

  Args:
    topic_idx: zero-based column index of the topic in `W`.
    n_posts: number of top-ranked posts to return.

  Returns:
    A DataFrame with the columns 'post' and 'compound' for the selected rows.
  """
  ranked_rows = W[:, topic_idx].argsort()[::-1]  # row positions, strongest first
  best_rows = ranked_rows[:n_posts]
  post_text = bipolar_df['post'].iloc[best_rows]
  compound = sentiment_df['compound'].iloc[best_rows]
  return pd.concat([post_text, compound], axis=1)

# Show the five highest-weighted posts, with their compound sentiment, for every
# NMF topic.
# The topic count is taken from W itself (one column per topic) rather than from
# the global `n_components`, which the LDA cell reassigns. Topics are labelled
# starting at 1 so the headings match the "Topic N" keyword listings printed for
# the topic models.
for i in range(W.shape[1]):
  posts = get_top_posts(i, 5)
  print(f'Topic {i + 1}:')
  print(f"{posts.apply(lambda x: 'Sentiment: ' + str(x['compound']) + '  Content: ' + str(x['post']), axis=1)}")
  print('\n')

## Topic Modeling with Embeddings

### Clustering Word Embeddings (Word2Vec)

In [None]:
# Hand gensim a plain list of token lists (one list per post) rather than a
# pandas Series.
w2v_sentences = bipolar_df['post_lemmas_lowercase'].tolist()

w2v_model = Word2Vec(
  sentences=w2v_sentences, # list of token lists, one per post
  vector_size=100, # dimensionality of the word vectors
  window=5, # context window size
  min_count=5, # ignore words that appear fewer than 5 times
  # A fixed `seed` only gives repeatable vectors with a single worker thread;
  # with several workers the OS thread scheduling changes the training order.
  # (Per the gensim docs, repeatability across interpreter launches additionally
  # needs PYTHONHASHSEED to be set.)
  workers=1,
  seed=42
)

In [None]:
# Spot-check the trained embeddings with the word 'manic'.
keyed_vectors = w2v_model.wv
print(keyed_vectors['manic']) # embedding vector for the word 'manic'
print(keyed_vectors.most_similar('manic')) # words most similar to 'manic'
print(keyed_vectors.similarity('manic', 'depression')) # cosine similarity

In [None]:
# All vocabulary words and their vectors, row-aligned:
# word_vectors[i] is the vector of words[i].
keyed_vectors = w2v_model.wv
words = [*keyed_vectors.key_to_index]
word_vectors = np.array([keyed_vectors[token] for token in words])

In [None]:
n_clusters = 10  # number of word clusters

# Seeded k-means over the word vectors; `fit` returns the fitted estimator.
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto').fit(word_vectors)

# One cluster label per row of `word_vectors`, i.e. per vocabulary word.
word_clusters = kmeans.labels_

In [None]:
# Group the vocabulary by cluster label; clusters appear in the order in which
# their label is first encountered.
clusters = defaultdict(list)

for label, word in zip(word_clusters, words):
  clusters[label].append(word)

# Print the first ten words collected for each cluster.
for cluster_id in clusters:
  preview = ', '.join(clusters[cluster_id][:10])
  print(f"Cluster {cluster_id}: {preview}\n")

### Clustering Document Embeddings

In [None]:
# Texts to embed: the space-joined lemma string of every post, as a plain list.
post_texts = bipolar_df['post_lemma_string_lowercase'].tolist()

# Pretrained sentence-embedding model 'all-MiniLM-L6-v2'.
model = SentenceTransformer('all-MiniLM-L6-v2')

# One embedding per post, in the order of `post_texts`; a progress bar is shown.
sentence_embeddings = model.encode(post_texts, show_progress_bar=True)

In [None]:
n_clusters = 10  # number of document clusters

# Seeded k-means over the post embeddings; `fit` returns the fitted estimator.
kmeans_doc = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto').fit(sentence_embeddings)

# Store each post's cluster label alongside the post.
bipolar_df['doc_topic'] = kmeans_doc.labels_

### BERTopic

In [None]:
# Fit BERTopic on the lemma strings, reusing the sentence embeddings computed
# earlier instead of letting BERTopic embed the documents itself.
# NOTE(review): no seed is set here, so topic assignments may differ between
# runs — confirm whether that is acceptable.
model = BERTopic(language="english", verbose=True)
topics, probs = model.fit_transform(
  bipolar_df['post_lemma_string_lowercase'],
  embeddings=sentence_embeddings,
)