In [17]:
import os

## Detect the runtime: the `google.colab` package can only be imported on Colab.
try:
    import google.colab
    local_path = "/content/drive/MyDrive/DLSS/"
    in_colab = True
    google.colab.drive.mount('/content/drive')

except ImportError:
    ## Local run: start from the working directory and climb two levels
    ## (dirname applied twice) to reach the main directory.
    in_colab = False
    current_wd = os.getcwd()
    local_path = f"{os.path.dirname(os.path.dirname(current_wd))}/"

print("CWD: ", local_path)

import os
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import spacy
from collections import Counter

# Load spaCy's small English pipeline; preprocess_text() calls it to tokenize.
nlp = spacy.load('en_core_web_sm')
# Import the stop-word set explicitly. Reaching it through the attribute chain
# `spacy.lang.en.stop_words` only works when that submodule happens to have
# been imported already as a side effect of something else.
from spacy.lang.en.stop_words import STOP_WORDS
stop_words = STOP_WORDS

CWD:  c:\Users\wirth/


In [18]:
# Helper functions: embedding lookup/aggregation (Step 2) and collecting the most common words in each cluster (Step 4)
def preprocess_text(text):
    """Tokenize ``text`` with the notebook-level spaCy pipeline ``nlp``.

    A token is kept only if its text is purely alphanumeric and its lowercase
    form is not in ``stop_words``. Kept tokens are returned lowercased, in
    document order.
    """
    kept = []
    for token in nlp(text):
        lowered = token.text.lower()
        if token.text.isalnum() and lowered not in stop_words:
            kept.append(lowered)
    return kept

def get_words_from_cluster(df, cluster_number):
    """Collect the preprocessed tokens of every text assigned to one cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``cluster`` column and a ``title_and_text_lemmatized``
        column.
    cluster_number :
        Value of ``cluster`` selecting the rows to use.

    Returns
    -------
    list
        Tokens from ``preprocess_text`` for all selected texts, concatenated
        in row order (duplicates kept, so the list can be counted).
    """
    in_cluster = df['cluster'] == cluster_number
    # Missing texts (NaN after read_csv) cannot be tokenized; skip them
    # instead of letting the tokenizer fail on a float.
    texts = df.loc[in_cluster, 'title_and_text_lemmatized'].dropna()
    words = []
    for text in texts:
        words.extend(preprocess_text(text))
    return words

def get_embeddings_dict(df):
    """Build a ``word -> embedding vector`` lookup from an embeddings table.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per word. Must contain a ``word`` column. Every other column
        except ``Unnamed: 0`` (dropped if present) is treated as one embedding
        dimension and must be convertible to float.

    Returns
    -------
    dict
        Maps each ``word`` value to a 1-D float ``numpy.ndarray``. If a word
        appears in several rows, the last row wins.

    Raises
    ------
    KeyError
        If ``df`` has no ``word`` column.
    """
    words = df['word']
    # Convert the whole table in one step instead of dropping columns row by
    # row. errors='ignore' lets tables without the "Unnamed: 0" column through.
    vectors = df.drop(columns=['Unnamed: 0', 'word'], errors='ignore').to_numpy(dtype=float)
    return dict(zip(words, vectors))

def get_text_embedding(text, embeddings_dict):
    """Average the embeddings of the known words in ``text``.

    Parameters
    ----------
    text : str
        Whitespace-separated words. Any non-string value (e.g. ``None`` or the
        NaN that ``read_csv`` produces for an empty cell) is treated as a text
        with no words.
    embeddings_dict : dict
        Maps a word to its embedding vector.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all words found in
        ``embeddings_dict``. If no word is found, a zero vector with the same
        length as the first vector in ``embeddings_dict``.

    Raises
    ------
    ValueError
        If no word is found and ``embeddings_dict`` is empty, because the
        length of the zero vector cannot be determined.
    """
    # Simple whitespace tokenization; non-strings have no .split(), so they
    # contribute no words rather than raising AttributeError.
    words = text.split() if isinstance(text, str) else []
    # Look each word up once and keep only the hits.
    looked_up = (embeddings_dict.get(word) for word in words)
    word_embeddings = [vector for vector in looked_up if vector is not None]
    if word_embeddings:
        return np.mean(word_embeddings, axis=0)
    if not embeddings_dict:
        raise ValueError("embeddings_dict is empty; cannot infer the embedding dimension")
    # Default to a zero vector of the embedding dimension
    return np.zeros(len(next(iter(embeddings_dict.values()))))
    
def get_top_words(words, num_common=10):
    """Return the ``num_common`` most frequent entries of ``words``.

    Entries are ordered from most to least frequent, as produced by
    ``Counter.most_common``; only the words are returned, not their counts.
    """
    ranked = Counter(words).most_common(num_common)
    return [entry[0] for entry in ranked]

In [19]:
## clustering configuration
num_clusters = 5  # number of clusters, passed to KMeans as n_clusters below; adjust based on your needs

In [20]:
## load the post texts and the word-embedding table (embeddings_CBOW_total_posts.csv)
text_columns = ["id", "title_and_text_lemmatized"]
df_text = pd.read_csv("data/preprocessed/total_posts.csv")[text_columns]

df_embeddings = pd.read_csv("data/embeddings_best_model/embeddings_CBOW_total_posts.csv")
# word -> vector lookup used to embed each text
embeddings_dict = get_embeddings_dict(df_embeddings)

In [21]:
## prepare
# Step 2: one vector per text, computed by get_text_embedding from embeddings_dict
df_text['embedding'] = df_text['title_and_text_lemmatized'].apply(
    get_text_embedding, args=(embeddings_dict,)
)
# Stack the per-text vectors into a single 2-D array for clustering
X = np.vstack(df_text['embedding'].to_numpy())

# Fit KMeans (fixed random_state) and store each text's cluster label
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
clusters = kmeans.fit_predict(X)
df_text['cluster'] = clusters

# Most common words per cluster, printed as they are computed
top_words_per_cluster = {}
for cluster_num in range(num_clusters):
    top_words_per_cluster[cluster_num] = get_top_words(
        get_words_from_cluster(df_text, cluster_num)
    )
    print(f"Top 10 words in cluster {cluster_num}:")
    print(top_words_per_cluster[cluster_num])

Top 10 words in cluster 0:
['climate', 'change', 'people', 'think', 'believe', 'like', 'know', 'real', 'cause', 'trump']
Top 10 words in cluster 1:
['climate', 'change', 'world', 'new', 'year', 'people', 'global', 'r', 'energy', 'time']
Top 10 words in cluster 2:
['climate', 'change', 'trump', 'new', 'world', 'news', 'report', 'fight', 'global', 'study']
Top 10 words in cluster 3:
['climate', 'change', 'global', 'world', 'real', 'fight', 'scientist', 'cause', 'new', 'trump']
Top 10 words in cluster 4:
['climate', 'change', 'fight', 'world', 'trump', 'new', 'need', 'help', 'combat', 'want']


In [22]:
# Subgroups whose fine-tuned embeddings each get their own topic table.
# NOTE(review): this name used to be assigned list(range(2010, 2023)) first and
# was then immediately overwritten, so only the entries below were ever
# processed. Add the years to this list if the yearly models should run too.
list_finetuning_models = ["askmen", "askwomen"]

# The output folder is the same for every subgroup, so create it once.
os.makedirs("data/output/topics/", exist_ok=True)

for subgroup in list_finetuning_models:
    print(subgroup)

    data = pd.read_csv(f"data/preprocessed/posts_{subgroup}.csv")
    # Subgroup-specific names keep this loop from overwriting the
    # df_embeddings / embeddings_dict / X / kmeans objects of earlier cells,
    # so those cells give the same result when re-run afterwards.
    subgroup_embeddings = pd.read_csv(f"data/embeddings_year_and_reddits/embeddings_CBOW_posts_{subgroup}.csv")
    subgroup_embeddings_dict = get_embeddings_dict(subgroup_embeddings)

    ## prepare
    # Cluster this subgroup's own posts. Previously `data` was loaded but never
    # used: the loop re-embedded and re-clustered the global df_text (all
    # posts) and overwrote its 'embedding' / 'cluster' columns on every pass.
    # NOTE(review): assumes posts_{subgroup}.csv has a
    # 'title_and_text_lemmatized' column like total_posts.csv - confirm.
    df_subgroup = data[["title_and_text_lemmatized"]].copy()

    # Step 2: Aggregate embeddings for each text
    df_subgroup['embedding'] = df_subgroup['title_and_text_lemmatized'].apply(
        lambda text: get_text_embedding(text, subgroup_embeddings_dict)
    )
    # Convert the aggregated embeddings into an array for clustering
    X_subgroup = np.vstack(df_subgroup['embedding'].values)

    kmeans_subgroup = KMeans(n_clusters=num_clusters, random_state=42)
    df_subgroup['cluster'] = kmeans_subgroup.fit_predict(X_subgroup)

    # One record per cluster: its number and its most common words
    cluster_words = [
        {
            'cluster': cluster_num,
            'top_10_words': get_top_words(get_words_from_cluster(df_subgroup, cluster_num)),
        }
        for cluster_num in range(num_clusters)
    ]

    df_top_words = pd.DataFrame(cluster_words)
    df_top_words.to_csv(f"data/output/topics/topics_{subgroup}.csv")


askmen
askwomen
