# Topic Modelling 

In [7]:
import os

## Detect whether we are running on Google Colab and set the project root path.
# Only the import is guarded: an ImportError means "not on Colab". Errors raised
# while mounting Drive must surface instead of silently selecting the local path.
try:
    import google.colab
except ImportError:
    in_colab = False
    ## get current directory
    current_wd = os.getcwd()
    ## move two levels up to reach the main project directory
    local_path = os.path.dirname(os.path.dirname(current_wd)) + "/"
else:
    in_colab = True
    local_path = "/content/drive/MyDrive/DLSS/"
    ## mount Google Drive so the project folder under MyDrive is reachable
    google.colab.drive.mount('/content/drive')

# NOTE: despite the "CWD" label, this prints the resolved project root (local_path)
print("CWD: ", local_path)

import os
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import spacy
from collections import Counter

# Load the spaCy 'en_core_web_sm' English pipeline; preprocess_text() calls it to tokenize text
nlp = spacy.load('en_core_web_sm')
# spaCy's English stop-word collection; preprocess_text() drops tokens found in it
stop_words = spacy.lang.en.stop_words.STOP_WORDS

CWD:  d:\dlss-project24/


In [None]:
def preprocess_text(text):
    """
    Tokenize a text and keep only lowercase, alphanumeric, non-stop-word tokens.

    Args:
        text (str): Raw text to tokenize.

    Returns:
        list of str: Lowercased tokens in document order, without stop words
        and without tokens containing non-alphanumeric characters.
    """
    kept_tokens = []

    # Run the module-level spaCy pipeline and walk its tokens one by one
    for token in nlp(text):
        raw = token.text

        # Skip punctuation, whitespace and anything else that is not purely alphanumeric
        if not raw.isalnum():
            continue

        lowered = raw.lower()

        # Stop words are matched on the lowercased form
        if lowered in stop_words:
            continue

        kept_tokens.append(lowered)

    return kept_tokens

In [None]:
def get_words_from_cluster(df, cluster_number):
    """
    Collect every preprocessed token from the texts assigned to one cluster.

    Args:
        df (pd.DataFrame): Data with a 'cluster' column and a
            'title_and_text_lemmatized' text column.
        cluster_number (int): Label of the cluster whose words are wanted.

    Returns:
        list of str: All tokens (duplicates kept) from the cluster's texts,
        in row order.
    """
    # Boolean mask of the rows that belong to the requested cluster
    in_cluster = df['cluster'] == cluster_number
    cluster_texts = df[in_cluster]['title_and_text_lemmatized']

    # Flatten the per-text token lists into one list of words
    return [
        token
        for cluster_text in cluster_texts
        for token in preprocess_text(cluster_text)
    ]

In [None]:
def get_embeddings_dict(df):
    """
    Creates a dictionary mapping words to their corresponding embeddings.

    Args:
        df (pd.DataFrame): The DataFrame containing a 'word' column and one
            numeric column per embedding dimension. An 'Unnamed: 0' column
            (presumably a saved CSV index) is ignored when present.

    Returns:
        dict: A dictionary where the keys are words and the values are
        1-D float NumPy arrays. If a word occurs more than once, the last
        row wins. An empty DataFrame yields an empty dictionary.
    """
    if df.empty:
        return {}

    # Drop only the bookkeeping columns that actually exist, so a CSV without
    # the 'Unnamed: 0' column does not raise a KeyError.
    non_vector_columns = [col for col in ('Unnamed: 0', 'word') if col in df.columns]

    # Convert all embedding columns in one pass instead of building a Series per row
    vectors = df.drop(columns=non_vector_columns).to_numpy(dtype=float)

    # Pair each word with its row of the embedding matrix
    return dict(zip(df['word'], vectors))

In [None]:
def get_text_embedding(text, embeddings_dict):
    """
    Computes the average embedding for a given text based on the word embeddings.

    Args:
        text (str): The input text for which to compute the embedding.
            Non-string values (e.g. NaN for a missing CSV cell) are treated
            as empty text.
        embeddings_dict (dict): A dictionary mapping words to their embeddings.

    Returns:
        np.ndarray: The mean of the embeddings of all whitespace-separated
        words found in `embeddings_dict`. If no word is found, a zero vector
        with the dictionary's embedding length is returned (length 0 when the
        dictionary itself is empty).
    """
    # Missing values read from CSV arrive as float NaN, which has no .split()
    words = text.split() if isinstance(text, str) else []

    # Look each word up once; unknown words (and words mapped to None) are skipped
    word_embeddings = [
        vector for vector in map(embeddings_dict.get, words) if vector is not None
    ]

    if word_embeddings:
        # Compute the mean of the embeddings to get the text's embedding
        return np.mean(word_embeddings, axis=0)

    # Fall back to a zero vector; the () default avoids StopIteration on an empty dict
    embedding_size = len(next(iter(embeddings_dict.values()), ()))
    return np.zeros(embedding_size)

In [None]:
def get_top_words(words, num_common=10):
    """
    Return the most frequent words of a word list, most frequent first.

    Args:
        words (list of str): Words to count (duplicates expected).
        num_common (int): Maximum number of words to return.

    Returns:
        list of str: Up to `num_common` words ordered by descending frequency.
    """
    frequencies = Counter(words)

    # most_common yields (word, count) pairs; only the word is kept
    ranked_pairs = frequencies.most_common(num_common)
    return [pair[0] for pair in ranked_pairs]

In [19]:
# Number of clusters passed to KMeans (n_clusters) when grouping the texts of each year
num_clusters = 3

In [23]:
# Years to process; range() excludes its upper bound, so this is just [2010]
list_finetuning_models = list(range(2010, 2011))

# Directory the per-year topic CSVs are written to. It is created up front so
# that to_csv below does not fail when the folder does not exist yet.
topics_output_dir = local_path + "output/topic_modelling/"
os.makedirs(topics_output_dir, exist_ok=True)

# Iterate over the list of years (in this case, it will only be 2010)
for subgroup in list_finetuning_models:
    print(subgroup)  # Print the current year

    # Load the preprocessed data for the current year from a CSV file
    data = pd.read_csv(local_path + f"data/preprocessed/posts_{subgroup}.csv")

    # Load the precomputed word embeddings for the current year
    df_embeddings = pd.read_csv(local_path + f"output/embeddings/yearly_embeddings/embeddings_CBOW_posts_{subgroup}.csv")

    # Convert the embeddings dataframe into a dictionary for easier lookup
    embeddings_dict = get_embeddings_dict(df_embeddings)

    ## Prepare the data for clustering
    # Aggregate the word embeddings of each text into a single vector
    data['embedding'] = data['title_and_text_lemmatized'].apply(lambda text: get_text_embedding(text, embeddings_dict))

    # Stack the per-text vectors into a 2D array, where each row corresponds to a text
    X = np.vstack(data['embedding'].values)

    # Perform KMeans clustering on the aggregated embeddings (fixed seed for reproducibility)
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    clusters = kmeans.fit_predict(X)

    # Assign the resulting cluster labels to the data
    data['cluster'] = clusters

    # Initialize a list to store the top words for each cluster
    cluster_words = []

    # Iterate over each cluster
    for cluster_num in range(num_clusters):
        # Get all words from texts that belong to the current cluster
        words_in_cluster = get_words_from_cluster(data, cluster_num)

        # Get the top words in the current cluster
        top_words = get_top_words(words_in_cluster)

        # Store the cluster number and its top words in the list
        cluster_words.append({'cluster': cluster_num, 'top_10_words': top_words})

    # Convert the list of top words per cluster into a DataFrame
    df_top_words = pd.DataFrame(cluster_words)

    # Filter out the words 'climate' and 'change' from the top words list for each cluster.
    # NOTE: the filter runs after the top-10 selection, so a list may end up shorter than 10.
    df_top_words['top_10_words'] = df_top_words['top_10_words'].apply(lambda x: [word for word in x if word not in ['climate', 'change']])

    # Save the top words per cluster to a CSV file
    df_top_words.to_csv(topics_output_dir + f"topics_{subgroup}.csv")

    # Print the DataFrame of top words
    print(df_top_words)

2010
   cluster                                       top_10_words
0        0  [global, new, scientist, warming, fight, repor...
1        1  [global, warming, science, world, scientist, e...
2        2  [science, world, global, scientist, new, peopl...
