# Topic Modelling with k-means clustering

In [None]:
import os

# Detect the runtime: importing google.colab only succeeds on Google Colab.
try:
    import google.colab
    # Reaching this line means the import worked, i.e. we are on Colab.
    in_colab = True
    local_path = "/content/drive/MyDrive/DLSS/"
    # The project files live on Google Drive, so mount it.
    google.colab.drive.mount('/content/drive')

except ImportError:
    # Not on Colab: derive the base path from the working directory instead.
    in_colab = False
    current_wd = os.getcwd()
    # The base path is two levels above the current working directory.
    parent_dir = os.path.dirname(current_wd)
    local_path = os.path.dirname(parent_dir) + "/"

# Show the base path that all data/output locations are built from.
print("CWD: ", local_path)

# Import necessary libraries
import os
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import spacy
from collections import Counter
import matplotlib.pyplot as plt


# Load SpaCy's small English pipeline (its tokenizer is used by preprocess_text)
nlp = spacy.load('en_core_web_sm')
# Take the stop-word set from the loaded pipeline's language defaults instead
# of reaching into `spacy.lang.en.stop_words`: that submodule is never imported
# explicitly here, so attribute access on it only works as a side effect of
# the model having been loaded first.
stop_words = nlp.Defaults.stop_words

In [None]:
def preprocess_text(text):
    """
    Tokenizes the input text and returns its lowercased content tokens.

    Only the tokenizer of the module-level `nlp` pipeline is run (via
    `nlp.make_doc`). The function reads nothing but `token.text`, so running
    the remaining pipeline components of the loaded model on every text would
    only cost time without changing the result.

    Args:
    text (str): The text to be processed.

    Returns:
    list: Lowercased tokens that are alphanumeric and not in `stop_words`.
    """
    tokens = []
    # make_doc applies just the tokenizer and returns a Doc
    for token in nlp.make_doc(text):
        raw = token.text
        # Drop punctuation, whitespace and other non-alphanumeric tokens
        if not raw.isalnum():
            continue
        # Lowercase once and reuse it for both the stop-word test and the output
        lowered = raw.lower()
        if lowered not in stop_words:
            tokens.append(lowered)
    return tokens

def get_words_from_cluster(df, cluster_number, text_column='title_and_text_lemmatized'):
    """
    Extracts and preprocesses words from the texts in a specified cluster.

    Args:
    df (pd.DataFrame): DataFrame containing the text data with a 'cluster' column.
    cluster_number (int): The cluster number to filter the texts.
    text_column (str, optional): Name of the column holding the texts.
        Defaults to 'title_and_text_lemmatized'.

    Returns:
    list: A list of preprocessed words from the specified cluster, in row order.
        Empty if the cluster has no (non-missing) texts.
    """
    # Select the texts of the rows assigned to the requested cluster
    in_cluster = df['cluster'] == cluster_number
    # Missing texts (NaN/None, e.g. empty CSV fields) cannot be tokenized; skip them
    texts = df.loc[in_cluster, text_column].dropna()
    # Preprocess each text and flatten the per-text token lists into one list
    return [token for text in texts for token in preprocess_text(text)]

def get_embeddings_dict(df):
    """
    Creates a dictionary mapping words to their corresponding embeddings.

    Every column other than 'word' and 'Unnamed: 0' is treated as one
    embedding dimension. The 'Unnamed: 0' column is optional: it is dropped
    when present and ignored when the file was written without it.

    Args:
    df (pd.DataFrame): DataFrame containing word embeddings with a 'word' column.

    Returns:
    dict: A dictionary where keys are words and values are their embeddings
        (1-D float numpy arrays). If a word occurs in several rows, the last
        row wins. Empty if `df` has no rows.
    """
    if len(df) == 0:
        return {}
    # Convert all embedding columns to one float matrix in a single step
    # instead of building a Series per row with iterrows(), which is very slow
    # for vocabulary-sized tables.
    vectors = df.drop(columns=['Unnamed: 0', 'word'], errors='ignore').to_numpy(dtype=float)
    # Each dict value is one row of that matrix
    return dict(zip(df['word'], vectors))

def get_text_embedding(text, embeddings_dict):
    """
    Computes the average embedding for a given text based on word embeddings.

    The text is split on whitespace; words without an entry in
    `embeddings_dict` are ignored.

    Args:
    text (str): The text to be embedded. A non-string value (e.g. the NaN
        pandas uses for an empty CSV field) is treated as a text without words.
    embeddings_dict (dict): Dictionary of word embeddings.

    Returns:
    np.ndarray: The average embedding of the text, or a zero vector (with the
        length of the first embedding in `embeddings_dict`) if no words are found.

    Raises:
    ValueError: If no word is found and `embeddings_dict` is empty, because the
        length of the zero vector cannot be determined then.
    """
    # Tokenize the text (simple whitespace tokenization); missing text -> no words
    words = text.split() if isinstance(text, str) else []
    # Look every word up once and keep only the words that have an embedding
    candidates = (embeddings_dict.get(word) for word in words)
    word_embeddings = [vector for vector in candidates if vector is not None]
    if word_embeddings:
        # Return the average of the word embeddings
        return np.mean(word_embeddings, axis=0)
    if not embeddings_dict:
        # Raise a clear error instead of leaking StopIteration from next()
        raise ValueError("embeddings_dict is empty; cannot determine the embedding dimension")
    # Return a zero vector with the same length as the stored embeddings
    return np.zeros(len(next(iter(embeddings_dict.values()))))

def get_top_words(words, num_common=10):
    """
    Returns the most frequent words of a word list, most frequent first.

    Args:
    words (list): A list of words.
    num_common (int, optional): How many of the most frequent words to return. Defaults to 10.

    Returns:
    list: The `num_common` most frequent words (fewer if there are not that many distinct words).
    """
    # (word, count) pairs, ordered by descending count
    ranked = Counter(words).most_common(num_common)
    # Keep only the words and discard the counts
    top_words = []
    for entry in ranked:
        top_words.append(entry[0])
    return top_words

In [None]:
# Number of clusters (k) passed to KMeans in the clustering cell below and
# used there to iterate over the cluster labels; adjust as needed
num_clusters = 3

In [None]:
# List of models for fine-tuning, specified by years (in this case, just 2010)
list_finetuning_models = list(range(2010, 2011))

# Directory the per-year topic tables are written to. Create exactly this
# directory (once, before the loop) so that `to_csv` below cannot fail because
# the folder is missing.
topics_dir = local_path + "output/topic_modelling/"
os.makedirs(topics_dir, exist_ok=True)

# Generic words that are removed from the reported top words
generic_words = ['climate', 'change']

# Iterate over each model (year) in the list
for subgroup in list_finetuning_models:
    print(subgroup)

    # Load the preprocessed data for the specific year
    data = pd.read_csv(local_path + f"data/preprocessed/posts_{subgroup}.csv")
    # Empty text fields are read back from CSV as NaN; treat them as empty
    # strings so the string-based helpers below can process every row.
    data['title_and_text_lemmatized'] = data['title_and_text_lemmatized'].fillna('')

    # Load the embeddings for the specific year
    df_embeddings = pd.read_csv(local_path + f"output/embeddings/yearly_embeddings/embeddings_CBOW_posts_{subgroup}.csv")

    # Create a dictionary mapping words to their embeddings
    embeddings_dict = get_embeddings_dict(df_embeddings)

    ## Prepare the data for clustering
    # Represent each text by the average embedding of its words
    data['embedding'] = data['title_and_text_lemmatized'].apply(lambda text: get_text_embedding(text, embeddings_dict))

    # Stack the per-text embeddings into a 2D array (one row per text)
    X = np.vstack(data['embedding'].values)

    # Perform K-means clustering on the embeddings
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    clusters = kmeans.fit_predict(X)

    # Assign the cluster labels to the DataFrame
    data['cluster'] = clusters

    cluster_words = []
    # For each cluster, extract the most common words
    for cluster_num in range(num_clusters):
        # Get the words associated with the current cluster
        words_in_cluster = get_words_from_cluster(data, cluster_num)

        # Identify the top 10 most common words in the cluster
        top_words = get_top_words(words_in_cluster)

        # Remove the generic words afterwards. NOTE: because this happens after
        # the top 10 were selected, a list can end up with fewer than 10 words.
        top_words = [word for word in top_words if word not in generic_words]

        # Append the cluster number and its top words to the list
        cluster_words.append({'cluster': cluster_num, 'top_10_words': top_words})

    # Create a DataFrame from the list of cluster words
    df_top_words = pd.DataFrame(cluster_words)

    # Save the top words for each cluster to a CSV file
    df_top_words.to_csv(topics_dir + f"topics_{subgroup}.csv")

    # Print the DataFrame with the top words for inspection
    print(df_top_words)

In [None]:
# Original data: three manually assigned topic labels per year (2010-2022)
data = {
    'Year': [2010, 2010, 2010, 2011, 2011, 2011, 2012, 2012, 2012, 2013, 2013, 2013, 2014, 2014, 2014, 2015, 2015, 2015, 
             2016, 2016, 2016, 2017, 2017, 2017, 2018, 2018, 2018, 2019, 2019, 2019, 2020, 2020, 2020, 2021, 2021, 2021, 
             2022, 2022, 2022],
    'Topic': ['Climate Change and Scientific Reporting', 'Global Energy and Environmental Science', 
              'Global Impact of Climate Change', 'Climate Change and Scientific Reporting', 
              'Causes and Scientific Study of Global Warming', 'Climate Change and Scientific Reporting', 
              'Global Impact and Scientific Analysis of Hurricanes', 'Scientific Studies on Global Causes and Weather Events', 
              'Global Warming and Scientific Understanding', 'Global Warming and Scientific Studies', 
              'Scientific Action Plans and Climate Change', 'Climate Action and Denial in Global Reports', 
              'Global Warming and Its Impact on People', 'Climate Change and Scientific Reporting', 
              'Climate Action and Denial in Global Reports', 'Climate Change and Scientific Reporting', 
              'Climate Action and Global Leadership', 'Climate Action and Global Leadership', 
              'Climate Change and Scientific Reporting', 'Climate Action and Global Leadership', 
              'Climate Action and Global Leadership', 'Climate Change and Scientific Reporting', 
              'Climate Action and U.S. Leadership', 'Climate Change and Scientific Reporting', 
              'Climate Action and U.S. Leadership', 'Climate Action and Global Leadership', 
              'Climate Action and U.S. Leadership', 'Climate Action and Public Awareness', 
              'Global Action and Leadership', 'Climate Action and Global Protest', 
              'Global Action and Leadership', 'Climate Action and U.S. Leadership', 'Covid', 
              'Climate Action', 'Global Leadership and Climate Action', 
              'Climate Action and U.S. Leadership', 'Climate Action', 
              'Global Leadership and Climate Action', 'Climate Action and U.S. Leadership']
}

df = pd.DataFrame(data)

# Mapping original topics to 3 overarching topics
topic_mapping = {
    'Climate Change and Scientific Reporting': 'Climate Change and Scientific Reporting',
    'Global Warming and Scientific Studies': 'Climate Change and Scientific Reporting',
    'Global Impact of Climate Change': 'Climate Change and Scientific Reporting',
    'Causes and Scientific Study of Global Warming': 'Climate Change and Scientific Reporting',
    'Global Warming and Scientific Understanding': 'Climate Change and Scientific Reporting',
    'Global Energy and Environmental Science': 'Climate Change and Scientific Reporting',
    
    'Climate Action and Global Leadership': 'Climate Action and Global Leadership',
    'Climate Action and U.S. Leadership': 'Climate Action and Global Leadership',
    'Global Leadership and Climate Action': 'Climate Action and Global Leadership',
    'Scientific Action Plans and Climate Change': 'Climate Action and Global Leadership',
    'Climate Action and Public Awareness': 'Climate Action and Global Leadership',
    'Global Action and Leadership': 'Climate Action and Global Leadership',
    'Climate Action and Global Protest': 'Climate Action and Global Leadership',
        
    'Covid': 'Catastrophes',
    'Global Impact and Scientific Analysis of Hurricanes': 'Catastrophes',
    'Scientific Studies on Global Causes and Weather Events': 'Catastrophes',
}

# Applying the mapping
df['Overarching Topic'] = df['Topic'].map(topic_mapping)

# Topics without an entry in `topic_mapping` become NaN here and are then
# dropped by the groupby below, i.e. they disappear from the plot without any
# notice. Report them so the gap is visible and the mapping can be completed.
unmapped_topics = sorted(df.loc[df['Overarching Topic'].isna(), 'Topic'].unique())
if unmapped_topics:
    print("Topics without an overarching topic (excluded from the plot):", unmapped_topics)

# Count occurrences of each overarching topic by year (missing combinations -> 0)
topic_trend = df.groupby(['Year', 'Overarching Topic']).size().unstack().fillna(0)

# Plotting the trends over time
plt.figure(figsize=(14, 8))
topic_trend.plot(kind='line', marker='o', ax=plt.gca())
plt.xlabel('')  # Clear the x-axis label; the year ticks speak for themselves
plt.ylabel('Frequency', fontsize=25)  # Large y-axis label
plt.xticks(range(topic_trend.index.min(), topic_trend.index.max() + 1, 1), size = 20)  # X-axis steps of 1 year
plt.yticks(range(0, int(topic_trend.values.max()) + 2, 1), size = 25)  # Y-axis steps of 1, with headroom for the legend
plt.legend(title='Topic', bbox_to_anchor=(0.5, 0.975), loc='upper center', ncol=3, fontsize=15, title_fontsize=20)
plt.grid(True)
plt.tight_layout()
# Make sure the target folder exists before saving the figure
plots_dir = local_path + "plots/"
os.makedirs(plots_dir, exist_ok=True)
plt.savefig(plots_dir + "topics_over_time.jpg")
plt.show()