In [None]:
import os

## Detect whether the notebook runs on Google Colab and set the project root.
## Defines `in_colab` (bool) and `local_path` (root directory string ending in "/").
try:
    import google.colab
    in_colab = True
    local_path = "/content/drive/MyDrive/DLSS/"
    # local_path lives under this mount point, so Drive has to be mounted first
    google.colab.drive.mount('/content/drive')

except ImportError:
    # google.colab could not be imported: treat this as a local (non-Colab) run
    in_colab = False
    ## get current directory
    current_wd = os.getcwd()
    ## dirname() is applied twice, so the project root is taken to be TWO levels
    ## above the current working directory
    local_path = os.path.dirname(os.path.dirname(current_wd)) + "/"

# NOTE: despite the "CWD" label, the value printed is the derived project root
# (local_path), not os.getcwd()
print("CWD: ", local_path)

import os
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import spacy
from collections import Counter
import matplotlib.pyplot as plt


# Load spaCy's small English pipeline; preprocess_text uses it to tokenize text
nlp = spacy.load('en_core_web_sm')
# Read the stop-word set from the loaded language's defaults rather than relying
# on the spacy.lang.en submodule having been imported as a side effect of
# spacy.load(); for English this is the same STOP_WORDS set.
stop_words = nlp.Defaults.stop_words

In [None]:
def preprocess_text(text):
    """Tokenize ``text`` and return its lower-cased content tokens.

    The text is run through the module-level ``nlp`` pipeline. A token is kept
    only if its surface form is purely alphanumeric and its lower-cased form is
    not in the module-level ``stop_words`` collection.

    Returns:
        list[str]: the kept tokens, lower-cased, in document order.
    """
    kept = []
    for tok in nlp(text):
        surface = tok.text
        # Drop punctuation, whitespace and anything with non-alphanumeric chars
        if not surface.isalnum():
            continue
        lowered = surface.lower()
        if lowered in stop_words:
            continue
        kept.append(lowered)
    return kept

def get_words_from_cluster(df, cluster_number):
    """Collect every preprocessed token from the texts of one cluster.

    Args:
        df: DataFrame with a 'cluster' column and a
            'title_and_text_lemmatized' text column.
        cluster_number: value of 'cluster' selecting the rows to use.

    Returns:
        list: tokens produced by ``preprocess_text`` for each selected text,
        concatenated in row order (duplicates are kept).
    """
    in_cluster = df['cluster'] == cluster_number
    cluster_texts = df.loc[in_cluster, 'title_and_text_lemmatized']
    return [tok for doc_text in cluster_texts for tok in preprocess_text(doc_text)]

def get_embeddings_dict(df):
    """Build a ``word -> embedding vector`` lookup from an embeddings table.

    Args:
        df: DataFrame with a 'word' column and one numeric column per embedding
            dimension. An 'Unnamed: 0' column (the index column pandas adds when
            a CSV was written with its index) is ignored if present and no
            longer required.

    Returns:
        dict: maps each value of 'word' to a 1-D float numpy array holding the
        remaining columns of that row. If a word occurs more than once, the
        last row wins.

    Raises:
        KeyError: if ``df`` has no 'word' column.
        ValueError: if a remaining column cannot be converted to float.
    """
    words = df['word']
    # errors='ignore' keeps this working for CSVs saved without the index column
    vectors = df.drop(columns=['Unnamed: 0', 'word'], errors='ignore').to_numpy(dtype=float)
    # Converting the whole table once replaces the per-row Series.drop() of
    # iterrows(); iterating the 2-D array yields one row vector per word.
    return dict(zip(words, vectors))

def get_text_embedding(text, embeddings_dict):
    """Average the embeddings of the known words in ``text``.

    Args:
        text: whitespace-separated string. Non-string values (for example the
            NaN pandas yields for an empty CSV cell, or None) are treated as
            containing no words instead of raising.
        embeddings_dict: mapping ``word -> 1-D numeric vector``; the vectors are
            expected to share one length.

    Returns:
        numpy.ndarray: element-wise mean of the vectors of all words found in
        ``embeddings_dict``; a zero vector with the embedding dimension when no
        word is found.

    Raises:
        ValueError: if no word is found and ``embeddings_dict`` is empty, since
            the dimension of the zero vector cannot be inferred then.
    """
    # Simple whitespace tokenization; anything that is not a str has no words
    words = text.split() if isinstance(text, str) else []
    # Look each word up once and keep only the hits
    looked_up = (embeddings_dict.get(word) for word in words)
    word_embeddings = [vec for vec in looked_up if vec is not None]
    if word_embeddings:
        return np.mean(word_embeddings, axis=0)
    if not embeddings_dict:
        raise ValueError("embeddings_dict is empty; cannot infer the embedding dimension")
    # Default to a zero vector sized like any stored embedding
    return np.zeros(len(next(iter(embeddings_dict.values()))))
    
def get_top_words(words, num_common=10):
    """Return the most frequent entries of ``words``, most common first.

    Args:
        words: iterable of hashable items (tokens).
        num_common: maximum number of entries to return (default 10).

    Returns:
        list: up to ``num_common`` distinct items ordered by descending count,
        as ranked by ``Counter.most_common``.
    """
    ranked = Counter(words).most_common(num_common)
    return [entry[0] for entry in ranked]

In [None]:
## cluster  
## Number of clusters passed to KMeans (n_clusters) in the clustering loop; it is
## also the number of clusters whose top words are extracted there.
num_clusters = 3  # Adjust based on your needs

In [None]:
# Subgroups (years) to process in this run; range(2010, 2011) covers 2010 only.
list_finetuning_models = list(range(2010, 2011))

for subgroup in list_finetuning_models:
    print(subgroup)

    # Load the preprocessed posts and the word embeddings saved for this subgroup
    data = pd.read_csv(local_path + f"data/preprocessed/posts_{subgroup}.csv")
    df_embeddings = pd.read_csv(local_path + f"output/embeddings/yearly_embeddings/embeddings_CBOW_posts_{subgroup}.csv")
    embeddings_dict = get_embeddings_dict(df_embeddings)

    # Aggregate the word embeddings of each text into a single vector
    data['embedding'] = data['title_and_text_lemmatized'].apply(lambda text: get_text_embedding(text, embeddings_dict))
    # Stack the per-text vectors into one 2-D array for clustering
    X = np.vstack(data['embedding'].values)

    # Fixed random_state keeps the cluster assignment reproducible between runs
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    clusters = kmeans.fit_predict(X)
    data['cluster'] = clusters

    # Most frequent words of every cluster
    cluster_words = []
    for cluster_num in range(num_clusters):
        words_in_cluster = get_words_from_cluster(data, cluster_num)
        top_words = get_top_words(words_in_cluster)
        cluster_words.append({'cluster': cluster_num, 'top_10_words': top_words})

    df_top_words = pd.DataFrame(cluster_words)
    # 'climate' and 'change' are removed AFTER the top-10 selection, so a cluster
    # can end up with fewer than 10 words in 'top_10_words'.
    df_top_words['top_10_words'] = df_top_words['top_10_words'].apply(lambda x: [word for word in x if word not in ['climate', 'change']])

    # Create the directory the CSV is actually written to. The previous code
    # created "data/output/topics/" relative to the working directory, which
    # is not where to_csv() writes, so the write failed on a fresh checkout.
    output_dir = local_path + "output/topic_modelling/"
    os.makedirs(output_dir, exist_ok=True)
    df_top_words.to_csv(output_dir + f"topics_{subgroup}.csv")
    print(df_top_words)


In [None]:

# Original data: topic labels per year, three entries for each year 2010-2022
data = {
    'Year': [2010, 2010, 2010, 2011, 2011, 2011, 2012, 2012, 2012, 2013, 2013, 2013, 2014, 2014, 2014, 2015, 2015, 2015, 
             2016, 2016, 2016, 2017, 2017, 2017, 2018, 2018, 2018, 2019, 2019, 2019, 2020, 2020, 2020, 2021, 2021, 2021, 
             2022, 2022, 2022],
    'Topic': ['Climate Change and Scientific Reporting', 'Global Energy and Environmental Science', 
              'Global Impact of Climate Change', 'Climate Change and Scientific Reporting', 
              'Causes and Scientific Study of Global Warming', 'Climate Change and Scientific Reporting', 
              'Global Impact and Scientific Analysis of Hurricanes', 'Scientific Studies on Global Causes and Weather Events', 
              'Global Warming and Scientific Understanding', 'Global Warming and Scientific Studies', 
              'Scientific Action Plans and Climate Change', 'Climate Action and Denial in Global Reports', 
              'Global Warming and Its Impact on People', 'Climate Change and Scientific Reporting', 
              'Climate Action and Denial in Global Reports', 'Climate Change and Scientific Reporting', 
              'Climate Action and Global Leadership', 'Climate Action and Global Leadership', 
              'Climate Change and Scientific Reporting', 'Climate Action and Global Leadership', 
              'Climate Action and Global Leadership', 'Climate Change and Scientific Reporting', 
              'Climate Action and U.S. Leadership', 'Climate Change and Scientific Reporting', 
              'Climate Action and U.S. Leadership', 'Climate Action and Global Leadership', 
              'Climate Action and U.S. Leadership', 'Climate Action and Public Awareness', 
              'Global Action and Leadership', 'Climate Action and Global Protest', 
              'Global Action and Leadership', 'Climate Action and U.S. Leadership', 'Covid', 
              'Climate Action', 'Global Leadership and Climate Action', 
              'Climate Action and U.S. Leadership', 'Climate Action', 
              'Global Leadership and Climate Action', 'Climate Action and U.S. Leadership']
}

df = pd.DataFrame(data)

# Mapping original topics to 3 overarching topics
topic_mapping = {
    'Climate Change and Scientific Reporting': 'Climate Change and Scientific Reporting',
    'Global Warming and Scientific Studies': 'Climate Change and Scientific Reporting',
    'Global Impact of Climate Change': 'Climate Change and Scientific Reporting',
    'Causes and Scientific Study of Global Warming': 'Climate Change and Scientific Reporting',
    'Global Warming and Scientific Understanding': 'Climate Change and Scientific Reporting',
    'Global Energy and Environmental Science': 'Climate Change and Scientific Reporting',
    
    'Climate Action and Global Leadership': 'Climate Action and Global Leadership',
    'Climate Action and U.S. Leadership': 'Climate Action and Global Leadership',
    'Global Leadership and Climate Action': 'Climate Action and Global Leadership',
    'Scientific Action Plans and Climate Change': 'Climate Action and Global Leadership',
    'Climate Action and Public Awareness': 'Climate Action and Global Leadership',
    'Global Action and Leadership': 'Climate Action and Global Leadership',
    'Climate Action and Global Protest': 'Climate Action and Global Leadership',
        
    'Covid': 'Catastrophes',
    'Global Impact and Scientific Analysis of Hurricanes': 'Catastrophes',
    'Scientific Studies on Global Causes and Weather Events': 'Catastrophes',
}

# Applying the mapping; labels that are not keys of topic_mapping become NaN
df['Overarching Topic'] = df['Topic'].map(topic_mapping)

# groupby() drops rows whose key is NaN, so unmapped labels would vanish from the
# counts without any trace. Report them so the omission is visible; the counts
# themselves are left unchanged.
unmapped_topics = sorted(df.loc[df['Overarching Topic'].isna(), 'Topic'].unique())
if unmapped_topics:
    print(f"WARNING: {len(unmapped_topics)} topic label(s) have no overarching topic "
          f"and are excluded from the trend counts: {unmapped_topics}")

# Count occurrences of each overarching topic by year (rows: years, columns: topics)
topic_trend = df.groupby(['Year', 'Overarching Topic']).size().unstack().fillna(0)

# Plotting the trends over time: one line per overarching topic
plt.figure(figsize=(14, 8))
topic_trend.plot(kind='line', marker='o', ax=plt.gca())
plt.xlabel('')  # Clear the x-axis label; the year ticks are self-explanatory
plt.ylabel('Frequency', fontsize=25)  # Larger font for the y-axis label
plt.xticks(range(topic_trend.index.min(), topic_trend.index.max() + 1, 1), size = 20)  # X-axis steps of 1 year
plt.yticks(range(0, int(topic_trend.values.max()) + 2, 1), size = 25)  # Y-axis steps of 1
plt.legend(title='Topic', bbox_to_anchor=(0.5, 0.975), loc='upper center', ncol=3, fontsize=15, title_fontsize=20)
plt.grid(True)
plt.tight_layout()
# savefig() does not create missing directories, so make sure the target exists
plots_dir = local_path + "plots/"
os.makedirs(plots_dir, exist_ok=True)
plt.savefig(plots_dir + "topics_over_time.jpg")
plt.show()