In [None]:
!pip install datasets



In [None]:
from datasets import load_dataset

# Load only the "train" split of the climatebert/climate_sentiment dataset.
# NOTE(review): presumably fetched from the Hugging Face Hub on first run, so
# this cell needs network access — confirm caching behaviour for offline use.
dataset = load_dataset("climatebert/climate_sentiment", split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Peek at the raw records before any cleaning is applied.
dataset[:3]  # Display first 3 samples

{'text': ['− Scope 3: Optional scope that includes indirect emissions associated with the goods and services supply chain produced outside the organization. Included are emissions from the transport of products from our logistics centres to stores (downstream) performed by external logistics operators (air, land and sea transport) as well as the emissions associated with electricity consumption in franchise stores.',
  'The Group is not aware of any noise pollution that could negatively impact the environment, nor is it aware of any impact on biodiversity. With regards to land use, the Group is only a commercial user, and the Group is not aware of any local constraints with regards to water supply. The Group does not believe that it is at risk with regards to climate change in the near-or mid-term.',
  'Global climate change could exacerbate certain of the threats facing our business, including the frequency and severity of weather-related events referred to in Performance of critical 

In [None]:
import re

In [None]:
def clean(text):
    """Normalize raw text into lowercase, space-separated alphabetic words.

    Hyphens, dashes and slashes are converted to spaces *before* the remaining
    non-letter characters are stripped. Deleting them outright would fuse
    compounds into a single pseudo-word (e.g. "weather-related" would become
    "weatherrelated"), which pollutes the vocabulary used for topic modeling.
    All other non-letters (digits, punctuation) are removed, and runs of
    whitespace are collapsed to a single space.

    Args:
        text: The string to normalize.

    Returns:
        The cleaned string containing only the letters a-z separated by
        single spaces, with no leading or trailing whitespace.
    """
    text = text.lower()
    # Word-joining punctuation acts as a separator: ASCII hyphen, the Unicode
    # hyphen/dash range, the minus sign, and the forward slash.
    text = re.sub(r"[-\u2010-\u2015\u2212/]", " ", text)
    # Everything else that is not a lowercase letter or whitespace is dropped.
    text = re.sub(r"[^a-z\s]", "", text)
    # Collapse the whitespace runs left behind by the removals above.
    return " ".join(text.split())

In [None]:
# Keep only the samples whose label is 0 (the risk class) and normalize
# their text with clean() in the same pass.
risk_texts = [
    clean(sample["text"])
    for sample in dataset
    if sample["label"] == 0
]

In [None]:
# Spot-check the first 3 cleaned risk texts.
risk_texts[:3]

['the group is not aware of any noise pollution that could negatively impact the environment nor is it aware of any impact on biodiversity with regards to land use the group is only a commercial user and the group is not aware of any local constraints with regards to water supply the group does not believe that it is at risk with regards to climate change in the nearor midterm',
 'global climate change could exacerbate certain of the threats facing our business including the frequency and severity of weatherrelated events referred to in performance of critical infrastructure in this section  in addition increases in energy prices are partly influenced by government policies to address climate change which combined with a growing data demand that increases our energy requirements could increase our energy costs beyond our current expectations',
 'setting an investment horizon is part and parcel of our policy of focusing on the long term and helping clients to build capital both financia

In [None]:
# For vectorization
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# For topic modeling
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
def train_lda_model(docs, vectorizer_type='count', n_topics=5, max_df=0.95, min_df=2, random_state=42, max_iter=10):
    """Vectorize ``docs`` and fit an LDA topic model on the resulting matrix.

    Args:
        docs: Iterable of document strings handed to the vectorizer.
        vectorizer_type: ``'count'`` to use ``CountVectorizer`` or ``'tfidf'``
            to use ``TfidfVectorizer``.
        n_topics: Number of LDA components (topics) to fit.
        max_df: Forwarded to the vectorizer's ``max_df`` argument.
        min_df: Forwarded to the vectorizer's ``min_df`` argument.
        random_state: Seed forwarded to ``LatentDirichletAllocation``.
        max_iter: Forwarded to ``LatentDirichletAllocation``'s ``max_iter``.
            Defaults to 10, the value that used to be hard-coded here.

    Returns:
        A ``(lda_model, vectorizer, X)`` tuple: the fitted LDA model, the
        fitted vectorizer, and the document-term matrix the model was fit on.

    Raises:
        ValueError: If ``vectorizer_type`` is neither ``'count'`` nor
            ``'tfidf'``.
    """
    # Resolve the vectorizer class first so the shared constructor arguments
    # are spelled out only once.
    if vectorizer_type == 'count':
        vectorizer_cls = CountVectorizer
    elif vectorizer_type == 'tfidf':
        vectorizer_cls = TfidfVectorizer
    else:
        raise ValueError("Choose either 'count' or 'tfidf' for vectorizer_type.")

    vectorizer = vectorizer_cls(max_df=max_df, min_df=min_df, stop_words='english')
    X = vectorizer.fit_transform(docs)

    lda_model = LatentDirichletAllocation(
        n_components=n_topics,
        max_iter=max_iter,
        random_state=random_state,
    )
    lda_model.fit(X)

    return lda_model, vectorizer, X

# Fit one LDA model per vectorizer type on the same cleaned risk texts; only
# vectorizer_type differs, every other argument is left at its default.
lda_model_count, vectorizer_count, X_count = train_lda_model(risk_texts, vectorizer_type='count')
lda_model_tfidf, vectorizer_tfidf, X_tfidf = train_lda_model(risk_texts, vectorizer_type='tfidf')

In [None]:
# Function to display topics, for comparison
def compare_topics(lda_model_1, vect_1, lda_model_2, vect_2, label="Risk", n_words=10):
    """Print the top words of two topic models side by side, topic by topic.

    Topics are paired purely by index: row ``k`` shows component ``k`` of each
    model. Nothing here aligns topics across the two models, so a shared
    index does not imply the topics are related.

    The row labels "CountVectorizer" and "TFIDFVectorizer" are fixed strings;
    they are printed for model 1 and model 2 respectively regardless of which
    vectorizers are actually passed in.

    Args:
        lda_model_1: Fitted model exposing ``n_components`` and ``components_``.
        vect_1: Fitted vectorizer for ``lda_model_1`` exposing
            ``get_feature_names_out()``.
        lda_model_2: Second fitted model; must have at least as many
            components as ``lda_model_1``.
        vect_2: Fitted vectorizer for ``lda_model_2``.
        label: Class name used in the printed header.
        n_words: Number of top-weighted words to show per topic.

    Returns:
        None. The comparison is written to standard output.
    """
    print(f"{label} Class Topics Comparison")

    # Fetch each vocabulary once instead of once per printed word.
    terms_1 = vect_1.get_feature_names_out()
    terms_2 = vect_2.get_feature_names_out()

    def top_words(model, terms, topic_idx):
        # argsort is ascending, so the reversed tail slice yields the n_words
        # highest-weighted term indices in descending order of weight.
        ranked = model.components_[topic_idx].argsort()[:-n_words-1:-1]
        return [terms[i] for i in ranked]

    for idx in range(lda_model_1.n_components):
        topic1_words = top_words(lda_model_1, terms_1, idx)
        topic2_words = top_words(lda_model_2, terms_2, idx)

        print(f"Topic {idx+1}:")
        print(f"  CountVectorizer : {', '.join(topic1_words)}")
        print(f"  TFIDFVectorizer : {', '.join(topic2_words)}\n")

In [None]:
# Print the top words of the count-based and TF-IDF-based LDA models,
# one topic index at a time, under the "Risk" header.
compare_topics(
    lda_model_1=lda_model_count, vect_1=vectorizer_count,
    lda_model_2=lda_model_tfidf, vect_2=vectorizer_tfidf,
    label="Risk"
)

Risk Class Topics Comparison
Topic 1:
  CountVectorizer : risk, risks, management, climate, climaterelated, impact, including, portfolio, group, scenario
  TFIDFVectorizer : risk, risks, climate, change, impact, including, management, energy, increase, impacts

Topic 2:
  CountVectorizer : climate, change, risk, risks, environmental, projects, business, increased, coal, impact
  TFIDFVectorizer : climate, risk, change, risks, carbon, coal, new, emissions, impact, assets

Topic 3:
  CountVectorizer : climate, change, impact, weather, events, risks, physical, extreme, changes, increase
  TFIDFVectorizer : change, risk, climate, impact, risks, physical, increased, production, weather, costs

Topic 4:
  CountVectorizer : climate, change, carbon, risks, energy, risk, emissions, transition, gas, physical
  TFIDFVectorizer : climate, risks, change, risk, carbon, physical, transition, potential, impacts, energy

Topic 5:
  CountVectorizer : risks, risk, clients, transition, physical, climate, 

In [None]:
pip install numpy==1.24.3




In [None]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

def get_topics_words(lda_model, vectorizer, n_words=10):
    """Return the top-weighted words of every topic in ``lda_model``.

    Args:
        lda_model: Fitted model whose ``components_`` rows hold per-term
            weights, one row per topic.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``;
            its terms are indexed by the column positions of ``components_``.
        n_words: How many words to keep per topic.

    Returns:
        A list with one entry per topic; each entry is a list of up to
        ``n_words`` terms ordered from highest to lowest weight.
    """
    vocabulary = vectorizer.get_feature_names_out()
    topics = []
    for weights in lda_model.components_:
        # argsort is ascending; the reversed tail slice picks the heaviest terms first.
        top_indices = weights.argsort()[:-n_words-1:-1]
        topics.append([vocabulary[j] for j in top_indices])
    return topics

# Extract the top words (default n_words=10) of every topic from both models;
# these word lists are the topics later scored for coherence.
lda_topics_count = get_topics_words(lda_model_count, vectorizer_count)
lda_topics_tfidf = get_topics_words(lda_model_tfidf, vectorizer_tfidf)

In [None]:
def compute_coherence(topics, texts):
    """Compute the ``c_v`` coherence of ``topics`` against ``texts``.

    Args:
        topics: List of topics, each given as a list of words.
        texts: Iterable of document strings; each one is tokenized by
            splitting on whitespace.

    Returns:
        The value returned by ``CoherenceModel.get_coherence()``.
    """
    token_lists = [doc.split() for doc in texts]
    vocab = Dictionary(token_lists)
    coherence_model = CoherenceModel(
        topics=topics,
        texts=token_lists,
        dictionary=vocab,
        coherence='c_v',
    )
    return coherence_model.get_coherence()

# Compute Coherence Scores: each model's top words are scored with c_v
# coherence against the same reference corpus (the cleaned risk texts).
coherence_count = compute_coherence(lda_topics_count, risk_texts)
coherence_tfidf = compute_coherence(lda_topics_tfidf, risk_texts)

# Report both scores to four decimal places.
print(f"Coherence Score (CountVectorizer): {coherence_count:.4f}")
print(f"Coherence Score (TFIDFVectorizer): {coherence_tfidf:.4f}")

Coherence Score (CountVectorizer): 0.4533
Coherence Score (TFIDFVectorizer): 0.4083
