In [None]:
%%capture
!pip install faiss-cpu sentence-transformers scikit-learn
# !pip install safetensors
!pip install datasets
# !pip install datashader
# !pip install adjustText
!pip install bertopic


# 2025 COMP90042 Project

Group 24 — FAISS + SimCSE preprocessing pipeline



# 1.DataSet Processing
(You can add as many code blocks and text blocks as you need. However, YOU SHOULD NOT MODIFY the section title)

### FAISS Clustering for Climate Science Evidence

This notebook performs two rounds of FAISS clustering to retrieve approximately 32,000 evidence texts related to climate science. This step is still part of the preprocessing pipeline.

### Final output

- evidence_32809.json

### FAISS Clustering Experiment Results

To select relevant evidence for climate science, we applied similarity thresholding on FAISS clustering results. The goal was to reduce the total number of evidence texts while retaining as much gold evidence from the development set as possible.

The table below summarizes the results. The first column shows the similarity threshold used, the second column indicates the number of remaining evidence texts after filtering, and the third column represents the proportion of dev gold evidence retained at each threshold.

| Similarity Threshold | Remaining Evidence Count | Dev Gold Evidence Retained |
|----------------------|--------------------------|-----------------------------|
| 0.7                  | 5,834                    | 0.50                        |
| 0.6                  | 14,492                   | 0.68                        |
| 0.5                  | 32,809                   | 0.80                        |
| 0.4                  | 69,678                   | 0.85                        |
| 0.3                  | 132,507                  | 0.88                        |
| 0.2                  | 144,633                  | 0.89                        |
| 0.1                  | 147,074                  | 0.89                        |

Based on this experiment, we selected the threshold of 0.5, resulting in 32,809 evidence texts, which retains 80% of the dev gold evidence. This trade-off was considered optimal for further preprocessing and model training.

### Topic Validation with BERTopic

After completing two rounds of FAISS clustering and selecting 32,809 evidence texts, we applied BERTopic to analyze the topic distribution within the filtered evidence set.

The goal was to verify whether the filtered evidence predominantly reflects climate science content, thereby validating the effectiveness of the FAISS-based filtering.

BERTopic generated interpretable topics, and a significant number of them were clearly related to climate science themes such as:

- Climate change impacts
- Global warming and temperature trends
- CO₂ emissions and greenhouse gases
- Renewable energy and climate policy
- Scientific consensus on climate change

This confirms that the FAISS clustering and similarity thresholding process successfully filtered out unrelated content and preserved a high concentration of climate science–relevant evidence.

### Upload Data Files

Upload the following files into the `data` folder:

- `train-claims.json`
- `dev-claims.json`
- `test-claims-unlabelled.json`
- `evidence.json`
- `evidence_perplexity.json` (generated by running `DataSet Processing_PPL.ipynb`)

In [None]:
import json
import csv
import random
import os

# Make sure the 'log' and 'data' folders are present; folders that already
# exist are left as they are.
for folder in ('log', 'data'):
    os.makedirs(folder, exist_ok=True)

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
import faiss
import numpy as np
import json
from sentence_transformers.util import cos_sim
from bertopic import BERTopic
from datasets import load_dataset
import numpy as np
import torch
import ast
import pandas as pd


import json
import re
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import nltk
from sentence_transformers import SentenceTransformer
import collections
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer

### Output Files

- `evidence_subset_train.json`: Contains gold evidence from train-claims in the format `{evidence_id: text}`
- `claims.json`: Contains the claims of all three splits (train, dev, test) in the format `{claim_id: {"claim_text": text}}`

In [None]:
# Input files (read from the `data` folder) and the files written by this cell.
train_json_path = "data/train-claims.json"      # claim & evidence
dev_json_path = "data/dev-claims.json"      # claim & evidence
test_json_path = "data/test-claims-unlabelled.json"  # claim
evidence_json_path = "data/evidence.json"  # evidence
output_evidence_set_path = "data/evidence_subset_train.json"
output_claim_set_path = "data/claims.json"

# output_dev_emb_path = "local_data/dev-embed-1.json"

with open(train_json_path, "r", encoding="utf-8") as f:
    train_data = json.load(f)
with open(dev_json_path, "r", encoding="utf-8") as f:
    dev_data = json.load(f)
with open(test_json_path, "r", encoding="utf-8") as f:
    test_data = json.load(f)
with open(evidence_json_path, "r", encoding="utf-8") as f:
    evidence_data = json.load(f)

# Only the training claims supply gold evidence here. The train+dev merge is
# left disabled -- presumably so that the dev gold evidence can be used later
# as an independent retention check.
# merged_data = {**train_data, **dev_data}
merged_data = train_data

# Gold evidence cited by the claims, stored as {evidence_id: evidence_text}.
evicence_set = {}
for claim_id, claim_info in merged_data.items():
    for pos_id in claim_info["evidences"]:
        if pos_id not in evidence_data:
            # A cited id that is missing from the evidence file is reported
            # and skipped instead of aborting the whole cell.
            print(f"Warning: Evidence ID {pos_id} not found in evidence data.")
            continue
        evicence_set[pos_id] = evidence_data[pos_id]

print(len(evicence_set))
# Save the evidence set to a JSON file
with open(output_evidence_set_path, "w", encoding="utf-8") as f:
    json.dump(evicence_set, f, ensure_ascii=False, indent=4)


claim_set = {}

def get_claims(data):
    """Reduce a claims mapping to the claim text of each entry.

    Args:
        data: dict keyed by claim id; every value is a dict that has a
            "claim_text" entry (other entries are ignored).

    Returns:
        A new dict of the form {claim_id: {"claim_text": text}}, with the
        keys in the same order as in ``data``.
    """
    return {
        cid: {"claim_text": info["claim_text"]}
        for cid, info in data.items()
    }

# Reduce each split to {claim_id: {"claim_text": ...}}.
train_claim_set = get_claims(train_data)
dev_claim_set = get_claims(dev_data)
test_claim_set = get_claims(test_data)

# Combine the three splits into one mapping. If a claim id occurs in more
# than one split, the later split (train -> dev -> test) overwrites it.
claim_set = dict(train_claim_set)
claim_set.update(dev_claim_set)
claim_set.update(test_claim_set)
print(len(claim_set))

# Write the combined claims to disk as JSON.
with open(output_claim_set_path, "w", encoding="utf-8") as out_file:
    json.dump(claim_set, out_file, ensure_ascii=False, indent=4)

3121
1535


In [None]:
# Read back the combined claims: {claim_id: {"claim_text": text}}.
with open("data/claims.json", "r", encoding="utf-8") as claims_file:
    claims = json.load(claims_file)
print(len(claims)) #1535

# Plain list of the claim sentences, in file order.
claim_text = []
for entry in claims.values():
    claim_text.append(entry["claim_text"])
claim_text[0]

1535


'Not only is there no scientific evidence that CO2 is a pollutant, higher CO2 concentrations actually help ecosystems support more plant and animal life.'

In [None]:
# Dev claims: {claim_id: claim record}; each record carries an "evidences" list.
with open("data/dev-claims.json", "r", encoding="utf-8") as dev_file:
    dev_claims = json.load(dev_file)
print(len(dev_claims))

# Distinct gold evidence ids cited by any dev claim.
dev_id = list({
    evidence_id
    for record in dev_claims.values()
    for evidence_id in record["evidences"]
})
print(len(dev_id))
print(dev_id[0:5])

154
463
['evidence-462483', 'evidence-127999', 'evidence-1052408', 'evidence-142343', 'evidence-131135']


In [None]:
# Gold evidence of the training claims: {evidence_id: text}.
with open("data/evidence_subset_train.json", "r", encoding="utf-8") as gold_file:
    evidence_subset_train = json.load(gold_file)
print(len(evidence_subset_train)) #3121

# Both lists come from the same dict, so entry i of one belongs to entry i
# of the other.
gold_texts = [*evidence_subset_train.values()]
gold_id = [*evidence_subset_train]
gold_id[0:5]

3121


['evidence-442946',
 'evidence-1194317',
 'evidence-12171',
 'evidence-338219',
 'evidence-1127398']

In [None]:
# csv_path = "gold_bertopic.csv"
# df = pd.read_csv(csv_path)

# keywords = []

# for val in df["Representation"]:
#     try:
#         keyword_list = ast.literal_eval(val)
#         keywords.extend([kw.strip(" '\"") for kw in keyword_list])
#     except Exception as e:
#         print("skip:", val)
#         continue

# print(keywords)
# print(f"Keyword: {len(keywords)}")

['climate', 'global', 'carbon', 'change', 'warming', 'dioxide', 'temperature', 'emissions', 'temperatures', 'greenhouse', 'human', 'scientific', 'climate', 'warming', 'greenhouse', 'change', 'global', 'gases', 'carbon', 'activities', 'period', 'temperature', 'years', 'age', 'century', 'ago', 'warming', 'methane', 'global', 'ice', 'ice', 'sea', 'arctic', 'sheet', 'greenland', 'melting', 'antarctic', 'mass', 'glaciers', 'antarctica', 'extreme', 'droughts', 'weather', 'heat', 'hurricane', 'drought', 'waves', 'events', 'flooding', 'floods', 'atmosphere', 'radiation', 'infrared', 'surface', 'water', 'air', 'temperature', 'greenhouse', 'earth', 'effect', 'warmest', 'record', 'month', 'january', 'recorded', 'average', 'temperature', 'winter', 'february', 'hottest', 'solar', 'orbit', 'sun', 'earth', 'cosmic', 'orbital', 'years', 'changes', 'variation', 'period', 'ocean', 'acidification', 'oceans', 'carbon', 'dioxide', 'marine', 'organisms', 'decrease', 'corals', 'atmosphere', 'energy', 'power'

In [None]:
import re
import nltk

from nltk.tokenize import word_tokenize
# Fetch the NLTK resources (presumably what word_tokenize below relies on).
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

def keep_common_tokens(text):
    """Tokenize ``text`` and keep only the plain tokens, lower-cased.

    A token is kept when it is made up solely of ASCII letters, digits,
    ``#``, ``-`` or ``_``; every other token (punctuation, or a word that
    contains any other character) is dropped.

    Returns:
        The kept tokens, lower-cased and joined by single spaces.
    """
    is_plain = re.compile(r'^[a-zA-Z0-9#\-_]+$').match
    kept = []
    for token in word_tokenize(text):
        if is_plain(token):
            kept.append(token.lower())
    return ' '.join(kept)

def normalize_co2(text):
    """Replace spellings of CO2 with the phrase "carbon dioxide".

    Matches a standalone "c" + "o" (or the digit zero) + "2" in either
    case, with an optional single whitespace character or hyphen before
    the "2" (e.g. "CO2", "co-2", "c02", "Co 2"). Occurrences embedded in a
    longer word (e.g. "xco2", "co2e") are left untouched.
    """
    text = re.sub(r'\b[cC][oO0][\s\-]?2\b', "carbon dioxide", text)
    return text

# Abbreviation -> expansion pairs applied by normalize_climate_terms.
CLIMATE_TERMS = {
    "ghg": "greenhouse gases",
    "ghgs": "greenhouse gases"
}

def normalize_climate_terms(text):
    """Expand CO2 spellings and the abbreviations listed in CLIMATE_TERMS.

    Abbreviations are matched as whole words and, like the CO2 spellings,
    regardless of letter case, so "GHG", "Ghg" and "ghg" are all expanded.

    Args:
        text: the string to normalise.

    Returns:
        ``text`` with every match replaced by its expansion.
    """
    text = normalize_co2(text)
    for key, val in CLIMATE_TERMS.items():
        # \b on both sides keeps "ghg" from matching inside "ghgs" or any
        # other longer word.
        text = re.sub(rf'\b{re.escape(key)}\b', val, text, flags=re.IGNORECASE)
    return text

# Quick check of the two cleaning steps on a made-up sentence.
text = 'CO2 Co2 For a ghgs light-weight co2 installation without transformers, ghg UMAP and/or HDBSCAN (for training with Model2Vec or perhaps for inference), see this tutorial.'

cleaned_demo = keep_common_tokens(text)
print(normalize_climate_terms(cleaned_demo))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


carbon dioxide carbon dioxide for a greenhouse gases light-weight carbon dioxide installation without transformers greenhouse gases umap hdbscan for training with model2vec or perhaps for inference see this tutorial


In [None]:
with open("data/evidence_perplexity.json", "r", encoding="utf-8") as f:
    evidence_120 = json.load(f)

# Evidence is kept only when its perplexity is strictly below this value.
PPL_THRESHOLD = 85

# Set copy of the gold ids: the membership test below runs once per evidence
# entry, and a list lookup would rescan all gold ids every time.
gold_id_lookup = set(gold_id)

original_docs = []   # raw text of every kept evidence
filtered_docs = []   # cleaned text, same order as original_docs
filtered_ids = []    # evidence ids, same order
filtered_ppl = []    # perplexity values, same order
for eid, data in evidence_120.items():
    # Gold evidence is left out here; it is added back unconditionally after
    # the similarity search.
    if eid in gold_id_lookup:
        continue
    text = data.get("text", "").strip()
    ppl_val = data.get("ppl", None)
    # Check the perplexity before tokenising so rejected entries cost nothing.
    # Written as "not <" so that a NaN perplexity is rejected as well.
    if not text or ppl_val is None or not ppl_val < PPL_THRESHOLD:
        continue
    clean_text = normalize_climate_terms(keep_common_tokens(text))
    if clean_text:
        original_docs.append(text)
        filtered_docs.append(clean_text)
        filtered_ids.append(eid)
        filtered_ppl.append(ppl_val)

print(f"Original docs: {len(evidence_120)}")
print(f"Filtered docs: {len(filtered_docs)}, {len(filtered_docs) / len(evidence_120)}")

Original docs: 1208827
Filtered docs: 955534, 0.7904638132669108


In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

# Sentence encoder used for the query and evidence embeddings below.
embedding_model = SentenceTransformer(
    'sentence-transformers/all-MiniLM-L6-v2',
    device=device,
)

cuda


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Queries for the similarity search: the gold evidence texts followed by the
# claim sentences. Rows are L2-normalised after encoding.
query_texts = [*gold_texts, *claim_text]
query_embeddings_all = normalize(
    embedding_model.encode(query_texts, batch_size=512, show_progress_bar=True)
)

# gold_embeddings = query_embeddings_all[:len(gold_texts)]
# keyword_embeddings = query_embeddings_all[len(gold_texts):]

Batches:   0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# Aliases for the filtered corpus and its ids.
evidence_texts, evidence_ids = filtered_docs, filtered_ids

# Embed the cleaned evidence texts.
embeddings = embedding_model.encode(
    filtered_docs,
    batch_size=512,
    device=device,
    show_progress_bar=True,
)

# Store the matrix on disk so that it can be reloaded without re-encoding.
np.save('embeddings.npy', embeddings)

Batches:   0%|          | 0/1867 [00:00<?, ?it/s]

In [None]:
# Read the evidence embeddings back from the file written by the previous cell.
embeddings = np.load('embeddings.npy')

In [None]:
# float32, C-contiguous copy of the embeddings with every row scaled to unit
# length, so the inner product used by the index equals cosine similarity.
# (`embeddings` is already a 2-D matrix, so no stacking step is needed.)
all_embeddings = normalize(np.ascontiguousarray(embeddings, dtype="float32"))

d = all_embeddings.shape[1]  # embedding dimensionality

# Inverted-file index: a flat inner-product quantizer assigns the vectors to
# `nlist` lists.
quantizer = faiss.IndexFlatIP(d)
nlist = 1024

cpu_index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
cpu_index.train(all_embeddings)
cpu_index.add(all_embeddings)
# NOTE(review): nprobe is never set, so each search visits only the FAISS
# default number of lists (1, as far as the FAISS docs state). Raising
# cpu_index.nprobe would improve recall but would also change the counts
# reported in this notebook -- confirm before changing.


In [None]:
# Ask the index for up to k=30000 neighbours of every query vector.
# D holds the scores and I the matching row positions; the next cell uses
# the positions to look up entries of filtered_ids.
D, I = cpu_index.search(query_embeddings_all, k=30000)

In [None]:
threshold = 0.3 #0.2 25000

# Collect the row positions of all hits scoring above the threshold. Each
# query row is filtered with NumPy masks rather than a Python loop over
# every single score. Negative positions are skipped: FAISS pads rows that
# have fewer than k neighbours with -1, which would otherwise address the
# last entry of filtered_ids.
hit_rows = set()
for scores, ids in zip(D, I):
    keep = (scores > threshold) & (ids >= 0)
    hit_rows.update(ids[keep].tolist())

# The gold evidence of the training claims is always included.
retrieved_ids = list({filtered_ids[row] for row in hit_rows} | set(gold_id))
print(f"Documents above threshold {threshold}: {len(retrieved_ids)}")

# Share of the dev gold evidence that survived the filtering.
overlap_dev = set(dev_id).intersection(retrieved_ids)
print(f"Documents in dev set: {len(overlap_dev)}, {len(overlap_dev) / len(dev_id)}")

Documents above threshold 0.3: 155934
Documents in dev set: 418, 0.9028077753779697


In [None]:
with open("data/evidence_perplexity.json", "r", encoding="utf-8") as src:
    evidence_120 = json.load(src)

# {evidence_id: original text} for every id kept by the first round.
results = {eid: evidence_120[eid]["text"] for eid in retrieved_ids}

print(len(results))

# The number in the file name is a fixed label; the actual entry count is
# the value printed above.
with open("data/evidence_164100.json", "w", encoding="utf-8") as dst:
    json.dump(results, dst, ensure_ascii=False, indent=2)

155934


In [None]:
# Drop the first-round embedding matrices; they are rebuilt for round two.
del all_embeddings, embeddings

In [None]:
import gc
# Run a garbage-collection pass right after the large arrays were deleted.
gc.collect()

41

### Second round of FAISS filtering

In [None]:
# Output of the first round: {evidence_id: text}.
with open("data/evidence_164100.json", "r", encoding="utf-8") as src:
    evidence_120 = json.load(src)

# Parallel lists: raw text, cleaned text and id of each usable evidence.
original_docs, filtered_docs, filtered_ids = [], [], []
for eid, raw in evidence_120.items():
    text = raw.strip()
    if not text:
        continue
    clean_text = normalize_climate_terms(keep_common_tokens(text))
    if not clean_text:
        continue
    original_docs.append(text)
    filtered_docs.append(clean_text)
    filtered_ids.append(eid)


print(f"Original docs: {len(evidence_120)}")
print(f"Filtered docs: {len(filtered_docs)}, {len(filtered_docs) / len(evidence_120)}")

Original docs: 155934
Filtered docs: 155934, 1.0


In [None]:
# Same queries as in round one (gold evidence texts, then claim sentences),
# encoded again and scaled to unit length row by row.
query_texts = [*gold_texts, *claim_text]
raw_query_embeddings = embedding_model.encode(
    query_texts,
    batch_size=512,
    show_progress_bar=True,
)
query_embeddings_all = normalize(raw_query_embeddings)

Batches:   0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
# Aliases for the second-round corpus and its ids.
evidence_texts, evidence_ids = filtered_docs, filtered_ids

# Embed the cleaned texts that survived the first round.
embeddings = embedding_model.encode(
    filtered_docs,
    batch_size=512,
    device=device,
    show_progress_bar=True,
)

# Overwrites the embeddings file of the first round.
np.save('embeddings.npy', embeddings)

Batches:   0%|          | 0/305 [00:00<?, ?it/s]

In [None]:
# Read the second-round embeddings back from the file written just above.
embeddings = np.load('embeddings.npy')

In [None]:
# float32, C-contiguous copy of the embeddings with every row scaled to unit
# length, so the inner product used by the index equals cosine similarity.
# (`embeddings` is already a 2-D matrix, so no stacking step is needed.)
all_embeddings = normalize(np.ascontiguousarray(embeddings, dtype="float32"))

d = all_embeddings.shape[1]  # embedding dimensionality

# Inverted-file index over the smaller second-round corpus, hence fewer
# lists than in round one.
quantizer = faiss.IndexFlatIP(d)
nlist = 400

cpu_index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
cpu_index.train(all_embeddings)
cpu_index.add(all_embeddings)
# NOTE(review): nprobe is never set, so each search visits only the FAISS
# default number of lists (1, as far as the FAISS docs state). Raising
# cpu_index.nprobe would improve recall but would also change the counts
# reported in this notebook -- confirm before changing.


In [None]:
# Ask the second-round index for up to k=10000 neighbours of every query.
# D holds the scores and I the matching row positions; the next cell uses
# the positions to look up entries of filtered_ids.
D, I = cpu_index.search(query_embeddings_all, k=10000)

In [None]:
threshold = 0.5 #0.5 0.80

# Collect the row positions of all hits scoring above the threshold. Each
# query row is filtered with NumPy masks rather than a Python loop over
# every single score. Negative positions are skipped: FAISS pads rows that
# have fewer than k neighbours with -1, which would otherwise address the
# last entry of filtered_ids.
hit_rows = set()
for scores, ids in zip(D, I):
    keep = (scores > threshold) & (ids >= 0)
    hit_rows.update(ids[keep].tolist())

# The gold evidence of the training claims is always included.
retrieved_ids = list({filtered_ids[row] for row in hit_rows} | set(gold_id))
print(f"Documents above threshold {threshold}: {len(retrieved_ids)}")

# Share of the dev gold evidence that survived the filtering.
overlap_dev = set(dev_id).intersection(retrieved_ids)
print(f"Documents in dev set: {len(overlap_dev)}, {len(overlap_dev) / len(dev_id)}")

# Recorded results (threshold: remaining evidence count, dev gold retained)
# 0.7: 5834 0.50
# 0.6: 14492 0.68
# 0.5: 32809 0.8
# 0.4: 69678 0.85
# 0.3: 132507 0.88
# 0.2: 144633 0.89
# 0.1: 147074 0.89

Documents above threshold 0.5: 31119
Documents in dev set: 365, 0.7883369330453563


In [None]:
# Look the retained ids up in the full evidence collection. The file is read
# from the `data` folder, where the notebook asks for all inputs to be
# uploaded and where the earlier cells read it from.
with open("data/evidence.json", "r", encoding="utf-8") as f:
    evidence_120 = json.load(f)

# {evidence_id: original text} for every id kept by the second round.
results = {}
for eid in retrieved_ids:
    results[eid] = evidence_120[eid]

print(len(results))

# The number in the file name is a fixed label taken from the recorded
# experiment table; the actual entry count is the value printed above.
with open("evidence_32809.json", "w", encoding="utf-8") as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

In [None]:
# Evidence texts only (ids dropped), in the order of `results`.
result_text = list(results.values())
result_text[0:3]

['More recent studies of seafloor microbes cast considerable doubt on that; one study in 2012 reduced the calculated microbial biomass on the seafloor from the original 303 billions of tonnes of C to just 4.1 billions of tonnes of C, reducing the global biomass of prokaryotes to 50 to 250 billions of tonnes of C. Further, if the average per-cell biomass of prokaryotes is reduced from 86 to 14 femtograms C, then the global biomass of prokaryotes was reduced to 13 to 44.5 billions of tonnes of C, equal to between 2.4% and 8.1% of the carbon in plants.',
 'They include scientist Gregor Mendel, who made epochal pea plant experiments, composer Leoš Janáček, and writer Milan Kundera.',
 'Much of Texas was also placed under a burn ban.']

### Topic Validation with BERTopic

After completing two rounds of FAISS clustering and selecting 32,809 evidence texts, we applied BERTopic to analyze the topic distribution within the filtered evidence set.

The goal was to verify whether the filtered evidence predominantly reflects climate science content, thereby validating the effectiveness of the FAISS-based filtering.

BERTopic generated interpretable topics, and a significant number of them were clearly related to climate science themes such as:

- Climate change impacts
- Global warming and temperature trends
- CO₂ emissions and greenhouse gases
- Renewable energy and climate policy
- Scientific consensus on climate change

This confirms that the FAISS clustering and similarity thresholding process successfully filtered out unrelated content and preserved a high concentration of climate science–relevant evidence.

In [None]:
import collections
from sklearn.feature_extraction.text import CountVectorizer

# Extract vocab to be used in BERTopic
# Words seen fewer than MIN_WORD_FREQ times across all documents are left out
# of the vocabulary; the messages below report the same cut-off.
MIN_WORD_FREQ = 30

vocab_counter = collections.Counter()
tokenizer = CountVectorizer().build_tokenizer()
for doc in tqdm(result_text):
    vocab_counter.update(tokenizer(doc))
vocab = [word for word, frequency in vocab_counter.items() if frequency >= MIN_WORD_FREQ]
low_freq_words = {word: freq for word, freq in vocab_counter.items() if freq < MIN_WORD_FREQ}

print(f"📌 Total vocab size: {len(vocab_counter)}")
print(f"🧊 Low frequency words (<{MIN_WORD_FREQ}): {len(low_freq_words)}")
print(f"📊 Ratio of low-freq words: {len(low_freq_words) / len(vocab_counter):.2%}")

# Show at most 100 of the excluded words as a spot check.
print(f"\n🧾 Examples of low-frequency words (freq < {MIN_WORD_FREQ}):")
for word, freq in list(low_freq_words.items())[:100]:
    print(f"{word}: {freq}")

100%|██████████| 155934/155934 [00:01<00:00, 78320.43it/s]


📌 Total vocab size: 126155
🧊 Low frequency words (<20): 117026
📊 Ratio of low-freq words: 92.76%

🧾 Examples of low-frequency words (freq < 20):
seafloor: 25
303: 9
prokaryotes: 4
femtograms: 1
Gregor: 6
Mendel: 4
epochal: 1
pea: 8
Leoš: 2
Janáček: 2
Milan: 12
Kundera: 1
mints: 2
coin: 25
canceled: 26
Joaquin: 19
Kalbar: 1
Boonah: 1
tabloid: 22
Aftonbladet: 2
Expressen: 1
disturb: 10
Bamiyan: 2
litany: 1
illiteracy: 9
Assuming: 12
stringent: 26
finale: 6
annulus: 7
CI: 8
Warsaw: 26
Rzeszów: 1
Katowice: 3
Bydgoszcz: 1
Poznań: 6
Bucharest: 13
Waterville: 7
Augusta: 15
329: 5
deficient: 26
classics: 15
Pike: 23
SEPTA: 10
Izard: 1
Daraa: 4
FSA: 8
Governorate: 8
illusion: 11
cues: 13
1285: 4
Edw: 2
submontane: 1
Shading: 1
Concepts: 4
manufactures: 13
sells: 20
solarium: 1
draperies: 1
Elections: 23
exploratory: 10
systematically: 26
Bray: 4
Downside: 1
Somerset: 27
unobstructed: 3
Cost: 15
inappropriate: 19
monetize: 2
637: 4
Tocantis: 1
conglomerates: 2
oilseed: 3
curcas: 2
JC: 5
communis

In [None]:
import torch
from cuml.manifold import UMAP
from cuml.cluster import HDBSCAN
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

results_doc = result_text
print(f"Total docs: {len(results_doc)}, {len(results)}")

# Earlier settings, kept for reference:
# umap_model = UMAP(n_components=10, n_neighbors=20, random_state=42, metric="cosine", verbose=True)
# hdbscan_model = HDBSCAN(min_samples=15, gen_min_span_tree=True, prediction_data=False, min_cluster_size=20, verbose=True)

# Dimensionality reduction step (cuML UMAP).
# NOTE(review): no random_state is passed, so topic assignments may vary
# between runs -- confirm whether that is intended.
umap_settings = dict(
    n_components=6,
    n_neighbors=40,
    metric="cosine",
    verbose=True,
)
umap_model = UMAP(**umap_settings)

# Clustering step (cuML HDBSCAN).
hdbscan_settings = dict(
    min_samples=5,
    min_cluster_size=50,
    prediction_data=True,
    gen_min_span_tree=True,
    verbose=True,
)
hdbscan_model = HDBSCAN(**hdbscan_settings)

# Topic words are counted over the fixed `vocab` built in the previous cell,
# with English stop words ignored.
vectorizer_model = CountVectorizer(vocabulary=vocab, stop_words="english")

# Build the topic model and fit it on the evidence texts. Fitting embeds the
# documents, reduces the embeddings with UMAP and clusters them with HDBSCAN.
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    verbose=True,
)
topic_model = topic_model.fit(results_doc)

2025-05-19 05:55:08,595 - BERTopic - Embedding - Transforming documents to embeddings.


Total docs: 155934, 155934


Batches:   0%|          | 0/4873 [00:00<?, ?it/s]

2025-05-19 05:56:25,492 - BERTopic - Embedding - Completed ✓
2025-05-19 05:56:25,494 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


[2025-05-19 05:56:25.686] [CUML] [info] Building knn graph using nn descent
[2025-05-19 05:56:26.242] [CUML] [debug] n_neighbors=40
[2025-05-19 05:56:26.245] [CUML] [debug] Calling knn graph run
[2025-05-19 05:56:36.092] [CUML] [debug] Done. Calling fuzzy simplicial set
[2025-05-19 05:56:36.200] [CUML] [debug] Done. Calling remove zeros
[2025-05-19 05:56:39.017] [CUML] [info] Transform can only be run with brute force. Using brute force.
[2025-05-19 05:56:39.017] [CUML] [debug] Running transform
[2025-05-19 05:56:39.017] [CUML] [debug] Building KNN Graph
[2025-05-19 05:56:45.276] [CUML] [debug] Smoothing KNN distances
[2025-05-19 05:56:45.291] [CUML] [debug] Executing fuzzy simplicial set
[2025-05-19 05:56:45.297] [CUML] [debug] Performing L1 normalization
[2025-05-19 05:56:45.350] [CUML] [debug] n_epochs=30
[2025-05-19 05:56:45.375] [CUML] [debug] Computing # of epochs for training each sample
[2025-05-19 05:56:45.376] [CUML] [debug] Performing optimization


2025-05-19 05:56:45,480 - BERTopic - Dimensionality - Completed ✓
2025-05-19 05:56:45,487 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-05-19 05:56:53,742 - BERTopic - Cluster - Completed ✓
2025-05-19 05:56:53,778 - BERTopic - Representation - Fine-tuning topics using representation models.
  idf = np.log((avg_nr_samples / df) + 1)
2025-05-19 05:56:57,303 - BERTopic - Representation - Completed ✓


In [None]:
# Show the per-topic overview table (Topic, Count, Name, Representation,
# Representative_Docs); topic -1 is the first row in the recorded output.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,98313,-1_climate_water_change_energy,"[climate, water, change, energy, global, unive...","[More specifically, emissions from farms, such..."
1,0,2118,0_species_extinction_threatened_endangered,"[species, extinction, threatened, endangered, ...",[It is a critically endangered species threate...
2,1,1902,1_energy_solar_wind_renewable,"[energy, solar, wind, renewable, power, electr...",[Wind power harnesses the power of the wind to...
3,2,1492,2_forest_forests_trees_subtropical,"[forest, forests, trees, subtropical, habitats...",[Its natural habitats are subtropical or tropi...
4,3,1290,3_hockey_league_season_team,"[hockey, league, season, team, ice, championsh...",[The 1999 -- 00 Belgian Hockey League season w...
...,...,...,...,...,...
223,222,52,222_mitigation_engineering_climate_options,"[mitigation, engineering, climate, options, sc...",[Chapter 28 of the National Academy of Science...
224,223,51,223_preserves_dating_fossils_period,"[preserves, dating, fossils, period, molecule,...",[It preserves fossils dating back to the Paleo...
225,224,51,224_limit_warming_global_limiting,"[limit, warming, global, limiting, emissions, ...",[One of the targets that has been suggested is...
226,225,50,225_antibiotic_antibiotics_drug_treatment,"[antibiotic, antibiotics, drug, treatment, inf...","[Phenoxymethylpenicillin, also known as penici..."
