## BERTopic model analysis

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

em= 'TurkuNLP/sbert-cased-finnish-paraphrase'
import numpy as np
from bertopic import BERTopic
import glob
import random
import re

## Data

### Read in training data

In [None]:
# Read the training split (tab-separated despite the .csv suffix).
trainpath= "../data/s24_forest_train.csv"
df = pd.read_csv(trainpath, sep="\t")

# Keep one row per unique text (first occurrence wins). `train` holds those
# rows with a fresh 0..n-1 index so that train.iloc[i] is the row documents[i]
# came from; later cells read per-document metadata via train["date"] and
# rely on this positional alignment.
train = df.drop_duplicates(subset="text").reset_index(drop=True)
documents = list(train["text"])
print("Training on", len(documents), "documents. \n")

# Test split, de-duplicated on the text column as well.
testpath= "../data/s24_forest_test.csv"
df2 = pd.read_csv(testpath, sep="\t")
documents2=list(df2["text"].drop_duplicates())


In [None]:

# A function that removes URLs and lowercases words written entirely in uppercase.
# Punctuation is also corrected so that sentences are separated by a space after a dot.

def clean_doc(doc):
    """Normalise one document before it is handed to the topic model.

    Steps, in order:
      1. drop every whitespace-separated token that contains "www" or "http";
      2. insert a space after a dot or comma that sits directly between two
         letter runs ("loppu.Alku" -> "loppu. Alku");
      3. collapse whitespace, replace "€" with "euro", remove '*' and '"';
      4. lowercase tokens that are written entirely in upper case.

    Args:
        doc: the raw document text (str).

    Returns:
        The cleaned text with tokens separated by single spaces.
    """
    # A run of letters, a dot (or comma), and another run of letters, no space.
    dot_pattern = r"([a-zöåäA-ZÅÄÖ]+)\.([a-zåäöA-ZÅÄÖ]+)"
    comma_pattern = r"([a-zåäöA-ZÅÄÖ]+)\,([a-zöäåA-ZÅÄÖ]+)"

    kept = [word for word in doc.split() if "www" not in word and "http" not in word]
    doc = " ".join(kept)

    # The punctuation fix used to sit behind `if "http" and "www" not in doc`,
    # which Python evaluates as `"http" and ("www" not in doc)`, i.e. it only
    # ever tested "www". Every token containing either marker has already been
    # removed above, so the guard was always true and is dropped here.
    doc = re.sub(dot_pattern, r"\1. \2", doc)
    doc = re.sub(comma_pattern, r"\1, \2", doc)

    doc_clean = re.sub(r"\s+", " ", doc).strip()
    doc_clean = re.sub('[€]', 'euro', doc_clean)
    doc_clean = re.sub('[*"]', '', doc_clean)

    # Re-split so that gaps left by removed characters collapse on the final join.
    processed_doc = [word.lower() if word.isupper() else word for word in doc_clean.split()]
    return " ".join(processed_doc)
    
# Clean both corpora in place: the test documents first, then the training ones.
documents2 = list(map(clean_doc, documents2))
documents = list(map(clean_doc, documents))


## nr of topics for different models


- turku-nlp umap kmeans

- turku-nlp pca kmeans

- xlm-r umap kmeans

- xlm-r pca kmeans


In [None]:
# Parse the job log. Lines starting with "Loading" open a new model section,
# lines starting with a digit contribute their first two whitespace-separated
# fields as (wcss, silhouette), and a line containing "Done!" closes the section.
k_out = "data/slurm-24538287.out"

with open(k_out, "r") as fp:
    # Skip lines that consist of a bare newline.
    k_res = [line for line in fp if line != "\n"]

models = ["turku-umap","turku-pca", "xlm-umap","xlm-pca"]
n = 0
scores = []
for line in k_res:
    if line.startswith("Loading"):
        # New section: count it, echo the header, start fresh score lists.
        n += 1
        print(line)
        wcss, silhouette = [], []
    elif "Done!" in line:
        scores.append([wcss, silhouette])
    elif line[0].isdigit():
        fields = line.split()
        wcss.append(float(fields[0]))
        silhouette.append(float(fields[1]))
#scores       

In [None]:
# Plot the elbow curve and the silhouette scores, one panel per model.

import matplotlib.pyplot as plt
import numpy as np

# x-axis values for both curves.
# NOTE(review): presumably the k values the sweep evaluated, in log order — confirm.
k_values = [50,75,100,125,150,175,200,225,250,275,300]

# 2 rows x 2 columns; axes.flat walks the grid row by row, which matches the
# order of `models` and of the sections collected in `scores`.
fig, axes = plt.subplots(2, 2, figsize=(10, 8))

for ax, model_name, (wcss_curve, silhouette_curve) in zip(axes.flat, models, scores):
    # Left y-axis: first score list of the section, drawn as a blue line.
    ax.plot(k_values, wcss_curve, 'bx-', label="elbow")
    ax.set_title(model_name)

    # Right y-axis: second score list of the section, drawn as red markers.
    ax2 = ax.twinx()
    ax2.scatter(k_values, silhouette_curve, color="red", s=10, marker="x")
    ax2.set_ylabel("silhouette", color="red")
    ax2.tick_params(axis='y', labelcolor="red")
    ax.legend()

# Adjust layout
plt.tight_layout()

# Show the figure
plt.show()


Used nr of topics:
- turku-umap and xlm-umap = 175
- turku-pca 200
- xlm-pca 150

# Explore a trained model

Code in this section is based on tutorials from https://maartengr.github.io/BERTopic/getting_started/quickstart/quickstart.html


In [None]:
# Load a saved BERTopic model from its directory.
path = '/scratch/project_2008526/telmap/suomi24/bertopicmodel/model_sbert-cased-finnish-paraphrase_umap_kmeans'
topic_model = BERTopic.load(path + "/picklemodel")

# Build a table with one row per topic: the topic id and the first element of
# each of its entries, joined into a single space-separated string.
topic_dict = topic_model.get_topics()
kwdf = pd.DataFrame({
    "Topic": list(topic_dict.keys()),
    "Keywords": [" ".join([entry[0] for entry in entries]) for entries in topic_dict.values()],
})
kwdf.head() # show topic keywords

In [None]:
# see basic document info

document_info = topic_model.get_document_info(documents)
# Note: this rebinds `df`, which earlier held the raw training data.
df=topic_model.get_topic_info()
topics = list(df["Topic"])

# print mean topic size and the size of the smallest topic (topic -1 excluded)
print(df[df["Topic"]!=-1][["Count"]].mean().values, df[df["Topic"]!=-1][["Count"]].min().values)

# -1 topic size (only applicable for hdbscan models)
# `-1 in series` tests the index labels, not the values, so the check has to
# look at .values; and the size of the topic is its Count, not the number of
# rows in the topic table that carry id -1.
if -1 in df["Topic"].values:
    print("Trash")
    print(df.loc[df["Topic"] == -1, "Count"].iloc[0])
df.sort_values(by="Count").head()

In [None]:
# Keep the topics whose Count exceeds 29, then the documents assigned to them.
df_filtered = df.loc[df["Count"].gt(29)]
topics_filtered = list(df_filtered["Topic"])

docsdf = document_info.loc[document_info["Topic"].isin(topics_filtered)]
docs = list(docsdf["Document"])
len(docs)

The hierarchical topic tree was used to get an impression of the semantic coherence of the created topics. 
(i.e. Does the branch structure make sense for a human evaluator?)

In [None]:
# hierarchical topics
# Compute the topic hierarchy for the training documents and print it as a
# text tree so the branch structure can be read by a human.
hierarchical_topics = topic_model.hierarchical_topics(documents)
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)

# go through topics

In [None]:
# Preview the first rows of the per-document table.
document_info.head()

In [None]:
# Rows flagged as representative documents for topic 173; print one at random
# and display the whole selection.
is_representative = document_info["Representative_document"] == True
in_topic = document_info["Topic"] == 173
representative = document_info[is_representative & in_topic]
print(list(representative.sample(1)["Document"]))
representative

In [None]:
# estimate probabilities for a kmeans model
# Only the first return value (the per-document topic distribution) is kept;
# the second is discarded.

topic_distr, _ = topic_model.approximate_distribution(documents, min_similarity=0)

In [None]:
# Largest value of each document's approximate topic distribution.
# NOTE(review): treated below as the probability of the assigned topic —
# confirm that the arg-max always coincides with the assigned topic.
topic_max = [np.max(t) for t in topic_distr]

# Show only the first few values; displaying the full list floods the output.
topic_max[:10]

In [None]:
# Summary of the max topic probabilities: median, mean, max, min, std.
summary_stats = [stat(topic_max) for stat in (np.median, np.mean, np.max, np.min, np.std)]
print(*summary_stats)

# Histogram of the same values with 40 bins.
import matplotlib.pyplot as plt
plt.hist(topic_max, bins=40)
plt.show()

In [None]:
# add a column for the probabilities
# topic_max holds one value per entry of topic_distr, assigned here row by row.
document_info["Topic_probability"] = topic_max

In [None]:
# Keep documents whose Topic_probability is at least 0.02, so that the
# remaining documents are more representative of their topic.
keep_mask = document_info["Topic_probability"].ge(0.02)
filtered_docs = document_info.loc[keep_mask]

# how many documents do we have left?
print("After filtering:", len(filtered_docs) , "docs.")
filtered_docs.head()

### Exploring topics
Based on topic keywords, relevant topics for further analysis were identified. Topics that seemed to share a common theme were investigated together. E.g. topics 24, 73, 88, and 151 seemed to be about clearcuttings.

In [None]:
# read documents from a topic
# 24, 73, 88, 151 (clearcuttings)
#
# Reading notes per topic:
# 24 : clear-cuts destroy biodiversity VS "green hippies" should let the professionals do their job
# 73 : timber trade
# 88 : opinions about clear-cutting, short posts
# 151: clear-cutting and protected areas
a_topic = 151  # replace with random.choice(topics) to browse a random topic

temp = filtered_docs[filtered_docs["Topic"]==a_topic]
print("Topic:",a_topic)
print("Keywords:" ,temp["Name"].head(1))
print("nr of docs:", len(temp),"\n")

# print a sample of at most 7 documents; DataFrame.sample raises when asked
# for more rows than exist, which happened for topics with fewer than 7 docs
example_texts = temp.sample(min(7, len(temp)))["Document"]
for e in example_texts:
    print(e,"\n")

### Visualization

In [None]:
# Topic id groups; only hakkuutopics is used in this cell.
nature_conservation = [2, 10, 43, 72, 173]
hakkuutopics = [24, 73, 88, 151]
subset_docs = document_info.loc[document_info["Topic"].isin(hakkuutopics)]

# Topic-probability summary for the subset: mean and std, then min and max.
subset_prob = subset_docs["Topic_probability"]
print(subset_prob.mean(), subset_prob.std())
print(subset_prob.min(), subset_prob.max())


In [None]:
# Drop documents whose topic probability is not above (mean - 1 SD) of the subset.
subset_probability = subset_docs["Topic_probability"]
threshold = subset_probability.mean() - 1 * subset_probability.std()
filtered_subset_docs = subset_docs.loc[subset_probability > threshold]
print(len(filtered_subset_docs), len(subset_docs))

In [None]:
# make reduced embeddings for plotting

# h5py is used below but was not imported anywhere in the visible notebook,
# so this cell raised NameError on a fresh kernel.
import h5py
from umap import UMAP

embedding_model ='TurkuNLP/sbert-cased-finnish-paraphrase'

epath="/scratch/project_2008526/telmap/suomi24/turkunlp_train_embeddings.h5"

print(f"Loading embeddings '{epath}'. \n")
with h5py.File(epath, "r") as f:
    # Read the whole "embeddings" dataset into memory.
    embeddings = f["embeddings"][:]

# Project to 2 dimensions for plotting. No random_state is set, so the layout
# can differ between runs.
# NOTE(review): assumes row i of the file belongs to documents[i] — confirm.
reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)

In [None]:
# visualize the wanted topics
# NOTE(review): `labels` is not assigned anywhere in the visible notebook, so
# this call raises NameError on a fresh kernel unless it is defined elsewhere.
# Confirm what should be passed as custom_labels.
topic_model.visualize_documents(documents, topics=hakkuutopics, custom_labels=labels,reduced_embeddings=reduced_embeddings, width=1000, height=600)

#### Modified function to remove axes (not needed)

In [None]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go

from umap import UMAP
from typing import List, Union


def visualize_documents(
    topic_model,
    docs: List[str],
    topics: List[int] = None,
    embeddings: np.ndarray = None,
    reduced_embeddings: np.ndarray = None,
    sample: float = None,
    hide_annotations: bool = False,
    hide_document_hover: bool = False,
    custom_labels: Union[bool, str] = False,
    title: str = "<b>Documents and Topics</b>",
    width: int = 1200,
    height: int = 750,
):
    """Visualize documents and their topics in 2D, with both axes hidden.

    Arguments:
        topic_model: A fitted BERTopic instance.
        docs: The documents you used when calling either `fit` or `fit_transform`
        topics: A selection of topics to visualize.
                Not to be confused with the topics that you get from `.fit_transform`.
                For example, if you want to visualize only topics 1 through 5:
                `topics = [1, 2, 3, 4, 5]`. Defaults to all topics of the model.
        embeddings: The embeddings of all documents in `docs`. Only used when
                    `reduced_embeddings` is not given.
        reduced_embeddings: The 2D reduced embeddings of all documents in `docs`.
                            When given, no dimensionality reduction is run.
        sample: The fraction of documents in each topic that you would like to keep.
                Value can be between 0 and 1; `None` or a value above 1 keeps
                everything. Topics with fewer than 100 documents are never sampled.
        hide_annotations: Hide the names of the traces on top of each cluster.
        hide_document_hover: Hide the content of the documents when hovering over
                             specific points. Helps to speed up generation of visualization.
        custom_labels: If bool, whether to use custom topic labels that were defined using
                       `topic_model.set_topic_labels`.
                       If `str`, it uses labels from other aspects, e.g., "Aspect1".
        title: Accepted for interface compatibility; the layout below sets an
               empty title text, so this value is currently not shown.
        width: The width of the figure.
        height: The height of the figure.

    Returns:
        The plotly figure.
    """
    topic_per_doc = topic_model.topics_
    # Converted once instead of once per topic inside the loop.
    topic_array = np.array(topic_per_doc)

    # Sample the data to optimize for visualization and dimensionality reduction
    if sample is None or sample > 1:
        sample = 1

    indices = []
    for topic in set(topic_per_doc):
        s = np.where(topic_array == topic)[0]
        size = len(s) if len(s) < 100 else int(len(s) * sample)
        indices.extend(np.random.choice(s, size=size, replace=False))
    indices = np.array(indices)

    df = pd.DataFrame({"topic": topic_array[indices]})
    df["doc"] = [docs[index] for index in indices]

    # 2D coordinates of the sampled documents. `sample` is never None at this
    # point (it was normalised above), so the former `sample is None` branches
    # could not run and have been removed.
    if reduced_embeddings is not None:
        embeddings_2d = reduced_embeddings[indices]
    else:
        if embeddings is not None:
            embeddings_to_reduce = embeddings[indices]
        else:
            embeddings_to_reduce = topic_model._extract_embeddings(df.doc.to_list(), method="document")
        umap_model = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric="cosine").fit(embeddings_to_reduce)
        embeddings_2d = umap_model.embedding_

    unique_topics = set(topic_per_doc)
    if topics is None:
        topics = unique_topics

    # Combine data
    df["x"] = embeddings_2d[:, 0]
    df["y"] = embeddings_2d[:, 1]

    # Prepare text and names
    if isinstance(custom_labels, str):
        names = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in unique_topics]
        names = ["_".join([label[0] for label in labels[:4]]) for labels in names]
        names = [label if len(label) < 30 else label[:27] + "..." for label in names]
    elif topic_model.custom_labels_ is not None and custom_labels:
        names = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in unique_topics]
    else:
        names = [
            f"{topic}"
            for topic in unique_topics
        ]

    # Visualize
    fig = go.Figure()

    # Outliers and non-selected topics
    non_selected_topics = set(unique_topics).difference(topics)
    if len(non_selected_topics) == 0:
        non_selected_topics = [-1]

    # Work on an independent copy with a fresh 0..n-1 index: the extra row
    # below is written at label len(selection), which on the original subset
    # index could coincide with an existing row and overwrite that point.
    selection = df.loc[df.topic.isin(non_selected_topics), :].copy().reset_index(drop=True)
    selection["text"] = ""
    # Extra row at the centroid; values follow the column order
    # topic, doc, x, y, text.
    selection.loc[len(selection), :] = [
        None,
        None,
        selection.x.mean(),
        selection.y.mean(),
        "Other documents",
    ]

    fig.add_trace(
        go.Scattergl(
            x=selection.x,
            y=selection.y,
            hovertext=selection.doc if not hide_document_hover else None,
            hoverinfo="text",
            mode="markers+text",
            name="other",
            showlegend=False,
            marker=dict(color="#CFD8DC", size=5, opacity=0.5),
        )
    )

    # Selected topics
    for name, topic in zip(names, unique_topics):
        if topic in topics and topic != -1:
            # Same copy + index reset as above, for the same reason.
            selection = df.loc[df.topic == topic, :].copy().reset_index(drop=True)
            selection["text"] = ""

            if not hide_annotations:
                # Extra row at the topic centroid that carries the label text.
                selection.loc[len(selection), :] = [
                    None,
                    None,
                    selection.x.mean(),
                    selection.y.mean(),
                    name,
                ]

            fig.add_trace(
                go.Scattergl(
                    x=selection.x,
                    y=selection.y,
                    hovertext=selection.doc if not hide_document_hover else None,
                    hoverinfo="text",
                    text=selection.text,
                    mode="markers+text",
                    name=name,
                    textfont=dict(
                        size=12,
                    ),
                    marker=dict(size=5, opacity=0.5),
                )
            )

    # Data extent padded by 15 %, used only to place the "D1"/"D2" annotations.
    x_range = (
        df.x.min() - abs((df.x.min()) * 0.15),
        df.x.max() + abs((df.x.max()) * 0.15),
    )
    y_range = (
        df.y.min() - abs((df.y.min()) * 0.15),
        df.y.max() + abs((df.y.max()) * 0.15),
    )

    fig.add_annotation(x=x_range[0], y=sum(y_range) / 2, text="D1", showarrow=False, yshift=10)
    fig.add_annotation(y=y_range[1], x=sum(x_range) / 2, text="D2", showarrow=False, xshift=10)

    # Stylize layout
    fig.update_layout(
        template="simple_white",
        title={
            "text": "",
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
        },
        width=width,
        height=height,
    )

    # Hide both axes completely.
    fig.update_xaxes(visible=False, showline=False, showgrid=False, zeroline=False)
    fig.update_yaxes(visible=False, showline=False, showgrid=False, zeroline=False)
    return fig

In [None]:
# Draw the selected topics with the axis-free plotting function.
# NOTE(review): `labels` is not assigned anywhere in the visible notebook, so
# this call raises NameError on a fresh kernel unless it is defined elsewhere.
myfig=visualize_documents(topic_model, documents, topics=hakkuutopics, custom_labels=labels,reduced_embeddings=reduced_embeddings, width=600,height=500)

In [None]:
# Override axis settings on the finished figure.
myfig.update_layout(
    xaxis=dict(
        title="",  # empty x-axis title
        #range=[-5, 5],                # Set x-axis range
        showgrid=True,               # gridlines on for x
        zeroline=False,  # no zero line
        showline=False # no axis line
    ),
    yaxis=dict(

        title="",  # empty y-axis title
        #range=[3.5, 13.5],                 # Set y-axis range
        showline=False,                # no axis line
        mirror=False,# do not mirror the axis line on the opposite side
        zeroline=False,  # no zero line
        showgrid=False  # gridlines off for y
    )
)

## Thematic groups

BERTopic only assigns one topic per document, so to work around that, I looked at the topic probabilities and selected documents for which the probability of a certain topic was high. This way, documents can also be retrieved beyond their assigned topic labels. (See https://maartengr.github.io/BERTopic/getting_started/distribution/distribution.html#example )

In [None]:
# Topic ids grouped by theme (variable names are Finnish).
# NOTE(review): presumably mhoito = forest management, ilmasto_nielut =
# climate / carbon sinks, mkauppa = timber trade, hakkuut = logging — confirm.
mhoito=[27 ,99 ,112 ,136 ,30]
ilmasto_nielut=[34,64,79,127]
mkauppa=[42,106]
hakkuut=[24,151,88]

In [None]:
# Median, mean and standard deviation over all values of the topic
# distribution, followed by mean minus one standard deviation.
distr_mean = np.mean(topic_distr)
distr_std = np.std(topic_distr)
print(np.median(topic_distr))
print(distr_mean)
print(distr_std)
print(distr_mean - distr_std)

In [None]:
# One boolean per document and theme: True when the approximate distribution
# gives any of the theme's topics a value above PROB_THRESHOLD.
PROB_THRESHOLD = 0.02


def _any_topic_above(distr, topic_ids, threshold=PROB_THRESHOLD):
    """Return a list with one bool per row of `distr`.

    An entry is True when at least one of the columns listed in `topic_ids`
    is strictly greater than `threshold` in that row.
    """
    return (np.asarray(distr)[:, topic_ids] > threshold).any(axis=1).tolist()


hakkuu_true = _any_topic_above(topic_distr, [24, 151, 88])
ilmasto_true = _any_topic_above(topic_distr, [64, 127])
nielut_true = _any_topic_above(topic_distr, [34, 79])



In [None]:
# Store the per-document theme flags as columns of document_info.
document_info["hakkuu"] = hakkuu_true
document_info["ilmasto"] = ilmasto_true
document_info["nielut"] = nielut_true

In [None]:
# see some climate-related posts: how many there are, then their texts
ilmasto_documents = document_info.loc[document_info["ilmasto"]==True, "Document"].values
print(len(ilmasto_documents))
print(ilmasto_documents)

## Climate viz

In [None]:
#hiilinielut_df=document_info[document_info["nielut"]==True]
#ilmasto_df=document_info[document_info["ilmasto"]==True]
#hakkuu_df=document_info[document_info["hakkuu"]==True]
# Attach a timestamp to every document and derive year and month from it.
# NOTE(review): `train` must exist before this cell and be row-aligned with
# `documents` (same order, same 0..n-1 index); the assignment below aligns on
# the index, so a frame that still contains duplicate texts would shift dates.
document_info["timestamps"] =train["date"]
document_info['year'] = pd.DatetimeIndex(document_info['timestamps']).year
document_info['month'] = pd.DatetimeIndex(document_info['timestamps']).month

# NOTE(review): the variable is called `clim` and the section is about
# climate, but the selected topics are the ones listed as `hakkuut` — confirm
# which topic group is intended.
hakkuut=[24, 88, 151]
clim=document_info[document_info["Topic"].isin(hakkuut)][["Document","Topic","Top_n_words", "Topic_probability","timestamps","year","month"]]
print(clim["Topic_probability"].mean())
print(clim["Topic_probability"].std())

# filter documents with a threshold of mean - 2*SD
# (the threshold is printed first, then applied; only rows strictly above it are kept)
print(clim["Topic_probability"].mean()-2*clim["Topic_probability"].std())
mean_std=clim["Topic_probability"].mean()-2*clim["Topic_probability"].std()
clim=clim[clim["Topic_probability"]>mean_std]
print(len(clim))
print(clim["Document"].values)


In [None]:
# Read through the five lowest-probability documents that survived the filter
# to see if the threshold makes sense.
clim.sort_values(by="Topic_probability")["Document"].head().values

In [None]:
## plot the development of climate post counts over time

In [None]:

# One entry per calendar month from 2014-01 to 2020-12.
# A period range is used instead of date_range(..., freq="M"): the "M" alias
# for month-end timestamps is deprecated in recent pandas releases, while "M"
# remains the monthly frequency for periods. Year, month and the formatted
# string are the same as before.
date_range = pd.period_range(start="2014-01", end="2020-12", freq="M")

# Create a DataFrame with year, month, and year_month columns
timedf = pd.DataFrame({
    "year": date_range.year,
    "month": date_range.month,
    "year_month": date_range.strftime("%Y_%m")
})

# Zero-pad the month so it matches the string keys used when merging.
timedf['month'] = timedf['month'].astype(str).str.zfill(2)
print(len(timedf))

In [None]:
import matplotlib.pyplot as plt

# Count the documents per (year, month), build the same zero-padded string
# keys as timedf, and outer-merge with timedf so that months without any
# document still get a row (their count is NaN after the merge).
per_month = (
    clim.groupby(by=["year", "month"]).size().reset_index(name="count")
    .assign(month=lambda d: d["month"].astype(str).str.zfill(2))
    .assign(year_month=lambda d: d["year"].astype(str) + "_" + d["month"].astype(str))
    .merge(timedf, on=["year","month","year_month"], how="outer")
)


In [None]:

# Monthly counts with a rolling-standard-deviation band, saved to disk.

rolling_window = 8  # You can adjust the window size
# Months without documents have NaN counts after the merge; treat them as 0.
monthly_count = per_month["count"].fillna(0)
per_month["rolling_std"] = monthly_count.rolling(window=rolling_window, min_periods=1).std()

# Define x, y, and error
x = per_month["year_month"]
y = monthly_count
error = per_month["rolling_std"]

# Plotting the data
fig, ax = plt.subplots(figsize=(9, 5))
plt.plot(x, y, '.-', label='Count')
plt.fill_between(x, y - error, y + error, color='lightblue', alpha=0.6, label='Standard Deviation')

# Tick labels as MM-YYYY (year_month is "YYYY_MM"), shown for every second month.
tick_labels = [f"{label[-2:]}-{label[:4]}" for label in x]
plt.xticks(ticks=np.arange(0, len(x), step=2), labels=tick_labels[::2], rotation=90, fontsize=8)

plt.xlabel("Time, months")
plt.ylabel("Nr of posts")
plt.ylim((0,55))
plt.tight_layout()

# savefig does not create missing directories.
os.makedirs("figures", exist_ok=True)
fig.savefig('figures/climate.png')

In [None]:
# Rebuild the per-month counts (this time without the merge against timedf,
# so months without documents are absent) and show them as a scatter plot.
per_month = (
    clim.groupby(by=["year","month"]).size().reset_index(name="count")
    .assign(month=lambda d: d["month"].astype(str).str.zfill(2))
    .assign(year_month=lambda d: d["year"].astype(str)+"_"+d["month"].astype(str))
)
per_month.plot.scatter(x="year_month", y="count")

In [None]:
# Per-theme subsets of document_info. These three names are used from here on
# but were only ever assigned in commented-out code, so the cells using them
# failed with NameError. They are built here, after the year/month columns
# exist, from the boolean theme columns.
hiilinielut_df = document_info[document_info["nielut"]]
ilmasto_df = document_info[document_info["ilmasto"]]
hakkuu_df = document_info[document_info["hakkuu"]]

# Documents per (year, month) in the "nielut" subset; last 36 rows.
hiilinielut_df.groupby(["year","month"]).size().reset_index().tail(36)

In [None]:
# Print the texts of the documents in hiilinielut_df dated October 2018.
print(hiilinielut_df[(hiilinielut_df["year"]==2018) &(hiilinielut_df["month"]==10)]["Document"].values)

In [None]:
# Print the texts of all documents in hakkuu_df.
print(hakkuu_df["Document"].values)

In [None]:
# Line plot of the number of documents per (year, month) in hiilinielut_df.
hiilinielut_df.groupby(["year","month"]).size().plot()

In [None]:
# Line plot of the number of documents per (year, month) in ilmasto_df.
ilmasto_df.groupby(["year","month"]).size().plot()

## Topics over time

In [None]:
# Topic id groups used for the topics-over-time plots.
# NOTE(review): hakkuut is [24, 151] here, while other cells in this notebook
# use [24, 151, 88] / [24, 88, 151] — confirm that dropping 88 is intended.
mhoito=[27 ,99 ,112 ,136 ,30]
ilmasto_nielut=[34,64,79,127]
mkauppa=[42,106]
hakkuut=[24,151]

In [None]:
# One timestamp per document, then topic frequencies over 36 time bins.
# NOTE(review): `train` must exist before this cell and have exactly one row
# per entry of `documents`, in the same order — confirm.
timestamps=list(train["date"])
topics_over_time = topic_model.topics_over_time(documents, timestamps, nr_bins=36)

In [None]:
# Topics over time for the topic ids in `hakkuut`.
topic_model.visualize_topics_over_time(topics_over_time, topics=hakkuut)

In [None]:
# Topics over time for the topic ids in `ilmasto_nielut`.
topic_model.visualize_topics_over_time(topics_over_time, topics=ilmasto_nielut)

In [None]:
# Topics over time for topics 10 and 72.
topic_model.visualize_topics_over_time(topics_over_time, topics=[10,72])

## topic reduction

Not done for the current paper, but BERTopic offers a function that merges topics together. As we had many overlapping topics, it might make sense to do this.

In [None]:
# Further reduce topics
# NOTE(review): this is called on the loaded model object itself, so cells
# that run afterwards presumably see the reduced topic set — confirm, and
# re-load the model before re-running earlier sections if so.
topic_model.reduce_topics(documents, nr_topics=130)

# Access updated topics
new_topics = topic_model.topics_

In [None]:
# hierarchical topics
# Recompute and print the topic tree; at this point in the notebook the
# reduce_topics call above has already been made on the same model object.
hierarchical_topics = topic_model.hierarchical_topics(documents)
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)

In [None]:
# Map every topic id to the row labels of the documents assigned to it.
# GroupBy.groups provides exactly this mapping, so no apply() with a lambda
# (and no pandas-version-specific include_groups argument) is needed.
T = topic_model.get_document_info(documents)
docs_per_topics = dict(T.groupby("Topic").groups)

In [None]:
# Display the topic -> document-row mapping built in the previous cell.
docs_per_topics

In [None]:
# representative docs
# Ask the model for the representative documents of topic 42.

topic_model.get_representative_docs(42)

## Coherence score


In [None]:
# coherence score (c_v), computed with gensim
# NOTE(review): `topic_model` is whatever state the model is in when this
# cell runs; if the topic-reduction cell was executed, this presumably scores
# the reduced topics — confirm.

from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary

# Whitespace tokenisation, done once and reused for the dictionary and as the
# reference texts (the corpus used to be split twice).
tokenized_docs = [doc.split() for doc in documents]

# Create a dictionary from the tokenized documents
dictionary = Dictionary(tokenized_docs)

# Words of every topic, taken from the (word, score) pairs of get_topics().
tops = topic_model.get_topics()
top_words_per_topic = [[word for word, _ in words] for words in tops.values()]

coherence_model = CoherenceModel(
    topics=top_words_per_topic,
    texts=tokenized_docs,
    dictionary=dictionary,
    coherence='c_v'   # 'c_v' is popular for topic models, but you can experiment with others
)

# Calculate the coherence score
coherence_score = coherence_model.get_coherence()
print("Coherence Score:", coherence_score)
print("\n")


In [None]:
# Manual rating loop: for each of the first five topic ids, print a random
# sample of documents and ask on stdin whether the topic is relevant and how
# good it is. Answers are collected in `quality`, split by relevance.
topics = document_info["Topic"].sort_values().unique()
quality = {"relevant":{"topics":[],"quality":[]}, "irrelevant":{"topics":[],"quality":[]}}
for t in topics[:5]:
    temp = document_info[document_info["Topic"]==t]
    # At most 20 examples; DataFrame.sample raises when asked for more rows
    # than the topic has.
    example_texts = temp.sample(min(20, len(temp)))["Document"]
    for e in example_texts:
        print(e)
    print("is the topic relevant? (1/0)")
    relevance = input().strip()
    print("is the quality good (1), ok (2), or bad (3)?")
    q = input().strip()

    # Anything other than "1" counts as irrelevant.
    if relevance == "1":
        print("Relevant","\n")
        bucket = "relevant"
    else:
        bucket = "irrelevant"
    quality[bucket]["topics"].append(t)
    quality[bucket]["quality"].append(q)
    