## 1) Imports


In [None]:
import os
import pickle
from pathlib import Path

import pandas as pd
import polars as pl
import ufal.morphodita
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import LiteLLM, MaximalMarginalRelevance
from umap import UMAP


from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

## 2) Load cleaned data


In [None]:
# Input locations: the two CSV exports live under ./data, while the pickle and
# the stop-word list are read from the working directory.
data_dir = Path("data")
questions_csv_path = data_dir / "questions_cleaned_filtered.csv"
resource_set_path = data_dir / "umimeprogramovatcz-system_resource_set.csv"

# The questions file is comma-separated; the resource-set export uses ";".
questions_df = pl.read_csv(questions_csv_path, separator=",")
resource_set_df = pl.read_csv(resource_set_path, separator=";")

# NOTE(security): unpickling can execute arbitrary code. Only load this file
# when it was produced by a trusted step of this project.
with open("rs_filtered.pickle", "rb") as handle:
    rc_dict = pickle.load(handle)

# Read one stop word per line. The encoding is given explicitly because the
# platform default is not UTF-8 everywhere and the Czech list presumably
# contains non-ASCII characters. Surrounding whitespace is stripped and blank
# lines are skipped so that no empty stop word ends up in the list.
with open("stopwords-cs.txt", "r", encoding="utf-8") as f:
    stopwords = [word for word in (line.strip() for line in f) if word]

In [None]:
# Keep only the questions whose "rs" value is one of the keys of rc_dict.
rs_ids = list(map(int, rc_dict.keys()))
in_selected_rs = pl.col("rs").is_in(rs_ids)
questions_df = questions_df.filter(in_selected_rs)


## 3) Set up pipeline building blocks

### 1. Embedding model


In [None]:
# Sentence-embedding model that is handed to BERTopic further below.
embedding_model = SentenceTransformer(
    model_name_or_path="paraphrase-multilingual-mpnet-base-v2"
)

### 2. Tokenizer


In [None]:
class LemmaTokenizer:
    """Callable that splits a text into lemmas with a MorphoDiTa tagger.

    An instance is used as the ``tokenizer`` of the ``CountVectorizer``, so it
    is called with one document at a time and must return a list of tokens.
    """

    # Tagger model used when no explicit path is given.
    DEFAULT_MODEL_PATH = (
        "czech-morfflex2.0-pdtc1.0-220710/czech-morfflex2.0-pdtc1.0-220710.tagger"
    )

    def __init__(self, model_path=DEFAULT_MODEL_PATH):
        """Load the tagger model and prepare the helper objects.

        Args:
            model_path: Path to the MorphoDiTa ``.tagger`` model file.

        Raises:
            RuntimeError: If the tagger model cannot be loaded.
        """
        self.tagger = ufal.morphodita.Tagger.load(model_path)
        # Tagger.load signals failure by returning a falsy value; fail here
        # with a clear message instead of an AttributeError on the next line.
        if not self.tagger:
            raise RuntimeError(f"Cannot load MorphoDiTa tagger from {model_path!r}")
        # Converter that strips the lemma ids/comments from the raw lemmas.
        self.converter = ufal.morphodita.TagsetConverter.newStripLemmaIdConverter(
            self.tagger.getMorpho()
        )
        self.tokenizer = self.tagger.newTokenizer()
        self.forms = ufal.morphodita.Forms()

    def __call__(self, text):
        """Return the purely alphabetic lemmas of every sentence in ``text``.

        Args:
            text: The document to tokenize.

        Returns:
            A list of lemma strings in document order; lemmas for which
            ``str.isalpha()`` is false (numbers, punctuation, ...) are dropped.
        """
        self.tokenizer.setText(text)

        tagged = ufal.morphodita.TaggedLemmas()
        lemmas = []
        # nextSentence yields one sentence per call, so it has to be driven in
        # a loop; calling it once would silently ignore everything after the
        # first sentence of the document.
        while self.tokenizer.nextSentence(self.forms, None):
            self.tagger.tag(self.forms, tagged)
            self.converter.convertAnalyzed(tagged)
            lemmas.extend(item.lemma for item in tagged if item.lemma.isalpha())
        return lemmas


### 3. Representation models


In [None]:
# When True, an LLM-based representation is added as the last step of the chain.
use_llm = False

# Prompt for the LLM representation; it carries the [DOCUMENTS] and [KEYWORDS]
# placeholders and asks for a short Czech label in the form "topic: <label>".
PROMPT = """
Mám téma (topic), které se vztahuje k následujícím kvízovým otázkám:
[DOCUMENTS]
Téma lze popsat následujícími klíčovými slovy: [KEYWORDS]
Na základě informací výše, extrahuj krátký popisek tématu, použij češtinu a nepřidávej žádné další informace, délka popisku nechť je mezi 3 a 7 slovy. Popisek uveď v následujícím formátu:
topic: <topic label>
"""

# Representation steps in the order they are passed to BERTopic.
representation_models = [
    KeyBERTInspired(),
    MaximalMarginalRelevance(diversity=0.5),
]
if use_llm:
    representation_model_LLM = LiteLLM(
        model="perplexity/sonar-pro",
        prompt=PROMPT,
        nr_docs=4,
    )
    representation_models += [representation_model_LLM]


### 4. Count Vectorizer


In [None]:
# Bag-of-words model: tokens come from LemmaTokenizer, and three extra
# stop words are appended to the list loaded from the stop-word file.
vectorizer_model = CountVectorizer(
    tokenizer=LemmaTokenizer(),
    stop_words=[*stopwords, "img", "pravda", "nepravda"],
)

### 5. UMAP


In [None]:
# Dimensionality reduction applied before clustering; random_state is fixed
# so that repeated runs use the same seed.
umap_model = UMAP(
    metric="cosine",
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    random_state=42,
)

### 6. Class TFIDF


In [None]:
# c-TF-IDF weighting with both optional adjustments switched off.
ctfidf_model = ClassTfidfTransformer(
    reduce_frequent_words=False,
    bm25_weighting=False,
)

## 3) Assemble and run the pipeline for each of the Resource Sets


In [None]:
# Fit one BERTopic model per resource set and collect, for every question, its
# topic assignment together with the original question columns.
selected_ids = list(rc_dict)
selected_rs_names = [rc_dict[rs_id] for rs_id in selected_ids]

dfs = []

for rs_id in selected_ids:
    filtered_questions_df = questions_df.filter(pl.col("rs") == rs_id)
    docs = filtered_questions_df["question_correct"].to_list()

    topic_model = BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        vectorizer_model=vectorizer_model,
        ctfidf_model=ctfidf_model,
        representation_model=representation_models,
        min_topic_size=5,
        top_n_words=5,
        language="multilingual",
        # Required by reduce_outliers(strategy="probabilities") below: without
        # it fit_transform returns only one probability per document and every
        # outlier ends up in topic 0 instead of its most likely topic.
        # NOTE(review): this presumably also changes how BERTopic fills the
        # "Probability" column of get_document_info - confirm for consumers.
        calculate_probabilities=True,
        verbose=True,
    )

    topics, probs = topic_model.fit_transform(docs)

    if -1 in topics:
        # Re-assign outlier documents (topic -1) to their most probable topic.
        new_topics = topic_model.reduce_outliers(
            docs, topics, probabilities=probs, strategy="probabilities"
        )

        # Recompute the topic representations for the new assignment with the
        # same vectorizer, c-TF-IDF and representation models as the fit.
        topic_model.update_topics(
            docs,
            topics=new_topics,
            vectorizer_model=vectorizer_model,
            ctfidf_model=ctfidf_model,
            representation_model=representation_models,
        )
        documents = pd.DataFrame({"Document": docs, "Topic": new_topics})
        topic_model._update_topic_size(documents)

    df_docs = pl.from_pandas(topic_model.get_document_info(docs))
    # "len" = number of documents assigned to the document's topic.
    df_docs = df_docs.join(df_docs.group_by("Topic").len(), on="Topic")

    # Attach the question columns by row position rather than by question
    # text: joining on the text multiplies rows whenever the same text occurs
    # more than once in a resource set. This relies on get_document_info
    # returning one row per document in the order of `docs`.
    row_key = "__row_idx"
    question_columns = filtered_questions_df.drop("question_correct").with_row_index(
        row_key
    )
    df_docs = (
        df_docs.with_row_index(row_key).join(question_columns, on=row_key).drop(row_key)
    )

    # Every row of this frame belongs to the current resource set.
    df_docs = df_docs.with_columns(pl.lit(rc_dict[rs_id]).alias("rs_name"))
    dfs.append(df_docs)

## 4) Write the results to file


In [None]:
# Stack the per-resource-set frames into a single frame and display it.
df_docs = pl.concat(dfs, how="vertical")
df_docs

In [None]:
import json

# Serialize every document row (one dict per row) to a UTF-8 JSON file,
# keeping non-ASCII characters unescaped.
docs_topics_json = json.dumps(df_docs.to_dicts(), ensure_ascii=False, indent=4)
Path("docs_topics_data.json").write_text(docs_topics_json, encoding="utf-8")

In [None]:
# One row per (topic, resource set): the documents and their success rates are
# collected into list columns, and the per-topic size "len" is exposed as "Count".
topic_keys = ["Topic", "Name", "len", "rs_name"]
df_docs_topics = (
    df_docs.group_by(topic_keys)
    .agg(pl.col("Document"), pl.col("successRate"))
    .rename({"len": "Count"})
)

# Number of topics found in each resource set, attached to every topic row.
topics_per_rs = df_docs_topics.group_by("rs_name").agg(pl.len().alias("topic_count"))
df_docs_topics = df_docs_topics.join(topics_per_rs, on="rs_name")


In [None]:
# Display the aggregated topic table as the cell output.
df_docs_topics

## Create hierarchical JSON for treemap visualization


In [None]:
# Size used for a question whose success rate is missing or not positive, so
# that questions which have not been answered yet still show up in the treemap.
UNANSWERED_VALUE = 30


def _question_nodes(documents, success_rates):
    """Build the treemap leaf nodes for the questions of one topic.

    Args:
        documents: Question texts of the topic.
        success_rates: Success rate of each question, aligned with
            ``documents``; an entry may be ``None``.

    Returns:
        A list with one dict per question. ``value`` is the success rate when
        it is positive and ``UNANSWERED_VALUE`` otherwise; ``originalValue``
        always keeps the unmodified success rate.
    """
    nodes = []
    for doc, succ_rate in zip(documents, success_rates):
        # A missing rate must not be compared with 0 (TypeError on None).
        has_rate = succ_rate is not None and succ_rate > 0
        nodes.append(
            {
                "name": doc,
                "value": succ_rate if has_rate else UNANSWERED_VALUE,
                "originalValue": succ_rate,
                "label": {"fontSize": 12},
            }
        )
    return nodes


# Three-level hierarchy: resource set -> topic -> question.
data = []

for _, rs_rows in df_docs_topics.group_by("rs_name"):
    topic_children = []
    # Each (Name, rs_name) group is expected to hold exactly one topic row;
    # the .item() calls below raise if that is not the case.
    for _, topic_row in rs_rows.group_by(["Name", "rs_name"]):
        children = []
        for topic_docs, topic_rates in zip(
            topic_row["Document"].to_list(), topic_row["successRate"].to_list()
        ):
            children.extend(_question_nodes(topic_docs, topic_rates))

        topic_children.append(
            {
                "name": topic_row["Name"].item(),
                "value": topic_row["Count"].item(),
                "label": {"fontSize": 18, "fontWeight": "bold", "color": "#ffffff"},
                "children": children,
            }
        )
    data.append(
        {
            "name": rs_rows["rs_name"][0],
            "value": rs_rows["topic_count"][0],
            "label": {"fontSize": 18, "fontWeight": "bold", "color": "#ffffff"},
            "children": topic_children,
        }
    )

In [None]:
# Display the nested treemap structure as the cell output.
data

In [None]:
import json

# Write the treemap hierarchy to a UTF-8 JSON file, keeping non-ASCII
# characters unescaped.
topics_json = json.dumps(data, ensure_ascii=False, indent=4)
Path("topics_data.json").write_text(topics_json, encoding="utf-8")