# Clustering rozhodovacek

## 1) Imports


In [19]:
import json
from pathlib import Path
import polars as pl
import re

## 2) Load the dataframe


In [None]:
# All input CSVs are read from ./data, relative to the notebook's working directory.
data_dir = Path("data")
questions_csv_path = data_dir / "umimeprogramovatcz-system_binary_choice_data.csv"

# Load the binary-choice questions; the file is semicolon-separated.
questions_df = pl.read_csv(questions_csv_path, separator=";")
# Load the resource sets (same separator; the first row holds the column names).
resource_set_df = pl.read_csv(
    data_dir / "umimeprogramovatcz-system_resource_set.csv",
    has_header=True,
    separator=";",
)

# Preview the first rows of the questions table.
questions_df.head()

id,rs,difficultyRank,cluster,question,correct,distractor1,translation,explanation,grade,successRate,responseTime,recalculated,answers,created,lastUpdate,masterUpdated,checkTranslate,visible
i64,i64,i64,i64,str,str,str,str,str,i64,f64,f64,str,i64,str,str,str,i64,i64
1,64,1,2,"""[[""text"",""Co vypíše program?""]…","""[[""code64"",""MTg="",{""language"":…","""[[""code64"",""NQ=="",{""language"":…",,,0,89.071,5910.0,"""2024-07-26 03:19:37""",3047,"""2017-11-22 17:56:31""","""2017-11-22 17:56:31""","""0000-00-00 00:00:00""",0,1
2,64,1,2,"""[[""text"",""Co vypíše program?""]…","""[[""code64"",""Nw=="",{""language"":…","""[[""code64"",""eA=="",{""language"":…",,,0,86.251,4744.0,"""2024-07-26 03:19:37""",3033,"""2017-11-22 17:56:31""","""2017-11-22 17:56:31""","""0000-00-00 00:00:00""",0,1
3,64,1,2,"""[[""text"",""Co vypíše program?""]…","""[[""code64"",""MyAy"",{""language"":…","""[[""code64"",""MiAz"",{""language"":…",,,0,88.404,5483.0,"""2024-07-26 03:19:37""",3027,"""2017-11-22 17:56:31""","""2017-11-22 17:56:31""","""0000-00-00 00:00:00""",0,1
4,64,1,2,"""[[""text"",""Co vypíše program?""]…","""[[""code64"",""NQ=="",{""language"":…","""[[""code64"",""Mw=="",{""language"":…",,,0,84.608,6761.5,"""2024-07-26 03:19:37""",3086,"""2017-11-22 17:56:31""","""2017-11-22 17:56:31""","""0000-00-00 00:00:00""",0,1
5,64,2,2,"""[[""text"",""Co vypíše program?""]…","""[[""code64"",""Nw=="",{""language"":…","""[[""code64"",""NQ=="",{""language"":…",,"""[[""text"",""Po prvním provedení …",0,69.731,10449.5,"""2024-07-26 03:19:37""",2154,"""2017-11-22 17:56:31""","""2019-07-16 15:26:11""","""0000-00-00 00:00:00""",0,1


## 3) Strip JSON formatting from certain columns


In [21]:
def strip_json_formatting(json_str):
    """Reduce a JSON-encoded content field to plain text or a short placeholder.

    The value is decoded as JSON and only its first item is inspected; the
    first element of that item is treated as a type tag:

    * "para", "text", "textlatex", "latex" -> the item's second element,
      converted with ``str``;
    * "code64" -> the placeholder "code";
    * "img" -> the placeholder "img";
    * "emoji" -> the placeholder "emoji".

    Args:
        json_str: The raw cell value, normally a JSON string.

    Returns:
        The extracted text or placeholder. The input is returned unchanged
        when it is not valid JSON, does not have the expected nested shape, or
        carries an unrecognised tag.
    """
    text_tags = ("para", "text", "textlatex", "latex")
    placeholder_tags = ("code64", "img", "emoji")
    try:
        first_item = json.loads(json_str)[0]
        tag = first_item[0]
        if tag in text_tags:
            return str(first_item[1])
        if tag in placeholder_tags:
            # Only "code64" is shortened; "img" and "emoji" map to themselves.
            return "code" if tag == "code64" else tag
    except (ValueError, TypeError, LookupError):
        # ValueError: malformed JSON (json.JSONDecodeError is a subclass).
        # TypeError: non-string input such as None, or a non-indexable payload.
        # LookupError: empty list, too-short item, or a JSON object at the top.
        # Deliberately narrower than a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate.
        return json_str
    return json_str


In [None]:
# Rename `distractor1` to `distractor` for the rest of the notebook.
# NOTE(review): this reassigns `questions_df`, so re-running the cell on the
# already-renamed frame will likely fail because `distractor1` is gone — confirm.
questions_df = questions_df.rename({"distractor1": "distractor"})


# Replace the JSON-encoded resource-set `name` with its plain-text form.
resource_set_df = resource_set_df.with_columns(
    pl.col("name").map_elements(strip_json_formatting, return_dtype=str).cast(pl.String)
)


# Decode the three text columns the same way, then keep only the columns
# listed in the final `select`.
questions_df_clean = questions_df.with_columns(
    pl.col("question")
    .map_elements(strip_json_formatting, return_dtype=str)
    .cast(pl.String),
    pl.col("correct")
    .map_elements(strip_json_formatting, return_dtype=str)
    .cast(pl.String),
    pl.col("distractor")
    .map_elements(strip_json_formatting, return_dtype=str)
    .cast(pl.String),
).select(["rs", "question", "correct", "distractor", "successRate"])


In [23]:
# Discard rows that contain NaN or null values.
questions_df_clean = questions_df_clean.drop_nans().drop_nulls()

# Preview the cleaned table.
questions_df_clean.head()

rs,question,correct,distractor,successRate
i64,str,str,str,f64
64,"""Co vypíše program?""","""code""","""code""",89.071
64,"""Co vypíše program?""","""code""","""code""",86.251
64,"""Co vypíše program?""","""code""","""code""",88.404
64,"""Co vypíše program?""","""code""","""code""",84.608
64,"""Co vypíše program?""","""code""","""code""",69.731


## 4) Obtain some simple (syntactic) features


In [None]:
def is_fill_in_blank(field: str) -> int:
    """Return 1 when `field` contains an underscore, otherwise 0."""
    return int("_" in field)


def is_explicit_question(question: str | None) -> int:
    if not question:
        return 0
    return 1 if question[-1] == "?" else 0


def is_true_false(correct: pl.Series, distractor: pl.Series) -> pl.Series:
    """Flag rows where either answer option is exactly "pravda".

    Unlike the other helpers in this cell, this one works on whole columns:
    it is invoked through `map_batches`, and the body relies on the
    element-wise `|` operator and `.cast`.

    Args:
        correct: Column of correct answers.
        distractor: Column of distractor answers.

    Returns:
        An Int8 column with 1 where `correct` or `distractor` equals
        "pravda" and 0 where neither does.
    """
    return ((correct == "pravda") | (distractor == "pravda")).cast(pl.Int8)


def contains_img(question: str, correct: str, distractor: str) -> int:
    """Return 1 when "img" occurs in the space-joined fields, otherwise 0."""
    combined = " ".join((question, correct, distractor))
    return int("img" in combined)


def contains_code(question: str, correct: str, distractor: str) -> int:
    """Return 1 when any of the three fields represents a code snippet, else 0.

    Two markers are recognised:

    * the raw tag "code64" anywhere in the space-joined text, which covers
      values that were not decoded; and
    * a field that is exactly "code", the placeholder that
      `strip_json_formatting` substitutes for a "code64" item.

    Looking only for "code64" misses every decoded value, because the
    placeholder no longer contains that tag. The placeholder is matched
    exactly rather than as a substring so that ordinary words such as
    "Unicode" do not set the flag.

    Args:
        question: Question text or placeholder.
        correct: Correct answer text or placeholder.
        distractor: Distractor text or placeholder.

    Returns:
        1 if a code marker is present, otherwise 0.
    """
    fields = (question, correct, distractor)
    text = " ".join(fields)
    return 1 if "code64" in text or "code" in fields else 0


def ends_with_colon(question: str | None) -> int:
    if not question:
        return 0
    return 1 if question[-1] == ":" else 0


def what_was_first(question: str) -> int:
    """Return 1 when the lower-cased question is exactly "co bylo dříve?", else 0."""
    return int(question.lower() == "co bylo dříve?")


def contains_number(question: str, correct: str, distractor: str) -> int:
    """Return 1 when the space-joined fields contain a standalone run of digits.

    "Standalone" means the digits are delimited by word boundaries on both
    sides, so digits glued to letters (e.g. "a1b") do not count.
    """
    combined = " ".join((question, correct, distractor)).lower()
    match = re.search(r"\b(\d+)\b", combined)
    return 0 if match is None else 1


# Add `question_correct` (question and correct answer joined by a space) and
# the syntactic 0/1 flags computed from single columns.
questions_df_clean = questions_df_clean.with_columns(
    question_correct=pl.col("question") + " " + pl.col("correct"),
    fill_in_blank=pl.col("question").map_elements(
        is_fill_in_blank, return_dtype=pl.Int8
    ),
    explicit_question=pl.col("question").map_elements(
        is_explicit_question, return_dtype=pl.Int8
    ),
    colon_end=pl.col("question").map_elements(ends_with_colon, return_dtype=pl.Int8),
    what_was_first=pl.col("question").map_elements(
        what_was_first, return_dtype=pl.Int8
    ),
    # `map_batches` hands over the whole struct column, so `is_true_false`
    # receives the two fields as columns rather than as single strings.
    true_false=pl.struct(["correct", "distractor"]).map_batches(
        lambda x: is_true_false(
            x.struct.field("correct"), x.struct.field("distractor")
        ),
        return_dtype=pl.Int8,
    ),
)

# The three flags below need all of question/correct/distractor at once.
# `map_rows` passes each row as a tuple, so the fields are addressed by
# position: t[1] = question, t[2] = correct, t[3] = distractor. This relies
# on the column order produced by the earlier `select`.
has_number = (
    questions_df_clean.map_rows(
        lambda t: (contains_number(t[1], t[2], t[3])),
        return_dtype=pl.Int8,
    )
    .to_series()
    .alias("has_number")
)
has_img = (
    questions_df_clean.map_rows(
        lambda t: (contains_img(t[1], t[2], t[3])), return_dtype=pl.Int8
    )
    .to_series()
    .alias("has_img")
)
has_code = (
    questions_df_clean.map_rows(
        lambda t: (contains_code(t[1], t[2], t[3])), return_dtype=pl.Int8
    )
    .to_series()
    .alias("has_code")
)

# Append the three flag columns at the end, in the order has_img, has_code,
# has_number. The return value is not assigned, so this relies on
# `insert_column` modifying the frame in place.
questions_df_clean.insert_column(len(questions_df_clean.columns), has_img)
questions_df_clean.insert_column(len(questions_df_clean.columns), has_code)
questions_df_clean.insert_column(len(questions_df_clean.columns), has_number)

# Drop rows whose question + correct-answer text contains one of the
# placeholders "code", "img" or "emoji". The distractor is not checked here.
# NOTE(review): this is a substring match, so ordinary text that happens to
# contain one of these strings is removed as well — confirm this is acceptable.
questions_df_clean = questions_df_clean.filter(
    ~pl.col("question_correct").str.contains_any(["code", "img", "emoji"])
)


# Preview the table with the new feature columns.
questions_df_clean.head()

rs,question,correct,distractor,successRate,question_correct,fill_in_blank,explicit_question,colon_end,what_was_first,true_false,has_img,has_code,has_number
i64,str,str,str,f64,str,i8,i8,i8,i8,i8,i64,i64,i64
64,"""Co znamená v Pythonu operace '…","""zbytek po dělení""","""celočíselné dělení""",76.968,"""Co znamená v Pythonu operace '…",0,1,0,0,0,0,0,0
64,"""Co znamená v Pythonu operace '…","""dělení""","""zbytek po dělení""",76.394,"""Co znamená v Pythonu operace '…",0,1,0,0,0,0,0,0
64,"""Co znamená v Pythonu operace '…","""celočíselné dělení""","""zbytek po dělení""",74.399,"""Co znamená v Pythonu operace '…",0,1,0,0,0,0,0,0
64,"""Co znamená v Pythonu operace '…","""umocňování""","""násobení""",81.184,"""Co znamená v Pythonu operace '…",0,1,0,0,0,0,0,0
68,"""Co udělá program?""","""Zacyklí se a nikdy neskončí""","""Vypíše součet čísel od 1 do 5""",67.595,"""Co udělá program? Zacyklí se a…",0,1,0,0,0,0,0,1


## 5) Keep only resource sets that have enough questions

I chose a cutoff of 60 questions per resource set so that the BERTopic pipeline has enough documents to work with.


In [27]:
# Count the remaining questions per resource set and keep the sets with at
# least 60 of them; `rs` is renamed to `id` to match the resource-set table.
rs_count_enough = (
    questions_df_clean.group_by("rs")
    .len()
    .filter(pl.col("len") >= 60)
    .rename({"rs": "id"})
)
# Attach the counts to the resource-set table, largest sets first.
resource_set_df_filtered = resource_set_df.join(rs_count_enough, on="id").sort(
    "len", descending=True
)

# Joining against the filtered resource sets drops the questions that belong
# to sets below the cutoff.
questions_df_clean_filtered = questions_df_clean.join(
    resource_set_df_filtered, left_on="rs", right_on="id"
)

# Keep only the first 13 columns by position, which removes the columns
# brought in by the join.
# NOTE(review): `has_number` is the 14th column, so it is dropped here too
# (it is absent from the preview below) — confirm this is intended.
questions_df_clean_filtered = questions_df_clean_filtered.select(pl.nth(range(13)))

# Preview the filtered table.
questions_df_clean_filtered.head()

rs,question,correct,distractor,successRate,question_correct,fill_in_blank,explicit_question,colon_end,what_was_first,true_false,has_img,has_code
i64,str,str,str,f64,str,i8,i8,i8,i8,i8,i64,i64
65,"""Jak v Pythonu zapisujeme logic…","""and""","""&&""",83.878,"""Jak v Pythonu zapisujeme logic…",0,1,0,0,0,0,0
65,"""Jak v Pythonu zapisujeme logic…","""and""","""or""",81.737,"""Jak v Pythonu zapisujeme logic…",0,1,0,0,0,0,0
65,"""Jak v Pythonu zapisujeme logic…","""or""","""||""",84.746,"""Jak v Pythonu zapisujeme logic…",0,1,0,0,0,0,0
65,"""Jak v Pythonu zapisujeme logic…","""not""","""!""",73.499,"""Jak v Pythonu zapisujeme logic…",0,1,0,0,0,0,0
65,"""Jak v Pythonu zapisujeme test …","""x != y""","""x =!= y""",74.795,"""Jak v Pythonu zapisujeme test …",0,1,0,0,0,0,0


## 6) Save cleaned data frame


In [None]:
# Write the filtered question table as CSV into the data directory.
questions_df_clean_filtered.write_csv(data_dir / "questions_cleaned_filtered.csv")

In [None]:
# Map each kept resource-set id to a readable label: the shortcut with its
# first "-"-separated token dropped and the remaining tokens joined by spaces.
rs_dicts = resource_set_df_filtered.select("id", "shortcut").to_dicts()
rs_dict = {
    record["id"]: " ".join(record["shortcut"].split("-")[1:])
    for record in rs_dicts
}

In [None]:
import pickle

# Serialize the id -> label mapping with the highest available pickle protocol.
# The file goes to the current working directory, not to `data_dir`.
with open("rs_filtered.pickle", "wb") as handle:
    pickle.dump(rs_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Display the id -> label mapping.
rs_dict

{144: 'tabulky porozumeni zaklady',
 291: 'neuronove site',
 63: 'negace tvrzeni',
 184: 'grafy nejkratsi cesty',
 57: 'spolecna vlastnost slova',
 137: 'vyhledavani na internetu',
 183: 'grafy abstrakce',
 146: 'sloupcove grafy',
 311: 'internet web',
 132: 'historie pocitacu',
 103: 'prezentace',
 105: 'vstupni vystupni zarizeni',
 107: 'hardware pamet',
 114: 'viry spam socialni inzenyrstvi',
 102: 'e mail',
 152: 'interpunkce specialni znaky',
 147: 'kolacove grafy',
 323: 'ovoce zelenina urcovani pravdivosti',
 87: 'tabulky rozsahy',
 289: 'omezujici podminky',
 108: 'tabulky bunky radky sloupce',
 109: 'tabulky pocetni operace',
 82: 'tabulky funkce',
 286: 'prohledavani stavoveho prostoru',
 212: 'razeni dat beztabulek',
 213: 'razeni dat tabulky',
 284: 'strojove uceni rozhodovaci stromy',
 115: 'hesla autentizace',
 246: 'ziskavani dat',
 119: 'site obecne',
 283: 'linearni regrese',
 305: 'algoritmicke mysleni pojmenovani promennych',
 306: 'histogramy',
 326: 'ovoce zelenina