In [1]:
#import sys
#!{sys.executable} -m pip install -U pip
#!{sys.executable} -m pip install pandas numpy

In [2]:
# Notebook käivitamine (VS Code / JupyterLab / browser):
# - Run All (või jooksuta järjest)
# - Viimased 4 celli peavad ekraanile kuvama A–D väljundid

In [3]:
# Cell 1: paths
from pathlib import Path
import pandas as pd
import json

# kohanda kui vaja
IN_CSV = "toorandmed_aasta.csv"          # või "data/toorandmed_aasta.csv"
CONFIG = "cleaner_config.json"
OUTDIR = "out"

Path(OUTDIR).mkdir(exist_ok=True)

In [4]:
# Cell 2: run cleaning script (py eraldi failina)
import sys

cmd = f'{sys.executable} cleaner_configurable.py --in "{IN_CSV}" --config "{CONFIG}" --lang both --outdir "{OUTDIR}"'
print(cmd)
!{cmd}

/home/reimoz/.venvs/jupyter/bin/python cleaner_configurable.py --in "toorandmed_aasta.csv" --config "cleaner_config.json" --lang both --outdir "out"
Wrote:
 - out/courses_cleaned_full.csv
 - out/courses_metadata.csv
 - out/courses_documents.csv
 - out/clean_report.json


In [5]:
# Cell 3: load outputs
df_full = pd.read_csv(f"{OUTDIR}/courses_cleaned_full.csv")
df_meta = pd.read_csv(f"{OUTDIR}/courses_metadata.csv")
df_docs = pd.read_csv(f"{OUTDIR}/courses_documents.csv")

with open(f"{OUTDIR}/clean_report.json", "r", encoding="utf-8") as f:
    rep = json.load(f)

(df_full.shape, df_meta.shape, df_docs.shape, rep.keys())

((3135, 232),
 (3135, 21),
 (3135, 11),
 dict_keys(['dropped', 'kept_rows', 'input_rows', 'lang', 'json_flattened_cols', 'missing_by_column_top', 'categoricals', 'document_text_length_stats']))

In [6]:
#RAPORT

In [7]:
#Puuduvate väärtuste hulk igas veerus

In [8]:
# LAST CELLS - A) Missing values per column
missing = df_full.isna().sum().sort_values(ascending=False)
missing_pct = (missing / len(df_full) * 100).round(2)
missing_table = pd.DataFrame({"missing": missing, "missing_%": missing_pct})
missing_table

Unnamed: 0,missing,missing_%
title__es,3135,100.0
version__title__es,3135,100.0
version__overview__description__es,3135,100.0
overview__description__es,3135,100.0
version__additional_info__study_levels__names,3135,100.0
...,...,...
version___actions__is_editable,0,0.0
version__parent_code,0,0.0
version__grading__assessment_scale__code,0,0.0
version__grading__assessment_scale__en,0,0.0


In [9]:
#Kategooriliste veergude enim levinud väärtused (head(5))

In [10]:
# LAST CELLS - B) Categoricals top values (head 5)
cat_cols = [
    "version__target__language__code",
    "version__target__semester__code",
    "version__target__faculty__city",
    "additional_info__assessment_scale__code",
    "study_levels__codes",
    "version__additional_info__study_levels__codes"
]

for c in cat_cols:
    print("\n==", c, "==")
    if c in df_full.columns:
        display(df_full[c].fillna("").astype(str).str.strip().value_counts().head(5))
    else:
        print("MISSING COLUMN")


== version__target__language__code ==


version__target__language__code
et    2326
en     809
Name: count, dtype: int64


== version__target__semester__code ==


version__target__semester__code
spring    1570
autumn    1565
Name: count, dtype: int64


== version__target__faculty__city ==


version__target__faculty__city
Tartu linn    3134
Tartu            1
Name: count, dtype: int64


== additional_info__assessment_scale__code ==


additional_info__assessment_scale__code
grade      1858
pass       1276
defence       1
Name: count, dtype: int64


== study_levels__codes ==
MISSING COLUMN

== version__additional_info__study_levels__codes ==


version__additional_info__study_levels__codes
bachelor                    970
master                      576
bachelor;master             320
bachelor_master             260
bachelor;doctoral;master    217
Name: count, dtype: int64

In [11]:
#“Sissejuhatus datasetisse”: ühe aine kirje loetaval kujul (mitte lihtsalt raw print)

In [12]:
# LAST CELLS - C) One example course, readable
import textwrap
import numpy as np

# vali rida, millel document_text on pikk (et näidis oleks sisukas)
ix = df_docs["document_text"].fillna("").astype(str).str.len().sort_values(ascending=False).index[0]

row_full = df_full.loc[ix].to_dict()
row_meta = df_meta.loc[ix].to_dict() if ix in df_meta.index else {}
row_doc  = df_docs.loc[ix].to_dict()

def pretty_kv(d, keys=None, width=110):
    if keys is None:
        keys = list(d.keys())
    lines = []
    for k in keys:
        v = d.get(k, "")
        if isinstance(v, float) and np.isnan(v):
            v = ""
        s = f"{k}: {v}"
        lines.append(textwrap.fill(s, width=width, subsequent_indent="  "))
    return "\n".join(lines)

# vali näitamiseks “mõistlik” alamhulk (muuda kui tahad)
meta_keys = [k for k in [
    "code", "version__code", "course_uuid", "version__uuid",
    "version__target__semester__code", "version__target__language__code",
    "version__target__faculty__city", "additional_info__assessment_scale__code",
    "study_levels__codes", "version__additional_info__study_levels__codes",
] if k in row_meta]

print("### Näidis aine: METADATA ###\n")
print(pretty_kv(row_meta, keys=meta_keys))

print("\n\n### Näidis aine: DOCUMENT_TEXT ###\n")
print(textwrap.fill(str(row_doc.get("document_text", "")), width=110))

### Näidis aine: METADATA ###

code: SVUH.00.111
version__code: sv-2025-spring-fulltime
course_uuid: 32a3a779-112d-20a3-13ef-92e39df5062f
version__uuid: 4da518b8-7606-cb46-0119-f6fe09c73150
version__target__semester__code: spring
version__target__language__code: et
version__target__faculty__city: Tartu linn
additional_info__assessment_scale__code: grade
study_levels__codes: bachelor
version__additional_info__study_levels__codes: bachelor


### Näidis aine: DOCUMENT_TEXT ###

Pealkiri: Praktiline ajakirjandus II Title: Practical Journalism II Kirjeldus: Kursus toimub neljas osas, aga
kahes jaotuses (praktiline ajakirjandus I ja II): esimene osa keskendub uudisele kirjutavas meediumis, teine
osa uudisele audiomeediumis, kolmas uudisele audiovisuaalses meediumis ja neljas digimeedias. Mõtteliselt on
iga osa 6 EAP vääriline. 1) uudis kirjutavas meediumis: tudengid töötavad nädalas ühe täispäeva kui reaalne
toimetus, otsides kajastamisväärseid sündmusi, tehes reporteritööd ning kirjutades s

In [13]:
#document_text tähemärkide arvu statistika (describe + percentiilid)

In [14]:
# LAST CELLS - D) document_text length stats
lens = df_docs["document_text"].fillna("").astype(str).str.len()
lens.describe(percentiles=[0.25, 0.5, 0.75, 0.9, 0.95, 0.99])

count     3135.000000
mean      3229.053270
std       1737.625635
min        189.000000
25%       2045.000000
50%       2834.000000
75%       3980.000000
90%       5458.600000
95%       6570.700000
99%       9005.820000
max      15891.000000
Name: document_text, dtype: float64