# *From Corpus to Classroom: Interactive Access to Children’s Song Repertoires on the MUSCO Platform*
## Authors: Vanessa Nina Borsan, Jure Juvan, Matija Marolt, Matevž Pesek, Leon Stefanija
### Presented as a poster @ ICCCM ’25, 8-10 October 2025, Aalborg University, Aalborg, DENMARK

------


## This notebook includes usage examples of the scripts for:
1. Data Preprocessing
2. Data Plotting

-----

# LOAD DATA

In [None]:
import sys
from pathlib import Path

# Locate the repository root by walking up from the current working directory
# until a folder containing "src" is found (the walk stops at the filesystem
# root if none exists). `p` is reused further down as the repository root.
p = Path.cwd()
while p != p.parent and not (p / "src").exists():
    p = p.parent

# Make <root>/src importable. The entry is added only when missing, so
# re-running this cell does not pile up duplicate sys.path entries. The former
# unconditional insert of Path.cwd() / "src" is dropped: it was either the very
# same path as p / "src" or a folder that does not exist.
_src_dir = str(p / "src")
if _src_dir not in sys.path:
    sys.path.insert(0, _src_dir)

from educationalfilters import prepare_df
from educationalfilters import save_load
from educationalfilters.pipeline import apply_all_filters
import pandas as pd
import importlib
from educationalfilters import filter_label_utils as flu, filter_df
# Re-execute the two filter modules so that edits made to their source files
# are picked up when this cell is re-run, without restarting the kernel.
importlib.reload(flu)
importlib.reload(filter_df)


In [None]:
# Project layout, anchored at the repository root located in the first cell.
REPO_ROOT = Path(p)
RAW_DIR = REPO_ROOT.joinpath("data", "raw")
PROC_DIR = REPO_ROOT.joinpath("data", "processed")

# Inputs are resolved to absolute paths and kept as plain strings, which is
# the form in which they are passed to the prepare_df helpers below.
CIC_JSON_DIR = str(RAW_DIR.joinpath("ciciban_jsons").resolve())
SLP_CSV = str(RAW_DIR.joinpath("slp_df.csv").resolve())
SLP_SCORES = str(REPO_ROOT.joinpath("scores", "slp").resolve())  # folder with .mxl files
RHY_MAP_FILE = "rhythm_mapping.pickle"  # lives in data/processed/ via save_load

In [None]:
# Build Ciciban DF from JSONs (notes-only rhythm, pause_count from JSON rhythm vs melody length)
# Input: the absolute path of the "ciciban_jsons" folder under data/raw.
c_df = prepare_df.convert_jsons_to_df(CIC_JSON_DIR)
# Upgrade to ABC using canonical map (creates/extends data/processed/rhythm_mapping.pickle)
# The call returns the upgraded frame together with the rhythm mapping; the
# mapping is bound to `rhythm_mapping` and reused by later cells.
c_df, rhythm_mapping = prepare_df.df_upgrade(c_df, save_file_path=RHY_MAP_FILE)


# PREPARE DFs

In [None]:
# Load SLP CSV + parse .mxl rhythms/time sigs; apply canonical ABC mapping
slp_df = prepare_df.prepare_slp(SLP_CSV, SLP_SCORES, RHY_MAP_FILE)

# [Optional] harmonize a couple of fields if needed by downstream code.
# When both pitch columns are present they back-fill the ambitus columns.
# An ambitus column that does not exist yet is created from its pitch column
# instead of raising KeyError (the guard only proves the pitch columns exist).
if "min_pitch" in slp_df.columns and "max_pitch" in slp_df.columns:
    for ambitus_col, pitch_col in (("ambitus_min", "min_pitch"), ("ambitus_max", "max_pitch")):
        if ambitus_col in slp_df.columns:
            slp_df[ambitus_col] = slp_df[ambitus_col].fillna(slp_df[pitch_col])
        else:
            slp_df[ambitus_col] = slp_df[pitch_col]

# Keep a string-typed copy of the time signature column.
slp_df["time_signature_raw"] = slp_df["time_signature"].astype("string")

# Quick sanity check: row count and the first three rows.
print(f"{len(slp_df)} SLP rows")
display(slp_df.head(3))

## RHYTHM MAPPING REFERENCE

In [None]:
import pprint
from educationalfilters import save_load

# Reload the mapping through the shared RHY_MAP_FILE setting rather than a
# second hard-coded file name, so this reference view always reads the same
# pickle name that was passed to the preprocessing calls in the earlier cells.
rhythm_mapping = save_load.load_pickle(RHY_MAP_FILE)

# Print the mapping ordered by key for easier reading.
print("🎼 Current Rhythm Mapping (duration → letter) [FIXED: a (0.5), d (1.0), e (2.0); AUTOMATICALLY GENERATED: the rest]:")
pprint.pprint(dict(sorted(rhythm_mapping.items())))

## VRF & IF 

In [None]:
# Pitch-range settings passed to the preschool filter in the next cell:
# the "pre" bounds and the wider "pre_plus" bounds, each as (lowest, highest).
pre_min_range, pre_max_range = 'C4', 'A4'
pre_plus_min_range, pre_plus_max_range = 'A3', 'C5'

In [None]:
# Pitch bounds for the preschool filter, each pair given as (lowest, highest).
pre_min_range, pre_max_range = 'C4', 'A4'
pre_plus_min_range, pre_plus_max_range = 'A3', 'C5'

# Both corpora are filtered with exactly the same settings, so the keyword
# arguments are collected once and reused for each call.
_preschool_kwargs = dict(
    pre_plus_min_pitch=pre_plus_min_range,
    pre_plus_max_pitch=pre_plus_max_range,
    pre_min_pitch=pre_min_range,
    pre_max_pitch=pre_max_range,
    rhythm_mapping=rhythm_mapping,  # accepted but not required by RF
)

# Apply to Ciciban
ciciban_df = filter_df.preschool_filter(c_df, **_preschool_kwargs)

# Apply to SLP
slp_df_f = filter_df.preschool_filter(slp_df, **_preschool_kwargs)

## CONVERT RHYTHMS AND MELODIC INTERVALS

In [None]:
# The filtered corpora and their display labels, declared as pairs so the two
# parallel lists handed to the merge step cannot drift out of order.
_corpora = [('SLP', slp_df_f), ('Ciciban', ciciban_df)]
dfs = [frame for _, frame in _corpora]
labels = [name for name, _ in _corpora]

In [None]:
from educationalfilters import dataset_conversion as dc, rfilters

def _rf_adapter(df, rhythm_mapping, label):
    """Bridge the old filter-function signature to the new rfilters API.

    Args:
        df: Corpus DataFrame; it is not modified, a tagged copy is used.
        rhythm_mapping: Accepted only to satisfy the old signature; unused.
        label: Corpus name written to the ``corpus`` column of the copy.

    Returns:
        Whatever ``rfilters.compute_rhythm_labels`` returns for the tagged
        copy (noted in the original code as ``(df_out, counts)``).
    """
    tagged = df.assign(corpus=label)
    return rfilters.compute_rhythm_labels(tagged)

# Columns requested from the merge step, grouped by theme.
KEEP_COLS = [
    # identification
    "metadata_filename",
    "metadata_title",
    "corpus",
    # melody representations
    "melodic_string",
    "melodic_string_absolute",
    "melodic_string_abc",
    "melodic_string_relative",
    "melodic_intervals",
    # rhythm and metre
    "rhythm_string",
    "rhythm_string_abc",
    "time_signature",
    # pauses
    "has_pauses",
    "pause_count",
    # ambitus
    "ambitus_min",
    "ambitus_max",
    "ambitus_semitones",
    "ambitus_interval",
    # filter labels
    "VRF_label",
    "IF_label",
    "RF_label",
    # include only if you compute these earlier
    "VRF_BOTH",
    "IF_BOTH",
    "RF_BOTH",
]

# Merge the filtered corpora into one frame. `dfs` and `labels` are passed as
# parallel lists, `_rf_adapter` is supplied as the filter function and
# KEEP_COLS as the column selection.
# NOTE(review): how process_and_merge_dfs applies these (and what `all_counts`
# contains) is defined in dataset_conversion — confirm there.
merged_df, all_counts = dc.process_and_merge_dfs(
    dfs, labels, rhythm_mapping, filter_function=_rf_adapter, keep_cols=KEEP_COLS
)
# Replace the merged frame with the result of the melodic-interval preparation.
merged_df = dc.prepare_melodic_intervals(merged_df)

In [None]:
# Columns that play no role in this case study are removed from the merged
# frame. Only the ones actually present are dropped, so a missing column does
# not raise.
_unused_cols = ("metadata_title", "melodic_string", "melodic_string_absolute")
drop_cols = [name for name in _unused_cols if name in merged_df.columns]

merged_df = merged_df.drop(columns=drop_cols)

## DATA CLEAN UP

In [None]:
import sys
from pathlib import Path
# Make ./src importable, adding the entry only when it is missing so that
# re-running this cell does not stack duplicate entries on sys.path.
_cwd_src = str(Path.cwd() / "src")
if _cwd_src not in sys.path:
    sys.path.insert(0, _cwd_src)

from educationalfilters import filter_df

In [None]:
# Build the summary table from the merged frame; it is plotted and exported
# to CSV in the PLOT section.
summary_clean = filter_df.prepare_all_filters_clean(merged_df)  # fixed row order for plotting

## PLOT

In [None]:
import sys
import os

# add the parent directory (project/) to sys.path
# ".." is resolved against the current working directory, so this only points
# at the project folder when the notebook runs from a direct subfolder of it.
sys.path.append(os.path.abspath(".."))

from scripts import plot

# Bar styling, one entry per corpus.
COLORS  = ['#04795E', '#E9E9E9']  # Ciciban, SLP
HATCHES = ['//', '\\\\']

# Filter names shown in the two plot groups.
# NOTE(review): presumably these must match the labels in summary_clean exactly
# (including the inconsistent spacing around "+") — confirm before editing.
group1 = ["VRF1", "IF1", "VRF2", "IF2", "VRF1 + IF1", "VRF2+IF2", "ANY (VRF+IF)"]
group2 = ["RF1", "RF2", "RF3", "RF4", "VRF2+IF2+RF3", "VRF2+IF2+RF4", "ANY (VRF+IF+RF)"]

# Create the output folder BEFORE anything is written into it. Previously it
# was created only after plot_filters had already been asked to save the PNG
# there, which can fail on a fresh checkout where "exports" does not exist.
os.makedirs("exports", exist_ok=True)

plot.plot_filters(summary_clean, COLORS, HATCHES, group1, group2, save_path="exports/filters_summary.png")
summary_clean.to_csv("exports/filters_summary_clean.csv")