In [1]:
import os
import pandas as pd
from collections import defaultdict
from xml.etree import ElementTree as ET

In [None]:
# Directory that holds the SymbTr MusicXML scores. The cells below list this
# directory and parse every file whose name ends in ".xml".
# Replace the placeholder with the location of your local copy.
dataset_path = "/path/to/SymbTr/MusicXML"

### Extract makam - usul counts

In [3]:
def extract_music_info(file_path):
    """Return the makam annotation text stored in a MusicXML score.

    The file is parsed and every ``<words>`` element that is a child of a
    ``<direction-type>`` element is inspected in document order.

    Args:
        file_path: Path of the MusicXML file to parse.

    Returns:
        The whitespace-stripped text of the first ``<words>`` element whose
        text contains ``"Makam:"``, or ``None`` when no element matches.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        OSError: If the file cannot be opened.
    """
    root = ET.parse(file_path).getroot()

    for element in root.findall(".//direction-type/words"):
        text = element.text
        # An empty element (<words/>) has text None; testing `in` on it would
        # raise TypeError, so skip it and keep scanning.
        if text and "Makam:" in text:
            return text.strip()

    return None

In [4]:
# Nested counter: makam_usul_counts[makam][usul] -> number of scores with
# that makam/usul pair. The two sets collect every label seen.
makam_usul_counts = defaultdict(lambda: defaultdict(int))
unique_makams = set()
unique_usuls = set()

for filename in os.listdir(dataset_path):
    if not filename.endswith(".xml"):
        continue
    song_info = extract_music_info(os.path.join(dataset_path, filename))
    # extract_music_info returns None when a score carries no "Makam:"
    # annotation; skip such files instead of failing on None.split(...).
    if song_info is None:
        continue
    # The annotation is read positionally: the 1st comma-separated field
    # holds the makam and the 3rd the usul, each as "label: value"
    # (presumably "Makam: ..., Form: ..., Usul: ..." -- confirm against data).
    fields = song_info.split(",")
    makam = fields[0].split(":")[1].strip()
    usul = fields[2].split(":")[1].strip()
    unique_makams.add(makam)
    unique_usuls.add(usul)
    makam_usul_counts[makam][usul] += 1

In [5]:
# Turn the label sets into sorted lists so the row/column order is fixed.
unique_makams = sorted(unique_makams)
unique_usuls = sorted(unique_usuls)

# Pivot the nested counts into a makam x usul matrix of integers;
# makam/usul pairs that never occur together end up as 0.
makam_usul_df = (
    pd.DataFrame.from_dict(makam_usul_counts, orient="index")
    .reindex(index=unique_makams, columns=unique_usuls)
    .fillna(0)
    .astype("int64")
)

In [6]:
# Number of distinct makams and distinct usuls collected from the dataset
print(len(unique_makams), len(unique_usuls))

155 107


In [7]:
# For every makam, keep the usul(s) with the highest count together with
# that count; ties are all kept, in the order they were first counted.
makam_preferred_usuls = {}
for makam, usuls in makam_usul_counts.items():
    if len(usuls) == 0:
        # Nothing was counted for this makam, so there is no preference
        continue
    max_count = max(usuls.values())
    preferred = [name for name, seen in usuls.items() if seen == max_count]
    makam_preferred_usuls[makam] = (preferred, max_count)

# Rank makams by the count of their preferred usul (highest first) and
# show only the first 15 entries of that ranking.
ranked_makams = sorted(
    makam_preferred_usuls.items(), key=lambda item: item[1][1], reverse=True
)
print("Most common usul for each makam:")
for makam, (usuls, count) in ranked_makams[:15]:
    print(
        f"Makam: {makam}, Preferred usul(s): {', '.join(usuls)} (used {count} times)"
    )

Most common usul for each makam:
Makam: Hicaz, Preferred usul(s): Aksak (used 26 times)
Makam: Nihâvent, Preferred usul(s): Düyek (used 24 times)
Makam: Hüzzam, Preferred usul(s): Aksak (used 21 times)
Makam: Uşşak, Preferred usul(s): Aksak, Sofyan (used 19 times)
Makam: Mâhur, Preferred usul(s): Aksak (used 18 times)
Makam: Rast, Preferred usul(s): Düyek (used 18 times)
Makam: Segâh, Preferred usul(s): Aksak (used 18 times)
Makam: Hüseynî, Preferred usul(s): Sofyan (used 17 times)
Makam: Hicazkâr, Preferred usul(s): Düyek (used 15 times)
Makam: Kürdîlihicazkâr, Preferred usul(s): Aksak (used 13 times)
Makam: Muhayyer, Preferred usul(s): Sofyan (used 13 times)
Makam: Beyâtî, Preferred usul(s): Düyek (used 10 times)
Makam: Bûselik, Preferred usul(s): Düyek (used 10 times)
Makam: Sabâ, Preferred usul(s): Düyek, Aksak (used 10 times)
Makam: Acemaşîrân, Preferred usul(s): Aksak (used 9 times)


In [8]:
# Display the count matrix (rows: makams, columns: usuls)
makam_usul_df

Unnamed: 0,14/4,2+2+3,3+3+3+4,Aksak,Aksak (9/4),Aksaksemâî,Aksaksemâî Evferi,Aydın,Ayîn Devr-i Revânı,Ağır Aksaksemâî,...,Âzerî Yürüksemâî,Çenber,Çenber (12/2),Çeng-i Harbî,Çiftedüyek,Çiftesofyan,Çiftesofyan (9/16),İki-Bir,İkiz Aksak,Şarkı Devr-i Revânı
Acem,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Acemaşîrân,0,0,0,9,0,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Acembûselik,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Acemkürdî,0,0,0,6,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
Acemtarab,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Şevk-ı-Cedîd,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Şevk-ı-Dil,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Şevkutarab,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Şevkâver,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Save the makam x usul count matrix to a CSV file.
# The output directory is created first if it does not exist yet, and the
# makam row labels are written as the first column (index=True).
os.makedirs("../data", exist_ok=True)
makam_usul_df.to_csv("../data/makam_usul_counts.csv", index=True)

### Extract makam - cycle length counts

In [10]:
def classify_cycle_length(beats):
    """Classify a time-signature numerator into a cycle-length category.

    Args:
        beats: Number of beats per cycle, as an int or as the text of a
            MusicXML ``<beats>`` element. Additive values such as
            ``"2+2+3"`` are accepted and classified by their total.

    Returns:
        ``'Short'`` for up to 4 beats, ``'Medium'`` for 5-9, ``'Long'`` for
        10-16 and ``'Extended'`` for anything above 16.

    Raises:
        ValueError: If ``beats`` is neither an integer nor a ``+``-separated
            list of integers.
        TypeError: If ``beats`` cannot be converted at all (e.g. ``None``).
    """
    try:
        total_beats = int(beats)
    except ValueError:
        # MusicXML allows additive numerators like "3+2"; use their sum.
        # A malformed part re-raises ValueError, as before.
        total_beats = sum(int(part) for part in str(beats).split("+"))

    if total_beats <= 4:
        return 'Short'
    if total_beats <= 9:
        return 'Medium'
    if total_beats <= 16:
        return 'Long'
    return 'Extended'

In [11]:
def extract_makam_and_cycle_length(file_path):
    """Read the makam name and the cycle-length categories of a score.

    Args:
        file_path: Path of the MusicXML file to parse.

    Returns:
        A ``(makam, cycle_length)`` tuple. ``makam`` is the text between
        ``"Makam:"`` and the next comma in the last matching
        ``<direction-type>/<words>`` element, stripped of whitespace, or
        ``None`` when no element matches. ``cycle_length`` is the set of
        categories returned by ``classify_cycle_length`` for every ``<time>``
        element that has both a ``<beats>`` and a ``<beat-type>`` value.
    """
    root = ET.parse(file_path).getroot()

    # Collect every annotation that mentions the makam; the last one wins.
    makam_texts = [
        words.text
        for words in root.findall(".//direction-type/words")
        if words.text and "Makam:" in words.text
    ]
    if makam_texts:
        makam = makam_texts[-1].split("Makam:")[1].split(",")[0].strip()
    else:
        makam = None

    # Only time signatures with both numerator and denominator are counted;
    # the category itself depends on the numerator alone.
    cycle_length = {
        classify_cycle_length(time.findtext("beats"))
        for time in root.findall(".//time")
        if time.findtext("beats") and time.findtext("beat-type")
    }

    return makam, cycle_length

In [12]:
# Nested counter: makam_time_counts[makam][category] -> number of scores of
# that makam containing at least one time signature of that category.
makam_time_counts = defaultdict(lambda: defaultdict(int))
all_makams = set()
all_cycle_lengths = {'Short', 'Medium', 'Long', 'Extended'}

# Process every MusicXML file in the dataset directory
for filename in os.listdir(dataset_path):
    if not filename.endswith(".xml"):
        continue
    file_path = os.path.join(dataset_path, filename)
    makam, time_sigs = extract_makam_and_cycle_length(file_path)
    if makam and time_sigs:
        # A score contributes once to each distinct category it contains
        all_makams.add(makam)
        for cycle_length in time_sigs:
            makam_time_counts[makam][cycle_length] += 1

# Pivot the nested counts into a makam x category matrix of integers;
# combinations that never occur end up as 0.
makam_cycle_length_df = (
    pd.DataFrame.from_dict(makam_time_counts, orient="index")
    .reindex(index=sorted(all_makams), columns=sorted(all_cycle_lengths))
    .fillna(0)
    .astype("int64")
)

makam_cycle_length_df

Unnamed: 0,Extended,Long,Medium,Short
Acem,1,2,4,2
Acemaşîrân,5,9,42,12
Acembûselik,0,0,2,0
Acemkürdî,1,8,21,9
Acemtarab,1,0,0,0
...,...,...,...,...
Şevk-ı-Cedîd,0,0,1,0
Şevk-ı-Dil,1,1,0,0
Şevkutarab,0,1,0,2
Şevkâver,1,1,0,0


In [13]:
# Column totals: summed count per cycle-length category over all makams.
# A score with several distinct categories is counted once in each of them.
makam_cycle_length_df.sum(axis=0)

Extended     132
Long         400
Medium      1273
Short        526
dtype: int64

In [14]:
# Save the makam x cycle-length count matrix to CSV.
# The output directory is created first if it does not exist yet.
os.makedirs("../data", exist_ok=True)
makam_cycle_length_df.to_csv("../data/makam_vs_cycle_length_counts.csv")