In [3]:
from pathlib import Path
import xml.etree.ElementTree as ET
from collections import defaultdict, Counter
from itertools import chain

# MrOS NSRR event-annotation directories, one per visit.
visit1 = Path("/scratch/besp/shared_data/mros/polysomnography/annotations-events-nsrr/visit1")
visit2 = Path("/scratch/besp/shared_data/mros/polysomnography/annotations-events-nsrr/visit2")

# EventType -> Counter mapping each EventConcept to its number of occurrences.
etype2concepts = defaultdict(Counter)

# Walk the annotation files of both visits as one stream.
all_xml_files = chain(
    visit1.glob("mros-visit1-*-nsrr.xml"),
    visit2.glob("mros-visit2-*-nsrr.xml"),
)

for file_no, xml_path in enumerate(all_xml_files, start=1):
    # Numbering starts at 1, so every 100th file emits a progress line.
    if file_no % 100 == 0:
        print(f"parsed {file_no:,} files")

    for scored_event in ET.parse(xml_path).iterfind(".//ScoredEvent"):
        etype = (scored_event.findtext("EventType") or "").strip()
        concept = (scored_event.findtext("EventConcept") or "").strip()

        # Events lacking either field are left out of the tally.
        if not etype or not concept:
            continue
        etype2concepts[etype][concept] += 1


print("\n=== SUMMARY ===\n")
for etype in sorted(etype2concepts):
    concept_counts = etype2concepts[etype]
    total = sum(concept_counts.values())
    print(f"{etype}  (total {total:,})")

    # Most frequent concept first.
    for concept, n in concept_counts.most_common():
        print(f"    └─ {concept:<40} {n:,}")
    print()

import pandas as pd

# Flatten the nested counters into (type, concept, count) records.
rows = [
    (etype, concept, n)
    for etype, counter in etype2concepts.items()
    for concept, n in counter.items()
]

# Event types alphabetically; within a type, the largest count first.
df = pd.DataFrame(rows, columns=["Eventtype", "Eventconcept", "number-of-sample"])
df = df.sort_values(["Eventtype", "number-of-sample"], ascending=[True, False])

# Relative path: the CSV lands in the current working directory.
csv_path = "mros_visit1_event_summary.csv"
df.to_csv(csv_path, index=False)

parsed 100 files
parsed 200 files
parsed 300 files
parsed 400 files
parsed 500 files
parsed 600 files
parsed 700 files
parsed 800 files
parsed 900 files
parsed 1,000 files
parsed 1,100 files
parsed 1,200 files
parsed 1,300 files
parsed 1,400 files
parsed 1,500 files
parsed 1,600 files
parsed 1,700 files
parsed 1,800 files
parsed 1,900 files
parsed 2,000 files
parsed 2,100 files
parsed 2,200 files
parsed 2,300 files
parsed 2,400 files
parsed 2,500 files
parsed 2,600 files
parsed 2,700 files
parsed 2,800 files
parsed 2,900 files
parsed 3,000 files
parsed 3,100 files
parsed 3,200 files
parsed 3,300 files
parsed 3,400 files
parsed 3,500 files
parsed 3,600 files
parsed 3,700 files
parsed 3,800 files
parsed 3,900 files

=== SUMMARY ===

Arousals|Arousals  (total 628,637)
    └─ ASDA arousal|Arousal (ASDA)              623,627
    └─ Arousal|Arousal ()                       3,754
    └─ Arousal|Arousal (ARO Limb)               1,095
    └─ Arousal|Arousal (Asda)                   133
    └─ A

In [4]:
from pathlib import Path
import xml.etree.ElementTree as ET
from collections import defaultdict, Counter
from itertools import chain

# MESA NSRR event-annotation directory.
visit1 = Path("/scratch/besp/shared_data/mesa/polysomnography/annotations-events-nsrr")

# EventType -> Counter mapping each EventConcept to its number of occurrences.
etype2concepts = defaultdict(Counter)

all_xml_files = chain(
    visit1.glob("mesa-sleep-*-nsrr.xml"),
)

for file_no, xml_path in enumerate(all_xml_files, start=1):
    # Numbering starts at 1, so every 100th file emits a progress line.
    if file_no % 100 == 0:
        print(f"parsed {file_no:,} files")

    for scored_event in ET.parse(xml_path).iterfind(".//ScoredEvent"):
        etype = (scored_event.findtext("EventType") or "").strip()
        concept = (scored_event.findtext("EventConcept") or "").strip()

        # Events lacking either field are left out of the tally.
        if not etype or not concept:
            continue
        etype2concepts[etype][concept] += 1


print("\n=== SUMMARY ===\n")
for etype in sorted(etype2concepts):
    concept_counts = etype2concepts[etype]
    total = sum(concept_counts.values())
    print(f"{etype}  (total {total:,})")

    # Most frequent concept first.
    for concept, n in concept_counts.most_common():
        print(f"    └─ {concept:<40} {n:,}")
    print()

import pandas as pd

# Flatten the nested counters into (type, concept, count) records.
rows = [
    (etype, concept, n)
    for etype, counter in etype2concepts.items()
    for concept, n in counter.items()
]

# Event types alphabetically; within a type, the largest count first.
df = pd.DataFrame(rows, columns=["Eventtype", "Eventconcept", "number-of-sample"])
df = df.sort_values(["Eventtype", "number-of-sample"], ascending=[True, False])

# Relative path: the CSV lands in the current working directory.
csv_path = "mesa_visit1_event_summary.csv"
df.to_csv(csv_path, index=False)

parsed 100 files
parsed 200 files
parsed 300 files
parsed 400 files
parsed 500 files
parsed 600 files
parsed 700 files
parsed 800 files
parsed 900 files
parsed 1,000 files
parsed 1,100 files
parsed 1,200 files
parsed 1,300 files
parsed 1,400 files
parsed 1,500 files
parsed 1,600 files
parsed 1,700 files
parsed 1,800 files
parsed 1,900 files
parsed 2,000 files

=== SUMMARY ===

Arousals|Arousals  (total 323,439)
    └─ Arousal|Arousal ()                       257,992
    └─ ASDA arousal|Arousal (ASDA)              61,976
    └─ Spontaneous arousal|Arousal (ARO SPONT)  3,326
    └─ Arousal resulting from respiratory effort|Arousal (ARO RES) 145

Electrocardiogram|Electrocardiogram  (total 1)
    └─ Narrow complex tachycardia|Narrow Complex Tachycardia 1

Limb Movement|Limb Movement  (total 230,482)
    └─ Periodic leg movement - left|PLM (Left)  170,761
    └─ Limb movement - left|Limb Movement (Left) 59,721

Respiratory|Respiratory  (total 1,155,505)
    └─ SpO2 desaturation|SpO2 desatu

In [4]:
import re
from pathlib import Path
from datetime import datetime, timedelta
import pandas as pd
EPOCH_LEN   = 30
_TIME_FMTS  = ("%H:%M:%S.%f", "%H:%M:%S")

def _parse_time(tstr: str) -> datetime.time:
    """Parse a clock string into a time-of-day object.

    Every format listed in ``_TIME_FMTS`` is tried in order and the first one
    that matches wins. Leading and trailing whitespace is ignored, because
    ``strptime`` rejects padded input and clock columns may carry padding.

    Args:
        tstr: Clock text, e.g. ``"21:29:42"`` or ``"21:29:42.123"``.

    Returns:
        The value of ``datetime.strptime(...).time()`` for the matching format.

    Raises:
        ValueError: If none of the formats in ``_TIME_FMTS`` matches.
    """
    # NOTE(review): with `from datetime import datetime` in scope, the return
    # annotation above resolves to the `datetime.time` *method*, not the `time`
    # class. It is harmless at runtime and is kept so the signature is unchanged.

    # Local import: the helper keeps working however `datetime` is bound in the
    # surrounding namespace at call time.
    from datetime import datetime as _dt

    cleaned = tstr.strip()
    for fmt in _TIME_FMTS:
        try:
            return _dt.strptime(cleaned, fmt).time()
        except ValueError:
            continue
    # Report the original text so padding problems stay visible in the message.
    raise ValueError(f"Bad time string: {tstr!r}")

def _safe_float(s: str, default: float = 0.0) -> float:
    try:
        return float(s)
    except ValueError:
        return default
def parse_all_score_wsc(fp) -> pd.DataFrame:
    """Parse a WSC ``allscore`` text file into a table of timed events.

    Each useful line has the form ``<clock>\\t<note>``. The note containing
    ``START RECORDING`` fixes time zero; lines before it are ignored. After
    that, only notes of the form ``<class> - DUR: <seconds> SEC. - <label> ...``
    are kept. A clock value earlier than the recording start is taken to mean
    midnight was crossed, and from then on events are placed on the next day.

    Args:
        fp: Path (or path-like string) of the allscore file.

    Returns:
        DataFrame with columns ``EVENT_CLASS``, ``EVENT``, ``START_SEC``,
        ``END_SEC``, ``DURATION_SEC`` and ``EPOCH``. Times are seconds since
        the recording start. ``EVENT`` is forced to ``DESATURATION`` for that
        class and gets an ``LM-`` / ``AROUSAL-`` prefix for those classes.

    Raises:
        ValueError: On a second START RECORDING line, when no START RECORDING
            line exists, when no DUR event was found, or when a clock string
            cannot be parsed.
    """
    fp = Path(fp)

    # <class> - DUR: <seconds> SEC. - <label> [- anything else]
    rx = re.compile(
        r"""^(?P<event>.+?)\s*-\s*DUR:\s*(?P<dur>\d+(?:\.\d+)?)\s*SEC\.?\s*-\s*
            (?P<concept>[^-]+?)\s*(?:-|$)""",
        re.I | re.VERBOSE,
    )

    recording_start = None   # datetime of the START RECORDING line (dummy date)
    rolled_days = 0          # whole days added once the clock wraps past midnight
    records = []             # (event, concept, start_sec, end_sec, dur, epoch)

    with fp.open() as handle:
        for raw_line in handle:
            line = raw_line.rstrip("\n")
            # Blank lines and lines starting with '*' or '#' carry no events.
            if not line or line[0] in ("*", "#"):
                continue

            # Clock and note are separated by the first tab; no tab -> not an event line.
            clock_str, tab, note = line.partition("\t")
            if not tab:
                continue
            clock_str = clock_str.strip()
            note = note.strip()

            if "START RECORDING" in note.upper():
                if recording_start is not None:
                    raise ValueError(f"Multiple START RECORDING lines in {fp}")
                # Anchor the clock time on an arbitrary fixed date.
                recording_start = datetime.combine(
                    datetime(2000, 1, 1), _parse_time(clock_str)
                )
                continue

            # Nothing can be timed until the recording start is known.
            if recording_start is None:
                continue

            match = rx.search(note)
            if match is None:
                continue

            event = match.group("event").strip()
            dur = float(match.group("dur"))
            concept = match.group("concept").strip()

            event_day = recording_start.date() + timedelta(days=rolled_days)
            evt_dt = datetime.combine(event_day, _parse_time(clock_str))
            if evt_dt < recording_start:
                # Clock went backwards: treat it as having crossed midnight.
                rolled_days += 1
                evt_dt += timedelta(days=1)

            start_sec = (evt_dt - recording_start).total_seconds()
            records.append(
                (
                    event,
                    concept,
                    start_sec,
                    start_sec + dur,
                    dur,
                    int(start_sec // EPOCH_LEN),
                )
            )

    if recording_start is None:
        raise ValueError(f"No START RECORDING line found in {fp}")
    if not records:
        raise ValueError(f"No valid DUR events found in {fp}")

    df = pd.DataFrame(
        records,
        columns=[
            "EVENT_CLASS", "EVENT",
            "START_SEC", "END_SEC", "DURATION_SEC", "EPOCH",
        ],
    )

    event_class = df["EVENT_CLASS"]
    # For desaturations, replace whatever was captured as the label with the class name.
    df.loc[event_class == "DESATURATION", "EVENT"] = "DESATURATION"
    # Limb movements and arousals get their class name prefixed to the label.
    for prefixed_class in ("LM", "AROUSAL"):
        in_class = event_class == prefixed_class
        df.loc[in_class, "EVENT"] = prefixed_class + "-" + df.loc[in_class, "EVENT"]
    return df



'''
input:
df_evt = parse_all_score_wsc("/u/ztshuai/ondemand/wsc-visit3-36097-nsrr.allscore.txt")
print(df_evt.head())


print(df_evt["EVENT"].value_counts())
print("###################################")
print(df_evt["EVENT_CLASS"].value_counts())
output:
         EVENT_CLASS         EVENT  START_SEC  END_SEC  DURATION_SEC  EPOCH
0  RESPIRATORY EVENT      HYPOPNEA    1347.78  1362.18          14.4     44
1       DESATURATION  DESATURATION    1361.24  1383.74          22.5     45
2  RESPIRATORY EVENT      HYPOPNEA    1412.43  1435.73          23.3     47
3       DESATURATION  DESATURATION    1419.38  1454.28          34.9     47
4  RESPIRATORY EVENT      HYPOPNEA    1441.07  1463.97          22.9     48
DESATURATION         240
HYPOPNEA             221
LM-ISOLATED          129
AROUSAL-LM            47
OBSTRUCTIVE APNEA     19
Name: EVENT, dtype: int64
###################################
RESPIRATORY EVENT    240
DESATURATION         240
LM                   129
AROUSAL               47
Name: EVENT_CLASS, dtype: int64
'''
from pathlib import Path
from collections import Counter, defaultdict
import pandas as pd
from tqdm import tqdm  

# Root of the WSC polysomnography tree; allscore files are searched recursively.
root = Path("/scratch/besp/shared_data/wsc/polysomnography")
pattern = "**/wsc-visit*-*-nsrr.allscore.txt"
all_files = list(root.glob(pattern))
print(f"found {len(all_files):,} allscore file")


evt_counter        = Counter()             # EVENT label       -> occurrences
evtclass_counter   = Counter()             # EVENT_CLASS label -> occurrences
evtclass2evt       = defaultdict(Counter)  # EVENT_CLASS -> Counter of EVENT labels
failed_files       = []                    # (path, exception type, message) per skipped file

for fp in tqdm(all_files, desc="Parsing"):
    try:
        df_evt = parse_all_score_wsc(fp)
    except Exception as e:
        # Best effort: one bad file must not stop the survey, but it is
        # recorded so the totals below can be read with the losses in mind.
        failed_files.append((fp, type(e).__name__, str(e)))
        continue

    evt_counter.update(df_evt["EVENT"])
    evtclass_counter.update(df_evt["EVENT_CLASS"])
    for cls, evt in zip(df_evt["EVENT_CLASS"], df_evt["EVENT"]):
        evtclass2evt[cls][evt] += 1

# Make skipped files visible instead of silently shrinking the counts.
if failed_files:
    print(f"\nskipped {len(failed_files):,} of {len(all_files):,} files that failed to parse:")
    for err_kind, n in Counter(kind for _, kind, _ in failed_files).most_common():
        print(f"    └─ {err_kind:<35} {n:,}")


print("\n=== SUMMARY ===\n")
for cls in sorted(evtclass2evt):
    total = evtclass_counter[cls]
    print(f"{cls}  (total {total:,})")
    for evt, n in evtclass2evt[cls].most_common():
        print(f"    └─ {evt:<35} {n:,}")
    print()


# One row per EVENT label and one per EVENT_CLASS label, told apart by "Level".
rows = []
for evt, n in evt_counter.items():
    rows.append(("EVENT", evt, n))
for cls, n in evtclass_counter.items():
    rows.append(("EVENT_CLASS", cls, n))

df_out = pd.DataFrame(rows, columns=["Level", "Name", "Count"]) \
          .sort_values(["Level", "Count"], ascending=[True, False])

# NOTE(review): the .sco.txt cell of this notebook writes to this same path,
# so whichever cell runs last overwrites the other's summary.
csv_path = root / "wsc_event_summary.csv"
df_out.to_csv(csv_path, index=False)
'''

=== SUMMARY ===

AROUSAL  (total 63,549)
    └─ AROUSAL-LM                          36,262
    └─ AROUSAL-RESPIRATORY EVENT           13,537
    └─ AROUSAL-SPONTANEOUS                 13,124
    └─ AROUSAL-PLM                         541
    └─ AROUSAL-NOISE                       82
    └─ AROUSAL-SNORE                       3

DESATURATION  (total 81,786)
    └─ DESATURATION                        81,786

EKG EVENTS  (total 10)
    └─ SINUS TACHYCARDIA                   10

LM  (total 152,983)
    └─ LM-ISOLATED                         152,983

RESPIRATORY EVENT  (total 81,787)
    └─ HYPOPNEA                            73,809
    └─ OBSTRUCTIVE APNEA                   6,923
    └─ CENTRAL APNEA                       963
    └─ MIXED APNEA                         92

SNORE  (total 1)
    └─ PERIODIC                            1
'''


found 743 allscore file


Parsing: 100%|██████████| 743/743 [02:03<00:00,  6.00it/s]


=== SUMMARY ===

AROUSAL  (total 63,549)
    └─ AROUSAL-LM                          36,262
    └─ AROUSAL-RESPIRATORY EVENT           13,537
    └─ AROUSAL-SPONTANEOUS                 13,124
    └─ AROUSAL-PLM                         541
    └─ AROUSAL-NOISE                       82
    └─ AROUSAL-SNORE                       3

DESATURATION  (total 81,786)
    └─ DESATURATION                        81,786

EKG EVENTS  (total 10)
    └─ SINUS TACHYCARDIA                   10

LM  (total 152,983)
    └─ LM-ISOLATED                         152,983

RESPIRATORY EVENT  (total 81,787)
    └─ HYPOPNEA                            73,809
    └─ OBSTRUCTIVE APNEA                   6,923
    └─ CENTRAL APNEA                       963
    └─ MIXED APNEA                         92

SNORE  (total 1)
    └─ PERIODIC                            1






In [1]:
import re
from pathlib import Path
from datetime import datetime, timedelta
import pandas as pd
EPOCH_LEN   = 30
_TIME_FMTS  = ("%H:%M:%S.%f", "%H:%M:%S")

def _parse_time(tstr: str) -> datetime.time:
    """Parse a clock string into a time-of-day object.

    Every format listed in ``_TIME_FMTS`` is tried in order and the first one
    that matches wins. Leading and trailing whitespace is ignored, because
    ``strptime`` rejects padded input and clock columns may carry padding.

    Args:
        tstr: Clock text, e.g. ``"21:29:42"`` or ``"21:29:42.123"``.

    Returns:
        The value of ``datetime.strptime(...).time()`` for the matching format.

    Raises:
        ValueError: If none of the formats in ``_TIME_FMTS`` matches.
    """
    # NOTE(review): with `from datetime import datetime` in scope, the return
    # annotation above resolves to the `datetime.time` *method*, not the `time`
    # class. It is harmless at runtime and is kept so the signature is unchanged.

    # Local import: the helper keeps working however `datetime` is bound in the
    # surrounding namespace at call time.
    from datetime import datetime as _dt

    cleaned = tstr.strip()
    for fmt in _TIME_FMTS:
        try:
            return _dt.strptime(cleaned, fmt).time()
        except ValueError:
            continue
    # Report the original text so padding problems stay visible in the message.
    raise ValueError(f"Bad time string: {tstr!r}")

def _safe_float(s: str, default: float = 0.0) -> float:
    try:
        return float(s)
    except ValueError:
        return default
def parse_sco_wsc(fp) -> pd.DataFrame:
    """Parse a WSC ``.sco.txt`` marker file into a table of timed events.

    Rows are tab-separated. Column 0 is read as the epoch number, column 4 as
    the marker text, column 6 as the clock time and the last column as the
    duration in seconds. Only rows with a positive duration are kept.

    The study start is estimated from the first kept row as its clock time
    minus ``epoch * EPOCH_LEN`` seconds. Every event is then expressed in
    seconds from that start, folded into the range ``[0, 86400)``.
    NOTE(review): this presumes the epoch column counts ``EPOCH_LEN``-second
    epochs from the start of the study — confirm against the file spec.

    Args:
        fp: Path (or path-like string) of the ``.sco.txt`` file.

    Returns:
        DataFrame with columns ``EVENT``, ``START_SEC``, ``END_SEC``,
        ``duration_sec``, ``epoch`` and ``EVENT_CLASS`` (a copy of ``EVENT``).

    Raises:
        ValueError: If no row with a positive duration is found, if an epoch
            value is not an integer, or if a clock string cannot be parsed.
    """
    seconds_per_day = 86400

    def _fold_into_day(sec):
        # Same two-step correction for both bounds: lift negatives by one day,
        # then pull values of a full day or more back by one day.
        if sec < 0:
            sec += seconds_per_day
        if sec >= seconds_per_day:
            sec -= seconds_per_day
        return sec

    fp = Path(fp)
    events = []   # (epoch, marker_text, clock_str, duration_sec)

    with fp.open() as fh:
        for ln in fh:
            ln = ln.rstrip()

            # Skip blanks, the "Epoch ..." header and '*' / '#' lines.
            if not ln or ln.startswith(("Epoch", "*", "#")):
                continue

            # Split on single tabs (not "\t+" or "\s") so empty columns keep
            # their position.
            cols = ln.split("\t")
            # Index 6 (clock) is read below, so at least 7 columns are needed;
            # the previous "< 6" check let 6-column rows raise IndexError.
            if len(cols) < 7:
                continue

            if cols[0] == '':
                continue
            epoch        = int(cols[0])
            marker_text  = cols[4].strip()   # e.g. LMA
            clock_str    = cols[6].strip()   # e.g. 21:29:42 or 21:29:42.123

            # A non-numeric duration becomes 0.0 and the row is skipped,
            # rather than aborting the whole file.
            duration_sec = _safe_float(cols[-1])
            if duration_sec > 0:
                events.append((epoch, marker_text, clock_str, duration_sec))

    if not events:
        raise ValueError(f"No valid marker rows found in {fp}")

    # Estimate the study start from the first kept event (arbitrary fixed date).
    first_epoch, _, first_clock, _ = events[0]
    first_abs_dt = datetime.combine(
        datetime(2000, 1, 1), _parse_time(first_clock)
    )
    study_start_abs = first_abs_dt - timedelta(seconds=first_epoch * EPOCH_LEN)

    rows = []
    for epoch, ev, t_str, dur in events:
        # Every clock is placed on the start date; the folding below corrects
        # events that actually fall on the other side of midnight.
        abs_dt = datetime.combine(study_start_abs.date(), _parse_time(t_str))
        start_sec = (abs_dt - study_start_abs).total_seconds()
        end_sec   = start_sec + dur
        rows.append(
            (ev, _fold_into_day(start_sec), _fold_into_day(end_sec), dur, epoch)
        )

    df = pd.DataFrame(
        rows,
        # The last two columns (duration_sec, epoch) can be removed if unused.
        columns=["EVENT", "START_SEC", "END_SEC", "duration_sec", "epoch"],
    )
    # .sco files carry a single label per marker, so class and event coincide.
    df["EVENT_CLASS"] = df["EVENT"]
    return df


from pathlib import Path
from collections import Counter, defaultdict
import pandas as pd
from tqdm import tqdm  

# Root of the WSC polysomnography tree; .sco.txt files are searched recursively.
root = Path("/scratch/besp/shared_data/wsc/polysomnography")
pattern = "**/wsc-visit*-*-nsrr.sco.txt"
all_files = list(root.glob(pattern))
# The files matched here are .sco.txt files, not allscore files.
print(f"found {len(all_files):,} sco file")


evt_counter        = Counter()             # EVENT label       -> occurrences
evtclass_counter   = Counter()             # EVENT_CLASS label -> occurrences
evtclass2evt       = defaultdict(Counter)  # EVENT_CLASS -> Counter of EVENT labels
failed_files       = []                    # (path, exception type, message) per skipped file

for fp in tqdm(all_files, desc="Parsing"):
    try:
        df_evt = parse_sco_wsc(fp)
    except Exception as e:
        # Best effort: one bad file must not stop the survey, but it is
        # recorded so the totals below can be read with the losses in mind.
        failed_files.append((fp, type(e).__name__, str(e)))
        continue

    evt_counter.update(df_evt["EVENT"])
    evtclass_counter.update(df_evt["EVENT_CLASS"])
    for cls, evt in zip(df_evt["EVENT_CLASS"], df_evt["EVENT"]):
        evtclass2evt[cls][evt] += 1

# Make skipped files visible instead of silently shrinking the counts.
if failed_files:
    print(f"\nskipped {len(failed_files):,} of {len(all_files):,} files that failed to parse:")
    for err_kind, n in Counter(kind for _, kind, _ in failed_files).most_common():
        print(f"    └─ {err_kind:<35} {n:,}")


print("\n=== SUMMARY ===\n")
for cls in sorted(evtclass2evt):
    total = evtclass_counter[cls]
    print(f"{cls}  (total {total:,})")
    for evt, n in evtclass2evt[cls].most_common():
        print(f"    └─ {evt:<35} {n:,}")
    print()


# One row per EVENT label and one per EVENT_CLASS label, told apart by "Level".
rows = []
for evt, n in evt_counter.items():
    rows.append(("EVENT", evt, n))
for cls, n in evtclass_counter.items():
    rows.append(("EVENT_CLASS", cls, n))

df_out = pd.DataFrame(rows, columns=["Level", "Name", "Count"]) \
          .sort_values(["Level", "Count"], ascending=[True, False])

# NOTE(review): the allscore cell of this notebook writes to this same path,
# so whichever cell runs last overwrites the other's summary. Consider a
# distinct file name for the .sco summary.
csv_path = root / "wsc_event_summary.csv"
df_out.to_csv(csv_path, index=False)
'''

=== SUMMARY ===

AROUSAL  (total 63,549)
    └─ AROUSAL-LM                          36,262
    └─ AROUSAL-RESPIRATORY EVENT           13,537
    └─ AROUSAL-SPONTANEOUS                 13,124
    └─ AROUSAL-PLM                         541
    └─ AROUSAL-NOISE                       82
    └─ AROUSAL-SNORE                       3

DESATURATION  (total 81,786)
    └─ DESATURATION                        81,786

EKG EVENTS  (total 10)
    └─ SINUS TACHYCARDIA                   10

LM  (total 152,983)
    └─ LM-ISOLATED                         152,983

RESPIRATORY EVENT  (total 81,787)
    └─ HYPOPNEA                            73,809
    └─ OBSTRUCTIVE APNEA                   6,923
    └─ CENTRAL APNEA                       963
    └─ MIXED APNEA                         92

SNORE  (total 1)
    └─ PERIODIC                            1
'''


found 1,827 allscore file


Parsing: 100%|██████████| 1827/1827 [11:22<00:00,  2.68it/s]



=== SUMMARY ===

Apnea  (total 218)
    └─ Apnea                               218

CA  (total 11)
    └─ CA                                  11

Central Apnea  (total 5,485)
    └─ Central Apnea                       5,485

Central Hypopnea  (total 46)
    └─ Central Hypopnea                    46

Hypopnea  (total 185,015)
    └─ Hypopnea                            185,015

MA  (total 1)
    └─ MA                                  1

Mixed Apnea  (total 1,036)
    └─ Mixed Apnea                         1,036

OA  (total 118)
    └─ OA                                  118

OBS Apnea  (total 5,105)
    └─ OBS Apnea                           5,105

Obs Apnea  (total 27,126)
    └─ Obs Apnea                           27,126

Obst Apnea  (total 665)
    └─ Obst Apnea                          665

Obst. Apnea  (total 1,617)
    └─ Obst. Apnea                         1,617

Obst. Hypopnea  (total 1)
    └─ Obst. Hypopnea                      1

SaO2  (total 226,446)
    └─ SaO2             

'\n\n=== SUMMARY ===\n\nAROUSAL  (total 63,549)\n    └─ AROUSAL-LM                          36,262\n    └─ AROUSAL-RESPIRATORY EVENT           13,537\n    └─ AROUSAL-SPONTANEOUS                 13,124\n    └─ AROUSAL-PLM                         541\n    └─ AROUSAL-NOISE                       82\n    └─ AROUSAL-SNORE                       3\n\nDESATURATION  (total 81,786)\n    └─ DESATURATION                        81,786\n\nEKG EVENTS  (total 10)\n    └─ SINUS TACHYCARDIA                   10\n\nLM  (total 152,983)\n    └─ LM-ISOLATED                         152,983\n\nRESPIRATORY EVENT  (total 81,787)\n    └─ HYPOPNEA                            73,809\n    └─ OBSTRUCTIVE APNEA                   6,923\n    └─ CENTRAL APNEA                       963\n    └─ MIXED APNEA                         92\n\nSNORE  (total 1)\n    └─ PERIODIC                            1\n'

In [5]:
from pathlib import Path
import xml.etree.ElementTree as ET
from collections import defaultdict, Counter
from itertools import chain

# SHHS NSRR event-annotation directories, one per visit.
visit1 = Path("/scratch/besp/shared_data/shhs/polysomnography/annotations-events-nsrr/shhs1")
visit2 = Path("/scratch/besp/shared_data/shhs/polysomnography/annotations-events-nsrr/shhs2")

# EventType -> Counter mapping each EventConcept to its number of occurrences.
etype2concepts = defaultdict(Counter)

# Walk the annotation files of both visits as one stream.
all_xml_files = chain(
    visit1.glob("shhs1-*-nsrr.xml"),
    visit2.glob("shhs2-*-nsrr.xml"),
)

for file_no, xml_path in enumerate(all_xml_files, start=1):
    # Numbering starts at 1, so every 100th file emits a progress line.
    if file_no % 100 == 0:
        print(f"parsed {file_no:,} files")

    for scored_event in ET.parse(xml_path).iterfind(".//ScoredEvent"):
        etype = (scored_event.findtext("EventType") or "").strip()
        concept = (scored_event.findtext("EventConcept") or "").strip()

        # Events lacking either field are left out of the tally.
        if not etype or not concept:
            continue
        etype2concepts[etype][concept] += 1


print("\n=== SUMMARY ===\n")
for etype in sorted(etype2concepts):
    concept_counts = etype2concepts[etype]
    total = sum(concept_counts.values())
    print(f"{etype}  (total {total:,})")

    # Most frequent concept first.
    for concept, n in concept_counts.most_common():
        print(f"    └─ {concept:<40} {n:,}")
    print()

import pandas as pd

# Flatten the nested counters into (type, concept, count) records.
rows = [
    (etype, concept, n)
    for etype, counter in etype2concepts.items()
    for concept, n in counter.items()
]

# Event types alphabetically; within a type, the largest count first.
df = pd.DataFrame(rows, columns=["Eventtype", "Eventconcept", "number-of-sample"])
df = df.sort_values(["Eventtype", "number-of-sample"], ascending=[True, False])



parsed 100 files
parsed 200 files
parsed 300 files
parsed 400 files
parsed 500 files
parsed 600 files
parsed 700 files
parsed 800 files
parsed 900 files
parsed 1,000 files
parsed 1,100 files
parsed 1,200 files
parsed 1,300 files
parsed 1,400 files
parsed 1,500 files
parsed 1,600 files
parsed 1,700 files
parsed 1,800 files
parsed 1,900 files
parsed 2,000 files
parsed 2,100 files
parsed 2,200 files
parsed 2,300 files
parsed 2,400 files
parsed 2,500 files
parsed 2,600 files
parsed 2,700 files
parsed 2,800 files
parsed 2,900 files
parsed 3,000 files
parsed 3,100 files
parsed 3,200 files
parsed 3,300 files
parsed 3,400 files
parsed 3,500 files
parsed 3,600 files
parsed 3,700 files
parsed 3,800 files
parsed 3,900 files
parsed 4,000 files
parsed 4,100 files
parsed 4,200 files
parsed 4,300 files
parsed 4,400 files
parsed 4,500 files
parsed 4,600 files
parsed 4,700 files
parsed 4,800 files
parsed 4,900 files
parsed 5,000 files
parsed 5,100 files
parsed 5,200 files
parsed 5,300 files
parsed 5,40

In [6]:
from pathlib import Path
import xml.etree.ElementTree as ET
from collections import defaultdict, Counter
from itertools import chain

# CHAT NSRR event-annotation directories: baseline and follow-up.
visit1 = Path("/scratch/besp/shared_data/chat/polysomnography/annotations-events-nsrr/baseline")
visit2 = Path("/scratch/besp/shared_data/chat/polysomnography/annotations-events-nsrr/followup")

# EventType -> Counter mapping each EventConcept to its number of occurrences.
etype2concepts = defaultdict(Counter)

# Walk the annotation files of both visits as one stream.
all_xml_files = chain(
    visit1.glob("chat-baseline-*-nsrr.xml"),
    visit2.glob("chat-followup-*-nsrr.xml"),
)

for file_no, xml_path in enumerate(all_xml_files, start=1):
    # Numbering starts at 1, so every 100th file emits a progress line.
    if file_no % 100 == 0:
        print(f"parsed {file_no:,} files")

    for scored_event in ET.parse(xml_path).iterfind(".//ScoredEvent"):
        etype = (scored_event.findtext("EventType") or "").strip()
        concept = (scored_event.findtext("EventConcept") or "").strip()

        # Events lacking either field are left out of the tally.
        if not etype or not concept:
            continue
        etype2concepts[etype][concept] += 1


print("\n=== SUMMARY ===\n")
for etype in sorted(etype2concepts):
    concept_counts = etype2concepts[etype]
    total = sum(concept_counts.values())
    print(f"{etype}  (total {total:,})")

    # Most frequent concept first.
    for concept, n in concept_counts.most_common():
        print(f"    └─ {concept:<40} {n:,}")
    print()

import pandas as pd

# Flatten the nested counters into (type, concept, count) records.
rows = [
    (etype, concept, n)
    for etype, counter in etype2concepts.items()
    for concept, n in counter.items()
]

# Event types alphabetically; within a type, the largest count first.
df = pd.DataFrame(rows, columns=["Eventtype", "Eventconcept", "number-of-sample"])
df = df.sort_values(["Eventtype", "number-of-sample"], ascending=[True, False])



parsed 100 files
parsed 200 files
parsed 300 files
parsed 400 files
parsed 500 files
parsed 600 files
parsed 700 files
parsed 800 files

=== SUMMARY ===

Arousals|Arousals  (total 65,967)
    └─ Arousal|Arousal ()                       65,967

Artifacts|Artifacts  (total 5,269)
    └─ EtCO2 artifact|EtCO2 artifact            5,269

Limb Movement|Limb Movement  (total 74,665)
    └─ Limb movement - right|Limb Movement (Right) 27,435
    └─ Limb movement - left|Limb Movement (Left) 27,188
    └─ Periodic leg movement - right|PLM (Right) 10,136
    └─ Periodic leg movement - left|PLM (Left)  9,906

Respiratory|Respiratory  (total 167,184)
    └─ SpO2 desaturation|SpO2 desaturation      105,905
    └─ Hypopnea|Hypopnea                        23,910
    └─ SpO2 artifact|SpO2 artifact              17,755
    └─ Obstructive apnea|Obstructive Apnea      10,415
    └─ Central apnea|Central Apnea              6,314
    └─ Periodic breathing|Periodic Breathing    2,178
    └─ Unsure|Unsure       

In [7]:
from pathlib import Path
import xml.etree.ElementTree as ET
from collections import defaultdict, Counter
from itertools import chain

# CCSHS NSRR event-annotation directory.
visit1 = Path("/scratch/besp/shared_data/ccshs/polysomnography/annotations-events-nsrr")

# EventType -> Counter mapping each EventConcept to its number of occurrences.
etype2concepts = defaultdict(Counter)

all_xml_files = chain(
    visit1.glob("ccshs-trec-*-nsrr.xml"),
)

for file_no, xml_path in enumerate(all_xml_files, start=1):
    # Numbering starts at 1, so every 100th file emits a progress line.
    if file_no % 100 == 0:
        print(f"parsed {file_no:,} files")

    for scored_event in ET.parse(xml_path).iterfind(".//ScoredEvent"):
        etype = (scored_event.findtext("EventType") or "").strip()
        concept = (scored_event.findtext("EventConcept") or "").strip()

        # Events lacking either field are left out of the tally.
        if not etype or not concept:
            continue
        etype2concepts[etype][concept] += 1


print("\n=== SUMMARY ===\n")
for etype in sorted(etype2concepts):
    concept_counts = etype2concepts[etype]
    total = sum(concept_counts.values())
    print(f"{etype}  (total {total:,})")

    # Most frequent concept first.
    for concept, n in concept_counts.most_common():
        print(f"    └─ {concept:<40} {n:,}")
    print()

import pandas as pd

# Flatten the nested counters into (type, concept, count) records.
rows = [
    (etype, concept, n)
    for etype, counter in etype2concepts.items()
    for concept, n in counter.items()
]

# Event types alphabetically; within a type, the largest count first.
df = pd.DataFrame(rows, columns=["Eventtype", "Eventconcept", "number-of-sample"])
df = df.sort_values(["Eventtype", "number-of-sample"], ascending=[True, False])



parsed 100 files
parsed 200 files
parsed 300 files
parsed 400 files
parsed 500 files

=== SUMMARY ===

Arousals|Arousals  (total 43,319)
    └─ ASDA arousal|Arousal (ASDA)              43,319

Limb Movement|Limb Movement  (total 22,308)
    └─ Limb movement - right|Limb Movement (Right) 9,161
    └─ Limb movement - left|Limb Movement (Left) 8,479
    └─ Periodic leg movement - right|PLM (Right) 2,527
    └─ Periodic leg movement - left|PLM (Left)  2,141

Respiratory|Respiratory  (total 84,245)
    └─ SpO2 artifact|SpO2 artifact              36,875
    └─ SpO2 desaturation|SpO2 desaturation      32,330
    └─ Hypopnea|Hypopnea                        13,359
    └─ Central apnea|Central Apnea              1,024
    └─ Obstructive apnea|Obstructive Apnea      657

Stages|Stages  (total 52,200)
    └─ Stage 2 sleep|2                          18,000
    └─ Wake|0                                   13,167
    └─ Stage 3 sleep|3                          8,530
    └─ Stage 1 sleep|1             

In [2]:
from pathlib import Path
import xml.etree.ElementTree as ET
from collections import defaultdict, Counter
from itertools import chain

# CFS NSRR event-annotation directory.
visit1 = Path("/scratch/besp/shared_data/cfs/polysomnography/annotations-events-nsrr")

# EventType -> Counter mapping each EventConcept to its number of occurrences.
etype2concepts = defaultdict(Counter)

all_xml_files = chain(
    visit1.glob("cfs-visit5-*-nsrr.xml"),
)

for file_no, xml_path in enumerate(all_xml_files, start=1):
    # Numbering starts at 1, so every 100th file emits a progress line.
    if file_no % 100 == 0:
        print(f"parsed {file_no:,} files")

    for scored_event in ET.parse(xml_path).iterfind(".//ScoredEvent"):
        etype = (scored_event.findtext("EventType") or "").strip()
        concept = (scored_event.findtext("EventConcept") or "").strip()

        # Events lacking either field are left out of the tally.
        if not etype or not concept:
            continue
        etype2concepts[etype][concept] += 1


print("\n=== SUMMARY ===\n")
for etype in sorted(etype2concepts):
    concept_counts = etype2concepts[etype]
    total = sum(concept_counts.values())
    print(f"{etype}  (total {total:,})")

    # Most frequent concept first.
    for concept, n in concept_counts.most_common():
        print(f"    └─ {concept:<40} {n:,}")
    print()

import pandas as pd

# Flatten the nested counters into (type, concept, count) records.
rows = [
    (etype, concept, n)
    for etype, counter in etype2concepts.items()
    for concept, n in counter.items()
]

# Event types alphabetically; within a type, the largest count first.
df = pd.DataFrame(rows, columns=["Eventtype", "Eventconcept", "number-of-sample"])
df = df.sort_values(["Eventtype", "number-of-sample"], ascending=[True, False])



parsed 100 files
parsed 200 files
parsed 300 files
parsed 400 files
parsed 500 files
parsed 600 files
parsed 700 files

=== SUMMARY ===

Arousals|Arousals  (total 79,383)
    └─ ASDA arousal|Arousal (ASDA)              56,656
    └─ Spontaneous arousal|Arousal (SPON ARO)   17,287
    └─ Arousal resulting from respiratory effort|Arousal (RESP ARO) 3,562
    └─ Arousal|Arousal ()                       539
    └─ Spontaneous arousal|Arousal (spon aro)   490
    └─ Spontaneous arousal|Arousal (ARO SPONT)  310
    └─ Arousal|Arousal (Arousal)                298
    └─ Spontaneous arousal|Arousal (apon aro)   71
    └─ ASDA arousal|Arousal (ADSA)              68
    └─ Arousal resulting from respiratory effort|Arousal (ARO RES) 65
    └─ Arousal resulting from periodic leg movement|Arousal (PLM) 34
    └─ Arousal resulting from periodic leg movement|Arousal (PLM ARO) 3

Limb Movement|Limb Movement  (total 134,966)
    └─ Limb movement - left|Limb Movement (Left) 58,051
    └─ Limb movement -

In [3]:
from pathlib import Path
import xml.etree.ElementTree as ET
from collections import defaultdict, Counter
from itertools import chain

# SOF NSRR event-annotation directory.
visit1 = Path("/scratch/besp/shared_data/sof/polysomnography/annotations-events-nsrr")

# EventType -> Counter mapping each EventConcept to its number of occurrences.
etype2concepts = defaultdict(Counter)

all_xml_files = chain(
    visit1.glob("sof-visit-8-*-nsrr.xml"),
)

for file_no, xml_path in enumerate(all_xml_files, start=1):
    # Numbering starts at 1, so every 100th file emits a progress line.
    if file_no % 100 == 0:
        print(f"parsed {file_no:,} files")

    for scored_event in ET.parse(xml_path).iterfind(".//ScoredEvent"):
        etype = (scored_event.findtext("EventType") or "").strip()
        concept = (scored_event.findtext("EventConcept") or "").strip()

        # Events lacking either field are left out of the tally.
        if not etype or not concept:
            continue
        etype2concepts[etype][concept] += 1


print("\n=== SUMMARY ===\n")
for etype in sorted(etype2concepts):
    concept_counts = etype2concepts[etype]
    total = sum(concept_counts.values())
    print(f"{etype}  (total {total:,})")

    # Most frequent concept first.
    for concept, n in concept_counts.most_common():
        print(f"    └─ {concept:<40} {n:,}")
    print()

import pandas as pd

# Flatten the nested counters into (type, concept, count) records.
rows = [
    (etype, concept, n)
    for etype, counter in etype2concepts.items()
    for concept, n in counter.items()
]

# Event types alphabetically; within a type, the largest count first.
df = pd.DataFrame(rows, columns=["Eventtype", "Eventconcept", "number-of-sample"])
df = df.sort_values(["Eventtype", "number-of-sample"], ascending=[True, False])



parsed 100 files
parsed 200 files
parsed 300 files
parsed 400 files

=== SUMMARY ===

Arousals|Arousals  (total 59,998)
    └─ ASDA arousal|Arousal (ASDA)              58,100
    └─ Arousal|Arousal ()                       1,898

Limb Movement|Limb Movement  (total 134,824)
    └─ Limb movement - right|Limb Movement (Right) 69,280
    └─ Limb movement - left|Limb Movement (Left) 64,941
    └─ Periodic leg movement - right|PLM (Right) 353
    └─ Periodic leg movement - left|PLM (Left)  250

Respiratory|Respiratory  (total 89,338)
    └─ Hypopnea|Hypopnea                        61,557
    └─ SpO2 artifact|SpO2 artifact              17,201
    └─ Obstructive apnea|Obstructive Apnea      9,281
    └─ Central apnea|Central Apnea              1,078
    └─ SpO2 desaturation|SpO2 desaturation      217
    └─ Mixed apnea|Mixed Apnea                  2
    └─ Respiratory artifact|Respiratory artifact 2

Stages|Stages  (total 67,254)
    └─ Stage 2 sleep|2                          24,017
    └─ S