In [3]:
from pathlib import Path
import xml.etree.ElementTree as ET
from collections import defaultdict, Counter
from itertools import chain

visit1 = Path("/scratch/besp/shared_data/mros/polysomnography/annotations-events-nsrr/visit1")
visit2 = Path("/scratch/besp/shared_data/mros/polysomnography/annotations-events-nsrr/visit2")


etype2concepts = defaultdict(Counter)


all_xml_files = chain(
    visit1.glob("mros-visit1-*-nsrr.xml"),
    visit2.glob("mros-visit2-*-nsrr.xml"),
)

for idx, xml_path in enumerate(all_xml_files, 1):
    if idx % 100 == 0 and idx > 0:       
        print(f"parsed {idx:,} files")
        

    tree = ET.parse(xml_path)

    for se in tree.iterfind(".//ScoredEvent"):
        etype   = (se.findtext("EventType")    or "").strip()
        concept = (se.findtext("EventConcept") or "").strip()

        if etype and concept:
            etype2concepts[etype][concept] += 1


print("\n=== SUMMARY ===\n")
for etype in sorted(etype2concepts):
    total = sum(etype2concepts[etype].values())
    print(f"{etype}  (total {total:,})")


    for concept, n in etype2concepts[etype].most_common():
        print(f"    └─ {concept:<40} {n:,}")
    print()

import pandas as pd
rows = []
for etype, counter in etype2concepts.items():
    for concept, n in counter.items():
        rows.append((etype, concept, n))


df = (pd.DataFrame(rows, columns=["Eventtype", "Eventconcept", "number-of-sample"])
        .sort_values(["Eventtype", "number-of-sample"], ascending=[True, False]))


csv_path = "mros_visit1_event_summary.csv"
df.to_csv(csv_path, index=False)

parsed 100 files
parsed 200 files
parsed 300 files
parsed 400 files
parsed 500 files
parsed 600 files
parsed 700 files
parsed 800 files
parsed 900 files
parsed 1,000 files
parsed 1,100 files
parsed 1,200 files
parsed 1,300 files
parsed 1,400 files
parsed 1,500 files
parsed 1,600 files
parsed 1,700 files
parsed 1,800 files
parsed 1,900 files
parsed 2,000 files
parsed 2,100 files
parsed 2,200 files
parsed 2,300 files
parsed 2,400 files
parsed 2,500 files
parsed 2,600 files
parsed 2,700 files
parsed 2,800 files
parsed 2,900 files
parsed 3,000 files
parsed 3,100 files
parsed 3,200 files
parsed 3,300 files
parsed 3,400 files
parsed 3,500 files
parsed 3,600 files
parsed 3,700 files
parsed 3,800 files
parsed 3,900 files

=== SUMMARY ===

Arousals|Arousals  (total 628,637)
    └─ ASDA arousal|Arousal (ASDA)              623,627
    └─ Arousal|Arousal ()                       3,754
    └─ Arousal|Arousal (ARO Limb)               1,095
    └─ Arousal|Arousal (Asda)                   133
    └─ A

In [4]:
from pathlib import Path
import xml.etree.ElementTree as ET
from collections import defaultdict, Counter
from itertools import chain

visit1 = Path("/scratch/besp/shared_data/mesa/polysomnography/annotations-events-nsrr")



etype2concepts = defaultdict(Counter)


all_xml_files = chain(
    visit1.glob("mesa-sleep-*-nsrr.xml"),

)

for idx, xml_path in enumerate(all_xml_files, 1):
    if idx % 100 == 0 and idx > 0:       
        print(f"parsed {idx:,} files")
        

    tree = ET.parse(xml_path)

    for se in tree.iterfind(".//ScoredEvent"):
        etype   = (se.findtext("EventType")    or "").strip()
        concept = (se.findtext("EventConcept") or "").strip()

        if etype and concept:
            etype2concepts[etype][concept] += 1


print("\n=== SUMMARY ===\n")
for etype in sorted(etype2concepts):
    total = sum(etype2concepts[etype].values())
    print(f"{etype}  (total {total:,})")


    for concept, n in etype2concepts[etype].most_common():
        print(f"    └─ {concept:<40} {n:,}")
    print()

import pandas as pd
rows = []
for etype, counter in etype2concepts.items():
    for concept, n in counter.items():
        rows.append((etype, concept, n))


df = (pd.DataFrame(rows, columns=["Eventtype", "Eventconcept", "number-of-sample"])
        .sort_values(["Eventtype", "number-of-sample"], ascending=[True, False]))


csv_path = "mesa_visit1_event_summary.csv"
df.to_csv(csv_path, index=False)

parsed 100 files
parsed 200 files
parsed 300 files
parsed 400 files
parsed 500 files
parsed 600 files
parsed 700 files
parsed 800 files
parsed 900 files
parsed 1,000 files
parsed 1,100 files
parsed 1,200 files
parsed 1,300 files
parsed 1,400 files
parsed 1,500 files
parsed 1,600 files
parsed 1,700 files
parsed 1,800 files
parsed 1,900 files
parsed 2,000 files

=== SUMMARY ===

Arousals|Arousals  (total 323,439)
    └─ Arousal|Arousal ()                       257,992
    └─ ASDA arousal|Arousal (ASDA)              61,976
    └─ Spontaneous arousal|Arousal (ARO SPONT)  3,326
    └─ Arousal resulting from respiratory effort|Arousal (ARO RES) 145

Electrocardiogram|Electrocardiogram  (total 1)
    └─ Narrow complex tachycardia|Narrow Complex Tachycardia 1

Limb Movement|Limb Movement  (total 230,482)
    └─ Periodic leg movement - left|PLM (Left)  170,761
    └─ Limb movement - left|Limb Movement (Left) 59,721

Respiratory|Respiratory  (total 1,155,505)
    └─ SpO2 desaturation|SpO2 desatu

In [2]:
from pathlib import Path
from collections import defaultdict, Counter
from itertools import chain
import re
import pandas as pd


visit1 = Path("/scratch/besp/shared_data/wsc/polysomnography")


all_sco_files = chain(
    visit1.glob("wsc-visit1-*-nsrr.sco.txt"),
    
)

# {event_type: Counter({concept: count, ...}), ...}
etype2concepts = defaultdict(Counter)


def parse_sco_file(fp: Path, counter_dict):

    with fp.open() as fh:
        for ln in fh:
            ln = ln.rstrip()
            
            if (not ln) or ln.startswith(("Epoch", "*", "#")):
                continue

            
            cols = re.split(r"\s+", ln, maxsplit=8)
            
            if len(cols) < 5:
                continue

            event_type  = cols[3] 
            event_concept = cols[4]  

            counter_dict[event_type][event_concept] += 1


for idx, sco_path in enumerate(all_sco_files, 1):
    parse_sco_file(sco_path, etype2concepts)
    if idx % 100 == 0:
        print(f"parsed {idx:,} files")


print("\n=== SUMMARY ===\n")
for etype in sorted(etype2concepts):
    total = sum(etype2concepts[etype].values())
    print(f"{etype}  (total {total:,})")
    for concept, n in etype2concepts[etype].most_common():
        print(f"    └─ {concept:<30} {n:,}")
    print()


rows = [
    (etype, concept, n)
    for etype, counter in etype2concepts.items()
    for concept, n in counter.items()
]

df = (
    pd.DataFrame(rows, columns=["Eventtype", "Eventconcept", "number-of-sample"])
      .sort_values(["Eventtype", "number-of-sample"], ascending=[True, False])
)

out_csv = "wsc_visit1_visit2_event_summary.csv"
df.to_csv(out_csv, index=False)
print(f"CSV saved to {out_csv}")


parsed 100 files
parsed 200 files
parsed 300 files
parsed 400 files
parsed 500 files
parsed 600 files
parsed 700 files
parsed 800 files
parsed 900 files
parsed 1,000 files

=== SUMMARY ===

100  (total 17,158)
    └─ Obs                            11,952
    └─ OBS                            3,564
    └─ Obst.                          859
    └─ Obst                           665
    └─ OA                             118

101  (total 575)
    └─ Mixed                          574
    └─ MA                             1

102  (total 3,145)
    └─ Central                        2,998
    └─ Apnea                          136
    └─ CA                             11

200  (total 3,451)
    └─ Hypopnea                       3,450
    └─ Obst.                          1

201  (total 5)
    └─ Hypopnea                       5

202  (total 97,438)
    └─ Hypopnea                       97,438

301  (total 121,772)
    └─ SaO2                           121,772

400  (total 90)
    └─ RespA     

In [3]:
import re, pandas as pd
from pathlib import Path

path = Path("/u/ztshuai/ondemand/wsc-visit1-81449-nsrr.sco.txt")     

# ↓ 正则：拿到 9 个字段
PAT = re.compile(
    r"""^\s*
        (\d+)            # 1  Epoch
        \s+(\d+)         # 2  Scan
        \s+(\d+)?        # 3  Length(Scanx2) 
        \s+(\d+)         # 4  Marker Code
        \s+(.+?)         # 5  Marker Text  
        \s+(\d+)         # 6  Channel #
        \s+([\d:]+)      # 7  Clock Time
        (?:\s+([\d.]+))? # 8  Value       
        (?:\s+([\d.]+))? # 9  Length(sec)   
        \s*$""",
    re.X
)

rows = []
with path.open() as fh:
    for ln in fh:
        ln = ln.rstrip()
        if (not ln) or ln.startswith(("Epoch", "*", "#")):
            continue                 

        m = PAT.match(ln)
        if not m:
            continue                 

        epoch, scan, len_scan2, code, text, ch, clock, val, len_sec = m.groups()

        rows.append({
            "Epoch":       int(epoch),
            "Scan":        int(scan),
            "LenScan2":    int(len_scan2) if len_scan2 else None,
            "MarkerCode":  code,              
            "MarkerText":  text.strip(),      
            "Channel":     int(ch),
            "Clock":       clock,             # 22:08:50
            "Value":       float(val) if val else None,
            "LenSec":      float(len_sec) if len_sec else None
        })

df = pd.DataFrame(rows)


In [4]:
print(df)

     Epoch     Scan  LenScan2 MarkerCode MarkerText  Channel     Clock  Value  \
0       61   362028       NaN        405        LMA        4  21:29:42    0.0   
1       64   381772       NaN        405        LMA        4  21:31:21    0.0   
2       65   388628       NaN        405        LMA        4  21:31:55    0.0   
3       69   410986       NaN        405        LMA        4  21:33:47    0.0   
4       71   420564       NaN        405        LMA        4  21:34:35    0.0   
..     ...      ...       ...        ...        ...      ...       ...    ...   
223   1060  6356596    4660.0        202   Hypopnea       15   5:49:15   95.0   
224   1060  6357608       NaN        406         LM        5   5:49:20    0.0   
225   1065  6384400       NaN        406         LM        5   5:51:34    0.0   
226   1068  6407406    8681.0        202   Hypopnea       15   5:53:29   95.9   
227   1070  6419766   12576.0        202   Hypopnea       15   5:54:31   95.9   

     LenSec  
0       NaN  

In [1]:
from pathlib import Path
from collections import Counter, defaultdict
import pandas as pd

root_dir = Path("/scratch/besp/shared_data/nchsdb/sleep_data")

def parse_annot(path: Path):
    
    rows = []
    with path.open() as f:
        for ln in f:
            ln = ln.strip()
            if not ln or ln.startswith("#"):
                continue
            *label_parts, start, dur = ln.split()
            rows.append((
                " ".join(label_parts),
                float(start),
                float(dur.lstrip("+"))
            ))
    df = pd.DataFrame(rows, columns=["class", "start", "duration"])
    df["stop"] = df["start"] + df["duration"]
    return df

# ------------------------------------------------------------
overall_counts      = Counter()                 
overall_durations   = Counter()                 
per_file_data       = defaultdict(Counter)      
files_processed     = 0

for annot_file in root_dir.glob("*.annot"):
    df = parse_annot(annot_file)

    # 聚合到总计
    overall_counts.update(df["class"])
    overall_durations.update(df.groupby("class")["duration"].sum().to_dict())

    # 保存单文件计数
    per_file_data[annot_file.name].update(df["class"])

    files_processed += 1
    if files_processed % 100 == 0:
        print(f"Processed {files_processed} files")


print("\n=== summary ===")
summary = (
    pd.DataFrame({
        "count": overall_counts,
        "total_sec": overall_durations
    })
    .assign(rate_per_hour=lambda d: d["count"] / (d["total_sec"] / 3600).replace(0, pd.NA))
    .sort_values("count", ascending=False)
)
print(summary)          
# summary.to_csv("nchsdb_event_summary.tsv", sep="\t")

print("\n=== summry count ===")
per_file_df = (
    pd.DataFrame(per_file_data)
    .T.fillna(0).astype(int)     
)
print(per_file_df.head())
# per_file_df.to_csv("nchsdb_event_counts_per_file.tsv", sep="\t")


Processed 100 files
Processed 200 files
Processed 300 files
Processed 400 files
Processed 500 files
Processed 600 files
Processed 700 files
Processed 800 files
Processed 900 files
Processed 1000 files
Processed 1100 files
Processed 1200 files
Processed 1300 files
Processed 1400 files
Processed 1500 files
Processed 1600 files
Processed 1700 files
Processed 1800 files
Processed 1900 files
Processed 2000 files
Processed 2100 files
Processed 2200 files
Processed 2300 files
Processed 2400 files
Processed 2500 files
Processed 2600 files
Processed 2700 files
Processed 2800 files
Processed 2900 files
Processed 3000 files
Processed 3100 files
Processed 3200 files
Processed 3300 files
Processed 3400 files
Processed 3500 files
Processed 3600 files
Processed 3700 files
Processed 3800 files
Processed 3900 files

=== summary ===
                                    count     total_sec rate_per_hour
Sleep stage N2                    1383765  4.151294e+07    120.000023
Sleep stage N3                   

In [2]:
print(summary.index)

Index(['Sleep stage N2', 'Sleep stage N3', 'Sleep stage W', 'Sleep stage R',
       'Sleep stage ?', 'Oxygen Desaturation', 'Oximeter Event', 'EEG arousal',
       'Sleep stage N1', 'Obstructive Hypopnea',
       ...
       'chwing motion', 'hob flat, pateint on room air, patient on one pillow',
       'Dad talking to pt', 'Dad singing to calm pt',
       'rubbing his nose and crying. Possible night terror',
       'crying mom XXX him', 'crying, and looking for mom', 'pt XXX juice',
       'pt on room air', 'mom is letting pt pull off wires'],
      dtype='object', length=34980)


In [6]:
print(summary.head(30))

                                    count     total_sec rate_per_hour
Sleep stage N2                    1383765  4.151294e+07    120.000023
Sleep stage N3                     875486  2.626456e+07    120.000076
Sleep stage W                      665676  1.996999e+07    120.001721
Sleep stage R                      611320  1.833960e+07    120.000000
Sleep stage ?                      347294  1.037437e+07    120.514184
Oxygen Desaturation                215280  5.868963e+06    132.051942
Oximeter Event                     161644  0.000000e+00          <NA>
EEG arousal                        146052  9.531373e+05    551.638453
Sleep stage N1                     128410  3.852300e+06    120.000000
Obstructive Hypopnea                42179  5.064196e+05    299.839124
Limb Movement                       36856  1.120244e+05   1184.399187
Gain/Filter Change                  27857  0.000000e+00          <NA>
move                                26380  0.000000e+00          <NA>
Body Position_ Supin