In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [2]:
df = pd.read_csv('d:/ecg/data/raw/ptbxl/ptbxl_database.csv')
df['scp_codes'] = df['scp_codes'].apply(eval)

In [3]:
scp_statements = pd.read_csv('d:/ecg/data/raw/ptbxl/scp_statements.csv', index_col=0)
code_group_map = {}
code_desc_map = {}
for code in scp_statements.index:
    code_group_map[code] = scp_statements.loc[code, 'diagnostic_class']
    code_desc_map[code] = scp_statements.loc[code, 'description']

In [4]:
group_codes = defaultdict(list)
code_counts = defaultdict(int)
for codes in df['scp_codes']:
    for code in codes.keys():
        group = code_group_map.get(code, 'Bilinmiyor')
        group_codes[group].append(code)
        code_counts[code] += 1

In [5]:
group_summary = {}
for group, codes in group_codes.items():
    unique_codes = set(codes)
    total_samples = sum(code_counts[code] for code in unique_codes)
    group_summary[group] = {
        'alt_dallar': list(unique_codes),
        'toplam_ornek': total_samples
    }
summary_df = pd.DataFrame([
    {'grup': g, 'alt_dal_sayisi': len(v['alt_dallar']), 'toplam_ornek': v['toplam_ornek']}
    for g, v in group_summary.items()
])
summary_df = summary_df.sort_values(by='toplam_ornek', ascending=False)
summary_df

Unnamed: 0,grup,alt_dal_sayisi,toplam_ornek
1,,27,30326
0,NORM,1,9528
2,MI,14,6886
3,STTC,13,5788
5,CD,11,5772
4,HYP,5,2819


In [6]:
for group, v in group_summary.items():
    print(f'Grup: {group}')
    for code in v['alt_dallar']:
        desc = code_desc_map.get(code, '')
        print(f'  - {code}: {desc} ({code_counts[code]} örnek)')
    print('-'*40)

Grup: NORM
  - NORM: normal ECG (9528 örnek)
----------------------------------------
Grup: nan
  - NT_: non-specific T-wave changes (424 örnek)
  - LVOLT: low QRS voltages in the frontal and horizontal leads (182 örnek)
  - PRC(S): premature complex(es) (10 örnek)
  - LPR: prolonged PR interval (340 örnek)
  - PAC: atrial premature complex (398 örnek)
  - STD_: non-specific ST depression (1009 örnek)
  - HVOLT: high QRS voltage (62 örnek)
  - TAB_: T-wave abnormality (35 örnek)
  - AFIB: atrial fibrillation (1514 örnek)
  - BIGU: bigeminal pattern (unknown origin, SV or Ventricular) (82 örnek)
  - SVARR: supraventricular arrhythmia (157 örnek)
  - SBRAD: sinus bradycardia (637 örnek)
  - PACE: normal functioning artificial pacemaker (296 örnek)
  - AFLT: atrial flutter (73 örnek)
  - VCLVH: voltage criteria (QRS) for left ventricular hypertrophy (875 örnek)
  - INVT: inverted T-waves (294 örnek)
  - LOWT: low amplitude T-waves (438 örnek)
  - STACH: sinus tachycardia (826 örnek)
  - P

In [7]:
idx = 0  # ilk kayıt
print('Kayıt ID:', df.loc[idx, 'ecg_id'])
print('Tanı kodları:')
for code in df.loc[idx, 'scp_codes'].keys():
    group = code_group_map.get(code, 'Bilinmiyor')
    desc = code_desc_map.get(code, '')
    print(f'  - {code} (Grup: {group}): {desc}')

Kayıt ID: 1
Tanı kodları:
  - NORM (Grup: NORM): normal ECG
  - LVOLT (Grup: nan): low QRS voltages in the frontal and horizontal leads
  - SR (Grup: nan): sinus rhythm
