In [1]:
'''
GetECGSeverityInfo_V01
Version Discription: Use the previously organized table to get severity info from svg files
Last Update: 20240813
'''

'\nGetECGSeverityInfo_V01\nVersion Discription: Use the previously organized table to get severity info from svg files\nLast Update: 20240813\n'

In [2]:
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
import os
from tqdm import tqdm



In [3]:
def extract_info_from_svg(svg_file_path):
    """
    Extracts patientid and date from a given SVG file.
    Includes error handling to ensure the script continues even if parsing fails.

    Parameters:
    - svg_file_path: Path to the SVG file.

    Returns:
    A tuple containing (patientid, date), with both values as None if parsing fails.
    """
    try:
        # Parse the SVG file
        tree = ET.parse(svg_file_path)
        root = tree.getroot()

        # Define SVG namespace if needed
        namespaces = {'svg': 'http://www.w3.org/2000/svg'}

        # Function to find text content by element ID
        def find_text_by_id(root, element_id):
            for element in root.findall(".//*[@id='" + element_id + "']", namespaces=namespaces):
                return element.text
            return None

        # Extract 'patientid' and 'date'
        patientid = find_text_by_id(root, 'patientid')
        date = find_text_by_id(root, 'date')
        severity = find_text_by_id(root, 'severity')

        return patientid, date, severity

    except Exception as e:
        # If any error occurs during parsing, return None for both patientid and date
        print(f"Error extracting data from {svg_file_path}: {e}")
        return None, None

In [6]:
# Set folder path
folder_path = "normal_files_from_Will"

# Initialize an empty DataFrame
df = pd.DataFrame(columns=["filename", "patient_id", "date", "severity"])

# Get the list of SVG files in the folder
svg_files = [f for f in os.listdir(folder_path) if f.endswith(".svg")]

# Loop through all the SVG files with tqdm progress bar
for filename in tqdm(svg_files, desc="Processing SVG files"):
    file_path = os.path.join(folder_path, filename)
    try:
        patientid, date, severity = extract_info_from_svg(file_path)
        # Append the data to the DataFrame
        df = df.append({"filename": filename, "patient_id": patientid, "date": date, "severity": severity}, ignore_index=True)
    except Exception as e:
        print(f"Error processing file {filename}: {e}")
        continue

# Display the DataFrame
df

Processing SVG files: 100%|██████████████████| 346/346 [00:01<00:00, 337.52it/s]


Unnamed: 0,filename,patient_id,date,severity
0,PageWriterTouchECG20187227558218.svg,44280720,2018/7/22,- ABNORMAL ECG -
1,PageWriterTouchECG20181111105338326.svg,20323506,2018/11/11,- ABNORMAL ECG -
2,PageWriterTouchECG202112822452626.svg,25216653,2021/1/29,- ABNORMAL ECG -
3,PageWriterTouchECG202121935845960.svg,46732723,2021/2/19,- ABNORMAL ECG -
4,PageWriterTouchECG2020111952629133.svg,45218799,2020/11/19,- ABNORMAL ECG -
...,...,...,...,...
341,PageWriterTouchECG2018122223418234.svg,45153159,2018/12/23,- ABNORMAL ECG -
342,PageWriterTouchECG202132611837174.svg,28667723,2021/3/26,- ABNORMAL ECG -
343,PageWriterTouchECG202037161115939.svg,46112305,2020/3/7,- ABNORMAL ECG -
344,PageWriterTouchECG202092231218465.svg,4150929,2020/9/22,- NORMAL ECG -


In [8]:
# df.to_csv("file_severity_info.csv", index=False)  # Uncomment this line if you want to save the DataFrame as a CSV file

In [7]:
severity_counts = df['severity'].value_counts()
severity_counts

- ABNORMAL ECG -            203
- NORMAL ECG -               52
- BORDERLINE ECG -           50
- OTHERWISE NORMAL ECG -     41
Name: severity, dtype: int64