In [None]:
from pathlib import Path
import xml.etree.ElementTree as ET

# Root directory holding the datasets and the name of the dataset to load.
dir_data = "../datasets/CMU_C2C12"
dataset = "090303-C2C12P15-FGF2"

dataset_dir_path = Path(dir_data) / dataset

# Annotation file to parse. To load the other annotation file instead, swap in:
# xml_path = dataset_dir_path / "Human Full Annotation_exp1_F0009 Data.xml"
xml_path = dataset_dir_path / "Computer exp1_F0001 Data.xml"

# Parse the XML document once; later cells navigate it starting from `root`.
tree = ET.parse(xml_path)
root = tree.getroot()

In [None]:
# Revised approach for extracting lineage-centric and frame-centric data
# This approach maintains the parent-child relationship

# Extracting lineage-centric information
def extract_lineage_centric_info(fs_nodes):
    """Build one lineage record per 'a' element found under the given 'fs' nodes.

    The traversal is fs -> f -> as, then every descendant 'a' element of each
    'as'. A cell is counted as a mitosis event when the first 'as' child of its
    'a' element holds exactly two 'a' children; its daughter ids are then
    collected from the 'a' children of all of its 'as' children. An 'a' element
    with no 'as' child is recorded with an empty daughter list.

    Args:
        fs_nodes: iterable of ElementTree elements, each searched for 'f'
            children.

    Returns:
        A list of dicts with keys 'cellID', 'parentcellID' and
        'daughtercellIDs' (a list of id strings, possibly empty), in document
        order.

    Side effects:
        Prints a message for any cell with more than two daughter candidates,
        and prints the total number of mitosis events.
    """
    lineage_data = []
    mitosis_count = 0
    for fs in fs_nodes:
        for f in fs.findall('f'):
            # NOTE(review): every cell below this 'f', including nested
            # daughters, is given the 'f' id as its parent -- confirm this is
            # the intended meaning of 'parentcellID'.
            parent_cell_id = f.get('id')
            for _as in f.findall('as'):
                for a_node in _as.findall('.//a'):
                    cell_id = a_node.get('id')
                    sub_as_elements = a_node.findall('as')
                    # A cell without any 'as' child has no daughters; guarding
                    # here avoids an IndexError on sub_as_elements[0].
                    if sub_as_elements:
                        sub_as_a_elements = sub_as_elements[0].findall('a')
                    else:
                        sub_as_a_elements = []
                    daughtercell_ids = []
                    if len(sub_as_a_elements) == 2:  # Cell with daughters
                        daughtercell_ids = [
                            daughter_cell.get('id')
                            for as_element in sub_as_elements
                            for daughter_cell in as_element.findall('a')
                        ]
                        mitosis_count += 1
                    elif len(sub_as_a_elements) > 2:
                        # The count is of 'a' children, so the message names them.
                        print(f"Error: More than 2 daughter 'a' elements for cell ID {cell_id}")
                    lineage_data.append({
                        'cellID': cell_id,
                        'parentcellID': parent_cell_id,
                        'daughtercellIDs': daughtercell_ids
                    })
    print("Number of mitosis events:", mitosis_count)
    return lineage_data

# Extracting frame-centric information
def extract_frame_centric_info(fs_nodes):
    """Build one record per 'ss' child of every 'a' element under the 'fs' nodes.

    The traversal is fs -> f -> as, then every descendant 'a' element of each
    'as'. For each 'ss' child of such an 'a', the 'x', 'y' and 's' attributes
    of its 's' children are read as float, float and int respectively.

    Args:
        fs_nodes: iterable of ElementTree elements, each searched for 'f'
            children.

    Returns:
        A list of dicts with keys 'cellID', 'cellColour', 'cellType',
        'xcoords', 'ycoords' and 'cellStatus', in document order.
    """
    # Flatten the fs -> f -> as -> descendant 'a' walk into one lazy stream.
    cells = (
        cell
        for fs_node in fs_nodes
        for frame in fs_node.findall('f')
        for group in frame.findall('as')
        for cell in group.findall('.//a')
    )

    records = []
    for cell in cells:
        for track in cell.findall('ss'):
            # Parse each 's' element fully (x, then y, then s) before moving
            # on to the next one.
            samples = [
                (float(point.get('x')), float(point.get('y')), int(point.get('s')))
                for point in track.findall('s')
            ]
            records.append({
                'cellID': cell.get('id'),
                'cellColour': cell.get('brush'),
                'cellType': cell.get('type'),
                'xcoords': [x for x, _, _ in samples],
                'ycoords': [y for _, y, _ in samples],
                'cellStatus': [status for _, _, status in samples],
            })
    return records

# Collect the 'fs' elements that are direct children of the document root
fs_nodes = root.findall('fs')

# Derive both views (lineage records and per-'ss' records) from the same nodes
lineage_centric_data = extract_lineage_centric_info(fs_nodes)
frame_centric_data = extract_frame_centric_info(fs_nodes)

# Sanity check: print the first five records of each view
print(lineage_centric_data[:5], frame_centric_data[:5])

# Cell output: number of 'fs' nodes, lineage records and frame records
tuple(map(len, (fs_nodes, lineage_centric_data, frame_centric_data)))

In [None]:
# Inspect every descendant 'a' element beneath the first 'fs' node
list(fs_nodes[0].iterfind('.//a'))