In [1]:

# --- Cell 1: Parser classes & helpers (no ace_tools) ---
from dataclasses import dataclass, field
from typing import List, Dict, Optional
from pathlib import Path
import pandas as pd
import re

def slugify(heading: str) -> str:
    """
    Turn a Markdown heading into a snake_case key.

    Leading '#' markers and whitespace are dropped, the text is lower-cased, and
    every run of characters outside [a-z0-9] collapses to a single underscore.
    An empty result becomes "section"; a result that starts with a digit is
    prefixed with "_".
    """
    text = re.sub(r'^[#\s]+', '', heading.strip())
    text = re.sub(r'\s+', ' ', text).strip().lower()
    slug = re.sub(r'[^a-z0-9]+', '_', text).strip('_')
    if not slug:
        slug = "section"
    # Only a leading digit can reach the prefix branch: underscores were stripped above.
    return slug if re.match(r'^[a-z_]', slug) else "_" + slug

@dataclass
class DescriptionObject:
    """
    Represents one CSV row's description parsed into Markdown sections.

    - sections maps each snake_case heading key to the raw text under that heading.
    - Each section is also exposed as an attribute of the same name, unless the key
      collides with a dataclass field or an existing class attribute/method (for
      example a '# Title' heading). Colliding sections are NOT set as attributes,
      so the real fields are never overwritten; they remain reachable through
      sections[key] and parsed_items[key].
    - parsed_items[section_key] holds bullet-point items as [{name, value}, ...].
    """
    source_index: int
    title: str
    raw_description: str
    sections: Dict[str, str] = field(default_factory=dict)
    parsed_items: Dict[str, List[Dict[str, str]]] = field(default_factory=dict)

    def __post_init__(self):
        # Names that must never be shadowed by a section: the dataclass fields
        # themselves plus anything defined on the class (methods, dunders).
        reserved = set(self.__dataclass_fields__) | set(dir(type(self)))

        for sec_key, content in self.sections.items():
            # Expose the section as an attribute only when that is safe.
            if sec_key not in reserved:
                setattr(self, sec_key, content)
            # Pre-parse "-" bullet lines into name/value pairs (split at first "=")
            self.parsed_items[sec_key] = self._parse_bullet_items(content)

    @staticmethod
    def _parse_bullet_items(content: str) -> List[Dict[str, str]]:
        """
        Extract '- name = value' bullet lines from a section's text.

        Returns a list of {"name": ..., "value": ...} dicts in document order.
        Bullets without '=' get an empty value; the split happens at the first
        '=' only. Horizontal rules and non-bullet lines are skipped. Non-string
        content yields an empty list.
        """
        items: List[Dict[str, str]] = []
        if not isinstance(content, str):
            return items

        # Matches hr/thematic break variants: '---', '- - -', '***', '___', with optional spaces
        hr_line = re.compile(r"^[-_*](?:\s*[-_*]){2,}\s*$")
        # Bullet lines like '- name = value' (requires a space after the dash and some content)
        bullet = re.compile(r"^\s*-\s+(.*\S)\s*$")

        for line in content.splitlines():
            # 1) Ignore horizontal rules (e.g., '---', '- - -', '***', '___')
            if hr_line.match(line.strip()):
                continue

            # 2) Only treat '- something' as a bullet (not just a raw '-' line)
            m = bullet.match(line)
            if not m:
                continue

            line_content = m.group(1)

            # 3) Also ignore if, after removing spaces, it's still only dashes/underscores/asterisks
            #    (covers the case where '- ---' would have produced '--')
            if re.fullmatch(r"[-_*]+", line_content.replace(" ", "")):
                continue

            # 4) Split name/value on the first '=' if present
            name, sep, value = line_content.partition("=")
            items.append({"name": name.strip(), "value": value.strip() if sep else ""})

        return items



class InterfaceIssueParser:
    """
    Reads a CSV with 'description' (Markdown) and 'title', producing DescriptionObject instances.

    Every Markdown heading in a description (levels '#' through '######') starts a
    new section; the section key is the snake_case slug of the heading text.
    """

    # Column layout of every frame returned by section_items_dataframe, so that
    # an empty result still has the expected columns.
    ITEM_COLUMNS = ["source_index", "title", "section", "property_name", "property_value"]

    def __init__(self, csv_path: str, description_col: str = "description", title_col: str = "title"):
        """
        Store the CSV location and the (case-insensitive) column names to read.
        Nothing is read from disk until load() or build_objects() is called.
        """
        self.csv_path = csv_path
        self.description_col = description_col
        self.title_col = title_col
        self.df: Optional[pd.DataFrame] = None
        self.objects: List["DescriptionObject"] = []

    @staticmethod
    def parse_markdown_sections(md: str) -> Dict[str, str]:
        """
        Split Markdown text into {section_key: section_text}.

        A section runs from one heading line to the next heading line of any
        level (or the end of the text). Text before the first heading is dropped.
        Repeated headings get '_2', '_3', ... suffixes. Non-string or blank input,
        or text without headings, yields an empty dict.
        """
        if not isinstance(md, str) or not md.strip():
            return {}
        text = md.replace("\r\n", "\n").replace("\r", "\n")
        # Match lines like "# Heading", "## Subheading", etc. (start-of-line).
        # Only horizontal whitespace may follow the hashes, and the heading text
        # must start with a non-space character: otherwise a bare "#" line would
        # reach across the newline and take the next line as its heading.
        heading_regex = re.compile(r"^(#{1,6})[ \t]*(\S.*?)[ \t]*$", re.MULTILINE)

        sections: Dict[str, str] = {}
        matches = list(heading_regex.finditer(text))

        for i, m in enumerate(matches):
            start = m.end()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
            base_key = slugify(m.group(2).strip())
            content = text[start:end].strip()

            # Avoid overwriting duplicate headings
            key = base_key
            counter = 2
            while key in sections:
                key = f"{base_key}_{counter}"
                counter += 1

            sections[key] = content

        return sections

    def load(self):
        """
        Read the CSV into self.df and resolve the configured column names.

        Column lookup is case-insensitive; description_col and title_col are
        rewritten to the exact spelling found in the file.

        Raises:
            KeyError: if either column is not present in the CSV.
        """
        self.df = pd.read_csv(self.csv_path, encoding="utf-8", engine="python")
        # Normalize column names case-insensitively
        lower_map = {str(c).lower(): c for c in self.df.columns}
        if self.description_col.lower() in lower_map:
            self.description_col = lower_map[self.description_col.lower()]
        else:
            raise KeyError(f"Description column '{self.description_col}' not found. Columns: {list(self.df.columns)}")
        if self.title_col.lower() in lower_map:
            self.title_col = lower_map[self.title_col.lower()]
        else:
            raise KeyError(f"Title column '{self.title_col}' not found. Columns: {list(self.df.columns)}")

    @staticmethod
    def _cell_text(value) -> str:
        """Return a CSV cell as text, mapping missing values (None/NaN/pd.NA) to ''."""
        if isinstance(value, str):
            return value
        if value is None or value is pd.NA:
            return ""
        # NaN is the only float that is not equal to itself.
        if isinstance(value, float) and value != value:
            return ""
        return str(value)

    def build_objects(self):
        """
        Rebuild self.objects with one DescriptionObject per row of self.df,
        loading the CSV first if that has not happened yet. Missing title or
        description cells become empty strings rather than the text 'nan'.
        """
        if self.df is None:
            self.load()
        self.objects = []
        for idx, row in self.df.iterrows():
            desc = row.get(self.description_col, "")
            title = row.get(self.title_col, "")
            self.objects.append(
                DescriptionObject(
                    source_index=idx,
                    title=self._cell_text(title),
                    raw_description=self._cell_text(desc),
                    sections=self.parse_markdown_sections(desc),
                )
            )

    def get_by_section_presence(self, section_heading: str) -> List["DescriptionObject"]:
        """
        Return the objects whose description has a non-blank section under the
        given heading.

        The lookup goes through each object's sections dict, not through
        attributes, so a heading that slugifies to a field name such as 'title'
        or 'source_index' cannot match (or crash on) the field itself.
        """
        key = slugify(section_heading)
        found = []
        for o in self.objects:
            content = o.sections.get(key)
            if isinstance(content, str) and content.strip():
                found.append(o)
        return found

    def section_items_dataframe(self, objs: List["DescriptionObject"], section_heading: str) -> pd.DataFrame:
        """
        Flatten the bullet items of one section into a tidy DataFrame with one
        row per bullet. The columns are always ITEM_COLUMNS, even when no object
        has items in that section.
        """
        section_key = slugify(section_heading)
        rows = [
            {
                "source_index": o.source_index,
                "title": o.title,
                "section": section_heading,
                "property_name": item["name"],
                "property_value": item["value"],
            }
            for o in objs
            for item in o.parsed_items.get(section_key, [])
        ]
        return pd.DataFrame(rows, columns=self.ITEM_COLUMNS)


In [2]:
# --- Cell 2: Run the parser on your CSV and show results ---
# Section headings of interest, named once so the filter step and the
# flattening step below always use the same text.
PROCESS_SECTION = "Process Interface Information"
STRUCTURAL_SECTION = "Structural Interface Information"

# Path to the CSV export (local to your notebook environment)
csv_path = "Example interface issue GitLab export.csv"  # <-- change if needed

# Read the CSV, then turn every row into a DescriptionObject
parser = InterfaceIssueParser(csv_path=csv_path, description_col="description", title_col="title")
parser.load()
parser.build_objects()

# Process section: keep the rows that contain it, then build a tidy DataFrame
# (one row per "-" bullet item with name/value split at "=")
process_objs = parser.get_by_section_presence(PROCESS_SECTION)
process_items_df = parser.section_items_dataframe(process_objs, PROCESS_SECTION)

# Structural section: same two steps
structural_objs = parser.get_by_section_presence(STRUCTURAL_SECTION)
structural_items_df = parser.section_items_dataframe(structural_objs, STRUCTURAL_SECTION)

# Preview the first 20 rows of each table in Jupyter
display(process_items_df.head(20), structural_items_df.head(20))

# (Optional) Save combined CSVs
process_items_df.to_csv("process_interface_items.csv", index=False)
structural_items_df.to_csv("structural_interface_items.csv", index=False)
print("Saved: process_interface_items.csv, structural_interface_items.csv")


Unnamed: 0,source_index,title,section,property_name,property_value
0,0,HSS-HSSS to AGSS\r\nHigh Pressure Helium Makeu...,Process Interface Information,Interface_Diameter,DN25
1,0,HSS-HSSS to AGSS\r\nHigh Pressure Helium Makeu...,Process Interface Information,Interface_Elevation,4TH FLOOR
2,0,HSS-HSSS to AGSS\r\nHigh Pressure Helium Makeu...,Process Interface Information,Pipe_Specification,SC1C
3,0,HSS-HSSS to AGSS\r\nHigh Pressure Helium Makeu...,Process Interface Information,Insulation_Specification,
4,0,HSS-HSSS to AGSS\r\nHigh Pressure Helium Makeu...,Process Interface Information,Nominal_Flow_Rate,5g/s
5,0,HSS-HSSS to AGSS\r\nHigh Pressure Helium Makeu...,Process Interface Information,Maximum_Flow_Rate,TBD
6,0,HSS-HSSS to AGSS\r\nHigh Pressure Helium Makeu...,Process Interface Information,Minimum_Flow_Rate,TBD
7,0,HSS-HSSS to AGSS\r\nHigh Pressure Helium Makeu...,Process Interface Information,Nominal_Pressure,6.2 Mpa
8,0,HSS-HSSS to AGSS\r\nHigh Pressure Helium Makeu...,Process Interface Information,Maximum_Pressure,TBD
9,0,HSS-HSSS to AGSS\r\nHigh Pressure Helium Makeu...,Process Interface Information,Minimum_Pressure,TBD


Unnamed: 0,source_index,title,section,property_name,property_value
0,0,HSS-HSSS to AGSS\r\nHigh Pressure Helium Makeu...,Structural Interface Information,Internal_or_External,Internal
1,0,HSS-HSSS to AGSS\r\nHigh Pressure Helium Makeu...,Structural Interface Information,Subcategory,Pipe Flange
2,0,HSS-HSSS to AGSS\r\nHigh Pressure Helium Makeu...,Structural Interface Information,Weight,Pipe/Valve not modeled
3,0,HSS-HSSS to AGSS\r\nHigh Pressure Helium Makeu...,Structural Interface Information,Size,Pipe/Valve not modeled
4,0,HSS-HSSS to AGSS\r\nHigh Pressure Helium Makeu...,Structural Interface Information,Location,FHS Floor 4
5,1,AGSS to HRS Recovered Helium from Charge and D...,Structural Interface Information,Internal_or_External,Internal
6,1,AGSS to HRS Recovered Helium from Charge and D...,Structural Interface Information,Subcategory,Pipe Flange
7,1,AGSS to HRS Recovered Helium from Charge and D...,Structural Interface Information,Weight,Pipe/Valve not modeled AGSS Manifold A000N542
8,1,AGSS to HRS Recovered Helium from Charge and D...,Structural Interface Information,Size,Pipe/Valve not modeled A000N542
9,1,AGSS to HRS Recovered Helium from Charge and D...,Structural Interface Information,Location,FHS Floor 4


Saved: process_interface_items.csv, structural_interface_items.csv


In [3]:
# --- Cell 3: (Optional) Per-source outputs: one CSV per source for each section ---
from pathlib import Path
import zipfile

# Root folder for the per-source output; each source gets its own
# sub-directory below it. Created up front (with parents) and left as-is
# when it already exists.
out_root = Path("per_source_interface_items")
out_root.mkdir(parents=True, exist_ok=True)

def write_per_source_csvs(
    objects: List["DescriptionObject"],
    section_heading: str,
    filename: str,
    out_dir: Optional[Path] = None,
    max_slug_len: int = 100,
):
    """
    Write one CSV per source object containing that object's bullet items for
    a single section.

    Each object with at least one item gets a directory
    '<source_index:04d>_<title slug>' under the output root, holding `filename`.
    Objects without items in the section produce no directory and no file.

    Args:
        objects: Parsed description objects to export.
        section_heading: Heading text of the section (slugified for lookup and
            written verbatim into the 'section' column).
        filename: Name of the CSV file created inside each source directory.
        out_dir: Output root. Defaults to the notebook-level `out_root`.
        max_slug_len: Upper bound on the title part of the directory name.
            Titles are free text and can be long enough to exceed filesystem
            name limits, so the slug is truncated to this many characters.
    """
    target_root = Path(out_dir) if out_dir is not None else out_root
    section_key = slugify(section_heading)

    for o in objects:
        rows = [
            {
                "source_index": o.source_index,
                "title": o.title,
                "section": section_heading,
                "property_name": item["name"],
                "property_value": item["value"],
            }
            for item in o.parsed_items.get(section_key, [])
        ]
        if not rows:
            continue

        # Truncate, then drop an underscore left dangling by the cut; fall back
        # to 'row' if nothing usable remains.
        title_slug = slugify(o.title)[:max_slug_len].rstrip("_") or "row"
        src_dir = target_root / f"{o.source_index:04d}_{title_slug}"
        src_dir.mkdir(exist_ok=True, parents=True)
        pd.DataFrame(rows).to_csv(src_dir / filename, index=False)

# Create per-source CSVs: one pass per (section heading, output file name) pair
for _heading, _csv_name in (
    ("Process Interface Information", "process_interface_items.csv"),
    ("Structural Interface Information", "structural_interface_items.csv"),
):
    write_per_source_csvs(parser.objects, _heading, _csv_name)


