In [1]:
from pathlib import Path
import os

from IPython import get_ipython
from IPython.core.magic import register_cell_magic

# Handle to the active IPython shell; the cell magic below delegates to it.
ipython = get_ipython()


@register_cell_magic
def pybash(line, cell):
    """Run a cell as bash after filling ``{name}`` placeholders from globals.

    The cell text is passed through ``str.format`` with this module's global
    namespace, so ``{structure_file}`` in the cell is replaced by the value of
    the global ``structure_file`` before the script is handed to ``%%bash``.

    Args:
        line: Text following ``%%pybash`` on the magic line (not used).
        cell: Body of the cell, i.e. the bash script template.
    """
    # Note: every literal brace in the script is interpreted by str.format.
    script = cell.format(**globals())
    ipython.run_cell_magic("bash", "", script)

In [2]:
import gemmi
from pathlib import Path


def extract_domain(
    struct_file: Path, chain: str, start_res: int, end_res: int, output_file: Path
) -> None:
    """
    Extract a residue range of one chain from a structure file using gemmi.

    The input is read with gemmi's format auto-detection and
    ``merge_chain_parts=True``. Only the first model is searched for the
    chain. Residues are selected by their sequence number alone
    (``seqid.num``); insertion codes are not taken into account. The selected
    residues are copied into a new single-model, single-chain structure that
    is written as mmCIF, and a one-line summary is printed on success.

    Args:
        struct_file: Path to the input structure file (PDB or mmCIF)
        chain: Chain identifier
        start_res: Start residue number (inclusive)
        end_res: End residue number (inclusive)
        output_file: Path to save the extracted domain (always as mmCIF)

    Raises:
        ValueError: If ``start_res`` is greater than ``end_res``, the chain is
            not found, or no residues fall inside the range.
        RuntimeError: For any other failure while reading, processing or
            writing; the original exception is chained as ``__cause__``.
    """
    # Fail fast on a reversed range instead of reading and parsing the file
    # only to report "no residues found".
    if start_res > end_res:
        raise ValueError(
            f"Invalid residue range {start_res}-{end_res}: "
            "start_res must not be greater than end_res"
        )

    try:
        # Accept plain strings as well; ``.stem`` below needs a Path.
        struct_file = Path(struct_file)

        structure = gemmi.read_structure(
            str(struct_file), merge_chain_parts=True, format=gemmi.CoorFormat.Detect
        )

        # Locate the requested chain in the first model only.
        source_chain = next((ch for ch in structure[0] if ch.name == chain), None)
        if source_chain is None:
            raise ValueError(f"Chain {chain} not found in {struct_file}")

        # Copy the residues whose sequence number lies in the inclusive range.
        new_chain = gemmi.Chain(chain)
        extracted_residues = 0
        for residue in source_chain:
            if start_res <= residue.seqid.num <= end_res:
                # Clone so the new chain holds its own copy of the residue.
                new_chain.add_residue(residue.clone())
                extracted_residues += 1

        if extracted_residues == 0:
            raise ValueError(
                f"No residues in range {start_res}-{end_res} found in chain {chain} of {struct_file}"
            )

        # Assemble the output: one structure, one model, one chain.
        domain = gemmi.Structure()
        domain.name = f"{struct_file.stem}_{chain}_{start_res}_{end_res}"
        model = gemmi.Model("1")
        model.add_chain(new_chain)
        domain.add_model(model)
        domain.make_mmcif_document().write_file(str(output_file))

        print(
            f"Successfully extracted domain from {struct_file}: chain {chain}, "
            f"residues {start_res}-{end_res} ({extracted_residues} residues) to {output_file}"
        )

    except ValueError:
        # Validation errors are already specific; let them through unchanged.
        raise
    except Exception as e:
        # Wrap everything else with the file name for context.
        raise RuntimeError(f"Failed to extract domain from {struct_file}: {e}") from e

In [3]:
# PDB identifier of the structure used throughout this notebook.
structure_id = "1kt0"

# Scratch directory for everything this notebook generates; created up front.
tmp_dir = Path("../tmp/domain_separation")
tmp_dir.mkdir(parents=True, exist_ok=True)

# Input: the structure file lives in a per-structure data directory.
data_dir = Path(f"../data/{structure_id}")
structure_file = data_dir / f"{structure_id}.cif"

# Outputs: SVG of the whole structure and SVG with the domains separated.
structure_svg = tmp_dir / f"{structure_id}.svg"
domains_svg = tmp_dir / f"{structure_id}-domains.svg"

## Generate Structure SVG without Domain Separation


In [4]:
%%pybash

# Render the whole structure to one SVG; the placeholders in braces are filled from notebook globals by the pybash magic.
uv run flatprot project {structure_file} -o {structure_svg}

2025-04-01 11:23:24 INFO     Using default styles
2025-04-01 11:23:24 INFO     SVG saved to ../tmp/domain_separation/1kt0.svg
2025-04-01 11:23:24 INFO     Successfully processed structure:
2025-04-01 11:23:24 INFO       Structure file: ../data/1kt0/1kt0.cif
2025-04-01 11:23:24 INFO       Output file: ../tmp/domain_separation/1kt0.svg
2025-04-01 11:23:24 INFO       Transformation: Inertia-based


## Load Domains generated by Chainsaw [(Wells et al. 2024)](https://doi.org/10.1093/bioinformatics/btae296) | [GitHub](https://github.com/JudeWells/chainsaw)


In [5]:
import polars as pl
from flatprot.core import ResidueRange
from flatprot.utils.domain_utils import DomainTransformation

# Define the path to the chainsaw domains file
chainsaw_file = data_dir / f"{structure_id.lower()}-chainsaw-domains.tsv"

# Read the domains file
domains_df = pl.read_csv(chainsaw_file, separator="\t")

# Select the row belonging to this structure. The comparison has to be a
# polars expression: a plain `"chain_id" == structure_id` compares two Python
# strings, yields False, and never looks at the column at all.
structure_rows = domains_df.filter(pl.col("chain_id") == structure_id)
if structure_rows.height == 0:
    raise ValueError(
        f"No row with chain_id {structure_id!r} found in {chainsaw_file}"
    )

# The chopping column holds comma-separated "start-end" residue ranges
chopping = structure_rows["chopping"][0]

# Build one DomainTransformation per range (all ranges are placed on chain "A")
domains = []
for range_str in chopping.split(","):
    start, end = map(int, range_str.split("-"))
    domains.append(DomainTransformation(ResidueRange("A", start, end)))

## Split the structure into domains


Successfully extracted domain from ../data/1kt0/1kt0.cif: chain A, residues 34-141 (89 residues) to ../tmp/domain_separation/1kt0_A_34_141.cif
Successfully extracted domain from ../data/1kt0/1kt0.cif: chain A, residues 148-251 (104 residues) to ../tmp/domain_separation/1kt0_A_148_251.cif
Successfully extracted domain from ../data/1kt0/1kt0.cif: chain A, residues 256-411 (152 residues) to ../tmp/domain_separation/1kt0_A_256_411.cif
--- Debug: Combined CANVAS Coordinates ---
  Range (0-89):
    Shape: (89, 2)
    Min XY: [-256.61654924 -273.9778134 ]
    Max XY: [272.18737812 266.0221866 ]
  Range (0-104):
    Shape: (104, 2)
    Min XY: [ 578.80392736 -238.10459304]
    Max XY: [1298.80392736  233.68946882]
  Range (0-152):
    Shape: (152, 2)
    Min XY: [1348.80392736 -243.5166494 ]
    Max XY: [1946.0835796  296.4833506]
--- End Debug ---
Combined SVG saved to ../tmp/domain_separation/1kt0-domains-combined.svg
