# Norwegian Trails Data Exploration

**Goal**: Understand the actual structure and content of Turrutebasen data from Geonorge before building abstractions.

**Data Source**: Geonorge/Kartverket - Norwegian government's official trail database (Turrutebasen)

In [None]:
# Setup and imports
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

# Our minimal modules
from trails.analysis.describe import describe_dataframe
from trails.io.sources.geonorge import Source as GeonorgeSource
from trails.io.sources.language import Language

# Suppress all warnings so library notices do not clutter the cell output
warnings.filterwarnings("ignore")

# Locate the project: when started from a "notebooks" folder, go one level up
working_dir = Path.cwd()
if working_dir.name == "notebooks":
    project_root = working_dir.parent
else:
    project_root = working_dir
cache_dir = project_root / ".cache"

# pandas display options: no column limit, capped row count and cell width
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.max_rows", 100),
    ("display.max_colwidth", 50),
):
    pd.set_option(option_name, option_value)

# Default size for every matplotlib figure created in this notebook
plt.rcParams["figure.figsize"] = (12, 8)

print("Setup complete!")
print(f"Project root: {project_root}")
print(f"Cache directory: {cache_dir}")

## 1. Load Geonorge Data

First, let's load the data and see what we're working with.

In [None]:
# Create the Geonorge source, pointing it at the local cache directory
geonorge = GeonorgeSource(cache_dir=str(cache_dir))

# Request English translations; pass Language.NO to keep the Norwegian names
trail_data = geonorge.load_turrutebasen(language=Language.EN)

print("Data Source Information:")
print("=" * 60)

# (printed label, metadata attribute) pairs, reported in this order
source_metadata = trail_data.metadata
for label, attribute in (
    ("Dataset", "dataset_name"),
    ("Provider", "provider"),
    ("License", "license"),
    ("Dataset ID", "dataset_id"),
    ("Description", "description"),
    ("Attribution", "attribution"),
):
    print(f"{label}: {getattr(source_metadata, attribute)}")

# Facts about the loaded snapshot itself
print(f"\nData loaded successfully from: {trail_data.source_url}")
print(f"Version: {trail_data.version}")
print(f"Language: {trail_data.language.value}")
print(f"CRS: {trail_data.crs}")
print(f"Total features: {trail_data.total_features}")

# One line per spatial layer with its feature count
spatial_layers = trail_data.spatial_layers
print(f"\nFound {len(spatial_layers)} spatial layers:")
for layer_name, gdf in spatial_layers.items():
    feature_count = len(gdf)
    print(f"  - {layer_name}: {feature_count} features")

# One line per attribute table with its row count
attribute_tables = trail_data.attribute_tables
print(f"\nFound {len(attribute_tables)} attribute tables:")
for table_name, df in attribute_tables.items():
    row_count = len(df)
    print(f"  - {table_name}: {row_count} rows")

## 2. Complete Layer Analysis

Let's examine each layer in detail, analyzing both structure and columns.

In [None]:
# Walk every spatial layer: print its shape, CRS, geometry facts and a
# per-column description, and record a compact summary in ``layer_summary``.
print("=" * 80)
print("SPATIAL LAYERS ANALYSIS")
print("=" * 80)

# layer name -> {"shape", "crs", "columns", "geometry_types"}
layer_summary = {}

for layer_name, gdf in trail_data.spatial_layers.items():
    print(f"\n{'=' * 60}")
    print(f"LAYER: {layer_name}")
    print(f"{'=' * 60}")

    # Basic info
    print("\nBasic Information:")
    print(f"  Shape: {gdf.shape}")
    print(f"  CRS: {gdf.crs}")

    # Reset for each layer: without this, a non-empty layer that has no
    # "geometry" column would report the previous layer's geometry types
    # (or fail with NameError when it is the first layer processed).
    geom_types = []

    # Geometry info
    print("\nGeometry Information:")
    if not gdf.empty and "geometry" in gdf.columns:
        geom_types = gdf.geometry.geom_type.unique()
        print(f"  Geometry types: {geom_types}")
        print(f"  Total bounds: {gdf.total_bounds}")

    # Describe every column except the geometry column itself
    columns = [col for col in gdf.columns if col != "geometry"]
    print(f"\nColumn Analysis ({len(columns)} columns, sorted by completeness):")

    # describe_dataframe returns the report as text
    print("\n" + describe_dataframe(gdf, columns))

    # Store summary
    layer_summary[layer_name] = {
        "shape": gdf.shape,
        "crs": str(gdf.crs),
        "columns": list(gdf.columns),
        "geometry_types": list(geom_types),
    }

In [None]:
# Describe every non-spatial attribute table: its shape plus a per-column report
heavy_rule = "=" * 80
light_rule = "=" * 60

print("\n" + heavy_rule)
print("ATTRIBUTE TABLES ANALYSIS")
print(heavy_rule)

for table_name, df in trail_data.attribute_tables.items():
    # Table banner
    print(f"\n{light_rule}")
    print(f"TABLE: {table_name}")
    print(f"{light_rule}")

    # Row and column counts
    print("\nBasic Information:")
    print(f"  Shape: {df.shape}")

    # Per-column report over all columns of the table
    column_total = len(df.columns)
    print(f"\nColumn Analysis ({column_total} columns, sorted by completeness):")

    column_report = describe_dataframe(df)
    print("\n" + column_report)

## 3. Identify Trail Layers

Let's identify which layers contain actual trail data (LineString or MultiLineString geometries).

In [None]:
# Sort the spatial layers into buckets according to the geometry types they hold
line_kinds = {"LineString", "MultiLineString"}
point_kinds = {"Point", "MultiPoint"}

trail_layers = {}
point_layers = {}
other_layers = {}

for layer_name, gdf in trail_data.spatial_layers.items():
    # A layer without features goes straight to the "other" bucket
    if gdf.empty:
        other_layers[layer_name] = gdf
        continue

    geom_types = gdf.geometry.geom_type.unique()
    kinds_present = set(geom_types)

    # Line geometries take precedence over points when a layer holds both
    if kinds_present & line_kinds:
        bucket = trail_layers
    elif kinds_present & point_kinds:
        bucket = point_layers
    else:
        bucket = other_layers
    bucket[layer_name] = gdf

print("Layer Classification:")
print("=" * 60)

# The "other" bucket is only reported when it actually contains layers
for heading, layers, always_shown in (
    ("Trail Layers (Lines)", trail_layers, True),
    ("Point Layers (Facilities)", point_layers, True),
    ("Other Layers", other_layers, False),
):
    if not (always_shown or layers):
        continue
    print(f"\n{heading}: {len(layers)}")
    for name, gdf in layers.items():
        print(f"  - {name}: {len(gdf)} features")

## 4. Database Structure & Relationships

Let's understand how the hiking trail data is organized in the database, including spatial layers, attribute tables, and their relationships.

### 4.1 Spatial Layer Schema

Analyze the structure and fields of the spatial layer containing trail geometries.

In [None]:
# Inspect the schema of the hiking-trail centerline layer.
# The layer name depends on whether the data was loaded translated or not.
if trail_data.language == Language.EN:
    hiking_trail_layer = "hiking_trail_centerline"
else:
    hiking_trail_layer = "fotrute_senterlinje"

if hiking_trail_layer in trail_data.spatial_layers:
    trails = trail_data.spatial_layers[hiking_trail_layer]
    section_rule = "-" * 50

    print("SPATIAL LAYER SCHEMA ANALYSIS")
    print("=" * 70)

    # Section 1: overall shape of the layer
    print("\n1. LAYER STRUCTURE")
    print(section_rule)
    print(f"Layer name: {hiking_trail_layer}")
    print(f"Total features: {len(trails):,}")
    print(f"CRS: {trails.crs}")
    print(f"Geometry type: {trails.geometry.geom_type.unique()}")
    print(f"Total columns: {len(trails.columns)}")

    # Section 2: non-geometry columns grouped by their dtype name
    print("\n2. COLUMN TYPES")
    print(section_rule)

    spatial_type_groups: dict[str, list[str]] = {}
    for col in trails.columns:
        if col == "geometry":
            continue
        spatial_type_groups.setdefault(str(trails[col].dtype), []).append(col)

    for dtype_str in sorted(spatial_type_groups):
        cols = spatial_type_groups[dtype_str]
        print(f"\n{dtype_str} columns ({len(cols)}):")
        for col in sorted(cols):
            print(f"  - {col}")

    # Section 3: identifier columns, reported in the listed order
    identifier_fields = ["local_id", "version_id", "namespace"]
    print("\n3. KEY IDENTIFIER FIELDS")
    print(section_rule)
    print(describe_dataframe(trails, identifier_fields, sort_by_completeness=False))

    # Section 4: provenance/metadata columns, reported in the listed order
    metadata_fields = [
        "origin",
        "update_date",
        "copy_date",
        "data_capture_date",
        "measurement_method",
        "accuracy",
    ]
    print("\n4. METADATA FIELDS")
    print(section_rule)
    print(describe_dataframe(trails, metadata_fields, sort_by_completeness=False))

    # Section 5: trail attributes stored directly on the geometry rows
    # (describe_dataframe is called with its default ordering here)
    trail_attribute_fields = [
        "object_type",
        "marking",
        "signage",
        "lighting",
        "season",
        "information",
        "trail_follows",
        "trail_width",
        "surface_type",
        "traffic_load",
        "SHAPE_Length",
    ]
    print("\n5. TRAIL ATTRIBUTE FIELDS (in spatial layer)")
    print(section_rule)
    print(describe_dataframe(trails, trail_attribute_fields))

### 4.2 Attribute Table Schema

Analyze the structure of the attribute table that contains detailed trail information.

In [None]:
# Inspect the schema of the hiking-trail attribute table.
# The table name depends on whether the data was loaded translated or not.
if trail_data.language == Language.EN:
    hiking_attrs_layer = "hiking_trail_info_table"
else:
    hiking_attrs_layer = "fotrute_info_tabell"

if hiking_attrs_layer in trail_data.attribute_tables:
    hiking_attrs = trail_data.attribute_tables[hiking_attrs_layer]
    section_rule = "-" * 50

    print("ATTRIBUTE TABLE SCHEMA ANALYSIS")
    print("=" * 70)

    # Section 1: overall shape of the table
    print("\n1. TABLE STRUCTURE")
    print(section_rule)
    print(f"Table name: {hiking_attrs_layer}")
    print(f"Total rows: {len(hiking_attrs):,}")
    print(f"Total columns: {len(hiking_attrs.columns)}")

    # Section 2: columns grouped by their dtype name
    print("\n2. COLUMN TYPES")
    print(section_rule)

    attr_type_groups: dict[str, list[str]] = {}
    for col in hiking_attrs.columns:
        attr_type_groups.setdefault(str(hiking_attrs[col].dtype), []).append(col)

    for dtype_str in sorted(attr_type_groups):
        cols = attr_type_groups[dtype_str]
        print(f"\n{dtype_str} columns ({len(cols)}):")
        for col in sorted(cols):
            print(f"  - {col}")

    # Section 3: the column that links rows to the spatial layer
    print("\n3. KEY FIELDS")
    print(section_rule)
    print(describe_dataframe(hiking_attrs, ["hiking_trail_fk"], sort_by_completeness=False))

    # Section 4: descriptive trail information
    information_fields = [
        "object_type",
        "special_hiking_trail_type",
        "trail_type",
        "trail_name",
        "trail_number",
        "trail_significance",
        "trail_information",
        "accessibility",
        "difficulty",
    ]
    print("\n4. INFORMATION FIELDS")
    print(section_rule)
    print(describe_dataframe(hiking_attrs, information_fields))

    # Section 5: who is responsible for the trail
    print("\n5. ADMINISTRATIVE FIELDS")
    print(section_rule)
    print(describe_dataframe(hiking_attrs, ["maintenance_responsible"]))

### 4.3 Foreign Key Relationships

Analyze how the spatial layer and attribute table are connected through foreign keys.

In [None]:
# Analyze how attribute rows reference trail geometries through hiking_trail_fk.
# Relies on hiking_attrs and hiking_trail_layer from the schema cells above.
# Defines multi_row_fks, orphaned_fks and unreferenced_ids, which the data
# quality cell reads afterwards.
if "hiking_trail_fk" in hiking_attrs.columns:
    print("FOREIGN KEY RELATIONSHIP ANALYSIS")
    print("=" * 70)

    hiking_spatial = trail_data.spatial_layers[hiking_trail_layer]
    fk_column = hiking_attrs["hiking_trail_fk"]

    # Basic relationship stats
    print("\n1. RELATIONSHIP OVERVIEW")
    print("-" * 50)
    print(f"Spatial layer records: {len(hiking_spatial):,}")
    print(f"Attribute table records: {len(hiking_attrs):,}")
    print("Relationship type: Many-to-One (many attribute rows -> one geometry)")

    # Attribute rows per distinct key; computed once and reused below
    fk_counts = fk_column.value_counts()
    unique_fk_total = fk_column.nunique()
    referenced_total = len(fk_counts)

    print("\n2. FOREIGN KEY DISTRIBUTION")
    print("-" * 50)
    fk_dtype = fk_column.dtype
    print(f"Foreign key column type: hiking_trail_fk [{fk_dtype}]")
    print(f"Unique foreign keys: {unique_fk_total:,}")
    print(f"Should match spatial records: {len(hiking_spatial):,}")
    print(f"Match: {'✓ Yes' if unique_fk_total == len(hiking_spatial) else '✗ No'}")

    print("\nDistribution of attribute rows per geometry:")
    print(f"  Min: {fk_counts.min()} row(s)")
    print(f"  Max: {fk_counts.max()} row(s)")
    print(f"  Mean: {fk_counts.mean():.2f} row(s)")
    print(f"  Median: {fk_counts.median():.0f} row(s)")

    # Detailed distribution
    print("\n3. RELATIONSHIP CARDINALITY")
    print("-" * 50)

    # How many geometries have exactly k attribute rows, ordered by k
    cardinality_dist = fk_counts.value_counts().sort_index()

    # Group into categories
    one_to_one = (fk_counts == 1).sum()
    one_to_two = (fk_counts == 2).sum()
    one_to_many = (fk_counts > 2).sum()

    print(f"One-to-One (1 attribute row): {one_to_one:,} geometries ({one_to_one / referenced_total * 100:.1f}%)")
    print(f"One-to-Two (2 attribute rows): {one_to_two:,} geometries ({one_to_two / referenced_total * 100:.1f}%)")
    print(f"One-to-Many (3+ attribute rows): {one_to_many:,} geometries ({one_to_many / referenced_total * 100:.1f}%)")

    # Show distribution for first 20 cardinalities
    print("\nDetailed cardinality distribution:")
    for n_rows, count in cardinality_dist.head(20).items():
        pct = count / referenced_total * 100
        print(f"  {n_rows} attribute row(s): {count:,} geometries ({pct:.1f}%)")

    # Analyze why multiple rows exist
    print("\n4. MULTIPLE ROWS ANALYSIS")
    print("-" * 50)

    # Keys referenced by more than one attribute row
    multi_row_fks = fk_counts[fk_counts > 1].index

    if len(multi_row_fks) > 0:
        # Use the first such key as a worked example
        sample_fk = multi_row_fks[0]
        sample_rows = hiking_attrs[fk_column == sample_fk]

        # str() keeps the preview working when the key is not a string
        # (slicing an integer key would raise TypeError)
        print(f"Example: FK '{str(sample_fk)[:8]}...' has {len(sample_rows)} rows")

        # Columns whose value differs between the example's rows
        varying_cols = [
            col
            for col in hiking_attrs.columns
            if col != "hiking_trail_fk" and sample_rows[col].nunique() > 1
        ]

        print(f"Columns that vary across these rows: {len(varying_cols)}")
        if "trail_name" in varying_cols:
            name_dtype = hiking_attrs["trail_name"].dtype
            print(f"\nDifferent trail names [column type: {name_dtype}] for same segment:")
            for name in sample_rows["trail_name"].unique()[:5]:
                if pd.notna(name):
                    print(f"  - {name}")

    # Validation check
    print("\n5. DATA INTEGRITY CHECKS")
    print("-" * 50)

    # Compare the key sets on both sides of the relationship
    spatial_ids = set(hiking_spatial["local_id"])
    attr_fks = set(fk_column)

    # Report both dtypes so a type mismatch between the two columns is visible
    spatial_id_dtype = hiking_spatial["local_id"].dtype
    attr_fk_dtype = fk_column.dtype

    print(f"Comparing: local_id [{spatial_id_dtype}] with hiking_trail_fk [{attr_fk_dtype}]")

    orphaned_fks = attr_fks - spatial_ids
    print(f"Orphaned foreign keys (in attributes but not spatial): {len(orphaned_fks)}")

    unreferenced_ids = spatial_ids - attr_fks
    print(f"Unreferenced geometries (in spatial but not attributes): {len(unreferenced_ids)}")

    if len(orphaned_fks) == 0 and len(unreferenced_ids) == 0:
        print("✓ Perfect referential integrity - all relationships are valid")

### 4.4 Data Quality Assessment

Evaluate the completeness and quality of the data across both layers.

In [None]:
# Data quality assessment across the spatial layer and the attribute table.
# Relies on trails and hiking_attrs from the schema cells and on multi_row_fks,
# orphaned_fks and unreferenced_ids from the foreign key cell.
print("DATA QUALITY ASSESSMENT")
print("=" * 70)

# Spatial layer quality
print("\n1. SPATIAL LAYER QUALITY")
print("-" * 50)

# Validity is evaluated once and reused for the counts below and for the
# summary score, instead of re-checking every geometry three times
geometry_is_valid = trails.geometry.is_valid
valid_total = geometry_is_valid.sum()

print("Geometry validation:")
print(f"  Valid geometries: {valid_total:,} / {len(trails):,}")
print(f"  Invalid geometries: {(~geometry_is_valid).sum()}")
print(f"  Empty geometries: {trails.geometry.is_empty.sum()}")
print(f"  Simple geometries: {trails.geometry.is_simple.sum():,} / {len(trails):,}")

# Check for duplicates
duplicate_geoms = trails.geometry.duplicated().sum()
print(f"  Duplicate geometries: {duplicate_geoms}")

# Field completeness in spatial layer: (column, dtype, percent non-null)
print("\nField completeness (spatial layer):")
spatial_completeness = [
    (col, trails[col].dtype, (trails[col].notna().sum() / len(trails)) * 100)
    for col in trails.columns
    if col != "geometry"
]

# Most complete columns first
spatial_completeness.sort(key=lambda x: x[2], reverse=True)

print("  Fully complete fields (100%):")
for col, dtype, pct in spatial_completeness:
    if pct == 100:
        print(f"    - {col} [{dtype}]")

print("  Partially populated fields (<100% & >=10%):")
partial_fields = [(col, dtype, pct) for col, dtype, pct in spatial_completeness if 100 > pct >= 10]
for col, dtype, pct in partial_fields:
    print(f"    - {col} [{dtype}]: {pct:.1f}%")

print("  Poorly populated fields (<10%):")
poor_fields = [(col, dtype, pct) for col, dtype, pct in spatial_completeness if pct < 10]
for col, dtype, pct in poor_fields:
    print(f"    - {col} [{dtype}]: {pct:.1f}%")

# Attribute table quality
print("\n2. ATTRIBUTE TABLE QUALITY")
print("-" * 50)

# Critical fields completeness
critical_fields = ["trail_name", "trail_number", "difficulty", "trail_type", "trail_significance", "maintenance_responsible", "hiking_trail_fk"]

print("Critical field completeness:")
for field in critical_fields:
    if field in hiking_attrs.columns:
        dtype = hiking_attrs[field].dtype
        non_null = hiking_attrs[field].notna().sum()
        completeness = (non_null / len(hiking_attrs)) * 100
        print(f"  {field} [{dtype}]: {completeness:.1f}%")

# Check for unnamed trails: missing names plus the literal placeholder "Ukjent"
if "trail_name" in hiking_attrs.columns:
    name_dtype = hiking_attrs["trail_name"].dtype
    unnamed = hiking_attrs["trail_name"].isna().sum()
    # The comparison already yields 0 when the placeholder never occurs, so no
    # separate membership scan is needed first
    unknown = (hiking_attrs["trail_name"] == "Ukjent").sum()
    total_unnamed = unnamed + unknown
    print(f"\nUnnamed/unknown trails in trail_name [{name_dtype}]: {total_unnamed:,} ({total_unnamed / len(hiking_attrs) * 100:.1f}%)")

# Overall assessment
print("\n3. OVERALL DATA QUALITY SUMMARY")
print("-" * 50)

# Each metric is a percentage; a metric whose column is absent scores 0
quality_metrics = {
    "Geometry validity": valid_total / len(trails) * 100,
    "No duplicate geometries": (len(trails) - duplicate_geoms) / len(trails) * 100,
    "Trail names present": (len(hiking_attrs) - total_unnamed) / len(hiking_attrs) * 100 if "trail_name" in hiking_attrs.columns else 0,
    "Trail numbers present": hiking_attrs["trail_number"].notna().sum() / len(hiking_attrs) * 100 if "trail_number" in hiking_attrs.columns else 0,
    "Maintenance responsible present": hiking_attrs["maintenance_responsible"].notna().sum() / len(hiking_attrs) * 100
    if "maintenance_responsible" in hiking_attrs.columns
    else 0,
    # All-or-nothing: any orphaned or unreferenced key scores 0
    "Referential integrity": 100 if len(orphaned_fks) == 0 and len(unreferenced_ids) == 0 else 0,
}

print("Quality metrics:")
for metric, score in quality_metrics.items():
    status = "✓" if score >= 95 else "⚠" if score >= 80 else "✗"
    print(f"  {status} {metric}: {score:.1f}%")

# Unweighted mean of all metrics
avg_quality = sum(quality_metrics.values()) / len(quality_metrics)
print(f"\nOverall quality score: {avg_quality:.1f}%")

# Data structure summary
print("\n4. DATA STRUCTURE INSIGHTS")
print("-" * 50)
print("✓ Normalized database design prevents geometry duplication")
print("✓ Many-to-one relationships allow multiple trail names per segment")
print("✓ Spatial layer contains minimal attributes (optimized for geometry)")
print("✓ Attribute table contains rich trail information")
print(f"✓ Total unique trail segments: {len(trails):,}")
print(f"✓ Total trail name assignments: {len(hiking_attrs):,}")
print(f"✓ Segments with multiple names: {len(multi_row_fks):,} ({len(multi_row_fks) / len(trails) * 100:.1f}%)")

## 5. Trail Analysis

Now let's analyze what the data tells us about the trails themselves - their characteristics, distribution, and patterns.

### 5.1 Spatial Distribution

Analyze where the trails are located and their spatial coverage.

In [None]:
# Analyze spatial extent, object types and segment lengths of the main trail layer
hiking_trail_layer = "hiking_trail_centerline" if trail_data.language == Language.EN else "fotrute_senterlinje"

if hiking_trail_layer in trail_data.spatial_layers:
    trails = trail_data.spatial_layers[hiking_trail_layer]

    print("Spatial Distribution Analysis")
    print("=" * 60)

    # Bounding box of all features, in the layer's CRS
    bounds = trails.total_bounds
    print("\nSpatial extent (minx, miny, maxx, maxy):")
    print(f"  {bounds}")
    print(f"\nCRS: {trails.crs}")

    # The object-type column is named differently in the untranslated data
    objtype_col = "object_type" if trail_data.language == Language.EN else "objtype"
    if objtype_col in trails.columns:
        print(f"\nDistribution by {objtype_col}:")
        type_stats = trails[objtype_col].value_counts()
        for trail_type, count in type_stats.items():
            print(f"  {trail_type}: {count} trails ({count / len(trails) * 100:.1f}%)")

    # NOTE(review): SHAPE_Length is treated as metres throughout (the original
    # note states EPSG:25833) - confirm against trails.crs printed above
    print("\nLength Analysis:")

    segment_lengths = trails["SHAPE_Length"]
    total_length = segment_lengths.sum()
    avg_length = segment_lengths.mean()
    median_length = segment_lengths.median()

    # Print in kilometers for readability
    print(f"  Total length: {total_length / 1000:,.1f} km")
    print(f"  Average trail length: {avg_length / 1000:.2f} km")
    print(f"  Median trail length: {median_length / 1000:.2f} km")

    # Show distribution of trail lengths
    print("\nTrail Length Distribution:")
    print(f"  Shortest trail: {segment_lengths.min() / 1000:.3f} km")
    print(f"  Longest trail: {segment_lengths.max() / 1000:.1f} km")

    # Length categories (bin edges in metres, labels in km).
    # right=False makes the bins left-closed: [0, 1000), [1000, 5000), ...
    # With the default right-closed bins a zero-length feature fell outside
    # every bin and a feature of exactly 1000 m was labelled "<1km".
    bins = [0, 1000, 5000, 10000, 20000, float("inf")]
    labels = ["<1km", "1-5km", "5-10km", "10-20km", ">20km"]
    trails["length_category"] = pd.cut(segment_lengths, bins=bins, labels=labels, right=False)

    print("\nTrails by length category:")
    for category in labels:
        count = (trails["length_category"] == category).sum()
        pct = count / len(trails) * 100
        print(f"  {category}: {count:,} trails ({pct:.1f}%)")

### 5.2 Trail Names & Types

Analyze trail naming patterns and classifications, along with maintenance responsibility and trail infrastructure (marking, signage, lighting).

In [None]:
# Trail names, types, maintenance and infrastructure analysis.
# Relies on hiking_attrs (attribute table) and trails (spatial layer) from the
# schema cells above. Sections are numbered 1-7 in the order they are printed.
from typing import Any


def _print_distribution(counts, total):
    """Print every value in *counts* with its count and its share of *total*."""
    for value, value_count in counts.items():
        pct = value_count / total * 100
        print(f"  {value}: {value_count:,} ({pct:.1f}%)")


print("TRAIL NAMES, TYPES AND INFRASTRUCTURE ANALYSIS")
print("=" * 70)

# Trail names
print("\n1. TRAIL NAMES")
print("-" * 50)

if "trail_name" in hiking_attrs.columns:
    trail_names = hiking_attrs["trail_name"]
    name_dtype = trail_names.dtype
    unique_names = trail_names.nunique()
    total_named = trail_names.notna().sum()
    unnamed = trail_names.isna().sum()

    print(f"Column: trail_name [{name_dtype}]")
    print(f"Unique trail names: {unique_names:,}")
    print(f"Named trail segments: {total_named:,} ({total_named / len(hiking_attrs) * 100:.1f}%)")
    print(f"Unnamed segments: {unnamed:,} ({unnamed / len(hiking_attrs) * 100:.1f}%)")

    # Most common trail names
    name_counts = trail_names.value_counts()

    print("\nTop 15 most common trail names:")
    # Items are indexed rather than unpacked; the annotations were added in the
    # original to keep mypy quiet
    for item in name_counts.head(15).items():
        trail_name_val: Any = item[0]
        trail_count: int = item[1]
        pct = trail_count / len(hiking_attrs) * 100
        print(f"  {trail_name_val}: {trail_count:,} segments ({pct:.2f}%)")

    # Trail name patterns
    print("\n2. TRAIL NAME PATTERNS")
    print("-" * 50)

    # Names containing at least one digit
    numbered_pattern = trail_names.str.contains(r"\d+", na=False).sum()
    print(f"Trails with numbers: {numbered_pattern:,}")

    # Number of names containing each keyword
    common_patterns = {
        keyword: trail_names.str.contains(keyword, na=False).sum()
        for keyword in ("Pilegrimsleden", "Kyststi", "DNT", "Tursti", "Rundtur")
    }

    print("Common trail name patterns:")
    for pattern, pattern_count in common_patterns.items():
        if pattern_count > 0:
            # Share is relative to named rows, not to all rows
            pct = pattern_count / total_named * 100
            print(f"  Contains '{pattern}': {pattern_count:,} segments ({pct:.2f}%)")

# Trail types
print("\n3. TRAIL TYPES")
print("-" * 50)

if "trail_type" in hiking_attrs.columns:
    type_dtype = hiking_attrs["trail_type"].dtype
    type_counts = hiking_attrs["trail_type"].value_counts()

    print(f"Column: trail_type [{type_dtype}]")
    print("Trail type distribution:")
    _print_distribution(type_counts, len(hiking_attrs))

# Special hiking trail types
if "special_hiking_trail_type" in hiking_attrs.columns:
    print("\n4. SPECIAL TRAIL TYPES")
    print("-" * 50)

    special_dtype = hiking_attrs["special_hiking_trail_type"].dtype
    special_counts = hiking_attrs["special_hiking_trail_type"].value_counts()

    print(f"Column: special_hiking_trail_type [{special_dtype}]")
    print("Special hiking trail classifications:")
    _print_distribution(special_counts, len(hiking_attrs))

# Trail significance
if "trail_significance" in hiking_attrs.columns:
    print("\n5. TRAIL SIGNIFICANCE")
    print("-" * 50)

    sig_dtype = hiking_attrs["trail_significance"].dtype
    sig_counts = hiking_attrs["trail_significance"].value_counts()

    print(f"Column: trail_significance [{sig_dtype}]")
    print("Trail significance levels:")
    _print_distribution(sig_counts, len(hiking_attrs))

# Maintenance responsible
if "maintenance_responsible" in hiking_attrs.columns:
    print("\n6. MAINTENANCE RESPONSIBLE")
    print("-" * 50)

    maintainers = hiking_attrs["maintenance_responsible"]
    maint_dtype = maintainers.dtype
    maint_counts = maintainers.value_counts()
    maint_total = maintainers.notna().sum()

    print(f"Column: maintenance_responsible [{maint_dtype}]")
    print(f"Trails with maintenance info: {maint_total:,} ({maint_total / len(hiking_attrs) * 100:.1f}%)")
    print(f"Unique maintenance organizations: {maintainers.nunique():,}")

    print("\nTop 10 maintenance organizations:")
    for item in maint_counts.head(10).items():
        org_name: Any = item[0]
        org_count: int = item[1]
        pct = org_count / len(hiking_attrs) * 100
        print(f"  {org_name}: {org_count:,} segments ({pct:.1f}%)")

    # Rough grouping of organizations by keywords in their name
    print("\nOrganization type patterns:")
    org_patterns = {
        "DNT": maintainers.str.contains("DNT|Turistforening", na=False).sum(),
        "Kommune": maintainers.str.contains("kommune|Kommune", na=False).sum(),
        "Nasjonalt pilegrimssenter": maintainers.str.contains("Nasjonalt pilegrimssenter", na=False).sum(),
        # "IL" only counts as a standalone word; as a bare substring it also
        # matched any name that merely contains the capital letters "IL"
        "Idrettslag": maintainers.str.contains(r"idrettslag|Idrettslag|\bIL\b", na=False).sum(),
        "Turlag": maintainers.str.contains("turlag|Turlag", na=False).sum(),
    }

    for pattern, pattern_total in org_patterns.items():
        if pattern_total > 0:
            # Share is relative to rows that have maintenance info
            pct = pattern_total / maint_total * 100
            print(f"  Contains '{pattern}': {pattern_total:,} trails ({pct:.1f}%)")

# Marking, signage and lighting (from spatial layer)
print("\n7. TRAIL INFRASTRUCTURE")
print("-" * 50)

# (column, heading) pairs; every heading after the first starts on a new line
for infra_column, infra_heading in (
    ("marking", "Segments with marking info"),
    ("signage", "\nSegments with signage info"),
    ("lighting", "\nSegments with lighting"),
):
    if infra_column not in trails.columns:
        continue
    populated = trails[infra_column].notna().sum()
    print(f"{infra_heading}: {populated:,} ({populated / len(trails) * 100:.1f}%)")

    if populated > 0:
        _print_distribution(trails[infra_column].value_counts(), len(trails))

### 5.3 Difficulty & Classification

Analyze trail difficulty ratings and user classifications.

In [None]:
# Difficulty and classification analysis.
# Relies on hiking_attrs (attribute table) from the schema cells above.
print("DIFFICULTY & CLASSIFICATION ANALYSIS")
print("=" * 70)

# Difficulty distribution
print("\n1. DIFFICULTY RATINGS")
print("-" * 50)

if "difficulty" in hiking_attrs.columns:
    diff_counts = hiking_attrs["difficulty"].value_counts()
    diff_total = hiking_attrs["difficulty"].notna().sum()

    print(f"Trails with difficulty rating: {diff_total:,} ({diff_total / len(hiking_attrs) * 100:.1f}%)")
    print(f"Missing difficulty rating: {len(hiking_attrs) - diff_total:,}")

    # Each level is shown as a share of all rows and of rated rows only
    print("\nDifficulty distribution:")
    for difficulty, count in diff_counts.items():
        pct = count / len(hiking_attrs) * 100
        pct_rated = count / diff_total * 100
        print(f"  {difficulty}: {count:,} ({pct:.1f}% of all, {pct_rated:.1f}% of rated)")

# Cross-analysis: difficulty by trail type
if "difficulty" in hiking_attrs.columns and "trail_type" in hiking_attrs.columns:
    print("\n2. DIFFICULTY BY TRAIL TYPE")
    print("-" * 50)

    # Only rows where both fields are filled in
    both_fields = hiking_attrs[hiking_attrs["difficulty"].notna() & hiking_attrs["trail_type"].notna()]

    if len(both_fields) > 0:
        print(f"Trails with both difficulty and type: {len(both_fields):,}")

        # Rows: trail types, columns: difficulty levels, cells: record counts
        cross_tab = pd.crosstab(both_fields["trail_type"], both_fields["difficulty"])

        # Rank trail types by their number of rated records so the five shown
        # are the most common ones; the crosstab index alone is in sorted
        # label order, which is what the plain [:5] slice used to pick
        top_trail_types = cross_tab.sum(axis=1).nlargest(5).index

        print("\nDifficulty distribution by trail type:")
        for trail_type in top_trail_types:
            print(f"\n{trail_type}:")
            row_total = cross_tab.loc[trail_type].sum()
            for difficulty in cross_tab.columns:
                count = cross_tab.loc[trail_type, difficulty]
                if count > 0:
                    pct = count / row_total * 100
                    print(f"  {difficulty}: {count} ({pct:.1f}%)")

## 6. Visualization

In [None]:
# Draw every feature of the main trail layer on a single map
if trail_data.language == Language.EN:
    hiking_trail_layer = "hiking_trail_centerline"
else:
    hiking_trail_layer = "fotrute_senterlinje"

if hiking_trail_layer in trail_data.spatial_layers:
    trails = trail_data.spatial_layers[hiking_trail_layer]

    fig, ax = plt.subplots(figsize=(12, 10))

    # Thin, semi-transparent lines in one color
    line_style = {"linewidth": 0.5, "alpha": 0.6, "color": "darkgreen"}
    trails.plot(ax=ax, **line_style)

    # Title and axis labels
    feature_total = len(trails)
    ax.set_title(f"Norwegian Hiking Trails ({feature_total} features)", fontsize=14)
    ax.set_xlabel("X Coordinate")
    ax.set_ylabel("Y Coordinate")
    ax.grid(True, alpha=0.3)

    # CRS note placed in axes coordinates near the top-left corner
    crs_note = f"CRS: {trails.crs}"
    ax.text(0.02, 0.98, crs_note, transform=ax.transAxes, fontsize=10, verticalalignment="top")

    plt.tight_layout()
    plt.show()

## 7. DNT Trail Analysis

Analyze trails maintained by DNT (Den Norske Turistforening - Norwegian Trekking Association) and its member organizations.

In [None]:
# Find the trail segments maintained by DNT and measure how much of the network they cover
print("DNT TRAIL SEGMENT ANALYSIS")
print("=" * 70)

# Resolve the layers again here so this cell does not rely on earlier cells having run
is_english = trail_data.language == Language.EN
hiking_trail_layer = "hiking_trail_centerline" if is_english else "fotrute_senterlinje"
hiking_attrs_layer = "hiking_trail_info_table" if is_english else "fotrute_info_tabell"
trails = trail_data.spatial_layers[hiking_trail_layer]
hiking_attrs = trail_data.attribute_tables[hiking_attrs_layer]

# A record counts as DNT-maintained when the maintainer name contains "DNT" or
# "Turistforening" (case-insensitive); records without a maintainer never match.
is_dnt_maintained = hiking_attrs["maintenance_responsible"].str.contains("DNT|Turistforening", case=False, na=False)
dnt_attrs = hiking_attrs[is_dnt_maintained]

print("\n1. DNT TRAIL RECORDS IN ATTRIBUTE TABLE")
print("-" * 50)
print(f"Total DNT-maintained trail records: {len(dnt_attrs):,}")
dnt_record_share = len(dnt_attrs) / len(hiking_attrs) * 100
print(f"Percentage of all trails: {dnt_record_share:.1f}%")

# Distinct foreign keys: each one refers to a segment in the spatial layer
unique_dnt_segment_ids = dnt_attrs["hiking_trail_fk"].unique()
n_dnt_segment_ids = len(unique_dnt_segment_ids)
print(f"\nUnique DNT-maintained segments: {n_dnt_segment_ids:,}")
print(f"Percentage of all unique segments: {n_dnt_segment_ids / len(trails) * 100:.1f}%")

# Pull the matching geometries out of the spatial layer
dnt_segments = trails[trails["local_id"].isin(unique_dnt_segment_ids)]

print("\n2. DNT SEGMENT LENGTH ANALYSIS")
print("-" * 50)
print(f"DNT segments found in spatial layer: {len(dnt_segments):,}")

# Lengths are divided by 1000 and reported as km, i.e. SHAPE_Length is treated as metres
dnt_lengths = dnt_segments["SHAPE_Length"]
total_dnt_length = dnt_lengths.sum()
print(f"Total length of DNT segments: {total_dnt_length / 1000:,.1f} km")
print(f"Percentage of total trail length: {total_dnt_length / trails['SHAPE_Length'].sum() * 100:.1f}%")

# Typical segment size
avg_dnt_length = dnt_lengths.mean()
median_dnt_length = dnt_lengths.median()
print(f"\nAverage DNT segment length: {avg_dnt_length / 1000:.2f} km")
print(f"Median DNT segment length: {median_dnt_length / 1000:.2f} km")

# DNT organization breakdown
print("\n3. DNT ORGANIZATIONS")
print("-" * 50)
# value_counts() yields one row per distinct organization, most frequent first
dnt_org_counts = dnt_attrs["maintenance_responsible"].value_counts()
# Count the rows (one per organization). Calling .nunique() on the counts would
# instead count distinct record totals and under-report the number of
# organizations whenever two of them have the same total.
print(f"Unique DNT organizations: {len(dnt_org_counts)}")

print("\nTop 10 DNT organizations:")
for org, count in dnt_org_counts.head(10).items():
    # Share of all DNT-maintained attribute records
    pct = count / len(dnt_attrs) * 100
    print(f"  {org}: {count:,} records ({pct:.1f}%)")

# Compare the "origin" recorded on each segment with who maintains it
print("\n4. ⚠️ DNT ORIGIN VS MAINTENANCE DISCREPANCY ANALYSIS")
print("-" * 50)

if "origin" not in trails.columns:
    print("  'origin' column not found in spatial layer")
else:
    # Same name pattern as for the maintainer field; missing origins never match
    has_dnt_origin = trails["origin"].str.contains("DNT|Turistforening", case=False, na=False)
    dnt_origin_segments = trails[has_dnt_origin]
    print(f"Segments with DNT in origin field: {len(dnt_origin_segments):,}")

    # Set algebra on segment ids splits the two groups into overlap and leftovers
    dnt_origin_ids = set(dnt_origin_segments["local_id"])
    dnt_maintained_ids = set(unique_dnt_segment_ids)

    both_dnt = dnt_origin_ids.intersection(dnt_maintained_ids)
    dnt_origin_not_maintained = dnt_origin_ids.difference(dnt_maintained_ids)
    dnt_maintained_not_origin = dnt_maintained_ids.difference(dnt_origin_ids)

    print("\nDiscrepancy Breakdown:")
    print(f"  ✓ DNT origin AND DNT maintained: {len(both_dnt):,} segments")
    print(f"  ⚠️ DNT origin but NOT DNT maintained: {len(dnt_origin_not_maintained):,} segments")
    print(f"  📊 DNT maintained but NOT DNT origin: {len(dnt_maintained_not_origin):,} segments")

    if dnt_origin_not_maintained:
        print(f"\n⚠️ DISCREPANCY FOUND: {len(dnt_origin_not_maintained):,} segments with DNT origin are maintained by others!")

        # Geometries and attribute records of the DNT-origin segments that have
        # no DNT-matching maintainer record
        discrepancy_segments = trails[trails["local_id"].isin(dnt_origin_not_maintained)]
        discrepancy_attrs = hiking_attrs[hiking_attrs["hiking_trail_fk"].isin(dnt_origin_not_maintained)]

        if len(discrepancy_attrs) == 0:
            print("    (No maintenance info found for these segments)")
        else:
            print("\nWho maintains the DNT-origin segments:")
            maint_counts = discrepancy_attrs["maintenance_responsible"].value_counts()
            for maintainer, n_records in maint_counts.items():
                print(f"  • {maintainer}: {n_records:,} records")

            # Length figures come from the spatial layer, reported in km
            discrepancy_lengths = discrepancy_segments["SHAPE_Length"]
            discrepancy_length = discrepancy_lengths.sum()
            print(f"\nTotal length of DNT-origin segments maintained by others: {discrepancy_length / 1000:,.1f} km")

            print("\nDetails of discrepancy segments:")
            print(f"  Average length: {discrepancy_lengths.mean() / 1000:.2f} km")
            print(f"  Total segments: {len(discrepancy_segments)}")

In [None]:
# Attach the DNT attribute records to the DNT segment geometries.
# The result has one row per (segment, attribute record) pair.
print("CREATING JOINED DNT GEODATAFRAME")
print("=" * 70)

# Inner join on the foreign key: a segment referenced by several attribute
# records appears once per record, so geometries can repeat.
dnt_joined_gdf = dnt_segments.merge(dnt_attrs, how="inner", left_on="local_id", right_on="hiking_trail_fk")

print("\n1. JOINED DATA STATISTICS")
print("-" * 50)
print(f"Total rows in joined GeoDataFrame: {len(dnt_joined_gdf):,}")
unique_joined_segments = dnt_joined_gdf["local_id"].nunique()
print(f"Unique segments: {unique_joined_segments:,}")
# Average number of joined rows per distinct segment
print(f"Duplicate factor: {len(dnt_joined_gdf) / unique_joined_segments:.2f}x")

# How many joined records carry a trail name, and which names dominate
print("\n2. DNT TRAIL NAMES")
print("-" * 50)
joined_trail_names = dnt_joined_gdf["trail_name"]
named_dnt = joined_trail_names.notna().sum()
print(f"Named DNT trail records: {named_dnt:,} ({named_dnt / len(dnt_joined_gdf) * 100:.1f}%)")
print(f"Unique trail names: {joined_trail_names.nunique():,}")

print("\nTop 10 DNT trail names:")
dnt_trail_names = joined_trail_names.value_counts()
for name, count in dnt_trail_names.head(10).items():  # type: ignore[assignment]
    if pd.isna(name):
        continue
    pct = count / len(dnt_joined_gdf) * 100
    print(f"  {name}: {count:,} records ({pct:.1f}%)")

# Difficulty ratings; percentages are relative to all joined records
print("\n3. DNT TRAIL DIFFICULTY")
print("-" * 50)
if "difficulty" in dnt_joined_gdf.columns:
    joined_difficulty = dnt_joined_gdf["difficulty"]
    diff_counts = joined_difficulty.value_counts()
    diff_total = joined_difficulty.notna().sum()

    print(f"DNT trails with difficulty rating: {diff_total:,} ({diff_total / len(dnt_joined_gdf) * 100:.1f}%)")

    print("\nDifficulty distribution:")
    for difficulty, n_records in diff_counts.items():
        share = n_records / len(dnt_joined_gdf) * 100
        print(f"  {difficulty}: {n_records:,} ({share:.1f}%)")

# Both frames stay available to later cells
print(f"\n✓ Created joined GeoDataFrame 'dnt_joined_gdf' with {len(dnt_joined_gdf):,} records")
print(f"✓ Created unique segments GeoDataFrame 'dnt_segments' with {len(dnt_segments):,} segments")

# Trailing bare expression so the notebook renders the joined frame
dnt_joined_gdf

In [None]:
# Two side-by-side maps: DNT segments over the full network, and DNT segments alone
print("VISUALIZING DNT TRAILS")
print("=" * 70)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))

# Left panel: the whole network in light gray with the DNT segments drawn on top in red
print("\n1. DNT trails highlighted among all trails")
ax1.set_title("DNT-Maintained Trails (Highlighted in Red)", fontsize=14, fontweight="bold")
trails.plot(ax=ax1, color="lightgray", label="Other trails", linewidth=0.3, alpha=0.3)
dnt_segments.plot(ax=ax1, color="red", label="DNT trails", linewidth=0.8, alpha=0.8)

# Right panel: DNT segments only
ax2.set_title("DNT-Maintained Trails Only", fontsize=14, fontweight="bold")
dnt_segments.plot(ax=ax2, color="darkblue", linewidth=0.6, alpha=0.7)

# Shared axis decoration.
# NOTE(review): the labels hard-code EPSG:25833 — confirm this matches dnt_segments.crs
for panel in (ax1, ax2):
    panel.set_xlabel("X Coordinate (EPSG:25833)")
    panel.set_ylabel("Y Coordinate (EPSG:25833)")
    panel.grid(True, alpha=0.3)
ax1.legend(loc="upper right")

# Summary box in the top-left corner of the left panel
stats_text = (
    f"DNT Segments: {len(dnt_segments):,}\n"
    f"Total Length: {total_dnt_length / 1000:,.0f} km\n"
    f"({len(dnt_segments) / len(trails) * 100:.1f}% of all segments)"
)
stats_box = {"boxstyle": "round", "facecolor": "white", "alpha": 0.8}
ax1.text(0.02, 0.98, stats_text, transform=ax1.transAxes, fontsize=10, verticalalignment="top", bbox=stats_box)

# CRS note in the top-left corner of the right panel
ax2.text(
    0.02,
    0.98,
    f"CRS: {dnt_segments.crs}",
    transform=ax2.transAxes,
    fontsize=10,
    verticalalignment="top",
)

plt.tight_layout()
plt.show()

print(f"✓ Visualized {len(dnt_segments):,} unique DNT segments")
print(f"✓ Total length: {total_dnt_length / 1000:,.1f} km")

In [None]:
# Comprehensive DNT trail summary: headline metrics and organization breakdown
print("DNT TRAIL ANALYSIS SUMMARY")
print("=" * 70)

print("\n📊 KEY METRICS")
print("-" * 50)
print(f"Unique DNT-maintained segments: {len(dnt_segments):,}")
print(f"Total DNT records (with duplicates): {len(dnt_joined_gdf):,}")
print(f"Total length (unique segments): {total_dnt_length / 1000:,.1f} km")
print(f"Coverage: {len(dnt_segments) / len(trails) * 100:.1f}% of all trail segments")

print("\n🏢 ORGANIZATION BREAKDOWN")
print("-" * 50)
# dnt_org_counts holds one row per organization, so its length is the number
# of organizations. Calling .nunique() on it would count distinct record
# totals instead and under-report when two organizations share a total.
print(f"Total DNT organizations: {len(dnt_org_counts)}")

# Split records by whether the maintainer is exactly "DNT" or any other matching name
main_dnt = dnt_attrs[dnt_attrs["maintenance_responsible"] == "DNT"]
local_dnt = dnt_attrs[dnt_attrs["maintenance_responsible"] != "DNT"]

print(f"\nMain DNT organization: {len(main_dnt):,} records")
print(f"Local DNT associations: {len(local_dnt):,} records")

# Top local associations by number of attribute records
print("\nTop 5 local DNT associations:")
local_org_counts = local_dnt["maintenance_responsible"].value_counts()
for org, count in local_org_counts.head(5).items():
    print(f"  • {org}: {count:,} records")

print("\n📏 LENGTH STATISTICS")
print("-" * 50)
# The minimum is printed as-is in m; the other figures are divided by 1000 for km
print(f"Average segment length: {avg_dnt_length / 1000:.2f} km")
print(f"Median segment length: {median_dnt_length / 1000:.2f} km")
print(f"Shortest segment: {dnt_segments['SHAPE_Length'].min():.1f} m")
print(f"Longest segment: {dnt_segments['SHAPE_Length'].max() / 1000:.1f} km")

# Length distribution: bin edges use the same unit as SHAPE_Length
length_bins = [0, 100, 500, 1000, 5000, 10000, float("inf")]
length_labels = ["<100m", "100-500m", "500m-1km", "1-5km", "5-10km", ">10km"]
# dnt_segments is a filtered slice of `trails`; assign() builds a new frame with
# the extra column instead of writing into that slice (the pattern that raises
# SettingWithCopyWarning). Rebinding the name keeps 'length_bin' available to
# later cells. include_lowest=True puts zero-length segments into the first
# bin rather than dropping them from the distribution.
dnt_segments = dnt_segments.assign(
    length_bin=pd.cut(
        dnt_segments["SHAPE_Length"],
        bins=length_bins,
        labels=length_labels,
        include_lowest=True,
    )
)

print("\nLength distribution:")
for bin_label in length_labels:
    count = (dnt_segments["length_bin"] == bin_label).sum()
    pct = count / len(dnt_segments) * 100
    print(f"  {bin_label}: {count:,} segments ({pct:.1f}%)")

print("\n🏔️ TRAIL CHARACTERISTICS")
print("-" * 50)

# Marking status comes from the unique segments; percentages are per segment
if "marking" in dnt_segments.columns:
    n_segments = len(dnt_segments)
    marking_counts = dnt_segments["marking"].value_counts()
    print("\nMarking status:")
    for marking, n_marked in marking_counts.items():
        print(f"  {marking}: {n_marked:,} ({n_marked / n_segments * 100:.1f}%)")

# Difficulty coverage comes from the joined records (segments can repeat there)
if "difficulty" in dnt_joined_gdf.columns:
    print("\nDifficulty ratings (from all DNT records):")
    diff_with_rating = dnt_joined_gdf["difficulty"].notna().sum()
    rated_share = diff_with_rating / len(dnt_joined_gdf) * 100
    print(f"  Records with difficulty: {diff_with_rating:,} ({rated_share:.1f}%)")

# Trail significance, also per joined record; skipped when no value is present
if "trail_significance" in dnt_joined_gdf.columns:
    sig_counts = dnt_joined_gdf["trail_significance"].value_counts()
    if not sig_counts.empty:
        print("\nTrail significance:")
        for sig, n_records in sig_counts.items():
            share = n_records / len(dnt_joined_gdf) * 100
            print(f"  {sig}: {n_records:,} ({share:.1f}%)")

print("\n" + "=" * 70)
print("✓ DNT TRAIL ANALYSIS COMPLETE")
print("✓ Data available in: 'dnt_segments' (unique) and 'dnt_joined_gdf' (with duplicates)")