# Norwegian Trails Data Exploration

**Goal**: Understand the actual structure and content of Turrutebasen data from Geonorge before building abstractions.

**Data Source**: Geonorge/Kartverket - Norwegian government's official trail database (Turrutebasen)

In [None]:
# Setup and imports
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

# Our minimal modules
from trails.io.sources.geonorge import Source as GeonorgeSource
from trails.io.sources.language import Language

# Silence every warning category for the remainder of the session
warnings.filterwarnings("ignore")

# Project paths: step up one level when the kernel was started inside notebooks/
current_dir = Path.cwd()
project_root = current_dir.parent if current_dir.name == "notebooks" else current_dir
cache_dir = project_root / ".cache"

# Display settings: pandas output limits and the default figure size
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.max_rows", 100),
    ("display.max_colwidth", 50),
):
    pd.set_option(option_name, option_value)
plt.rcParams["figure.figsize"] = (12, 8)

# Echo the resolved locations so a wrong working directory is spotted early
print("Setup complete!")
print(f"Project root: {project_root}")
print(f"Cache directory: {cache_dir}")

## 1. Load Geonorge Data

First, let's load the data and see what we're working with.

In [None]:
# Create the Geonorge source (backed by the cache directory) and load the dataset
geonorge = GeonorgeSource(cache_dir=str(cache_dir))

# Load with English translations (use Language.NO for Norwegian)
trail_data = geonorge.load_turrutebasen(language=Language.EN)

# Dataset-level metadata, one "<label>: <value>" line per field
print("Data Source Information:")
print("=" * 60)
for label, attr_name in (
    ("Dataset", "dataset_name"),
    ("Provider", "provider"),
    ("License", "license"),
    ("Dataset ID", "dataset_id"),
    ("Description", "description"),
    ("Attribution", "attribution"),
):
    print(f"{label}: {getattr(trail_data.metadata, attr_name)}")

# Details of this particular load
print(f"\nData loaded successfully from: {trail_data.source_url}")
print(f"Version: {trail_data.version}")
print(f"Language: {trail_data.language.value}")
print(f"CRS: {trail_data.crs}")
print(f"Total features: {trail_data.total_features}")

# Layers and columns are now automatically translated
spatial_layers = trail_data.spatial_layers
print(f"\nFound {len(spatial_layers)} spatial layers:")
for name, layer in spatial_layers.items():
    print(f"  - {name}: {len(layer)} features")

attribute_tables = trail_data.attribute_tables
print(f"\nFound {len(attribute_tables)} attribute tables:")
for name, table in attribute_tables.items():
    print(f"  - {name}: {len(table)} rows")

## 2. Complete Layer Analysis

Let's examine each layer in detail, analyzing both structure and columns.

In [None]:
# Complete analysis of all layers
#
# For each spatial layer this cell prints the shape and CRS, the geometry
# types and bounds, and a per-column profile (dtype, completeness and a
# dtype-specific summary). A compact record per layer is kept in
# `layer_summary`.
print("=" * 80)
print("SPATIAL LAYERS ANALYSIS")
print("=" * 80)

layer_summary = {}

for layer_name, gdf in trail_data.spatial_layers.items():
    print(f"\n{'=' * 60}")
    print(f"LAYER: {layer_name}")
    print(f"{'=' * 60}")

    total_rows = len(gdf)

    # Basic info
    print("\nBasic Information:")
    print(f"  Shape: {gdf.shape}")
    print(f"  CRS: {gdf.crs}")

    # Geometry info
    # Reset on every iteration: otherwise a layer that skips the branch below
    # would either raise NameError or report the previous layer's types.
    geom_types = []
    print("\nGeometry Information:")
    if not gdf.empty and "geometry" in gdf.columns:
        geom_types = gdf.geometry.geom_type.unique()
        print(f"  Geometry types: {geom_types}")
        print(f"  Total bounds: {gdf.total_bounds}")

    # Column analysis
    columns = [col for col in gdf.columns if col != "geometry"]
    print(f"\nColumn Analysis ({len(columns)} columns):")

    for col in columns:
        series = gdf[col]
        dtype = series.dtype
        non_null_count = series.notna().sum()
        # An empty layer has no rows to divide by; report 0% rather than NaN
        complete_pct = non_null_count / total_rows * 100 if total_rows else 0.0

        print(f"\n  {col}:")
        print(f"    Type: {dtype}")
        print(f"    Non-null: {non_null_count}/{total_rows} ({complete_pct:.1f}% complete)")

        if non_null_count == 0:
            # Nothing to summarise for an all-null column
            continue

        # For categorical/text columns (object dtype or pandas string dtype)
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            unique_count = series.nunique()
            print(f"    Unique values: {unique_count}")

            if unique_count <= 5:
                values = series.dropna().unique()
                # Values are already expanded and translated
                print(f"    Values: {list(values)}")
            else:
                top_values = series.value_counts().head(5)
                print("    Top 5 values:")
                for val, count in top_values.items():
                    # Values are already expanded and translated
                    print(f"      - {val}: {count}")

        # For numeric columns (any integer/float width; booleans are excluded)
        elif pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype):
            print(f"    Min: {series.min()}")
            print(f"    Max: {series.max()}")
            mean_val = series.mean()
            if not pd.isna(mean_val):
                print(f"    Mean: {mean_val:.2f}")

        # For datetime columns (any precision, timezone-aware or naive)
        elif pd.api.types.is_datetime64_any_dtype(dtype):
            print(f"    Min: {series.min()}")
            print(f"    Max: {series.max()}")

    # Store summary
    layer_summary[layer_name] = {
        "shape": gdf.shape,
        "crs": str(gdf.crs),
        "columns": list(gdf.columns),
        "geometry_types": list(geom_types),
    }

In [None]:
# Per-column profile of every non-spatial attribute table: dtype,
# completeness and a dtype-specific summary.
print("\n" + "=" * 80)
print("ATTRIBUTE TABLES ANALYSIS")
print("=" * 80)

for table_name, df in trail_data.attribute_tables.items():
    print(f"\n{'=' * 60}")
    print(f"TABLE: {table_name}")
    print(f"{'=' * 60}")

    total_rows = len(df)

    # Basic info
    print("\nBasic Information:")
    print(f"  Shape: {df.shape}")

    # Column analysis
    print(f"\nColumn Analysis ({len(df.columns)} columns):")

    for col in df.columns:
        series = df[col]
        dtype = series.dtype
        non_null_count = series.notna().sum()
        # An empty table has no rows to divide by; report 0% rather than NaN
        complete_pct = non_null_count / total_rows * 100 if total_rows else 0.0

        print(f"\n  {col}:")
        print(f"    Type: {dtype}")
        print(f"    Non-null: {non_null_count}/{total_rows} ({complete_pct:.1f}% complete)")

        if non_null_count == 0:
            # Nothing to summarise for an all-null column
            continue

        # For categorical/text columns (object dtype or pandas string dtype)
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            unique_count = series.nunique()
            print(f"    Unique values: {unique_count}")

            if unique_count <= 5:
                values = series.dropna().unique()
                # Values are already expanded and translated
                print(f"    Values: {list(values)}")
            else:
                top_values = series.value_counts().head(5)
                print("    Top 5 values:")
                for val, count in top_values.items():
                    # Values are already expanded and translated
                    print(f"      - {val}: {count}")

        # For numeric columns (any integer/float width; booleans are excluded)
        elif pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype):
            print(f"    Min: {series.min()}")
            print(f"    Max: {series.max()}")
            mean_val = series.mean()
            if not pd.isna(mean_val):
                print(f"    Mean: {mean_val:.2f}")

        # For datetime columns (any precision, timezone-aware or naive)
        elif pd.api.types.is_datetime64_any_dtype(dtype):
            print(f"    Min: {series.min()}")
            print(f"    Max: {series.max()}")

## 3. Identify Trail Layers

Let's identify which layers contain actual trail data (LineString or MultiLineString geometries) and which contain points.

In [None]:
# Sort every spatial layer into one of three buckets by the geometry types it
# contains: lines (trails), points, or anything else (including empty layers).
trail_layers = {}
point_layers = {}
other_layers = {}

line_kinds = {"LineString", "MultiLineString"}
point_kinds = {"Point", "MultiPoint"}

for layer_name, gdf in trail_data.spatial_layers.items():
    if gdf.empty:
        # No geometries to inspect
        bucket = other_layers
    else:
        # Check geometry type; line types take precedence over point types
        geom_types = gdf.geometry.geom_type.unique()
        present_kinds = set(geom_types)
        if not line_kinds.isdisjoint(present_kinds):
            bucket = trail_layers
        elif not point_kinds.isdisjoint(present_kinds):
            bucket = point_layers
        else:
            bucket = other_layers
    bucket[layer_name] = gdf

print("Layer Classification:")
print("=" * 60)

# Line and point buckets are always reported, even when empty
for heading, layers in (
    ("Trail Layers (Lines)", trail_layers),
    ("Point Layers (Facilities)", point_layers),
):
    print(f"\n{heading}: {len(layers)}")
    for name, gdf in layers.items():
        print(f"  - {name}: {len(gdf)} features")

# The catch-all bucket is only reported when it has content
if other_layers:
    print(f"\nOther Layers: {len(other_layers)}")
    for name, gdf in other_layers.items():
        print(f"  - {name}: {len(gdf)} features")

## 4. Focus on Hiking Trail Layer

Let's analyze the hiking trail layer in detail.

In [None]:
# Assess data quality for the main trail layer
# Note: layer name depends on language - "hiking_trail_centerline" in English;
# every other language falls back to the Norwegian name.
hiking_trail_layer = "hiking_trail_centerline" if trail_data.language == Language.EN else "fotrute_senterlinje"

if hiking_trail_layer in trail_data.spatial_layers:
    trails = trail_data.spatial_layers[hiking_trail_layer]
    total_rows = len(trails)

    print("Data Quality Report")
    print("=" * 60)

    # Completeness analysis: one record per attribute column
    completeness_data = []
    for col in trails.columns:
        if col == "geometry":
            continue
        non_null = trails[col].notna().sum()
        completeness_data.append(
            {
                "Field": col,
                "Non_Null": non_null,
                "Null": total_rows - non_null,
                # An empty layer has no rows to divide by; report 0% rather than NaN
                "Completeness_%": (non_null / total_rows) * 100 if total_rows else 0.0,
            }
        )

    # Explicit columns keep the frame well-formed (and sortable) even when the
    # layer has no attribute columns at all.
    completeness_df = pd.DataFrame(
        completeness_data,
        columns=["Field", "Non_Null", "Null", "Completeness_%"],
    )
    completeness_df = completeness_df.sort_values("Completeness_%", ascending=False)

    print("\nField Completeness (Top 10 most complete):")
    print(completeness_df.head(10))

    print("\nFields with poor completeness (<50%):")
    poor_fields = completeness_df[completeness_df["Completeness_%"] < 50]
    if not poor_fields.empty:
        print(poor_fields)
    else:
        print("  All fields have >50% completeness")

    # Geometry quality
    print("\nGeometry Quality:")
    print(f"  Valid geometries: {trails.geometry.is_valid.sum()} / {total_rows}")
    print(f"  Empty geometries: {trails.geometry.is_empty.sum()}")
    print(f"  Simple geometries: {trails.geometry.is_simple.sum()} / {total_rows}")
else:
    # Say why nothing was reported instead of leaving the cell output blank
    available_layers = [name for name, _ in trail_data.spatial_layers.items()]
    print(f"Layer '{hiking_trail_layer}' not found. Available layers: {available_layers}")

## 5. Spatial Distribution Analysis

In [None]:
# Analyze spatial distribution for main trail layer
from trails.utils.geo import calculate_lengths_meters

# Layer name depends on language; every non-English language falls back to
# the Norwegian name.
hiking_trail_layer = "hiking_trail_centerline" if trail_data.language == Language.EN else "fotrute_senterlinje"

if hiking_trail_layer in trail_data.spatial_layers:
    trails = trail_data.spatial_layers[hiking_trail_layer]
    total_trails = len(trails)

    print("Spatial Distribution Analysis")
    print("=" * 60)

    # Get bounds
    bounds = trails.total_bounds
    print("\nSpatial extent (minx, miny, maxx, maxy):")
    print(f"  {bounds}")
    print(f"\nCRS: {trails.crs}")

    # Analyze by object_type if it exists (translated column name)
    objtype_col = "object_type" if trail_data.language == Language.EN else "objtype"
    if objtype_col in trails.columns:
        print(f"\nDistribution by {objtype_col}:")
        type_stats = trails[objtype_col].value_counts()
        for trail_type, count in type_stats.items():
            # Values are already expanded and translated
            print(f"  {trail_type}: {count} trails ({count / total_trails * 100:.1f}%)")

    # Calculate total length in meters (batch calculation over the whole layer)
    print("\nLength Analysis:")
    print("  Calculating lengths ...")

    # Work on a copy so the loaded layer is not modified by the extra columns.
    # NOTE(review): assumes calculate_lengths_meters returns one length in
    # meters per row of the frame - confirm against trails.utils.geo.
    trails_copy = trails.copy()
    trails_copy["length_meters"] = calculate_lengths_meters(trails_copy)
    lengths = trails_copy["length_meters"]

    total_length = lengths.sum()
    avg_length = lengths.mean()
    median_length = lengths.median()

    # Print in kilometers for readability
    print(f"  Total length: {total_length / 1000:,.1f} km")
    print(f"  Average trail length: {avg_length / 1000:.2f} km")
    print(f"  Median trail length: {median_length / 1000:.2f} km")

    # Show distribution of trail lengths
    print("\nTrail Length Distribution:")
    print(f"  Shortest trail: {lengths.min() / 1000:.3f} km")
    print(f"  Longest trail: {lengths.max() / 1000:.1f} km")

    # Length categories (using meters for calculation, displaying in km).
    # include_lowest=True closes the first interval on the left so that
    # zero-length features land in "<1km" instead of being left uncategorised.
    bins = [0, 1000, 5000, 10000, 20000, float("inf")]
    labels = ["<1km", "1-5km", "5-10km", "10-20km", ">20km"]
    trails_copy["length_category"] = pd.cut(lengths, bins=bins, labels=labels, include_lowest=True)

    print("\nTrails by length category:")
    category_counts = trails_copy["length_category"].value_counts()
    for category in labels:
        count = category_counts.get(category, 0)
        # Guard the percentage against an empty layer
        pct = count / total_trails * 100 if total_trails else 0.0
        print(f"  {category}: {count:,} trails ({pct:.1f}%)")
else:
    # Say why nothing was reported instead of leaving the cell output blank
    available_layers = [name for name, _ in trail_data.spatial_layers.items()]
    print(f"Layer '{hiking_trail_layer}' not found. Available layers: {available_layers}")

## 6. Visualization

In [None]:
# Create basic visualizations for main trail layer
# Layer name depends on language; every non-English language falls back to
# the Norwegian name.
hiking_trail_layer = "hiking_trail_centerline" if trail_data.language == Language.EN else "fotrute_senterlinje"

if hiking_trail_layer in trail_data.spatial_layers:
    trails = trail_data.spatial_layers[hiking_trail_layer]

    # Figure 1: map of all trails in the layer's native coordinates
    fig, ax = plt.subplots(figsize=(12, 10))

    # Plot trails with a simple color
    trails.plot(ax=ax, linewidth=0.5, alpha=0.6, color="darkgreen")

    ax.set_title(f"Norwegian Hiking Trails ({len(trails)} features)", fontsize=14)
    ax.set_xlabel("X Coordinate")
    ax.set_ylabel("Y Coordinate")
    ax.grid(True, alpha=0.3)

    # Add CRS info in the top-left corner (axes-relative coordinates)
    ax.text(
        0.02,
        0.98,
        f"CRS: {trails.crs}",
        transform=ax.transAxes,
        fontsize=10,
        verticalalignment="top",
    )

    plt.tight_layout()
    plt.show()

    # Figure 2: if we have object type, show its distribution
    objtype_col = "object_type" if trail_data.language == Language.EN else "objtype"
    if objtype_col in trails.columns:
        type_counts = trails[objtype_col].value_counts()

        if not type_counts.empty:
            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

            # Bar chart
            type_counts.head(10).plot(kind="bar", ax=ax1, color="forestgreen")
            ax1.set_title("Trail Types Distribution (Top 10)")
            ax1.set_xlabel("Trail Type")
            ax1.set_ylabel("Count")
            ax1.tick_params(axis="x", rotation=45)

            # Pie chart for top 5 (percentages are relative to these five only)
            type_counts.head(5).plot(kind="pie", ax=ax2, autopct="%1.1f%%")
            ax2.set_title("Top 5 Trail Types")
            ax2.set_ylabel("")

            plt.tight_layout()
            plt.show()
else:
    # Say why nothing was drawn instead of leaving the cell output blank
    available_layers = [name for name, _ in trail_data.spatial_layers.items()]
    print(f"Layer '{hiking_trail_layer}' not found. Available layers: {available_layers}")