# Norwegian Trails Data Exploration

**Goal**: Understand the actual structure and content of Turrutebasen data from Geonorge before building abstractions.

**Data Source**: Geonorge/Kartverket - Norwegian government's official trail database (Turrutebasen)

In [None]:
# Setup and imports
import json
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

# Our minimal modules
from trails.io.sources.geonorge import Source as GeonorgeSource

# Suppress every warning raised from here on.
warnings.filterwarnings("ignore")

# Display settings: no column limit, at most 100 rows, 50 characters per cell
for option_name, option_value in (
    ("display.max_columns", None),
    ("display.max_rows", 100),
    ("display.max_colwidth", 50),
):
    pd.set_option(option_name, option_value)

# Default size applied to matplotlib figures created without an explicit figsize
plt.rcParams["figure.figsize"] = (12, 8)

print("Setup complete!")

## 1. Load Geonorge Data

First, let's load the data and see what we're working with.

In [None]:
# Create the Geonorge source object
geonorge = GeonorgeSource()

# Print a fixed description of the dataset (static text, not read from the
# source object)
print(
    "\n".join(
        [
            "Data Source Information:",
            "=" * 60,
            "Dataset: Turrutebasen",
            "Provider: Kartverket (Norwegian Mapping Authority)",
            "License: CC BY 4.0",
        ]
    )
)

In [None]:
# Load the Turrutebasen dataset through the Geonorge source.
# Per the original note: this will either load from cache or download from Geonorge.
try:
    trail_data = geonorge.load_turrutebasen()
    print("\nData loaded successfully!")

    # Report each collection on the loaded object: (attribute, label, unit)
    for attr_name, label, unit in (
        ("spatial_layers", "spatial layers", "features"),
        ("attribute_tables", "attribute tables", "rows"),
    ):
        collection = getattr(trail_data, attr_name)
        print(f"\nFound {len(collection)} {label}:")
        for item_name, item in collection.items():
            print(f"  - {item_name}: {len(item)} {unit}")

    # The remaining cells read the spatial layers through the name `data`
    data = trail_data.spatial_layers

except FileNotFoundError as err:
    print(f"\nData not found: {err}")
    print("\nThe data will be downloaded automatically on first run.")

## 2. Explore Layer Structure

Let's examine each layer in detail to understand what data we have.

In [None]:
# Print shape, CRS, geometry details and a two-row preview for every layer,
# collecting a small per-layer summary in `layer_summary` along the way.
if "data" not in locals() or not isinstance(data, dict):
    print("Please load the data first (run the cell above)")
else:
    rule = "=" * 60
    layer_summary = {}

    for layer_name, layer in data.items():
        # Banner for this layer
        print(f"\n{rule}")
        print(f"LAYER: {layer_name}")
        print(rule)

        print("\nBasic Information:")
        print(f"  Shape: {layer.shape}")
        print(f"  CRS: {layer.crs}")

        # Geometry details are only printed for non-empty layers that have a
        # column named "geometry"
        print("\nGeometry Information:")
        has_geometry = "geometry" in layer.columns and not layer.empty
        if has_geometry:
            geom_types = layer.geometry.geom_type.unique()
            print(f"  Geometry types: {geom_types}")
            print(f"  Total bounds: {layer.total_bounds}")

        # Record the summary for this layer
        layer_summary[layer_name] = dict(
            shape=layer.shape,
            crs=str(layer.crs),
            columns=list(layer.columns),
        )

        print("\nFirst 2 records:")
        print(layer.head(2))

## 3. Column Analysis

Let's analyze the columns in each layer to understand what attributes are available.

In [None]:
# Analyze columns for each layer
def describe_column(series):
    """Build the indented report lines describing one attribute column.

    Args:
        series: a pandas Series holding the values of a single column.

    Returns:
        A list of already-indented strings: the dtype, the non-null count with
        completeness percentage and, when the column has at least one non-null
        value, either unique-value details (text columns) or min/max/mean
        (numeric columns).
    """
    total = len(series)
    non_null = int(series.notna().sum())
    # An empty layer has no rows; report 0.0% instead of dividing by zero.
    complete_pct = non_null / total * 100 if total else 0.0
    dtype = series.dtype

    lines = [
        f"    Type: {dtype}",
        f"    Non-null: {non_null}/{total} ({complete_pct:.1f}% complete)",
    ]
    if non_null == 0:
        return lines

    # Text columns are either plain object columns or pandas' dedicated
    # string dtype.
    is_text = dtype == "object" or isinstance(dtype, pd.StringDtype)

    if is_text:
        # For categorical/text columns, show unique values if not too many
        unique_count = series.nunique()
        lines.append(f"    Unique values: {unique_count}")

        if unique_count <= 10:
            lines.append(f"    Values: {list(series.dropna().unique())}")
        elif unique_count <= 20:
            lines.append("    Top 5 values:")
            lines.extend(
                f"      - {val}: {count}"
                for val, count in series.value_counts().head(5).items()
            )
    elif pd.api.types.is_numeric_dtype(dtype):
        # Any numeric dtype qualifies (int32, float32, nullable Int64, ...),
        # not only int64/float64.
        lines.append(f"    Min: {series.min()}")
        lines.append(f"    Max: {series.max()}")
        lines.append(f"    Mean: {series.mean():.2f}")

    return lines


if "data" in locals() and isinstance(data, dict):
    for layer_name, gdf in data.items():
        print(f"\n{'=' * 60}")
        print(f"LAYER: {layer_name} - Column Analysis")
        print(f"{'=' * 60}")

        # Exclude geometry column
        columns = [col for col in gdf.columns if col != "geometry"]

        print(f"\nTotal columns (excluding geometry): {len(columns)}")
        print("\nColumn details:")

        for col in columns:
            print(f"\n  {col}:")
            for line in describe_column(gdf[col]):
                print(line)

## 4. Identify Trail Layers

Let's identify which layers contain actual trail data (LineString or MultiLineString geometries), and which contain point features.

In [None]:
# Sort every layer into one of three buckets by the geometry types it contains:
# lines -> trail_layers, points -> point_layers, everything else (including
# empty layers) -> other_layers.
if "data" in locals() and isinstance(data, dict):
    line_kinds = {"LineString", "MultiLineString"}
    point_kinds = {"Point", "MultiPoint"}
    trail_layers, point_layers, other_layers = {}, {}, {}

    for layer_name, layer in data.items():
        if layer.empty:
            bucket = other_layers
        else:
            # Line geometries take precedence over point geometries when a
            # layer mixes both.
            kinds_present = set(layer.geometry.geom_type.unique())
            if kinds_present & line_kinds:
                bucket = trail_layers
            elif kinds_present & point_kinds:
                bucket = point_layers
            else:
                bucket = other_layers
        bucket[layer_name] = layer

    print("Layer Classification:")
    print("=" * 60)

    # The line and point groups are always reported, even when empty
    for heading, group in (
        ("Trail Layers (Lines)", trail_layers),
        ("Point Layers (Facilities)", point_layers),
    ):
        print(f"\n{heading}: {len(group)}")
        for name, layer in group.items():
            print(f"  - {name}: {len(layer)} features")

    # The remaining group is reported only when it has members
    if other_layers:
        print(f"\nOther Layers: {len(other_layers)}")
        for name, layer in other_layers.items():
            print(f"  - {name}: {len(layer)} features")

## 5. Focus on Main Trail Layer

Let's analyze the main trail layer in detail.

In [None]:
# Select and analyze the main trail layer
def find_field_column(columns, patterns):
    """Return the column that best matches ``patterns``, or ``None``.

    Args:
        columns: iterable of column names (strings).
        patterns: lower-case name fragments, listed most-preferred first.

    Returns:
        The matching column name in its original spelling, or ``None`` when no
        column matches.

    Matching runs in two passes so that a precise hit beats a vague one:

    1. An exact, case-insensitive name match, trying the patterns in the order
       they are listed.
    2. Otherwise, the first column (in column order) whose lower-cased name
       contains any of the patterns.

    A column named "geometry" is never considered.
    """
    candidates = [col for col in columns if col != "geometry"]

    # Map lower-cased name -> original name; iterating in reverse makes the
    # first column win when two names differ only by case.
    by_lower_name = {col.lower(): col for col in reversed(candidates)}
    for pattern in patterns:
        if pattern in by_lower_name:
            return by_lower_name[pattern]

    for col in candidates:
        col_lower = col.lower()
        if any(pattern in col_lower for pattern in patterns):
            return col
    return None


if "trail_layers" in locals() and trail_layers:
    # Pick the trail layer with the most features (the first one wins a tie)
    main_layer_name = max(trail_layers, key=lambda name: len(trail_layers[name]))
    trails = trail_layers[main_layer_name]

    print(f"Selected main trail layer: {main_layer_name}")
    print(f"Total features: {len(trails)}")
    print("\nColumns in this layer:")
    for col in trails.columns:
        if col != "geometry":
            print(f"  - {col}")

    # Try to identify key fields based on common patterns
    print("\n" + "=" * 60)
    print("Attempting to identify key fields...")
    print("=" * 60)

    # Candidate name fragments per logical field, most-preferred first
    field_patterns = {
        "name": ["navn", "rutenavn", "name", "title"],
        "type": ["rutetype", "type", "kategori", "category", "spesial"],
        "difficulty": ["vanskgrad", "gradering", "difficulty", "grade"],
        "length": ["lengde", "length", "distance"],
        "municipality": ["kommune", "kommunenr", "municipality"],
        "county": ["fylke", "fylkesnr", "county"],
        "marking": ["merking", "marking", "skilting"],
        "maintenance": ["vedlikehold", "vedlikeholdsansvarlig", "maintenance", "eier"],
        "id": ["lokalid", "objectid", "id", "gid"],
    }

    found_fields = {}
    for field_type, patterns in field_patterns.items():
        col = find_field_column(trails.columns, patterns)
        if col is None:
            continue

        found_fields[field_type] = col
        print(f"  {field_type}: {col}")

        # Show sample values
        non_null = trails[col].dropna()
        if len(non_null) > 0:
            if trails[col].dtype == "object":
                unique = non_null.nunique()
                print(f"    → {unique} unique values")
                if unique <= 5:
                    print(f"    → Values: {list(non_null.unique())}")
            else:
                print(f"    → Range: {non_null.min()} to {non_null.max()}")

    print(f"\nIdentified {len(found_fields)} potential key fields")
else:
    print("No trail layers found. Please check the data.")

## 6. Data Quality Assessment

In [None]:
# Report per-field completeness and geometry health for the main trail layer
if "trails" in locals():
    print("Data Quality Report")
    print("=" * 60)

    # Completeness analysis: one record per non-geometry column
    row_count = len(trails)
    filled_counts = [
        (col, trails[col].notna().sum()) for col in trails.columns if col != "geometry"
    ]
    completeness_data = [
        {
            "Field": col,
            "Non_Null": filled,
            "Null": row_count - filled,
            "Completeness_%": (filled / row_count) * 100,
        }
        for col, filled in filled_counts
    ]

    # Most complete fields first
    completeness_df = pd.DataFrame(completeness_data).sort_values(
        "Completeness_%", ascending=False
    )

    print("\nField Completeness (Top 10 most complete):")
    print(completeness_df.head(10))

    print("\nFields with poor completeness (<50%):")
    poor_fields = completeness_df.loc[completeness_df["Completeness_%"] < 50]
    if poor_fields.empty:
        print("  All fields have >50% completeness")
    else:
        print(poor_fields)

    # Geometry quality: counts of valid, empty and simple geometries
    print("\nGeometry Quality:")
    geometries = trails.geometry
    valid_count = geometries.is_valid.sum()
    empty_count = geometries.is_empty.sum()
    simple_count = geometries.is_simple.sum()
    print(f"  Valid geometries: {valid_count} / {row_count}")
    print(f"  Empty geometries: {empty_count}")
    print(f"  Simple geometries: {simple_count} / {row_count}")

## 7. Spatial Distribution

In [None]:
# Analyze spatial distribution: extent, per-county / per-type counts, lengths
if "trails" in locals():
    print("Spatial Distribution Analysis")
    print("=" * 60)

    # Get bounds
    bounds = trails.total_bounds
    print("\nSpatial extent (minx, miny, maxx, maxy):")
    print(f"  {bounds}")
    print(f"\nCRS: {trails.crs}")

    # If we have administrative fields, analyze distribution
    if "found_fields" in locals():
        if "county" in found_fields:
            county_col = found_fields["county"]
            print(f"\nDistribution by {county_col}:")
            # Only the ten most frequent values are listed
            county_stats = trails[county_col].value_counts().head(10)
            for county, count in county_stats.items():
                print(f"  {county}: {count} trails ({count / len(trails) * 100:.1f}%)")

        if "type" in found_fields:
            type_col = found_fields["type"]
            print(f"\nDistribution by {type_col}:")
            type_stats = trails[type_col].value_counts()
            for trail_type, count in type_stats.items():
                print(f"  {trail_type}: {count} trails ({count / len(trails) * 100:.1f}%)")

    # Calculate total length
    print("\nLength Analysis:")
    if "found_fields" in locals() and "length" in found_fields:
        length_col = found_fields["length"]
        total_length = trails[length_col].sum()
        avg_length = trails[length_col].mean()
        print(f"  Total length from '{length_col}': {total_length:,.0f} (units unknown)")
        print(f"  Average length: {avg_length:,.0f}")
    else:
        print("  Calculating from geometry...")
        # Keep the lengths in a local Series. `trails` is the very object
        # stored in `trail_layers` / `data`, so adding a column to it would
        # leak a derived field into the completeness report and the export.
        calculated_length = trails.geometry.length

        # Warnings are globally silenced in this notebook, so state explicitly
        # when the lengths are not real-world distances.
        layer_crs = trails.crs
        if layer_crs is not None and layer_crs.is_geographic:
            print("  Note: CRS is geographic; lengths are in angular units, not distances")

        total_calc_length = calculated_length.sum()
        print(f"  Total calculated length: {total_calc_length:,.0f} (CRS units)")
        print(f"  Average calculated length: {calculated_length.mean():,.0f}")

## 8. Basic Visualization

In [None]:
# Draw a map of the main trail layer and, when a type field was identified,
# bar and pie charts of the trail-type counts.
if "trails" in locals():
    # Map of all trails
    map_fig, map_ax = plt.subplots(figsize=(12, 10))

    # Single colour, thin semi-transparent lines
    trails.plot(ax=map_ax, linewidth=0.5, alpha=0.6, color="darkgreen")

    map_title = f"Norwegian Trails from Geonorge ({len(trails)} features)"
    map_ax.set_title(map_title, fontsize=14)
    map_ax.set(xlabel="X Coordinate", ylabel="Y Coordinate")
    map_ax.grid(True, alpha=0.3)

    # CRS label in the top-left corner, positioned in axes coordinates
    crs_label = f"CRS: {trails.crs}"
    map_ax.text(
        0.02,
        0.98,
        crs_label,
        transform=map_ax.transAxes,
        fontsize=10,
        verticalalignment="top",
    )

    plt.tight_layout()
    plt.show()

    # Category charts, only when a type field is known
    if "found_fields" in locals() and "type" in found_fields:
        type_col = found_fields["type"]
        type_counts = trails[type_col].value_counts()

        if not type_counts.empty:
            chart_fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(15, 6))

            # Bar chart of the ten most frequent types
            type_counts.head(10).plot.bar(ax=bar_ax, color="forestgreen")
            bar_ax.set_title("Trail Types Distribution (Top 10)")
            bar_ax.set_xlabel("Trail Type")
            bar_ax.set_ylabel("Count")
            bar_ax.tick_params(axis="x", rotation=45)

            # Pie chart of the five most frequent types
            type_counts.head(5).plot.pie(ax=pie_ax, autopct="%1.1f%%")
            pie_ax.set_title("Top 5 Trail Types")
            pie_ax.set_ylabel("")

            plt.tight_layout()
            plt.show()

## 9. Export Sample Data

In [None]:
# Write a reproducible sample of the main trail layer to GeoJSON and, when key
# fields were identified, a JSON summary of the layer next to it.
if "trails" in locals():
    # Make sure the output directory exists
    output_dir = Path("data/samples")
    output_dir.mkdir(parents=True, exist_ok=True)

    # At most 100 features; the fixed seed keeps the sample stable across runs
    sample_size = min(100, len(trails))
    sample = trails.sample(n=sample_size, random_state=42)

    # Save as GeoJSON
    output_file = output_dir / "geonorge_trails_sample.geojson"
    sample.to_file(output_file, driver="GeoJSON")
    print(f"Exported {sample_size} sample trails to {output_file}")

    # Save field summary
    if "found_fields" in locals():
        # Completeness records exist only if the data quality cell has run
        if "completeness_df" in locals():
            completeness_records = completeness_df.to_dict("records")
        else:
            completeness_records = None

        summary = {
            "source": "Geonorge/Kartverket",
            "layer": main_layer_name,
            "total_features": len(trails),
            "sample_size": sample_size,
            "crs": str(trails.crs),
            "identified_fields": found_fields,
            "all_columns": list(trails.columns),
            "completeness": completeness_records,
        }

        summary_file = output_dir / "geonorge_field_summary.json"
        with summary_file.open("w") as handle:
            json.dump(summary, handle, indent=2)
        print(f"Saved field summary to {summary_file}")

## 10. Summary and Findings

In [None]:
# Print a recap of everything the earlier cells discovered
if "data" not in locals():
    print("No data loaded. Please download and load the data first.")
else:
    print("EXPLORATION SUMMARY")
    print("=" * 60)

    print("\n1. DATA STRUCTURE:")
    if isinstance(data, dict):
        print(f"   - Total layers: {len(data)}")
        for layer_name, layer in data.items():
            print(f"   - {layer_name}: {len(layer)} features")

    if "trails" in locals():
        # Whether the key-field identification cell has run
        fields_known = "found_fields" in locals()
        trail_count = len(trails)

        print(f"\n2. MAIN TRAIL LAYER: {main_layer_name}")
        print(f"   - Total trails: {trail_count}")
        print(f"   - CRS: {trails.crs}")
        print(f"   - Columns: {len(trails.columns)}")

        if fields_known:
            print("\n3. IDENTIFIED KEY FIELDS:")
            for field_type, col_name in found_fields.items():
                print(f"   - {field_type}: {col_name}")

        print("\n4. DATA QUALITY:")
        valid_count = trails.geometry.is_valid.sum()
        print(f"   - Valid geometries: {valid_count}/{trail_count}")
        if "completeness_df" in locals():
            high_quality = completeness_df.loc[completeness_df["Completeness_%"] > 90]
            print(f"   - Fields with >90% completeness: {len(high_quality)}")

        # Placeholder text: the recommendation is meant to be filled in by hand
        print("\n5. RECOMMENDATIONS FOR IMPLEMENTATION:")
        print("   Based on the exploration, consider:")
        print("   - [Add specific recommendations based on actual findings]")

        # Compare the identified fields with the ones that were expected
        print("\n6. NOTABLE OBSERVATIONS:")
        if fields_known:
            if "difficulty" not in found_fields:
                print("   ⚠ No difficulty field found (expected 'vanskgrad')")
            if "name" not in found_fields:
                print("   ⚠ No name field found")
            if "type" in found_fields:
                print(f"   ✓ Trail type field found: {found_fields['type']}")

## Next Steps

Based on the exploration findings:

1. **Build specific processing modules** for the actual data structure found
2. **Create appropriate mappings** for trail types and categories
3. **Design analysis functions** that work with available fields
4. **Plan visualization approach** based on data quality and completeness

The actual implementation should be adapted based on what we discovered in this exploration.