In [None]:
import polars as pl
import json
from pathlib import Path

## Load Curated Data

In [None]:
# Read the curated, unified entity table into memory
df = pl.read_parquet("../data/curated/unified.parquet")
print(f"Total entities: {df.height}")  # one row per entity
df.head()

## Entity Counts by Type

In [None]:
# Count entities by type, most common type first.
# pl.len() yields the number of rows in each group; it is the replacement for
# the deprecated pl.count() row-count expression.
type_counts = (
    df.group_by("entity_type")
    .agg(pl.len().alias("count"))
    .sort("count", descending=True)
)
print(type_counts)

## Base Game vs DLC

In [None]:
# Count by DLC status.
# pl.len() yields the number of rows in each group; it is the replacement for
# the deprecated pl.count() row-count expression.
dlc_counts = df.group_by("is_dlc").agg(pl.len().alias("count"))
print(dlc_counts)

# filter() keeps only rows whose predicate evaluates to true, so any rows with
# a null `is_dlc` (if present) count toward neither total below.
base_count = df.filter(~pl.col("is_dlc")).height
dlc_count = df.filter(pl.col("is_dlc")).height

print(f"\nBase game entities: {base_count}")
print(f"DLC entities: {dlc_count}")

## Sample Bosses (Reconciled)

In [None]:
# Filter bosses and show a sample
bosses = df.filter(pl.col("entity_type") == "boss")
print(f"Total bosses: {len(bosses)}")

# Show the first 10 bosses; descriptions longer than 200 characters are
# truncated so the output stays skimmable.
sample_bosses = bosses.select(["name", "is_dlc", "description", "sources"]).head(10)
for row in sample_bosses.iter_rows(named=True):
    dlc_tag = "[DLC]" if row["is_dlc"] else "[Base]"

    # `sources` is decoded from JSON when it arrives as a string; a null value
    # is treated as "no sources" instead of crashing the join below.
    sources = row["sources"]
    if isinstance(sources, str):
        sources = json.loads(sources)
    sources = sources or []

    # A null description would break len() and slicing, so fall back to "".
    description = row["description"] or ""
    if len(description) > 200:
        description = f"{description[:200]}..."

    print(f"\n{dlc_tag} {row['name']}")
    print(f"  Sources: {', '.join(sources)}")
    print(f"  Description: {description}")

## Weapons (Count and Sample)

In [None]:
# Restrict the unified table to weapon entities and report how many there are
weapons = df.filter(pl.col("entity_type") == "weapon")
print(f"Total weapons: {weapons.height}")

# Preview the first 5 weapons, keeping only the identifying columns
sample_weapons = weapons.head(5).select(["name", "is_dlc", "slug"])
print(sample_weapons)

## Provenance Analysis

In [None]:
# Collect the distinct source names referenced anywhere in the table.
# Each `sources` cell is either a JSON-encoded string or an already-decoded
# value; only values that end up as lists contribute to the set.
all_sources = set()
for (raw_sources,) in df.select("sources").iter_rows():
    decoded = json.loads(raw_sources) if isinstance(raw_sources, str) else raw_sources
    if isinstance(decoded, list):
        all_sources |= set(decoded)

print(f"Unique data sources: {sorted(all_sources)}")

## Load Metadata

In [None]:
# Load the curation metadata file, if it exists, and print a summary of it.
metadata_path = Path("../data/curated/metadata.json")
if metadata_path.exists():
    # JSON is decoded explicitly as UTF-8 rather than with the platform's
    # default encoding, which is not UTF-8 on every system.
    with metadata_path.open(encoding="utf-8") as f:
        metadata = json.load(f)

    print("=== Corpus Metadata ===")

    # Sections missing from the file are printed as empty rather than raising.
    print("\nRow Counts:")
    for key, value in metadata.get("row_counts", {}).items():
        print(f"  {key}: {value}")

    print("\nEntity Counts:")
    for key, value in metadata.get("entity_counts", {}).items():
        print(f"  {key}: {value}")

    print(f"\nUnmapped DLC texts: {metadata.get('unmapped_texts', 0)}")
else:
    print("Metadata file not found. Run 'corpus curate' first.")