In [1]:
# Babel-backed numeric parsing (per-value detection)
from PyDI.normalization.types import NumericParser

# Sample strings, one per number-formatting convention
samples = [
    "1,234.56",  # US
    "1.234,56",  # EU
    "1 234,56",  # EU with space grouping
    "1\xa0234,56",  # NBSP grouping
    "1'234.56",  # Swiss
    # NOTE(review): the recorded output maps the next sample to +1234.56;
    # confirm whether parenthesised values are meant to parse as negative.
    "(1.234,56)",  # EU negative parentheses
    "$1,234.56",  # currency US
    "€ 1.234,56",  # currency EU
    "12,5%",  # EU percent
]

# Bound to `babel_parser` rather than `np` so the conventional NumPy alias
# is not shadowed for the rest of the notebook session.
babel_parser = NumericParser(use_babel=True)

# Parse each value independently using Babel-backed detection
parsed = {s: babel_parser.parse_numeric(s) for s in samples}
parsed


NLTK not available. Advanced tokenization features will be limited.


{'1,234.56': 1234.56,
 '1.234,56': 1234.56,
 '1 234,56': 1234.56,
 '1\xa0234,56': 1234.56,
 "1'234.56": 1234.56,
 '(1.234,56)': 1234.56,
 '$1,234.56': 1234.56,
 '€ 1.234,56': 1234.56,
 '12,5%': 0.125}

In [None]:
# Quantity scaling examples (k, million, billion)
from PyDI.normalization.values import AdvancedValueNormalizer

# Strings that pair a number with a magnitude suffix or word
examples = [
    "1.2k",
    "2k",
    "3 thousand",
    "1.5 million",
    "2.75 million",
    "4 billion",
    "7.2B",
    "120 M",
    "0.5 trilLION",
]

avn = AdvancedValueNormalizer()

# Map every example string to whatever the normalizer returns for it.
# NOTE(review): `_normalize_numeric` is underscore-prefixed; confirm whether a
# public entry point should be used in a demo instead.
scaled = dict((text, avn._normalize_numeric(text)) for text in examples)
scaled


In [None]:
# Unit conversion examples (normalize to target units)
from PyDI.normalization.units import UnitNormalizer, UnitCategory

un = UnitNormalizer()
unit_examples = [
    "12 km", "4800 m", "3.2 miles",  # length → m
    "5 kg", "12,000 g", "2.2 lb",     # weight → kg
    "750 ml", "2 L", "0.5 gallon",    # volume → l
    "90 min", "2 h", "3600 s",        # time → s
    "32 °F", "0 °C", "273.15 K",      # temperature → °C
    "36 km/h", "10 m/s"                 # speed → m/s
]

# A comprehension keeps the per-item temporaries out of the notebook
# namespace; the name `result` is bound by the dataset-normalization cell
# to a different kind of object, so it should not be left behind here.
converted = {s: un.normalize_value(s) for s in unit_examples}

converted


# Winter-like Normalization Demo (PyDI)

This notebook mirrors `PyDI/examples/normalization_demo_winter_like.py`.

It demonstrates:
- Provenance-aware loading from XML and CSV
- Header normalization and header-derived unit detection
- Locale-aware numeric parsing (commas as decimals, apostrophe/space groupings)
- Unit-aware numeric normalization and type detection
- Dataset-level normalization with a concise summary

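Run the cells below in order. `_repo_root()` takes the directory two levels above the current working directory as the repository root, so start the notebook from a folder two levels below the root (or adjust the paths if you run it from elsewhere).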


In [2]:
# Step 0: Imports and helpers (Notebook-friendly)
from __future__ import annotations

from pathlib import Path
import pandas as pd

from PyDI.io.loaders import load_xml, load_csv
from PyDI.normalization.datasets import (
    DatasetNormalizer,
    create_normalization_config,
)
from PyDI.normalization.text import HeaderNormalizer
from PyDI.normalization.columns import ColumnTypeInference
from PyDI.normalization.types import NumericParser


def _repo_root() -> Path:
    try:
        return Path.cwd().resolve().parents[1]  
    except Exception:
        return Path.cwd()

print(_repo_root())

/Users/aaronsteiner/Documents/GitHub/PyDI


## Step 1: Load demo frames
Load XML and CSV frames from bundled inputs.


In [3]:
def load_demo_frames() -> dict[str, pd.DataFrame]:
    """Load the three demo inputs located under the repository root.

    Returns:
        A dict keyed ``"academy_awards"``, ``"actors"`` and ``"movies_csv"``.
        The first two are read with ``load_xml``, the last with ``load_csv``.
    """
    input_dir = _repo_root() / "input"
    fusion_dir = input_dir / "fusion" / "data"

    # XML sources are registered under a dataset name equal to their dict key.
    xml_sources = (
        ("academy_awards", fusion_dir / "academy_awards.xml"),
        ("actors", fusion_dir / "actors.xml"),
    )
    loaded: dict[str, pd.DataFrame] = {
        key: load_xml(xml_path, name=key) for key, xml_path in xml_sources
    }

    # The CSV is stored under "movies_csv" but loaded with the name "movies".
    csv_path = input_dir / "schemamatching" / "data" / "movie_list.csv"
    loaded["movies_csv"] = load_csv(csv_path, name="movies")

    return loaded

frames = load_demo_frames()

# Show the loaded keys followed by each frame's (rows, columns) shape
frame_shapes = {key: frame.shape for key, frame in frames.items()}
[list(frames), frame_shapes]


[['academy_awards', 'actors', 'movies_csv'],
 {'academy_awards': (4592, 7), 'actors': (151, 7), 'movies_csv': (656, 23)}]

## Step 2: Header normalization
Normalize column headers to improve type detection later.


In [4]:
def demonstrate_header_normalization(df: pd.DataFrame) -> pd.DataFrame:
    """Print the first ten column names before and after header normalization.

    Args:
        df: Frame whose headers should be normalized.

    Returns:
        The object produced by ``HeaderNormalizer.normalize_dataframe_headers``.
    """
    original_preview = list(df.columns)[:10]
    print("Original columns:", original_preview)

    header_normalizer = HeaderNormalizer(
        lowercase=True,
        remove_brackets=True,
        replace_whitespace_with_underscore=True,
    )
    normalized = header_normalizer.normalize_dataframe_headers(df)

    normalized_preview = list(normalized.columns)[:10]
    print("Normalized columns:", normalized_preview)
    return normalized

# Hand a copy of the loaded CSV frame to the header-normalization demo
movies = demonstrate_header_normalization(frames["movies_csv"].copy())
movies.head(3)


Original columns: ['movies_id', 'id', 'year', 'exclude', 'Film', 'Lead Studio', 'Rotten Tomatoes', 'Audience Score', 'Story', 'Genre']
Normalized columns: ['movies_id', 'id', 'year', 'exclude', 'film', 'lead_studio', 'rotten_tomatoes', 'audience_score', 'story', 'genre']


Unnamed: 0,movies_id,id,year,exclude,film,lead_studio,rotten_tomatoes,audience_score,story,genre,...,foreign_gross,worldwide_gross,budget,profit,proftitability,opening_weekend,oscar,bafta,source,column
0,movies-0000,1,2010,,127 Hours,Independent,93.0,84,Escape,Adventure,...,42.4,60.73,18.0,42.73,337.39%,0.26,,,http://boxofficemojo.com/movies/?id=127hours.htm,
1,movies-0001,2,2010,,A Nightmare on Elm Street,Warner Bros.,13.0,40,Monster Force,Horror,...,52.59,115.66,35.0,80.66,330.46%,32.9,,,,
2,movies-0002,3,2010,,Alice in Wonderland,Disney,52.0,72,Journey And Return,Adventure,...,690.2,1024.39,200.0,824.39,512.20%,116.1,,,,


## Step 3: Locale-aware numeric parsing
See how `NumericParser` interprets diverse numeric formats.


In [9]:
# One representative string per number-formatting convention
samples = [
    "1,234.56",  # US
    "1.234,56",  # EU
    "1 234,56",  # EU with space grouping
    "1\xa0234,56",  # NBSP grouping
    "1'234.56",  # Swiss
    "$1,234.56",  # currency US
    "€ 1.234,56",  # currency EU
    "12,5%",  # EU percent
]

parser = NumericParser()

# Pair every input string with its parsed value
dict(zip(samples, map(parser.parse_numeric, samples)))


{'1,234.56': 1234.56,
 '1.234,56': 1234.56,
 '1 234,56': 1234.56,
 '1\xa0234,56': 1234.56,
 "1'234.56": 1234.56,
 '$1,234.56': 1234.56,
 '€ 1.234,56': 1234.56,
 '12,5%': 0.125}

## Step 4: Dataset normalization
Run the end-to-end dataset normalization and inspect results.


In [6]:
# Options forwarded to the normalization config factory
normalization_options = {
    "enable_unit_conversion": True,
    "enable_quantity_scaling": True,
    "normalize_text": True,
    "standardize_nulls": True,
}
config = create_normalization_config(**normalization_options)
normalizer = DatasetNormalizer(config)

# Make sure the output directory exists before it is passed to the normalizer
out_dir = _repo_root() / "output" / "examples" / "normalization_demo"
out_dir.mkdir(parents=True, exist_ok=True)

normalized_df, result = normalizer.normalize_dataset(movies, output_path=out_dir)

# Summary: shape before, shape after, overall success rate as a percentage
success_rate = f"{result.overall_success_rate:.1%}"
(result.original_shape, result.normalized_shape, success_rate)


((656, 23), (656, 23), '100.0%')

## Step 5: Inspect detected types (sample)
Peek at a few column detection results with units and confidence.


In [7]:
# One summary record for each of the first eight column results
rows = [
    {
        "name": cr.normalized_name,
        "type": cr.detected_type.value,
        "confidence": round(cr.confidence, 2),
        # Any falsy unit (None, empty string) is reported as None
        "unit": cr.specific_unit or None,
    }
    for cr in result.column_results[:8]
]

# pandas is already imported as `pd` in the imports cell, so no second
# alias is introduced mid-notebook.
pd.DataFrame(rows)


Unnamed: 0,name,type,confidence,unit
0,movies_id,string,1.0,
1,id,currency,1.0,
2,year,numeric,1.0,year
3,exclude,bool,1.0,
4,film,string,0.98,
5,lead_studio,string,0.98,
6,rotten_tomatoes,numeric,1.0,
7,audience_score,currency,1.0,


## Step 6: Save normalized output
Write the normalized CSV to the example output directory.


In [8]:
# Destination file inside the example output directory
out_csv = out_dir.joinpath("movies_normalized.csv")

# Write the normalized frame without the index column, then report the path
normalized_df.to_csv(out_csv, index=False)
print("Saved normalized dataset to:", out_csv)


Saved normalized dataset to: /Users/aaronsteiner/Documents/GitHub/PyDI/output/examples/normalization_demo/movies_normalized.csv
