# PyDI Data Loading Example

This notebook mirrors `PyDI/examples/data_loading_example.py` and demonstrates provenance-aware data loading.

It shows how to load:
- CSV (with provenance)
- XML (flattened)
- JSON (flattened)

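Run the cells below in order. All input paths are resolved against `root`, which the setup cell sets to the directory two levels above the current working directory (expected to be the repository root); adjust `root` or the individual paths if your layout differs.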


In [None]:
# Step 0: Imports and setup
import logging
from pathlib import Path

from PyDI.io import load_csv, load_xml, load_json

# Emit INFO-level records so the loader log messages appear in cell output.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# All input paths in later cells are built from `root`, taken as the
# directory two levels above the current working directory.
_cwd = Path.cwd()
root = _cwd.parents[1]


## Step 1: CSV loading with provenance


In [6]:
# Load the movie list CSV, print a short summary (row count, dataset name
# from the provenance metadata, first column names) and preview three rows.
# `root` and `load_csv` come from the setup cell.
csv_path = root / "input/schemamatching/data/movie_list.csv"
if csv_path.exists():
    df_csv = load_csv(csv_path, name="movies")
    print(f"Loaded {len(df_csv)} rows from {csv_path.name}")
    # Provenance is looked up under attrs['provenance']; fall back to an
    # empty dict so the summary still prints when the key is absent.
    prov = df_csv.attrs.get('provenance', {})
    print(f"Dataset name: {prov.get('dataset_name', 'N/A')}")
    print(f"Columns: {list(df_csv.columns)[:8]} ...")
    # Jupyter only auto-renders a cell's last *top-level* expression; a bare
    # expression inside this branch would be silently discarded, so render
    # the preview explicitly. `display` is provided by the IPython kernel.
    display(df_csv.head(3))
else:
    print(f"CSV file not found: {csv_path}")


INFO:PyDI.io.loaders:Loaded dataset 'movies' via read_csv: shape=(656, 23), source=/Users/aaronsteiner/Documents/GitHub/PyDI/input/schemamatching/data/movie_list.csv


Loaded 656 rows from movie_list.csv
Dataset name: movies
Columns: ['movies_id', 'id', 'year', 'exclude', 'Film', 'Lead Studio', 'Rotten Tomatoes', 'Audience Score'] ...


## Step 2: XML loading and flattening


In [7]:
# Load the Academy Awards XML file, print a short summary (row count and
# first column names) and preview six rows.
# `root` and `load_xml` come from the setup cell.
xml_path = root / "input/entitymatching/data/academy_awards.xml"
if xml_path.exists():
    df_xml = load_xml(xml_path, name="academy_awards")
    print(f"Loaded {len(df_xml)} rows from {xml_path.name}")
    print(f"Columns: {list(df_xml.columns)[:10]} ...")
    # Jupyter only auto-renders a cell's last *top-level* expression; a bare
    # expression inside this branch would be silently discarded, so render
    # the preview explicitly. `display` is provided by the IPython kernel.
    display(df_xml.head(6))
else:
    print(f"XML file not found: {xml_path}")


INFO:PyDI.io.loaders:Loaded dataset 'academy_awards' via read_xml_flattened: shape=(4592, 7), source=/Users/aaronsteiner/Documents/GitHub/PyDI/input/entitymatching/data/academy_awards.xml


Loaded 4592 rows from academy_awards.xml
Columns: ['academy_awards_id', 'id', 'title', 'actor_name', 'date', 'director_name', 'oscar'] ...


## Step 3: JSON loading and flattening

















In [8]:
# Load the JSON test table, print a short summary (row count and first
# column names) and preview three rows. Loading is best-effort: a failure
# is reported as a message instead of stopping the notebook.
# `root` and `load_json` come from the setup cell.
json_path = root / "winter/winter-framework/src/test/resource/testTable.json"
if json_path.exists():
    try:
        # Keep only the load call inside `try` so that an error raised while
        # reporting is not misattributed to JSON loading.
        df_json = load_json(json_path, name="hockey_stats")
    except Exception as e:
        print(f"Failed to load JSON file: {e}")
    else:
        print(f"Loaded {len(df_json)} rows from {json_path.name}")
        print(f"Columns: {list(df_json.columns)[:8]} ...")
        # Jupyter only auto-renders a cell's last *top-level* expression; a
        # bare expression nested here would be silently discarded, so render
        # the preview explicitly. `display` is provided by the IPython kernel.
        display(df_json.head(3))
else:
    print(f"JSON file not found: {json_path}")


INFO:PyDI.io.loaders:Loaded dataset 'hockey_stats' via read_json: shape=(25, 3), source=/Users/aaronsteiner/Documents/GitHub/PyDI/winter/winter-framework/src/test/resource/testTable.json


Loaded 25 rows from testTable.json
Columns: ['hockey_stats_id', 'table', 'mapping'] ...
