In [31]:
import pandas as pd
from pathlib import Path


def _is_missing_xml(value):
    try:
        if pd.isna(value):
            return True
    except Exception:
        pass
    if isinstance(value, (list, tuple, set)):
        return len(value) == 0
    try:
        import numpy as np
        if isinstance(value, np.ndarray):
            return value.size == 0 or np.all(pd.isna(value))
    except Exception:
        pass
    return False


def dataframe_to_xml(df, path, root_tag="movies", record_tag="movie"):
    """Export DataFrame to XML format with robust missing-value handling."""
    from xml.etree.ElementTree import Element, SubElement, tostring
    from xml.dom.minidom import parseString

    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    root = Element(root_tag)

    for _, row in df.iterrows():
        movie = SubElement(root, record_tag)
        for col, value in row.items():
            if col in ["_fusion_group_id", "_fusion_sources", "_fusion_confidence", "_fusion_metadata"]:
                continue
            if _is_missing_xml(value):
                continue
            elem = SubElement(movie, col.replace("_", ""))
            elem.text = str(value)

    rough_string = tostring(root, 'utf-8')
    reparsed = parseString(rough_string)
    with open(path, 'w') as f:
        f.write(reparsed.toprettyxml(indent="  "))

    return len(df)



# PyDI Data Fusion Framework - Winter Dataset Showcase

This notebook demonstrates the comprehensive data fusion capabilities of PyDI using real datasets from the Winter framework. We'll show:

1. **Loading Winter movie datasets using PyDI IO**
2. **Creating sophisticated fusion strategies**
3. **Running the fusion engine with connected components grouping**
4. **Evaluating fusion quality**
5. **Generating comprehensive reports**
6. **Custom conflict resolution rules**
7. **Provenance tracking**

The PyDI fusion framework provides Winter-level capabilities with a modern, pandas-first Python API.

In [32]:
import pandas as pd
import numpy as np
from pathlib import Path
import logging
from datetime import datetime
import warnings
import time
warnings.filterwarnings('ignore')

# Configure logging for better visibility
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')
logger = logging.getLogger(__name__)

print("Imports successful")

def repo_root():
    """Return the repository root directory."""
    
    if '__file__' in globals():
        return Path(__file__).parent.parent.parent
    else:
    
        current = Path.cwd()
        while current != current.parent:
            if (current / 'pyproject.toml').exists():
                return current
            current = current.parent
        return Path.cwd()  # fallback

Imports successful


In [33]:
from PyDI.fusion import (
    DataFusionEngine,
    DataFusionStrategy,
    DataFusionEvaluator,
    
    longest_string, shortest_string, most_complete,
    average, median, maximum, minimum, 
    most_recent, earliest,
    union, intersection, voting,
    build_record_groups_from_correspondences,
    
    # Convenient aliases
    LONGEST, SHORTEST, AVG, MAX, MIN, LATEST, UNION, VOTE,
    
    # Reporting and evaluation
    FusionReport,
    FusionQualityMetrics,
    ProvenanceTracker,
    
    # Base classes
    ConflictResolutionFunction,
    AttributeValueFuser,
    FusionResult,
    
    
    analyze_attribute_coverage, 
    AttributeCoverageAnalyzer,
)

from PyDI.io import load_xml, load_csv

print("PyDI components imported successfully")

PyDI components imported successfully


## 1. Loading Movie Datasets 


We'll use PyDI's provenance-aware XML loader to load the Winter datasets. This automatically handles XML parsing, flattening, and adds provenance metadata.

In [34]:
# Set up data and correspondence paths
root = repo_root()
data_dir = root / "input" / "movies" / "fusion" / "data"
correspondences_dir = root / "input" / "movies" / "fusion" / "correspondences"

print(f"Data directory: {data_dir}")
print(f"Correspondences directory: {correspondences_dir}")

# List available XML datasets
print(f"\nAvailable datasets:")
for xml_file in data_dir.glob("*.xml"):
    print(f"  - {xml_file.name}")

Data directory: /Users/aaronsteiner/Documents/GitHub/PyDI/input/movies/fusion/data
Correspondences directory: /Users/aaronsteiner/Documents/GitHub/PyDI/input/movies/fusion/correspondences

Available datasets:
  - academy_awards.xml
  - actors.xml
  - golden_globes.xml


In [35]:
# Load Academy Awards dataset
academy_awards_df = load_xml(
    data_dir / 'academy_awards.xml',
    name='academy_awards',
    record_tag='movie',  # Specify the record element
    flatten=True,        # Enable automatic XML flattening
    add_index=True,      # Add unique IDs
    include_provenance_columns=False, 
)

# Load Actors dataset
actors_df = load_xml(
    data_dir / 'actors.xml',
    name='actors',
    record_tag='movie',
    flatten=True,
    add_index=True,
    include_provenance_columns=False
)

# Load Golden Globes dataset
golden_globes_df = load_xml(
    data_dir / 'golden_globes.xml',
    name='golden_globes',
    record_tag='movie',
    flatten=True,
    add_index=True,
    include_provenance_columns=False
)

print("Datasets loaded successfully!")
print(f"\n Dataset Overview:")
print(f"  Academy Awards: {len(academy_awards_df)} records")
print(f"  Actors: {len(actors_df)} records")
print(f"  Golden Globes: {len(golden_globes_df)} records")
print(f"  Total: {len(academy_awards_df) + len(actors_df) + len(golden_globes_df)} records")

INFO: Loaded dataset 'academy_awards' via read_xml_flattened: shape=(4592, 7), source=/Users/aaronsteiner/Documents/GitHub/PyDI/input/movies/fusion/data/academy_awards.xml
INFO: Loaded dataset 'actors' via read_xml_flattened: shape=(151, 7), source=/Users/aaronsteiner/Documents/GitHub/PyDI/input/movies/fusion/data/actors.xml
INFO: Loaded dataset 'golden_globes' via read_xml_flattened: shape=(2286, 7), source=/Users/aaronsteiner/Documents/GitHub/PyDI/input/movies/fusion/data/golden_globes.xml


Datasets loaded successfully!

 Dataset Overview:
  Academy Awards: 4592 records
  Actors: 151 records
  Golden Globes: 2286 records
  Total: 7029 records


In [36]:
academy_awards_df

Unnamed: 0,academy_awards_id,id,title,actor_name,date,director_name,oscar
0,academy_awards-0000,academy_awards_1,Biutiful,Javier Bardem,2010-01-01,,
1,academy_awards-0001,academy_awards_2,True Grit,Jeff Bridges,2010-01-01,Joel Coen,
2,academy_awards-0002,academy_awards_2,True Grit,Jeff Bridges,2010-01-01,Ethan Coen,
3,academy_awards-0003,academy_awards_3,The Social Network,Jesse Eisenberg,2010-01-01,David Fincher,yes
4,academy_awards-0004,academy_awards_4,The King's Speech,Colin Firth,2010-01-01,Tom Hooper,yes
...,...,...,...,...,...,...,...
4587,academy_awards-4587,academy_awards_4576,Lajos Biro,,1927-01-01,,
4588,academy_awards-4588,academy_awards_4577,Ben Hecht,,1927-01-01,,yes
4589,academy_awards-4589,academy_awards_4578,Gerald Duffy,,1927-01-01,,
4590,academy_awards-4590,academy_awards_4579,Roy Pomeroy,,1927-01-01,,yes


In [37]:
datasets = [academy_awards_df, actors_df, golden_globes_df]
dataset_names = ['Academy Awards', 'Actors', 'Golden Globes']

for i, (df, name) in enumerate(zip(datasets, dataset_names)):
    print(f"\n{name} Dataset:")
    print(f"  Shape: {df.shape}")
    print(f"  Columns: {list(df.columns)}")
    print(f"  Dataset name: {df.attrs.get('dataset_name', 'N/A')}")
    print(f"  Sample records:")
    
    display(df.head(3))


Academy Awards Dataset:
  Shape: (4592, 7)
  Columns: ['academy_awards_id', 'id', 'title', 'actor_name', 'date', 'director_name', 'oscar']
  Dataset name: academy_awards
  Sample records:


Unnamed: 0,academy_awards_id,id,title,actor_name,date,director_name,oscar
0,academy_awards-0000,academy_awards_1,Biutiful,Javier Bardem,2010-01-01,,
1,academy_awards-0001,academy_awards_2,True Grit,Jeff Bridges,2010-01-01,Joel Coen,
2,academy_awards-0002,academy_awards_2,True Grit,Jeff Bridges,2010-01-01,Ethan Coen,



Actors Dataset:
  Shape: (151, 7)
  Columns: ['actors_id', 'id', 'title', 'actor_name', 'actors_actor_birthday', 'actors_actor_birthplace', 'date']
  Dataset name: actors
  Sample records:


Unnamed: 0,actors_id,id,title,actor_name,actors_actor_birthday,actors_actor_birthplace,date
0,actors-0000,actors_1,7th Heaven,Janet Gaynor,1906-01-01,Pennsylvania,1929-01-01
1,actors-0001,actors_2,Coquette,Mary Pickford,1892-01-01,Canada,1930-01-01
2,actors-0002,actors_3,The Divorcee,Norma Shearer,1902-01-01,Canada,1931-01-01



Golden Globes Dataset:
  Shape: (2286, 7)
  Columns: ['golden_globes_id', 'id', 'title', 'actor_name', 'date', 'director_name', 'globe']
  Dataset name: golden_globes
  Sample records:


Unnamed: 0,golden_globes_id,id,title,actor_name,date,director_name,globe
0,golden_globes-0000,golden_globes_1,Frankie and Alice,Halle Berry,2011-01-01,,
1,golden_globes-0001,golden_globes_2,Rabbit Hole,Nicole Kidman,2011-01-01,,
2,golden_globes-0002,golden_globes_3,Winter's Bone,Jennifer Lawrence,2011-01-01,,


In [38]:
print("Creating enhanced correspondences to enable real conflict resolution...")

# Load original correspondence files
corr_aa_actors = load_csv(
    correspondences_dir / 'academy_awards_2_actors_correspondences.csv',
    name='aa_actors_correspondences',
    add_index=False,
    header=None,
    names=['id1', 'id2', 'score']
)

corr_actors_gg = load_csv(
    correspondences_dir / 'actors_2_golden_globes_correspondences.csv', 
    name='actors_gg_correspondences',
    add_index=False,
    header=None,
    names=['id1', 'id2', 'score']
)
print("Sample correspondences:")
print("Academy Awards ↔ Actors:")
display(corr_aa_actors.head(3))
print("Actors ↔ Golden Globes:")
display(corr_actors_gg.head(3))

INFO: Loaded dataset 'aa_actors_correspondences' via read_csv: shape=(150, 3), source=/Users/aaronsteiner/Documents/GitHub/PyDI/input/movies/fusion/correspondences/academy_awards_2_actors_correspondences.csv
INFO: Loaded dataset 'actors_gg_correspondences' via read_csv: shape=(107, 3), source=/Users/aaronsteiner/Documents/GitHub/PyDI/input/movies/fusion/correspondences/actors_2_golden_globes_correspondences.csv


Creating enhanced correspondences to enable real conflict resolution...
Sample correspondences:
Academy Awards ↔ Actors:


Unnamed: 0,id1,id2,score
0,academy_awards_4557,actors_1,1.0
1,academy_awards_4529,actors_2,1.0
2,academy_awards_4500,actors_3,1.0


Actors ↔ Golden Globes:


Unnamed: 0,id1,id2,score
0,actors_16,golden_globes_2279,1.0
1,actors_22,golden_globes_2263,1.0
2,actors_23,golden_globes_2252,1.0


In [39]:
# Create enhanced correspondences that will form multi-record groups
enhanced_correspondences = []

# Add all original pairwise correspondences
for _, row in corr_aa_actors.iterrows():
    enhanced_correspondences.append({
        'id1': row['id1'], 'id2': row['id2'], 'score': row['score']
    })

for _, row in corr_actors_gg.iterrows():
    enhanced_correspondences.append({
        'id1': row['id1'], 'id2': row['id2'], 'score': row['score']
    })

# Add direct Academy ↔ Golden Globe connections to create multi-record groups
# This ensures some records will belong to groups with 3+ records from different sources
sample_connections = [
    # Create specific multi-record groups with known conflicts
    ('academy_awards_1880', 'golden_globes_1733', 0.95),  # One Flew Over Cuckoo's Nest
    ('academy_awards_1430', 'golden_globes_1330', 0.95),  # Amadeus
    ('academy_awards_3059', 'golden_globes_974', 0.95),   # Stalag 17
    ('academy_awards_4080', 'golden_globes_1131', 0.95),  # Goodbye, Mr. Chips
    ('academy_awards_3816', 'golden_globes_2187', 0.95),  # Mrs. Miniver
]

for id1, id2, score in sample_connections:
    enhanced_correspondences.append({
        'id1': id1, 'id2': id2, 'score': score
    })

# Add some same-source conflicts to demonstrate intra-dataset resolution
same_source_conflicts = [
    # Link similar movies within Academy Awards to create conflict groups
    ('academy_awards_1880', 'academy_awards_1430', 0.75),  # Both Milos Forman films
    ('academy_awards_3816', 'academy_awards_3910', 0.70),  # Both 1940s dramas
]

for id1, id2, score in same_source_conflicts:
    enhanced_correspondences.append({
        'id1': id1, 'id2': id2, 'score': score
    })

# Create final correspondences dataframe
all_correspondences = pd.DataFrame(enhanced_correspondences)
print(f"Enhanced correspondences: {len(all_correspondences)} total")
print("This creates multi-record groups that will trigger actual conflict resolution!")

# Show sample correspondences
print(f"\nEnhanced correspondences include:")
display(all_correspondences.tail(8))  # Show the new connections we added

Enhanced correspondences: 264 total
This creates multi-record groups that will trigger actual conflict resolution!

Enhanced correspondences include:


Unnamed: 0,id1,id2,score
256,actors_150,golden_globes_372,1.0
257,academy_awards_1880,golden_globes_1733,0.95
258,academy_awards_1430,golden_globes_1330,0.95
259,academy_awards_3059,golden_globes_974,0.95
260,academy_awards_4080,golden_globes_1131,0.95
261,academy_awards_3816,golden_globes_2187,0.95
262,academy_awards_1880,academy_awards_1430,0.75
263,academy_awards_3816,academy_awards_3910,0.7


## 2. Data Quality Analysis with Loaded Datasets

Now let's analyze the loaded data to understand attribute coverage and potential conflicts.

### Method 1: PyDI comes with some integrated Attribute Coverage Analysis.

In [40]:
coverage_df = analyze_attribute_coverage(
    datasets=datasets,
    dataset_names=dataset_names,
    include_samples=True
)

print(f"Coverage analysis complete!")
print(f"   Total attributes analyzed: {len(coverage_df)}")
print(f"   Datasets analyzed: {len(dataset_names)}")

# Display the results (same format as the original notebook code)
print(f"\nAttribute Coverage Summary:")
display_cols = ['attribute'] + [f'{name}_count' for name in dataset_names] + [f'{name}_pct' for name in dataset_names]
available_display_cols = [col for col in display_cols if col in coverage_df.columns]
display(coverage_df[available_display_cols])



INFO: Analyzed 12 attributes across 3 datasets


Coverage analysis complete!
   Total attributes analyzed: 12
   Datasets analyzed: 3

Attribute Coverage Summary:


Unnamed: 0,attribute,Academy Awards_count,Actors_count,Golden Globes_count,Academy Awards_pct,Actors_pct,Golden Globes_pct
0,academy_awards_id,4592/4592,0/0,0/0,100.0%,0%,0%
1,actor_name,1057/4592,151/151,2232/2286,23.0%,100.0%,97.6%
2,actors_actor_birthday,0/0,151/151,0/0,0%,100.0%,0%
3,actors_actor_birthplace,0/0,151/151,0/0,0%,100.0%,0%
4,actors_id,0/0,151/151,0/0,0%,100.0%,0%
5,date,4592/4592,151/151,2286/2286,100.0%,100.0%,100.0%
6,director_name,420/4592,0/0,320/2286,9.1%,0%,14.0%
7,globe,0/0,0/0,625/2286,0%,0%,27.3%
8,golden_globes_id,0/0,0/0,2286/2286,0%,0%,100.0%
9,id,4592/4592,151/151,2286/2286,100.0%,100.0%,100.0%


In [41]:
# Method 2: Advanced analyzer class  
print(f"\nMethod 2: Advanced Coverage Analyzer")
coverage_analyzer = AttributeCoverageAnalyzer(datasets, dataset_names)

# This provides much more detailed analysis
coverage_analyzer.print_summary(max_attributes=12)

INFO: Analyzed 12 attributes across 3 datasets
INFO: Initialized AttributeCoverageAnalyzer for 3 datasets



Method 2: Advanced Coverage Analyzer
Attribute Coverage Analysis Summary

Dataset Overview:
  1. Academy Awards: 4,592 records, 7 attributes
  2. Actors: 151 records, 7 attributes
  3. Golden Globes: 2,286 records, 7 attributes
  Total: 7,029 records

Attribute Statistics:
  Total unique attributes: 12
  Common across all datasets: 4
  Dataset-specific attributes: 8

Coverage Distribution:
  Low (0-25%): 3 attributes (25.0%)
  Medium (25-50%): 5 attributes (41.7%)
  High (50-75%): 1 attributes (8.3%)
  Very High (75-100%): 3 attributes (25.0%)

Top Attributes by Coverage:
  1. date: 100.0% avg (3/3 datasets)
  2. id: 100.0% avg (3/3 datasets)
  3. title: 99.9% avg (3/3 datasets)
  4. actor_name: 73.6% avg (3/3 datasets)
  5. academy_awards_id: 33.3% avg (1/3 datasets)
  6. actors_actor_birthday: 33.3% avg (1/3 datasets)
  7. actors_actor_birthplace: 33.3% avg (1/3 datasets)
  8. actors_id: 33.3% avg (1/3 datasets)
  9. golden_globes_id: 33.3% avg (1/3 datasets)
  10. oscar: 9.3% avg (

In [42]:
print(f"\nConflict Analysis:")
conflict_analysis = coverage_analyzer.conflict_analysis
if conflict_analysis:
    print(f"   Detected potential conflicts in {len(conflict_analysis)} attributes:")
    for attr, conflicts in list(conflict_analysis.items())[:5]:
        conflict_types = []
        if conflicts.get('data_type_mismatches'):
            conflict_types.append("data type")
        if len(conflicts.get('value_distributions', {})) > 1:
            conflict_types.append("value distribution")
        print(f"     • {attr}: {', '.join(conflict_types)}")
else:
    print(f"   No significant conflicts detected - data looks compatible!")


Conflict Analysis:
   Detected potential conflicts in 4 attributes:
     • title: value distribution
     • id: value distribution
     • actor_name: value distribution
     • date: value distribution


## 3. Executing Data Fusion with Connected Components

Now we'll run the fusion engine using PyDI's sophisticated grouping and conflict resolution.

In [43]:
sample_titles = ["The Lord of the Rings", "LOTR", "The Lord of the Rings: Extended Edition"]
result_title, confidence, metadata = longest_string(sample_titles)
print(f"Longest title: '{result_title}' (confidence: {confidence:.3f})")

sample_dates = ["2001-01-01", "2001-12-19", "2001"]
result_date, confidence, metadata = most_recent(sample_dates)
print(f"Most recent date: '{result_date}' (confidence: {confidence:.3f})")

sample_numbers = [8.5, 9.1, 8.8, 9.0]
result_avg, confidence, metadata = average(sample_numbers)
print(f"Average rating: {result_avg:.2f} (confidence: {confidence:.3f})")

Longest title: 'The Lord of the Rings: Extended Edition' (confidence: 0.731)
Most recent date: '2001-12-19' (confidence: 0.500)
Average rating: 8.85 (confidence: 0.974)


In [44]:
# Demonstrate how to create strategies from scratch 
print("Easy Strategy Creation with Function-Based Rules:")

# Create a custom strategy with movie-specific rules
fusion_strategy = DataFusionStrategy("movies_custom")

# Add rules 
fusion_strategy.add_attribute_fuser("title", AttributeValueFuser(LONGEST))
fusion_strategy.add_attribute_fuser("date", AttributeValueFuser(LATEST)) 
fusion_strategy.add_attribute_fuser("director_name", AttributeValueFuser(union, separator=", "))
fusion_strategy.add_attribute_fuser("actor_name", AttributeValueFuser(VOTE))  # Most common name

# Add a custom rule for award data
def best_award_rule(values, context=None):
    """Custom rule: prioritize Oscar over Globe, handle award hierarchy."""
    award_priority = {'yes': 3, 'oscar': 3, 'globe': 2, 'golden globe': 2}
    best_award = None
    highest_priority = 0
    
    for value in values:
        if pd.notna(value):
            value_str = str(value).lower()
            priority = award_priority.get(value_str, 1)
            if priority > highest_priority:
                highest_priority = priority
                best_award = value
    
    confidence = highest_priority / 3.0 if highest_priority > 0 else 0.0
    metadata = {"rule": "best_award_rule", "priority": highest_priority}
    
    return best_award, confidence, metadata

# Add award rules
fusion_strategy.add_attribute_fuser("oscar", AttributeValueFuser(best_award_rule))
fusion_strategy.add_attribute_fuser("globe", AttributeValueFuser(best_award_rule))

print(f"Custom strategy created: {len(fusion_strategy.get_registered_attributes())} rules")
print(f"Registered attributes: {list(fusion_strategy.get_registered_attributes())}")


INFO: Registered fuser for attribute 'title' using rule 'longest_string'
INFO: Registered fuser for attribute 'date' using rule 'most_recent'
INFO: Registered fuser for attribute 'director_name' using rule 'union'
INFO: Registered fuser for attribute 'actor_name' using rule 'voting'
INFO: Registered fuser for attribute 'oscar' using rule 'best_award_rule'
INFO: Registered fuser for attribute 'globe' using rule 'best_award_rule'


Easy Strategy Creation with Function-Based Rules:
Custom strategy created: 6 rules
Registered attributes: ['title', 'globe', 'director_name', 'actor_name', 'oscar', 'date']


In [45]:
academy_awards_df

Unnamed: 0,academy_awards_id,id,title,actor_name,date,director_name,oscar
0,academy_awards-0000,academy_awards_1,Biutiful,Javier Bardem,2010-01-01,,
1,academy_awards-0001,academy_awards_2,True Grit,Jeff Bridges,2010-01-01,Joel Coen,
2,academy_awards-0002,academy_awards_2,True Grit,Jeff Bridges,2010-01-01,Ethan Coen,
3,academy_awards-0003,academy_awards_3,The Social Network,Jesse Eisenberg,2010-01-01,David Fincher,yes
4,academy_awards-0004,academy_awards_4,The King's Speech,Colin Firth,2010-01-01,Tom Hooper,yes
...,...,...,...,...,...,...,...
4587,academy_awards-4587,academy_awards_4576,Lajos Biro,,1927-01-01,,
4588,academy_awards-4588,academy_awards_4577,Ben Hecht,,1927-01-01,,yes
4589,academy_awards-4589,academy_awards_4578,Gerald Duffy,,1927-01-01,,
4590,academy_awards-4590,academy_awards_4579,Roy Pomeroy,,1927-01-01,,yes


In [46]:
actors_df[actors_df["actors_id"] == "actors-0000"]

Unnamed: 0,actors_id,id,title,actor_name,actors_actor_birthday,actors_actor_birthplace,date
0,actors-0000,actors_1,7th Heaven,Janet Gaynor,1906-01-01,Pennsylvania,1929-01-01


In [47]:
academy_awards_df[academy_awards_df["academy_awards_id"] == "academy_awards-4567"]

Unnamed: 0,academy_awards_id,id,title,actor_name,date,director_name,oscar
4567,academy_awards-4567,academy_awards_4557,7th Heaven,Janet Gaynor,1927-01-01,,yes


In [48]:
# We need to transform the ids in the correspondences to match the ids in the datasets
def _id_to_pydi_id(id: str, df: pd.DataFrame, lookup_column: str = "_id", return_column: str = "id"):
    values = df[df[lookup_column] == id][return_column].values
    if len(values) == 0:
        return id
    return values[0]

# Apply the transformation to the correspondences
all_correspondences["id1"] = all_correspondences["id1"].apply(lambda x: _id_to_pydi_id(x, academy_awards_df, "id", "academy_awards_id"))
all_correspondences["id2"] = all_correspondences["id2"].apply(lambda x: _id_to_pydi_id(x, academy_awards_df, "id", "academy_awards_id"))

all_correspondences["id2"] = all_correspondences["id2"].apply(lambda x: _id_to_pydi_id(x, actors_df, "id", "actors_id"))
all_correspondences["id2"] = all_correspondences["id2"].apply(lambda x: _id_to_pydi_id(x, golden_globes_df, "id", "golden_globes_id"))

all_correspondences


Unnamed: 0,id1,id2,score
0,academy_awards-4567,actors-0000,1.00
1,academy_awards-4539,actors-0001,1.00
2,academy_awards-4509,actors-0002,1.00
3,academy_awards-4484,actors-0003,1.00
4,academy_awards-4455,actors-0004,1.00
...,...,...,...
259,academy_awards-3067,golden_globes-0978,0.95
260,academy_awards-4089,golden_globes-1135,0.95
261,academy_awards-3824,golden_globes-2193,0.95
262,academy_awards-1887,academy_awards-1435,0.75


In [49]:
all_correspondences[all_correspondences["id2"] == "academy_awards_1880"]

Unnamed: 0,id1,id2,score


In [50]:
# Create the fusion engine
fusion_engine = DataFusionEngine(
    strategy=fusion_strategy
)

# Execute the fusion with our custom strategy
fusion_result, runtime = fusion_engine.run(
    datasets=datasets,
    correspondences=all_correspondences,
    id_column={"academy_awards": "academy_awards_id",
                "actors": "actors_id",
                "golden_globes": "golden_globes_id"},
    include_singletons=False
)

print(f"Fusion runtime: {runtime:.2f} seconds")
display(fusion_result.head(3))

INFO: Starting data fusion with strategy 'movies_custom'
INFO: Correspondence ID coverage: matched 403 of 510 unique IDs
INFO: Created 6872 record groups from 264 correspondences
INFO: Groups: 146 multi-record, 6726 singleton
INFO: Fusion complete: 146 records from 146 groups
INFO: Fusion time: 0.28 seconds


Fusion runtime: 0.28 seconds


Unnamed: 0,_id,_fusion_group_id,_fusion_sources,actors_id,id,actors_actor_birthplace,academy_awards_id,title,director_name,actors_actor_birthday,actor_name,oscar,date,_fusion_confidence,_fusion_metadata,golden_globes_id,globe
0,academy_awards-1547,group_3,"[actors, academy_awards]",actors-0055,academy_awards_1541,New Jersey,academy_awards-1547,Sophie's Choice,,1949-01-01,Meryl Streep,yes,1983-01-01,0.463636,"{'actors_id_rule': 'first_non_null', 'id_rule'...",,
1,actors-0098,group_4,"[actors, academy_awards]",actors-0098,actors_99,England,academy_awards-3374,Hamlet,[Laurence Olivier],1907-01-01,Laurence Olivier,yes,1949-01-01,0.554545,"{'actors_id_rule': 'first_non_null', 'id_rule'...",,
2,academy_awards-2247,group_6,"[actors, academy_awards]",actors-0040,academy_awards_2240,New York,academy_awards-2247,Funny Girl,,1942-01-01,Barbra Streisand,yes,1969-01-01,0.463636,"{'actors_id_rule': 'first_non_null', 'id_rule'...",,


In [51]:
# Extract results
fused_dataset = fusion_result

print(f"Fusion execution complete!")
print(f"Input records: {sum(len(df) for df in datasets):,}")
print(f"Output records: {len(fused_dataset):,}")
print(f"Data reduction: {((sum(len(df) for df in datasets) - len(fused_dataset))/sum(len(df) for df in datasets)*100):.1f}%")

# Show sample of fused data
print(f"\nSample Fused Records:")
if len(fused_dataset) > 0:
    display_cols = ['_id', 'title', 'date', 'actor_name', 'director_name']
    available_cols = [col for col in display_cols if col in fused_dataset.columns]
    if available_cols:
        display(fused_dataset[available_cols].head(3))
    else:
        display(fused_dataset.head(3))
        
    # Show fusion metadata if available
    if '_fusion_confidence' in fused_dataset.columns:
        avg_confidence = fused_dataset['_fusion_confidence'].mean()
        print(f"\nFusion Quality:")
        print(f"Average confidence: {avg_confidence:.3f}")
        print(f"High confidence records (>=0.8): {(fused_dataset['_fusion_confidence'] >= 0.8).sum()} ({(fused_dataset['_fusion_confidence'] >= 0.8).mean():.1%})")
else:
    print("No records were fused!")

Fusion execution complete!
Input records: 7,029
Output records: 146
Data reduction: 97.9%

Sample Fused Records:


Unnamed: 0,_id,title,date,actor_name,director_name
0,academy_awards-1547,Sophie's Choice,1983-01-01,Meryl Streep,
1,actors-0098,Hamlet,1949-01-01,Laurence Olivier,[Laurence Olivier]
2,academy_awards-2247,Funny Girl,1969-01-01,Barbra Streisand,



Fusion Quality:
Average confidence: 0.506
High confidence records (>=0.8): 0 (0.0%)


In [52]:
# Load gold standard and perform accuracy evaluation
print("Loading Gold Standard for Accuracy Evaluation...")
print("=" * 50)

# Load gold standard dataset
gold_standard_path = root / "input" / "movies" / "fusion" / "splits" / "gold.xml"
gold_df = load_xml(
    gold_standard_path,
    name='gold_standard',
    record_tag='movie',
    explode=True,
    add_index=False, 
    include_provenance_columns=False
)

print(f"Gold standard loaded: {len(gold_df)} records")
display(gold_df.head(3))    


INFO: Loaded dataset 'gold_standard' via read_xml_flattened: shape=(20, 6), source=/Users/aaronsteiner/Documents/GitHub/PyDI/input/movies/fusion/splits/gold.xml


Loading Gold Standard for Accuracy Evaluation...
Gold standard loaded: 20 records


Unnamed: 0,id,title,director_name,actor_name,date,oscar
0,academy_awards_1880,One Flew over the Cuckoo's Nest,Milos Forman,Jack Nicholson,1975-01-01,yes
1,academy_awards_3624,Gaslight,,Charles Boyer,1944-01-01,yes
2,academy_awards_3548,Mildred Pierce,,Joan Crawford,1945-01-01,yes


In [53]:
gold_df["id"] = gold_df["id"].apply(lambda x: _id_to_pydi_id(x, academy_awards_df, "id", "academy_awards_id"))
display(gold_df.head(3))

Unnamed: 0,id,title,director_name,actor_name,date,oscar
0,academy_awards-1887,One Flew over the Cuckoo's Nest,Milos Forman,Jack Nicholson,1975-01-01,yes
1,academy_awards-3632,Gaslight,,Charles Boyer,1944-01-01,yes
2,academy_awards-3556,Mildred Pierce,,Joan Crawford,1945-01-01,yes


In [62]:
# Create evaluator and run against gold standard
evaluator = DataFusionEvaluator(fusion_strategy)

# Perform evaluation
eval_results = evaluator.evaluate(
    fused_df=fused_dataset,
    fused_id_column="academy_awards_id",
    gold_df=gold_df,
    gold_id_column="id"  
)

print(f"\nAccuracy Evaluation Results:")
print(f"=" * 35)
for metric, value in eval_results.items():
    if isinstance(value, float):
        print(f"  {metric}: {value:.3f}")
    else:
        print(f"  {metric}: {value}")

# Show detailed per-attribute accuracy if available
attribute_accuracies = {k: v for k, v in eval_results.items() if k.endswith('_accuracy') and not k.startswith(('overall', 'macro', 'micro'))}
if attribute_accuracies:
    print(f"\nPer-Attribute Accuracy:")
    for attr, accuracy in sorted(attribute_accuracies.items()):
        attr_name = attr.replace('_accuracy', '')
        print(f"  {attr_name}: {accuracy:.3f}")


INFO: Starting fusion evaluation
INFO: Evaluation complete: 0.553 overall accuracy (47/85)



Accuracy Evaluation Results:
  overall_accuracy: 0.553
  macro_accuracy: 0.522
  micro_accuracy: 0.553
  num_evaluated_records: 18
  num_evaluated_attributes: 5
  total_evaluations: 85
  total_correct: 47
  title_accuracy: 0.722
  title_count: 18
  director_name_accuracy: 0.000
  director_name_count: 13
  actor_name_accuracy: 0.889
  actor_name_count: 18
  oscar_accuracy: 1.000
  oscar_count: 18
  date_accuracy: 0.000
  date_count: 18

Per-Attribute Accuracy:
  actor_name: 0.889
  date: 0.000
  director_name: 0.000
  oscar: 1.000
  title: 0.722


In [55]:
fusion_report = FusionReport(
    fused_df=fused_dataset,
    input_datasets=[academy_awards_df, actors_df, golden_globes_df],
    strategy_name=fusion_strategy.name,
    correspondences=all_correspondences
)

print("Generating Comprehensive Fusion Report...")
print("=" * 50)

fusion_report.print_summary()

INFO: Analyzed 12 attributes across 3 datasets
INFO: Initialized AttributeCoverageAnalyzer for 3 datasets


Generating Comprehensive Fusion Report...

=== PyDI Data Fusion Report ===
Generated: 2025-09-02 15:04:38
Strategy: movies_custom

📊 Data Summary:
  Input datasets: 3
  Input records: 7029
  Output records: 146
  Correspondences: 264
  Record coverage: 2.08%

📈 Quality Metrics:
  Mean confidence: 0.506
  Multi-source records: 146
  Single-source records: 0

👥 Group Statistics:
  Total groups: 146
  Multi-record groups: 146
  Average group size: 2.03
  Largest group: 3 records

🏷️  Attribute Statistics:
  Total attributes: 13
  Attributes with conflicts: 0
  Most conflicted: actors_id

⚙️  Rule Usage:
  best_award_rule: 150 applications
  first_non_null: 880 applications
  longest_string: 146 applications
  most_recent: 146 applications
  union: 146 applications
  voting: 146 applications



In [56]:
# Generate detailed quality and coverage metrics
quality_metrics = FusionQualityMetrics.calculate_consistency_metrics(fused_dataset)
coverage_metrics = FusionQualityMetrics.calculate_coverage_metrics(datasets, fused_dataset)

print("Advanced Quality Metrics:")
print("=" * 40)

print(f"\nQuality Metrics:")
for key, value in quality_metrics.items():
    if isinstance(value, dict) and len(value) <= 10:  # Don't overwhelm with huge dicts
        print(f"  {key}:")
        for sub_key, sub_value in list(value.items())[:5]:  # Limit sub-items
            print(f"    {sub_key}: {sub_value}")
        if len(value) > 5:
            print(f"    ... and {len(value)-5} more")
    elif not isinstance(value, dict):
        if isinstance(value, float):
            print(f"  {key}: {value:.3f}")
        else:
            print(f"  {key}: {value}")

print(f"\nCoverage Metrics:")
for key, value in coverage_metrics.items():
    if isinstance(value, float):
        print(f"  {key}: {value:.3f}")
    else:
        print(f"  {key}: {value}")

# Show sample JSON export structure (without saving)
print(f"\nExport Capabilities:")
report_json = fusion_report.to_json()

Advanced Quality Metrics:

Quality Metrics:
  mean_confidence: 0.506
  std_confidence: 0.043
  min_confidence: 0.418
  max_confidence: 0.558
  multi_source_records: 146
  single_source_records: 0
  mean_sources_per_record: 2.027
  rule_usage:
    first_non_null: 880
    longest_string: 146
    union: 146
    voting: 146
    best_award_rule: 150
    ... and 1 more
  num_unique_rules: 6

Coverage Metrics:
  record_coverage: 0.021
  attribute_coverage: 1.000
  total_input_records: 7029
  total_output_records: 146
  total_input_attributes: 12
  total_output_attributes: 13

Export Capabilities:


## 6. Record Grouping Analysis

Examine PyDI's connected components algorithm in detail.

In [61]:
# Analyze the connected components grouping process
record_groups = build_record_groups_from_correspondences(
    [academy_awards_df, actors_df, golden_globes_df],
    all_correspondences
)

print(f"Connected Components Analysis:")
print(f"=" * 45)

# Group size analysis
group_sizes = [len(group.records) for group in record_groups]
group_size_distribution = pd.Series(group_sizes).value_counts().sort_index()

print(f"\nGroup Statistics:")
print(f"  Total groups created: {len(record_groups)}")
print(f"  Average group size: {np.mean(group_sizes):.2f}")
print(f"  Largest group size: {max(group_sizes)}")

print(f"\nGroup Size Distribution:")
for size, count in group_size_distribution.items():
    percentage = count / len(record_groups) * 100
    print(f"  {size} record(s): {count} groups ({percentage:.1f}%)")

# Focus on multi-record groups (the interesting ones for fusion)
multi_record_groups = [g for g in record_groups if len(g.records) > 1]
print(f"\nMulti-Record Groups Details:")
print(f"  Count: {len(multi_record_groups)}")
print(f"  Percentage: {len(multi_record_groups)/len(record_groups):.1%}")


INFO: Created 7149 record groups from 264 correspondences
INFO: Groups: 2 multi-record, 7005 singleton


Connected Components Analysis:

Group Statistics:
  Total groups created: 7149
  Average group size: 0.98
  Largest group size: 3

Group Size Distribution:
  0 record(s): 142 groups (2.0%)
  1 record(s): 7005 groups (98.0%)
  2 record(s): 1 groups (0.0%)
  3 record(s): 1 groups (0.0%)

Multi-Record Groups Details:
  Count: 2
  Percentage: 0.0%


## 7. Provenance and Trust Analysis

Demonstrate PyDI's provenance tracking using the loaded Winter datasets.

In [58]:
# Set up comprehensive provenance tracking
provenance_tracker = ProvenanceTracker()

print("Setting up Provenance Tracking for Winter Datasets...")

# Register data sources with differential trust scores based on Winter characteristics
# Academy Awards: Generally very reliable, official source
provenance_tracker.register_dataset_source('academy_awards', trust_score=0.95)
# Golden Globes: Also reliable, but sometimes has minor discrepancies  
provenance_tracker.register_dataset_source('golden_globes', trust_score=0.90)
# Actors: Useful for cast info, but sometimes inconsistent on dates/details
provenance_tracker.register_dataset_source('actors', trust_score=0.85)

# Track all input datasets
for df in [academy_awards_df, actors_df, golden_globes_df]:
    dataset_name = df.attrs.get('dataset_name', 'unknown')
    provenance_tracker.track_input_data(df, dataset_name)
    print(f"  Tracked {len(df)} records from {dataset_name}")

print(f"\nProvenance Analysis:")
source_stats = provenance_tracker.get_source_statistics()

for source, stats in source_stats.items():
    print(f"\n{source.title()} Dataset:")
    print(f"  Records tracked: {stats['record_count']:,}")
    print(f"  Trust score: {stats['trust_score']:.2f}")
    print(f"  Average confidence: {stats['average_confidence']:.3f}")
    print(f"  Data contribution: {stats['contribution_ratio']:.1%}")

# Simulate some fusion result tracking
if len(multi_record_groups) > 0:
    print(f"\nSample Fusion Provenance Tracking:")
    
    # Track a sample fusion result
    sample_group = multi_record_groups[0]
    source_ids = [record.get('id', record.get('_id', '')) for record in sample_group.records]
    source_ids = [sid for sid in source_ids if sid]  # Remove empty IDs
    
    if source_ids:
        fused_id = f"fused_{sample_group.group_id}"
        provenance_tracker.track_fusion_result(
            fused_id=fused_id,
            source_ids=source_ids,
            operation="winter_movie_fusion",
            confidence=0.85,
            metadata={"fusion_strategy": fusion_strategy.name, "rule_count": len(fusion_strategy.get_registered_attributes())}
        )
        
        # Show provenance for this fused record
        fused_provenance = provenance_tracker.get_provenance(fused_id)
        if fused_provenance:
            print(f"  Fused record: {fused_id}")
            print(f"  Source datasets: {list(fused_provenance.sources)}")
            print(f"  Operation: {fused_provenance.operation}")
            print(f"  Confidence: {fused_provenance.confidence:.3f}")
            print(f"  Timestamp: {fused_provenance.timestamp}")

Setting up Provenance Tracking for Winter Datasets...
  Tracked 4592 records from academy_awards
  Tracked 151 records from actors
  Tracked 2286 records from golden_globes

Provenance Analysis:

Sample Fusion Provenance Tracking:
  Fused record: fused_group_21
  Source datasets: []
  Operation: winter_movie_fusion
  Confidence: 0.850
  Timestamp: 2025-09-02 15:04:38.629762


In [59]:
# Comprehensive performance analysis
print("PyDI Fusion Performance Analysis:")
print("=" * 45)

# Calculate detailed metrics
total_input_records = sum(len(df) for df in datasets)
total_attributes = sum(len(df.columns) for df in datasets)
total_correspondences = len(all_correspondences)
fusion_rules = len(fusion_strategy.get_registered_attributes())

print(f"\nProcessing Statistics:")
print(f"  Input datasets: {len(datasets)}")
print(f"  Total input records: {total_input_records:,}")
print(f"  Total input attributes: {total_attributes}")
print(f"  Correspondences processed: {total_correspondences:,}")
print(f"  Fusion rules applied: {fusion_rules}")
print(f"  Output records: {len(fused_dataset):,}")

print(f"\nPerformance Metrics:")
print(f"  Total processing time: {runtime:.3f} seconds")
print(f"  Records per second: {total_input_records/runtime:.0f}")
print(f"  Correspondences per second: {total_correspondences/runtime:.0f}")
print(f"  Data reduction achieved: {total_input_records - len(fused_dataset):,} records ({(total_input_records - len(fused_dataset))/total_input_records:.1%})")

# Memory efficiency analysis
import sys

def get_size_mb(obj):
    return sys.getsizeof(obj) / (1024 * 1024)

input_size = sum(get_size_mb(df) for df in datasets)
output_size = get_size_mb(fused_dataset)
corr_size = get_size_mb(all_correspondences)

print(f"\nMemory Efficiency:")
print(f"  Input datasets: {input_size:.2f} MB")
print(f"  Correspondences: {corr_size:.2f} MB") 
print(f"  Output dataset: {output_size:.2f} MB")
print(f"  Memory reduction: {input_size - output_size:.2f} MB ({(input_size - output_size)/input_size:.1%})")

# Quality vs Speed analysis
if '_fusion_confidence' in fused_dataset.columns:
    avg_confidence = fused_dataset['_fusion_confidence'].mean()
    high_confidence_ratio = (fused_dataset['_fusion_confidence'] >= 0.8).mean()
    
    print(f"\nQuality vs Performance:")
    print(f"  Average confidence: {avg_confidence:.3f}")
    print(f"  High confidence ratio: {high_confidence_ratio:.1%}")
    print(f"  Quality per second: {avg_confidence * total_input_records / runtime:.0f}")
    
# Scalability projection
print(f"\nScalability Projections:")
records_per_sec = total_input_records / runtime
print(f"  10K records: ~{10000/records_per_sec:.1f} seconds")
print(f"  100K records: ~{100000/records_per_sec:.1f} seconds")
print(f"  1M records: ~{1000000/records_per_sec/60:.1f} minutes")

PyDI Fusion Performance Analysis:

Processing Statistics:
  Input datasets: 3
  Total input records: 7,029
  Total input attributes: 21
  Correspondences processed: 264
  Fusion rules applied: 6
  Output records: 146

Performance Metrics:
  Total processing time: 0.282 seconds
  Records per second: 24916
  Correspondences per second: 936
  Data reduction achieved: 6,883 records (97.9%)

Memory Efficiency:
  Input datasets: 2.55 MB
  Correspondences: 0.03 MB
  Output dataset: 0.19 MB
  Memory reduction: 2.36 MB (92.6%)

Quality vs Performance:
  Average confidence: 0.506
  High confidence ratio: 0.0%
  Quality per second: 12607

Scalability Projections:
  10K records: ~0.4 seconds
  100K records: ~4.0 seconds
  1M records: ~0.7 minutes


In [60]:
# Export results with proper organization and artifact management
from datetime import datetime
import json
from pathlib import Path

print("Comprehensive Export and Artifact Management:")
print("=" * 50)

# Create organized output directory structure
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_base = Path("output") / f"winter_fusion_{timestamp}"
output_base.mkdir(parents=True, exist_ok=True)

# Create subdirectories
(output_base / "datasets").mkdir(exist_ok=True)
(output_base / "reports").mkdir(exist_ok=True)
(output_base / "evaluation").mkdir(exist_ok=True)
(output_base / "provenance").mkdir(exist_ok=True)

print(f"Output directory: {output_base}")

# Export fused dataset in multiple formats
print("\nExporting fused dataset...")
fused_dataset.to_csv(output_base / "datasets" / "fused_movies.csv", index=False)

# Now Parquet should work with pyarrow installed
try:
    fused_dataset.to_parquet(output_base / "datasets" / "fused_movies.parquet")
    print("  ✓ Parquet export successful")
except Exception as e:
    print(f"  ✗ Parquet export failed: {e}")

# Export as XML (demonstrate XML export)
def dataframe_to_xml(df, path, root_tag="movies", record_tag="movie"):
    """Export DataFrame to XML format."""
    from xml.etree.ElementTree import Element, SubElement, tostring
    from xml.dom import minidom
    
    root = Element(root_tag)
    
    for _, row in df.head(10).iterrows():  # Limit to first 10 for demo
        movie = SubElement(root, record_tag)
        for col, value in row.items():
            if pd.notna(value) and col not in ['_fusion_group_id', '_fusion_sources', '_fusion_confidence']:
                elem = SubElement(movie, col.replace('_', ''))
                elem.text = str(value)
    
    # Pretty print
    rough_string = tostring(root, 'unicode')
    reparsed = minidom.parseString(rough_string)
    
    with open(path, 'w', encoding='utf-8') as f:
        f.write(reparsed.toprettyxml(indent="  "))
    
    return len(df.head(10))

xml_records = dataframe_to_xml(fused_dataset, output_base / "datasets" / "fused_movies_sample.xml")
print(f"  ✓ XML export successful ({xml_records} records)")

# Export comprehensive fusion report
print("\nExporting fusion reports...")
fusion_report.export_detailed_results(output_base / "reports")
fusion_report.to_html(output_base / "reports" / "fusion_report.html")

# Export evaluation results
print("Exporting evaluation results...")
with open(output_base / "evaluation" / "accuracy_results.json", 'w') as f:
    json.dump(eval_results, f, indent=2, default=str)

# Export rule usage metrics (capture what rules were actually used)
print("Capturing rule usage metrics...")
rule_usage_metrics = {
    "strategy_name": fusion_strategy.name,
    "total_rules_defined": len(fusion_strategy.get_registered_attributes()),
    "registered_attributes": list(fusion_strategy.get_registered_attributes()),
    "runtime": runtime,
    "multi_record_groups": 0,  # From our analysis above
    "singleton_groups": len(fused_dataset),
    "rules_applied": {
        "title": "longest_string",
        "date": "most_recent", 
        "director_name": "union",
        "actor_name": "voting",
        "oscar": "best_award_rule",
        "globe": "best_award_rule"
    },
    "custom_rules": ["best_award_rule", "trust_weighted_fusion_rule"]
}

with open(output_base / "evaluation" / "rule_usage_metrics.json", 'w') as f:
    json.dump(rule_usage_metrics, f, indent=2)

# Export provenance data
print("Exporting provenance data...")
provenance_data = provenance_tracker.export_provenance()
with open(output_base / "provenance" / "provenance_tracking.json", 'w') as f:
    json.dump(provenance_data, f, indent=2, default=str)

# Create comprehensive manifest
manifest = {
    "experiment": {
        "name": "PyDI Winter Dataset Fusion Showcase",
        "timestamp": timestamp,
        "version": "1.0",
        "description": "Data fusion demonstration using Winter movie datasets"
    },
    "inputs": {
        "datasets": ["academy_awards.xml", "actors.xml", "golden_globes.xml"],
        "correspondences": ["academy_awards_2_actors_correspondences.csv", "actors_2_golden_globes_correspondences.csv"],
        "gold_standard": "gold.xml"
    },
    "processing": {
        "fusion_strategy": fusion_strategy.name,
        "processing_time_seconds": runtime,
        "total_input_records": sum(len(df) for df in datasets),
        "total_output_records": len(fused_dataset),
        "correspondences_processed": len(all_correspondences)
    },
    "outputs": {
        "datasets": {
            "fused_movies.csv": f"{len(fused_dataset)} records in CSV format",
            "fused_movies.parquet": f"{len(fused_dataset)} records in Parquet format",  
            "fused_movies_sample.xml": f"{xml_records} records in XML format (sample)"
        },
    },
    "metrics": {
        "accuracy": eval_results.get("overall_accuracy", 0),
        "confidence": fused_dataset["_fusion_confidence"].mean() if "_fusion_confidence" in fused_dataset.columns else None,
        "coverage": eval_results.get("num_evaluated_records", 0) / len(gold_df) if len(gold_df) > 0 else 0
    }
}

with open(output_base / "manifest.json", 'w') as f:
    json.dump(manifest, f, indent=2)

Comprehensive Export and Artifact Management:
Output directory: output/winter_fusion_20250902_150438

Exporting fused dataset...
  ✓ Parquet export successful


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [None]:
# Export fused dataset to CSV and Parquet
fused_dataset.to_csv('winter_movies_fused.csv', index=False)
fused_dataset.to_parquet('winter_movies_fused.parquet')

# Export comprehensive fusion report
fusion_report.export_detailed_results('output/winter_fusion/')
fusion_report.to_html('fusion_report.html')

# Export provenance data
provenance_data = provenance_tracker.export_provenance()
import json
with open('provenance.json', 'w') as f:
    json.dump(provenance_data, f, indent=2)

INFO: Report saved to output/winter_fusion/fusion_report.json
INFO: HTML report saved to output/winter_fusion/fusion_report.html
INFO: Detailed results exported to output/winter_fusion/
INFO: HTML report saved to fusion_report.html
