# PyDI Data Fusion Framework Showcase

This notebook demonstrates the data fusion capabilities of PyDI. We'll show:

1. **Loading and preparing Winter movie datasets**
2. **Creating sophisticated fusion strategies**
3. **Running the fusion engine with connected components grouping**
4. **Evaluating fusion quality**
5. **Generating reports**
6. **Custom conflict resolution rules**
7. **Provenance tracking**


In [1]:
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
from pathlib import Path
import logging
from datetime import datetime
import warnings
from typing import List, Any, Tuple, Dict
import pandas as pd

warnings.filterwarnings('ignore')

# Configure logging for debug level and save output to a file
log_file = "data_fusion_debug.log"
logging.basicConfig(
    level=logging.DEBUG,
    format='%(levelname)s: %(message)s',
    handlers=[
        logging.FileHandler(log_file, mode='w'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

logger.debug("Imports successful")

DEBUG: Imports successful


In [2]:
# Import PyDI fusion components
from PyDI.fusion import (
    # Core engine and strategy
    DataFusionEngine,
    DataFusionStrategy,
    DataFusionEvaluator,
    
    # Reporting and evaluation
    FusionReport,
    ProvenanceTracker,
    calculate_consistency_metrics,
        
    # Analysis
    analyze_attribute_coverage,
    analyze_conflicts_preview,
    print_conflict_preview,
    
    # Conflict Resolution
    longest_string,
    union as Union,
    most_recent,
    
    #evaluation
    tokenized_match, year_only_match, boolean_match
    
    
)

from PyDI.io import load_xml

print("✅ PyDI fusion components imported successfully")

✅ PyDI fusion components imported successfully


## 1. Loading Winter Movie Datasets

We'll load the movie datasets from Winter's XML files and convert them to pandas DataFrames suitable for fusion.

In [3]:
# Define paths to Winter datasets
data_dir = Path('/Users/aaronsteiner/Documents/GitHub/PyDI/input/movies/fusion/data')
correspondences_dir = Path('/Users/aaronsteiner/Documents/GitHub/PyDI/input/movies/fusion/correspondences')

print(f"Data directory: {data_dir}")
print(f"Correspondences directory: {correspondences_dir}")

Data directory: /Users/aaronsteiner/Documents/GitHub/PyDI/input/movies/fusion/data
Correspondences directory: /Users/aaronsteiner/Documents/GitHub/PyDI/input/movies/fusion/correspondences


In [4]:
# Load the three movie datasets
academy_awards_df = load_xml(f"{data_dir}/academy_awards.xml", name='academy_awards', nested_handling="aggregate")
actors_df = load_xml(f"{data_dir}/actors.xml", name='actors', nested_handling="aggregate")
golden_globes_df = load_xml(f"{data_dir}/golden_globes.xml", name='golden_globes', nested_handling="aggregate")


INFO: Loaded dataset 'academy_awards' via read_xml_aggregated: shape=(4580, 7), source=/Users/aaronsteiner/Documents/GitHub/PyDI/input/movies/fusion/data/academy_awards.xml
INFO: Loaded dataset 'actors' via read_xml_aggregated: shape=(151, 7), source=/Users/aaronsteiner/Documents/GitHub/PyDI/input/movies/fusion/data/actors.xml
INFO: Loaded dataset 'golden_globes' via read_xml_aggregated: shape=(2279, 7), source=/Users/aaronsteiner/Documents/GitHub/PyDI/input/movies/fusion/data/golden_globes.xml


In [5]:
# Display sample data from each dataset
print(f"Academy Awards Sample(n={len(academy_awards_df)}):")
display(academy_awards_df.head())

print(f"\nActors Sample(n={len(actors_df)}):")
display(actors_df.head())

print(f"\nGolden Globes Sample(n={len(golden_globes_df)}):")
display(golden_globes_df.head())

Academy Awards Sample(n=4580):


Unnamed: 0,academy_awards_id,id,title,actors_actor_name,date,director_name,oscar
0,academy_awards-0000,academy_awards_1,Biutiful,Javier Bardem,2010-01-01,,
1,academy_awards-0001,academy_awards_2,True Grit,"[Jeff Bridges, Hailee Steinfeld]",2010-01-01,Joel Coen and Ethan Coen,
2,academy_awards-0002,academy_awards_3,The Social Network,Jesse Eisenberg,2010-01-01,David Fincher,yes
3,academy_awards-0003,academy_awards_4,The King's Speech,"[Colin Firth, Geoffrey Rush, Helena Bonham Car...",2010-01-01,Tom Hooper,yes
4,academy_awards-0004,academy_awards_5,127 Hours,James Franco,2010-01-01,,



Actors Sample(n=151):


Unnamed: 0,actors_id,id,title,actors_actor_name,actors_actor_birthday,actors_actor_birthplace,date
0,actors-0000,actors_1,7th Heaven,Janet Gaynor,1906-01-01,Pennsylvania,1929-01-01
1,actors-0001,actors_2,Coquette,Mary Pickford,1892-01-01,Canada,1930-01-01
2,actors-0002,actors_3,The Divorcee,Norma Shearer,1902-01-01,Canada,1931-01-01
3,actors-0003,actors_4,Min and Bill,Marie Dressler,1868-01-01,Canada,1932-01-01
4,actors-0004,actors_5,The Sin of Madelon Claudet,Helen Hayes,1900-01-01,Washington DC,1933-01-01



Golden Globes Sample(n=2279):


Unnamed: 0,golden_globes_id,id,title,actors_actor_name,date,director_name,globe
0,golden_globes-0000,golden_globes_1,Frankie and Alice,Halle Berry,2011-01-01,,
1,golden_globes-0001,golden_globes_2,Rabbit Hole,Nicole Kidman,2011-01-01,,
2,golden_globes-0002,golden_globes_3,Winter's Bone,Jennifer Lawrence,2011-01-01,,
3,golden_globes-0003,golden_globes_4,Black Swan,"[Natalie Portman, Mila Kunis]",2011-01-01,Darren Aronofsky,yes
4,golden_globes-0004,golden_globes_5,Blue Valentine,"[Michelle Williams, Ryan Gosling]",2011-01-01,,


In [6]:
# Load correspondences (matches between datasets)
def load_correspondences(corr_path):
    """Load correspondences from Winter CSV format."""
    df = pd.read_csv(corr_path, header=None, names=['id1', 'id2', 'score'])
    return df

# Load all correspondence files
corr_aa_actors = load_correspondences(correspondences_dir / 'academy_awards_2_actors_correspondences.csv')
corr_actors_gg = load_correspondences(correspondences_dir / 'actors_2_golden_globes_correspondences.csv')

print("🔗 Correspondences Overview:")
print(f"Academy Awards ↔ Actors: {len(corr_aa_actors)} matches")
print(f"Actors ↔ Golden Globes: {len(corr_actors_gg)} matches")

# Combine correspondences for fusion
all_correspondences = pd.concat([corr_aa_actors, corr_actors_gg], ignore_index=True)
print(f"\nTotal correspondences: {len(all_correspondences)}")

display(all_correspondences.head(10))

🔗 Correspondences Overview:
Academy Awards ↔ Actors: 150 matches
Actors ↔ Golden Globes: 107 matches

Total correspondences: 257


Unnamed: 0,id1,id2,score
0,academy_awards_4557,actors_1,1.0
1,academy_awards_4529,actors_2,1.0
2,academy_awards_4500,actors_3,1.0
3,academy_awards_4475,actors_4,1.0
4,academy_awards_4446,actors_5,1.0
5,academy_awards_4399,actors_6,1.0
6,academy_awards_4363,actors_7,1.0
7,academy_awards_4320,actors_8,1.0
8,academy_awards_4270,actors_9,1.0
9,academy_awards_4207,actors_10,1.0


## 2. Exploring Data Quality and Overlap

Before fusion, let's analyze the data quality and understand what conflicts we might encounter.

In [7]:
# Analyze attribute coverage across datasets
datasets = [academy_awards_df, actors_df, golden_globes_df]
dataset_names = ['Academy Awards', 'Actors', 'Golden Globes']
ids = ['academy_awards_id', 'actors_id', 'golden_globes_id']

# Use the existing function instead of manual loops
coverage_df = analyze_attribute_coverage(datasets, dataset_names)

# Display the results
display(coverage_df)

INFO: Analyzed 12 attributes across 3 datasets


Unnamed: 0,attribute,Academy Awards_count,Academy Awards_pct,Academy Awards_coverage,Academy Awards_samples,Actors_count,Actors_pct,Actors_coverage,Actors_samples,Golden Globes_count,Golden Globes_pct,Golden Globes_coverage,Golden Globes_samples,avg_coverage,max_coverage,datasets_with_attribute
0,academy_awards_id,4580/4580,100.0%,1.0,"['academy_awards-0000', 'academy_awards-0001']",0/0,0%,0.0,,0/0,0%,0.0,,0.333333,1.0,1
1,actors_actor_birthday,0/0,0%,0.0,,151/151,100.0%,1.0,"['1906-01-01', '1892-01-01']",0/0,0%,0.0,,0.333333,1.0,1
2,actors_actor_birthplace,0/0,0%,0.0,,151/151,100.0%,1.0,"['Pennsylvania', 'Canada']",0/0,0%,0.0,,0.333333,1.0,1
3,actors_actor_name,1049/4580,22.9%,0.229039,"['Javier Bardem', ['Jeff Bridges', 'Hailee Ste...",151/151,100.0%,1.0,"['Janet Gaynor', 'Mary Pickford']",2225/2279,97.6%,0.976305,"['Halle Berry', 'Nicole Kidman']",0.735115,1.0,3
4,actors_id,0/0,0%,0.0,,151/151,100.0%,1.0,"['actors-0000', 'actors-0001']",0/0,0%,0.0,,0.333333,1.0,1
5,date,4580/4580,100.0%,1.0,"['2010-01-01', '2010-01-01']",151/151,100.0%,1.0,"['1929-01-01', '1930-01-01']",2279/2279,100.0%,1.0,"['2011-01-01', '2011-01-01']",1.0,1.0,3
6,director_name,408/4580,8.9%,0.089083,"['Joel Coen and Ethan Coen', 'David Fincher']",0/0,0%,0.0,,313/2279,13.7%,0.137341,"['Darren Aronofsky', 'David Fincher']",0.075475,0.137341,2
7,globe,0/0,0%,0.0,,0/0,0%,0.0,,622/2279,27.3%,0.272927,"['yes', 'yes']",0.090976,0.272927,1
8,golden_globes_id,0/0,0%,0.0,,0/0,0%,0.0,,2279/2279,100.0%,1.0,"['golden_globes-0000', 'golden_globes-0001']",0.333333,1.0,1
9,id,4580/4580,100.0%,1.0,"['academy_awards_1', 'academy_awards_2']",151/151,100.0%,1.0,"['actors_1', 'actors_2']",2279/2279,100.0%,1.0,"['golden_globes_1', 'golden_globes_2']",1.0,1.0,3


In [8]:
analyze_conflicts_preview(datasets, all_correspondences, sample_size=200, id_columns=ids)

INFO: Analyzing conflicts in 200 correspondence pairs
DEBUG: Built lookup table with 7010 records
DEBUG: Sample dataset IDs: ['academy_awards-0000', 'academy_awards-0001', 'academy_awards-0002', 'academy_awards-0003', 'academy_awards-0004', 'academy_awards-0005', 'academy_awards-0006', 'academy_awards-0007', 'academy_awards-0008', 'academy_awards-0009']
DEBUG: Sample correspondence IDs: id1=['academy_awards_4557', 'academy_awards_4529', 'academy_awards_4500', 'academy_awards_4475', 'academy_awards_4446'], id2=['actors_1', 'actors_2', 'actors_3', 'actors_4', 'actors_5']
DEBUG: Missing record(s) for correspondence 1: ['academy_awards_4557', 'actors_1']
DEBUG: Missing record(s) for correspondence 2: ['academy_awards_4529', 'actors_2']
DEBUG: Missing record(s) for correspondence 3: ['academy_awards_4500', 'actors_3']
DEBUG: Missing record(s) for correspondence 4: ['academy_awards_4475', 'actors_4']
DEBUG: Missing record(s) for correspondence 5: ['academy_awards_4446', 'actors_5']
DEBUG: Mi

{'conflict_examples': [],
 'conflict_summary': {'total_matches': 200,
  'matches_with_conflicts': 0,
  'conflict_rate': 0.0,
  'total_attribute_conflicts': 0},
 'attribute_conflicts': {},
 'diagnostics': {'records_not_found': 200,
  'no_common_attributes': 0,
  'identical_values': 0,
  'total_comparisons': 0,
  'lookup_table_size': 7010,
  'processed_pairs': 0}}

## 3. Creating a Fusion Strategy

Now we'll create a fusion strategy that handles different types of conflicts intelligently.

In [9]:
def smart_date_fusion(values: List[Any], **kwargs) -> Tuple[Any, float, Dict[str, Any]]:
    """Smart date fusion that handles different date formats."""
    if not values:
        return None, 0.0, {"reason": "no_values"}
    
    clean_values = [v for v in values if pd.notna(v)]
    if not clean_values:
        return None, 0.0, {"reason": "no_valid_values"}
    
    # Parse dates and score them by precision
    parsed_dates = []
    for val in clean_values:
        date_str = str(val)
        precision_score = 0
        
        # Score based on precision (more specific dates get higher scores)
        if len(date_str) >= 10:  # Full date YYYY-MM-DD
            precision_score = 3
        elif len(date_str) >= 7:   # Year-Month YYYY-MM
            precision_score = 2
        elif len(date_str) >= 4:   # Just year YYYY
            precision_score = 1
        
        parsed_dates.append((val, precision_score))
    
    # Choose date with highest precision
    best_date = max(parsed_dates, key=lambda x: x[1])
    
    # Calculate confidence
    max_score = best_date[1]
    confidence = min(1.0, 0.5 + max_score * 0.15)
    
    return best_date[0], confidence, {
        "rule": "smart_date_fusion",
        "precision_score": max_score,
        "candidates": clean_values
    }

In [10]:
# Create a comprehensive fusion strategy
movie_strategy = DataFusionStrategy("comprehensive_movie_fusion")

# Configure fusion rules for each attribute
movie_strategy.add_attribute_fuser_from_resolver("title", longest_string
                                                 )
movie_strategy.add_attribute_fuser_from_resolver("director", Union)  
movie_strategy.add_attribute_fuser_from_resolver("actors_actor_name", Union, separator=", ")
movie_strategy.add_attribute_fuser_from_resolver("date", smart_date_fusion)  

print("Fusion Strategy Configuration:")
print(f"Strategy name: {movie_strategy.name}")
print(f"Registered attributes: {list(movie_strategy.get_registered_attributes())}")

# Show the rules for each attribute
for attr in movie_strategy.get_registered_attributes():
    fuser = movie_strategy.get_attribute_fuser(attr)
    resolver_name = getattr(fuser.resolver, 'name', None) or getattr(fuser.resolver, '__name__', str(fuser.resolver))
    print(f"  {attr}: {resolver_name}")

INFO: Registered fuser for attribute 'title' using rule 'longest_string'
INFO: Registered fuser for attribute 'director' using rule 'union'
INFO: Registered fuser for attribute 'actors_actor_name' using rule 'union'
INFO: Registered fuser for attribute 'date' using rule 'smart_date_fusion'


Fusion Strategy Configuration:
Strategy name: comprehensive_movie_fusion
Registered attributes: ['actors_actor_name', 'director', 'title', 'date']
  actors_actor_name: union
  director: union
  title: longest_string
  date: smart_date_fusion


## 4. Running the Fusion Engine

Now we'll execute the fusion process using the DataFusionEngine with connected components grouping.

In [11]:
all_correspondences

Unnamed: 0,id1,id2,score
0,academy_awards_4557,actors_1,1.0
1,academy_awards_4529,actors_2,1.0
2,academy_awards_4500,actors_3,1.0
3,academy_awards_4475,actors_4,1.0
4,academy_awards_4446,actors_5,1.0
...,...,...,...
252,actors_146,golden_globes_562,1.0
253,actors_147,golden_globes_513,1.0
254,actors_148,golden_globes_463,1.0
255,actors_149,golden_globes_417,1.0


In [12]:
# We need to transform the ids in the correspondences to match the ids in the datasets
def _id_to_pydi_id(id: str, df: pd.DataFrame, lookup_column: str = "_id", return_column: str = "id"):
    values = df[df[lookup_column] == id][return_column].values
    if len(values) == 0:
        return id
    return values[0]

# Apply the transformation to the correspondences
all_correspondences["id1"] = all_correspondences["id1"].apply(lambda x: _id_to_pydi_id(x, academy_awards_df, "id", "academy_awards_id"))
all_correspondences["id2"] = all_correspondences["id2"].apply(lambda x: _id_to_pydi_id(x, academy_awards_df, "id", "academy_awards_id"))

all_correspondences["id1"] = all_correspondences["id1"].apply(lambda x: _id_to_pydi_id(x, actors_df, "id", "actors_id"))
all_correspondences["id2"] = all_correspondences["id2"].apply(lambda x: _id_to_pydi_id(x, actors_df, "id", "actors_id"))

all_correspondences["id1"] = all_correspondences["id1"].apply(lambda x: _id_to_pydi_id(x, golden_globes_df, "id", "golden_globes_id"))
all_correspondences["id2"] = all_correspondences["id2"].apply(lambda x: _id_to_pydi_id(x, golden_globes_df, "id", "golden_globes_id"))

all_correspondences


Unnamed: 0,id1,id2,score
0,academy_awards-4556,actors-0000,1.0
1,academy_awards-4528,actors-0001,1.0
2,academy_awards-4499,actors-0002,1.0
3,academy_awards-4474,actors-0003,1.0
4,academy_awards-4445,actors-0004,1.0
...,...,...,...
252,actors-0145,golden_globes-0561,1.0
253,actors-0146,golden_globes-0512,1.0
254,actors-0147,golden_globes-0462,1.0
255,actors-0148,golden_globes-0416,1.0


In [13]:
# Create and run the fusion engine
fusion_engine = DataFusionEngine(movie_strategy)

# Run fusion with all datasets and correspondences
fused_movies  = fusion_engine.run(
    datasets=[academy_awards_df, actors_df, golden_globes_df],
    correspondences=all_correspondences,
    id_column={"academy_awards": "academy_awards_id",
                "actors": "actors_id",
                "golden_globes": "golden_globes_id"},
    include_singletons=False
)

# Display the fusion result
fused_movies.head(5)

INFO: Starting data fusion with strategy 'comprehensive_movie_fusion'
INFO: Correspondence ID coverage: matched 403 of 403 unique IDs
INFO: Created 6755 record groups from 257 correspondences
INFO: Groups: 148 multi-record, 6607 singleton
DEBUG: Fusing attribute 'oscar' for group 'group_0' using default (first_non_null)
DEBUG:   Fused 'oscar': 'yes' (default, confidence: 0.5)
DEBUG: Fusing attribute '_id' for group 'group_0' using default (first_non_null)
DEBUG:   Fused '_id': 'academy_awards-4468' (default, confidence: 0.5)
DEBUG: Fusing attribute 'actors_actor_birthday' for group 'group_0' using default (first_non_null)
DEBUG:   Fused 'actors_actor_birthday': '1878-01-01' (default, confidence: 0.5)
DEBUG: Fusing attribute 'title' for group 'group_0' using fuser: longest_string
DEBUG: Fusion invocation: group_id=group_0, attribute=title, rule=longest_string
DEBUG:   Input: record_id=academy_awards-4468, value='A Free Soul'
DEBUG:   Input: record_id=actors-0081, value='A Free Soul'
DEB

Unnamed: 0,_id,_fusion_group_id,_fusion_sources,oscar,actors_actor_birthday,title,date,actors_id,actors_actor_name,id,actors_actor_birthplace,director_name,academy_awards_id,_fusion_confidence,_fusion_metadata,globe,golden_globes_id
0,academy_awards-4468,group_0,"[actors, academy_awards]",yes,1878-01-01,A Free Soul,1930-01-01,actors-0081,"[Lionel Barrymore, Norma Shearer]",academy_awards_4469,Pennsylvania,Clarence Brown,academy_awards-4468,0.586364,"{'oscar_rule': 'first_non_null', '_id_rule': '...",,
1,actors-0040,group_1,"[golden_globes, actors, academy_awards]",yes,1942-01-01,Funny Girl,1969-01-01,actors-0040,"[Barbra Streisand, Kay Medford]",actors_41,New York,William Wyler,academy_awards-2239,0.534615,"{'oscar_rule': 'first_non_null', '_id_rule': '...",,golden_globes-1978
2,golden_globes-1086,group_2,"[golden_globes, actors, academy_awards]",yes,1909-01-01,Driving Miss Daisy,1990-01-01,actors-0062,"[Dan Aykroyd, Jessica Tandy, Morgan Freeman]",golden_globes_1087,England,,academy_awards-1168,0.534615,"{'oscar_rule': 'first_non_null', '_id_rule': '...",yes,golden_globes-1086
3,actors-0105,group_3,"[golden_globes, actors, academy_awards]",yes,1917-01-01,Marty,1956-01-01,actors-0105,"[Betsy Blair, Ernest Borgnine, Joe Mantell]",actors_106,Connecticut,Delbert Mann,academy_awards-2942,0.573077,"{'oscar_rule': 'first_non_null', '_id_rule': '...",yes,golden_globes-2211
4,actors-0124,group_4,"[golden_globes, actors, academy_awards]",yes,1918-01-01,Harry and Tonto,1975-01-01,actors-0124,[Art Carney],actors_125,New York,,academy_awards-1935,0.534615,"{'oscar_rule': 'first_non_null', '_id_rule': '...",yes,golden_globes-1784


## 5. Creating a Fusion Report

PyDI's reporting framework provides detailed analytics and diagnostics for fusion results.

In [14]:
# Create a comprehensive fusion report
fusion_report = FusionReport(
    fused_df=fused_movies,
    input_datasets=[academy_awards_df, actors_df, golden_globes_df],
    strategy_name=movie_strategy.name,
    correspondences=all_correspondences,
)

# Display the comprehensive report
fusion_report.print_summary()

INFO: Analyzed 12 attributes across 3 datasets



=== PyDI Data Fusion Report ===
Generated: 2025-09-11 13:34:28
Strategy: comprehensive_movie_fusion

Data Summary:
  Input datasets: 3
  Input records: 7010
  Output records: 148
  Correspondences: 257
  Record coverage: 2.11%

Quality Metrics:
  Mean confidence: 0.555
  Multi-source records: 148
  Single-source records: 0

Group Statistics:
  Total groups: 148
  Multi-record groups: 148
  Average group size: 2.71
  Largest group: 3 records

Attribute Statistics:
  Total attributes: 13
  Attributes with conflicts: 2
  Most conflicted: director_name

Rule Usage:
  first_non_null: 1312 applications
  longest_string: 148 applications
  no_value: 82 applications
  smart_date_fusion: 148 applications
  union: 148 applications



## 6. Evaluating Fusion Quality Against Gold Standard

Now let's evaluate how well our fusion performed by comparing the results against a gold standard dataset.

In [24]:
# Load the gold standard dataset
gold_standard_path = f"{data_dir}/../splits/gold.xml"
gold_df = load_xml(gold_standard_path, name='gold_standard', add_index=False, nested_handling="aggregate")

gold_df.head(5)

INFO: Loaded dataset 'gold_standard' via read_xml_aggregated: shape=(20, 6), source=/Users/aaronsteiner/Documents/GitHub/PyDI/input/movies/fusion/data/../splits/gold.xml


Unnamed: 0,id,title,director_name,actors_actor_name,date,oscar
0,academy_awards_1880,One Flew over the Cuckoo's Nest,Milos Forman,"[Jack Nicholson, Brad Dourif, Louise Fletcher]",1975-01-01,yes
1,academy_awards_3624,Gaslight,,"[Charles Boyer, Ingrid Bergman, Angela Lansbury]",1944-01-01,yes
2,academy_awards_3548,Mildred Pierce,,"[Joan Crawford, Eve Arden, Ann Blyth]",1945-01-01,yes
3,academy_awards_4146,Jezebel,,"[Bette Davis, Fay Bainter]",1938-01-01,yes
4,academy_awards_3423,The Farmer's Daughter,,"[Charles Bickford, Loretta Young]",1947-01-01,yes


In [16]:
# We need to transform the ids in the correspondences to match the ids in the datasets
def _id_to_pydi_id(id: str, df: pd.DataFrame, lookup_column: str = "_id", return_column: str = "id"):
    values = df[df[lookup_column] == id][return_column].values
    if len(values) == 0:
        return id
    return values[0]

gold_df["academy_awards_id"] = gold_df["id"].apply(lambda x: _id_to_pydi_id(x, academy_awards_df, "id", "academy_awards_id"))


In [17]:
gold_df.head(5)

Unnamed: 0,id,title,director_name,actors_actor_name,date,oscar,academy_awards_id
0,academy_awards_1880,One Flew over the Cuckoo's Nest,Milos Forman,"[Jack Nicholson, Brad Dourif, Louise Fletcher]",1975-01-01,yes,academy_awards-1879
1,academy_awards_3624,Gaslight,,"[Charles Boyer, Ingrid Bergman, Angela Lansbury]",1944-01-01,yes,academy_awards-3623
2,academy_awards_3548,Mildred Pierce,,"[Joan Crawford, Eve Arden, Ann Blyth]",1945-01-01,yes,academy_awards-3547
3,academy_awards_4146,Jezebel,,"[Bette Davis, Fay Bainter]",1938-01-01,yes,academy_awards-4145
4,academy_awards_3423,The Farmer's Daughter,,"[Charles Bickford, Loretta Young]",1947-01-01,yes,academy_awards-3422


In [18]:
# lets select only the columns we need our gs has these id, title, director_name, actors_actor_name, date, oscar
gs_cols = ['academy_awards_id', 'title', 'director_name', 'actors_actor_name', 'date', 'oscar']
fused_movies = fused_movies[gs_cols]
fused_movies.head(5)

Unnamed: 0,academy_awards_id,title,director_name,actors_actor_name,date,oscar
0,academy_awards-4468,A Free Soul,Clarence Brown,"[Lionel Barrymore, Norma Shearer]",1930-01-01,yes
1,academy_awards-2239,Funny Girl,William Wyler,"[Barbra Streisand, Kay Medford]",1969-01-01,yes
2,academy_awards-1168,Driving Miss Daisy,,"[Dan Aykroyd, Jessica Tandy, Morgan Freeman]",1990-01-01,yes
3,academy_awards-2942,Marty,Delbert Mann,"[Betsy Blair, Ernest Borgnine, Joe Mantell]",1956-01-01,yes
4,academy_awards-1935,Harry and Tonto,,[Art Carney],1975-01-01,yes


In [19]:
evaluation_strategy =  DataFusionStrategy("movie_evaluation_strategy")
evaluation_strategy.add_evaluation_function("title", tokenized_match)
evaluation_strategy.add_evaluation_function("director_name", tokenized_match, threshold=0.7)
evaluation_strategy.add_evaluation_function("actors_actor_name", tokenized_match)
evaluation_strategy.add_evaluation_function("date", year_only_match)
evaluation_strategy.add_evaluation_function("oscar", boolean_match)

INFO: Registered evaluation function for attribute 'title'
INFO: Registered evaluation function for attribute 'director_name' with params {'threshold': 0.7}
INFO: Registered evaluation function for attribute 'actors_actor_name'
INFO: Registered evaluation function for attribute 'date'
INFO: Registered evaluation function for attribute 'oscar'


In [20]:
# Create evaluator with our fusion strategy
evaluator = DataFusionEvaluator(evaluation_strategy)

# Evaluate the fused results against the gold standard
print("Evaluating fusion results against gold standard...")
evaluation_results = evaluator.evaluate(
    fused_df=fused_movies,
    fused_id_column='academy_awards_id',
    gold_df=gold_df,
    gold_id_column='academy_awards_id'
)

# Display evaluation metrics
print("\nFusion Evaluation Results:")
print("=" * 40)
for metric, value in evaluation_results.items():
    if isinstance(value, float):
        print(f"  {metric}: {value:.3f}")
    else:
        print(f"  {metric}: {value}")
        
print(f"\nOverall Accuracy: {evaluation_results.get('overall_accuracy', 0):.1%}")

INFO: Starting fusion evaluation
DEBUG: Attribute 'oscar': 1.000 (20/20)
DEBUG: Attribute 'title': 0.950 (19/20)
DEBUG: Attribute 'date': 0.500 (10/20)
DEBUG: Attribute 'actors_actor_name': 0.900 (18/20)
DEBUG: Attribute 'director_name': 0.933 (14/15)
INFO: Evaluation complete: 0.853 overall accuracy (81/95)


Evaluating fusion results against gold standard...

Fusion Evaluation Results:
  overall_accuracy: 0.853
  macro_accuracy: 0.857
  num_evaluated_records: 20
  num_evaluated_attributes: 5
  total_evaluations: 95
  total_correct: 81
  oscar_accuracy: 1.000
  oscar_count: 20
  title_accuracy: 0.950
  title_count: 20
  date_accuracy: 0.500
  date_count: 20
  actors_actor_name_accuracy: 0.900
  actors_actor_name_count: 20
  director_name_accuracy: 0.933
  director_name_count: 15

Overall Accuracy: 85.3%


In [21]:
# Create an updated fusion report with evaluation results
print("Creating updated fusion report with evaluation metrics...")

fusion_report_with_eval = FusionReport(
    fused_df=fused_movies,
    input_datasets=[academy_awards_df, actors_df, golden_globes_df],
    strategy_name=movie_strategy.name,
    correspondences=all_correspondences,
    evaluation_results=evaluation_results
)

# Display the updated report with evaluation metrics
fusion_report_with_eval.print_summary()

INFO: Analyzed 12 attributes across 3 datasets


Creating updated fusion report with evaluation metrics...

=== PyDI Data Fusion Report ===
Generated: 2025-09-11 13:34:28
Strategy: comprehensive_movie_fusion

Data Summary:
  Input datasets: 3
  Input records: 7010
  Output records: 148
  Correspondences: 257
  Record coverage: 2.11%

Quality Metrics:
  Mean confidence: 0.000
  Multi-source records: 0
  Single-source records: 148

Group Statistics:
  Total groups: 0
  Multi-record groups: 0
  Average group size: 0.00
  Largest group: 0 records

Attribute Statistics:
  Total attributes: 6
  Attributes with conflicts: 0

Evaluation Results:
  Overall accuracy: 0.853
  Macro accuracy: 0.857
  Evaluated records: 20



In [22]:
print("Now let's see specific examples of what went wrong...")
fusion_report_with_eval.print_evaluation_examples(
    gold_df=gold_df,
    fused_id_column='academy_awards_id', 
    gold_id_column='academy_awards_id',
    max_examples=5
)

Now let's see specific examples of what went wrong...

=== Fusion Examples ===

❌ Incorrect Fusion Examples:

  Example 1:
    Record ID: academy_awards-1429
    Attribute: date
    Fused:  '1985-01-01'
    Gold:   '1984-01-01'
    Confidence: N/A

  Example 2:
    Record ID: academy_awards-1429
    Attribute: actors_actor_name
    Fused:  '['F. Murray Abraham', 'Jeffrey Jones', 'Tom Hulce']'
    Gold:   '['F. Murray Abraham', 'Tom Hulce']'
    Confidence: N/A

  Example 3:
    Record ID: academy_awards-1429
    Attribute: director_name
    Fused:  'Milo Forman'
    Gold:   'Milos Forman'
    Confidence: N/A

  Example 4:
    Record ID: academy_awards-1879
    Attribute: title
    Fused:  'One Flew Over The Cuckoo''s Nest'
    Gold:   'One Flew over the Cuckoo's Nest'
    Confidence: N/A

  Example 5:
    Record ID: academy_awards-1879
    Attribute: actors_actor_name
    Fused:  '['Brad Dourif', 'Jack Nicholson', 'Louise Fletcher']'
    Gold:   '['Jack Nicholson', 'Brad Dourif', 'Loui

## 7. Exporting Results and Reports

Finally, let's export our fusion results and reports for further analysis or documentation.

In [23]:
# Export fusion results and detailed reports
output_dir = Path('./fusion_output')
output_dir.mkdir(exist_ok=True)

print("Exporting fusion results and reports...")

# Export the fused dataset
fused_movies.to_csv(output_dir / 'fused_movies.csv', index=False)
print(f"✓ Fused dataset saved to: {output_dir / 'fused_movies.csv'}")

# Export detailed reports in multiple formats
fusion_report_with_eval.export_detailed_results(str(output_dir))
print(f"✓ Detailed reports exported to: {output_dir}")

# List all exported files
print(f"\nExported files:")
for file in output_dir.glob('*'):
    print(f"  - {file.name} ({file.stat().st_size} bytes)")
    
print(f"\nFusion pipeline complete!")
print(f"Final accuracy: {evaluation_results.get('overall_accuracy', 0):.1%}")
print(f"Records fused: {len(fused_movies)} from {sum(len(df) for df in datasets)} input records")

INFO: Report saved to fusion_output/fusion_report.json
INFO: HTML report saved to fusion_output/fusion_report.html
INFO: Detailed results exported to fusion_output


Exporting fusion results and reports...
✓ Fused dataset saved to: fusion_output/fused_movies.csv
✓ Detailed reports exported to: fusion_output

Exported files:
  - fusion_report.json (1925 bytes)
  - fused_movies.csv (14723 bytes)
  - fused_data.csv (14723 bytes)
  - input_summary.json (846 bytes)
  - coverage_analysis (160 bytes)
  - fusion_report.html (4051 bytes)
  - correspondences.csv (9159 bytes)

Fusion pipeline complete!
Final accuracy: 85.3%
Records fused: 148 from 7010 input records
