# Movie Data Fusion 

This tutorial fuses movie datasets using precomputed correspondences and a gold standard from:

- `correspondences/` (old ID format)
- `splits/` (gold standard, old ID format)

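It mirrors the approach shown in `PyDI/examples/data_fusion_example.ipynb`: records are grouped through their `id` column using the correspondences as loaded, the Academy Awards `id` is copied into an `academy_awards_id` column that is used to align fused records with the gold standard, and the same kind of fusion and evaluation rules are applied.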

In [1]:
from pathlib import Path
import pandas as pd

from PyDI.io.loaders import load_xml
from PyDI.fusion import (
    DataFusionStrategy, DataFusionEngine, AttributeValueFuser, DataFusionEvaluator, FusionReport
)

# All inputs live under ROOT; each artifact type has its own subfolder:
#   DATA_DIR - dataset XML files
#   CORR_DIR - correspondence CSV files
#   GS_DIR   - gold standard
ROOT = Path('../../input/movies/fusion')
DATA_DIR, CORR_DIR, GS_DIR = (
    ROOT / subfolder for subfolder in ('data', 'correspondences', 'splits')
)

print('Inputs configured.')

Inputs configured.


## Load Datasets
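We use aggregated XML parsing to keep nested values (like multiple actors) as lists. Each loaded DataFrame has an `id` column; for the Academy Awards dataset we additionally copy it into an `academy_awards_id` column, which is used later as the ID column when evaluating the fused result.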

In [2]:
# Sanity check: show the path of the Academy Awards XML file that is loaded next.
print(DATA_DIR.joinpath('academy_awards.xml'))

../../input/movies/fusion/data/academy_awards.xml


In [3]:
from PyDI.fusion import provenance

# Load the three sources with aggregated handling of nested XML elements.
# Each dataset is tagged with a `trust_score` in its provenance (3, 2, 1);
# the fusion strategy refers to that key via trust_key="trust_score".
academy_awards = load_xml(DATA_DIR / 'academy_awards.xml', name='academy_awards', nested_handling='aggregate', provenance={'trust_score': 3})
# Copy `id` into `academy_awards_id`: the evaluation step uses this column
# (fused_id_column='academy_awards_id') to align fused rows with the gold standard.
academy_awards["academy_awards_id"] = academy_awards["id"]
actors = load_xml(DATA_DIR / 'actors.xml', name='actors', nested_handling='aggregate', provenance={'trust_score': 2})
golden_globes = load_xml(DATA_DIR / 'golden_globes.xml', name='golden_globes', nested_handling='aggregate', provenance={'trust_score': 1})

# Summarize what was loaded: dataset name, shape and provenance metadata from df.attrs.
print('Loaded:')
for df in [academy_awards, actors, golden_globes]:
    ds = df.attrs.get('dataset_name')
    # Deliberately NOT named `provenance`: that would overwrite the `provenance`
    # name imported from PyDI.fusion at the top of this cell.
    dataset_provenance = df.attrs.get('provenance', {})
    print(f'- {ds}: shape={df.shape}, provenance={dataset_provenance}')

# Preview the first rows of every dataset.
display(academy_awards.head(3))
display(actors.head(3))
display(golden_globes.head(3))

Loaded:
- academy_awards: shape=(4580, 7), provenance={'dataset_name': 'academy_awards', 'reader': 'read_xml_aggregated', 'loaded_time_utc_iso': '2025-09-16T12:21:56.334806+00:00', 'source_path': '/Users/aaronsteiner/Documents/GitHub/PyDI/input/movies/fusion/data/academy_awards.xml', 'file_size_bytes': 771854, 'modified_time_utc_iso': '2025-08-29T12:59:13.764462+00:00', 'sha256_prefix': '537dfaeb80d8752a9c2dd15800082c26886ee6723167e48a590a1b463ce9bddb', 'sha256_prefix_bytes': 771854, 'trust_score': 3}
- actors: shape=(151, 6), provenance={'dataset_name': 'actors', 'reader': 'read_xml_aggregated', 'loaded_time_utc_iso': '2025-09-16T12:21:56.349587+00:00', 'source_path': '/Users/aaronsteiner/Documents/GitHub/PyDI/input/movies/fusion/data/actors.xml', 'file_size_bytes': 39941, 'modified_time_utc_iso': '2025-09-03T12:23:05.182854+00:00', 'sha256_prefix': '79bdb4ef1adfe671c931236fcda0abe3c2906ff209c3c64945f5e8ec67fcb496', 'sha256_prefix_bytes': 39941, 'trust_score': 2}
- golden_globes: shap

Unnamed: 0,id,title,actors_actor_name,date,director_name,oscar,academy_awards_id
0,academy_awards_1,Biutiful,Javier Bardem,2010-01-01,,,academy_awards_1
1,academy_awards_2,True Grit,"[Jeff Bridges, Hailee Steinfeld]",2010-01-01,Joel Coen and Ethan Coen,,academy_awards_2
2,academy_awards_3,The Social Network,Jesse Eisenberg,2010-01-01,David Fincher,yes,academy_awards_3


Unnamed: 0,id,title,actors_actor_name,actors_actor_birthday,actors_actor_birthplace,date
0,actors_1,7th Heaven,Janet Gaynor,1906-01-01,Pennsylvania,1929-01-01
1,actors_2,Coquette,Mary Pickford,1892-01-01,Canada,1930-01-01
2,actors_3,The Divorcee,Norma Shearer,1902-01-01,Canada,1931-01-01


Unnamed: 0,id,title,actors_actor_name,date,director_name,globe
0,golden_globes_1,Frankie and Alice,Halle Berry,2011-01-01,,
1,golden_globes_2,Rabbit Hole,Nicole Kidman,2011-01-01,,
2,golden_globes_3,Winter's Bone,Jennifer Lawrence,2011-01-01,,


## Load Correspondences 


In [4]:
def _read_correspondences(filename):
    """Read one header-less correspondence CSV from CORR_DIR as (id1, id2, score) rows."""
    return pd.read_csv(CORR_DIR / filename, header=None, names=['id1', 'id2', 'score'])


# Pairwise correspondences: academy_awards <-> actors and actors <-> golden_globes.
corr_aa_a = _read_correspondences('academy_awards_2_actors_correspondences.csv')
corr_a_gg = _read_correspondences('actors_2_golden_globes_correspondences.csv')

# Stack both files into a single table with a fresh 0..n-1 index.
all_corr = pd.concat([corr_aa_a, corr_a_gg], ignore_index=True)

print(f'Total correspondences (old IDs): {len(all_corr):,}')
display(all_corr.head(5))


Total correspondences (old IDs): 257


Unnamed: 0,id1,id2,score
0,academy_awards_4557,actors_1,1.0
1,academy_awards_4529,actors_2,1.0
2,academy_awards_4500,actors_3,1.0
3,academy_awards_4475,actors_4,1.0
4,academy_awards_4446,actors_5,1.0


## Define Fusion Strategy 
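Rules used here: longest string for `title` and `director_name`, trust-based resolution for `date` (using each dataset's `trust_score`), and union for actors. No explicit rule is registered in this cell for the remaining attributes (such as `oscar` or `_id`).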

In [5]:
from PyDI.fusion import longest_string, most_recent, union, prefer_higher_trust

strategy = DataFusionStrategy('movie_fusion_strategy')

# Text attributes resolved with `longest_string`.
for text_attribute in ('title', 'director_name'):
    strategy.add_attribute_fuser(text_attribute, longest_string)

# Date resolved with `prefer_higher_trust`; "trust_score" is the provenance key
# set on each dataset when it was loaded.
strategy.add_attribute_fuser('date', prefer_higher_trust, trust_key="trust_score")

# Actor names from all sources are combined with `union`.
strategy.add_attribute_fuser('actors_actor_name', union)

print('Strategy ready.')

Strategy ready.


## Run Fusion
We build connected components from the correspondences loaded above (records are identified by their `id` column) and fuse per attribute using the rules above.

In [6]:
# Sources to fuse, in the order they are handed to the engine.
source_frames = [academy_awards, actors, golden_globes]

# Engine configured with the strategy defined above; debug output is enabled in JSON format.
engine = DataFusionEngine(strategy, debug=True, debug_format='json')

# Records are identified by their `id` column, which is what the
# correspondences in `all_corr` refer to.
fused = engine.run(
    datasets=source_frames,
    correspondences=all_corr,
    id_column="id",
    include_singletons=False,
)

fused_row_count = len(fused)
print(f'Fused rows: {fused_row_count:,}')
display(fused.head(5))

Fused rows: 148


Unnamed: 0,_id,_fusion_group_id,_fusion_sources,date,globe,actors_actor_name,actors_actor_birthday,oscar,id,director_name,title,actors_actor_birthplace,academy_awards_id,_fusion_confidence,_fusion_metadata
0,actors_65,group_0,"[golden_globes, actors, academy_awards]",1991-01-01,yes,"[Anthony Hopkins, Jodie Foster]","[1962-01-01, 1937-01-01]",yes,actors_65,Jonathan Demme,"Silence Of The Lambs, The","[California, Wales]",academy_awards_1069,0.592727,"{'date_rule': 'prefer_higher_trust', 'date_sou..."
1,academy_awards_2240,group_1,"[golden_globes, academy_awards, actors]",1968-01-01,,"[Barbra Streisand, Kay Medford]",1942-01-01,yes,academy_awards_2240,William Wyler,Funny Girl,New York,academy_awards_2240,0.590909,"{'date_rule': 'prefer_higher_trust', 'date_sou..."
2,academy_awards_3713,group_2,"[academy_awards, actors]",1943-01-01,,"[Lucile Watson, Paul Lukas]",1895-01-01,yes,academy_awards_3713,,Watch on the Rhine,Hungary,academy_awards_3713,0.55,"{'date_rule': 'prefer_higher_trust', 'date_sou..."
3,actors_42,group_3,"[academy_awards, actors, golden_globes]",1968-01-01,yes,"[Jane Morrow, Katharine Hepburn, Peter O'Toole]",1907-01-01,yes,actors_42,Anthony Harvey,"Lion In Winter (1969), The",Connecticut,academy_awards_2233,0.604895,"{'date_rule': 'prefer_higher_trust', 'date_sou..."
4,actors_110,group_4,"[academy_awards, actors, golden_globes]",1959-01-01,yes,"[Charlton Heston, Hugh Griffith, Stephen Boyd]",1923-01-01,yes,actors_110,William Wyler,Ben-Hur,Illinois,academy_awards_2732,0.590909,"{'date_rule': 'prefer_higher_trust', 'date_sou..."


## Evaluate vs. Gold Standard
We load the gold standard, align its `id` column with the `academy_awards_id` column of the fused output, and evaluate accuracy per attribute.

In [7]:
from PyDI.fusion import tokenized_match, year_only_match, boolean_match

# Attribute -> comparison function used when scoring fused values against the gold standard.
# Registered in the same order as listed here.
evaluation_rules = (
    ("title", tokenized_match),
    ("director_name", tokenized_match),
    ("actors_actor_name", tokenized_match),
    ("date", year_only_match),
    ("oscar", boolean_match),
)
for attribute_name, match_function in evaluation_rules:
    strategy.add_evaluation_function(attribute_name, match_function)

In [8]:
# Gold standard records, loaded with the same aggregated XML handling as the sources.
gold = load_xml(GS_DIR / 'gold.xml', name='gold_standard', nested_handling='aggregate')

# Keep core evaluation columns if present in fused output.
# Selecting only the columns that actually exist avoids the KeyError that
# `fused[eval_cols]` raises when any listed column is missing.
eval_cols = ['academy_awards_id','title','director_name','actors_actor_name','date','oscar']
available_eval_cols = [col for col in eval_cols if col in fused.columns]
fused_eval = fused[available_eval_cols].copy()


# Create evaluator with our fusion strategy
evaluator = DataFusionEvaluator(strategy)

# Evaluate the fused results against the gold standard.
# Fused rows are identified by `academy_awards_id`, gold rows by `id`.
print("Evaluating fusion results against gold standard...")
evaluation_results = evaluator.evaluate(
    fused_df=fused_eval,
    fused_id_column='academy_awards_id',
    gold_df=gold,
    gold_id_column='id',
)

# Display evaluation metrics: floats with three decimals, everything else as-is.
print("\nFusion Evaluation Results:")
print("=" * 40)
for metric, value in evaluation_results.items():
    if isinstance(value, float):
        print(f"  {metric}: {value:.3f}")
    else:
        print(f"  {metric}: {value}")

# Falls back to 0 if the evaluator did not report an overall accuracy.
print(f"\nOverall Accuracy: {evaluation_results.get('overall_accuracy', 0):.1%}")

Evaluating fusion results against gold standard...

Fusion Evaluation Results:
  overall_accuracy: 0.958
  macro_accuracy: 0.960
  num_evaluated_records: 20
  num_evaluated_attributes: 5
  total_evaluations: 95
  total_correct: 91
  date_accuracy: 0.950
  date_count: 20
  actors_actor_name_accuracy: 0.900
  actors_actor_name_count: 20
  oscar_accuracy: 1.000
  oscar_count: 20
  director_name_accuracy: 1.000
  director_name_count: 15
  title_accuracy: 0.950
  title_count: 20

Overall Accuracy: 95.8%
