# Movie Data Fusion 

This tutorial fuses movie datasets using precomputed correspondences and a gold standard from:

- `correspondences/` (old ID format)
- `splits/` (gold standard, old ID format)

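It mirrors the approach shown in `PyDI/examples/data_fusion_example.ipynb` and applies the same fusion and evaluation rules. Records are grouped and evaluated through the `id` column of each loaded dataset, which already carries the IDs referenced by the correspondences and the gold standard (for example `academy_awards_4557`), so no separate ID-conversion step is performed in this notebook.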

In [1]:
from pathlib import Path
import pandas as pd

from PyDI.io.loaders import load_xml
from PyDI.fusion import (
    DataFusionStrategy, DataFusionEngine, AttributeValueFuser, DataFusionEvaluator, FusionReport
)

# All inputs live below a single fusion root; the three sub-folders
# (datasets, correspondences, gold-standard splits) are derived from it.
ROOT = Path('../../input/movies/fusion')
DATA_DIR, CORR_DIR, GS_DIR = (
    ROOT / subfolder for subfolder in ('data', 'correspondences', 'splits')
)

print('Inputs configured.')

Inputs configured.


## Load Datasets
We use aggregated XML parsing to keep nested values (like multiple actors) as lists. Each loaded DataFrame exposes its record identifier in the `id` column (e.g. `academy_awards_1`), which is the column used for fusion below.

In [2]:
# Show the resolved location of the first dataset as a quick path sanity check.
print(DATA_DIR.joinpath('academy_awards.xml'))

../../input/movies/fusion/data/academy_awards.xml


In [3]:
from PyDI.fusion import provenance

# Load the three sources with aggregated XML parsing. Each one gets a
# `trust_level` in its provenance (3 / 2 / 1 here).
# NOTE(review): presumably a higher trust_level wins under `prefer_higher_trust` — confirm.
academy_awards = load_xml(
    DATA_DIR / 'academy_awards.xml',
    name='academy_awards',
    nested_handling='aggregate',
    provenance={'trust_level': 3},
)
actors = load_xml(
    DATA_DIR / 'actors.xml',
    name='actors',
    nested_handling='aggregate',
    provenance={'trust_level': 2},
)
golden_globes = load_xml(
    DATA_DIR / 'golden_globes.xml',
    name='golden_globes',
    nested_handling='aggregate',
    provenance={'trust_level': 1},
)

print('Loaded:')
for dataset in (academy_awards, actors, golden_globes):
    dataset_name = dataset.attrs.get('dataset_name')
    # Deliberately NOT named `provenance`: that would rebind the `provenance`
    # name imported from PyDI.fusion at the top of this cell.
    # NOTE: the printed dict includes the absolute `source_path` of each file;
    # clear the cell output before sharing if that path is sensitive.
    dataset_provenance = dataset.attrs.get('provenance', {})
    print(f'- {dataset_name}: shape={dataset.shape}, provenance={dataset_provenance}')

# Preview the first rows of every source.
display(academy_awards.head(3))
display(actors.head(3))
display(golden_globes.head(3))

Loaded:
- academy_awards: shape=(4580, 6), provenance={'dataset_name': 'academy_awards', 'reader': 'read_xml_aggregated', 'loaded_time_utc_iso': '2025-09-16T09:45:24.672918+00:00', 'source_path': '/Users/aaronsteiner/Documents/GitHub/PyDI/input/movies/fusion/data/academy_awards.xml', 'file_size_bytes': 771854, 'modified_time_utc_iso': '2025-08-29T12:59:13.764462+00:00', 'sha256_prefix': '537dfaeb80d8752a9c2dd15800082c26886ee6723167e48a590a1b463ce9bddb', 'sha256_prefix_bytes': 771854, 'trust_level': 3}
- actors: shape=(151, 6), provenance={'dataset_name': 'actors', 'reader': 'read_xml_aggregated', 'loaded_time_utc_iso': '2025-09-16T09:45:24.674792+00:00', 'source_path': '/Users/aaronsteiner/Documents/GitHub/PyDI/input/movies/fusion/data/actors.xml', 'file_size_bytes': 39941, 'modified_time_utc_iso': '2025-09-03T12:23:05.182854+00:00', 'sha256_prefix': '79bdb4ef1adfe671c931236fcda0abe3c2906ff209c3c64945f5e8ec67fcb496', 'sha256_prefix_bytes': 39941, 'trust_level': 2}
- golden_globes: shap

Unnamed: 0,id,title,actors_actor_name,date,director_name,oscar
0,academy_awards_1,Biutiful,Javier Bardem,2010-01-01,,
1,academy_awards_2,True Grit,"[Jeff Bridges, Hailee Steinfeld]",2010-01-01,Joel Coen and Ethan Coen,
2,academy_awards_3,The Social Network,Jesse Eisenberg,2010-01-01,David Fincher,yes


Unnamed: 0,id,title,actors_actor_name,actors_actor_birthday,actors_actor_birthplace,date
0,actors_1,7th Heaven,Janet Gaynor,1906-01-01,Pennsylvania,1929-01-01
1,actors_2,Coquette,Mary Pickford,1892-01-01,Canada,1930-01-01
2,actors_3,The Divorcee,Norma Shearer,1902-01-01,Canada,1931-01-01


Unnamed: 0,id,title,actors_actor_name,date,director_name,globe
0,golden_globes_1,Frankie and Alice,Halle Berry,2011-01-01,,
1,golden_globes_2,Rabbit Hole,Nicole Kidman,2011-01-01,,
2,golden_globes_3,Winter's Bone,Jennifer Lawrence,2011-01-01,,


## Load Correspondences 


In [4]:
# Both correspondence files are read without a header row and share the same
# three column names: the two record IDs and a score.
corr_columns = ['id1', 'id2', 'score']
corr_aa_a = pd.read_csv(
    CORR_DIR / 'academy_awards_2_actors_correspondences.csv',
    names=corr_columns,
    header=None,
)
corr_a_gg = pd.read_csv(
    CORR_DIR / 'actors_2_golden_globes_correspondences.csv',
    names=corr_columns,
    header=None,
)

# Stack both pairings into one table with a fresh 0..n-1 index.
all_corr = pd.concat([corr_aa_a, corr_a_gg], ignore_index=True)
print(f'Total correspondences (old IDs): {len(all_corr):,}')
display(all_corr.head(5))


Total correspondences (old IDs): 257


Unnamed: 0,id1,id2,score
0,academy_awards_4557,actors_1,1.0
1,academy_awards_4529,actors_2,1.0
2,academy_awards_4500,actors_3,1.0
3,academy_awards_4475,actors_4,1.0
4,academy_awards_4446,actors_5,1.0


## Define Fusion Strategy 
Rules used here: trust-based resolution for most attributes (title, director, date, oscar), union for actors, and prefer Academy Awards for _id.

In [5]:
from PyDI.fusion import longest_string, most_recent, union, prefer_higher_trust

# Trust levels are read automatically from each dataset's provenance (trust_level)

def _is_missing_id(value):
    """Return True for None and for float NaN (a missing cell in a pandas column)."""
    # NaN is the only float that is not equal to itself.
    return value is None or (isinstance(value, float) and value != value)


def prefer_dataset_id(values, *, sources, source_datasets, preferred_dataset, **_):
    """Pick the ID contributed by `preferred_dataset`, else the first usable ID.

    Args:
        values: Candidate IDs, aligned position-by-position with `sources`.
        sources: Record IDs that contributed each entry of `values`.
        source_datasets: Mapping from record ID to the name of its dataset.
        preferred_dataset: Dataset name whose ID should win when available.
        **_: Extra keyword arguments are accepted and ignored.

    Returns:
        A 3-tuple `(value, score, metadata)`:
        - `(id, 1.0, {...})` when a record of `preferred_dataset` supplies an ID,
        - `(id, 0.5, {'fallback': 'first_non_null'})` for the first usable ID otherwise,
        - `(None, 0.0, {'reason': 'no_valid_values'})` when nothing usable exists.

    None and float NaN are both treated as missing, so a NaN cell is never
    returned as the fused ID.
    """
    for value, record_id in zip(values, sources):
        if source_datasets.get(record_id) == preferred_dataset and not _is_missing_id(value):
            return value, 1.0, {'selected_source': record_id, 'selected_dataset': preferred_dataset}
    # Fallback: first non-missing value, regardless of which dataset it came from.
    for value in values:
        if not _is_missing_id(value):
            return value, 0.5, {'fallback': 'first_non_null'}
    return None, 0.0, {'reason': 'no_valid_values'}

# Assemble the per-attribute resolution rules for the movie fusion.
strategy = DataFusionStrategy('movie_fusion_strategy')

# Attributes resolved with the trust-based rule, registered in this order.
for trusted_attribute in ('title', 'director_name', 'date', 'oscar'):
    strategy.add_attribute_fuser(trusted_attribute, prefer_higher_trust)

# Actor names use the `union` rule.
strategy.add_attribute_fuser('actors_actor_name', union)

# The fused `_id` uses the custom rule that prefers the Academy Awards dataset.
strategy.add_attribute_fuser(
    '_id',
    prefer_dataset_id,
    preferred_dataset='academy_awards',
)
print('Strategy ready.')

Strategy ready.


## Run Fusion
We build connected components from the loaded correspondences, matching records through each dataset's `id` column, and fuse per attribute using the rules above.

In [6]:
# Datasets handed to the engine, in the same order as before.
fusion_inputs = [academy_awards, actors, golden_globes]

# Engine configured with debug enabled and JSON debug format.
engine = DataFusionEngine(strategy, debug=True, debug_format='json')

# Records are matched through the `id` column; singletons are not included
# (include_singletons=False).
fused = engine.run(
    datasets=fusion_inputs,
    correspondences=all_corr,
    id_column="id",
    include_singletons=False,
)

print(f'Fused rows: {len(fused):,}')
display(fused.head(5))

Fused rows: 148


Unnamed: 0,_id,_fusion_group_id,_fusion_sources,actors_actor_name,actors_actor_birthplace,director_name,id,date,actors_actor_birthday,title,globe,oscar,_fusion_confidence,_fusion_metadata
0,academy_awards_560,group_0,"[actors, golden_globes, academy_awards]","[Albert Finney, Julia Roberts]",Georgia,Steven Soderbergh,actors_74,2001-01-01,1967-01-01,Erin Brockovich,yes,yes,0.616667,"{'actors_actor_name_rule': 'union', 'actors_ac..."
1,academy_awards_508,group_1,"[actors, golden_globes, academy_awards]",[Halle Berry],Ohio,,golden_globes_454,2002-01-01,1966-01-01,Monster's Ball,,yes,0.516667,"{'actors_actor_name_rule': 'union', 'actors_ac..."
2,academy_awards_2686,group_2,"[actors, golden_globes, academy_awards]","[Elisabeth Taylor, Elizabeth Taylor]",England,,actors_33,1961-01-01,1932-01-01,Butterfield 8,,yes,0.516667,"{'actors_actor_name_rule': 'union', 'actors_ac..."
3,academy_awards_1943,group_3,"[actors, golden_globes, academy_awards]","[Diane Ladd, Ellen Burstyn]",Michigan,,academy_awards_1943,1974-01-01,1932-01-01,Alice Doesn't Live Here Anymore,,yes,0.516667,"{'actors_actor_name_rule': 'union', 'actors_ac..."
4,academy_awards_1580,group_4,"[actors, golden_globes, academy_awards]","[Henry Fonda, Jane Fonda, Katharine Hepburn]","[Connecticut, Nebraska]",Mark Rydell,actors_55,1982-01-01,"[1907-01-01, 1905-01-01]",On Golden Pond,yes,yes,0.616667,"{'actors_actor_name_rule': 'union', 'actors_ac..."


## Evaluate vs. Gold Standard
We register one comparison function per attribute, load the gold standard, and evaluate accuracy by matching each fused record's `_id` against the gold standard's `id` column.

In [7]:
from PyDI.fusion import tokenized_match, year_only_match, boolean_match

# Comparison function used for each attribute during evaluation, registered
# in the order listed here.
evaluation_rules = {
    "title": tokenized_match,
    "director_name": tokenized_match,
    "actors_actor_name": tokenized_match,
    "date": year_only_match,
    "oscar": boolean_match,
}
for attribute, match_function in evaluation_rules.items():
    strategy.add_evaluation_function(attribute, match_function)

In [8]:
gold = load_xml(GS_DIR / 'gold.xml', name='gold_standard', nested_handling='aggregate')

# Keep the core evaluation columns that are actually present in the fused
# output. Selecting a missing column directly would raise a KeyError, so the
# wanted list is filtered against `fused.columns` first.
wanted_cols = ['_id', 'title', 'director_name', 'actors_actor_name', 'date', 'oscar']
eval_cols = [col for col in wanted_cols if col in fused.columns]
fused_eval = fused[eval_cols].copy()


# Create evaluator with our fusion strategy
evaluator = DataFusionEvaluator(strategy)

# Evaluate the fused results against the gold standard; fused rows are keyed
# by `_id`, gold rows by `id`.
print("Evaluating fusion results against gold standard...")
evaluation_results = evaluator.evaluate(
    fused_df=fused_eval,
    fused_id_column='_id',
    gold_df=gold,
    gold_id_column='id',
)

# Display evaluation metrics: floats with three decimals, everything else as-is.
print("\nFusion Evaluation Results:")
print("=" * 40)
for metric, value in evaluation_results.items():
    if isinstance(value, float):
        print(f"  {metric}: {value:.3f}")
    else:
        print(f"  {metric}: {value}")

# Falls back to 0 when the evaluator did not report an overall accuracy.
print(f"\nOverall Accuracy: {evaluation_results.get('overall_accuracy', 0):.1%}")

Evaluating fusion results against gold standard...

Fusion Evaluation Results:
  overall_accuracy: 0.842
  macro_accuracy: 0.843
  num_evaluated_records: 20
  num_evaluated_attributes: 5
  total_evaluations: 95
  total_correct: 80
  actors_actor_name_accuracy: 0.900
  actors_actor_name_count: 20
  director_name_accuracy: 0.867
  director_name_count: 15
  date_accuracy: 0.550
  date_count: 20
  title_accuracy: 0.900
  title_count: 20
  oscar_accuracy: 1.000
  oscar_count: 20

Overall Accuracy: 84.2%


In [9]:
# Gold-standard row for academy_awards_1880.
gold.loc[gold['id'] == 'academy_awards_1880']


Unnamed: 0,id,title,director_name,actors_actor_name,date,oscar
0,academy_awards_1880,One Flew over the Cuckoo's Nest,Milos Forman,"[Jack Nicholson, Brad Dourif, Louise Fletcher]",1975-01-01,yes


In [10]:
# Fused row whose `_id` is academy_awards_1880.
fused_eval.loc[fused_eval['_id'] == 'academy_awards_1880']

Unnamed: 0,_id,title,director_name,actors_actor_name,date,oscar
88,academy_awards_1880,One Flew Over The Cuckoo's Nes,Milo Forman,"[Brad Dourif, Jack Nicholson, Louise Fletcher]",1976-01-01,yes


In [11]:
# Correspondences whose left-hand ID (`id1`) is academy_awards_1880.
all_corr.loc[all_corr['id1'] == 'academy_awards_1880']

Unnamed: 0,id1,id2,score
48,academy_awards_1880,actors_49,1.0
124,academy_awards_1880,actors_126,1.0
