# Spurious Edge Filtering Demo (All Edges Named)
This notebook demonstrates the updated `read_edges_csv_and_filter_spurious` function from `src.filters`.

Unlike the previous demo, this version enriches **all** edges with page titles (names), not just the suspect edges. It loads an edge dataset, analyzes suspect edges, calculates a custom `final_weight`, and substitutes IDs with page titles for all edges for easier inspection and querying.

In [1]:
# Import Required Libraries and Setup Paths
import pandas as pd
import sys
import os

# Add project root to path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from src import filters, edge_processing
INPUT_DATASET = '../data/out/SpotlightWeightSource_0102_0505_fullmatch.csv'  # Update if needed!
DB_PATH = '/home/vlr/Workspaces/WikipediaBiasProject/PyProject/data/out/graph_final.db'

In [2]:
# Load Edge Dataset Using New Function
import time
print(f"Loading and filtering: {INPUT_DATASET}")
start_time = time.time()
filtered_edges = edge_processing.read_edges_csv_and_filter_spurious(
    csv_path=INPUT_DATASET,
    db_path=DB_PATH
)
print(f"Done in {time.time() - start_time:.2f} seconds.")

# Filter out self edges immediately
filtered_edges = filtered_edges[filtered_edges['source_wikidata_id'] != filtered_edges['target_wikidata_id']].copy()

Loading and filtering: ../data/out/SpotlightWeightSource_0102_0505_fullmatch.csv
Retrieving titles for 93,847 unique QIDs across 5 languages...
Computing shared names filter...
Done in 255.12 seconds.


In [3]:
# Analyze Suspect Edges
n_suspect = (filtered_edges['has_shared_word'] & (filtered_edges['z_score'] > 1.5)).sum()
n_total = len(filtered_edges)
print(f"Flagged {n_suspect:,} edges as suspect out of {n_total:,} ({(n_suspect/n_total)*100:.2f}%)")
suspect_df = filtered_edges[filtered_edges['has_shared_word'] & (filtered_edges['z_score'] > 1.5)].copy()
if n_suspect > 0:
    print("\nSample of flagged pseudo-self-loops:")
    display(suspect_df.head(20))
else:
    print("No suspect edges found.")

Flagged 29,169 edges as suspect out of 12,438,067 (0.23%)

Sample of flagged pseudo-self-loops:


Unnamed: 0,language_code,source_wikidata_id,target_wikidata_id,weight,fullmatch_count,source_self_count,has_shared_word,ordered_substring,z_score,substring
438,en,Q2254738,Q856749,6.0,6,0,True,False,2.867242,True
1265,it,Q557092,Q312311,1.0,0,0,True,False,2.561738,True
1856,de,Q1337341,Q314241,1.0,1,0,True,False,3.712494,True
2196,en,Q1926870,Q57283,1.0,1,0,True,False,3.407918,True
2315,de,Q1926870,Q57283,1.0,1,0,True,False,3.407918,True
2375,fr,Q1926870,Q57283,1.0,1,0,True,False,3.407918,True
2495,it,Q1926870,Q57283,1.0,1,0,True,False,3.407918,True
2514,es,Q1926870,Q57283,1.0,1,0,True,False,3.407918,True
2825,de,Q274434,Q983501,2.0,2,0,True,False,2.14698,True
2884,fr,Q457714,Q237106,2.0,2,0,True,False,2.0649,True


In [4]:
# Calculate Final Weight with Custom Logic
def calculate_final_weight(row):
    if row['z_score'] > 1.5 and row['has_shared_word']:
        if row['ordered_substring']:
            return 1.0
        else:
            return row['fullmatch_count']
    else:
        return row['weight']
filtered_edges['final_weight'] = filtered_edges.apply(calculate_final_weight, axis=1)
print("Sample with final_weight column:")
display(filtered_edges.head(20))

Sample with final_weight column:


Unnamed: 0,language_code,source_wikidata_id,target_wikidata_id,weight,fullmatch_count,source_self_count,has_shared_word,ordered_substring,z_score,substring,final_weight
0,en,Q6882,Q53003,0.0,0,0,False,False,-0.108882,False,0.0
1,en,Q6882,Q5912,0.0,0,0,False,False,-0.108882,False,0.0
2,en,Q6882,Q212719,0.0,0,0,False,False,-0.108882,False,0.0
3,en,Q6882,Q514998,0.0,0,0,False,False,-0.108882,False,0.0
4,en,Q6882,Q210134,0.0,0,0,False,False,-0.108882,False,0.0
5,en,Q6882,Q55391,0.0,0,0,False,False,-0.108882,False,0.0
6,en,Q6882,Q1388518,0.0,1,0,False,False,-0.108882,False,0.0
7,en,Q6882,Q274143,0.0,0,0,False,False,-0.108882,False,0.0
8,en,Q6882,Q41406,0.0,0,0,False,False,-0.108882,False,0.0
9,en,Q6882,Q7371,0.0,0,0,False,False,-0.108882,False,0.0


In [5]:
# Retrieve Titles for All Edges
from src.modules.duckdb_handler import DuckDBHandler
# Collect all unique QIDs and language codes from the full filtered_edges DataFrame
unique_qids_all = list(set(filtered_edges['source_wikidata_id'].dropna()).union(set(filtered_edges['target_wikidata_id'].dropna())))
unique_langs_all = filtered_edges['language_code'].dropna().unique().tolist()
with DuckDBHandler(DB_PATH) as db:
    batch_size = 10000
    all_titles = []
    for i in range(0, len(unique_qids_all), batch_size):
        batch_qids = unique_qids_all[i : i + batch_size]
        batch_titles_df = db.get_titles_for_qids(qids=batch_qids, langs=unique_langs_all)
        all_titles.append(batch_titles_df)
    import pandas as pd
    if all_titles:
        titles_df = pd.concat(all_titles, ignore_index=True)
    else:
        titles_df = pd.DataFrame(columns=["wikidata_id", "language_code", "page_title"])

In [6]:
# Substitute IDs with Titles in All Edges (Enriched DataFrame)
enriched_edges = filtered_edges.copy()
enriched_edges = enriched_edges.merge(
    titles_df.rename(columns={'wikidata_id': 'source_wikidata_id', 'page_title': 'source_title'}),
    on=['source_wikidata_id', 'language_code'],
    how='left'
)
enriched_edges = enriched_edges.merge(
    titles_df.rename(columns={'wikidata_id': 'target_wikidata_id', 'page_title': 'target_title'}),
    on=['target_wikidata_id', 'language_code'],
    how='left'
)
enriched_edges['source_wikidata_id'] = enriched_edges['source_title'].fillna(enriched_edges['source_wikidata_id'])
enriched_edges['target_wikidata_id'] = enriched_edges['target_title'].fillna(enriched_edges['target_wikidata_id'])
enriched_edges = enriched_edges.drop(columns=['source_title', 'target_title'])

In [7]:
# Display Enriched DataFrame Sample
print(f"Sample of {len(enriched_edges)} enriched edge rows with substituted names:")
display(enriched_edges.sort_values(by='fullmatch_count', ascending=False).head(30))

Sample of 12438067 enriched edge rows with substituted names:


Unnamed: 0,language_code,source_wikidata_id,target_wikidata_id,weight,fullmatch_count,source_self_count,has_shared_word,ordered_substring,z_score,substring,final_weight
5858112,en,Sting (wrestler),Sting (musician),0.0,556,0,True,True,-0.174887,True,0.0
2125183,en,IU (entertainer),V (singer),0.0,342,0,False,False,-0.660011,False,0.0
10316982,en,G-Dragon,V (singer),0.0,266,0,False,False,-0.515722,False,0.0
3462261,fr,Yuzuru Hanyu,Ai (chanteuse),0.0,266,0,False,False,-0.135929,False,0.0
124166,en,Ānanda,The Buddha,0.0,253,0,False,False,-0.302645,False,0.0
4637787,en,Jennie (singer),V (singer),0.0,243,0,False,False,-0.685422,True,0.0
9358263,en,Mu'awiya I,Ali,63.0,239,0,False,False,9.921028,False,63.0
6658701,en,BoA,V (singer),0.0,237,0,False,False,-0.153028,False,0.0
3533444,en,Psy,V (singer),0.0,235,0,False,False,-0.720961,False,0.0
7515612,it,Sting (wrestler),Sting,225.0,228,0,True,True,25.399185,True,1.0


In [8]:
# Query Enriched DataFrame by Title
# Example: Query for a specific source or target title (replace as needed)
enriched_edges[(enriched_edges["source_wikidata_id"] == "Isabel Perón") & (enriched_edges["target_wikidata_id"] == "Juan Perón")]

Unnamed: 0,language_code,source_wikidata_id,target_wikidata_id,weight,fullmatch_count,source_self_count,has_shared_word,ordered_substring,z_score,substring,final_weight
820536,en,Isabel Perón,Juan Perón,10.0,9,0,True,False,0.514669,True,10.0


In [9]:
# Custom Final Weight Assignment Based on Multiple Conditions
def custom_final_weight(row):
    if row['has_shared_word'] and row['z_score'] > 1.5:
        if row['ordered_substring']:
            return 1
        else:
            return row['ordered_substring']
    else:
        return row['weight']
enriched_edges['final_weight'] = enriched_edges.apply(custom_final_weight, axis=1)
print("Sample with custom final_weight column:")
display(enriched_edges.head(20))

Sample with custom final_weight column:


Unnamed: 0,language_code,source_wikidata_id,target_wikidata_id,weight,fullmatch_count,source_self_count,has_shared_word,ordered_substring,z_score,substring,final_weight
0,en,James Joyce,Roberto Rossellini,0.0,0,0,False,False,-0.108882,False,0.0
1,en,James Joyce,Marcel Duchamp,0.0,0,0,False,False,-0.108882,False,0.0
2,en,James Joyce,Wilfred Owen,0.0,0,0,False,False,-0.108882,False,0.0
3,en,James Joyce,Richard Aldington,0.0,0,0,False,False,-0.108882,False,0.0
4,en,James Joyce,Louis Kahn,0.0,0,0,False,False,-0.108882,False,0.0
5,en,James Joyce,Robert Bresson,0.0,0,0,False,False,-0.108882,False,0.0
6,en,James Joyce,Oliver St. John Gogarty,0.0,1,0,False,False,-0.108882,False,0.0
7,en,James Joyce,Patrick Pearse,0.0,0,0,False,False,-0.108882,False,0.0
8,en,James Joyce,Edvard Munch,0.0,0,0,False,False,-0.108882,False,0.0
9,en,James Joyce,Federico Fellini,0.0,0,0,False,False,-0.108882,False,0.0


In [10]:
# Display edges where final_weight was changed to 1 or ordered_substring
changed_edges = enriched_edges[((enriched_edges['has_shared_word']) & (enriched_edges['z_score'] > 1.5)) & ((enriched_edges['final_weight'] == 1) | (enriched_edges['final_weight'] == enriched_edges['ordered_substring']))]
print(f"Number of edges with custom weight assignment: {len(changed_edges)}")
display(changed_edges[['source_wikidata_id', 'target_wikidata_id', 'has_shared_word', 'z_score', 'ordered_substring', 'weight', 'final_weight']].head(20))

Number of edges with custom weight assignment: 29169


Unnamed: 0,source_wikidata_id,target_wikidata_id,has_shared_word,z_score,ordered_substring,weight,final_weight
437,Lorenzo di Bicci,Bicci di Lorenzo,True,2.867242,False,6.0,False
1263,Giovanni d'Aragona (reggente di Trinacria),Pietro III d'Aragona,True,2.561738,False,1.0,False
1854,Ernesto Farías,José Ernesto Sosa,True,3.712494,False,1.0,False
2194,Michael Artin,Emil Artin,True,3.407918,False,1.0,False
2313,Michael Artin,Emil Artin,True,3.407918,False,1.0,False
2373,Michael Artin,Emil Artin,True,3.407918,False,1.0,False
2493,Michael Artin,Emil Artin,True,3.407918,False,1.0,False
2512,Michael Artin,Emil Artin,True,3.407918,False,1.0,False
2823,Ekaterina Guliyev,Ramil Guliyev,True,2.14698,False,2.0,False
2882,Marie Darrieussecq,Marie NDiaye,True,2.0649,False,2.0,False
