In [1]:
import pandas as pd

# Load the CSV
file_path = "/content/mlops-events.csv"  # adjust if needed
df = pd.read_csv(file_path)

# Find duplicate groups
duplicate_groups = (
    df.groupby(["Talk Title", "Full Name", "YouTube Link"])
    .size()
    .reset_index(name="Count")
)

# Keep only duplicates
duplicate_groups = duplicate_groups[duplicate_groups["Count"] > 1]

# For each duplicate group, get the row indices
for _, row in duplicate_groups.iterrows():
    talk, name, yt, count = row["Talk Title"], row["Full Name"], row["YouTube Link"], row["Count"]
    indices = df[
        (df["Talk Title"] == talk) &
        (df["Full Name"] == name) &
        (df["YouTube Link"] == yt)
    ].index.tolist()

    print(f"\nDuplicate group: [Talk Title='{talk}', Full Name='{name}', YouTube Link='{yt}']")
    print(f"Row indices: {indices} (Total: {count})")

# Optionally, show summary at the end
print("\nNumber of duplicate groups:", len(duplicate_groups))
print("Total rows involved in duplicates:", duplicate_groups["Count"].sum())



Duplicate group: [Talk Title='AI Tools Under Control: Keeping Your Agents Secure and Reliable', Full Name='Bar Chen', YouTube Link='https://youtu.be/poqhv4hPTpA?si=wDHbtOf6DGPVJfvl']
Row indices: [61, 413] (Total: 2)

Duplicate group: [Talk Title='Agentic AI: Learning Iteratively, Acting Autonomously', Full Name='Fatma Tarlaci', YouTube Link='https://youtu.be/_k8sPizcqUg?si=nQFRnoSd3voCuvSC']
Row indices: [49, 401] (Total: 2)

Duplicate group: [Talk Title='Build with Mistral', Full Name='Sophia  Yang', YouTube Link='https://youtu.be/_IM53bMowlQ?si=TkQ_sbpgcShA9mOD']
Row indices: [59, 411] (Total: 2)

Duplicate group: [Talk Title='Building AI Infrastructure for the GenAI Wave', Full Name='Shreya Rajpal', YouTube Link='https://youtu.be/Se9_38V2TPA?si=TSU-oQAASiQ5eAG3']
Row indices: [65, 417] (Total: 2)

Duplicate group: [Talk Title='Building Agentic and Multi-Agent Systems with LangGraph (Pt. 2)', Full Name='Greg Loughnane, Chris Alexiuk', YouTube Link='https://youtu.be/uPuoysjaCbw?si=T

In [2]:
import pandas as pd

# Load the CSV
file_path = "/content/mlops-events.csv"
df = pd.read_csv(file_path)

# Find full duplicates (across all columns)
duplicate_rows = df[df.duplicated(keep=False)]

# Group them by all column values
grouped = duplicate_rows.groupby(list(df.columns)).apply(lambda x: x.index.tolist())

# Print each group with row indices
for values, indices in grouped.items():
    print(f"\nDuplicate group (all columns identical):")
    print(f"Row indices: {indices}")
    print(f"Values: {values}")

# Summary
print("\nNumber of full-duplicate groups:", len(grouped))
print("Total rows involved in full duplicates:", len(duplicate_rows))



Duplicate group (all columns identical):
Row indices: Series([], Name: Full Name, dtype: object)
Values: Full Name

Duplicate group (all columns identical):
Row indices: Series([], Name: Company Name, dtype: object)
Values: Company Name

Duplicate group (all columns identical):
Row indices: Series([], Name: Job Title, dtype: object)
Values: Job Title

Duplicate group (all columns identical):
Row indices: Series([], Name: Talk Title, dtype: object)
Values: Talk Title

Duplicate group (all columns identical):
Row indices: Series([], Name: Abstract, dtype: object)
Values: Abstract

Duplicate group (all columns identical):
Row indices: Series([], Name: What You'll Learn, dtype: object)
Values: What You'll Learn

Duplicate group (all columns identical):
Row indices: Series([], Name: Prerequiste Knowledge (if required), dtype: object)
Values: Prerequiste Knowledge (if required)

Duplicate group (all columns identical):
Row indices: Series([], Name: Track, dtype: object)
Values: Track

Dupli

  grouped = duplicate_rows.groupby(list(df.columns)).apply(lambda x: x.index.tolist())


In [3]:
import pandas as pd

# Load the CSV
file_path = "/content/mlops-events.csv"
df = pd.read_csv(file_path)

# Find duplicate groups
duplicate_groups = (
    df.groupby(["Talk Title", "Full Name", "YouTube Link"])
    .size()
    .reset_index(name="Count")
)
duplicate_groups = duplicate_groups[duplicate_groups["Count"] > 1]

# Inspect duplicates for differences
for _, row in duplicate_groups.iterrows():
    talk, name, yt = row["Talk Title"], row["Full Name"], row["YouTube Link"]
    dup_rows = df[
        (df["Talk Title"] == talk) &
        (df["Full Name"] == name) &
        (df["YouTube Link"] == yt)
    ].copy()

    print(f"\nDuplicate group: [Talk Title='{talk}', Full Name='{name}', YouTube Link='{yt}']")
    print(f"Row indices: {dup_rows.index.tolist()} (Total: {len(dup_rows)})")

    # Compare values column by column
    differences = {}
    for col in df.columns:
        if len(dup_rows[col].unique()) > 1:  # if column has more than one unique value in this group
            differences[col] = dup_rows[col].tolist()

    if differences:
        print("Fields that differ:")
        for col, vals in differences.items():
            print(f"  {col}: {vals}")
    else:
        print("  No differing fields — rows are exact duplicates.")



Duplicate group: [Talk Title='AI Tools Under Control: Keeping Your Agents Secure and Reliable', Full Name='Bar Chen', YouTube Link='https://youtu.be/poqhv4hPTpA?si=wDHbtOf6DGPVJfvl']
Row indices: [61, 413] (Total: 2)
  No differing fields — rows are exact duplicates.

Duplicate group: [Talk Title='Agentic AI: Learning Iteratively, Acting Autonomously', Full Name='Fatma Tarlaci', YouTube Link='https://youtu.be/_k8sPizcqUg?si=nQFRnoSd3voCuvSC']
Row indices: [49, 401] (Total: 2)
  No differing fields — rows are exact duplicates.

Duplicate group: [Talk Title='Build with Mistral', Full Name='Sophia  Yang', YouTube Link='https://youtu.be/_IM53bMowlQ?si=TkQ_sbpgcShA9mOD']
Row indices: [59, 411] (Total: 2)
  No differing fields — rows are exact duplicates.

Duplicate group: [Talk Title='Building AI Infrastructure for the GenAI Wave', Full Name='Shreya Rajpal', YouTube Link='https://youtu.be/Se9_38V2TPA?si=TSU-oQAASiQ5eAG3']
Row indices: [65, 417] (Total: 2)
  No differing fields — rows are e

In [4]:
import pandas as pd
import numpy as np

# === Load ===
file_path = "/content/mlops-events.csv"  # adjust as needed
df = pd.read_csv(file_path)

# === Create two views ===
# raw_df: untouched; clean_df: normalized strings for fair comparison
raw_df = df.copy()

clean_df = df.copy()
for c in clean_df.select_dtypes(include=["object"]).columns:
    # normalize strings: strip, collapse internal whitespace, normalize case for urls/titles/names as desired
    clean_df[c] = clean_df[c].astype(str).str.replace(r"\s+", " ", regex=True).str.strip()

# === 1) Whole-row duplicates (exact) ===
whole_row_dupe_mask_raw   = raw_df.duplicated(keep=False)
whole_row_dupe_mask_clean = clean_df.duplicated(keep=False)

print("Exact duplicate rows (raw):", int(whole_row_dupe_mask_raw.sum()))
print("Exact duplicate rows (cleaned):", int(whole_row_dupe_mask_clean.sum()))

# === 2) Key-duplicates on the trio ===
KEY = ["Talk Title", "Full Name", "YouTube Link"]
missing_keys = [k for k in KEY if k not in df.columns]
if missing_keys:
    raise ValueError(f"Missing expected columns: {missing_keys}")

key_dupe_counts = clean_df.groupby(KEY).size().reset_index(name="Count")
key_dupe_groups = key_dupe_counts[key_dupe_counts["Count"] > 1]

print("Duplicate key groups (same trio):", len(key_dupe_groups))
print("Total rows in key-duplicate groups:", int(key_dupe_groups["Count"].sum()))

# === 3) For each key-duplicate group, check if rows are fully identical (across ALL columns) ===
def row_signature(row):
    # tuple of all columns in order; NaN-safe
    return tuple((None if pd.isna(v) else v) for v in row.tolist())

fully_identical_groups = []
partly_different_groups = []

for _, grp_key in key_dupe_groups[KEY].iterrows():
    mask = (clean_df[KEY] == grp_key.values).all(axis=1)
    grp = clean_df.loc[mask]

    # Build signatures across ALL columns for each row
    sigs = grp.apply(row_signature, axis=1)
    unique_sigs = sigs.drop_duplicates()

    if len(unique_sigs) == 1:
        fully_identical_groups.append(tuple(grp_key.values))
    else:
        partly_different_groups.append(tuple(grp_key.values))

print("Key-duplicate groups that are full-row identical (cleaned view):", len(fully_identical_groups))
print("Key-duplicate groups with some differing fields:", len(partly_different_groups))

# === 4) Show diagnostics for differences (including hidden chars) ===
def show_hidden(s):
    if pd.isna(s):
        return "<NaN>"
    return repr(str(s))  # exposes escaped whitespace like '\xa0', '\n', etc.

def explain_group(triple, max_rows=10):
    talk, name, yt = triple
    mask_raw   = (raw_df["Talk Title"] == talk) & (raw_df["Full Name"] == name) & (raw_df["YouTube Link"] == yt)
    mask_clean = (clean_df["Talk Title"] == talk) & (clean_df["Full Name"] == name) & (clean_df["YouTube Link"] == yt)

    raw_rows   = raw_df.loc[mask_raw]
    clean_rows = clean_df.loc[mask_clean]

    print("\n=== Group ===")
    print(f"Talk Title: {talk}")
    print(f"Full Name:  {name}")
    print(f"YouTube:    {yt}")
    print("Row indices (raw):  ", raw_rows.index.tolist())
    print("Row indices (clean):", clean_rows.index.tolist())

    # Column-by-column unique values (cleaned)
    diffs = {}
    for c in clean_df.columns:
        vals = clean_rows[c]
        uniq = pd.unique(vals)
        if len(uniq) > 1:
            diffs[c] = [show_hidden(x) for x in uniq[:max_rows]]

    if diffs:
        print("Differing fields (cleaned view):")
        for c, vals in diffs.items():
            print(f"  - {c}: {vals}")
    else:
        print("No differing fields in cleaned view (full-row identical after normalization).")

# Example: print details for a few groups (both categories)
for triple in fully_identical_groups[:3]:
    explain_group(triple)

for triple in partly_different_groups[:3]:
    explain_group(triple)


Exact duplicate rows (raw): 103
Exact duplicate rows (cleaned): 103
Duplicate key groups (same trio): 30
Total rows in key-duplicate groups: 113
Key-duplicate groups that are full-row identical (cleaned view): 29
Key-duplicate groups with some differing fields: 1

=== Group ===
Talk Title: AI Tools Under Control: Keeping Your Agents Secure and Reliable
Full Name:  Bar Chen
YouTube:    https://youtu.be/poqhv4hPTpA?si=wDHbtOf6DGPVJfvl
Row indices (raw):   [61, 413]
Row indices (clean): [61, 413]
No differing fields in cleaned view (full-row identical after normalization).

=== Group ===
Talk Title: Agentic AI: Learning Iteratively, Acting Autonomously
Full Name:  Fatma Tarlaci
YouTube:    https://youtu.be/_k8sPizcqUg?si=nQFRnoSd3voCuvSC
Row indices (raw):   [49, 401]
Row indices (clean): [49, 401]
No differing fields in cleaned view (full-row identical after normalization).

=== Group ===
Talk Title: Build with Mistral
Full Name:  Sophia Yang
YouTube:    https://youtu.be/_IM53bMowlQ?si=T