##### ═══════════════════════════════════════════════════════════
##### PSEUDONYMIZATION & INSPECTION NOTEBOOK
##### Creates privacy-safe versions of the marketing and streaming data by replacing artist and product names with David Lynch-themed pseudonyms assigned through deterministic hashing. It also perturbs the streaming numbers and replaces project numbers and campaign IDs with pseudonymous numeric IDs. Finally, it checks how many artists are present in both the streaming and the marketing dataset, and how many appear in only one of them.
##### ═══════════════════════════════════════════════════════════

In [1]:
# ═══════════════════════════════════════════════════════════
# 0. Imports
# ═══════════════════════════════════════════════════════════

import pandas as pd
import hashlib
import random
import warnings
warnings.filterwarnings('ignore')
import os
import gzip
import shutil

In [2]:
# ═══════════════════════════════════════════════════════════
# 1. LOAD DATA
# ═══════════════════════════════════════════════════════════

# Both source files are read with the same single-byte encoding.
SOURCE_ENCODING = 'ISO-8859-1'

m_df = pd.read_csv('original_marketing_canonical_names.csv', encoding=SOURCE_ENCODING)
s_df = pd.read_csv('original_streaming_canonical_names.csv', encoding=SOURCE_ENCODING)

# Report row counts and the number of distinct canonical artists/products,
# marketing first, then streaming.
marketing_artist_count = m_df['canonical_artist'].nunique()
marketing_product_count = m_df['canonical_product'].nunique()
print(f"  Marketing: {len(m_df):,} rows")
print(f"  Marketing unique artists - Canonical names file - cleaned: {marketing_artist_count}")
print(f"  Marketing unique products - Canonical names file- cleaned: {marketing_product_count}")

streaming_artist_count = s_df['canonical_artist'].nunique()
streaming_product_count = s_df['canonical_product'].nunique()
print(f"  Streaming: {len(s_df):,} rows")
print(f"  Streaming unique artists - Canonical names file- cleaned: {streaming_artist_count}")
print(f"  Streaming unique products - Canonical names file- cleaned: {streaming_product_count}")

  Marketing: 13,835 rows
  Marketing unique artists - Canonical names file - cleaned: 79
  Marketing unique products - Canonical names file- cleaned: 126
  Streaming: 614,005 rows
  Streaming unique artists - Canonical names file- cleaned: 72
  Streaming unique products - Canonical names file- cleaned: 910


In [3]:
# ═══════════════════════════════════════════════════════════
# 2. LOAD PSEUDONYM POOLS
# ═══════════════════════════════════════════════════════════

def load_pool(path, column):
    """Read the pseudonym pool stored in one column of a CSV file.

    Args:
        path: location of the CSV file
        column: name of the column holding the pseudonyms

    Returns:
        list with the column's non-missing values, converted to str

    Raises:
        ValueError: if the column is absent or holds no values
    """
    # utf-8-sig is attempted first (it also copes with a BOM); cp1252 is the
    # fallback when the bytes are not valid UTF-8.
    try:
        frame = pd.read_csv(path, encoding='utf-8-sig')
    except UnicodeDecodeError:
        frame = pd.read_csv(path, encoding='cp1252')

    if column not in frame.columns:
        raise ValueError(f"Column '{column}' not found in {path}")

    present = frame[column].dropna()
    pseudonyms = present.astype(str).tolist()
    if len(pseudonyms) == 0:
        raise ValueError(f"No values found in column '{column}' of {path}")
    return pseudonyms
    
# Pseudonym pools: one CSV for artists, one for songs/products.
artist_pool = load_pool(path='pseudonym_artists.csv', column='pseudonym_artist')
song_pool = load_pool(path='pseudonym_songs.csv', column='pseudonym_song')

# Report how many pseudonyms each pool offers.
artist_pool_size = len(artist_pool)
song_pool_size = len(song_pool)
print("✓ Pseudonym pools loaded")
print(f"  Artists available: {artist_pool_size}")
print(f"  Songs available: {song_pool_size}")

✓ Pseudonym pools loaded
  Artists available: 113
  Songs available: 1019


In [4]:
# ═══════════════════════════════════════════════════════════
# 3. HELPER FUNCTIONS FOR DETERMINISTIC MAPPING
# ═══════════════════════════════════════════════════════════

def stable_index(key: str, modulo: int, salt: str = "") -> int:
    """
    Map a key to a reproducible position in range(modulo).

    The salt is prepended to str(key), the UTF-8 bytes are hashed with
    SHA-256, and the digest, read as one big-endian integer, is reduced
    modulo `modulo`. Identical (key, salt, modulo) inputs give the same index.
    """
    salted_bytes = (salt + str(key)).encode("utf-8")
    digest = hashlib.sha256(salted_bytes).digest()
    return int.from_bytes(digest, "big") % modulo

def assign_unique(values, pool, salt):
    """
    Give every distinct value its own pseudonym from the pool.

    Each value (as str) is hashed to a starting slot with stable_index; if the
    pseudonym in that slot is already taken, the following slots are tried in
    turn, wrapping around the pool. Values are processed in first-seen order,
    so the result depends on the order of `values` whenever slots collide.

    Args:
        values: iterable of real values to pseudonymize
        pool: list of available pseudonyms
        salt: string passed to stable_index, so different purposes can get
            different mappings

    Returns:
        dict mapping str(real value) → pseudonym

    Raises:
        ValueError: if there are more distinct values than pool entries
        RuntimeError: if a full pass over the pool finds no free pseudonym
    """
    # Stringify and de-duplicate, keeping first-seen order.
    distinct_values = list(dict.fromkeys(str(item) for item in values))
    pool_size = len(pool)

    if len(distinct_values) > pool_size:
        raise ValueError(f"Need {len(distinct_values)} pseudonyms; pool has {pool_size}.")

    taken = set()  # pseudonyms already handed out (names, not slot numbers)
    mapping = {}

    for real_value in distinct_values:
        home_slot = stable_index(real_value, pool_size, salt)
        slot = home_slot

        # Linear probing: step forward until an unused pseudonym turns up.
        while pool[slot] in taken:
            slot = (slot + 1) % pool_size
            if slot == home_slot:
                raise RuntimeError("Exhausted pool")

        pseudonym = pool[slot]
        taken.add(pseudonym)
        mapping[real_value] = pseudonym

    return mapping


def assign_numeric_ids(values, digits=12, salt=""):
    """
    Generate deterministic numeric IDs for project_no and campaign_id.

    Values are converted to str, de-duplicated and processed in sorted order,
    so the result does not depend on the order of `values`. Each ID is the
    SHA-256 hash of salt + value reduced modulo 10**digits; collisions are
    resolved by stepping to the next free number.

    Args:
        values: iterable of real IDs
        digits: length of generated numeric ID (must be at least 1)
        salt: string to ensure different mappings for projects vs campaigns

    Returns:
        dict mapping str(real ID) → zero-padded numeric ID string of
        exactly `digits` characters

    Raises:
        ValueError: if digits is smaller than 1
        RuntimeError: if there are more distinct values than available IDs
    """
    # digits == 0 would give 1-character IDs and a negative value would turn
    # the modulus into a fraction, so both are rejected up front.
    if digits < 1:
        raise ValueError(f"digits must be at least 1; got {digits!r}")

    uniques = sorted(set(map(str, values)))
    modulo = 10 ** digits

    # Fail before doing any work when the ID space cannot hold every value.
    if len(uniques) > modulo:
        raise RuntimeError("ID space exhausted")

    mapping = {}
    used = set()

    for val in uniques:
        # Generate deterministic number from hash
        digest = hashlib.sha256((salt + val).encode("utf-8")).hexdigest()
        code = int(digest, 16) % modulo

        # Linear probing to avoid collisions; a free code always exists
        # because of the capacity check above.
        while code in used:
            code = (code + 1) % modulo

        used.add(code)
        mapping[val] = str(code).zfill(digits)

    return mapping


def normalize_key(series):
    """
    Bring ID values into one canonical string form.

    Every entry is converted to str, surrounding whitespace is stripped, and
    a trailing ".0", ".00", ... is removed, e.g. '12345.0' → '12345'.
    Returns a new Series; the input is not modified.
    """
    return (
        series.astype(str)
        .str.strip()
        .str.replace(r"\.0+$", "", regex=True)
    )

print("✓ Helper functions defined")

✓ Helper functions defined


In [5]:
# ═══════════════════════════════════════════════════════════
# 4. NORMALIZE IDs IN BOTH FILES
# ═══════════════════════════════════════════════════════════

# project_no exists in both frames: strip whitespace and a trailing ".0" so
# the same project is written identically in each of them.
for frame in (m_df, s_df):
    frame['project_no'] = normalize_key(frame['project_no'])

# campaign_id is normalized only when the marketing file has that column.
if 'campaign_id' in m_df.columns:
    m_df['campaign_id'] = normalize_key(m_df['campaign_id'])

print("✓ IDs normalized")

✓ IDs normalized


In [6]:
# ═══════════════════════════════════════════════════════════
# 5. COLLECT ALL UNIQUE VALUES FROM BOTH FILES
# ═══════════════════════════════════════════════════════════
# This ensures consistent pseudonyms across both files!

# Project numbers: union over both files.
all_projects = set(m_df['project_no']).union(s_df['project_no'])
print(f"  Total unique projects: {len(all_projects)}")

# Campaign IDs live in the marketing file only (and only if the column exists).
all_campaigns = set(m_df['campaign_id']) if 'campaign_id' in m_df.columns else set()
print(f"  Total unique campaigns: {len(all_campaigns)}")

# Canonical artists: union over both files, missing values excluded.
all_artists = {
    *m_df['canonical_artist'].dropna(),
    *s_df['canonical_artist'].dropna(),
}
print(f"  Total unique canonical artists: {len(all_artists)}")

# Canonical products: union over both files, missing values excluded.
all_products = {
    *m_df['canonical_product'].dropna(),
    *s_df['canonical_product'].dropna(),
}

print("\n✓ All unique values collected from both files")
print(f"  Total unique canonical products: {len(all_products)}")

  Total unique projects: 82
  Total unique campaigns: 111
  Total unique canonical artists: 82

✓ All unique values collected from both files
  Total unique canonical products: 962


In [7]:
# ═══════════════════════════════════════════════════════════
# 6. BUILD PSEUDONYM MAPPINGS
# ═══════════════════════════════════════════════════════════
# Each mapping is deterministic: same input always gives same output

SALT = "lynch_salt_v6"  # Base salt; a per-purpose suffix keeps the four mappings independent

# project_no → 12-digit numeric pseudonym (e.g., "12345" → "847362910485")
project_map = assign_numeric_ids(all_projects, digits=12, salt=f"{SALT}_project")
print(f"✓ Project mapping created: {len(project_map)} projects")

# campaign_id → 12-digit numeric pseudonym; stays empty when no campaigns were collected
campaign_map = {}
if all_campaigns:
    campaign_map = assign_numeric_ids(all_campaigns, digits=12, salt=f"{SALT}_campaign")
    print(f"✓ Campaign mapping created: {len(campaign_map)} campaigns")

# canonical_artist → pseudonym from the artist pool (e.g., "Rihanna" → "Audrey_Horne").
# The names are sorted first so the collision-probing order is the same on every run.
artists_in_order = sorted(all_artists)
artist_map = assign_unique(artists_in_order, artist_pool, f"{SALT}_artist")
print(f"✓ Artist mapping created: {len(artist_map)} artists")

# canonical_product → pseudonym from the song pool (e.g., "Umbrella" → "Black_Lodge"),
# again processed in sorted order.
products_in_order = sorted(all_products)
product_map = assign_unique(products_in_order, song_pool, f"{SALT}_song")
print(f"✓ Product mapping created: {len(product_map)} products")

✓ Project mapping created: 82 projects
✓ Campaign mapping created: 111 campaigns
✓ Artist mapping created: 82 artists
✓ Product mapping created: 962 products


In [8]:
# ═══════════════════════════════════════════════════════════
# 7. ADD STREAM NOISE (±10% per project)
# ═══════════════════════════════════════════════════════════
# Adds controlled noise to stream counts to protect privacy
# Each project gets consistent noise multiplier (0.9 to 1.1)

rng = random.Random(101)  # Fixed seed for reproducibility

# Create noise multiplier for each project (before pseudonymization).
# The projects are visited in sorted order: iterating the all_projects set
# directly follows string-hash order, which changes between interpreter
# sessions (hash randomization), so the seeded draws would be paired with
# different projects on every run and the output would not be reproducible.
project_noise = {
    proj: rng.uniform(0.9, 1.1)
    for proj in sorted(all_projects)
}

# Apply noise to streaming data: each non-null stream count is scaled by its
# project's multiplier and truncated to an integer; null counts are kept as-is.
if 'streams' in s_df.columns:
    s_df['streams'] = s_df.apply(
        lambda row: int(row['streams'] * project_noise[row['project_no']])
        if pd.notnull(row['streams']) else row['streams'],
        axis=1
    )
    print(f"✓ Stream noise applied (±10%)")

✓ Stream noise applied (±10%)


In [9]:
# ═══════════════════════════════════════════════════════════
# 8. APPLY PSEUDONYMIZATION TO MARKETING FILE
# ═══════════════════════════════════════════════════════════
print("\nPseudonymizing marketing data...")

# Pseudonymize project_no
m_df['project_no'] = m_df['project_no'].map(project_map)
# Convert to Int64 to preserve precision when saving.
# NOTE: project_map values are zero-padded strings; as integers any leading
# zeros are not kept.
m_df['project_no'] = m_df['project_no'].astype('Int64')

# Pseudonymize campaign_id
if 'campaign_id' in m_df.columns and campaign_map:
    m_df['campaign_id'] = m_df['campaign_id'].map(campaign_map)

# Pseudonymize artist_name using canonical_artist mapping
m_df['artist_name'] = m_df['canonical_artist'].map(artist_map)
# Pseudonymize canonical_artist
m_df['canonical_artist'] = m_df['canonical_artist'].map(artist_map)

# Pseudonymize product columns - both are derived from canonical_product.
# product_map is keyed by canonical product names, so pushing the raw `product`
# column through it would turn every non-canonical spelling into NaN. The raw
# column therefore receives the canonical pseudonym, mirroring how artist_name
# is filled from canonical_artist above.
m_df['canonical_product'] = m_df['canonical_product'].map(product_map)
m_df['product'] = m_df['canonical_product']

print("✓ Marketing data pseudonymized")
print(f"  Rows: {len(m_df):,}")
print(f"  Columns: {len(m_df.columns)}")


Pseudonymizing marketing data...
✓ Marketing data pseudonymized
  Rows: 13,835
  Columns: 46


In [10]:
# ═══════════════════════════════════════════════════════════
# 9. APPLY PSEUDONYMIZATION TO STREAMING FILE
# ═══════════════════════════════════════════════════════════
print("\nPseudonymizing streaming data...")

# Pseudonymize project_no
s_df['project_no'] = s_df['project_no'].map(project_map)
# Convert to Int64 to preserve precision when saving.
# NOTE: project_map values are zero-padded strings; as integers any leading
# zeros are not kept.
s_df['project_no'] = s_df['project_no'].astype('Int64')

# Pseudonymize artist_name using canonical_artist mapping
s_df['artist_name'] = s_df['canonical_artist'].map(artist_map)
# Pseudonymize canonical_artist
s_df['canonical_artist'] = s_df['canonical_artist'].map(artist_map)

# Pseudonymize product columns - both are derived from canonical_product.
# product_map is keyed by canonical product names, so pushing the raw
# `product_name` column through it would turn every non-canonical spelling
# into NaN. The raw column therefore receives the canonical pseudonym,
# mirroring how artist_name is filled from canonical_artist above.
s_df['canonical_product'] = s_df['canonical_product'].map(product_map)
s_df['product_name'] = s_df['canonical_product']

# Check mapping: NaN here means a canonical product had no entry in product_map
print(f"NaN values after mapping: {s_df['canonical_product'].isna().sum()}")
print(f"Unique canonical_product: {s_df['canonical_product'].nunique()}")

print("✓ Streaming data pseudonymized")
print(f"  Rows: {len(s_df):,}")
print(f"  Columns: {len(s_df.columns)}")


Pseudonymizing streaming data...
NaN values after mapping: 0
Unique canonical_product: 910
✓ Streaming data pseudonymized
  Rows: 614,005
  Columns: 9


In [11]:
# ═══════════════════════════════════════════════════════════
# 10. VERIFY PSEUDONYMIZATION
# ═══════════════════════════════════════════════════════════

rule = "=" * 60
print("\n" + rule)
print("PSEUDONYMIZATION VERIFICATION")
print(rule)

# The three pseudonymized key columns shared by both files, with the label
# used for each in the report.
key_columns = ['project_no', 'canonical_artist', 'canonical_product']
key_labels = ['projects', 'artists', 'products']

print("\nMARKETING FILE:")
for label, column in zip(key_labels, key_columns):
    print(f"  Unique {label}: {m_df[column].nunique()}")
if 'campaign_id' in m_df.columns:
    print(f"  Unique campaigns: {m_df['campaign_id'].nunique()}")

print("\nSTREAMING FILE:")
for label, column in zip(key_labels, key_columns):
    print(f"  Unique {label}: {s_df[column].nunique()}")

# First three rows of each file, key columns only, as a visual spot check.
print("\nSample pseudonymized data (marketing):")
print(m_df[key_columns].head(3))

print("\nSample pseudonymized data (streaming):")
print(s_df[key_columns].head(3))


PSEUDONYMIZATION VERIFICATION

MARKETING FILE:
  Unique projects: 80
  Unique artists: 79
  Unique products: 126
  Unique campaigns: 111

STREAMING FILE:
  Unique projects: 72
  Unique artists: 72
  Unique products: 910

Sample pseudonymized data (marketing):
     project_no canonical_artist  canonical_product
0  432191283317    Senorita_Dido          True_Face
1  791586853774         Log_Lady  Got_a_Light_Again
2  663102993334      Gordon_Cole       Rita_Appears

Sample pseudonymized data (streaming):
     project_no canonical_artist canonical_product
0  318733585791    Nadine_Hurley      Red_Bathrobe
1  616610447432  Annie_Blackburn           No_Hope
2  963210974654     Tom_Beaumont      Inner_Vision


One artist, Sailor Ripley (a pseudonym; the same holds for the corresponding artist in the original file), is an exception to the "one artist, one project_no" rule: they have two project numbers, one for an album and one for a track that does not belong to that album. This is why the marketing file has 79 artists but 80 project numbers.

In [12]:
# Write both pseudonymized frames to CSV with pandas' default settings
# (streaming first, then marketing).
for frame, csv_name in (
    (s_df, 'streaming_pseudonymized_new.csv'),
    (m_df, 'marketing_pseudonymized_new.csv'),
):
    frame.to_csv(csv_name)

In [13]:
# Size of the uncompressed streaming export, in MiB
bytes_per_mb = 1024 ** 2
size_mb = os.path.getsize('streaming_pseudonymized_new.csv') / bytes_per_mb
print(f"Size: {size_mb:.2f} MB")

# Stream the CSV into a gzip archive next to it
with open('streaming_pseudonymized_new.csv', 'rb') as f_in, \
        gzip.open('streaming_pseudonymized_new.csv.gz', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)

# Size of the archive and the relative saving
compressed_mb = os.path.getsize('streaming_pseudonymized_new.csv.gz') / bytes_per_mb
print(f"Compressed: {compressed_mb:.2f} MB")
print(f"Savings: {(1 - compressed_mb/size_mb)*100:.1f}%")

Size: 61.80 MB
Compressed: 12.01 MB
Savings: 80.6%


In [14]:
# ═══════════════════════════════════════════════════════════
# MATCHED ARTISTS
# ═══════════════════════════════════════════════════════════
# Artists (pseudonyms) that occur in BOTH the marketing and the streaming file
marketing_artists = set(m_df['canonical_artist'].unique())
streaming_artists = set(s_df['canonical_artist'].unique())
overlap = marketing_artists & streaming_artists

print(f"\nArtists in both files (artists who had both ad campaigns and streams): {len(overlap)}")
print(f"Example overlapping artists: {list(overlap)[:5]}")

# Spot check: take the first overlapping artist and compare the project_no of
# their first row in each file.
if overlap:
    test_artist = next(iter(overlap))
    m_project = m_df.loc[m_df['canonical_artist'] == test_artist, 'project_no'].iloc[0]
    s_project = s_df.loc[s_df['canonical_artist'] == test_artist, 'project_no'].iloc[0]
    print(f"\n{test_artist}:")
    print(f"  Marketing project_no: {m_project}")
    print(f"  Streaming project_no: {s_project}")
    print(f"  Match: {m_project == s_project}")


Artists in both files (artists who had both ad campaigns and streams): 69
Example overlapping artists: ['Major_Briggs', 'Shelly_Johnson', 'Norma_Jennings', 'Rebecca_Del_Rio', 'The_Fireman']

Major_Briggs:
  Marketing project_no: 106206807592
  Streaming project_no: 106206807592
  Match: True


In [15]:
# ═══════════════════════════════════════════════════════════
# UNMATCHED ARTISTS
# ═══════════════════════════════════════════════════════════

print("\n" + "=" * 60)
print("UNMATCHED ARTISTS")
print("=" * 60)


# 1. Artists only in one file
print("\nARTISTS: Present in only one file")
print("-" * 60)
marketing_artists = set(m_df['canonical_artist'].unique())
streaming_artists = set(s_df['canonical_artist'].unique())
both_artists = marketing_artists.intersection(streaming_artists)

# Artists with campaigns but no rows in the streaming file, listed alphabetically
marketing_only = marketing_artists.difference(streaming_artists)
print(f"   Marketing only - Signed artists & Compilations that had campaigns but had no streams({len(marketing_only)}):")
for name in sorted(marketing_only):
    print(f"     - {name}")

# Artists with streams but no rows in the marketing file, listed alphabetically
streaming_only = streaming_artists.difference(marketing_artists)
print(f"\n   Streaming only - Signed artists that had no campaigns but still had streams ({len(streaming_only)}):")
for name in sorted(streaming_only):
    print(f"     - {name}")



UNMATCHED ARTISTS

ARTISTS: Present in only one file
------------------------------------------------------------
   Marketing only - Signed artists & Compilations that had campaigns but had no streams(10):
     - Coco
     - Diane_Selwyn
     - Lady_Jessica
     - Marietta_Fortune
     - Monica_Bellucci_Herself
     - Mr_Roque
     - Mrs_Tremonds_Grandson
     - Nikki_Grace
     - Pete_Martell
     - The_Man_from_Another_Place

   Streaming only - Signed artists that had no campaigns but still had streams (3):
     - Gersten_Hayward
     - Harold_Smith
     - Lady_Margot_Fenring


In [16]:
# Check artists present in both files but with no overlapping products
print("\n ARTISTS IN BOTH FILES WITH NO OVERLAPPING PRODUCTS")
print("-" * 60)

artists_m = set(m_df['canonical_artist'].unique())
artists_s = set(s_df['canonical_artist'].unique())

both_artists = artists_m & artists_s

# For each shared artist, compare the set of products advertised (marketing)
# with the set of products streamed; keep the artist when the two sets have
# nothing in common.
artists_both_no_overlap = []

for artist in both_artists:
    m_products = set(
        m_df.loc[m_df['canonical_artist'] == artist, 'canonical_product']
    )
    s_products = set(
        s_df.loc[s_df['canonical_artist'] == artist, 'canonical_product']
    )

    if m_products.isdisjoint(s_products):
        artists_both_no_overlap.append(artist)

num_no_overlap = len(artists_both_no_overlap)
# Report 0.0% instead of dividing by zero when no artist is in both files.
pct_no_overlap = num_no_overlap / len(both_artists) * 100 if both_artists else 0.0

print(f"Artists in both datasets with NO shared products: {num_no_overlap}")
print(f"→ {pct_no_overlap:.1f}% of artists present in both files with no overlapping products")


 ARTISTS IN BOTH FILES WITH NO OVERLAPPING PRODUCTS
------------------------------------------------------------
Artists in both datasets with NO shared products: 12
→ 17.4% of artists present in both files with no overlapping products


This doesn't look ideal: roughly 1/6 (17.4%) of the artists present in both files have no advertised product that matches a product in the streaming data. But remember that some of our products were albums, playlists or giveaways for fans; even if they generated streams, they have no direct match in our streaming dataset, where streams are recorded at track level. We need to inspect the share of each product_type.

In [17]:
# Marketing rows of the artists that have no product in common with streaming
problem_m_df = m_df[
    m_df['canonical_artist'].isin(artists_both_no_overlap)
]

# Number of distinct advertised products per (number type, Spotify type) pair
product_type_summary = (
    problem_m_df
    .groupby(['product_number_type', 'product_type_on_spotify'])
    ['canonical_product']
    .nunique()
    .reset_index(name='unique_products')
)

total_problem_products = problem_m_df['canonical_product'].nunique()

# Share of each type among all distinct problem products. A product listed
# under more than one type pair is counted in each of them, so the shares can
# add up to more than 100%.
product_type_summary['pct_of_products'] = (
    product_type_summary['unique_products'] / total_problem_products * 100
)

# Last expression of the cell: the sorted table is what the notebook displays.
# (An earlier sort whose result was discarded has been removed.)
product_type_summary.sort_values('pct_of_products', ascending=False)


Unnamed: 0,product_number_type,product_type_on_spotify,unique_products,pct_of_products
3,URL,no_spotify_link,6,42.857143
2,Track URI,track,4,28.571429
0,Album URI,album,3,21.428571
1,Playlist URI,playlist,2,14.285714


This looks much better: Only 4 advertised tracks (songs) didn't have streams.

In [18]:
# List the advertised tracks of the no-overlap artists: rows whose product is
# a Spotify track referenced by a Track URI, reduced to one line per
# (artist, product, project) combination.
is_track_uri = problem_m_df['product_number_type'] == 'Track URI'
is_spotify_track = problem_m_df['product_type_on_spotify'] == 'track'
track_columns = ['canonical_artist', 'canonical_product', 'project_no']

problematic_tracks_df = (
    problem_m_df.loc[is_track_uri & is_spotify_track, track_columns]
    .drop_duplicates()
)
print(problematic_tracks_df)

        canonical_artist canonical_product    project_no
15         Count_Fenring      Red_Curtains  511340909446
162      Annie_Blackburn  Together_at_last  616610447432
716           The_Cowboy         Horseshoe  633147923560
1462  Detective_Williams        Purple_Sea  351022636206


# Horseshoe is "Think about it" by 2Hot2Play, which does have streams... Maybe capitalization was the issue?

In [19]:
# Export mappings WITH original names for verification
# NOTE: both CSV files written here pair every pseudonym with its original
# name, i.e. they are re-identification keys and must be kept apart from the
# pseudonymized data sets.
print("\n7. EXPORTING MAPPINGS WITH ORIGINAL NAMES")
print("-" * 60)

# Create reverse mappings from the pseudonymization dictionaries
artist_reverse = {v: k for k, v in artist_map.items()}
product_reverse = {v: k for k, v in product_map.items()}


def _sorted_values_by_key(frame, key_column, value_column):
    """Return {key: sorted unique value_column entries} for every key in key_column."""
    grouped = frame.groupby(key_column)[value_column].unique()
    return {key: sorted(found) for key, found in grouped.items()}


# One grouping pass per frame and direction replaces a full scan of each frame
# for every single artist / product.
m_products_by_artist = _sorted_values_by_key(m_df, 'canonical_artist', 'canonical_product')
s_products_by_artist = _sorted_values_by_key(s_df, 'canonical_artist', 'canonical_product')
m_artists_by_product = _sorted_values_by_key(m_df, 'canonical_product', 'canonical_artist')
s_artists_by_product = _sorted_values_by_key(s_df, 'canonical_product', 'canonical_artist')

# Create artist mapping with original names: one row per pseudonymized artist
artist_mapping_export = []
for artist in sorted(m_products_by_artist.keys() | s_products_by_artist.keys()):
    m_products = m_products_by_artist.get(artist, [])
    s_products = s_products_by_artist.get(artist, [])
    overlap_products = sorted(set(m_products) & set(s_products))

    artist_mapping_export.append({
        'pseudonym_artist': artist,
        'original_artist': artist_reverse.get(artist, artist),
        'in_marketing': artist in m_products_by_artist,
        'in_streaming': artist in s_products_by_artist,
        'marketing_product_count': len(m_products),
        'streaming_product_count': len(s_products),
        'overlap_product_count': len(overlap_products),
        'marketing_products_pseudo': ', '.join(m_products),
        'marketing_products_original': ', '.join(product_reverse.get(p, p) for p in m_products),
        'streaming_products_pseudo': ', '.join(s_products),
        'streaming_products_original': ', '.join(product_reverse.get(p, p) for p in s_products),
        'overlap_products_pseudo': ', '.join(overlap_products),
        'overlap_products_original': ', '.join(product_reverse.get(p, p) for p in overlap_products),
    })

artist_mapping_df = pd.DataFrame(artist_mapping_export)
artist_mapping_df.to_csv('artist_product_mapping_with_originals.csv', index=False)
print(f"   ✓ Saved artist_product_mapping_with_originals.csv ({len(artist_mapping_df)} artists)")

# Create product mapping with original names: one row per pseudonymized product
product_mapping_export = []
for product in sorted(m_artists_by_product.keys() | s_artists_by_product.keys()):
    m_artists = m_artists_by_product.get(product, [])
    s_artists = s_artists_by_product.get(product, [])

    product_mapping_export.append({
        'pseudonym_product': product,
        'original_product': product_reverse.get(product, product),
        'in_marketing': product in m_artists_by_product,
        'in_streaming': product in s_artists_by_product,
        'marketing_artists_pseudo': ', '.join(m_artists),
        'marketing_artists_original': ', '.join(artist_reverse.get(a, a) for a in m_artists),
        'streaming_artists_pseudo': ', '.join(s_artists),
        'streaming_artists_original': ', '.join(artist_reverse.get(a, a) for a in s_artists),
    })

product_mapping_df = pd.DataFrame(product_mapping_export)
product_mapping_df.to_csv('product_artist_mapping_with_originals.csv', index=False)
print(f"   ✓ Saved product_artist_mapping_with_originals.csv ({len(product_mapping_df)} products)")

print(f"\n   Files saved with both pseudonyms and original names")


7. EXPORTING MAPPINGS WITH ORIGINAL NAMES
------------------------------------------------------------
   ✓ Saved artist_product_mapping_with_originals.csv (82 artists)
   ✓ Saved product_artist_mapping_with_originals.csv (962 products)

   Files saved with both pseudonyms and original names
