In [1]:
# ═══════════════════════════════════════════════════════════
# 0. IMPORTS
# ═══════════════════════════════════════════════════════════
import pandas as pd
import numpy as np
import chardet
from rapidfuzz import fuzz, process
import re
import warnings
warnings.filterwarnings('ignore')

In [2]:
# ═══════════════════════════════════════════════════════════
# 1. LOAD DATA
# ═══════════════════════════════════════════════════════════
m_df = pd.read_csv('marketing.csv', encoding='ISO-8859-1')
s_df = pd.read_csv('streaming.csv', encoding='ISO-8859-1')

print(f"Marketing data: {len(m_df):,} rows")
print(f"Streaming data: {len(s_df):,} rows")

Marketing data: 14,048 rows
Streaming data: 614,005 rows


In [3]:
m_df.dtypes

Unnamed: 0               int64
supplier                object
objective               object
campaign_start_date     object
campaign_end_date       object
campaign_id             object
campaign_name           object
campaign_status         object
campaign_budget        float64
adset_id                object
adset_name              object
adset_status            object
adset_budget           float64
adset_start_date        object
adset_end_date          object
ad_id                   object
ad_name                 object
ad_status               object
territory               object
project_no             float64
artist                  object
product                 object
created_at              object
format                  object
campaign_info           object
adset_info              object
ad_info                 object
is_test                  int64
business_unit           object
platform_manual         object
product_number_type     object
product_identifier      object
audience

In [4]:
s_df.dtypes

Unnamed: 0       int64
report_date     object
source_name     object
project_id       int64
artist_name     object
product_name    object
isrc_cd         object
streams          int64
dtype: object

In [5]:
print(f"  Marketing unique artists : {m_df['artist'].nunique()}")
print(f"  Marketing unique products : {m_df['product'].nunique()}")
print(f"  Streaming unique artists : {s_df['artist_name'].nunique()}")
print(f"  Streaming unique products: {s_df['product_name'].nunique()}")

  Marketing unique artists : 90
  Marketing unique products : 149
  Streaming unique artists : 385
  Streaming unique products: 925


In [6]:
# ═══════════════════════════════════════════════════════════
# 2. CLEAN MARKETING DATA
# ═══════════════════════════════════════════════════════════

# Convert dates
date_cols = ['campaign_start_date', 'campaign_end_date', 'report_date', 'adset_start_date', 'adset_end_date']
for col in date_cols:
    if col in m_df.columns:
        m_df[col] = pd.to_datetime(m_df[col], errors='coerce')

# Rename for consistency
m_df.rename(columns={
    'artist': 'artist_name',
    'report_date': 'marketing_report_date'
}, inplace=True)

# Convert project_no to string
m_df['project_no'] = m_df['project_no'].astype(str).str.replace(r'\.0$', '', regex=True)


print(f"✓ Marketing data cleaned")
print(f"  Date range: {m_df['campaign_start_date'].min()} to {m_df['campaign_end_date'].max()}")

✓ Marketing data cleaned
  Date range: 2023-01-03 00:00:00 to 2023-08-18 00:00:00


In [7]:
# ═══════════════════════════════════════════════════════════
# 3. CLEAN STREAMING DATA
# ═══════════════════════════════════════════════════════════
# Rename project column
if 'project_id' in s_df.columns:
    s_df.rename(columns={'project_id': 'project_no'}, inplace=True)

# Convert project_no to string
s_df['project_no'] = s_df['project_no'].astype(str).str.replace(r'\.0$', '', regex=True)

# Convert report_date
s_df['report_date'] = pd.to_datetime(s_df['report_date'], errors='coerce')

# Strip whitespace from text columns
text_cols = ['source_name', 'artist_name', 'product_name']
for col in text_cols:
    if col in s_df.columns:
        s_df[col] = s_df[col].astype(str).str.strip()

# Dete Identity-giveaway ISRC code

col_to_drop = ['isrc_cd']
s_df.drop(columns=[c for c in col_to_drop if c in s_df.columns], inplace=True)

print(f"✓ Streaming data cleaned")
print(f"  Date range: {s_df['report_date'].min()} to {s_df['report_date'].max()}")

✓ Streaming data cleaned
  Date range: 2023-01-01 00:00:00 to 2023-03-30 00:00:00


In [8]:
# ═══════════════════════════════════════════════════════════
# 4. MARKETING DATA: EXTRACT SPOTIFY TYPE
# ═══════════════════════════════════════════════════════════
def extract_spotify_type(x):
    """Extract spotify content type from URI or URL"""
    if pd.isna(x):
        return "no_spotify_link"
    
    x = str(x).lower().strip()
    
    # Match spotify:track:xxxx or spotify:album:xxxx (URI format)
    match_uri = re.search(r'spotify:([a-z]+):', x)
    if match_uri:
        return match_uri.group(1)
    
    # Match https://open.spotify.com/.../xxxx (URL format)
    # Skip 'intl-xx' localization segments
    match_url = re.search(r'open\.spotify\.com/(?:intl-[a-z]{2}/)?([a-z]+)/', x)
    if match_url:
        return match_url.group(1)
    
    # Explicitly mark non-Spotify links (YouTube, artist site, Eventim, etc.)
    return "no_spotify_link"


m_df['product_type_on_spotify'] = m_df['product_identifier'].apply(extract_spotify_type)

print("✓ Spotify type extracted")
print(m_df['product_type_on_spotify'].value_counts(dropna=False))


✓ Spotify type extracted
product_type_on_spotify
track              5603
no_spotify_link    2816
playlist           2635
album              2544
prerelease          399
artist               51
Name: count, dtype: int64


In [9]:
# ═══════════════════════════════════════════════════════════
# 5. MARKETING DATA: DELETE TEST CAMPAIGNS & IDENTITY GIVEAWAY OR COMPANY-INTERNAL COLUMNS 
# ═══════════════════════════════════════════════════════════

# Adset_id, territory and created_at contain identity giveaways thus would need anonymization.
# As I'm doing anything with them analytically, I'm also dropping them now.

cols_to_drop = ['adset_name', 'ad_name','campaign_name','campaign_info', 'adset_info', 'adset_id', 'territory', 'created_at', 'ad_info', 'is_test','business_unit','product_identifier']
m_df.drop(columns=[c for c in cols_to_drop if c in m_df.columns], inplace=True)

In [10]:
# ═══════════════════════════════════════════════════════════
# 6. SAVE FILES
# ═══════════════════════════════════════════════════════════

m_df.to_csv('original_marketing_precleaned.csv', index=False)
s_df.to_csv('original_streaming_precleaned.csv', index=False)

In [11]:
# ═══════════════════════════════════════════════════════════
# 7. CREATE CANONICAL ARTIST/PRODUCT COLUMNS
# ═══════════════════════════════════════════════════════════

def canonicalize_artist(name):
    """Strip featuring artists to get primary/canonical artist"""
    if pd.isna(name):
        return name
    
    name = str(name).strip()
    
    # Remove everything after comma (e.g., "Rihanna, Jay-Z" → "Rihanna")
    name = re.sub(r'\s*,\s*.*', '', name)
    
    # Remove everything after feat/ft/featuring (with various separators)
    patterns = [
        r'\s*[\(\[]?\s*f(ea)?t\.?\s+.*',  # feat., ft., featuring
        r'\s*[\(\[]?\s*with\s+.*',         # with
        r'\s*[\(\[]?\s*&\s+.*',            # & (ampersand)
        r'\s*[\(\[]?\s*x\s+.*',            # x (collaboration marker)
    ]
    
    for pattern in patterns:
        name = re.sub(pattern, '', name, flags=re.IGNORECASE)
    
    return name.strip()


def canonicalize_product(name):
    """Strip versions/variants to get canonical product name"""
    if pd.isna(name):
        return name
    
    name = str(name).strip()
    
    # Remove featuring artists
    name = re.sub(r'\s*[\(\[]?\s*f(ea)?t\.?\s+.*?[\)\]]?', '', name, flags=re.IGNORECASE)
    
    # Remove version indicators
    version_patterns = [
    r'\s*[-–]\s*(live|remix|version|acoustic|radio edit|instrumental|explicit|clean).*',
    r'\s*[\(\[]\s*(live|remix|version|acoustic|radio edit|instrumental|explicit|clean).*?[\)\]]?',
    r'\s+(live|remix|acoustic|instrumental|explicit|clean|sommer|summer|winter|spring|fall)$',
    r'\s+\d{4}$',  # Catches years like 2023 as in "Umbrella 2023"
    r'\s*[-–]\s*\d{4}$',       # "Umbrella - 2023"
]
    for pattern in version_patterns:
        name = re.sub(pattern, '', name, flags=re.IGNORECASE)
    
    # Remove phase/number suffixes at END of song name
    # Matches: "Umbrella 3", "Umbrella Phase5", "Umbrella #3"
    phase_patterns = [
        r'\s+Phase\s*[1-5]$',      # Phase1, Phase 2, etc.
        r'\s+#[2-5]$',              # #2, #3, etc. (keeping #1 would remove track numbers)
        r'\s+[2-5]$',               # Just numbers 2-5 at the end
    ]
    
    for pattern in phase_patterns:
        name = re.sub(pattern, '', name, flags=re.IGNORECASE)
    
    return name.strip()


# Apply to marketing data
m_df['canonical_artist'] = m_df['artist_name'].apply(canonicalize_artist)
m_df['canonical_product'] = m_df['product'].apply(canonicalize_product)

# Apply to streaming data
s_df['canonical_artist'] = s_df['artist_name'].apply(canonicalize_artist)
s_df['canonical_product'] = s_df['product_name'].apply(canonicalize_product)

print(f"✓ Canonical columns created")
print(f"  Marketing unique artists - Canonical names file: {m_df['canonical_artist'].nunique()}")
print(f"  Marketing unique products - Canonical names file: {m_df['canonical_product'].nunique()}")
print(f"  Streaming unique artists - Canonical names file: {s_df['canonical_artist'].nunique()}")
print(f"  Streaming unique products - Canonical names file: {s_df['canonical_product'].nunique()}")

✓ Canonical columns created
  Marketing unique artists - Canonical names file: 88
  Marketing unique products - Canonical names file: 142
  Streaming unique artists - Canonical names file: 75
  Streaming unique products - Canonical names file: 921


In [12]:
# ═══════════════════════════════════════════════════════════
# 8. SAVE FILES WITH CANONICAL NAMES
# ═══════════════════════════════════════════════════════════
print(m_df.shape)
print(s_df.shape)

m_df.to_csv('original_marketing_canonical_names.csv', index=False)
s_df.to_csv('original_streaming_canonical_names.csv', index=False)

(14048, 47)
(614005, 9)
