##### ═══════════════════════════════════════════════════════════
##### Paid Media Summary - Part 2 of 2 - Platform-level analysis
##### ═══════════════════════════════════════════════════════════

In [1]:
import pandas as pd
import numpy as np

# Load the campaign-level summary table.
campaign_summary = pd.read_csv('final_campaign_summary_paid_media.csv')

# Drop the 'other' platform category
# (Instagram campaigns with $0 budget and almost 0 marketing metrics).
# .copy() yields an independent frame, so columns added later never write
# back into (or warn about) the raw `campaign_summary`.
keep_platform = campaign_summary['platform_category'] != 'other'
campaign_summary_clean = campaign_summary.loc[keep_platform].copy()


In [4]:
def calculate_cross_platform_overlap(row):
    """Measure how much one campaign coincides with campaigns on OTHER platforms.

    Only rows of the module-level ``campaign_summary_clean`` frame that share
    this row's ``project_no`` and have a different ``platform_category`` are
    considered. The date columns of that frame must already be datetimes.

    Parameters
    ----------
    row : pandas.Series
        One campaign; must provide ``project_no``, ``platform_category``,
        ``campaign_start_date`` and ``campaign_end_date``.

    Returns
    -------
    pandas.Series
        cross_platform_overlap_count
            Number of other-platform campaigns (rows, not distinct platforms)
            whose date range intersects this campaign's date range.
        cross_platform_overlap_baseline
            Overlap days inside the 14 days before the campaign start.
        cross_platform_overlap_during
            Overlap days inside the campaign itself.
        cross_platform_overlap_decay
            Overlap days inside the 10 days after the campaign end.

    Notes
    -----
    Day totals are summed per other campaign, so a calendar day covered by two
    other campaigns is counted twice; a total can therefore exceed the length
    of its window.
    """
    start = row['campaign_start_date']
    end = row['campaign_end_date']

    # Campaigns for the same project_no that ran on a different platform
    other_platforms = campaign_summary_clean[
        (campaign_summary_clean['project_no'] == row['project_no']) &
        (campaign_summary_clean['platform_category'] != row['platform_category'])
    ]

    # Other-platform campaigns whose dates intersect this campaign's window
    overlapping = other_platforms[
        (other_platforms['campaign_start_date'] <= end) &
        (other_platforms['campaign_end_date'] >= start)
    ]
    cross_platform_overlap_count = len(overlapping)

    # The three windows depend only on `row`, so build them once rather than
    # once per other campaign.
    baseline_start = start - pd.Timedelta(days=14)
    baseline_end = start - pd.Timedelta(days=1)
    decay_start = end + pd.Timedelta(days=1)
    decay_end = end + pd.Timedelta(days=10)

    baseline_overlap = during_overlap = decay_overlap = 0

    # Only the two date columns are needed; iterating them directly avoids
    # materialising a full Series per row as iterrows() does.
    for other_start, other_end in zip(
        other_platforms['campaign_start_date'],
        other_platforms['campaign_end_date']
    ):
        baseline_overlap += overlap_days(baseline_start, baseline_end, other_start, other_end)
        during_overlap += overlap_days(start, end, other_start, other_end)
        decay_overlap += overlap_days(decay_start, decay_end, other_start, other_end)

    return pd.Series({
        'cross_platform_overlap_count': cross_platform_overlap_count,
        'cross_platform_overlap_baseline': baseline_overlap,
        'cross_platform_overlap_during': during_overlap,
        'cross_platform_overlap_decay': decay_overlap
    })


def overlap_days(a_start, a_end, b_start, b_end):
    """Return the number of days shared by two inclusive date ranges.

    Parameters
    ----------
    a_start, a_end : datetime-like
        First and last day of range A (both inclusive).
    b_start, b_end : datetime-like
        First and last day of range B (both inclusive).

    Returns
    -------
    int
        Count of days present in both ranges; 0 when the ranges are disjoint
        or when any bound is missing (NaT / None / NaN).
    """
    # Comparisons against NaT are always False, so builtin max()/min() would
    # just keep whichever argument came first. A valid range A paired with a
    # missing range B would then report A's full length as "overlap".
    # An unknown range cannot be measured, so it contributes nothing.
    if any(pd.isna(bound) for bound in (a_start, a_end, b_start, b_end)):
        return 0

    start = max(a_start, b_start)
    end = min(a_end, b_end)
    # +1 because both endpoints are inclusive; clamp disjoint ranges to 0.
    return max(0, (end - start).days + 1)

# Convert dates
campaign_summary_clean['campaign_start_date'] = pd.to_datetime(campaign_summary_clean['campaign_start_date'])
campaign_summary_clean['campaign_end_date'] = pd.to_datetime(campaign_summary_clean['campaign_end_date'])

# Calculate cross-platform overlap (one row of overlap metrics per campaign)
cross_platform_overlaps = campaign_summary_clean.apply(calculate_cross_platform_overlap, axis=1)

# `campaign_summary_clean` is created in an earlier cell, so re-running this
# cell would otherwise append a second copy of the overlap columns. Drop any
# stale copies first so the cell can be executed repeatedly.
overlap_columns = [
    'cross_platform_overlap_count',
    'cross_platform_overlap_baseline',
    'cross_platform_overlap_during',
    'cross_platform_overlap_decay',
]
campaign_summary_clean = pd.concat(
    [
        campaign_summary_clean.drop(columns=overlap_columns, errors='ignore'),
        cross_platform_overlaps,
    ],
    axis=1
)

In [5]:
# ═══════════════════════════════════════════════════════════
# CREATE PLATFORM × OBJECTIVE SUMMARY WITH COMPOSITE METRICS
# (OVERLAP-AWARE, SAMPLE-AWARE, TRACKABILITY-AWARE)
# ═══════════════════════════════════════════════════════════

# Aggregate to platform × objective × product-number-type level.
# Named aggregation binds every output column name to its source column and
# function in one place, so adding, removing or reordering a metric cannot
# silently shift the labels of the columns that follow it.
platform_objective_summary = (
    campaign_summary_clean
    .groupby(['platform_category', 'objective', 'product_number_type'])
    .agg(
        # Campaign counts and duration
        campaign_count=('campaign_id', 'count'),
        total_campaign_duration_days=('campaign_duration_days', 'sum'),
        typical_campaign_duration=('campaign_duration_days', 'median'),

        # Spend
        total_spend=('spend', 'sum'),

        # Marketing metrics (totals, except reach which takes the group maximum)
        impressions=('impressions', 'sum'),
        reach=('reach', 'max'),
        video_views=('video_views', 'sum'),
        link_clicks=('link_clicks', 'sum'),
        interactions=('interactions', 'sum'),
        conversions=('conversions', 'sum'),
        comments=('comments', 'sum'),
        likes=('likes', 'sum'),
        reactions=('reactions', 'sum'),
        shares=('shares', 'sum'),
        follows=('follows', 'sum'),

        # Streaming metrics
        streams_during_campaign=('streams_during_campaign', 'sum'),
        streams_10day_post=('streams_10day_post', 'sum'),
        baseline_streams_14day_pre=('baseline_streams_14day_pre', 'sum'),
        # Proportion of campaigns in the group whose streaming_trackable == 'track'
        trackable_campaign_proportion=(
            'streaming_trackable', lambda x: (x == 'track').sum() / len(x)
        ),

        # Cross-platform overlap metrics
        # (any_overlap_count is the group MEAN of the per-campaign overlap count)
        any_overlap_count=('cross_platform_overlap_count', 'mean'),
        overlap_days_baseline_sum=('cross_platform_overlap_baseline', 'sum'),
        overlap_days_during_sum=('cross_platform_overlap_during', 'sum'),
        overlap_days_decay_sum=('cross_platform_overlap_decay', 'sum'),
    )
    .reset_index()
)

# ═══════════════════════════════════════════════════════════
# CALCULATED METRICS - ORDER MATTERS FOR DEPENDENCIES
# ═══════════════════════════════════════════════════════════

# Window lengths (days) used as denominators for the baseline / decay ratios.
# They must match the 14-day pre-campaign and 10-day post-campaign windows
# used when the per-campaign overlap days were computed.
BASELINE_WINDOW_DAYS = 14
DECAY_WINDOW_DAYS = 10

# Guards: a group with zero total duration or zero counted campaigns would
# otherwise produce inf; report NaN instead, matching the guarded cost/rate
# metrics computed later in this cell.
has_duration = platform_objective_summary['total_campaign_duration_days'] > 0
has_campaigns = platform_objective_summary['campaign_count'] > 0

# Spend per day
platform_objective_summary['spend_per_day'] = np.where(
    has_duration,
    platform_objective_summary['total_spend'] /
    platform_objective_summary['total_campaign_duration_days'],
    np.nan
)

# Overlap ratios.
# - baseline / decay: overlap days relative to the fixed window length
#   multiplied by the number of campaigns in the group
# - during: overlap days relative to the total campaign days in the group
# Overlap days are summed over every other-platform campaign, so a ratio can
# exceed 1 when several of them cover the same days.
platform_objective_summary['overlap_ratio_baseline'] = np.where(
    has_campaigns,
    platform_objective_summary['overlap_days_baseline_sum'] /
    (platform_objective_summary['campaign_count'] * BASELINE_WINDOW_DAYS),
    np.nan
)

platform_objective_summary['overlap_ratio_during'] = np.where(
    has_duration,
    platform_objective_summary['overlap_days_during_sum'] /
    platform_objective_summary['total_campaign_duration_days'],
    np.nan
)

platform_objective_summary['overlap_ratio_decay'] = np.where(
    has_campaigns,
    platform_objective_summary['overlap_days_decay_sum'] /
    (platform_objective_summary['campaign_count'] * DECAY_WINDOW_DAYS),
    np.nan
)

# Composite marketing metrics (platform-specific logic)
def calculate_composite_metrics(row):
    """Roll one summary row's raw metrics up into three composite totals.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``platform_category``, ``impressions``, ``reach``,
        ``video_views``, ``link_clicks`` and ``conversions``, plus whichever
        engagement fields the row's platform branch reads.

    Returns
    -------
    pandas.Series
        ``total_awareness_metrics``, ``total_traffic_metrics`` and
        ``total_engagement_metrics``.
    """
    channel = row['platform_category']

    # Awareness: take the larger of impressions / reach (not both, to avoid
    # double-counting), then add video views.
    awareness = max(row['impressions'], row['reach']) + row['video_views']

    # Traffic: clicks plus conversions.
    traffic = row['link_clicks'] + row['conversions']

    # Engagement depends on the platform. Anything not matched below
    # (the original note names snapchat) is reported as 0.
    engagement = 0
    if channel in ('facebook', 'instagram'):
        engagement = row['interactions']
    elif channel in ('google', 'youtube'):
        # Interactions net of video views, floored at zero.
        engagement = max(0, row['interactions'] - row['video_views'])
    elif channel == 'tiktok':
        engagement = (
            row['comments'] + row['likes'] + row['reactions'] +
            row['shares'] + row['follows']
        )

    return pd.Series({
        'total_awareness_metrics': awareness,
        'total_traffic_metrics': traffic,
        'total_engagement_metrics': engagement
    })

# Apply composite calculations
composite_metrics = platform_objective_summary.apply(calculate_composite_metrics, axis=1)
platform_objective_summary = pd.concat([platform_objective_summary, composite_metrics], axis=1)


def _ratio_or_nan(numerator, denominator, scale=1):
    """Return ``numerator / denominator * scale`` element-wise.

    Positions where the denominator is not strictly positive (zero, negative
    or missing) yield NaN instead of inf or a meaningless rate.
    """
    return np.where(denominator > 0, numerator / denominator * scale, np.nan)


# Every derived rate is "numerator / denominator * scale". Listing them in one
# table applies the same zero-denominator guard to all of them, including the
# per-day rates and daily streams, which were previously unguarded.
# (new column, numerator column, denominator column, scale)
ratio_specs = [
    # Per-day rates for composites
    ('awareness_per_day', 'total_awareness_metrics', 'total_campaign_duration_days', 1),
    ('traffic_per_day', 'total_traffic_metrics', 'total_campaign_duration_days', 1),
    ('engagement_per_day', 'total_engagement_metrics', 'total_campaign_duration_days', 1),

    # Cost & rate metrics
    ('cpm', 'total_spend', 'impressions', 1000),
    ('cost_per_click', 'total_spend', 'link_clicks', 1),
    ('cost_per_1000_engagements', 'total_spend', 'total_engagement_metrics', 1000),
    ('engagement_rate_pct', 'total_engagement_metrics', 'impressions', 100),
    ('click_through_rate_pct', 'link_clicks', 'impressions', 100),
    ('conversion_rate_pct', 'conversions', 'link_clicks', 100),

    # Streaming metrics
    ('campaign_daily_streams', 'streams_during_campaign', 'total_campaign_duration_days', 1),
]

for new_column, numerator_column, denominator_column, scale in ratio_specs:
    platform_objective_summary[new_column] = _ratio_or_nan(
        platform_objective_summary[numerator_column],
        platform_objective_summary[denominator_column],
        scale
    )


# ═══════════════════════════════════════════════════════════
# QUALITY FLAGS - UPDATED WITH TRACKABILITY AWARENESS
# ═══════════════════════════════════════════════════════════

def confidence_tier(row):
    """Grade a summary row's sample size as A, B or C.

    A tier is granted when EITHER its campaign-count threshold OR its
    total-duration threshold is met; tiers are tried from strictest to
    loosest and the first match wins.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``campaign_count`` and ``total_campaign_duration_days``.

    Returns
    -------
    str
        ``'A_reliable'``, ``'B_indicative'`` or ``'C_exploratory'``.
    """
    # (label, minimum campaigns, minimum total campaign days)
    tiers = (
        ('A_reliable', 10, 150),
        ('B_indicative', 6, 90),
    )
    for label, min_campaigns, min_days in tiers:
        if (row['campaign_count'] >= min_campaigns
                or row['total_campaign_duration_days'] >= min_days):
            return label
    return 'C_exploratory'

def streaming_quality(row):
    """Label how far a summary row's streaming figures can be attributed.

    Checks are applied in order:

    1. Fewer than half of the campaigns are trackable -> ``'not_applicable'``
       (the original note gives albums/playlists as non-trackable products).
    2. No streams recorded during the campaigns -> ``'no_streams'``.
    3. Otherwise grade by the mean of the three overlap ratios
       (baseline, during, decay): below 0.3 -> ``'clean'``, below 0.6 ->
       ``'moderate_noise'``, anything else -> ``'high_noise'``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``trackable_campaign_proportion``,
        ``streams_during_campaign`` and the three ``overlap_ratio_*`` fields.

    Returns
    -------
    str
        One of the five labels above.
    """
    if row['trackable_campaign_proportion'] < 0.5:
        return 'not_applicable'

    if row['streams_during_campaign'] == 0:
        return 'no_streams'

    mean_overlap = (
        row['overlap_ratio_baseline']
        + row['overlap_ratio_during']
        + row['overlap_ratio_decay']
    ) / 3

    # Upper bounds are exclusive and tried in ascending order.
    for upper_bound, label in ((0.3, 'clean'), (0.6, 'moderate_noise')):
        if mean_overlap < upper_bound:
            return label
    return 'high_noise'

# Tag every summary row with its sample-size tier and its
# streaming-attribution label (one row-wise pass per flag).
quality_flags = {
    'confidence_tier': confidence_tier,
    'streaming_attribution_quality': streaming_quality,
}
for flag_column, flag_function in quality_flags.items():
    platform_objective_summary[flag_column] = platform_objective_summary.apply(flag_function, axis=1)


# ═══════════════════════════════════════════════════════════
# REORDER COLUMNS FOR READABILITY
# ═══════════════════════════════════════════════════════════

# Output columns grouped by theme; the final order is the sections in the
# order written here. Columns not listed (the raw overlap_days_*_sum totals)
# are dropped by the selection below.
column_sections = {
    'identifiers_and_sample_info': [
        'platform_category',
        'objective',
        'product_number_type',
        'campaign_count',
        'total_campaign_duration_days',
        'typical_campaign_duration',
        'confidence_tier',
        'streaming_attribution_quality',
        'trackable_campaign_proportion',
        'any_overlap_count',
    ],
    'spend': [
        'total_spend',
        'spend_per_day',
    ],
    'marketing_raw_totals': [
        'impressions',
        'reach',
        'video_views',
        'link_clicks',
        'conversions',
        'interactions',
        'comments',
        'likes',
        'reactions',
        'shares',
        'follows',
    ],
    'marketing_composites': [
        'total_awareness_metrics',
        'awareness_per_day',
        'total_traffic_metrics',
        'traffic_per_day',
        'total_engagement_metrics',
        'engagement_per_day',
    ],
    'marketing_efficiency': [
        'cpm',
        'cost_per_click',
        'cost_per_1000_engagements',
        'engagement_rate_pct',
        'click_through_rate_pct',
        'conversion_rate_pct',
    ],
    'streaming': [
        'baseline_streams_14day_pre',
        'streams_during_campaign',
        'streams_10day_post',
        'campaign_daily_streams',
    ],
    'data_quality_overlap': [
        'overlap_ratio_baseline',
        'overlap_ratio_during',
        'overlap_ratio_decay',
    ],
}

# Flatten the sections into one ordered list (dicts keep insertion order).
column_order = [
    column
    for section_columns in column_sections.values()
    for column in section_columns
]

platform_objective_summary = platform_objective_summary[column_order]

# ═══════════════════════════════════════════════════════════
# PRINT SUMMARY STATISTICS
# ═══════════════════════════════════════════════════════════

banner = "=" * 80
n_pairs = len(platform_objective_summary)

print("\n" + banner)
print("PLATFORM × OBJECTIVE SUMMARY STATISTICS")
print(banner)

print(f"\nTotal platform × objective pairs: {n_pairs}")

# Frequency table for each quality flag
for heading, flag_column in (
    ("\nConfidence tier distribution:", 'confidence_tier'),
    ("\nStreaming attribution quality distribution:", 'streaming_attribution_quality'),
):
    print(heading)
    print(platform_objective_summary[flag_column].value_counts())

# Bucket rows by the share of trackable campaigns: >= 0.8, [0.5, 0.8), < 0.5
print("\nPairs by trackability:")
trackable_share = platform_objective_summary['trackable_campaign_proportion']
highly_trackable = (trackable_share >= 0.8).sum()
moderately_trackable = ((trackable_share >= 0.5) & (trackable_share < 0.8)).sum()
low_trackable = (trackable_share < 0.5).sum()
print(f"  Highly trackable (≥80%): {highly_trackable}")
print(f"  Moderately trackable (50-79%): {moderately_trackable}")
print(f"  Low trackability (<50%): {low_trackable}")

# Save - keeping all pairs (no rows are filtered out before writing)
platform_objective_summary.to_csv('platform_objective_summary_truly_deepy_really_final.csv', index=False)
print(f"\n✓ Saved platform × objective summary: {n_pairs} pairs")
print("✓ All pairs included (filtering will happen at visualization stage)")


PLATFORM × OBJECTIVE SUMMARY STATISTICS

Total platform × objective pairs: 35

Confidence tier distribution:
confidence_tier
C_exploratory    19
A_reliable       11
B_indicative      5
Name: count, dtype: int64

Streaming attribution quality distribution:
streaming_attribution_quality
not_applicable    24
high_noise         6
clean              2
moderate_noise     2
no_streams         1
Name: count, dtype: int64

Pairs by trackability:
  Highly trackable (≥80%): 11
  Moderately trackable (50-79%): 0
  Low trackability (<50%): 24

✓ Saved platform × objective summary: 35 pairs
✓ All pairs included (filtering will happen at visualization stage)
