In [1]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from datetime import datetime, timedelta
import gzip

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

In [2]:
# Load marketing data
marketing = pd.read_csv('marketing_pseudonymized_new.csv')
print(f"Marketing data: {marketing.shape}")
print(f"Date range: {marketing['campaign_start_date'].min()} to {marketing['campaign_end_date'].max()}")
marketing.head(3)

Marketing data: (13835, 47)
Date range: 1/10/2023 to 8/18/2023


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,supplier,objective,campaign_start_date,campaign_end_date,campaign_id,campaign_status,campaign_budget,adset_status,adset_budget,adset_start_date,adset_end_date,ad_status,project_no,artist_name,product,format,platform_manual,product_number_type,audience_type,content_type,bidding_strategy,budget_period,marketing_report_date,placement_category,platform_category,platform,placement,impressions,reach,spend,video_views,comments,interactions,likes,link_clicks,reactions,shares,follows,saves,conversions,all_conversions,cpc,product_type_on_spotify,canonical_artist,canonical_product
0,0,0,Meta,traffic,3/28/2023,4/6/2023,157858920231,inactive,562.322,active,281.161,3/28/2023,4/6/2023,active,432191283317,Senorita_Dido,,,Meta,URL,Interests,Music_Video,,,4/11/2023,feed,instagram,instagram,feed,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0.0,no_spotify_link,Senorita_Dido,True_Face
1,1,1,Meta,traffic,2/14/2023,3/9/2023,157858920231,inactive,899.7152,active,899.7152,2/14/2023,3/9/2023,active,791586853774,Log_Lady,Got_a_Light_Again,,"Instagram,Instagram",Album URI,Interests,Music_Video,,,3/4/2023,feed,instagram,instagram,instagram_reels,1206,1206,7.12708,13,0,136,0,13,7,0,0,0,0,0,0.487476,album,Log_Lady,Got_a_Light_Again
2,2,2,Snapchat,traffic,3/17/2023,4/4/2023,732927318296,inactive,562.322,active,562.322,3/17/2023,4/4/2023,active,663102993334,Gordon_Cole,Rita_Appears,,Snapchat,Track URI,Interests,Other,,,4/16/2023,feed,snapchat,Snapchat,Feed,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0.0,track,Gordon_Cole,Rita_Appears


In [3]:
# Load streaming data (gzipped)
streaming = pd.read_csv('streaming_pseudonymized_new.csv.gz', compression='gzip')
print(f"Streaming data: {streaming.shape}")
print(f"Date range: {streaming['report_date'].min()} to {streaming['report_date'].max()}")
print(f"\nSources available: {streaming['source_name'].unique()}")
streaming.head(3)

Streaming data: (614005, 10)
Date range: 1/1/2023 to 3/9/2023

Sources available: ['Album' 'Collection' 'Others Playlist' 'Radio' 'Other' 'Search'
 'Play_Queue' 'Artist' 'Chart']


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,report_date,source_name,project_no,artist_name,product_name,streams,canonical_artist,canonical_product
0,0,0,1/25/2023,Album,318733585791,Nadine_Hurley,Red_Bathrobe,975,Nadine_Hurley,Red_Bathrobe
1,1,1,1/26/2023,Collection,616610447432,Annie_Blackburn,No_Hope,1436,Annie_Blackburn,No_Hope
2,2,2,1/26/2023,Others Playlist,963210974654,Tom_Beaumont,Inner_Vision,28,Tom_Beaumont,Inner_Vision


In [4]:
# Parse dates in marketing
date_cols_marketing = ['campaign_start_date', 'campaign_end_date', 'adset_start_date', 
                       'adset_end_date', 'marketing_report_date']
for col in date_cols_marketing:
    if col in marketing.columns:
        marketing[col] = pd.to_datetime(marketing[col], errors='coerce')

# Parse dates in streaming
streaming['report_date'] = pd.to_datetime(streaming['report_date'], errors='coerce')

print("Date parsing complete")

Date parsing complete


In [5]:
# Load existing campaign summary to get baseline periods
campaign_summary = pd.read_csv('final_campaign_summary_paid_media.csv')

# Parse dates
date_cols_summary = ['campaign_start_date', 'campaign_end_date', 'baseline_start', 
                     'baseline_end', 'during_start', 'during_end', 'decay_start', 'decay_end']
for col in date_cols_summary:
    if col in campaign_summary.columns:
        campaign_summary[col] = pd.to_datetime(campaign_summary[col], errors='coerce')

print(f"Campaign summary: {campaign_summary.shape}")
campaign_summary.head(3)

Campaign summary: (271, 42)


Unnamed: 0,campaign_id,platform_category,canonical_artist,canonical_product,objective,project_no,product_number_type,campaign_start_date,spend,impressions,reach,video_views,link_clicks,interactions,comments,likes,reactions,shares,follows,conversions,campaign_end_date,campaign_duration_days,total_awareness_metrics,total_traffic_metrics,total_engagement_metrics,streams_during_campaign,streams_10day_post,baseline_streams_14day_pre,streaming_trackable,baseline_start,baseline_end,during_start,during_end,decay_start,decay_end,overlap_days_baseline,overlap_days_during,overlap_days_decay,overlap_ratio_baseline,overlap_ratio_during,overlap_ratio_decay,any_overlap_count
0,8600344923,youtube,Feyd_Rautha,The_World_Spings,video_views,388191401155,URL,2023-02-12,112.536504,9245,830,5272,22,7251,0,0,0,0,0,0,2023-03-02,19,14517,22,1979,705076.0,275622.0,31802.071429,non_track,2023-01-29,2023-02-11,2023-02-12,2023-03-02,2023-03-03,2023-03-12,0.0,19.0,10.0,0.0,1.0,1.0,1
1,15921939106,youtube,Jeffrey_Beaumont,Moment_of_Truth,video_views,379017089087,Album URI,2023-01-13,280.893006,25456,4179,13845,43,19832,0,0,0,0,0,0,2023-01-19,7,39301,43,5987,173258.0,167647.0,10199.0,non_track,2022-12-30,2023-01-12,2023-01-13,2023-01-19,2023-01-20,2023-01-29,7.0,20.0,2.0,0.5,2.857143,0.2,3
2,24210616442,snapchat,Gurney_Halleck,Packards_Vibration,traffic,114531372146,Playlist URI,2023-01-09,635.525078,134633,23058,7940,1188,0,0,0,0,0,0,0,2023-02-09,32,142573,1188,0,0.0,0.0,0.0,non_track,2022-12-26,2023-01-08,2023-01-09,2023-02-09,2023-02-10,2023-02-19,0.0,101.0,34.0,0.0,3.15625,3.4,4


In [6]:
# Aggregate to daily level by artist, product, source
daily_streams = (
    streaming
    .groupby(['report_date', 'canonical_artist', 'canonical_product', 'source_name'], as_index=False)
    ['streams']
    .sum()
)

print(f"Daily timeseries created: {daily_streams.shape}")
print(f"Date range: {daily_streams['report_date'].min()} to {daily_streams['report_date'].max()}")
print(f"\nUnique artist-product combinations: {daily_streams[['canonical_artist', 'canonical_product']].drop_duplicates().shape[0]}")
daily_streams.head(10)

Daily timeseries created: (473127, 5)
Date range: 2023-01-01 00:00:00 to 2023-03-30 00:00:00

Unique artist-product combinations: 918


Unnamed: 0,report_date,canonical_artist,canonical_product,source_name,streams
0,2023-01-01,Albert_Rosenfield,Creaking_Stairs,Album,40
1,2023-01-01,Albert_Rosenfield,Creaking_Stairs,Artist,0
2,2023-01-01,Albert_Rosenfield,Creaking_Stairs,Collection,192
3,2023-01-01,Albert_Rosenfield,Creaking_Stairs,Other,4
4,2023-01-01,Albert_Rosenfield,Creaking_Stairs,Others Playlist,882
5,2023-01-01,Albert_Rosenfield,Creaking_Stairs,Play_Queue,83
6,2023-01-01,Albert_Rosenfield,Creaking_Stairs,Radio,53
7,2023-01-01,Albert_Rosenfield,Creaking_Stairs,Search,9
8,2023-01-01,Albert_Rosenfield,Ghostwood_Development,Collection,22
9,2023-01-01,Albert_Rosenfield,Ghostwood_Development,Other,0


In [8]:
# Calculate aggregate metrics - RAW VOLUME
total_baseline = campaign_summary['baseline_streams_14day_pre'].sum()
total_during = campaign_summary['streams_during_campaign'].sum()
total_decay = campaign_summary['streams_10day_post'].sum()
total_baseline_volume = total_baseline * 14  # baseline_streams_14day_pre is daily average

print("=== AGGREGATE Q1 2023 STREAMING PERFORMANCE ===")
print(f"\nTotal streams across all phases:")
print(f"  Baseline (14 days): {total_baseline_volume:,.0f} streams")
print(f"  During campaigns: {total_during:,.0f} streams")
print(f"  Decay phase (10 days): {total_decay:,.0f} streams")

if total_baseline_volume > 0:
    during_lift = ((total_during - total_baseline_volume) / total_baseline_volume * 100)
    decay_lift = ((total_decay - total_baseline_volume) / total_baseline_volume * 100)
    print(f"\nLift vs baseline:")
    print(f"  During campaigns: {during_lift:+.1f}%")
    print(f"  Decay phase: {decay_lift:+.1f}%")

# Save daily timeseries
daily_streams.to_csv('daily_streams_by_source_newer.csv', index=False)
print(f"\nSaved daily timeseries: {daily_streams.shape}")

=== AGGREGATE Q1 2023 STREAMING PERFORMANCE ===

Total streams across all phases:
  Baseline (14 days): 101,708,760 streams
  During campaigns: 190,464,151 streams
  Decay phase (10 days): 72,991,694 streams

Lift vs baseline:
  During campaigns: +87.3%
  Decay phase: -28.2%

Saved daily timeseries: (473127, 5)
