## Campaign-Level EDA
This notebook analyses campaign characteristics in relation to customer engagement outcomes, focusing on potential drivers like campaign type, impressions, and click-through rate.

In [19]:
# Load packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import importlib
import eda_functions
importlib.reload(eda_functions)
import eda_functions as eda

In [20]:
# Load datasets
engagement_details = pd.read_csv("../data/processed/engagement_details.csv")
campaigns = pd.read_csv("../data/processed/campaigns.csv")

In [21]:
# Merge and check nulls
merged = engagement_details.merge(campaigns, on='campaign_id', how='left')

print(f"Null counts per column: \n {merged.isnull().sum()}")
print(f"\nShape of digital_usage: {merged.shape}")
print(f"\nFirst few rows of digital_usage:\n{merged.head()}")

Null counts per column: 
 engagement_id            0
customer_id              0
campaign_id              0
channel_used             0
has_engaged              0
day                      0
month                    0
duration             13186
campaign_type            0
target_audience          0
campaign_duration        0
conversion_rate          0
acquisition_cost         0
roi                      0
campaign_language        0
impressions              0
clicks                2657
dtype: int64

Shape of digital_usage: (16000, 17)

First few rows of digital_usage:
   engagement_id  customer_id  campaign_id channel_used  has_engaged  day  \
0              1            1           31   Google Ads            0   12   
1              2            2           96    Telephone            0   20   
2              3            3           81      Website            0    9   
3              4         1004           11        Email            1   12   
4              5            4            7    

In [22]:
clicks_missing_by_channel = eda.check_missing_correlation(merged, "clicks", "channel_used")
clicks_missing_by_type = eda.check_missing_correlation(merged, "clicks", "campaign_type")

Total missing values in "clicks": 2657

Value counts of "channel_used" where "clicks" is missing:
channel_used
Telephone    1344
Landline     1313
Name: count, dtype: int64
Total missing values in "clicks": 2657

Value counts of "campaign_type" where "clicks" is missing:
campaign_type
Telemarketing    2657
Name: count, dtype: int64


In [23]:
# Drop duration and impute missing clicks with 0
merged = merged.drop(columns="duration")
merged["clicks"] = merged["clicks"].fillna(0)

In [24]:
# Feature engineering

# Map month to quarter
month_to_quarter = {
    "January": "Q1", "February": "Q1", "March": "Q1",
    "April": "Q2", "May": "Q2", "June": "Q2",
    "July": "Q3", "August": "Q3", "September": "Q3",
    "October": "Q4", "November": "Q4", "December": "Q4"
}
merged["quarter"] = merged["month"].map(month_to_quarter)

# Aggregate to campaign_id level
campaign_grouped = (
    merged.groupby(["campaign_id", "channel_used"]).agg(
        num_targeted=("engagement_id", "count"),
        num_engaged=("has_engaged", "sum"),
        impressions=("impressions", "mean"),
        clicks=("clicks", "mean"),
        campaign_duration=("campaign_duration", "mean"),
        campaign_language=("campaign_language", "first"),
        target_audience=("target_audience", "first"),
        campaign_type=("campaign_type", "first"),
        quarter=("quarter", "first")
    ).reset_index()
)
campaign_grouped = campaign_grouped.drop(columns="campaign_id")

campaign_grouped["engagement_rate"] = ( campaign_grouped["num_engaged"] / campaign_grouped["num_targeted"])
campaign_grouped["click_through_rate"] = ( campaign_grouped["clicks"] / campaign_grouped["impressions"])
campaign_grouped["impressions_per_day"] = ( campaign_grouped["impressions"] / campaign_grouped["campaign_duration"])
campaign_grouped["targets_per_day"] = ( campaign_grouped["num_targeted"] / campaign_grouped["campaign_duration"])
campaign_grouped["clicks_per_day"] = ( campaign_grouped["clicks"] / campaign_grouped["campaign_duration"])

In [25]:
# Summary statistics
campaign_grouped.describe(include='all')

Unnamed: 0,channel_used,num_targeted,num_engaged,impressions,clicks,campaign_duration,campaign_language,target_audience,campaign_type,quarter,engagement_rate,click_through_rate,impressions_per_day,targets_per_day,clicks_per_day
count,153,153.0,153.0,153.0,153.0,153.0,153,153,153,153,153.0,153.0,153.0,153.0,153.0
unique,7,,,,,,5,5,5,4,,,,,
top,Instagram,,,,,,German,25-34,Display Advertising,Q1,,,,,
freq,38,,,,,,35,47,57,46,,,,,
mean,,104.575163,22.627451,26545.189542,2030.437908,38.72549,,,,,0.239039,0.075041,888.882803,3.701583,70.000436
std,,39.797463,12.036066,15136.301885,2252.686682,20.475671,,,,,0.151609,0.059114,689.300048,2.681547,83.262391
min,,53.0,0.0,1738.0,0.0,15.0,,,,,0.0,0.0,28.966667,0.983333,0.0
25%,,76.0,15.0,12259.0,288.0,15.0,,,,,0.1,0.034894,363.683333,1.755556,7.173333
50%,,86.0,26.0,27352.0,1606.0,30.0,,,,,0.24,0.068089,686.916667,2.7,39.177778
75%,,151.0,32.0,40678.0,2934.0,60.0,,,,,0.375,0.114911,1276.2,5.066667,108.333333


In [26]:
eda.plot_numeric_distributions(df=campaign_grouped, prefix="campaign")

In [27]:
# Value counts for categorical features

# Get object (str) columns
categorical_cols = campaign_grouped.select_dtypes(include="object").columns.tolist()
# Add numeric columns with <=5 unique values (treat as categorical)
categorical_cols += [col for col in campaign_grouped.columns
                     if campaign_grouped[col].dropna().nunique() <= 5 and
                     campaign_grouped[col].dtype in ["int64", "float64"] and col != "engagement_rate"]
for col in categorical_cols:
    print(f"\nValue counts for {col}:")
    print(campaign_grouped[col].value_counts())


Value counts for channel_used:
channel_used
Instagram     38
TikTok        38
Email         18
Landline      16
Telephone     16
Google Ads    14
Website       13
Name: count, dtype: int64

Value counts for campaign_language:
campaign_language
German      35
French      33
Mandarin    31
Spanish     27
English     27
Name: count, dtype: int64

Value counts for target_audience:
target_audience
25-34    47
35-44    44
45-54    43
18-24    14
55+       5
Name: count, dtype: int64

Value counts for campaign_type:
campaign_type
Display Advertising           57
Affiliate Marketing           38
Telemarketing                 32
Email Marketing               18
Search Engine Optimization     8
Name: count, dtype: int64

Value counts for quarter:
quarter
Q1    46
Q3    37
Q2    36
Q4    34
Name: count, dtype: int64

Value counts for campaign_duration:
campaign_duration
15.0    46
30.0    33
45.0    29
60.0    29
75.0    16
Name: count, dtype: int64


In [28]:
# 1. Violin plots for numeric features by engagement rate bin
eda.get_violin_plots_by_engagement_bin(campaign_grouped, target_col="engagement_rate")

In [29]:
# 2. Correlation matrix
numeric_cols = campaign_grouped.select_dtypes(include=["number"]).columns.difference(categorical_cols)

correlation_matrix = campaign_grouped[numeric_cols].corr()
correlation_matrix

Unnamed: 0,click_through_rate,clicks,clicks_per_day,engagement_rate,impressions,impressions_per_day,num_engaged,num_targeted,targets_per_day
click_through_rate,1.0,0.770141,0.639919,0.656238,0.043271,0.081468,0.545594,-0.153988,-0.023028
clicks,0.770141,1.0,0.789952,0.513848,0.532693,0.378584,0.413986,-0.15384,-0.086742
clicks_per_day,0.639919,0.789952,1.0,0.481958,0.423834,0.712664,0.425154,-0.098734,0.287758
engagement_rate,0.656238,0.513848,0.481958,1.0,0.053344,0.171152,0.844198,-0.395389,-0.106209
impressions,0.043271,0.532693,0.423834,0.053344,1.0,0.583366,0.058273,0.00045,-0.12465
impressions_per_day,0.081468,0.378584,0.712664,0.171152,0.583366,1.0,0.206577,0.010195,0.510023
num_engaged,0.545594,0.413986,0.425154,0.844198,0.058273,0.206577,1.0,0.118417,0.160052
num_targeted,-0.153988,-0.15384,-0.098734,-0.395389,0.00045,0.010195,0.118417,1.0,0.466751
targets_per_day,-0.023028,-0.086742,0.287758,-0.106209,-0.12465,0.510023,0.160052,0.466751,1.0


In [30]:
# 3. Bar plots for engagement rate by categorical features
eda.get_barplot(campaign_grouped, target_col="engagement_rate")

In [31]:
# 4. Chi-square test for categorical features vs. binned engagement rate
campaign_grouped["engagement_bin"] = pd.qcut(campaign_grouped["engagement_rate"], q=3, labels=["Low", "Medium", "High"])
chi2_results = eda.get_chi_square(campaign_grouped, categorical_cols, "engagement_bin")

print("\nChi-Square Test Results:\n", chi2_results)

TypeError: get_chi_square() takes 2 positional arguments but 3 were given