## Campaign-Level EDA
This notebook analyses campaign characteristics in relation to customer engagement outcomes, focusing on potential drivers like campaign type, impressions, and click-through rate.

In [None]:
# Load packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import importlib
import eda_functions
importlib.reload(eda_functions)
import eda_functions as eda

In [None]:
# Read the two processed CSV tables used in this notebook: campaign
# attributes and the engagement records that carry a campaign_id.
campaigns = pd.read_csv("../data/processed/campaigns.csv")
engagement_details = pd.read_csv("../data/processed/engagement_details.csv")

In [None]:
# Merge and check nulls
# Left join: every engagement row is kept, and campaign columns are NaN
# where `campaigns` has no matching campaign_id.
merged = engagement_details.merge(campaigns, on='campaign_id', how='left')

# Sanity checks on the joined frame; the labels name the frame being printed.
print(f"Null counts per column: \n {merged.isnull().sum()}")
print(f"\nShape of merged: {merged.shape}")
print(f"\nFirst few rows of merged:\n{merged.head()}")

In [None]:
# Run the missing-value check for `clicks` against two grouping columns,
# channel first and campaign type second.
# NOTE(review): what eda.check_missing_correlation returns or prints is not
# visible in this notebook; confirm against eda_functions.
clicks_missing_by_channel, clicks_missing_by_type = (
    eda.check_missing_correlation(merged, "clicks", group_col)
    for group_col in ("channel_used", "campaign_type")
)

In [None]:
# Remove the `duration` column, then treat a missing click count as zero clicks.
merged = merged.drop(columns="duration").assign(
    clicks=lambda frame: frame["clicks"].fillna(0)
)

In [None]:
# Feature engineering

# Map month to quarter. Month values not spelled exactly like the keys
# below are mapped to NaN by Series.map.
month_to_quarter = {
    "January": "Q1", "February": "Q1", "March": "Q1",
    "April": "Q2", "May": "Q2", "June": "Q2",
    "July": "Q3", "August": "Q3", "September": "Q3",
    "October": "Q4", "November": "Q4", "December": "Q4"
}
merged["quarter"] = merged["month"].map(month_to_quarter)


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator with NaN wherever the denominator is 0.

    Plain Series division gives +/-inf for a non-zero value divided by zero,
    and a single inf turns the column mean into inf and its correlations into
    NaN. Masking zero denominators to NaN keeps those rows out of the
    statistics instead.
    """
    return numerator / denominator.where(denominator != 0)


# Aggregate to one row per (campaign_id, channel_used) pair.
campaign_grouped = (
    merged.groupby(["campaign_id", "channel_used"]).agg(
        # Number of non-null engagement records in the group.
        num_targeted=("engagement_id", "count"),
        # Sum of has_engaged; presumably a 0/1 or boolean flag - TODO confirm.
        num_engaged=("has_engaged", "sum"),
        impressions=("impressions", "mean"),
        clicks=("clicks", "mean"),
        campaign_duration=("campaign_duration", "mean"),
        # For the descriptive columns, keep the first non-null value per group.
        campaign_language=("campaign_language", "first"),
        target_audience=("target_audience", "first"),
        campaign_type=("campaign_type", "first"),
        quarter=("quarter", "first")
    ).reset_index()
)
# The identifier is only needed as a grouping key, not as a feature.
campaign_grouped = campaign_grouped.drop(columns="campaign_id")

# Derived rates; zero denominators yield NaN rather than inf.
campaign_grouped["engagement_rate"] = _safe_ratio(
    campaign_grouped["num_engaged"], campaign_grouped["num_targeted"]
)
campaign_grouped["click_through_rate"] = _safe_ratio(
    campaign_grouped["clicks"], campaign_grouped["impressions"]
)
campaign_grouped["impressions_per_day"] = _safe_ratio(
    campaign_grouped["impressions"], campaign_grouped["campaign_duration"]
)
campaign_grouped["targets_per_day"] = _safe_ratio(
    campaign_grouped["num_targeted"], campaign_grouped["campaign_duration"]
)
campaign_grouped["clicks_per_day"] = _safe_ratio(
    campaign_grouped["clicks"], campaign_grouped["campaign_duration"]
)

In [None]:
# Summarise every column of the aggregated frame, numeric and non-numeric alike
campaign_grouped.describe(include="all")

In [None]:
# Histogram with a KDE overlay for each numeric feature, one figure per feature.
# numeric_cols is reused by later cells, so its contents and order are kept.
numeric_cols = [
    "engagement_rate",
    "click_through_rate",
    "campaign_duration",
    "impressions",
    "impressions_per_day",
    "targets_per_day",
    "clicks",
    "clicks_per_day",
]

for feature in numeric_cols:
    plt.figure(figsize=(6, 4))
    feature_values = campaign_grouped[feature]
    sns.histplot(feature_values, bins=20, kde=True)
    plt.title(f"Distribution of {feature}")
    plt.tight_layout()
    plt.show()

In [None]:
# Print the frequency of each level for every categorical feature.
# categorical_cols is reused by later cells, so its contents and order are kept.
categorical_cols = [
    "campaign_language",
    "target_audience",
    "channel_used",
    "campaign_type",
    "quarter",
]
for feature in categorical_cols:
    level_counts = campaign_grouped[feature].value_counts()
    print(f"\nValue counts for {feature}:", level_counts, sep="\n")

In [None]:
# 1. Violin plots for numeric features by engagement rate bin
# Split the rows into three equal-frequency bins of engagement_rate.
campaign_grouped["engagement_bin"] = pd.qcut(campaign_grouped["engagement_rate"], q=3, labels=["Low", "Medium", "High"])

# Exclude the binned variable by name rather than by position, so that
# reordering numeric_cols cannot plot engagement_rate against its own bins.
features_to_plot = [col for col in numeric_cols if col != "engagement_rate"]
for col in features_to_plot:
    plt.figure(figsize=(7, 4))
    sns.violinplot(data=campaign_grouped, x="engagement_bin", y=col, inner="quartile")
    plt.title(f"{col} by Engagement Rate Bin")
    plt.xlabel("Engagement Rate Category")
    plt.ylabel(col)
    plt.tight_layout()
    plt.show()

# The bin column is only a plotting aid; remove it so later cells see the
# original set of columns.
campaign_grouped.drop(columns="engagement_bin", inplace=True)

In [None]:
# 2. Pairwise correlations between the numeric campaign features
correlation_matrix = campaign_grouped.loc[:, numeric_cols].corr()
correlation_matrix

In [None]:
# 3. One bar plot of engagement rate per categorical feature.
# NOTE(review): eda.get_barplot is defined in eda_functions; what it
# aggregates and draws is not visible here.
for feature in categorical_cols:
    eda.get_barplot(campaign_grouped, target_col="engagement_rate", cat_col=feature)

In [None]:
# 4. Chi-square test for categorical features vs. binned engagement rate
# Rebuild the three-level engagement bin, since the violin-plot cell
# dropped it after plotting.
campaign_grouped["engagement_bin"] = pd.qcut(
    campaign_grouped["engagement_rate"],
    q=3,
    labels=["Low", "Medium", "High"],
)

# NOTE(review): the structure of the result returned by eda.get_chi_square
# is not visible here; confirm against eda_functions.
chi2_results = eda.get_chi_square(
    campaign_grouped,
    categorical_cols,
    "engagement_bin",
)

print("\nChi-Square Test Results:\n", chi2_results)