## Campaign-Level EDA
This notebook analyses campaign characteristics in relation to customer engagement outcomes, focusing on potential drivers like campaign type, impressions, and click-through rate.

In [3]:
# Load packages
import pandas as pd
import eda_functions as eda
from load_data import load_campaign_data
from feature_engineering import prepare_campaign_features
from data_quality import print_null_summary, print_shape_and_preview

In [4]:
# Load data: the two frames returned by load_campaign_data().
engagement_details, campaigns = load_campaign_data()

# Attach campaign attributes to every engagement row. The left join keeps all
# rows of `engagement_details`; validate="many_to_one" makes pandas raise a
# MergeError if `campaign_id` is not unique in `campaigns`, instead of silently
# duplicating engagement rows and skewing every downstream count and rate.
merged = engagement_details.merge(
    campaigns,
    on="campaign_id",
    how="left",
    validate="many_to_one",
)

### Preliminary EDA

In [5]:
# Data-quality pass on the joined frame: per-column null counts first,
# then the frame's shape and a preview of its first rows.
dataset_label = "merged"
print_null_summary(merged, dataset_label)
print_shape_and_preview(merged, dataset_label)


Null Summary for merged:
                    Null Count  Null %
duration                13186   82.41
clicks                   2657   16.61
target_audience             0    0.00
impressions                 0    0.00
campaign_language           0    0.00
roi                         0    0.00
acquisition_cost            0    0.00
conversion_rate             0    0.00
campaign_duration           0    0.00
engagement_id               0    0.00
customer_id                 0    0.00
month                       0    0.00
day                         0    0.00
has_engaged                 0    0.00
channel_used                0    0.00
campaign_id                 0    0.00
campaign_type               0    0.00

merged shape: (16000, 17)

merged preview:
   engagement_id  customer_id  campaign_id channel_used  has_engaged  day  \
0              1            1           31   Google Ads            0   12   
1              2            2           96    Telephone            0   20   
2             

In [6]:
# Missing correlation diagnosis: for each candidate grouping column, report
# how the rows with a missing "clicks" value are distributed over its levels.
# The helper prints its own report. Calling it inside a loop (rather than as
# the cell's final expression) keeps Jupyter from echoing the last call's
# return value, which would repeat the last printed table.
for group_col in ("channel_used", "campaign_type"):
    eda.check_missing_correlation(merged, "clicks", group_col)

Total missing values in "clicks": 2657

Value counts of "channel_used" where "clicks" is missing:
channel_used
Telephone    1344
Landline     1313
Name: count, dtype: int64
Total missing values in "clicks": 2657

Value counts of "campaign_type" where "clicks" is missing:
campaign_type
Telemarketing    2657
Name: count, dtype: int64


campaign_type
Telemarketing    2657
Name: count, dtype: int64

### Feature Engineering

In [7]:
# Feature engineering
# Derive the campaign-level feature table used by every later cell from the
# merged engagement/campaign frame.
# NOTE(review): prepare_campaign_features is defined in feature_engineering and
# is not visible here; how it aggregates rows and handles the missing "clicks"
# values should be confirmed against that module.
campaign_grouped = prepare_campaign_features(merged)

In [8]:
# Summary statistics for the engineered table: run the same two data-quality
# reports used on the raw merge, then describe every column (numeric and
# non-numeric alike).
for report in (print_null_summary, print_shape_and_preview):
    report(campaign_grouped, "campaign_grouped")

print(
    "\nSummary statistics:\n",
    campaign_grouped.describe(include="all"),
)


Null Summary for campaign_grouped:
                      Null Count  Null %
channel_used                  0     0.0
impressions                   0     0.0
clicks                        0     0.0
campaign_duration             0     0.0
campaign_language             0     0.0
target_audience               0     0.0
campaign_type                 0     0.0
quarter                       0     0.0
engagement_rate               0     0.0
click_through_rate            0     0.0
impressions_per_day           0     0.0
targets_per_day               0     0.0
clicks_per_day                0     0.0

campaign_grouped shape: (153, 13)

campaign_grouped preview:
  channel_used  impressions  clicks  campaign_duration campaign_language  \
0        Email      33339.0   523.0               30.0            German   
1        Email       8034.0   142.0               15.0           Spanish   
2        Email      25833.0   628.0               30.0           English   
3        Email      29051.0  1679.0  

In [9]:
# Plot distributions of campaign_grouped's numeric features.
# NOTE(review): plot_numeric_distributions lives in eda_functions and is not
# shown here; prefix="campaign" is presumably a label for plot titles or saved
# file names — confirm against the helper.
eda.plot_numeric_distributions(df=campaign_grouped, prefix="campaign")

In [10]:
# Frequency table for every column the EDA helper classifies as categorical.
# Which columns count as categorical (and are therefore left out of the
# numeric analyses) is decided by eda.get_categorical_columns.
categorical_cols = eda.get_categorical_columns(campaign_grouped)
for column_name in categorical_cols:
    level_counts = campaign_grouped[column_name].value_counts()
    print(f"\nValue counts for {column_name}:\n{level_counts}")


Value counts for target_audience:
target_audience
25-34    50
45-54    45
35-44    35
55+      13
18-24    10
Name: count, dtype: int64

Value counts for campaign_language:
campaign_language
German      48
English     29
Mandarin    29
Spanish     28
French      19
Name: count, dtype: int64

Value counts for quarter:
quarter
Q1    46
Q3    37
Q2    36
Q4    34
Name: count, dtype: int64

Value counts for channel_used:
channel_used
Instagram     38
TikTok        38
Email         18
Landline      16
Telephone     16
Google Ads    14
Website       13
Name: count, dtype: int64

Value counts for campaign_type:
campaign_type
Affiliate Marketing           48
Display Advertising           46
Telemarketing                 32
Email Marketing               18
Search Engine Optimization     9
Name: count, dtype: int64

Value counts for campaign_duration:
campaign_duration
15.0    46
30.0    33
45.0    29
60.0    29
75.0    16
Name: count, dtype: int64


### Relationship Analysis

In [11]:
# 1. Violin plots for numeric features by engagement rate bin.
# The column handed to the helper as the target is "engagement_rate".
# NOTE(review): no bin column has been added to campaign_grouped at this point
# in the notebook, so the binning presumably happens inside the helper —
# confirm its bin edges against eda_functions.
eda.get_violin_plots_by_engagement_bin(campaign_grouped, target_col="engagement_rate")

In [12]:
# 2. Correlation matrix: pairwise Pearson correlations (the pandas default,
# spelled out here) over the columns the EDA helper classifies as numeric.
numeric_cols = eda.get_numerical_columns(campaign_grouped)
print(
    "\nCorrelation Matrix:\n",
    campaign_grouped[numeric_cols].corr(method="pearson"),
)


Correlation Matrix:
                      click_through_rate    clicks  clicks_per_day  \
click_through_rate             1.000000  0.717793        0.632650   
clicks                         0.717793  1.000000        0.794442   
clicks_per_day                 0.632650  0.794442        1.000000   
engagement_rate                0.641354  0.341954        0.337416   
impressions                    0.045859  0.563502        0.418849   
impressions_per_day            0.057241  0.365766        0.626020   
targets_per_day               -0.014185  0.054462        0.392282   

                     engagement_rate  impressions  impressions_per_day  \
click_through_rate          0.641354     0.045859             0.057241   
clicks                      0.341954     0.563502             0.365766   
clicks_per_day              0.337416     0.418849             0.626020   
engagement_rate             1.000000    -0.096964            -0.003981   
impressions                -0.096964     1.000000      

In [13]:
# 3. Bar plots for engagement rate by categorical features.
# The column handed to the helper as the target is "engagement_rate".
# NOTE(review): get_barplot is defined in eda_functions and not shown here;
# which categorical columns it iterates over and how it aggregates the target
# (mean, median, ...) should be confirmed against that module.
eda.get_barplot(campaign_grouped, target_col="engagement_rate")

In [14]:
# 4. Chi-square test for categorical features vs. binned engagement rate.
# Split engagement_rate into three quantile-based bins (tertiles) and store
# the labels on the frame as a new "engagement_bin" column.
campaign_grouped["engagement_bin"] = pd.qcut(
    campaign_grouped["engagement_rate"],
    q=3,
    labels=["Low", "Medium", "High"],
)

# Run the helper's chi-square test with the bin column as the target, then
# print the resulting table.
chi2_results = eda.get_chi_square(campaign_grouped, "engagement_bin")
print("\nChi-Square Test Results:\n", chi2_results)


Chi-Square Test Results:
              Feature  Chi-Square       P-Value
3       channel_used  196.566802  1.666145e-35
4      campaign_type  138.255435  5.470147e-26
5  campaign_duration   13.966556  8.264128e-02
0    target_audience   12.138168  1.451448e-01
1  campaign_language    8.357791  3.993229e-01
2            quarter    4.410290  6.213345e-01
