# Notebook 01: Data Exploration and Economic Context

**Goal:** Understand data structure, identify temporal patterns, and validate natural experiment setup

---


## Setup

In [3]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
os.chdir(project_root)

from src.utils import reconstruct_contact_date, plot_time_series

# Plotly settings
import plotly.io as pio
pio.templates.default = "plotly_white"

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

## 1. Load and Prepare Data

In [6]:
print(os.getcwd())

/Users/tomasz.solis/repos/private-projects/marketing-campaign-causal-impact


In [7]:
# Load raw data
df = pd.read_csv('data/raw/bank-additional/bank-additional-full.csv', sep=';')

print(f"Dataset shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")

# Display first few rows
df.head()

Dataset shape: (41188, 21)

Columns: ['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y']


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,261,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,149,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,226,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,151,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,307,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [8]:
# Reconstruct contact dates (dataset lacks explicit year information)
df = reconstruct_contact_date(df, additional_info=True)

# Create binary outcome
df['y_binary'] = (df['y'] == 'yes').astype(int)

print(f"Date range: {df['contact_date'].min()} to {df['contact_date'].max()}")
print(f"Overall subscription rate: {df['y_binary'].mean():.1%}")

Date range: 2008-05-01 00:00:00 to 2010-11-01 00:00:00
Overall subscription rate: 11.3%


## 2. Time Series Exploration


### 2.1 Contact Volume Over Time


In [24]:
# Aggregate daily contact volume
daily_contacts = df.groupby('contact_date').size().reset_index(name='count')

fig = plot_time_series(
    daily_contacts,
    date_col='contact_date',
    value_col='count',
    title='Daily Contact Volume: Natural Experiment Visible',
    yaxis_title='Number of Contacts'
)

# Annotate key periods
fig.add_annotation(
    x="2008-07-01", y=7000,
    text="Wave 1:<br>Pre-Crisis",
    showarrow=True,
    arrowhead=2,
    bgcolor="lightgreen",
    opacity=0.8
)

fig.add_annotation(
    x="2009-01-01", y=4000,
    text="Campaign Pause:<br>Financial Crisis",
    showarrow=True,
    arrowhead=2,
    bgcolor="salmon",
    opacity=0.8
)

fig.add_annotation(
    x="2009-05-01", y=6000,
    text="Wave 2:<br>Crisis Recovery",
    showarrow=True,
    arrowhead=2,
    bgcolor="lightblue",
    opacity=0.8
)

fig.show()

**Observation:** Clear campaign waves with 8-month gap (Sep 2008 - Mar 2009) coinciding with financial crisis.

### 2.2 Economic Indicators Over Time


In [26]:
# Prepare economic indicator time series
econ_df = df.groupby('contact_date').agg({
    'emp.var.rate': 'mean',
    'euribor3m': 'mean',
    'cons.conf.idx': 'mean',
    'nr.employed': 'mean'
}).reset_index()

# Create subplot for all economic indicators
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        'Employment Variation Rate',
        'Euribor 3-Month Rate',
        'Consumer Confidence Index',
        'Number Employed (thousands)'
    )
)

# Plot 1: Employment variation rate
fig.add_trace(
    go.Scatter(x=econ_df['contact_date'], y=econ_df['emp.var.rate'],
               mode='lines', name='emp.var.rate', line=dict(color='#EF553B')),
    row=1, col=1
)

# Plot 2: Euribor
fig.add_trace(
    go.Scatter(x=econ_df['contact_date'], y=econ_df['euribor3m'],
               mode='lines', name='euribor3m', line=dict(color='#00CC96')),
    row=1, col=2
)

# Plot 3: Consumer confidence
fig.add_trace(
    go.Scatter(x=econ_df['contact_date'], y=econ_df['cons.conf.idx'],
               mode='lines', name='cons.conf.idx', line=dict(color='#AB63FA')),
    row=2, col=1
)

# Plot 4: Employment level
fig.add_trace(
    go.Scatter(x=econ_df['contact_date'], y=econ_df['nr.employed'],
               mode='lines', name='nr.employed', line=dict(color='#FFA15A')),
    row=2, col=2
)

# Add crisis shading to all subplots
for row in [1, 2]:
    for col in [1, 2]:
        fig.add_vrect(
            x0="2008-09-01", x1="2009-03-31",
            fillcolor="red", opacity=0.1,
            layer="below", line_width=0,
            row=row, col=col
        )

fig.update_layout(
    height=600,
    title_text="Economic Indicators: Dramatic Divergence During Crisis",
    showlegend=False
)

fig.show()

**Key Finding:** Massive economic shock between campaign waves:
- **emp.var.rate**: +1.3% → -1.9% (3.2pp swing)
- **euribor3m**: 4.9% → 1.3% (73% decline)
- **cons.conf.idx**: -38.9 → -45.8 (deterioration)
- **nr.employed**: 5,211k → 5,097k (114k jobs lost)


## 3. Subscription Rate Patterns


### 3.1 Subscription Rate Over Time


In [27]:
# Monthly subscription rates
monthly_sub = df.groupby(df['contact_date'].dt.to_period('M')).agg({
    'y_binary': ['mean', 'count', 'sum']
}).reset_index()

monthly_sub.columns = ['month', 'sub_rate', 'n_contacts', 'n_subscribed']
monthly_sub['month'] = monthly_sub['month'].dt.to_timestamp()

fig = go.Figure()

# Add subscription rate line
fig.add_trace(go.Scatter(
    x=monthly_sub['month'],
    y=monthly_sub['sub_rate'] * 100,
    mode='lines+markers',
    name='Subscription Rate',
    line=dict(color='#636EFA', width=2),
    marker=dict(size=8)
))

# Add crisis period
fig.add_vrect(
    x0="2008-09-01", x1="2009-03-31",
    fillcolor="red", opacity=0.1,
    layer="below", line_width=0,
    annotation_text="Financial Crisis",
    annotation_position="top left"
)

fig.update_layout(
    title='Monthly Subscription Rate: Spike During Crisis Recovery',
    xaxis_title='Month',
    yaxis_title='Subscription Rate (%)',
    hovermode='x unified',
    height=400
)

fig.show()

**Observation:** Subscription rates increase dramatically in 2009 despite (or because of) economic crisis.

### 3.2 Contact Intensity Distribution


In [28]:
# Campaign (contact intensity per customer)
fig = px.histogram(
    df,
    x='campaign',
    color='y',
    barmode='stack',
    nbins=30,
    title='Contact Intensity Distribution by Outcome',
    labels={'campaign': 'Number of Contacts in Campaign', 'count': 'Frequency'}
)

fig.update_layout(height=400)
fig.show()

## 4. Data Quality Checks


### 4.1 Missing Values


In [29]:
# Check missing values
missing = df.isnull().sum()
missing = missing[missing > 0].sort_values(ascending=False)

if len(missing) > 0:
    print("Missing values found:")
    print(missing)
else:
    print("✅ No missing values")

✅ No missing values


### 4.2 Variable Distributions


In [30]:
# Key demographic distributions
demo_vars = ['age', 'job', 'marital', 'education']

fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=[v.capitalize() for v in demo_vars]
)

row_col = [(1,1), (1,2), (2,1), (2,2)]

for var, (row, col) in zip(demo_vars, row_col):
    value_counts = df[var].value_counts().head(10)
    
    fig.add_trace(
        go.Bar(x=value_counts.index, y=value_counts.values, name=var),
        row=row, col=col
    )

fig.update_layout(
    height=600,
    title_text="Customer Demographics Distribution",
    showlegend=False
)

fig.show()

## 5. Define Campaign Waves


In [31]:
# Define wave periods based on contact volume plot
wave_1_start = '2008-05-01'
wave_1_end = '2008-08-31'
wave_2_start = '2009-04-01'
wave_2_end = '2009-08-01'

# Create wave indicator
df['wave'] = 'other'
df.loc[df['contact_date'] <= wave_1_end, 'wave'] = 'wave_1'
df.loc[(df['contact_date'] >= wave_2_start) & 
       (df['contact_date'] < wave_2_end), 'wave'] = 'wave_2'

# Summary
wave_counts = df['wave'].value_counts()
print("Wave assignment:")
print(wave_counts)
print(f"\nSubscription rates by wave:")
print(df.groupby('wave')['y_binary'].mean())

Wave assignment:
wave
wave_1    23997
wave_2     9145
other      8046
Name: count, dtype: int64

Subscription rates by wave:
wave
other     0.278151
wave_1    0.046089
wave_2    0.141717
Name: y_binary, dtype: float64


## 6. Key Takeaways

**Natural Experiment Identified:**
1. **Clear temporal separation:** 8-month gap between waves
2. **Dramatic economic shock:** All indicators show massive divergence
3. **Exogenous variation:** Financial crisis was unpredictable, not driven by bank strategy
4. **Outcome variation:** Subscription rates differ substantially between waves

**Data Quality:**
- No missing values in key variables
- Contact dates successfully reconstructed
- Clear time series patterns visible

**Next Steps:**
- Identify unique customers (no explicit IDs)
- Check for cross-wave contamination
- Validate covariate balance

## Save Processed Data


In [33]:
# Save wave-labeled dataset for next notebook
df.to_csv('data/processed/data_with_waves.csv', index=False)
print("✅ Saved: data_with_waves.csv")

✅ Saved: data_with_waves.csv
