# Bank Marketing Campaign - Exploratory Data Analysis

This notebook performs comprehensive EDA on the bank marketing dataset to understand data characteristics and inform preprocessing decisions.

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# Suppress every warning category so library notices do not clutter cell output.
warnings.filterwarnings(action='ignore')

# Show all columns and let pandas auto-detect the output width when printing frames.
pd.options.display.max_columns = None
pd.options.display.width = None

In [None]:
# Read the training CSV into a DataFrame and take a first look at it.
train_csv_path = '/Users/yuvalheffetz/ds-agent-projects/session_313737e4-b92d-4cb9-8eb5-68f5df26d5d6/data/train.csv'
train_data = pd.read_csv(train_csv_path)

# Dimensions first, then the per-column dtype / non-null report.
print(f"Dataset shape: {train_data.shape}")
print("\nDataset info:")
train_data.info()

# The trailing expression is what the notebook renders as the cell output.
print("\nFirst few rows:")
train_data.head()

In [None]:
# Shared colour palette so that every figure in the notebook uses the same hues.
# Each entry is an (R, G, B) triple; all colours are rendered with a 0.8 alpha.
_palette_rgb = [
    (99, 110, 250),   # Blue
    (239, 85, 59),    # Red/Orange-Red
    (0, 204, 150),    # Green
    (171, 99, 250),   # Purple
    (255, 161, 90),   # Orange
    (25, 211, 243),   # Cyan
    (255, 102, 146),  # Pink
    (182, 232, 128),  # Light Green
    (255, 151, 255),  # Magenta
    (254, 203, 82),   # Yellow
]
app_color_palette = [f'rgba({r}, {g}, {b}, 0.8)' for r, g, b in _palette_rgb]

## 1. Dataset Overview and Data Quality

In [None]:
# Per-column dtype and missing-value counts, combined into one summary table.
data_types = train_data.dtypes
missing_values = train_data.isnull().sum()
missing_pct = (missing_values / len(train_data)) * 100

data_quality_df = pd.DataFrame({
    'Data Type': data_types,
    'Missing Values': missing_values,
    'Missing %': missing_pct,
})

print("Data Quality Summary:")
print(data_quality_df)

# Descriptive statistics for every numeric column.
print("\nBasic Statistics for Numerical Features:")
numerical_cols = train_data.select_dtypes(include=[np.number]).columns
numeric_summary = train_data[numerical_cols].describe()
print(numeric_summary)

# For each object-dtype column: its cardinality and its 10 most frequent values.
print("\nBasic Statistics for Categorical Features:")
categorical_cols = train_data.select_dtypes(include=['object']).columns
for cat_col in categorical_cols:
    cat_values = train_data[cat_col]
    print(f"\n{cat_col}: {cat_values.nunique()} unique values")
    print(cat_values.value_counts().head(10))

In [None]:
# Bar chart: percentage of missing values for each feature.
fig = go.Figure()
fig.add_trace(go.Bar(
    x=data_quality_df.index,
    y=data_quality_df['Missing %'],
    marker_color=app_color_palette[0],
))

# Styling shared by both axes; only the axis titles differ.
axis_style = dict(
    gridcolor='rgba(139,92,246,0.2)',
    zerolinecolor='rgba(139,92,246,0.3)',
    tickfont=dict(color='#8B5CF6', size=11),
    title_font=dict(color='#7C3AED', size=12),
)

fig.update_layout(
    height=550,
    paper_bgcolor='rgba(0,0,0,0)',  # fully transparent backgrounds
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(color='#8B5CF6', size=12),
    title_font=dict(color='#7C3AED', size=16),
    xaxis=dict(title='Features', **axis_style),
    yaxis=dict(title='Missing Values (%)', **axis_style),
    legend=dict(font=dict(color='#8B5CF6', size=11)),
)

# Save a standalone HTML copy (plotly.js embedded), then render inline.
output_html = "/Users/yuvalheffetz/ds-agent-projects/session_313737e4-b92d-4cb9-8eb5-68f5df26d5d6/research/plots/data_quality_overview.html"
fig.write_html(
    output_html,
    include_plotlyjs=True,
    config={'responsive': True, 'displayModeBar': False},
)
fig.show()

## 2. Target Variable Analysis

In [None]:
# Name of the label column; 'y' is an assumption based on typical bank marketing datasets.
target_col = 'y'

# Absolute and relative (percentage) class frequencies of the target.
target_counts = train_data[target_col].value_counts()
target_pct = train_data[target_col].value_counts(normalize=True) * 100

print(f"Target variable '{target_col}' distribution:")
print(f"Counts:\n{target_counts}")
print(f"\nPercentages:\n{target_pct}")

# Pie chart of the class counts, coloured with the first two palette entries.
fig = px.pie(
    names=target_counts.index,
    values=target_counts.values,
    color_discrete_sequence=app_color_palette[:2],
)

pie_layout = dict(
    height=550,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(color='#8B5CF6', size=12),
    title_font=dict(color='#7C3AED', size=16),
    legend=dict(font=dict(color='#8B5CF6', size=11)),
)
fig.update_layout(**pie_layout)

# Save a standalone HTML copy (plotly.js embedded), then render inline.
output_html = "/Users/yuvalheffetz/ds-agent-projects/session_313737e4-b92d-4cb9-8eb5-68f5df26d5d6/research/plots/target_distribution.html"
fig.write_html(
    output_html,
    include_plotlyjs=True,
    config={'responsive': True, 'displayModeBar': False},
)
fig.show()

## 3. Numerical Features Distribution

In [None]:
# Numeric feature columns, excluding the target if it happens to be numeric.
numerical_cols = train_data.select_dtypes(include=[np.number]).columns.tolist()
if target_col in numerical_cols:
    numerical_cols.remove(target_col)

print(f"Numerical features: {numerical_cols}")

# Histograms are laid out on a grid that is at most this many panels wide.
MAX_GRID_COLS = 3
n_cols = len(numerical_cols)

if n_cols == 0:
    # With no numeric features the grid would be 0x0, which make_subplots
    # rejects with a ValueError; skip the figure instead of crashing the run.
    print("No numerical features found; skipping distribution plots")
else:
    rows = (n_cols + MAX_GRID_COLS - 1) // MAX_GRID_COLS  # ceiling division
    cols = min(MAX_GRID_COLS, n_cols)

    fig = make_subplots(rows=rows, cols=cols,
                        subplot_titles=numerical_cols)

    # One histogram per feature, filled row by row; subplot indices are 1-based.
    for i, col in enumerate(numerical_cols):
        row_idx, col_idx = divmod(i, MAX_GRID_COLS)
        row = row_idx + 1
        col_pos = col_idx + 1

        fig.add_trace(
            go.Histogram(x=train_data[col], name=col,
                         # Wrap around so more features than colours still works.
                         marker_color=app_color_palette[i % len(app_color_palette)],
                         showlegend=False),
            row=row, col=col_pos
        )

    fig.update_layout(
        height=550,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        font=dict(color='#8B5CF6', size=12),
        title_font=dict(color='#7C3AED', size=16)
    )

    # Apply the same grid/tick styling to every subplot axis.
    fig.update_xaxes(gridcolor='rgba(139,92,246,0.2)', zerolinecolor='rgba(139,92,246,0.3)',
                     tickfont=dict(color='#8B5CF6', size=10),
                     title_font=dict(color='#7C3AED', size=11))
    fig.update_yaxes(gridcolor='rgba(139,92,246,0.2)', zerolinecolor='rgba(139,92,246,0.3)',
                     tickfont=dict(color='#8B5CF6', size=10),
                     title_font=dict(color='#7C3AED', size=11))

    # Save a standalone HTML copy (plotly.js embedded), then render inline.
    fig.write_html("/Users/yuvalheffetz/ds-agent-projects/session_313737e4-b92d-4cb9-8eb5-68f5df26d5d6/research/plots/numerical_distributions.html",
                   include_plotlyjs=True, config={'responsive': True, 'displayModeBar': False})
    fig.show()

## 4. Categorical Features Distribution

In [None]:
# Object-dtype feature columns, excluding the target if it is one of them.
categorical_cols = [
    name
    for name in train_data.select_dtypes(include=['object']).columns
    if name != target_col
]

print(f"Categorical features: {categorical_cols}")

# Only the first four categorical columns (in dataset column order) are plotted,
# which matches the fixed 2x2 grid below.
key_categorical = categorical_cols[:4]

fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=key_categorical,
    specs=[[{"type": "xy"} for _ in range(2)] for _ in range(2)],
)

# One bar chart per selected feature, filled row by row; subplot indices are 1-based.
for i, col in enumerate(key_categorical):
    row_idx, col_idx = divmod(i, 2)

    # Keep only the 10 most frequent categories of this feature.
    top_counts = train_data[col].value_counts().head(10)

    bar = go.Bar(
        x=top_counts.index,
        y=top_counts.values,
        name=col,
        marker_color=app_color_palette[i % len(app_color_palette)],
        showlegend=False,
    )
    fig.add_trace(bar, row=row_idx + 1, col=col_idx + 1)

fig.update_layout(
    height=550,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(color='#8B5CF6', size=12),
    title_font=dict(color='#7C3AED', size=16),
)

# The same grid/tick styling is applied to every subplot axis.
subplot_axis_style = dict(
    gridcolor='rgba(139,92,246,0.2)',
    zerolinecolor='rgba(139,92,246,0.3)',
    tickfont=dict(color='#8B5CF6', size=10),
    title_font=dict(color='#7C3AED', size=11),
)
fig.update_xaxes(**subplot_axis_style)
fig.update_yaxes(**subplot_axis_style)

# Save a standalone HTML copy (plotly.js embedded), then render inline.
output_html = "/Users/yuvalheffetz/ds-agent-projects/session_313737e4-b92d-4cb9-8eb5-68f5df26d5d6/research/plots/categorical_distributions.html"
fig.write_html(
    output_html,
    include_plotlyjs=True,
    config={'responsive': True, 'displayModeBar': False},
)
fig.show()

## 5. Feature Correlations

In [None]:
# Correlation matrix of the numeric features plus a 0/1-encoded target.
# Work on a copy so the encoding never touches train_data itself.
data_for_corr = train_data.copy()
target_series = train_data[target_col]

if pd.api.types.is_bool_dtype(target_series):
    # Boolean columns are not selected by include=[np.number] below, so cast to 0/1.
    data_for_corr[target_col] = target_series.astype(int)
elif not pd.api.types.is_numeric_dtype(target_series):
    # Any non-numeric target (object, category, ...) is encoded as 1 for the
    # positive label and 0 for everything else (missing labels also become 0).
    positive_label = 'yes'
    is_positive = target_series == positive_label
    if not is_positive.any():
        # Without this check the encoded target would be constant zero and all
        # of its correlations would silently come out as NaN.
        raise ValueError(
            f"Target column '{target_col}' contains no '{positive_label}' values; "
            f"found labels: {target_series.dropna().unique().tolist()}"
        )
    data_for_corr[target_col] = is_positive.astype(int)

# Select numerical columns including encoded target
numerical_with_target = data_for_corr.select_dtypes(include=[np.number]).columns
correlation_matrix = data_for_corr[numerical_with_target].corr()

# Heatmap with a diverging colour scale centred on zero; each cell is
# annotated with its coefficient rounded to two decimals.
fig = go.Figure(data=go.Heatmap(
    z=correlation_matrix.values,
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    colorscale='RdBu_r',
    zmid=0,
    text=np.round(correlation_matrix.values, 2),
    texttemplate="%{text}",
    textfont={"size": 10}
))

fig.update_layout(
    height=550,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(color='#8B5CF6', size=12),
    title_font=dict(color='#7C3AED', size=16),
    xaxis=dict(
        tickfont=dict(color='#8B5CF6', size=10),
        title_font=dict(color='#7C3AED', size=11)
    ),
    yaxis=dict(
        tickfont=dict(color='#8B5CF6', size=10),
        title_font=dict(color='#7C3AED', size=11)
    )
)

# Save a standalone HTML copy (plotly.js embedded), then render inline.
fig.write_html("/Users/yuvalheffetz/ds-agent-projects/session_313737e4-b92d-4cb9-8eb5-68f5df26d5d6/research/plots/feature_correlations.html",
               include_plotlyjs=True, config={'responsive': True, 'displayModeBar': False})
fig.show()

# Correlation of each feature with the target, strongest (by absolute value) first.
target_correlations = correlation_matrix[target_col].drop(target_col).sort_values(key=abs, ascending=False)
print(f"\nCorrelations with target variable '{target_col}':")
print(target_correlations)

## 6. Age Distribution Analysis

In [None]:
# Overlaid age histograms, one semi-transparent trace per target class.
# NOTE: this cell requires an 'age' column in train_data.
fig = go.Figure()

# dropna(): a NaN label never compares equal to itself, so it would only
# contribute an empty trace and a meaningless legend entry.
target_values = train_data[target_col].dropna().unique()

for i, target_value in enumerate(target_values):
    subset = train_data.loc[train_data[target_col] == target_value, 'age']
    fig.add_trace(go.Histogram(
        x=subset,
        name=f'{target_col}={target_value}',
        opacity=0.7,
        # Wrap around the palette (as the subplot cells do) so that more
        # classes than colours cannot raise an IndexError.
        marker_color=app_color_palette[i % len(app_color_palette)]
    ))

fig.update_layout(
    barmode='overlay',  # draw the classes on top of each other, not stacked
    height=550,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(color='#8B5CF6', size=12),
    title_font=dict(color='#7C3AED', size=16),
    xaxis=dict(
        title='Age',
        gridcolor='rgba(139,92,246,0.2)',
        zerolinecolor='rgba(139,92,246,0.3)',
        tickfont=dict(color='#8B5CF6', size=11),
        title_font=dict(color='#7C3AED', size=12)
    ),
    yaxis=dict(
        title='Count',
        gridcolor='rgba(139,92,246,0.2)',
        zerolinecolor='rgba(139,92,246,0.3)',
        tickfont=dict(color='#8B5CF6', size=11),
        title_font=dict(color='#7C3AED', size=12)
    ),
    legend=dict(font=dict(color='#8B5CF6', size=11))
)

# Save a standalone HTML copy (plotly.js embedded), then render inline.
fig.write_html("/Users/yuvalheffetz/ds-agent-projects/session_313737e4-b92d-4cb9-8eb5-68f5df26d5d6/research/plots/age_by_target.html",
               include_plotlyjs=True, config={'responsive': True, 'displayModeBar': False})
fig.show()

## 7. Job Type Analysis

In [None]:
# Share of each target class within every job type.
# normalize='index' makes each row (job) sum to 1; * 100 converts to percent.
# NOTE: this cell requires a 'job' column in train_data.
job_target_crosstab = pd.crosstab(train_data['job'], train_data[target_col], normalize='index') * 100

fig = go.Figure()

# One bar series per target class; stacked, every job's bar adds up to 100%.
for i, target_value in enumerate(job_target_crosstab.columns):
    fig.add_trace(go.Bar(
        name=f'{target_col}={target_value}',
        x=job_target_crosstab.index,
        y=job_target_crosstab[target_value],
        # Wrap around the palette (as the subplot cells do) so that more
        # classes than colours cannot raise an IndexError.
        marker_color=app_color_palette[i % len(app_color_palette)]
    ))

fig.update_layout(
    barmode='stack',
    height=550,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(color='#8B5CF6', size=12),
    title_font=dict(color='#7C3AED', size=16),
    xaxis=dict(
        title='Job Type',
        gridcolor='rgba(139,92,246,0.2)',
        zerolinecolor='rgba(139,92,246,0.3)',
        tickfont=dict(color='#8B5CF6', size=10),
        title_font=dict(color='#7C3AED', size=12),
        tickangle=45  # slant the job labels so they do not overlap
    ),
    yaxis=dict(
        title='Percentage (%)',
        gridcolor='rgba(139,92,246,0.2)',
        zerolinecolor='rgba(139,92,246,0.3)',
        tickfont=dict(color='#8B5CF6', size=11),
        title_font=dict(color='#7C3AED', size=12)
    ),
    legend=dict(font=dict(color='#8B5CF6', size=11))
)

# Save a standalone HTML copy (plotly.js embedded), then render inline.
fig.write_html("/Users/yuvalheffetz/ds-agent-projects/session_313737e4-b92d-4cb9-8eb5-68f5df26d5d6/research/plots/job_type_analysis.html",
               include_plotlyjs=True, config={'responsive': True, 'displayModeBar': False})
fig.show()

# Print the underlying percentages (rows: job type, columns: target class).
print("Job type subscription rates:")
print(job_target_crosstab)

## 8. Duration Impact on the Target

In [None]:
# Box plots of the 'duration' column per target class; skipped with a
# message when the dataset has no such column.
if 'duration' in train_data.columns:
    fig = go.Figure()

    # dropna(): a NaN label never compares equal to itself, so it would only
    # contribute an empty box and a meaningless legend entry.
    target_values = train_data[target_col].dropna().unique()

    for i, target_value in enumerate(target_values):
        subset = train_data.loc[train_data[target_col] == target_value, 'duration']
        fig.add_trace(go.Box(
            y=subset,
            name=f'{target_col}={target_value}',
            # Wrap around the palette (as the subplot cells do) so that more
            # classes than colours cannot raise an IndexError.
            marker_color=app_color_palette[i % len(app_color_palette)]
        ))

    fig.update_layout(
        height=550,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        font=dict(color='#8B5CF6', size=12),
        title_font=dict(color='#7C3AED', size=16),
        xaxis=dict(
            title='Target Variable',
            gridcolor='rgba(139,92,246,0.2)',
            zerolinecolor='rgba(139,92,246,0.3)',
            tickfont=dict(color='#8B5CF6', size=11),
            title_font=dict(color='#7C3AED', size=12)
        ),
        yaxis=dict(
            title='Duration (seconds)',
            gridcolor='rgba(139,92,246,0.2)',
            zerolinecolor='rgba(139,92,246,0.3)',
            tickfont=dict(color='#8B5CF6', size=11),
            title_font=dict(color='#7C3AED', size=12)
        ),
        legend=dict(font=dict(color='#8B5CF6', size=11))
    )

    # Save a standalone HTML copy (plotly.js embedded), then render inline.
    fig.write_html("/Users/yuvalheffetz/ds-agent-projects/session_313737e4-b92d-4cb9-8eb5-68f5df26d5d6/research/plots/duration_analysis.html",
                   include_plotlyjs=True, config={'responsive': True, 'displayModeBar': False})
    fig.show()

    # Print duration statistics by target
    print("Duration statistics by target:")
    print(train_data.groupby(target_col)['duration'].describe())
else:
    print("Duration column not found in dataset")

## 9. Summary Statistics and Key Insights

In [None]:
# Headline numbers for the whole dataset.
# numerical_cols / categorical_cols are the feature lists built in the
# distribution sections above (target column already removed).
print("=== EDA SUMMARY ===")
print(f"Dataset Shape: {train_data.shape}")
print(f"Missing Values: {train_data.isnull().sum().sum()}")
print(f"Target Variable: {target_col}")
# to_dict() keeps this on one line; formatting the Series itself would embed
# its multi-line repr (index header, dtype footer) in the summary.
target_distribution = train_data[target_col].value_counts(normalize=True).round(3).to_dict()
print(f"Target Distribution: {target_distribution}")
print(f"Numerical Features: {len(numerical_cols)}")
print(f"Categorical Features: {len(categorical_cols)}")

# One line per feature: cardinality and mode for non-numeric columns,
# mean / std / range for numeric ones.
print("\n=== FEATURE INSIGHTS ===")
for col in train_data.columns:
    if col == target_col:
        continue

    series = train_data[col]
    # Booleans count as numeric for pandas but cannot be formatted with ':.2f',
    # so they are summarised like categories, as is every other non-numeric dtype.
    is_numeric_feature = (
        pd.api.types.is_numeric_dtype(series)
        and not pd.api.types.is_bool_dtype(series)
    )

    if is_numeric_feature:
        print(f"{col}: mean={series.mean():.2f}, std={series.std():.2f}, range=[{series.min():.2f}, {series.max():.2f}]")
    else:
        # mode() is empty when the column holds only missing values.
        modes = series.mode()
        most_common = modes.iloc[0] if not modes.empty else 'n/a'
        print(f"{col}: {series.nunique()} categories, most common: {most_common}")