# Bank Marketing Dataset - Exploratory Data Analysis

This notebook performs a comprehensive exploratory data analysis (EDA) of the bank marketing dataset to understand which client and campaign characteristics are relevant for predicting term deposit subscription.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# App color palette: RGB triples, each rendered below as an 'rgba(...)' string with alpha 0.8
_PALETTE_RGB = (
    (99, 110, 250),    # Blue
    (239, 85, 59),     # Red/Orange-Red
    (0, 204, 150),     # Green
    (171, 99, 250),    # Purple
    (255, 161, 90),    # Orange
    (25, 211, 243),    # Cyan
    (255, 102, 146),   # Pink
    (182, 232, 128),   # Light Green
    (255, 151, 255),   # Magenta
    (254, 203, 82),    # Yellow
)
app_color_palette = [f'rgba({r}, {g}, {b}, 0.8)' for r, g, b in _PALETTE_RGB]

def style_plotly_fig(fig):
    """Give a figure the shared transparent, purple-accented layout.

    Args:
        fig: Object exposing ``update_layout`` (a Plotly figure in this notebook).

    Returns:
        The same ``fig`` object, after ``update_layout`` has been called on it.
    """
    accent = '#8B5CF6'             # body text, tick labels and legend
    accent_dark = '#7C3AED'        # figure title and axis titles
    see_through = 'rgba(0,0,0,0)'  # fully transparent background

    def axis_style():
        # Build a fresh dict per axis so x and y never share one mutable object
        return {
            'gridcolor': 'rgba(139,92,246,0.2)',       # purple-tinted grid
            'zerolinecolor': 'rgba(139,92,246,0.3)',
            'tickfont': {'color': accent, 'size': 11},
            'title_font': {'color': accent_dark, 'size': 12},
        }

    fig.update_layout(
        height=550,
        paper_bgcolor=see_through,
        plot_bgcolor=see_through,
        font={'color': accent, 'size': 12},
        title_font={'color': accent_dark, 'size': 16},
        xaxis=axis_style(),
        yaxis=axis_style(),
        legend={'font': {'color': accent, 'size': 11}},
    )
    return fig

## 1. Dataset Loading and Basic Structure

In [None]:
# Read the training CSV from disk
data_path = '/Users/avivnahon/ds-agent-projects/session_0f4f0dd5-122d-4338-af13-0f967752758c/data/train.csv'
df = pd.read_csv(data_path)

# Meaningful feature names in positional order: the i-th entry replaces the generic column "V{i}"
feature_names = [
    'age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan',
    'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
]
column_mapping = {f'V{pos}': name for pos, name in enumerate(feature_names, start=1)}
column_mapping['target'] = 'y'  # the label column is renamed as well

df = df.rename(columns=column_mapping)

# Quick structural overview: shape, dtypes, then the first rows
print(f"Dataset shape: {df.shape}")
print("\nColumn names and types:")
print(df.dtypes)
print("\nFirst few rows:")
df.head()

## 2. Missing Values and Basic Statistics

In [None]:
# Null count for each column, followed by the grand total
missing_values = df.isnull().sum()
print("Missing values per column:", missing_values, sep="\n")
total_missing = missing_values.sum()
print(f"\nTotal missing values: {total_missing}")

# describe(include='all') summarises every column, not only the numeric ones
print("\n=== BASIC STATISTICS ===")
overall_stats = df.describe(include='all')
print(overall_stats)

## 3. Target Variable Analysis

In [None]:
# Class frequencies of the target column: raw counts and percentages
target_counts = df['y'].value_counts()
target_props = df['y'].value_counts(normalize=True) * 100

print("Target variable distribution:")
# The lookups below use the labels 0 and 1, i.e. they assume a 0/1-encoded target
for cls, label in ((0, 'No'), (1, 'Yes')):
    print(f"Class {cls} ({label}): {target_counts[cls]} ({target_props[cls]:.1f}%)")

# Bar chart with one bar per class, coloured with the first two palette entries
class_labels = ['No Subscription', 'Subscription']
fig = px.bar(
    x=class_labels,
    y=[target_counts[0], target_counts[1]],
    color=list(class_labels),
    color_discrete_sequence=app_color_palette[:2],
)
fig.update_layout(
    xaxis_title='Term Deposit Subscription',
    yaxis_title='Number of Clients',
    showlegend=False,
)
fig = style_plotly_fig(fig)

# Save a standalone HTML copy, then display the figure
plot_path = "/Users/avivnahon/ds-agent-projects/session_0f4f0dd5-122d-4338-af13-0f967752758c/research/plots/target_distribution.html"
html_config = {'responsive': True, 'displayModeBar': False}
fig.write_html(plot_path, include_plotlyjs=True, config=html_config)
fig.show()

## 4. Categorical Features Analysis

In [None]:
# Columns treated as categorical features
categorical_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

# Print the frequency table of every categorical feature
print("Categorical feature distributions:")
for col in categorical_cols:
    print(f"\n{col.upper()}:")
    print(df[col].value_counts())

# Job distribution analysis: one bar per job type, ordered by descending total
job_counts = df['job'].value_counts()
fig = px.bar(x=job_counts.index, y=job_counts.values,
             color_discrete_sequence=[app_color_palette[0]])
fig.update_layout(xaxis_title='Job Type',
                  yaxis_title='Number of Clients',
                  xaxis={'categoryorder': 'total descending'})
# NOTE: the Figure method is update_xaxes (plural); there is no update_xaxis,
# so the singular spelling raises AttributeError before the plot is saved.
fig.update_xaxes(tickangle=45)
fig = style_plotly_fig(fig)
# Save a standalone HTML copy, then display the figure
fig.write_html("/Users/avivnahon/ds-agent-projects/session_0f4f0dd5-122d-4338-af13-0f967752758c/research/plots/job_distribution.html",
               include_plotlyjs=True, config={'responsive': True, 'displayModeBar': False})
fig.show()

## 5. Numerical Features Analysis

In [None]:
# Columns treated as numeric features
numerical_cols = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

print("Numerical feature statistics:")
numeric_summary = df[numerical_cols].describe()
print(numeric_summary)

# Histogram of client age with 30 bins, drawn in the first palette colour
bar_colour = app_color_palette[0]
fig = px.histogram(
    df,
    x='age',
    nbins=30,
    color_discrete_sequence=[bar_colour],
)
axis_titles = {'xaxis_title': 'Age', 'yaxis_title': 'Number of Clients'}
fig.update_layout(**axis_titles)
fig = style_plotly_fig(fig)

# Save a standalone HTML copy, then display the figure
plot_path = "/Users/avivnahon/ds-agent-projects/session_0f4f0dd5-122d-4338-af13-0f967752758c/research/plots/age_distribution.html"
fig.write_html(
    plot_path,
    include_plotlyjs=True,
    config={'responsive': True, 'displayModeBar': False},
)
fig.show()

## 6. Balance Distribution Analysis

In [None]:
# Percentile cut-offs used to trim extreme balances from the plot
balance = df['balance']
balance_q95 = balance.quantile(0.95)
balance_q5 = balance.quantile(0.05)

print("Balance statistics:")
print(f"Min: {balance.min()}")
print(f"Max: {balance.max()}")
print(f"5th percentile: {balance_q5:.2f}")
print(f"95th percentile: {balance_q95:.2f}")
print(f"Negative balance count: {balance.lt(0).sum()}")

# Keep only rows whose balance lies within [5th, 95th] percentile (both ends inclusive)
in_range = df['balance'].between(balance_q5, balance_q95)
balance_trimmed = df[in_range]

fig = px.histogram(
    balance_trimmed,
    x='balance',
    nbins=50,
    color_discrete_sequence=[app_color_palette[2]],
)
fig.update_layout(
    xaxis_title='Account Balance (5th-95th percentile)',
    yaxis_title='Number of Clients',
)
fig = style_plotly_fig(fig)

# Save a standalone HTML copy, then display the figure
plot_path = "/Users/avivnahon/ds-agent-projects/session_0f4f0dd5-122d-4338-af13-0f967752758c/research/plots/balance_distribution.html"
fig.write_html(
    plot_path,
    include_plotlyjs=True,
    config={'responsive': True, 'displayModeBar': False},
)
fig.show()

## 7. Feature Correlations

In [None]:
# Correlation matrix (DataFrame.corr defaults) over the numeric features plus the target.
# NOTE(review): this assumes 'y' is numerically encoded — confirm against the data.
corr_matrix = df[numerical_cols + ['y']].corr()
print("Correlation with target variable:")
# Ranked by magnitude: the printed values are absolute, so the sign is not shown
target_corr = corr_matrix['y'].abs().sort_values(ascending=False)
print(target_corr)

# Correlation heatmap. zmin/zmax pin the colour axis to the full correlation
# range [-1, 1] so the neutral midpoint of the diverging RdBu_r scale sits at
# zero; with an auto-ranged axis the midpoint drifts to wherever the data's
# min/max happen to fall and the colours no longer indicate the sign.
fig = px.imshow(corr_matrix,
                color_continuous_scale='RdBu_r',
                zmin=-1,
                zmax=1,
                aspect='auto',
                text_auto='.2f')
fig.update_layout(xaxis_title='Features',
                  yaxis_title='Features')
fig = style_plotly_fig(fig)
# Save a standalone HTML copy, then display the figure
fig.write_html("/Users/avivnahon/ds-agent-projects/session_0f4f0dd5-122d-4338-af13-0f967752758c/research/plots/correlation_matrix.html",
               include_plotlyjs=True, config={'responsive': True, 'displayModeBar': False})
fig.show()

## 8. Call Duration Analysis

In [None]:
# Descriptive statistics of the duration column, split by target class
print("Duration statistics by target:")
duration_stats = df.groupby('y')['duration'].describe()
print(duration_stats)

# Box plot of duration per target class; the y axis is switched to a log scale below
fig = px.box(
    df,
    x='y',
    y='duration',
    color='y',
    color_discrete_sequence=app_color_palette[:2],
)
layout_opts = {
    'xaxis_title': 'Target (0=No, 1=Yes)',
    'yaxis_title': 'Call Duration (seconds)',
    'yaxis_type': 'log',
    'showlegend': False,
}
fig.update_layout(**layout_opts)
fig = style_plotly_fig(fig)

# Save a standalone HTML copy, then display the figure
plot_path = "/Users/avivnahon/ds-agent-projects/session_0f4f0dd5-122d-4338-af13-0f967752758c/research/plots/duration_by_target.html"
fig.write_html(
    plot_path,
    include_plotlyjs=True,
    config={'responsive': True, 'displayModeBar': False},
)
fig.show()

## 9. Previous Campaign Analysis

In [None]:
# Row-normalised crosstab: for each poutcome value, the percentage of rows per target class
poutcome_target = pd.crosstab(df['poutcome'], df['y'], normalize='index').mul(100)
print("Subscription rate by previous campaign outcome:")
print(poutcome_target)

# Raw counts per (poutcome, target) pair, one column per target class
poutcome_counts = df.groupby(['poutcome', 'y']).size().unstack(fill_value=0)
no_counts = poutcome_counts[0]
yes_counts = poutcome_counts[1]

# Grouped bars: the two count series are passed together as y
fig = px.bar(
    x=poutcome_counts.index,
    y=[no_counts, yes_counts],
    color_discrete_sequence=app_color_palette[:2],
    barmode='group',
)
fig.update_layout(
    xaxis_title='Previous Campaign Outcome',
    yaxis_title='Number of Clients',
    legend_title='Target',
)
fig = style_plotly_fig(fig)

# Save a standalone HTML copy, then display the figure
plot_path = "/Users/avivnahon/ds-agent-projects/session_0f4f0dd5-122d-4338-af13-0f967752758c/research/plots/previous_campaign_outcome.html"
fig.write_html(
    plot_path,
    include_plotlyjs=True,
    config={'responsive': True, 'displayModeBar': False},
)
fig.show()

## 10. Monthly Campaign Analysis

In [None]:
# Per-month row count and mean of the target; the mean is scaled to a percentage below
monthly_success = (
    df.groupby('month')['y']
    .agg(['count', 'mean'])
    .reset_index()
)
monthly_success['success_rate'] = monthly_success['mean'].mul(100)

print("Campaign effectiveness by month:")
ranked_months = monthly_success.sort_values('success_rate', ascending=False)
print(ranked_months)

# Re-order the rows by calendar month (not alphabetically) before plotting
month_order = [
    'jan', 'feb', 'mar', 'apr', 'may', 'jun',
    'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
]
monthly_success['month'] = pd.Categorical(
    monthly_success['month'],
    categories=month_order,
    ordered=True,
)
monthly_success = monthly_success.sort_values('month')

# Line chart of the success rate per month, with point markers
fig = px.line(
    monthly_success,
    x='month',
    y='success_rate',
    markers=True,
    color_discrete_sequence=[app_color_palette[0]],
)
fig.update_layout(
    xaxis_title='Month',
    yaxis_title='Success Rate (%)',
)
fig = style_plotly_fig(fig)

# Save a standalone HTML copy, then display the figure
plot_path = "/Users/avivnahon/ds-agent-projects/session_0f4f0dd5-122d-4338-af13-0f967752758c/research/plots/monthly_success_rate.html"
fig.write_html(
    plot_path,
    include_plotlyjs=True,
    config={'responsive': True, 'displayModeBar': False},
)
fig.show()

## 11. Summary Statistics

In [None]:
# Headline figures; target_props and monthly_success are read from earlier cells
print("=== EDA SUMMARY ===")

n_rows, n_cols = df.shape[0], df.shape[1]
print(f"Dataset size: {n_rows} rows, {n_cols} columns")

total_missing = df.isnull().sum().sum()
print(f"Missing values: {total_missing}")

pct_no, pct_yes = target_props[0], target_props[1]
print(f"Target class distribution: {pct_no:.1f}% No, {pct_yes:.1f}% Yes")

top_job = df['job'].mode()[0]
print(f"Most common job: {top_job}")

mean_age = df['age'].mean()
print(f"Average age: {mean_age:.1f} years")

mean_duration = df['duration'].mean()
print(f"Average call duration: {mean_duration:.1f} seconds")

# Row label of the highest success rate, then the month stored in that row
best_row = monthly_success['success_rate'].idxmax()
best_month = monthly_success.loc[best_row, 'month']
print(f"Highest success rate month: {best_month}")