# Bank Marketing Term Deposit Prediction - Exploratory Data Analysis

This notebook performs comprehensive exploratory data analysis on the Bank Marketing Term Deposit dataset to understand the data characteristics and inform preprocessing and feature engineering decisions.

In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
from scipy import stats
warnings.filterwarnings('ignore')

# Notebook-wide colour cycle: every figure draws from this list so the plots
# stay visually consistent. Colours are kept as RGB triples and rendered into
# CSS-style ``rgba(...)`` strings that all share one opacity value.
_PALETTE_OPACITY = 0.8
_PALETTE_RGB = (
    (99, 110, 250),   # Blue
    (239, 85, 59),    # Red/Orange-Red
    (0, 204, 150),    # Green
    (171, 99, 250),   # Purple
    (255, 161, 90),   # Orange
    (25, 211, 243),   # Cyan
    (255, 102, 146),  # Pink
    (182, 232, 128),  # Light Green
    (255, 151, 255),  # Magenta
    (254, 203, 82),   # Yellow
)
app_color_palette = [
    f'rgba({red}, {green}, {blue}, {_PALETTE_OPACITY})'
    for red, green, blue in _PALETTE_RGB
]

## 1. Data Loading and Initial Exploration

In [None]:
# Load the training dataset.
# The CSV location can be overridden through the BANK_MARKETING_TRAIN_CSV
# environment variable so the notebook is runnable on machines other than the
# one it was authored on. When the variable is unset the original absolute
# path is used, so existing runs behave exactly as before.
DEFAULT_TRAIN_CSV = '/Users/avivnahon/ds-agent-projects/session_89600b04-b810-4506-b66d-91e28f4f611b/data/train_set.csv'
data_path = os.environ.get('BANK_MARKETING_TRAIN_CSV', DEFAULT_TRAIN_CSV)
df = pd.read_csv(data_path)

# Quick structural overview: dimensions, per-column dtypes/non-null counts,
# and a preview of the first rows (rendered as the cell's output).
print(f"Dataset shape: {df.shape}")
print(f"\nDataset info:")
df.info()
print(f"\nFirst few rows:")
df.head()

## 2. Data Types and Missing Values Analysis

In [None]:
# Count nulls per column. Note that isnull() only flags NaN/None entries.
missing_values = df.isnull().sum()
print("Missing values per column:")
print(missing_values[missing_values > 0] if missing_values.sum() > 0 else "No missing values found")

# Data types analysis
print(f"\nData types:")
print(df.dtypes)

# Split the columns into categorical and numerical groups by dtype family.
# Text columns may be stored as 'object', 'string' or 'category' depending on
# the pandas version and how the frame was built, and numeric columns are not
# guaranteed to be exactly int64/float64, so whole dtype families are selected
# to avoid silently dropping columns from the downstream analysis.
categorical_cols = df.select_dtypes(include=['object', 'string', 'category']).columns.tolist()
numerical_cols = df.select_dtypes(include='number').columns.tolist()

print(f"\nCategorical columns ({len(categorical_cols)}): {categorical_cols}")
print(f"Numerical columns ({len(numerical_cols)}): {numerical_cols}")

# Basic statistics (rendered as the cell's output)
print(f"\nBasic statistics for numerical columns:")
df[numerical_cols].describe()

## 3. Target Variable Analysis

In [None]:
# Target variable analysis: absolute class counts and class shares (in %).
target_col = 'y'
print(f"Target variable distribution:")
target_counts = df[target_col].value_counts()
target_percentages = df[target_col].value_counts(normalize=True) * 100

print(target_counts)
print(f"\nTarget variable percentages:")
for label, share in zip(target_percentages.index, target_percentages):
    print(f"{label}: {share:.2f}%")

# Bar chart of the class counts, one colour per class.
fig = px.bar(
    x=target_counts.index,
    y=target_counts.values,
    labels={'x': 'Target Variable (y)', 'y': 'Count'},
    color=target_counts.index,
    color_discrete_sequence=app_color_palette[:2],
)

# Both axes use the same theme, so it is defined once and applied twice.
themed_axis = dict(
    gridcolor='rgba(139,92,246,0.2)',
    zerolinecolor='rgba(139,92,246,0.3)',
    tickfont=dict(color='#8B5CF6', size=11),
    title_font=dict(color='#7C3AED', size=12),
)
fig.update_layout(
    height=550,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(color='#8B5CF6', size=12),
    title_font=dict(color='#7C3AED', size=16),
    xaxis=dict(themed_axis),
    yaxis=dict(themed_axis),
    legend=dict(font=dict(color='#8B5CF6', size=11)),
)

# Persist a standalone HTML copy of the figure, then render it inline.
target_plot_path = "/Users/avivnahon/ds-agent-projects/session_89600b04-b810-4506-b66d-91e28f4f611b/research/plots/target_distribution.html"
fig.write_html(
    target_plot_path,
    include_plotlyjs=True,
    config={'responsive': True, 'displayModeBar': False},
)
fig.show()

## 4. Categorical Features Analysis

In [None]:
# Cardinality report for every categorical predictor: number of distinct
# values, the values themselves when there are at most 10, and frequencies.
print("Categorical features unique values:")
for feature in categorical_cols:
    if feature == target_col:
        # The target is covered in its own section; skip it here.
        continue
    n_unique = df[feature].nunique()
    print(f"{feature}: {n_unique} unique values")
    if n_unique <= 10:
        print(f"  Values: {df[feature].unique()}")
    print(f"  Value counts:\n{df[feature].value_counts()}")
    print("=" * 50)

In [None]:
# Grid of bar charts: one panel per categorical predictor, showing how often
# each category occurs.
categorical_features = [name for name in categorical_cols if name != target_col]

# Lay the panels out three per row; rows = ceil(n_features / n_cols).
n_features = len(categorical_features)
n_cols = 3
n_rows = -(-n_features // n_cols)

fig = make_subplots(
    rows=n_rows, cols=n_cols,
    subplot_titles=categorical_features,
    specs=[[{"type": "bar"} for _ in range(n_cols)] for _ in range(n_rows)]
)

for idx, feature in enumerate(categorical_features):
    # divmod gives the zero-based grid cell; plotly's grid is one-based.
    grid_row, grid_col = divmod(idx, n_cols)
    counts = df[feature].value_counts()
    bar_trace = go.Bar(
        x=counts.index,
        y=counts.values,
        name=feature,
        marker_color=app_color_palette[idx % len(app_color_palette)],
        showlegend=False,
    )
    fig.add_trace(bar_trace, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(
    height=550,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(color='#8B5CF6', size=10),
    title_font=dict(color='#7C3AED', size=14)
)

# The x and y axes of every panel share one theme.
axis_theme = dict(
    gridcolor='rgba(139,92,246,0.2)',
    tickfont=dict(color='#8B5CF6', size=9),
    title_font=dict(color='#7C3AED', size=10),
)
fig.update_xaxes(**axis_theme)
fig.update_yaxes(**axis_theme)

# Persist a standalone HTML copy of the figure, then render it inline.
fig.write_html(
    "/Users/avivnahon/ds-agent-projects/session_89600b04-b810-4506-b66d-91e28f4f611b/research/plots/categorical_features_distribution.html",
    include_plotlyjs=True,
    config={'responsive': True, 'displayModeBar': False},
)
fig.show()

## 5. Numerical Features Distribution and Outliers

In [None]:
# Summary statistics for the numerical predictors (target excluded).
numerical_features = [name for name in numerical_cols if name != target_col]
print(f"Numerical features statistics:")
print(df[numerical_features].describe())

# Flag outliers with the IQR rule: a value is an outlier when it lies more
# than 1.5 * IQR below the first quartile or above the third quartile.
print("\nOutlier analysis (IQR method):")
for feature in numerical_features:
    series = df[feature]
    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    spread = third_quartile - first_quartile
    lower_bound = first_quartile - 1.5 * spread
    upper_bound = third_quartile + 1.5 * spread

    # Count the flagged rows straight from the boolean mask.
    is_outlier = (series < lower_bound) | (series > upper_bound)
    n_outliers = int(is_outlier.sum())
    print(f"{feature}: {n_outliers} outliers ({n_outliers/len(df)*100:.2f}% of data)")
    print(f"  Range: [{series.min():.2f}, {series.max():.2f}]")
    print(f"  Normal range (IQR): [{lower_bound:.2f}, {upper_bound:.2f}]")
    print("=" * 50)

In [None]:
# Grid of histograms: one panel per numerical predictor, 30 bins each.
# Panels are laid out two per row; rows = ceil(n_features / n_cols).
n_features = len(numerical_features)
n_cols = 2
n_rows = -(-n_features // n_cols)

fig = make_subplots(
    rows=n_rows, cols=n_cols,
    subplot_titles=numerical_features,
    specs=[[{"type": "histogram"} for _ in range(n_cols)] for _ in range(n_rows)]
)

for idx, feature in enumerate(numerical_features):
    # divmod gives the zero-based grid cell; plotly's grid is one-based.
    grid_row, grid_col = divmod(idx, n_cols)
    histogram_trace = go.Histogram(
        x=df[feature],
        name=feature,
        marker_color=app_color_palette[idx % len(app_color_palette)],
        showlegend=False,
        nbinsx=30,
    )
    fig.add_trace(histogram_trace, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(
    height=550,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(color='#8B5CF6', size=10),
    title_font=dict(color='#7C3AED', size=14)
)

# The x and y axes of every panel share one theme.
axis_theme = dict(
    gridcolor='rgba(139,92,246,0.2)',
    tickfont=dict(color='#8B5CF6', size=9),
    title_font=dict(color='#7C3AED', size=10),
)
fig.update_xaxes(**axis_theme)
fig.update_yaxes(**axis_theme)

# Persist a standalone HTML copy of the figure, then render it inline.
fig.write_html(
    "/Users/avivnahon/ds-agent-projects/session_89600b04-b810-4506-b66d-91e28f4f611b/research/plots/numerical_features_distribution.html",
    include_plotlyjs=True,
    config={'responsive': True, 'displayModeBar': False},
)
fig.show()

## 6. Feature Correlations Analysis

In [None]:
# Encode the yes/no target as 1/0 on a separate frame so it can take part in
# the numeric correlation matrix; the original `df` is left untouched.
encoded_target = df[target_col].map({'yes': 1, 'no': 0})
df_corr = df.assign(**{target_col: encoded_target})

# Pairwise correlations between the numerical predictors and the target.
corr_columns = [*numerical_features, target_col]
correlation_matrix = df_corr[corr_columns].corr()
print("Correlation with target variable:")
# Drop the target's self-correlation and rank by absolute strength.
target_corr = (
    correlation_matrix[target_col]
    .drop(target_col)
    .sort_values(key=abs, ascending=False)
)
print(target_corr)

# Heatmap of the full matrix with the coefficient printed in each cell.
fig = px.imshow(
    correlation_matrix,
    text_auto=True,
    aspect="auto",
    color_continuous_scale='RdBu_r',
    labels=dict(color="Correlation"),
)

# Both axes use the same tick/title fonts, so the theme is defined once.
heatmap_axis = dict(
    tickfont=dict(color='#8B5CF6', size=11),
    title_font=dict(color='#7C3AED', size=12),
)
fig.update_layout(
    height=550,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(color='#8B5CF6', size=12),
    title_font=dict(color='#7C3AED', size=16),
    xaxis=dict(heatmap_axis),
    yaxis=dict(heatmap_axis),
)

# Persist a standalone HTML copy of the figure, then render it inline.
fig.write_html(
    "/Users/avivnahon/ds-agent-projects/session_89600b04-b810-4506-b66d-91e28f4f611b/research/plots/correlation_matrix.html",
    include_plotlyjs=True,
    config={'responsive': True, 'displayModeBar': False},
)
fig.show()

## 7. Categorical Features vs Target Analysis

In [None]:
# For each categorical predictor, print the row-normalised cross-tabulation
# against the target: the share (in %) of each target class per category.
print("Categorical features vs Target variable analysis:")
for feature in categorical_features:
    print(f"\n{feature} vs Target:")
    share_table = pd.crosstab(df[feature], df[target_col], normalize='index') * 100
    print(share_table.round(2))
    print("=" * 50)

# Plot the positive-class rate per category for the first six categorical
# predictors only, to keep the 2 x 3 grid readable.
selected_categorical = categorical_features[:6]

fig = make_subplots(
    rows=2, cols=3,
    subplot_titles=selected_categorical,
    specs=[[{"type": "bar"} for _ in range(3)] for _ in range(2)]
)

# Boolean indicator of the positive class; its per-category mean is the
# positive rate.
is_positive = df[target_col].eq('yes')

for idx, feature in enumerate(selected_categorical):
    # divmod gives the zero-based grid cell; plotly's grid is one-based.
    grid_row, grid_col = divmod(idx, 3)

    # Percentage of 'yes' outcomes within each category of this feature.
    positive_rate = is_positive.groupby(df[feature]).mean() * 100

    rate_trace = go.Bar(
        x=positive_rate.index,
        y=positive_rate.values,
        name=feature,
        marker_color=app_color_palette[idx % len(app_color_palette)],
        showlegend=False,
    )
    fig.add_trace(rate_trace, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(
    height=550,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(color='#8B5CF6', size=10),
    title_font=dict(color='#7C3AED', size=14)
)

# Category labels can be long, so the x tick font is one point smaller.
fig.update_xaxes(
    gridcolor='rgba(139,92,246,0.2)',
    tickfont=dict(color='#8B5CF6', size=8),
    title_font=dict(color='#7C3AED', size=10)
)

fig.update_yaxes(
    gridcolor='rgba(139,92,246,0.2)',
    tickfont=dict(color='#8B5CF6', size=9),
    title_font=dict(color='#7C3AED', size=10),
    title_text="Positive Rate (%)"
)

# Persist a standalone HTML copy of the figure, then render it inline.
fig.write_html(
    "/Users/avivnahon/ds-agent-projects/session_89600b04-b810-4506-b66d-91e28f4f611b/research/plots/categorical_vs_target.html",
    include_plotlyjs=True,
    config={'responsive': True, 'displayModeBar': False},
)
fig.show()

## 8. Numerical Features vs Target Analysis

In [None]:
# Per-class summary statistics for every numerical predictor.
print("Numerical features by target class:")
for col in numerical_features:
    print(f"\n{col} statistics by target:")
    stats_by_target = df.groupby(target_col)[col].describe()
    print(stats_by_target)
    print("="*50)

# Box plots of each numerical predictor split by target class, two panels
# per row; rows = ceil(n_features / n_cols).
n_features = len(numerical_features)
n_cols = 2
n_rows = (n_features + n_cols - 1) // n_cols

fig = make_subplots(
    rows=n_rows, cols=n_cols,
    subplot_titles=numerical_features,
    specs=[[{"type": "box"}] * n_cols for _ in range(n_rows)]
)

# The row mask of each class does not depend on the feature, so compute it
# once instead of re-filtering the frame for every subplot.
target_classes = ['no', 'yes']
class_masks = {target_val: df[target_col] == target_val for target_val in target_classes}

for i, col in enumerate(numerical_features):
    row = (i // n_cols) + 1
    col_pos = (i % n_cols) + 1

    for j, target_val in enumerate(target_classes):
        fig.add_trace(
            go.Box(
                y=df.loc[class_masks[target_val], col],
                # Name the trace after the class only. The legend is shown
                # once (first subplot) and, through legendgroup, stands for
                # that class in every panel, so a feature-specific name such
                # as "<first feature>_no" would be misleading. The feature
                # name is already given by the subplot title.
                name=target_val,
                marker_color=app_color_palette[j],
                showlegend=(i == 0),  # Show legend only for first subplot
                legendgroup=target_val,
                boxpoints='outliers'
            ),
            row=row, col=col_pos
        )

fig.update_layout(
    height=550,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font=dict(color='#8B5CF6', size=10),
    title_font=dict(color='#7C3AED', size=14),
    legend=dict(font=dict(color='#8B5CF6', size=11))
)

fig.update_xaxes(
    gridcolor='rgba(139,92,246,0.2)',
    tickfont=dict(color='#8B5CF6', size=9),
    title_font=dict(color='#7C3AED', size=10)
)

fig.update_yaxes(
    gridcolor='rgba(139,92,246,0.2)',
    tickfont=dict(color='#8B5CF6', size=9),
    title_font=dict(color='#7C3AED', size=10)
)

# Persist a standalone HTML copy of the figure, then render it inline.
fig.write_html("/Users/avivnahon/ds-agent-projects/session_89600b04-b810-4506-b66d-91e28f4f611b/research/plots/numerical_vs_target.html", 
               include_plotlyjs=True, config={'responsive': True, 'displayModeBar': False})
fig.show()

## Summary and Key Findings

This EDA has revealed important characteristics of the Bank Marketing dataset that will inform our model development approach:

1. **Class Imbalance**: The dataset shows significant class imbalance with approximately 88.3% negative cases (no subscription) and 11.7% positive cases (subscription).

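2. **Data Quality**: The null check in Section 2 (`df.isnull().sum()`) found no missing values, which simplifies preprocessing. Note that this check only detects NaN/None entries; placeholder category labels, if any are present, would instead show up in the value counts in Section 4.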

3. **Feature Types**: Mix of categorical and numerical features, with categorical features showing varying cardinality.

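4. **Outliers**: Several numerical features contain outliers (values more than 1.5 × IQR below the first quartile or above the third quartile, as computed in Section 5) that may require handling.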

5. **Feature Relationships**: Some features show meaningful relationships with the target variable that can be leveraged for prediction.

These insights will guide our preprocessing, feature engineering, and model selection strategies.