## Customer-Level EDA
This notebook analyses customer demographics, digital usage, product ownership, and transaction behavior in relation to whether the customer has engaged (`has_engaged`) in any campaign.

In [1]:
# Load packages
import pandas as pd
import eda_functions as eda
from load_data import load_customer_data
from feature_engineering import (create_customer_engagement_flag,
                                 summarize_transactions,
                                 engineer_digital_usage,
                                 count_products_owned
                                 )
from business_rules import define_high_value_user, is_recently_active, is_multichannel_user
from data_quality import print_null_summary, print_shape_and_preview, check_post_merge_nulls, impute_missing_values

In [2]:
# Load data
engagement_details, customers, digital_usage, products_owned, transactions = load_customer_data()
target_col = "has_engaged"

### Preliminary EDA

In [3]:
# EDA plots
eda.plot_numeric_distributions(customers, prefix="customers")
eda.plot_numeric_distributions(digital_usage, prefix="customers")
eda.plot_product_ownership_barplot(products_owned, "customer_id")

### Feature Engineering

In [4]:
customer_engagement = create_customer_engagement_flag(engagement_details)
print_null_summary(customer_engagement, "customer_engagement")
print_shape_and_preview(customer_engagement, "customer_engagement")


Null Summary for customer_engagement:
              Null Count  Null %
customer_id           0     0.0
has_engaged           0     0.0

customer_engagement shape: (4000, 2)

customer_engagement preview:
   customer_id  has_engaged
0            1            0
1            2            1
2            3            1
3            4            1
4            5            1


In [5]:
transaction_summary = summarize_transactions(transactions)
test_df = customer_engagement.merge(transaction_summary, on='customer_id', how='left')
check_post_merge_nulls(test_df, ["last_transaction_date"], "Engagement + Transactions")


Engagement + Transactions - Null % in columns:
last_transaction_date    53.45
dtype: float64


In [6]:
# Digital usage transformation
digital_usage = engineer_digital_usage(digital_usage)
print_null_summary(digital_usage, "digital_usage")
print_shape_and_preview(digital_usage, "digital_usage")

eda.check_missing_correlation(digital_usage, "days_since_mobile_use", "has_mobile_app")
eda.check_missing_correlation(digital_usage, "days_since_web_use", "has_web_account")


Null Summary for digital_usage:
                             Null Count  Null %
customer_id                          0     0.0
has_mobile_app                       0     0.0
has_web_account                      0     0.0
days_since_mobile_use                0     0.0
days_since_web_use                   0     0.0
total_logins_per_week                0     0.0
avg_total_time_per_session           0     0.0

digital_usage shape: (4000, 7)

digital_usage preview:
   customer_id  has_mobile_app  has_web_account  days_since_mobile_use  \
0          959               1                1                   54.0   
1            2               0                1                  999.0   
2            3               1                1                  529.0   
3            4               1                1                  108.0   
4            5               1                1                   75.0   

   days_since_web_use  total_logins_per_week  avg_total_time_per_session  
0             

Series([], Name: count, dtype: int64)

In [7]:
# Products owned
products_owned = count_products_owned(products_owned)
print_null_summary(products_owned, "products_owned")
print_shape_and_preview(products_owned, "products_owned")


Null Summary for products_owned:
                         Null Count  Null %
customer_id                      0     0.0
has_investment_product           0     0.0
has_credit_card                  0     0.0
has_personal_loan                0     0.0
has_fixed_deposit                0     0.0
has_insurance                    0     0.0
num_products_owned               0     0.0

products_owned shape: (4000, 7)

products_owned preview:
   customer_id  has_investment_product  has_credit_card  has_personal_loan  \
0          217                       1                0                  1   
1          179                       0                1                  1   
2           81                       0                0                  1   
3           50                       0                0                  0   
4           13                       1                0                  1   

   has_fixed_deposit  has_insurance  num_products_owned  
0                  0              0 

In [8]:
# Merge all features
combined_df = (
    customers
    .merge(customer_engagement, on="customer_id", how="left")
    .merge(digital_usage, on="customer_id", how="left")
    .merge(transaction_summary, on="customer_id", how="left")
    .merge(products_owned[["customer_id", "num_products_owned"]], on="customer_id", how="left")
)

In [10]:
combined_df.columns

Index(['customer_id', 'age', 'job', 'marital', 'education', 'default',
       'balance', 'debt', 'income', 'tenure', 'nps', 'dependents',
       'customer_lifetime_value', 'has_engaged', 'has_mobile_app',
       'has_web_account', 'days_since_mobile_use', 'days_since_web_use',
       'total_logins_per_week', 'avg_total_time_per_session',
       'total_transaction_amt', 'transaction_count', 'last_transaction_date',
       'num_products_owned', 'is_high_value_user'],
      dtype='object')

In [11]:
# High-value user flag
combined_df["is_high_value_user"] = define_high_value_user(combined_df)
# Transaction frequency
combined_df["transaction_frequency"] = combined_df["transaction_count"] / combined_df["tenure"]
combined_df[["total_transaction_amt", "transaction_count", "transaction_frequency"]] = combined_df[[
    "total_transaction_amt", "transaction_count", "transaction_frequency"]].fillna(0)
combined_df.drop(columns=["last_transaction_date"], inplace=True)
# Active in the last 30 days flag
combined_df["is_recently_active"] = is_recently_active(combined_df, days=30)
# Customers with both mobile and web usage flag
combined_df["is_multichannel_user"] = is_multichannel_user(combined_df)

In [12]:
# Final checks
print_null_summary(combined_df, "combined_df")
print_shape_and_preview(combined_df, "combined_df")


Null Summary for combined_df:
                             Null Count  Null %
has_mobile_app                      19    0.47
avg_total_time_per_session          19    0.47
total_logins_per_week               19    0.47
days_since_web_use                  19    0.47
days_since_mobile_use               19    0.47
has_web_account                     19    0.47
customer_id                          0    0.00
is_recently_active                   0    0.00
transaction_frequency                0    0.00
is_high_value_user                   0    0.00
num_products_owned                   0    0.00
transaction_count                    0    0.00
total_transaction_amt                0    0.00
has_engaged                          0    0.00
age                                  0    0.00
customer_lifetime_value              0    0.00
dependents                           0    0.00
nps                                  0    0.00
tenure                               0    0.00
income                      

In [13]:
# Impute missing values
combined_df = impute_missing_values(combined_df)

In [14]:
# Value counts for categorical variables
categorical_cols = eda.get_categorical_columns(combined_df)
for col in categorical_cols:
    print(f"Value counts for {col}:\n{combined_df[col].value_counts()}")

Value counts for marital:
marital
married     2298
single      1275
divorced     446
Name: count, dtype: int64
Value counts for has_mobile_app:
has_mobile_app
1.0    2939
0.0    1080
Name: count, dtype: int64
Value counts for dependents:
dependents
3    1067
2    1040
1     752
4     645
0     357
5     158
Name: count, dtype: int64
Value counts for default:
default
0    3374
1     645
Name: count, dtype: int64
Value counts for has_web_account:
has_web_account
1.0    3539
0.0     480
Name: count, dtype: int64
Value counts for is_high_value_user:
is_high_value_user
0    3595
1     424
Name: count, dtype: int64
Value counts for job:
job
entrepreneur     534
unknown          445
self-employed    437
management       383
blue-collar      356
technician       355
services         324
admin.           308
housemaid        296
retired          274
student          166
unemployed       141
Name: count, dtype: int64
Value counts for is_multichannel_user:
is_multichannel_user
1    2920
0    1099

### Relationship Analysis

In [15]:
# Set df and target column
df = combined_df.copy()
target_col = "has_engaged"

In [16]:
# 1. Boxplots for numerical variables by engagement
eda.get_boxplot(df, target_col)

In [17]:
# 2. T-tests for numerical variables
ttest_results = eda.get_ttest(df, target_col)
print("T-test Results:\n", ttest_results)

T-test Results:
                        Feature  T-Statistic   P-Value
14       transaction_frequency     1.167632  0.243035
12       total_transaction_amt     0.917487  0.358953
13           transaction_count     0.840294  0.400801
8                       income     0.723748  0.469268
10                      tenure    -0.645341  0.518747
7                         debt     0.524332  0.600078
4      customer_lifetime_value     0.508185  0.611354
3                  customer_id     0.496427  0.619623
9                          nps     0.482224  0.629676
11       total_logins_per_week    -0.477696  0.632895
2                      balance     0.375547  0.707277
5        days_since_mobile_use     0.358095  0.720293
6           days_since_web_use    -0.317237  0.751083
0                          age    -0.246405  0.805383
1   avg_total_time_per_session     0.001399  0.998883


In [18]:
# 3. Proportion tables & bar plots for categorical columns against `has_engaged`
tables = eda.get_proportion_table(df, target_col)
    
barplots = eda.get_barplot(df, target_col)


Proportion Table for marital:
has_engaged         0         1
marital                        
divorced     0.408072  0.591928
married      0.412968  0.587032
single       0.421176  0.578824

Proportion Table for has_mobile_app:
has_engaged            0         1
has_mobile_app                    
0.0             0.417593  0.582407
1.0             0.414086  0.585914

Proportion Table for dependents:
has_engaged         0         1
dependents                     
0            0.400560  0.599440
1            0.405585  0.594415
2            0.390385  0.609615
3            0.448922  0.551078
4            0.421705  0.578295
5            0.398734  0.601266

Proportion Table for default:
has_engaged         0         1
default                        
0            0.418494  0.581506
1            0.396899  0.603101

Proportion Table for has_web_account:
has_engaged             0         1
has_web_account                    
0.0              0.429167  0.570833
1.0              0.413111  0.586889

In [19]:
# 4. Chi-square test results
chi2_results = eda.get_chi_square(df, target_col)
print("\nChi-Square Test Results:\n", chi2_results)


Chi-Square Test Results:
                  Feature  Chi-Square   P-Value
2             dependents    8.525681  0.129546
9     is_recently_active    1.670186  0.196234
3                default    0.953079  0.328937
10             education    2.804337  0.422787
5     is_high_value_user    0.454920  0.500008
4        has_web_account    0.385096  0.534889
6                    job    9.012204  0.620766
7   is_multichannel_user    0.099124  0.752884
0                marital    0.327602  0.848911
1         has_mobile_app    0.026853  0.869835
8     num_products_owned    0.856228  0.973309
