## Customer-Level EDA
This notebook analyses customer demographics, digital usage, product ownership, and transaction behavior in relation to whether the customer has engaged (`has_engaged`) in any campaign.

In [None]:
# Load packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import importlib
import eda_functions
importlib.reload(eda_functions)
import eda_functions as eda

In [None]:
# Load the processed input tables, one DataFrame per CSV.
# The relative paths are resolved against the notebook's working directory.
engagement_details, customers, digital_usage, products_owned, transactions = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/engagement_details.csv",
        "../data/processed/customer.csv",
        "../data/processed/digital_usage.csv",
        "../data/processed/products_owned.csv",
        "../data/processed/transactions.csv",
    )
]

In [None]:
# Univariate EDA: one histogram (30 bins, KDE overlay) per key numeric customer feature
numeric_cols = ['age', 'income', 'balance', 'debt', 'tenure', 'nps', 'customer_lifetime_value']
for feature in numeric_cols:
    # A fresh figure per feature so each distribution is rendered on its own
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(customers[feature], bins=30, kde=True, ax=ax)
    ax.set_title(f'Distribution of {feature}')
    plt.show()

In [None]:
# Digital usage: one histogram (30 bins, KDE overlay) per usage metric
usage_cols = ['mobile_logins_wk', 'web_logins_wk', 'avg_mobile_time', 'avg_web_time']
for metric in usage_cols:
    # Missing values are removed explicitly before plotting
    observed = digital_usage[metric].dropna()
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.histplot(observed, bins=30, kde=True, ax=ax)
    ax.set_title(f'Distribution of {metric}')
    plt.show()

In [None]:
# Product ownership: column means of every non-identifier column, shown as a
# descending bar chart (the means are proportions assuming 0/1 flags - TODO confirm)
product_cols = [name for name in products_owned.columns if name != 'customer_id']
ownership_rate = products_owned[product_cols].mean().sort_values(ascending=False)
ownership_rate.plot.bar()
plt.title("Proportion of Customers Owning Each Product")
plt.ylabel("Proportion")
plt.show()

In [None]:
# Collapse engagement_details to one row per customer: has_engaged becomes the
# maximum value seen across that customer's rows (i.e. 1 if engaged in any
# campaign, assuming has_engaged is a 0/1 flag - TODO confirm)
customer_engagement = engagement_details.groupby("customer_id", as_index=False)["has_engaged"].max()

# Sanity checks: missing targets, table size, and a quick preview
null_count = customer_engagement["has_engaged"].isna().sum()
print(f"Number of nulls in 'has_engaged': {null_count}")

print(f"\nShape of customer_engagement: {customer_engagement.shape}")
print(f"\nFirst few rows of customer_engagement:\n{customer_engagement.head()}")

In [None]:
# Aggregate transactions to one row per customer:
#   total_transaction_amt - sum of transaction_amt
#   transaction_count     - number of non-null transaction_id values
#   last_transaction_date - maximum transaction_date
transaction_summary = (
    transactions
    .groupby("customer_id")
    .agg(
        total_transaction_amt=("transaction_amt", "sum"),
        transaction_count=("transaction_id", "count"),
        last_transaction_date=("transaction_date", "max")
    )
    .reset_index()
)

# Percentage of nulls per column after left-joining the summary onto
# customer_engagement; a null transaction column means the customer has no
# rows in transactions.
test_df = customer_engagement.merge(transaction_summary, on='customer_id', how='left')
# Scale to percent first and round last, so the printed values carry no
# floating-point noise (rounding before multiplying by 100 can reintroduce it).
null_percentages_test = (test_df.isnull().mean() * 100).round(2)
null_percentages_test = null_percentages_test.sort_values(ascending=False)
print(f"Percentage of nulls in transaction columns after merge:\n{null_percentages_test}")

# Observation recorded by the author: ~54% of customers have never transacted.
# The denominator here is customers present in customer_engagement, not the
# full customers table.
# This may include inactive, new, or digitally engaged but not monetized customers.

In [None]:
# Feature engineering on digital usage

# Recency: parse each last-use date, then express it as whole days before a
# fixed reference date
reference_date = pd.to_datetime('2025-01-01')
recency_features = (
    ("last_mobile_use", "days_since_mobile_use"),
    ("last_web_use", "days_since_web_use"),
)
for date_col, days_col in recency_features:
    digital_usage[date_col] = pd.to_datetime(digital_usage[date_col], format="%Y-%m-%d")
    digital_usage[days_col] = (reference_date - digital_usage[date_col]).dt.days

# Cross-channel totals: row-wise sums over the mobile and web columns
login_cols = ["mobile_logins_wk", "web_logins_wk"]
session_time_cols = ["avg_mobile_time", "avg_web_time"]
digital_usage["total_logins_per_week"] = digital_usage[login_cols].sum(axis=1)
digital_usage["avg_total_time_per_session"] = digital_usage[session_time_cols].sum(axis=1)

# The raw inputs are no longer needed once the derived features exist
digital_usage = digital_usage.drop(
    columns=["last_mobile_use", "last_web_use", *login_cols, *session_time_cols]
)

print(f"Null counts per column: \n {digital_usage.isnull().sum()}")
print(f"\nShape of digital_usage: {digital_usage.shape}")
print(f"\nFirst few rows of digital_usage:\n{digital_usage.head()}")

In [None]:
# For each recency feature, check how its missing values relate to the matching
# platform-access flag (mobile app / web account)
missingness_checks = (
    ("days_since_mobile_use", "has_mobile_app"),
    ("days_since_web_use", "has_web_account"),
)
for recency_col, access_flag in missingness_checks:
    check = eda.check_missing_correlation(digital_usage, recency_col, access_flag)

# Results interpretation (author's reading of the checks above): customers missing
# days_since_mobile_use or days_since_web_use are those who never had access to
# the respective platform

# Fill the missing values with a large sentinel (999 days) to indicate extreme inactivity
recency_cols = ["days_since_mobile_use", "days_since_web_use"]
digital_usage[recency_cols] = digital_usage[recency_cols].fillna(999)

In [None]:
# Feature engineering on products owned
# num_products_owned is the row-wise sum of every column except the identifier.
# Any num_products_owned column left over from a previous run of this cell is
# excluded as well; otherwise re-running the cell would add the old count into
# the new one and silently inflate it.
ownership_flags = (
    products_owned
    .drop(columns="customer_id")
    .drop(columns="num_products_owned", errors="ignore")
)
products_owned["num_products_owned"] = ownership_flags.sum(axis=1)

# Check for nulls in 'num_products_owned'
# (expected to be 0: a row-wise sum skips missing values by default)
null_count = products_owned["num_products_owned"].isnull().sum()
print(f"Number of nulls in 'num_products_owned': {null_count}")

print(f"\nShape of products_owned: {products_owned.shape}")
print(f"\nFirst few rows of products_owned:\n{products_owned.head()}")

In [None]:
# Merge all features into combined_df: start from customers and left-join each
# customer-level feature table on customer_id, so every customers row is kept
feature_tables = [
    customer_engagement,
    digital_usage,
    transaction_summary,
    products_owned[["customer_id", "num_products_owned"]],
]
combined_df = customers
for feature_table in feature_tables:
    combined_df = combined_df.merge(feature_table, on="customer_id", how="left")

In [None]:

# Create high-value user flag based on median thresholds.
# Both medians are computed before the missing transaction values are filled
# below, so they ignore NaN (spend_median is the median among customers that
# have a transaction total). Comparisons against NaN evaluate to False, so a
# customer with a missing login or spend value is flagged 0.
login_median = combined_df["total_logins_per_week"].median()
spend_median = combined_df["total_transaction_amt"].median()
combined_df["is_high_value_user"] = (
    (combined_df["total_logins_per_week"] > login_median) &
    (combined_df["total_transaction_amt"] > spend_median)
).astype(int)

# Feature: transaction frequency (transaction_count per unit of tenure).
# A tenure of 0 would make the division yield inf, which fillna(0) does not
# replace and dropna() does not treat as missing. Mask zero tenure to NaN so
# those rows are handled by the fill below like any other undefined value.
nonzero_tenure = combined_df["tenure"].where(combined_df["tenure"] != 0)
combined_df["transaction_frequency"] = combined_df["transaction_count"] / nonzero_tenure

# Customers without transactions (or with an undefined frequency) get 0
transaction_feature_cols = ["total_transaction_amt", "transaction_count", "transaction_frequency"]
combined_df[transaction_feature_cols] = combined_df[transaction_feature_cols].fillna(0)

# The raw date is not used as a feature
combined_df = combined_df.drop(columns=["last_transaction_date"])

In [None]:
# Check nulls: per-column null count and null percentage, worst columns first
is_missing = combined_df.isnull()
null_counts = is_missing.sum()
null_percentages = (is_missing.mean() * 100).round(2)
null_summary = pd.DataFrame({"Null Count": null_counts, "Null %": null_percentages})
null_summary = null_summary.sort_values("Null %", ascending=False)
print("Null summary:\n", null_summary)

# Quick preview and size of the merged table
print(f"\nFirst few rows:\n {combined_df.head()}")
print("\nShape of combined_df:", combined_df.shape)

In [None]:
# Remove every row of combined_df that has a null in any column, and report
# how many rows were lost
before = len(combined_df)
combined_df = combined_df.dropna(axis=0, how="any")
after = len(combined_df)

print(f"Dropped {before - after} rows with nulls. Remaining rows: {after}")

In [None]:
# Analysis inputs for the bivariate EDA cells: the name of the target column,
# and a working copy of combined_df so the analysis does not modify it
target_col = 'has_engaged'
df = combined_df.copy(deep=True)

In [None]:
# 1. Boxplots for numerical variables by engagement
# Delegates to the project helper eda.get_boxplot with the working frame and the
# target column name. Presumably it draws one boxplot per numeric column split
# by target_col - TODO confirm against eda_functions.
eda.get_boxplot(df, target_col)

In [None]:
# 2. T-tests for numerical variables
# Delegates to the project helper eda.get_ttest and prints whatever it returns.
# Presumably it compares each numeric column between the two target_col groups -
# TODO confirm the test variant and return format in eda_functions.
ttest_results = eda.get_ttest(df, target_col)
print("T-test Results:\n", ttest_results)

In [None]:
# 3. Proportion tables & bar plots for categorical columns against `has_engaged`
# Both calls delegate to project helpers in eda_functions; their return values
# are kept in `tables` and `barplots`.
# NOTE(review): which columns count as categorical is decided inside the helpers - confirm there.
tables = eda.get_proportion_table(df, target_col)

barplots = eda.get_barplot(df, target_col)

In [None]:
# 4. Chi-square test results
# Delegates to the project helper eda.get_chi_square and prints whatever it returns.
# Presumably it tests association between each categorical column and
# target_col - TODO confirm against eda_functions.
chi2_results = eda.get_chi_square(df, target_col)
print("\nChi-Square Test Results:\n", chi2_results)