In [None]:
# Importing packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import scipy.stats as stats

In [None]:
# Reading in datasets
# Every table is read from a CSV path relative to the notebook's working directory.
# NOTE(review): `engagement` is loaded here but is not referenced by any cell visible in this notebook.
customers = pd.read_csv("../../data/processed/customer.csv")
digital_usage = pd.read_csv("../../data/processed/digital_usage.csv")
engagement = pd.read_csv("../../data/processed/engagement_details.csv")
loans = pd.read_csv("../../data/processed/loans.csv")
products = pd.read_csv("../../data/processed/products_owned.csv")
transactions = pd.read_csv("../../data/processed/transactions.csv")
# The segment labels come from a different directory than the processed data.
segments = pd.read_csv("../../customer_segmentation/customer_segments.csv")

#### **Data Inspection & Preparation**

##### Customers Dataset

In [None]:
# Attaching each customer's profile to their segment label.
# DataFrame.merge defaults to an inner join, so only ids present in both tables survive.
df = segments.merge(customers, on='customer_id')

In [None]:
# Shape of the merged segments + customers frame, followed by its first rows.
print("Dimensions:", df.shape)
df.head()

In [None]:
# Column dtypes and non-null counts of the merged frame (df.info() prints directly).
print("DataFrame Structure:")
df.info()

In [None]:
# Summary statistics for every numeric column except the identifier, then a preview.
print("Descriptive Statistics:")
descriptive_stats = df.drop(columns='customer_id').describe()
print(descriptive_stats)
df.head()

In [None]:
# Converting 'job', 'marital', and 'education' columns to category data type
categorical_columns = ['job', 'marital', 'education']
df = df.astype({col: 'category' for col in categorical_columns})

# Creating 'nps_category' column that segments responses into promoters, passives, and detractors.
# pd.cut uses right-closed bins: scores <= 6 are detractors, (6, 8] passives and (8, 10] promoters;
# anything above 10 falls outside the bins and becomes NaN.
nps_bins = [-float('inf'), 6, 8, 10]
nps_labels = ['detractor', 'passive', 'promoter']

# Guarded so the cell can be re-run: after the first run 'nps' no longer exists,
# and an unconditional df['nps'] lookup would raise a KeyError.
if 'nps' in df.columns:
    df['nps_category'] = pd.cut(df['nps'], bins=nps_bins, labels=nps_labels)
    # Dropping the original 'nps' column
    df = df.drop(columns='nps')

# Applying log transformation to 'balance', 'debt', and 'income' to scale and stabilize the values.
# 'balance' uses a signed log so negative balances keep their sign.
# NOTE(review): log1p returns NaN below -1 — assumes 'debt' and 'income' are non-negative; TODO confirm.
df['log_balance'] = np.sign(df['balance']) * np.log1p(np.abs(df['balance']))
df['log_debt'] = np.log1p(df['debt'])
df['log_income'] = np.log1p(df['income'])

# Creating 'debt-to-income' ratio and 'balance-to-debt' ratio columns (ratios of the log-scaled values).
# A zero denominator (income == 0 or debt == 0) would yield +/-inf, which breaks describe(),
# corr() and the boxplots further down, so those rows get NaN instead.
df['debt_to_income'] = df['log_debt'] / df['log_income'].replace(0, np.nan)
df['balance_to_debt'] = df['log_balance'] / df['log_debt'].replace(0, np.nan)

In [None]:
# Summary statistics for the engineered financial features.
engineered_features = ['log_balance', 'log_debt', 'log_income', 'debt_to_income', 'balance_to_debt']
print(df[engineered_features].describe())

In [None]:
# Frequency of each job category, most common first.
job_counts = df['job'].value_counts()
print("Job Distribution:")
print(job_counts)

In [None]:
# Frequency of each marital status, most common first.
marital_counts = df['marital'].value_counts()
print("Marital Status Distribution:")
print(marital_counts)

In [None]:
# Frequency of each education level, most common first.
education_counts = df['education'].value_counts()
print("Education Distribution:")
print(education_counts)

##### Segments Dataset

In [None]:
# Shape and first rows of the raw segment-assignment table.
print("Dimensions", segments.shape)
segments.head()

In [None]:
# Number of customers assigned to each segment.
segment_counts = segments['Segment'].value_counts()
print("Segment Distribution:")
print(segment_counts)

##### Products Dataset

In [None]:
# Shape and first rows of the product-ownership table.
# (Reports products.shape; the previous version printed the segments table's shape here.)
print("Dimensions:", products.shape)
products.head()

In [None]:
# Column dtypes and non-null counts of the product-ownership table.
print("DataFrame Structure:")
products.info()

In [None]:
# Value counts for every product-ownership flag, printed under its own header.
# Headers after the first carry a leading newline to separate the printed tables.
distribution_headers = [
    ("Investment Distribution:", 'has_investment_product'),
    ("\nCredit Card Distribution:", 'has_credit_card'),
    ("\nPersonal Loan Distribution:", 'has_personal_loan'),
    ("\nFixed Deposit Distribution:", 'has_fixed_deposit'),
    ("\nInsurance Distribution:", 'has_insurance'),
]

for header, flag_column in distribution_headers:
    print(header)
    print(products[flag_column].value_counts())

##### Loans Dataset

In [None]:
# Shape and first rows of the loans table.
print("Dimensions:", loans.shape)
loans.head()

In [None]:
# Column dtypes and non-null counts of the loans table.
loans.info()

In [None]:
# Parsing the two loan date columns into datetimes.
date_columns = ['due_date', 'paid_off_date']
for date_col in date_columns:
    loans[date_col] = pd.to_datetime(loans[date_col])

# 'days_past_due' = whole days between the due date and the pay-off date
# (negative when the loan was paid off before it was due).
# NOTE(review): rows where either date is missing are filled with 0, so they become
# indistinguishable from loans paid exactly on the due date — confirm this is intended.
loans['days_past_due'] = (loans['paid_off_date'] - loans['due_date']).dt.days.fillna(0)

# The raw date columns are no longer needed once the difference is stored.
loans.drop(columns=date_columns, inplace=True)

In [None]:
# Preview of the loans table to check the result of the preceding transformation.
loans.head()

In [None]:
# Summary statistics for the numeric loan columns.
loans.describe()

In [None]:
# Frequency of each raw loan purpose, most common first.
loan_purpose_counts = loans['loan_purpose'].value_counts()
print("Loan Purpose Distribution:")
print(loan_purpose_counts)

In [None]:
# Categorizing loan purposes into broader categories and creating 'loan_category' column
def categorize_loan_purpose(purpose):
    """Map a raw loan purpose onto one of six broader loan categories.

    Matching is exact and case-sensitive. Any value that is not listed below
    (including missing values) is returned as 'Miscellaneous'.
    """
    purpose_groups = (
        (('debt_consolidation',), 'Debt Management'),
        (('credit_card',), 'Credit Product'),
        (('housing-related', 'major_purchase', 'car'), 'Asset Acquisition'),
        (('small_business', 'educational'), 'Business & Education'),
        (('wedding', 'vacation', 'medical'), 'Lifestyle & Personal'),
    )
    for members, category in purpose_groups:
        if purpose in members:
            return category
    return 'Miscellaneous'

# Deriving the broader category for every loan row, then showing how often each occurs.
loans['loan_category'] = loans['loan_purpose'].apply(categorize_loan_purpose)

loan_category_counts = loans['loan_category'].value_counts()
print("Loan Category Distribution:")
print(loan_category_counts)

##### Transactions Dataset

In [None]:
# Shape and first rows of the transactions table.
print("Dimensions:", transactions.shape)
transactions.head()

In [None]:
# Column dtypes and non-null counts of the transactions table.
print("DataFrame Structure:")
transactions.info()

In [None]:
# Converting 'transaction_type' column to category data type
transactions['transaction_type'] = transactions['transaction_type'].astype('category')

# Parsing 'transaction_date' and keeping only the year-month-day part.
# .dt.normalize() resets the time to midnight while staying in datetime64, which avoids
# the slow round-trip through Python date objects (.dt.date followed by a second to_datetime).
# NOTE(review): for timezone-aware input normalize() keeps the timezone — assumes the CSV
# timestamps are timezone-naive; TODO confirm.
transactions['transaction_date'] = pd.to_datetime(transactions['transaction_date']).dt.normalize()

In [None]:
# Frequency of each transaction type, most common first.
transaction_type_counts = transactions['transaction_type'].value_counts()
print("Transaction Type Distribution:")
print(transaction_type_counts)

In [None]:
# Summary statistics for the transaction amounts.
amount_summary = transactions[['transaction_amt']].describe()
print(amount_summary)

In [None]:
# Categorizing transactions as 'Money In' or 'Money Out', and creating 'money_flow' column
def classify_money_flow(tx_type):
    """Label a transaction type as 'Money In' or 'Money Out'.

    Only the exact, case-sensitive values 'Credit' and 'Deposit' count as inflows;
    every other value, including missing ones, is labelled 'Money Out'.
    """
    inflow_types = ('Credit', 'Deposit')
    return 'Money In' if tx_type in inflow_types else 'Money Out'

# Tagging every transaction row with its flow direction via classify_money_flow.
transactions['money_flow'] = transactions['transaction_type'].apply(classify_money_flow)

##### Digital Usage Dataset

In [None]:
# Shape and first rows of the digital-usage table.
print("Dimensions:", digital_usage.shape)
digital_usage.head()

In [None]:
# Column dtypes and non-null counts of the digital-usage table.
print("DataFrame Structure:")
digital_usage.info()

In [None]:
# Converting 'last_mobile_use' and 'last_web_use' columns to datetime format
digital_usage['last_mobile_use'] = pd.to_datetime(digital_usage['last_mobile_use'])
digital_usage['last_web_use'] = pd.to_datetime(digital_usage['last_web_use'])

# Creating 'mobile_web_ratio' column: weekly mobile logins per weekly web login.
# A customer with zero web logins would get an infinite ratio, which distorts summary
# statistics and boxplots, so the ratio is left undefined (NaN) for those rows.
web_logins = digital_usage['web_logins_wk'].replace(0, np.nan)
digital_usage['mobile_web_ratio'] = digital_usage['mobile_logins_wk'] / web_logins

#### Merging Datasets

In [None]:
# Joining the product, loan, transaction and digital-usage tables onto the customer frame.
# Each step is an inner join on 'customer_id' (the pandas default), so a customer missing
# from any one of the four tables is dropped from `df`.
# NOTE(review): later cells count rows of `df` as transactions, so `df` holds many rows per
# customer. If `loans` also has several rows per customer, every loan is paired with every
# transaction of that customer (rows multiply) — confirm the row grain before aggregating.
# NOTE(review): the cell rebinds `df` from itself, so running it twice merges the tables again.
df = (
    df
    .merge(products, on='customer_id')
    .merge(loans, on='customer_id')
    .merge(transactions, on='customer_id')
    .merge(digital_usage, on='customer_id')
)

#### **NPS Across Customer Segments**

In [None]:
# Calculating the percentage distribution of each NPS category within each segment.
# 'nps_category' is a customer-level attribute, but the merged `df` repeats every customer
# once per joined loan/transaction row. Counting rows would weight each customer by their
# activity volume, so the distribution is computed on one row per customer.
nps_customers = df.drop_duplicates(subset='customer_id')
nps_segment_dist = pd.crosstab(
    nps_customers['Segment'], nps_customers['nps_category'], normalize='index'
) * 100
print(nps_segment_dist)

In [None]:
# Stacked bar chart of the NPS category shares per segment.
# The axes are created explicitly and handed to DataFrame.plot: without `ax=`, pandas
# opens its own figure, which leaves a preceding plt.figure(figsize=...) empty and
# silently ignores the requested size.
fig, ax = plt.subplots(figsize=(8, 6))
nps_segment_dist.plot(kind='bar', stacked=True, ax=ax)
ax.set_title('Distribution of NPS Categories Across Customer Segments')
ax.set_xlabel('Customer Segment')
ax.set_ylabel('Percentage (%)')
ax.legend(title='NPS Category', bbox_to_anchor=(1.05, 1), loc='upper left')

# Writing each segment's percentage in the middle of its stacked bar piece.
for p in ax.patches:
    height = p.get_height()
    width = p.get_width()
    x = p.get_x() + width / 2
    y = p.get_y() + height / 2
    ax.annotate(f'{height:.2f}%', (x, y), ha='center', va='center', color='black', fontsize=10)

plt.tight_layout()
plt.show()

**Key insights:**

- High-value customers show strong loyalty but need focus on converting passives. 

- Budget-conscious customers have the highest dissatisfaction, signaling value gaps. 

- At-risk/inactive customers are polarized, reflecting uneven retention success. 

- Passives across segments represent a key opportunity to boost advocacy through tailored incentives and improved engagement.

#### **Financial Health Across Customer Segments**

In [None]:
# Computing the correlation matrix for the selected variables to assess relationships between them.
# All five variables are customer-level, while the merged `df` repeats each customer once per
# joined loan/transaction row; the correlation is therefore computed on one row per customer.
# Infinite ratios (from a zero denominator) are masked so they cannot turn coefficients into NaN.
variables = ['log_balance', 'log_debt', 'log_income', 'debt_to_income', 'balance_to_debt']
financial_customers = df.drop_duplicates(subset='customer_id')
correlation_matrix = financial_customers[variables].replace([np.inf, -np.inf], np.nan).corr()
print(correlation_matrix)

In [None]:
# Annotated heatmap of the financial-health correlation matrix.
# sns.heatmap returns the Axes it drew on, so the title is set on that object.
heatmap_ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Correlation Matrix for Financial Health Metrics')
plt.show()

**Key insights:**  

Higher income and savings (**log_income** and **log_balance**) strongly correlate with improved financial health metrics like **balance_to_debt**, while rising debt (**log_debt**) worsens **debt_to_income** ratios. Notably, income growth and savings accumulation counteract debt burdens, suggesting that boosting income and reducing debt are critical for financial stability. The near-perfect link between **log_balance** and **balance_to_debt** underscores savings as a key lever for debt management.

In [None]:
# Side-by-side boxplots of the two financial ratios per segment.
# Both ratios are customer-level; the merged `df` repeats each customer once per joined
# loan/transaction row, which would skew the quartiles towards the most active customers.
# The plots therefore use one row per customer.
ratio_customers = df.drop_duplicates(subset='customer_id')

fig, ax = plt.subplots(1, 2, figsize=(14, 6))

# Boxplot of 'debt-to-income' ratio by customer segment
sns.boxplot(x='Segment', y='debt_to_income', data=ratio_customers, ax=ax[0])
ax[0].set_title('Debt-to-Income Ratio by Customer Segment')
ax[0].set_xlabel('Customer Segment')
ax[0].set_ylabel('Debt-to-Income Ratio')

# Boxplot of 'balance-to-debt' ratio by customer segment
sns.boxplot(x='Segment', y='balance_to_debt', data=ratio_customers, ax=ax[1])
ax[1].set_title('Balance-to-Debt Ratio by Customer Segment')
ax[1].set_xlabel('Customer Segment')
ax[1].set_ylabel('Balance-to-Debt Ratio')

plt.tight_layout()
plt.show()

**Key insights:**

- At-risk customers need urgent debt relief interventions such as restructuring and financial counseling.

- Budget-conscious segments would benefit from income-boosting tools or debt consolidation offers.

- High-value customers should be retained with loyalty programs reinforcing their financial stability.

In [None]:
# Keeping only the rows whose loan has exactly zero days past due.
# NOTE(review): the equality test excludes loans paid off early (negative days) and includes
# loans whose missing dates were filled with 0 upstream — confirm this definition of "on time".
# NOTE(review): printing the full frame relies on pandas truncating the output.
on_time_mask = df['days_past_due'].eq(0)
on_time = df.loc[on_time_mask]
print(on_time)

In [None]:
# Share of rows flagged as on time within each segment, as a percentage.
# Both counts are row counts of the merged frame, aligned on the segment label; a segment
# without any on-time row is absent from the numerator and therefore comes out as NaN.
total_counts = df.groupby('Segment').size()
on_time_counts = on_time.groupby('Segment').size()
on_time_proportion = on_time_counts.div(total_counts).mul(100)
print(on_time_proportion)

In [None]:
# Bar plot of the proportion of customers with on-time loan payments for each segment
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=on_time_proportion.index, y=on_time_proportion.values, palette='viridis', ax=ax)
ax.set_title('Proportion of On-Time Loan Payments (Days Past Due = 0) Across Customer Segments')
ax.set_xlabel('Customer Segments')
ax.set_ylabel('Proportion of On-Time Payments (%)')

# Keep the original 0-30% window when every bar fits, but grow the axis (with headroom for
# the labels) when a segment exceeds it; a fixed limit would silently clip taller bars.
# max(30, nan) evaluates to 30, so an all-NaN series falls back to the default window.
y_upper = max(30, on_time_proportion.max() * 1.15)
ax.set_ylim(0, y_upper)
ax.grid(axis='y')

for p in ax.patches:
    ax.annotate(f'{p.get_height():.2f}%', 
                (p.get_x() + p.get_width() / 2., p.get_height()), 
                ha='center', va='center', 
                color='black', fontsize=10, 
                xytext=(0, 5), textcoords='offset points')  # Adjust label position slightly above the bar

plt.tight_layout()
plt.show()

**Key insights:**  

- High-value customers likely show the highest proportion of on-time loan payments, reflecting financial stability and reliability.

- Budget-conscious customers may exhibit moderate on-time payment rates, balancing frugality with potential cash flow constraints. 

- At-risk/inactive customers likely have the lowest on-time payment rates, aligning with their high debt burdens and financial strain. 

These trends emphasize the need for segment-specific strategies, such as flexible repayment plans for at-risk customers and loyalty incentives for high-value clients, to improve payment behavior and retention.

In [None]:
# Percentage split of loan categories within each segment (each row of the table sums to 100).
# NOTE(review): the shares are computed over rows of the merged `df`; if a loan appears on
# several rows (once per joined transaction), loans of very active customers weigh more —
# confirm the row grain.
loan_cat_share = pd.crosstab(index=df['Segment'], columns=df['loan_category'], normalize='index')
loan_cat_percent = loan_cat_share * 100
print(loan_cat_percent)

**Key Insights:**  

- High-value customers prioritize **asset acquisition** and **business & education loans**, reflecting growth-focused financial behavior.

- At-risk and budget-conscious segments rely heavily on **debt management loans**, highlighting financial stress. 

- Lifestyle/personal loans are minimally used, suggesting limited discretionary borrowing. 

Tailored strategies, such as debt relief for stressed segments and premium investment products for high-value customers, are critical to address distinct priorities.

#### **Product Usage Across Customer Segments**

In [None]:
# Calculating the average product ownership for each segment across specified product columns
product_columns = ['has_investment_product', 'has_credit_card', 
                   'has_personal_loan', 'has_fixed_deposit', 'has_insurance']

# Ownership flags describe customers, but the merged `df` repeats each customer once per
# joined loan/transaction row. Averaging rows would weight customers by activity volume,
# so the proportions are taken over one row per customer.
# NOTE(review): assumes `products` holds a single row per customer — TODO confirm.
product_customers = df.drop_duplicates(subset='customer_id')
usage_summary = product_customers.groupby('Segment')[product_columns].mean()
print("Product Usage Proportions by Segment:")
print(usage_summary)

In [None]:
# Barplot of product usage proportions by customer segment
# Reshaping to long format: one row per (segment, product) pair.
usage_summary_reset = usage_summary.reset_index()
usage_melted = usage_summary_reset.melt(id_vars='Segment', 
                                         var_name='Product', 
                                         value_name='Proportion')

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=usage_melted, x='Product', y='Proportion', hue='Segment', ax=ax)
ax.set_title('Product Usage Proportions by Customer Segment')
ax.set_ylabel('Proportion of Customers Owning Product')
ax.set_xlabel('Product')
ax.legend(title='Segment')
plt.xticks(rotation=45)

# Labelling bars through the bar containers, as the money-flow plots in this notebook do.
# Iterating ax.patches is less reliable for hue plots: depending on the seaborn version it
# can also contain zero-sized legend proxy rectangles, which show up as stray "0.00" labels.
for container in ax.containers:
    ax.bar_label(container, fmt='%.2f', padding=3, fontsize=10, color='black')

plt.tight_layout()
plt.show()

In [None]:
# Annotated heatmap of the per-segment product ownership proportions.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(usage_summary, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Heatmap of Product Usage Proportions by Segment')
ax.set_ylabel('Customer Segment')
ax.set_xlabel('Product')
plt.show()

In [None]:
# Performing Chi-square test for statistical significance in product usage differences across segments.
# The test assumes independent observations. The merged `df` repeats each customer once per
# joined loan/transaction row, which inflates every cell of the contingency table and drives
# the p-values towards zero, so the tables are built from one row per customer.
# NOTE(review): assumes `products` holds a single row per customer — TODO confirm.
chi_square_customers = df.drop_duplicates(subset='customer_id')

for product in product_columns:
    print(f"Chi-Square Test for {product}")
    
    ct = pd.crosstab(chi_square_customers['Segment'], chi_square_customers[product])
    print("Contingency Table:")
    print(ct)
    
    chi2, p, dof, expected = stats.chi2_contingency(ct)

    print(f"Chi-square Statistic: {chi2:.4f}")
    print(f"Degrees of Freedom: {dof}")
    print(f"p-value: {p:.4f}")
    print("Expected Frequencies:")
    print(expected)
    
    # 5% significance level
    if p < 0.05:
        print("=> The difference in usage across segments is statistically significant.")
    else:
        print("=> The difference in usage across segments is not statistically significant.")

**Summary of key findings:**

- High-value customers show strong adoption of premium products (credit cards, insurance, home loans), reflecting financial stability and diverse needs.

- Budget-conscious customers prioritize essentials (credit cards, personal loans) but underutilize investments, signaling a focus on affordability. 

- At-risk customers rely heavily on personal loans and lack savings and insurance products, highlighting debt-driven financial stress. 

- Fixed deposits show no significant segment differences, acting as a universal safe option.

Marketing can be tailored to upsell premium services for high-value customers, promote debt management tools for at-risk groups, and incentivize savings/credit products for budget-conscious users to align with segment-specific behaviors and needs.

#### **Transaction History Across Customer Segments**

In [None]:
# Row counts of the merged frame, read as transaction counts.
# NOTE(review): a row of `df` is one combination of the joined tables; it equals one
# transaction only if the other joined tables contribute a single row per customer — confirm.

# Rows per segment
tx_counts = df.groupby('Segment').size().rename('tx_count').reset_index()
print(tx_counts)

# Rows per customer, keeping the customer's segment alongside
customer_tx = df.groupby(['customer_id', 'Segment']).size().rename('tx_count').reset_index()
print(customer_tx)

# Mean of the per-customer counts within each segment
avg_tx_count_by_segment = customer_tx.groupby('Segment', as_index=False)['tx_count'].mean()
print(avg_tx_count_by_segment)

In [None]:
# Transaction types per segment: raw row counts first, then row-wise percentages.
segment_labels = df['Segment']
tx_types = df['transaction_type']

tx_type_distribution = pd.crosstab(index=segment_labels, columns=tx_types)
print(tx_type_distribution)

# normalize='index' divides each row by its total, so every segment sums to 100%.
tx_type_percent = pd.crosstab(index=segment_labels, columns=tx_types, normalize='index') * 100
print(tx_type_percent)

In [None]:
# Money flow direction per segment: raw row counts first, then row-wise percentages.
segment_labels = df['Segment']
flow_labels = df['money_flow']

flow_distribution = pd.crosstab(index=segment_labels, columns=flow_labels)
print(flow_distribution)

# normalize='index' divides each row by its total, so every segment sums to 100%.
flow_percent = pd.crosstab(index=segment_labels, columns=flow_labels, normalize='index') * 100
print(flow_percent)

In [None]:
# Two panels: average transaction count (left) and average transaction value (right) per segment.
fig, ax = plt.subplots(1, 2, figsize=(16, 6))
count_ax, value_ax = ax

# Barplot of the average transaction count by customer segment
sns.barplot(data=avg_tx_count_by_segment, x='Segment', y='tx_count', palette='viridis', ax=count_ax)
count_ax.set_xlabel('Customer Segment')
count_ax.set_ylabel('Average Transaction Count')
count_ax.set_title('Average Transaction Count by Customer Segment')

# Value label centred horizontally on each bar, nudged 5 points above its top.
for patch in count_ax.patches:
    bar_top = patch.get_height()
    bar_centre = patch.get_x() + patch.get_width() / 2.
    count_ax.annotate(f'{bar_top:.2f}',
                      (bar_centre, bar_top),
                      ha='center', va='center',
                      color='black', fontsize=10,
                      xytext=(0, 5), textcoords='offset points')

# Barplot of the average transaction value by customer segment
avg_transaction_value = df.groupby('Segment')['transaction_amt'].mean().reset_index()
bars = value_ax.bar(avg_transaction_value['Segment'], avg_transaction_value['transaction_amt'],
                    color=['skyblue', 'lightgreen', 'lightcoral'])
value_ax.set_xlabel('Customer Segment')
value_ax.set_ylabel('Average Transaction Value')
value_ax.set_title('Average Transaction Value by Customer Segment')

# Value label sitting directly on top of each bar.
for bar in bars:
    bar_top = bar.get_height()
    value_ax.text(bar.get_x() + bar.get_width() / 2,
                  bar_top,
                  f'{bar_top:.2f}',
                  ha='center',
                  va='bottom',
                  fontsize=10,
                  color='black')

plt.tight_layout()
plt.show()

In [None]:
# Grouping by segment and money flow, and summing the transaction amount within each combination
money_summary = df.groupby(['Segment', 'money_flow'])['transaction_amt'].sum().reset_index()

# Calculating the percentage of each money flow type within each segment based on transaction amount
segment_totals = money_summary.groupby('Segment')['transaction_amt'].transform('sum')
money_summary['percentage'] = money_summary['transaction_amt'] / segment_totals * 100
print(money_summary)

# Transforming the money flow percentage data to a long format for easier comparison across segments.
# reset_index() is applied to a copy: resetting `flow_percent` in place would change the table
# built by an earlier cell, and a second run of this cell would then add a stray 'index' column
# to the melted result.
flow_percent_melted = flow_percent.reset_index().melt(id_vars='Segment', var_name='money_flow', value_name='percentage')
print(flow_percent_melted)

In [None]:
# Two panels comparing money in vs. money out per segment:
# left by share of transaction amount, right by share of transaction count.
fig, ax = plt.subplots(1, 2, figsize=(18, 6))

flow_panels = [
    (ax[0], money_summary,
     'Percentage of Money Flow (%)',
     'Percentage of Money In vs. Money Out by Customer Segment'),
    (ax[1], flow_percent_melted,
     'Percentage of Transaction Counts (%)',
     'Percentage of Transaction Counts for Money In vs. Money Out by Customer Segment'),
]

for panel_ax, panel_data, y_label, panel_title in flow_panels:
    sns.barplot(data=panel_data, x='Segment', y='percentage', hue='money_flow', palette='viridis', ax=panel_ax)

    # One container per hue level; label each of its bars with the percentage.
    for container in panel_ax.containers:
        panel_ax.bar_label(container, fmt='%.2f%%', padding=3)

    panel_ax.set_xlabel('Customer Segment')
    panel_ax.set_ylabel(y_label)
    panel_ax.set_title(panel_title)
    panel_ax.legend(title='Money Flow')

plt.tight_layout()
plt.show()

**Summary of key findings:**

- High-value customers lead in transaction frequency and value, reflecting premium financial activity. 

- Budget-conscious users show moderate engagement with balanced money flow, indicating stable budgeting. 

- At-risk customers exhibit minimal activity and skewed outflows, signaling financial stress.  

Prioritizing loyalty incentives for high-value clients, promoting savings tools for budget-conscious users, and offering debt relief programs for at-risk segments can help enhance retention and financial health.

#### **Digital Engagement Across Customer Segments**

In [None]:
# Barplot of the engagement rate for each customer segment.
# The app/web flags describe customers, but the merged `df` repeats each customer once per
# joined loan/transaction row; the means (and seaborn's error bars) are therefore computed
# on one row per customer.
# NOTE(review): assumes `digital_usage` holds a single row per customer — TODO confirm.
digital_customers = df.drop_duplicates(subset='customer_id')

fig, ax = plt.subplots(1, 2, figsize=(14, 7))

engagement_panels = [
    (ax[0], 'has_mobile_app', 'Mobile App Usage Across Segments'),
    (ax[1], 'has_web_account', 'Web Account Usage Across Segments'),
]

for panel_ax, flag_column, panel_title in engagement_panels:
    sns.barplot(x='Segment', y=flag_column, data=digital_customers, ax=panel_ax)
    panel_ax.set_title(panel_title)

    # Label offset of 20 points keeps the text clear of the error bar drawn on top of each bar.
    for p in panel_ax.patches:
        panel_ax.annotate(f'{p.get_height():.2f}', 
                          (p.get_x() + p.get_width() / 2., p.get_height()), 
                          ha='center', va='center', 
                          color='black', fontsize=10, 
                          xytext=(0, 20), textcoords='offset points')

plt.tight_layout()
plt.show()

In [None]:
# Latest mobile and web usage timestamp observed within each segment.
recency_columns = ['last_mobile_use', 'last_web_use']
recency_metrics = df.groupby('Segment')[recency_columns].max()

print(recency_metrics)

In [None]:
# Boxplot of mobile and web engagement ratio across customer segments.
# The ratio is customer-level, so the plot uses one row per customer instead of the merged
# frame's repeated rows. Infinite ratios (zero web logins) are masked because they would
# push the quartiles and whiskers to infinity.
# NOTE(review): assumes `digital_usage` holds a single row per customer — TODO confirm.
ratio_customers = (
    df
    .drop_duplicates(subset='customer_id')
    .assign(mobile_web_ratio=lambda d: d['mobile_web_ratio'].replace([np.inf, -np.inf], np.nan))
)
sns.boxplot(x='Segment', y='mobile_web_ratio', data=ratio_customers)
plt.title('Mobile vs Web Engagement Ratio Across Segments')
plt.show()

In [None]:
# Identifying inactive users (those with no mobile or web logins).
# The merged `df` repeats each customer once per joined loan/transaction row, so the filter
# runs on one row per customer; otherwise the result counts rows rather than users.
# NOTE(review): assumes `digital_usage` holds a single row per customer — TODO confirm.
digital_customers = df.drop_duplicates(subset='customer_id')
no_mobile_logins = digital_customers['mobile_logins_wk'] == 0
no_web_logins = digital_customers['web_logins_wk'] == 0
inactive_users = digital_customers[no_mobile_logins & no_web_logins]
inactive_by_segment = inactive_users.groupby('Segment').size()
print(inactive_by_segment)

**Summary of key insights:**  

- High-value customers show the highest mobile app engagement and mobile-web ratio, indicating strong digital adoption. 

- Budget-conscious users have moderate engagement but include inactive users requiring re-engagement. 

- At-risk/inactive customers lag in mobile activity and show older mobile logins.  

Prioritizing mobile-exclusive perks for high-value clients, incentivizing app logins for budget-conscious inactive users, and simplifying digital tools for at-risk segments can help to boost engagement.

#### **Targeted Marketing Approaches for each Customer Segment**

High-value customers drive growth, budget-conscious customers need stability, and at-risk customers require urgent support to prevent churn.

**1. High-Value Customers**  

Objective: Retain loyalty, deepen engagement, and cross-sell premium products.  

Advocacy:  
- Launch a referral programme (e.g. reward promoters with travel miles or cashback for referrals).  
- Convert passives with exclusive wealth management perks (e.g. priority access to financial advisors).  

Product Strategy:  
- Promote asset-building products (e.g. mortgages, business loans) and insurance bundles (life + property insurance).  
- Cross-sell miscellaneous loans (e.g. luxury travel financing) to cater to diverse needs.  

Digital Engagement:  
- Offer app-exclusive deals (e.g. lower interest rates for investments made via mobile).  
- Highlight real-time portfolio tracking and premium features in-app.  

**2. Budget-Conscious Customers**  

Objective: Reduce dissatisfaction, improve financial stability, and incentivize savings.  

Address Pain Points:  
- Target detractors with cost-saving tools (e.g. fee waivers for maintaining a savings balance).  
- Offer debt consolidation loans and cashback on essential spending (groceries, utilities).  

Product Strategy:  
- Promote fixed deposits with higher interest rates for consistent savers.  
- Introduce low-risk investment products (e.g. index funds) via email/SMS campaigns.  

Digital Engagement:  
- Re-engage inactive users with login rewards (e.g. $5 bonus for weekly app logins).  
- Simplify budgeting tools on mobile/web for easier debt management.  

**3. At-Risk/Inactive Customers**  

Objective: Mitigate financial strain, rebuild trust, and reactivate engagement. 

Crisis Interventions:  
- Provide debt relief programs (e.g. interest-free grace periods, restructuring).  
- Offer free financial counseling (e.g. credit score workshops via webinars).  

Product Strategy:  
- Push debt management tools (e.g. automated payment reminders with fee waivers).  
- Simplify product terms (e.g. no-fee basic accounts) to reduce complexity.  

Digital Engagement:  
- Streamline mobile/web interfaces for easier navigation (e.g. one-click payment options).  
- Trigger reactivation campaigns (e.g. "We miss you" SMS with small-balance loan offers).  

**Cross-Segment Priorities**  
- Passive Customers:  
  - All Segments: Offer tiered incentives (e.g. cashback for increased transaction frequency).  
- Underutilized Products:  
  - Test lifestyle/personal loan promotions.  