In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm

In [8]:
final_df = pd.read_csv("../data/original.csv")

# 1. How can we effectively segment our customers based on their banking behavior and preferences?

## Develop a customer segmentation model using the collected data.

### Feature Engineering Rules

#### 1. Digital Engagement
We will combine the following features to create the `Digital_Engagement` score:
- **PhoneService**: If the customer has phone service (`PhoneService = 1`), add 1 point.
- **InternetService**: 
  - If the customer has **Fiber optic internet** (`InternetService = 1`), add 2 points.
  - If **DSL** (`InternetService = 0`), add 1 point.
- **TechSupport**: 
  - If the customer uses tech support (`TechSupport = 2`), add 2 points.
  - If no tech support (`TechSupport = 0`), add 0 points.
- **PaperlessBilling**: If the customer has paperless billing (`PaperlessBilling = 1`), add 1 point.
- **PaymentMethod**: 
  - If the customer uses **automatic payments** (`PaymentMethod = 0 or 1`), add 2 points.
  - If **electronic check** (`PaymentMethod = 2`), add 1 point.

**Total score range**: 0 to 8 (where 0 means low engagement and 8 means high engagement).

---

#### 2. Financial Status
We will combine the following features to create the `Financial_Status` score:
- **Income_Category**: 
  - If the customer has income in the range `80 - 120` or higher (`120 +`), add 2 points.
  - If `40 - 60`, add 1 point.
- **Credit Score**: 
  - If the credit score is above 700, add 2 points.
  - If between 600 and 700, add 1 point.
- **Outstanding Loans**: 
  - If the loans are less than $20,000, add 2 points.
  - If loans are between $20,000 and $50,000, add 1 point.
- **Balance**: 
  - If the balance is higher than $100,000, add 2 points.
  - If balance is between $50,000 and $100,000, add 1 point.

**Total score range**: 0 to 8 (higher means stronger financial status).

---

#### 3. Transaction Behavior
We will create a composite score for `Transaction_Behavior`:
- **Total_Trans_Amt**: 
  - If the total transaction amount is in the top 25%, add 2 points.
  - If in the middle 50%, add 1 point.
- **Total_Trans_Count**: 
  - If the total number of transactions is in the top 25%, add 2 points.
  - If in the middle 50%, add 1 point.

**Total score range**: 0 to 4 (higher means frequent and high-value transactions).

---

#### 4. Product Usage
We will categorize customers based on the number of products they use and assign them a `Product_Usage` label:
- **Heavy User**: Customers using more than 3 products.
- **Moderate User**: Customers using 2-3 products.
- **Light User**: Customers using 1 product.

In [13]:
### 1. Digital Engagement Score ###
def digital_engagement(row):
    score = 0
    # PhoneService
    if row['PhoneService'] == 1:
        score += 1
    # InternetService
    if row['InternetService'] == 1:  # Fiber optic
        score += 2
    elif row['InternetService'] == 0:  # DSL
        score += 1
    # TechSupport
    if row['TechSupport'] == 2:
        score += 2
    # PaperlessBilling
    if row['PaperlessBilling'] == 1:
        score += 1
    # PaymentMethod
    if row['PaymentMethod'] in [0, 1]:  # Bank transfer or Credit card (automatic)
        score += 2
    elif row['PaymentMethod'] == 2:  # Electronic check
        score += 1
    return score

### 2. Financial Status Score ###
def financial_status(row):
    score = 0
    # Income_Category
    if row['Income_Category'] in ['80 - 120', 'Other']:
        score += 2
    elif row['Income_Category'] == '40 - 60':
        score += 1
    # Credit Score
    if row['Credit Score'] > 700:
        score += 2
    elif 600 <= row['Credit Score'] <= 700:
        score += 1
    # Outstanding Loans
    if row['Outstanding Loans'] < 20000:
        score += 2
    elif 20000 <= row['Outstanding Loans'] < 50000:
        score += 1
    # Balance
    if row['Balance'] > 100000:
        score += 2
    elif 50000 <= row['Balance'] <= 100000:
        score += 1
    return score

### 3. Transaction Behavior Score ###
def transaction_behavior(df, amt_feature, count_feature):
    # Precompute the quantiles for the given transaction amount and count features
    trans_amt_quantiles = df[amt_feature].quantile([0.25, 0.75])
    trans_count_quantiles = df[count_feature].quantile([0.25, 0.75])
    
    def calculate_score(row):
        score = 0
        # Total_Trans_Amt
        if row[amt_feature] > trans_amt_quantiles[0.75]:
            score += 2
        elif row[amt_feature] > trans_amt_quantiles[0.25]:
            score += 1
        # Total_Trans_Count
        if row[count_feature] > trans_count_quantiles[0.75]:
            score += 2
        elif row[count_feature] > trans_count_quantiles[0.25]:
            score += 1
        return score
    
    # Apply the score calculation function to the dataframe
    df['Transaction_Behavior'] = df.apply(calculate_score, axis=1)
    
    return df

### 4. Product Usage ###
def product_usage(row):
    if row['No_of_product'] > 4:
        return 'Heavy User'
    elif 2 <= row['No_of_product'] >= 4:
        return 'Moderate User'
    else:
        return 'Light User'

### Feature Integration: Banking Behavior and Customer Preferences

#### 1. **Banking Behavior**
- **Transaction Behavior**: 
  - Combines total transaction amount and count.
  - High transaction behavior receives a higher score.
  
- **Product Usage**:
  - If the customer is a **Heavy User** of products, they get an additional 2 points.
  - **Moderate Users** get 1 point.
  - **Light Users** don’t add any points.
  
#### 2. **Customer Preferences**
- **Digital Engagement**:
  - Captures how much the customer engages with the bank's digital services.
  - Score based on PhoneService, InternetService, TechSupport, PaperlessBilling, and PaymentMethod.
  
- **Financial Status**:
  - Measures the customer’s financial health based on income, credit score, outstanding loans, and balance.
  
The final **Customer Preferences Score** is the sum of **Digital Engagement** and **Financial Status**.

In [18]:
# Function to combine the individual features into two composite features

def integrate_banking_behavior(row):
    # Combine Transaction Behavior and Product Usage
    banking_behavior_score = row['Transaction_Behavior']
    
    if row['Product_Usage'] == 'Heavy User':
        banking_behavior_score += 2
    elif row['Product_Usage'] == 'Moderate User':
        banking_behavior_score += 1
    # Light users don't add extra points to the banking behavior score
    
    return banking_behavior_score

def integrate_customer_preferences(row):
    # Combine Digital Engagement and Financial Status
    customer_preferences_score = row['Digital_Engagement'] + row['Financial_Status']
    
    return customer_preferences_score


In [20]:
final_df['Digital_Engagement'] = final_df.apply(digital_engagement, axis=1)
final_df['Financial_Status'] = final_df.apply(financial_status, axis=1)
final_df = transaction_behavior(final_df, 'Total_Trans_Amt', 'Total_Trans_Count')
final_df['Product_Usage'] = final_df.apply(product_usage, axis=1)

# Apply the new functions to the dataframe
final_df['Banking_Behavior'] = final_df.apply(integrate_banking_behavior, axis=1)
final_df['Customer_Preferences'] = final_df.apply(integrate_customer_preferences, axis=1)

### Identify unique characteristics and needs for each customer segment.
### Rule-Based Segmentation Based on Banking Behavior and Customer Preferences

#### Classification Ranges:
- **Banking Behavior**:
  - **Low (L)**: Score < 3
  - **Moderate (M)**: Score 3–4(inclusive)
  - **High (H)**: Score > 4

- **Customer Preferences**:
  - **Low (L)**: Score < 6
  - **Moderate (M)**: Score 6–11(inclusive)
  - **High (H)**: Score > 11

#### Segments:

1. **Low Engagement, Low Banking Behavior**:
   - **Banking Behavior**: Low (Score < 3)
   - **Customer Preferences**: Low (Score < 6)
   - **Business Insight**: These customers are disengaged both digitally and financially. Focus on retention through educational programs and targeted basic financial offerings.

2. **Highly Engaged, High Banking Behavior**:
   - **Banking Behavior**: High (Score > 4)
   - **Customer Preferences**: High (Score > 11)
   - **Business Insight**: These customers are highly engaged both financially and digitally. They should be prioritized for premium services, loyalty programs, and personalized offers.

3. **High Engagement, Low or Moderate Banking Behavior**:
   - **Banking Behavior**: Low (Score < 3) or Moderate (Score 3–4)
   - **Customer Preferences**: High (Score > 11)
   - **Business Insight**: These customers are digitally engaged but lack full financial activity. Focus on cross-selling financial products to increase their banking engagement.

4. **High Banking Behavior, Low or Moderate Engagement**:
   - **Banking Behavior**: High (Score > 4)
   - **Customer Preferences**: Low (Score < 6) or Moderate (Score 6–10)
   - **Business Insight**: These customers are financially active but have lower digital engagement. Promote online services or mobile banking to increase digital activity.

5. **Moderate or Low Engagement and Banking Behavior**:
   - **Banking Behavior**: Moderate (Score 3–4) or Low (Score < 3)
   - **Customer Preferences**: Moderate (Score 6–10) or Low (Score < 6)
   - **Business Insight**: These customers have limited engagement both financially and digitally. Focus on retention strategies and encouraging digital services and basic financial products.

In [27]:
# Function to assign clusters based on Banking Behavior and Customer Preferences with thresholds as arguments
def assign_rule_based_cluster(row, banking_thresholds, customer_thresholds):
    banking_low, banking_moderate = banking_thresholds
    customer_low, customer_moderate = customer_thresholds
    
    # Rule 1: Low Engagement, Low Banking Behavior
    if row['Banking_Behavior'] < banking_low and row['Customer_Preferences'] < customer_low:
        return 'Low Engagement, Low Banking Behavior'
    
    # Rule 2: Highly Engaged, High Banking Behavior
    elif row['Banking_Behavior'] > banking_moderate and row['Customer_Preferences'] > customer_moderate:
        return 'Highly Engaged, High Banking Behavior'
    
    # Rule 3: High Engagement, Low or Moderate Banking Behavior
    elif row['Banking_Behavior'] <= banking_moderate and row['Customer_Preferences'] > customer_moderate:
        return 'High Engagement, Low or Moderate Banking Behavior'
    
    # Rule 4: High Banking Behavior, Low or Moderate Engagement
    elif row['Banking_Behavior'] > banking_moderate and row['Customer_Preferences'] <= customer_moderate:
        return 'High Banking Behavior, Low or Moderate Engagement'
    
    # Rule 5: Moderate or Low Engagement and Banking Behavior
    elif row['Banking_Behavior'] <= banking_moderate and row['Customer_Preferences'] <= customer_moderate:
        return 'Moderate or Low Engagement and Banking Behavior'
    
    # Default case: If no specific rule is met
    return 'Other'


In [29]:
banking_thresholds = (3, 4)  # (Low, Moderate)
customer_thresholds = (6, 11)  # (Low, Moderate)

# Apply the function to create the 'Cluster_Labels' column
final_df['Cluster_Labels'] = final_df.apply(assign_rule_based_cluster, axis=1, 
                                            banking_thresholds=banking_thresholds, 
                                            customer_thresholds=customer_thresholds)

# Display the cluster distribution
final_df['Cluster_Labels'].value_counts()

Cluster_Labels
Moderate or Low Engagement and Banking Behavior      8156
High Banking Behavior, Low or Moderate Engagement     875
High Engagement, Low or Moderate Banking Behavior     791
Low Engagement, Low Banking Behavior                  234
Highly Engaged, High Banking Behavior                  71
Name: count, dtype: int64

In [31]:
final_df.to_csv('original (4).csv', index=False)