In [1]:
import pandas as pd

# Load the credit-card dataset from a CSV resolved relative to the current
# working directory.
# NOTE(review): the saved output of this cell is a FileNotFoundError, so the file
# was not next to the notebook on the last run - confirm the expected location.
df = pd.read_csv("CC GENERAL.csv")
# Preview the first rows to sanity-check the parse.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'CC GENERAL.csv'

In [None]:
# Column dtypes and non-null counts; used to spot columns with missing values.
df.info()

In [None]:
# Summary statistics of the raw columns.
df.describe()

In [None]:
# Build the working feature frame. CUST_ID is only an identifier for the
# specific customer, not a describing feature, so it is dropped. `drop` returns
# a new DataFrame, which leaves the raw `df` untouched.
X = df.drop(columns=["CUST_ID"])

# CREDIT_LIMIT has just one missing value, so a median fill is sufficient.
# The result is assigned back instead of calling `fillna(..., inplace=True)` on
# the selected column: that form is chained assignment, which raises a
# FutureWarning in pandas 2.x and silently leaves `X` unchanged once
# copy-on-write is active (pandas 3.0).
X["CREDIT_LIMIT"] = X["CREDIT_LIMIT"].fillna(X["CREDIT_LIMIT"].median())



In [None]:
# MINIMUM_PAYMENTS: 313 missing values (~3.5%).
# It is a financial-behaviour feature and its missingness is not random, so the
# missingness itself carries information: keep it as an explicit 0/1 flag
# before the gaps are filled.
# NOTE: the flag must be computed before the fill; re-running this cell on an
# already-imputed X would overwrite the flag with zeros.
X["MIN_PAY_MISSING"] = X["MINIMUM_PAYMENTS"].isna().astype(int)

# Assign the filled column back rather than using `inplace=True` on the
# selection (chained assignment: FutureWarning in pandas 2.x, no effect on `X`
# under copy-on-write).
X["MINIMUM_PAYMENTS"] = X["MINIMUM_PAYMENTS"].fillna(X["MINIMUM_PAYMENTS"].median())

# Remaining missing values per column: absolute counts, then percentages.
missing_counts = X.isna().sum()
print(missing_counts)
print((missing_counts / len(X)) * 100)


In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler
from itertools import combinations
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


def generate_pca_report(n_components, df):
    """Fit a PCA on robust-scaled data and display a visual and text report.

    The report consists of:

    * an interactive Plotly 3D scatter with a dropdown that switches between
      every combination of three principal components; points are coloured by
      their Euclidean distance from the origin of the displayed 3D space,
    * a seaborn heatmap of the PCA loadings (features x components),
    * the per-component and cumulative explained-variance ratios on stdout.

    Parameters
    ----------
    n_components : int
        Number of principal components to keep; must be at least 3 so that a
        3D view can be drawn.
    df : pandas.DataFrame
        Feature table passed to ``RobustScaler``. Its column names label the
        rows of the loading heatmap.

    Returns
    -------
    pandas.DataFrame
        PCA scores with columns ``PC1`` ... ``PC{n_components}``, carrying the
        same index as ``df`` so the scores stay aligned with the source rows.

    Raises
    ------
    ValueError
        If ``n_components`` is smaller than 3.
    """
    if n_components < 3:
        raise ValueError("n_components must be >= 3 for 3D visualization")

    # ---- Preserve feature names (row labels of the loading heatmap)
    feature_names = df.columns

    # ---- Scale
    scaler = RobustScaler()
    X_scaled = scaler.fit_transform(df)

    # ---- PCA
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_scaled)

    column_names = [f'PC{i+1}' for i in range(n_components)]
    # Reuse the input index: without it the scores get a fresh RangeIndex and
    # can no longer be joined back onto a frame whose index is not 0..n-1.
    pca_df = pd.DataFrame(X_pca, columns=column_names, index=df.index)

    # ---- 3D combinations: one trace per triple of components
    combos = list(combinations(column_names, 3))

    fig = go.Figure()

    for i, (pc1, pc2, pc3) in enumerate(combos):
        x = pca_df[pc1].values
        y = pca_df[pc2].values
        z = pca_df[pc3].values

        # Distance from the origin in the displayed 3D space, used as colour
        distance = np.sqrt(x**2 + y**2 + z**2)

        fig.add_trace(
            go.Scatter3d(
                x=x,
                y=y,
                z=z,
                mode='markers',
                marker=dict(
                    size=2,
                    color=distance,
                    colorscale='Turbo',
                    opacity=0.75,
                    showscale=True
                ),
                name=f'{pc1}-{pc2}-{pc3}',
                # Only the first combination is shown initially; the dropdown
                # toggles the others.
                visible=(i == 0)
            )
        )

    # ---- Dropdown for PC combinations
    buttons = []
    for i, (pc1, pc2, pc3) in enumerate(combos):
        visible = [False] * len(combos)
        visible[i] = True
        buttons.append(
            dict(
                label=f'{pc1}-{pc2}-{pc3}',
                method='update',
                args=[
                    {'visible': visible},
                    {'scene': {
                        'xaxis': {'title': pc1},
                        'yaxis': {'title': pc2},
                        'zaxis': {'title': pc3}
                    }}
                ]
            )
        )

    fig.update_layout(
        updatemenus=[dict(
            buttons=buttons,
            direction="down",
            x=0.02,
            y=1.1
        )],
        title="Interactive 3D PCA (Color = Distance from Origin)",
        scene=dict(
            xaxis_title=combos[0][0],
            yaxis_title=combos[0][1],
            zaxis_title=combos[0][2],
        )
    )

    fig.show()

    # ---- PCA Loadings (components_ is components x features, hence the .T)
    loadings_df = pd.DataFrame(
        pca.components_.T,
        columns=column_names,
        index=feature_names
    )

    plt.figure(figsize=(10, 8))
    sns.heatmap(
        loadings_df,
        cmap='RdBu_r',
        center=0,
        annot=True
    )
    plt.title("PCA Loading Scores")
    plt.tight_layout()
    plt.show()

    # ---- Explained variance
    print("\nExplained Variance Ratio:")
    for i, var in enumerate(pca.explained_variance_ratio_, 1):
        print(f"PC{i}: {var:.4f} ({var*100:.2f}%)")

    print(f"\nCumulative Variance: {pca.explained_variance_ratio_.sum():.2%}")
    return pca_df


In [None]:

def create_pca_scree_plot(X):
  """Show scree and cumulative explained-variance plots for a full PCA of X.

  X is scaled with RobustScaler, then a PCA that keeps every component is
  fitted on the scaled data. Two matplotlib figures are displayed: the
  explained-variance ratio per component, and its running total with a dashed
  reference line at 0.8. Nothing is returned.
  """
  import matplotlib.pyplot as plt
  from sklearn.decomposition import PCA
  from sklearn.preprocessing import RobustScaler

  # Scale, then fit a PCA with all components retained
  scaled = RobustScaler().fit_transform(X)
  full_pca = PCA().fit(scaled)

  variance_ratio = full_pca.explained_variance_ratio_
  running_total = variance_ratio.cumsum()
  # Both curves share the same 1-based component numbering on the x axis
  component_numbers = range(1, len(variance_ratio) + 1)

  # Figure 1: scree (elbow) plot
  plt.figure()
  plt.plot(component_numbers, variance_ratio, marker='o')
  plt.xlabel("Number of Principal Components")
  plt.ylabel("Explained Variance Ratio")
  plt.title("PCA Scree Plot (Elbow Method)")
  plt.show()

  # Figure 2: cumulative variance, with the common 80% cutoff marked
  plt.figure()
  plt.plot(component_numbers, running_total, marker='o')
  plt.xlabel("Number of Principal Components")
  plt.ylabel("Cumulative Explained Variance")
  plt.title("Cumulative Explained Variance")
  plt.axhline(y=0.8, linestyle='--')
  plt.show()


In [None]:
def show_corr_graph_df(X):
  """Display an annotated heatmap of the pairwise column correlations of X.

  The correlation matrix comes from ``X.corr()`` and each cell is annotated
  with its value rounded to two decimals. Nothing is returned.
  """
  # Import both plotting libraries locally (as create_pca_scree_plot does) so
  # the function does not depend on a `plt` global created by another cell.
  import matplotlib.pyplot as plt
  import seaborn as sns

  corr = X.corr()
  plt.figure(figsize=(12,8))
  sns.heatmap(corr, annot=True, fmt=".2f")
  plt.show()

In [None]:
import matplotlib.pyplot as plt


# Histogram (30 bins) of each column of X, to inspect the distributions before
# any transformation is applied.
X.hist(bins=30, figsize=(20,15))
plt.show()


In [None]:
# How informative is TENURE? Report its variance, its number of distinct
# values, and the share of rows sitting at the value 12.
tenure = X['TENURE']
share_at_12 = (tenure == 12).sum() / len(X) * 100

print(f"TENURE variance: {tenure.var():.2f}")
print(f"TENURE unique values: {tenure.nunique()}")
print(f"% with TENURE=12: {share_at_12:.1f}%")


In [None]:
# TENURE is dominated by the value 12, so it carries little information for
# separating customers; remove it from the feature set.
X = X.drop(columns=["TENURE"])

In [None]:
# Number of distinct values in every int64 / object column.
discrete_like = X.select_dtypes(include=["int64", "object"])
for col in discrete_like.columns:
    print(col, X[col].nunique())


In [None]:
# Baseline scree / cumulative-variance plots on X before any log transform.
create_pca_scree_plot(X)

# Initial Observation of the Raw Data

Exploratory analysis of the dataset revealed that the majority of monetary variables (e.g., BALANCE, PURCHASES, CASH_ADVANCE, PAYMENTS, CREDIT_LIMIT) exhibited severely right-skewed distributions with long tails. Most customers had relatively small values, while a small subset showed extremely large magnitudes.

Additionally, several frequency-based features were bounded between 0 and 1 and displayed heavy concentration at boundary values (particularly 0 and 1), indicating categorical-like behavioral patterns rather than continuous numerical variation.

This distributional imbalance posed a significant problem for distance-based clustering algorithms.

# Problem With Using Raw Features for Clustering

Clustering algorithms such as K-Means rely on Euclidean distance, which is highly sensitive to feature scale and magnitude. In the raw data:

- Features with large numeric ranges dominated distance calculations

- Customers with extreme monetary values disproportionately influenced cluster centroids

- Customers with moderate or low spending behavior collapsed into indistinguishable groups

As a result, clustering on the raw data would primarily separate customers by spending volume, rather than by meaningful behavioral patterns.

# Log Transformation of Monetary Features

To address the extreme skewness and reduce the influence of outliers, a logarithmic transformation was applied to monetary features using the log1p function.

Rationale:

- Compresses long right tails while preserving relative ordering

- Reduces dominance of extreme values without discarding data

- Stabilizes variance across customers

Observed Effect:
After log transformation, previously heavy-tailed distributions became more symmetric and spread more evenly across their range. This improved the ability of distance-based methods to distinguish between customers.

In [None]:
import numpy as np

# Monetary columns that receive a log1p transform.
log_cols = [
    "BALANCE",
    "PAYMENTS",
    "MINIMUM_PAYMENTS",
    "CREDIT_LIMIT",
]

# Check for negative values BEFORE transforming. After log1p the check is
# unreliable: inputs below -1 become NaN, and NaN < 0 evaluates to False, so
# they would go unnoticed.
negative_counts = (X[log_cols] < 0).sum()

# NOTE: this overwrites the columns in place, so running the cell twice applies
# log1p twice; re-run the notebook from the top instead.
X[log_cols] = X[log_cols].apply(np.log1p)

# Per-column count of negative raw values (expected to be all zeros).
negative_counts


In [None]:
# Histograms again, now that the columns in log_cols have been log1p-transformed.
X.hist(bins=30, figsize=(20,15))
plt.show()


In [None]:
# Scree / cumulative-variance plots after the log1p transform of log_cols.
create_pca_scree_plot(X)

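After applying the log transformation to the selected monetary features (BALANCE, PAYMENTS, MINIMUM_PAYMENTS, CREDIT_LIMIT), we observed better-behaved, less skewed distributions.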

# Preprocessing Justification: Handling Zero-Inflated Cash Advance Features

## Problem Identified
The cash advance features (CASH_ADVANCE, CASH_ADVANCE_FREQUENCY, CASH_ADVANCE_TRX) exhibit severe zero-inflation, with roughly half of customers (about 52%, i.e. 4,628 of 8,950; see the usage counts reported below) showing zero values. This bimodal distribution (zero vs. non-zero) causes two critical issues for clustering:

1. **Distance metric distortion**: Euclidean distance calculations become dominated by the binary pattern of "uses vs. doesn't use" rather than capturing nuanced behavioral differences
2. **Feature redundancy**: Three variables measure overlapping aspects of the same behavior (cash advance usage), leading to multicollinearity and over-weighting this single dimension

## Solution Applied
We engineer three complementary features that capture distinct aspects of cash advance behavior:

### 1. Binary Indicator (`uses_cash_advance`)
```python
uses_cash_advance = (CASH_ADVANCE > 0).astype(int)
```
- Captures the primary behavioral split: cash advance users vs. non-users
- Provides clear cluster interpretability

### 2. Economic Magnitude (`cash_advance_log`)
```python
cash_advance_log = log1p(CASH_ADVANCE)
```
- Captures the monetary value of cash advance usage
- Log transformation normalizes the right-skewed distribution
- Preserves zero values (log1p(0) = 0)

### 3. Behavioral Pattern (`cash_advance_frequency`)
```python
cash_advance_frequency = CASH_ADVANCE_FREQUENCY  # Retained as-is
```
- Captures usage frequency (already normalized 0-1)
- Distinguishes one-time large withdrawals from frequent small withdrawals
- Reveals chronic vs. emergency usage patterns

### Features Dropped
- **CASH_ADVANCE** → Replaced by log-transformed version
- **CASH_ADVANCE_TRX** → Redundant given amount and frequency


In [None]:
# Replace the raw cash-advance columns with three engineered ones: a 0/1 usage
# flag, the log1p amount, and the (unchanged) frequency under a new name.
cash_cols = ['uses_cash_advance', 'cash_advance_log', 'cash_advance_frequency']

X = X.assign(
    uses_cash_advance=(X['CASH_ADVANCE'] > 0).astype(int),
    cash_advance_log=np.log1p(X['CASH_ADVANCE']),
    cash_advance_frequency=X['CASH_ADVANCE_FREQUENCY'],
).drop(columns=['CASH_ADVANCE_FREQUENCY', 'CASH_ADVANCE_TRX', 'CASH_ADVANCE'])

# Summary statistics of the engineered columns
X[cash_cols].describe()

In [None]:
# Histograms (30 bins) of the three engineered cash-advance features.
X[['uses_cash_advance', 'cash_advance_log', 'cash_advance_frequency']].hist(bins=30)

In [None]:
# Split of customers by the binary cash-advance flag, computed once up front.
usage = X['uses_cash_advance']
n_total = len(X)
n_non_users = (usage == 0).sum()
n_users = (usage == 1).sum()

# Absolute counts per flag value
print("Cash Advance Usage Distribution:")
print(usage.value_counts().sort_index())
print()

# Same split expressed as percentages
print("Cash Advance Usage Distribution (with percentages):")
print(usage.value_counts(normalize=True).sort_index() * 100)
print()

# One line per group plus the overall total
print("\nDetailed Summary:")
print(f"Non-users (0): {n_non_users} customers ({n_non_users / n_total * 100:.1f}%)")
print(f"Users (1): {n_users} customers ({n_users / n_total * 100:.1f}%)")
print(f"Total: {n_total} customers")


## Results of Cash Advance Feature Engineering

### Transformed Distributions

**`uses_cash_advance` (Binary Indicator)**
- Clean binary split: 4,628 non-users (0) vs. 4,322 users (1)
- 48.3% of customers use cash advances

**`cash_advance_log` (Log-Transformed Amount)**
- Preserves zero spike for non-users (~4,628)
- Users show smooth distribution from 2.5 to 10.0
- Successfully normalizes extreme values while maintaining economic magnitude distinctions

**`cash_advance_frequency` (Retained)**
- Ranges from 0 (non-users) to 1 (use in every billing cycle)
- Among users: distinguishes sporadic vs. frequent usage patterns

## Key Improvements

1. **Eliminated redundancy**: Three correlated features → three complementary dimensions
2. **Balanced information**: Binary flag (user type) + continuous amount (magnitude) + frequency (pattern)
3. **Enhanced separability**: Non-users form distinct group; users differentiate by both economic impact and behavioral frequency
4. **Prevents zero-inflation dominance**: Clustering can now capture nuanced behavior beyond simple "uses vs. doesn't use"
5. **Preserves behavioral nuance**: Frequency enables distinction between emergency borrowers (high amount, low frequency) and chronic users (moderate amount, high frequency)


In [None]:
# Same treatment as the cash-advance amount: a 0/1 usage flag plus the log1p
# amount replace the raw INSTALLMENTS_PURCHASES column.
X = X.assign(
    uses_installments=(X['INSTALLMENTS_PURCHASES'] > 0).astype(int),
    installments_amount_log=np.log1p(X['INSTALLMENTS_PURCHASES']),
).drop(columns=['INSTALLMENTS_PURCHASES'])

In [None]:
# A 0/1 usage flag plus the log1p amount replace the raw ONEOFF_PURCHASES column.
X = X.assign(
    uses_oneoff=(X['ONEOFF_PURCHASES'] > 0).astype(int),
    oneoff_amount_log=np.log1p(X['ONEOFF_PURCHASES']),
).drop(columns=['ONEOFF_PURCHASES'])

In [None]:
# Keep BALANCE_FREQUENCY as it is: it is an indicator of account activity.

In [None]:
# Purchase frequency captures important behaviors for non buyers vs frequent buyers

In [None]:
# Purchase features: 0/1 usage flag, log1p amount, frequency under a
# lower-case name, and log1p transaction count; the raw columns are dropped.
X = X.assign(
    uses_purchase=(X['PURCHASES'] > 0).astype(int),
    purchases_log=np.log1p(X['PURCHASES']),
    purchase_frequency=X['PURCHASES_FREQUENCY'],
    purchases_trx_log=np.log1p(X['PURCHASES_TRX']),
).drop(columns=['PURCHASES', "PURCHASES_FREQUENCY", 'PURCHASES_TRX'])

In [None]:
# Scree / cumulative-variance plots after the usage-flag and log-amount features were added.
create_pca_scree_plot(X)

In [None]:
# Histograms of all columns after feature engineering.
X.hist(bins=30, figsize=(20,15))
plt.show()

In [None]:
# Correlation heatmap of the engineered feature set, used to find redundant columns.
show_corr_graph_df(X)

I removed highly correlated behavioral proxies to avoid overweighting the same customer actions multiple times in distance-based clustering.

In [None]:
# Highly correlated behavioural proxies, removed so that the same customer
# action is not counted several times in distance-based clustering.
redundant_cols = [
    "uses_cash_advance",
    "cash_advance_frequency",
    "uses_installments",
    "uses_oneoff",
    "purchases_trx_log",
    "uses_purchase",
    "PURCHASES_INSTALLMENTS_FREQUENCY",
]
X = X.drop(columns=redundant_cols)

In [None]:
# Histograms of the remaining columns after the correlated proxies were dropped.
X.hist(bins=30, figsize=(20,15))
plt.show()

In [None]:
# Correlation heatmap of the reduced feature set.
show_corr_graph_df(X)

In [None]:
# Scree / cumulative-variance plots on the reduced feature set, to choose the number of components.
create_pca_scree_plot(X)

In [None]:
# Six-component PCA report, for comparison with the four-component run in the next cell.
# The returned scores are not assigned, so the notebook renders them as this cell's output.
generate_pca_report(6, X)

In [None]:
# Four-component PCA report; the scores are kept in `pca_df` for the clustering cells below.
pca_df = generate_pca_report(4, X)

# Principal Component Analysis (PCA) Interpretation and Component Selection

We performed Principal Component Analysis (PCA) using two configurations: **4 principal components (PCs)** and **6 principal components**, explaining **88.28%** and **95.23%** of the total variance respectively. This section provides a detailed interpretation of the extracted components and justifies the selection of **4 PCs for downstream analysis**.

---

## Interpretation of Principal Components (4-PC Solution)

### PC1 – Credit Utilization & Balance Behavior
**Key loadings**
- BALANCE_FREQUENCY (≈ +0.81)
- BALANCE (≈ +0.26)
- MINIMUM_PAYMENTS (≈ +0.20)
- PRC_FULL_PAYMENT (≈ −0.45)

**Explanation**  
PC1 captures how frequently customers carry outstanding balances versus paying in full. High PC1 scores correspond to customers who regularly revolve balances and rely on minimum payments, while low scores indicate disciplined users who consistently pay in full. This component represents the primary axis of credit usage intensity.

---

### PC2 – Payment Discipline
**Key loadings**
- PRC_FULL_PAYMENT (≈ +0.82)
- BALANCE_FREQUENCY (≈ +0.42)

**Explanation**  
PC2 reflects repayment reliability independent of spending volume. Customers with high scores consistently pay their balances in full, whereas lower scores indicate partial or inconsistent repayment behavior. This component isolates payment discipline as a distinct behavioral dimension.

---

### PC3 – Spending Structure and Purchase Behavior
**Key loadings**
- ONEOFF_PURCHASES_FREQUENCY (≈ +0.59)
- purchases_log (≈ +0.52)
- oneoff_amount_log (≈ +0.32)
- purchase_frequency (≈ +0.20)

**Explanation**  
PC3 represents how customers spend, distinguishing between frequent purchases and one-off, higher-value transactions. This component captures consumption and transaction patterns rather than repayment or balance management behavior.

---

### PC4 – Repayment Pressure and Liquidity Stress
**Key loadings**
- PAYMENTS (≈ +0.72)
- MINIMUM_PAYMENTS (≈ +0.31)
- BALANCE (≈ +0.29)
- cash_advance_log (≈ +0.26)

**Explanation**  
PC4 reflects financial strain and liquidity pressure. Higher scores indicate customers making larger payments, holding higher balances, and relying more on cash advances, suggesting increased repayment burden or short-term liquidity needs.

---

## Interpretation of Additional Components (6-PC Solution)

Extending the PCA to **6 components** increases the explained variance to **95.23%**, but the additional components provide limited new behavioral insight.

---

### PC5 – Installment vs One-Off Spending Contrast
**Key loadings**
- installments_amount_log (≈ −0.42)
- purchases_log (≈ −0.42)
- ONEOFF_PURCHASES_FREQUENCY (≈ +0.63)

**Explanation**  
PC5 contrasts installment-heavy spending with direct or one-off purchases. While this component adds granularity to spending behavior, it does not introduce a fundamentally new behavioral dimension beyond what is already captured by PC3.

---

### PC6 – Minimum Payment Dominance
**Key loadings**
- MINIMUM_PAYMENTS (≈ +0.72)
- Secondary contributions from PAYMENTS and CREDIT_LIMIT

**Explanation**  
PC6 is largely driven by a single variable, indicating that the PCA is capturing residual variance rather than meaningful latent structure. Components dominated by a single feature are generally unstable and offer limited interpretive value.

---

## Justification for Selecting 4 PCs

### Variance Coverage
- 4 PCs explain **88.28%** of total variance
- 6 PCs explain **95.23%**, adding only ~7% additional variance

The majority of meaningful structure is already captured within the first four components.

---

### Interpretability and Behavioral Coverage
The 4-PC solution cleanly represents four distinct and interpretable behavioral dimensions:
1. Credit utilization intensity  
2. Payment discipline  
3. Spending patterns  
4. Financial strain  

Additional components mainly refine existing patterns rather than reveal new structure.

---

### Model Simplicity and Stability
Using fewer components reduces dimensionality, limits noise, and improves model stability and generalization in downstream tasks such as clustering or predictive modeling.

---

## Final Justification Statement

Although the 6-component PCA explains a higher proportion of total variance, the additional components primarily capture marginal or redundant patterns. The 4-component solution, explaining 88.28% of the variance, retains the key behavioral dimensions of credit usage, payment discipline, spending behavior, and financial strain, providing a more interpretable and robust representation for downstream analysis.


# Scaling of Features Using RobustScaler

After log transformation, features were scaled using RobustScaler, which normalizes data based on the median and interquartile range.

Rationale:

- Ensures all features contribute comparably to distance calculations

- Prevents residual outliers from disproportionately influencing clusters

- More appropriate than standard scaling for non-Gaussian data

Observed Effect:
Scaling aligned features onto a comparable numeric range without reintroducing sensitivity to extreme values, allowing clustering algorithms to weigh behavioral and monetary features more evenly.

In [None]:
# Human-readable names for the four retained principal components.
pc_names = dict(
    PC1="Credit Utilization Intensity",
    PC2="Payment Discipline",
    PC3="Spending Pattern",
    PC4="Financial Strain",
)

# Relabel the score columns, then display the frame
pca_df = pca_df.rename(columns=pc_names)
pca_df

# Clustering Method Selection and Justification

## Objective
The goal of this analysis is to identify meaningful patterns in **credit card utilization behavior** by clustering individuals based on latent behavioral dimensions derived from PCA. These clusters are intended to represent distinct financial behavior profiles rather than artificially separated geometric groupings.

---

## Feature Representation

After preprocessing and dimensionality reduction, the data is represented using four principal components:

- **PC1 – Credit Utilization Intensity**  
- **PC2 – Payment Discipline**  
- **PC3 – Spending Pattern**  
- **PC4 – Financial Strain**

These components capture the dominant behavioral axes of credit card usage and are used as inputs to the clustering algorithms.

---

## Observations After PCA

Visual inspection of the PCA-transformed space shows:
- No clear spherical or compact clusters
- Points are broadly distributed, forming a near-uniform or “cube-like” structure
- Significant overlap between behavioral patterns is expected, especially for users with moderate credit behavior

These observations strongly influence algorithm selection.

---

## Considered Clustering Algorithms

### 1. K-Means Clustering (Not Selected as Primary)

**Rationale for consideration**:
- Simple and widely used
- Operates based on distances between points
- Can segment users based on relative position in feature space

**Limitations in this context**:
- Assumes spherical, equally sized clusters
- Requires pre-specifying the number of clusters
- Sensitive to outliers
- Poor fit for overlapping and irregular behavioral patterns

**Conclusion**:  
K-means was explored during the exploratory phase but was **not selected as the primary method** due to the absence of spherical cluster structure and the presence of overlapping behaviors. Its assumptions do not align well with the observed data geometry.

---

### 2. Hierarchical Clustering (Supporting Analysis)

**Why it is suitable**:
- Does not require specifying the number of clusters upfront
- Captures hierarchical relationships between observations
- Useful for analyzing how behavioral patterns split at different similarity levels

**Relevance to this study**:
- PCA components represent related financial behaviors
- Hierarchical clustering allows inspection of nested groupings (e.g., disciplined vs. undisciplined users, then further subgroups)

**Role in the analysis**:
- Used to **explore structure and validate relationships**
- Dendrograms aid interpretability
- Not used as the final clustering method due to scalability and sensitivity to linkage choices

---

### 3. Density-Based Clustering (DBSCAN / HDBSCAN)

**Why this is appropriate**:
- Does not assume spherical cluster shapes
- Can identify clusters of arbitrary geometry
- Explicitly models noise and outliers
- Does not require pre-defining the number of clusters

**Relevance to credit behavior data**:
- Financial behaviors often form dense regions with gradual transitions
- Users with extreme utilization or financial strain naturally appear as outliers
- The cube-like distribution suggests density variation rather than clear centroids

**Conclusion**:  
Density-based clustering is well-suited for uncovering **irregular, behavior-driven groupings** and identifying atypical users.

---

### 4. Gaussian Mixture Models (GMM)

**Why this is appropriate**:
- Models clusters as probability distributions
- Allows **overlapping clusters**
- Provides soft assignments (membership probabilities)

**Relevance to this study**:
- Credit behaviors are not strictly separable
- Users in transitional financial states (e.g., improving or deteriorating discipline) are expected
- GMM captures uncertainty in cluster membership

**Conclusion**:  
GMM is particularly suitable for representing **continuous and overlapping financial behavior segments**, which aligns with real-world credit usage patterns.

---

## Final Methodological Choice

- **Primary methods**:  
  - Density-Based Clustering (DBSCAN or HDBSCAN)  
  - Gaussian Mixture Models (GMM)

- **Supporting analysis**:  
  - Hierarchical clustering for structural validation

- **Exploratory only**:  
  - K-means (not used for final segmentation)

This combination balances:
- Flexibility in cluster shape
- Ability to model overlap
- Interpretability
- Alignment with domain expectations

---

## Cluster Validation and Interpretation

Clusters are evaluated using:
- Internal validation metrics (e.g., silhouette score, Davies–Bouldin index)
- Stability analysis across random seeds and subsampling
- Behavioral interpretability using feature distributions per cluster

Each resulting cluster is characterized in terms of:
- Credit utilization intensity
- Payment discipline
- Spending behavior
- Financial strain profile

---

## Summary

The selected clustering approaches reflect both the **statistical structure of the data** and the **real-world complexity of financial behavior**. Rather than enforcing artificial separation, the methodology prioritizes interpretability, robustness, and domain relevance.


In [None]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
import matplotlib.pyplot as plt

# Rule of thumb for DBSCAN: min_samples = 2 * dimensionality of the clustered
# space. The dimensionality is read from pca_df instead of being hard-coded so
# it cannot drift from the number of components actually kept.
number_of_features = pca_df.shape[1]
min_samples = 2 * number_of_features


k = min_samples

nn = NearestNeighbors(n_neighbors=k)
nn.fit(pca_df)

# Querying with the training data returns each point as its own nearest
# neighbour (distance 0), so column k-1 is the distance to the k-th point
# counting the point itself.
distances, _ = nn.kneighbors(pca_df)
k_distances = np.sort(distances[:, k-1])

plt.plot(k_distances)
plt.ylabel(f"{k}-NN distance")
plt.xlabel("Points sorted by distance")
plt.show()

# Elbow = point of the sorted k-distance curve that lies farthest from the
# straight line joining its first and last points.
y = k_distances
x = np.arange(len(y))
p1 = np.array([x[0], y[0]])
p2 = np.array([x[-1], y[-1]])

chord = p2 - p1
offsets = p1 - np.column_stack((x, y))

# The 2D cross product is written out explicitly: np.cross on 2-element
# vectors is deprecated since NumPy 2.0. A separate name is used so the k-NN
# `distances` above is not overwritten with a different quantity.
elbow_distances = np.abs(
    chord[0] * offsets[:, 1] - chord[1] * offsets[:, 0]
) / np.linalg.norm(chord)

elbow_index = np.argmax(elbow_distances)
optimal_eps = y[elbow_index]

print("Min Samples based on dimentionality rule of thumb:", min_samples)
print("Optimal eps:", optimal_eps)


In [None]:
from sklearn.cluster import DBSCAN

# Hand-picked DBSCAN parameters.
# NOTE: this overrides the `optimal_eps` estimated from the k-distance elbow in
# the previous cell.
optimal_eps = 0.3
# Previously misspelled `min_smples`, so the assignment had no effect and
# DBSCAN silently used the `min_samples` left over from the previous cell.
min_samples = 8
db = DBSCAN(eps=optimal_eps, min_samples=min_samples, metric="euclidean")
db.fit(pca_df)
labels = db.labels_
# Label -1 is excluded from the cluster count.
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print(f"clusters={n_clusters}")

In [None]:
def generate_3d_cluster_report(df, labels):
    """Show an interactive 3D scatter of ``df`` coloured by cluster label.

    One Plotly ``Scatter3d`` trace is built for every combination of three
    columns of ``df``; a dropdown switches between them and updates the axis
    titles. The marker colour of each point is its entry in ``labels``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot; must have at least three columns.
    labels : array-like or pandas.DataFrame
        One label per row of ``df``. For a DataFrame, its first column is used.

    Raises
    ------
    ValueError
        If ``df`` has fewer than three columns, or if ``df`` and ``labels``
        differ in length.
    """
    import numpy as np
    import pandas as pd
    import plotly.graph_objects as go
    from itertools import combinations

    # Guard clauses
    if df.shape[1] < 3:
        raise ValueError("Dataset must have at least 3 features")
    if len(df) != len(labels):
        raise ValueError("df and labels must have the same length")

    # Flatten the labels to a 1D array (first column of a DataFrame)
    label_values = np.asarray(
        labels.iloc[:, 0] if isinstance(labels, pd.DataFrame) else labels
    )

    triples = list(combinations(df.columns, 3))
    n_views = len(triples)

    fig = go.Figure()
    menu_entries = []

    # Build each trace together with the dropdown entry that reveals it
    for idx, (fx, fy, fz) in enumerate(triples):
        view_name = f"{fx}-{fy}-{fz}"

        marker_style = dict(
            size=3,
            color=label_values,
            colorscale='Turbo',
            opacity=0.8,
            showscale=True,
            colorbar=dict(title="Cluster")
        )
        fig.add_trace(
            go.Scatter3d(
                x=df[fx],
                y=df[fy],
                z=df[fz],
                mode='markers',
                marker=marker_style,
                name=view_name,
                visible=(idx == 0)
            )
        )

        # Exactly one trace is visible per dropdown choice
        shown = [j == idx for j in range(n_views)]
        axis_titles = {
            "xaxis": {"title": fx},
            "yaxis": {"title": fy},
            "zaxis": {"title": fz}
        }
        menu_entries.append(
            dict(
                label=view_name,
                method="update",
                args=[{"visible": shown}, {"scene": axis_titles}]
            )
        )

    first_x, first_y, first_z = triples[0]
    fig.update_layout(
        title="Interactive 3D Feature Space (Colored by Cluster Labels)",
        updatemenus=[dict(
            buttons=menu_entries,
            direction="down",
            x=1,
            y=2
        )],
        scene=dict(
            xaxis_title=first_x,
            yaxis_title=first_y,
            zaxis_title=first_z
        )
    )

    fig.show()


In [None]:
# Visualise the DBSCAN labels over every 3-component view of the PCA scores.
generate_3d_cluster_report(df=pca_df, labels=labels)

In [None]:
# Number of points assigned to each DBSCAN label.
pd.Series(labels).value_counts()
