In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

def load_and_prepare_data(customers_path="C:\\Users\\chatu\\Downloads\\Customers.csv",
                          transactions_path="C:\\Users\\chatu\\Downloads\\Transactions.csv",
                          reference_date=None):
    """Load the customer and transaction CSVs and build one row per customer.

    Transactions are aggregated per ``CustomerID`` and left-joined onto the
    customer table, so customers without any transaction are kept and carry
    NaN in the aggregated columns (callers decide how to fill them).

    Args:
        customers_path: CSV with at least ``CustomerID``, ``SignupDate`` and
            ``Region`` columns. Defaults to the original hard-coded location.
        transactions_path: CSV with at least ``CustomerID``,
            ``TransactionID``, ``TransactionDate``, ``TotalValue`` and
            ``Quantity`` columns. Defaults to the original location.
        reference_date: Date that ``days_since_signup`` is measured against.
            ``None`` (the default) uses the current local time, matching the
            previous behaviour; pass a fixed date for reproducible features.

    Returns:
        DataFrame containing every customer column plus ``days_since_signup``,
        ``transaction_count``, ``total_spend``, ``avg_transaction_value``,
        ``total_quantity``, ``avg_quantity`` and one ``region_<name>``
        indicator column per distinct region.
    """
    customers_df = pd.read_csv(customers_path)
    transactions_df = pd.read_csv(transactions_path)

    # Convert dates
    customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
    transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])

    # Named aggregation ties each output column to its source and function,
    # instead of renaming a flattened MultiIndex by position.
    customer_metrics = transactions_df.groupby('CustomerID').agg(
        transaction_count=('TransactionID', 'count'),
        total_spend=('TotalValue', 'sum'),
        avg_transaction_value=('TotalValue', 'mean'),
        total_quantity=('Quantity', 'sum'),
        avg_quantity=('Quantity', 'mean'),
    ).reset_index()

    # Measure tenure against an explicit reference point when one is given so
    # that repeated runs on the same data yield the same feature values.
    if reference_date is None:
        reference_point = datetime.now()
    else:
        reference_point = pd.to_datetime(reference_date)
    customers_df['days_since_signup'] = (reference_point - customers_df['SignupDate']).dt.days

    # Left join keeps customers that never transacted.
    final_data = customers_df.merge(customer_metrics, on='CustomerID', how='left')

    # One-hot encode region
    region_dummies = pd.get_dummies(final_data['Region'], prefix='region')
    final_data = pd.concat([final_data, region_dummies], axis=1)

    return final_data

def create_comprehensive_report(data, scaled_features, labels, db_score, silhouette_score,
                                output_path='comprehensive_clustering_report.pdf'):
    """Render a five-panel clustering report and save it to ``output_path``.

    Panels, top to bottom: title with headline metrics, a 2-component PCA
    scatter coloured by cluster label, mean spend/transaction metrics per
    cluster, stacked mean region indicators per cluster, and a monospace
    text summary.

    Args:
        data: DataFrame with a ``Cluster`` column, the ``total_spend``,
            ``transaction_count`` and ``avg_transaction_value`` columns, and
            the indicator columns whose names start with ``region_``.
        scaled_features: Feature matrix projected with PCA for the scatter.
        labels: Cluster label per row of ``scaled_features``.
        db_score: Davies-Bouldin index printed in the header panel.
        silhouette_score: Silhouette value printed in the header panel.
            The name shadows the imported sklearn function inside this
            function; it is kept unchanged for caller compatibility.
        output_path: Destination file handed to ``Figure.savefig``. Defaults
            to the original fixed file name.

    Returns:
        None. The report is written to ``output_path``.
    """
    # Do all data work before a figure exists, so a failure here cannot
    # leave an open figure behind.
    pca_features = PCA(n_components=2).fit_transform(scaled_features)

    by_cluster = data.groupby('Cluster')
    cluster_profiles = by_cluster.agg({
        'total_spend': 'mean',
        'transaction_count': 'mean',
        'avg_transaction_value': 'mean'
    }).round(2)

    region_cols = [col for col in data.columns if col.startswith('region_')]
    region_dist = by_cluster[region_cols].mean()

    stats_text = """
    Key Insights:
    
    1. Cluster Sizes:
    {}
    
    2. Average Spend per Cluster:
    {}
    
    3. Transaction Frequency:
    {}
    
    4. Regional Distribution:
    {}
    """.format(
        by_cluster.size().to_string(),
        by_cluster['total_spend'].mean().round(2).to_string(),
        by_cluster['transaction_count'].mean().round(2).to_string(),
        # Unweighted average of the per-cluster region means.
        region_dist.mean().round(2).to_string()
    )

    fig, axes = plt.subplots(5, 1, figsize=(20, 25))
    try:
        header_ax, pca_ax, profile_ax, region_ax, stats_ax = axes

        # 1. Title and Metrics
        header_ax.axis('off')
        header_ax.text(0.5, 0.8, 'Customer Segmentation Analysis Report',
                       horizontalalignment='center', fontsize=20, fontweight='bold')
        header_ax.text(0.5, 0.6, f'Number of Clusters: {len(np.unique(labels))}',
                       horizontalalignment='center', fontsize=12)
        header_ax.text(0.5, 0.4, f'Davies-Bouldin Index: {db_score:.3f}',
                       horizontalalignment='center', fontsize=12)
        header_ax.text(0.5, 0.2, f'Silhouette Score: {silhouette_score:.3f}',
                       horizontalalignment='center', fontsize=12)

        # 2. PCA Visualization
        scatter = pca_ax.scatter(pca_features[:, 0], pca_features[:, 1],
                                 c=labels, cmap='viridis')
        pca_ax.set_title('Customer Segments - PCA Visualization')
        pca_ax.set_xlabel('First Principal Component')
        pca_ax.set_ylabel('Second Principal Component')
        fig.colorbar(scatter, ax=pca_ax)

        # 3. Cluster Profiles
        cluster_profiles.plot(kind='bar', ax=profile_ax)
        profile_ax.set_title('Cluster Profiles - Key Metrics')
        profile_ax.set_xlabel('Cluster')
        profile_ax.set_ylabel('Value')
        profile_ax.legend(bbox_to_anchor=(1.05, 1))

        # 4. Region Distribution
        region_dist.plot(kind='bar', stacked=True, ax=region_ax)
        region_ax.set_title('Region Distribution by Cluster')
        region_ax.set_xlabel('Cluster')
        region_ax.set_ylabel('Proportion')
        region_ax.legend(bbox_to_anchor=(1.05, 1))

        # 5. Key Statistics Table
        stats_ax.axis('off')
        stats_ax.text(0, 1, stats_text, fontsize=10, va='top', family='monospace')

        # One layout pass once every panel is populated, then save.
        fig.tight_layout()
        fig.savefig(output_path, bbox_inches='tight', dpi=300)
    finally:
        # Release the figure even if plotting or saving raised.
        plt.close(fig)

def main(n_clusters=4, random_state=42):
    """Run the segmentation pipeline: load, scale, cluster, score, report.

    Args:
        n_clusters: Number of K-Means clusters to fit. Defaults to 4, the
            value previously hard-coded in the body.
        random_state: Seed passed to K-Means. Defaults to 42, the value
            previously hard-coded in the body.

    Returns:
        None. Progress is printed and the report is written by
        ``create_comprehensive_report``. Any exception raised along the way
        is caught here and reported as a single printed line.
    """
    try:
        print("Loading and preparing data...")
        data = load_and_prepare_data()

        # Prepare features for clustering
        numeric_columns = ['days_since_signup', 'transaction_count', 'total_spend',
                           'avg_transaction_value', 'total_quantity', 'avg_quantity']
        region_columns = [col for col in data.columns if col.startswith('region_')]
        feature_columns = numeric_columns + region_columns

        # Customers without transactions have NaN aggregates after the left
        # merge; treat them as zero activity. The cast gives the scaler one
        # uniform float dtype even when the region indicators are booleans.
        features = data[feature_columns].fillna(0).astype(float)
        scaled_features = StandardScaler().fit_transform(features)

        # Perform clustering
        print("Performing clustering...")
        # n_init is pinned because its default differs between scikit-learn
        # releases; an explicit value keeps results comparable across them.
        kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
        labels = kmeans.fit_predict(scaled_features)

        # Calculate metrics
        db_score = davies_bouldin_score(scaled_features, labels)
        silhouette = silhouette_score(scaled_features, labels)

        # Add cluster labels to data
        data['Cluster'] = labels

        # Create report
        print("Generating report...")
        create_comprehensive_report(data, scaled_features, labels, db_score, silhouette)

        print("Report generated successfully! Check 'comprehensive_clustering_report.pdf'")

    except Exception as e:
        # Top-level boundary: report the failure as one line instead of a traceback.
        print(f"An error occurred: {str(e)}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

Loading and preparing data...
Performing clustering...
Generating report...



KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.



Report generated successfully! Check 'comprehensive_clustering_report.pdf'
