In [6]:
#!/usr/bin/env python3
# scripts/retention_analysis.py

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

def clean_currency(series: pd.Series) -> pd.Series:
    """
    Convert currency-formatted values to floats.

    Every character other than digits, ``.`` and ``-`` is removed (currency
    symbols, thousands separators, whitespace, letters) and the remainder is
    parsed as a float.

    Cells with nothing left after stripping -- missing values, blank strings
    and text-only placeholders such as ``"N/A"`` -- become ``NaN`` instead of
    aborting the whole conversion. Cells that still hold an unparsable
    remainder (e.g. ``"1-2"``) raise ``ValueError`` so malformed data is not
    silently hidden.

    Args:
        series: Values to convert; elements are stringified before cleaning.

    Returns:
        A float Series with the same index as ``series``.

    Raises:
        ValueError: If a cleaned, non-empty cell is not a valid number.
    """
    stripped = series.astype(str).str.replace(r"[^\d.-]", "", regex=True)
    # Stringified missing values ("nan", "None") contain no digits, so they
    # end up as "" here; float("") would raise, hence the explicit masking.
    stripped = stripped.mask(stripped == "")
    return stripped.astype(float)

def _profit_bubble_sizes(
    profit: pd.Series, base: float = 5.0, spread: float = 50.0
) -> pd.Series:
    """Scale profit values into marker areas within ``[base, base + spread]``.

    Min-max scaling is used so that loss-making rows (negative profit) still
    get a valid, non-negative marker size; dividing by the maximum alone
    produces negative sizes for them and breaks when the maximum is <= 0.
    Missing profits stay ``NaN``.
    """
    lowest = profit.min()
    span = profit.max() - lowest
    # `not span > 0` also covers an empty or all-NaN column (span is NaN).
    if not span > 0:
        return pd.Series(base, index=profit.index, dtype=float)
    return (profit - lowest) / span * spread + base


def main(
    data_path: str = "/Users/xinruyu/Downloads/Sale Map_data.csv",
    output_dir: str = "/Users/xinruyu/Downloads",
) -> None:
    """Run the sales/profit analysis and write its summary tables.

    Steps:
      1. Load the UTF-16, tab-separated dataset.
      2. Convert ``Profit``, ``Sales`` and ``Profit Ratio`` to floats and
         rename the auto-generated latitude/longitude columns.
      3. Write per-state totals to ``state_summary.csv``.
      4. Cluster rows on (Profit, Sales) with k-means (k=4) and write the
         per-cluster summary to ``cluster_summary.csv``.
      5. Write the Profit/Sales/Profit Ratio correlation matrix to
         ``correlation_matrix.csv``.
      6. Show a longitude/latitude scatter plot with profit-scaled markers.

    Args:
        data_path: Location of the input file.
        output_dir: Directory receiving the three CSV outputs; created if it
            does not exist.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import section.
    from pathlib import Path

    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # 1. Load & parse dataset (UTF-16 TSV)
    df = pd.read_csv(data_path, encoding="utf-16", sep="\t")

    # 2. Clean & convert numeric columns
    df['Profit'] = clean_currency(df['Profit'])
    df['Sales'] = clean_currency(df['Sales'])
    df['Profit Ratio'] = (
        df['Profit Ratio'].str.replace("%", "", regex=False).astype(float)
    )

    # Rename Chinese latitude/longitude columns
    df = df.rename(columns={
        '纬度(自动生成)': 'Latitude',
        '经度(自动生成)': 'Longitude',
    })

    # 3. State-Level Summary
    state_summary = df.groupby('State').agg(
        total_profit=('Profit', 'sum'),
        total_sales=('Sales', 'sum'),
        avg_profit_ratio=('Profit Ratio', 'mean'),
    ).reset_index()
    state_path = out_dir / "state_summary.csv"
    state_summary.to_csv(state_path, index=False)
    # Report the real destination rather than a hard-coded "outputs/" label.
    print(f"\nState-Level Summary saved to {state_path}")

    # 4. Clustering on Profit & Sales
    # Missing values are treated as 0 so every row receives a cluster label.
    features = df[['Profit', 'Sales']].fillna(0)
    # n_init is pinned so results do not depend on the installed
    # scikit-learn version's default.
    kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
    df['Cluster'] = kmeans.fit_predict(features)

    cluster_summary = df.groupby('Cluster').agg(
        count=('Cluster', 'count'),
        mean_profit=('Profit', 'mean'),
        mean_sales=('Sales', 'mean'),
    ).reset_index()
    cluster_path = out_dir / "cluster_summary.csv"
    cluster_summary.to_csv(cluster_path, index=False)
    print(f"Cluster Summary saved to {cluster_path}")

    # 5. Correlation Analysis
    corr_matrix = df[['Profit', 'Sales', 'Profit Ratio']].corr()
    corr_path = out_dir / "correlation_matrix.csv"
    corr_matrix.to_csv(corr_path)
    print(f"Correlation Matrix saved to {corr_path}")

    # 6. Geospatial Plot
    plt.figure(figsize=(10, 6))
    bubble_sizes = _profit_bubble_sizes(df['Profit'])
    plt.scatter(
        df['Longitude'],
        df['Latitude'],
        s=bubble_sizes,
        alpha=0.6,
        edgecolors='w',
    )
    plt.xlabel("Longitude")
    plt.ylabel("Latitude")
    plt.title("Geospatial Sales Points Sized by Profit")
    plt.grid(True, linestyle='--', linewidth=0.5)
    plt.tight_layout()
    plt.show()