In [1]:
import pandas as pd
from sklearn.ensemble import IsolationForest

# 1. Load the monthly customer-behaviour table. The code below reads the
#    columns Name, Month, total_amount, order_count and sku_count.
df = pd.read_csv('Cleaned_Monthly_Customer_Behavior.csv')
df['Month'] = pd.to_datetime(df['Month'])

# Tunable settings for the per-customer anomaly model.
MIN_ROWS_PER_CUSTOMER = 6   # customers with fewer rows than this are skipped
CONTAMINATION = 0.2         # proportion of outliers IsolationForest is told to expect
FEATURE_COLUMNS = ['total_amount', 'order_count', 'sku_count']

# 2. Fit one Isolation Forest per customer and keep only the rows it flags.
anomalies = []

for customer, group in df.groupby('Name'):
    if len(group) < MIN_ROWS_PER_CUSTOMER:
        continue  # too little data for this customer; skip

    # Multi-dimensional feature matrix; missing values are treated as 0.
    features = group[FEATURE_COLUMNS].fillna(0).values

    # A fresh model per customer, so each row is scored only against that
    # customer's own rows. random_state is fixed for reproducibility.
    model = IsolationForest(contamination=CONTAMINATION, random_state=42)
    preds = model.fit_predict(features)

    group = group.copy()  # work on a copy so `df` itself is not modified
    group['anomaly'] = preds
    group = group[group['anomaly'] == -1]  # -1 marks an anomalous row

    if not group.empty:
        anomalies.append(group)

# 3. Combine the flagged rows and export them.
# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame with the same columns when no customer qualified or nothing was flagged.
if anomalies:
    df_anomaly = pd.concat(anomalies)
else:
    df_anomaly = df.head(0).assign(anomaly=pd.Series(dtype='int64'))
df_anomaly.to_csv('anomaly_customers.csv', index=False)


In [2]:
# Label every row of each sufficiently-sampled customer as normal (1) or
# anomalous (-1), then report the overall share of each label.
# Note: this cell computes the overall rate only, not a per-customer ratio.
all_results = []

for customer, group in df.groupby('Name'):
    if len(group) < 6:
        continue  # too few rows for this customer; skip

    # Feature matrix for this customer; missing values are treated as 0.
    X = group[['total_amount', 'order_count', 'sku_count']].fillna(0).values
    # One model per customer; random_state is fixed for reproducibility.
    model = IsolationForest(contamination=0.2, random_state=42)
    preds = model.fit_predict(X)

    group = group.copy()  # work on a copy so `df` itself is not modified
    group['anomaly'] = preds
    all_results.append(group)

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame with the same columns when no customer had enough rows.
if all_results:
    df_all = pd.concat(all_results)
else:
    df_all = df.head(0).assign(anomaly=pd.Series(dtype='int64'))

# Overall anomaly rate: share of rows per label (1 = normal, -1 = anomaly).
# The model is fitted per customer, so the pooled share need not equal the
# contamination value exactly.
print(df_all['anomaly'].value_counts(normalize=True))


anomaly
 1    0.785021
-1    0.214979
Name: proportion, dtype: float64
