In [1]:
import pandas as pd
import numpy as np

# 1. Data acquisition
# Load the raw records from a CSV path relative to the current working
# directory and show the first rows as a quick sanity check.
data = pd.read_csv('ecommerce_user_data.csv')
print("数据采集完成，已加载到DataFrame中")
print(data.head())

# 2. Data cleaning and preprocessing
# Drop every row with a missing value in ANY column. This has to happen
# before the integer casts below, because NaN cannot be converted to int.
data = data.dropna()

# Type conversion: Age and Rating become integers, PurchaseAmount a float.
# A single astype() call returns a new frame instead of assigning columns
# one by one.
data = data.astype({'Age': int, 'PurchaseAmount': float, 'Rating': int})

# Outlier handling: keep only rows whose Age is within 18..70 (inclusive),
# whose PurchaseAmount is strictly positive, and whose Rating is within
# 1..5 (inclusive).
valid_rows = (
    data['Age'].between(18, 70)
    & (data['PurchaseAmount'] > 0)
    & data['Rating'].between(1, 5)
)
# Take an explicit copy so that adding the AgeGroup column below writes to
# an independent frame rather than to a filtered selection of the old one
# (avoids pandas' SettingWithCopyWarning).
data = data[valid_rows].copy()

# Add the AgeGroup column by bucketing Age into labelled ranges.
bins = [18, 25, 35, 45, 55, 70]
labels = ['18-25', '26-35', '36-45', '46-55', '56+']
# With right=True alone the first interval is (18, 25], which excludes
# Age == 18 even though the filter above keeps it; those users would get a
# NaN AgeGroup. include_lowest=True closes the first interval on the left.
data['AgeGroup'] = pd.cut(
    data['Age'],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)

# Persist the cleaned data (without the index column).
data.to_csv('cleaned_ecommerce_data.csv', index=False)
print("数据清洗完成，已保存为 'cleaned_ecommerce_data.csv'")

# 3. Descriptive statistics
# Number of rows per product category (sorted by count, descending).
category_count = data['ProductCategory'].value_counts()
print("\n每个商品类别的购买人数:\n", category_count)

# Mean purchase amount per gender.
avg_purchase_by_gender = data.groupby('Gender')['PurchaseAmount'].mean()
print("\n不同性别的平均购买金额:\n", avg_purchase_by_gender)

# Number of rows per age group; sort_index() restores the bucket order
# (youngest to oldest) instead of the by-count order value_counts() uses.
age_group_count = data['AgeGroup'].value_counts().sort_index()
print("\n各年龄段的用户数量:\n", age_group_count)

数据采集完成，已加载到DataFrame中
   UserID  UserName   Age  Gender  PurchaseAmount ProductCategory  Rating  \
0    1522  User_522  53.0    Male             NaN            Home     6.0   
1    1738  User_738  42.0    Male     2279.633230        Clothing     3.0   
2    1741  User_741  40.0    Male      596.719273          Sports     1.0   
3    1661  User_661  35.0  Female      222.766854          Sports     4.0   
4    1412  User_412  68.0  Female      190.221801          Sports     3.0   

  LoginFrequency LastPurchaseDate  
0          Daily       2023-07-16  
1          Daily       2023-06-19  
2        Monthly       2023-08-21  
3          Daily       2023-02-24  
4         Weekly       2023-12-07  
数据清洗完成，已保存为 'cleaned_ecommerce_data.csv'

每个商品类别的购买人数:
 ProductCategory
Sports         182
Clothing       158
Electronics    155
Books          148
Home           143
Name: count, dtype: int64

不同性别的平均购买金额:
 Gender
Female    920.887840
Male      888.778925
Name: PurchaseAmount, dtype: float64

各年龄段