In [14]:
import pandas
import numpy as np
import matplotlib.pyplot as plt

# 1. Data collection
# Load the user-behaviour records from the local CSV file into a DataFrame. (2 pts)
data = pandas.read_csv(filepath_or_buffer='user_behavior_data.csv')
print("数据采集完成，已加载到DataFrame中")

# Print the first five records; head() returns five rows when called without n. (2 pts)
print(data.head())

数据采集完成，已加载到DataFrame中
   UserID UserName  Age  Gender    Location   LastLogin  PurchaseAmount  \
0       1   User_1   62  Female  Location_1  2023-06-10             118   
1       2   User_2   65  Female  Location_2  2023-08-14             466   
2       3   User_3   18    Male  Location_3  2023-02-17             869   
3       4   User_4   21  Female  Location_4  2023-03-14             486   
4       5   User_5   21    Male  Location_5  2023-07-26             753   

  PurchaseCategory  ReviewScore LoginFrequency  
0         Clothing            3        Monthly  
1      Electronics            4         Weekly  
2    Home & Garden            3         Weekly  
3            Books            2         Weekly  
4    Home & Garden            1        Monthly  


In [6]:
# Show the current contents of `data`; as the cell's final expression it is rendered by the notebook.
data

Unnamed: 0,UserID,UserName,Age,Gender,Location,LastLogin,PurchaseAmount,PurchaseCategory,ReviewScore,LoginFrequency
0,1,User_1,62,Female,Location_1,2023-06-10,118,Clothing,3,Monthly
1,2,User_2,65,Female,Location_2,2023-08-14,466,Electronics,4,Weekly
2,3,User_3,18,Male,Location_3,2023-02-17,869,Home & Garden,3,Weekly
3,4,User_4,21,Female,Location_4,2023-03-14,486,Books,2,Weekly
4,5,User_5,21,Male,Location_5,2023-07-26,753,Home & Garden,1,Monthly
...,...,...,...,...,...,...,...,...,...,...
995,996,User_996,44,Male,Location_96,2023-03-26,664,Food,1,Daily
996,997,User_997,52,Female,Location_97,2023-03-30,916,Food,3,Daily
997,998,User_998,35,Male,Location_98,2023-08-04,768,Electronics,3,Weekly
998,999,User_999,53,Female,Location_99,2023-04-26,270,Home & Garden,4,Weekly


In [15]:
# 2. Data cleaning and preprocessing
# NOTE(review): this cell overwrites `data` and replaces PurchaseAmount with
# z-scores, so re-running it without first re-running the load cell would apply
# the `PurchaseAmount > 0` filter to standardized values and discard every row
# with a below-average purchase amount.

# Handle missing values by dropping every row that contains one. (2 pts)
data = data.dropna()

# Type conversion (2 pts each): Age -> int, PurchaseAmount -> float, ReviewScore -> int.
# A single astype call returns a new frame instead of writing the columns one by one.
data = data.astype({'Age': int, 'PurchaseAmount': float, 'ReviewScore': int})

# Handle outliers (2 pts): keep rows with Age in [18, 70] and ReviewScore in [1, 5]
# (between() is inclusive on both ends by default) and a strictly positive PurchaseAmount.
in_valid_range = (
    data['Age'].between(18, 70)
    & (data['PurchaseAmount'] > 0)
    & data['ReviewScore'].between(1, 5)
)
# .copy() makes the filtered result an independent frame, so the column
# assignments below cannot raise SettingWithCopyWarning when rows are dropped.
data = data.loc[in_valid_range].copy()

# Standardization (2 pts each): z-score = (value - mean) / std, where Series.std
# uses the sample standard deviation (ddof=1). ReviewScore becomes float here.
for column in ('PurchaseAmount', 'ReviewScore'):
    data[column] = (data[column] - data[column].mean()) / data[column].std()

# Save the cleaned data, omitting the index column. (1 pt)
data.to_csv('cleaned_user_behavior_data.csv', index=False)
print("数据清洗完成，已保存为 'cleaned_user_behavior_data.csv'")

数据清洗完成，已保存为 'cleaned_user_behavior_data.csv'


In [8]:
# Show the current contents of `data`; as the cell's final expression it is rendered by the notebook.
data

Unnamed: 0,UserID,UserName,Age,Gender,Location,LastLogin,PurchaseAmount,PurchaseCategory,ReviewScore,LoginFrequency
0,1,User_1,62,Female,Location_1,2023-06-10,-1.364167,Clothing,0.019608,Monthly
1,2,User_2,65,Female,Location_2,2023-08-14,-0.157342,Electronics,0.719875,Weekly
2,3,User_3,18,Male,Location_3,2023-02-17,1.240218,Home & Garden,0.019608,Weekly
3,4,User_4,21,Female,Location_4,2023-03-14,-0.087984,Books,-0.680660,Weekly
4,5,User_5,21,Male,Location_5,2023-07-26,0.837943,Home & Garden,-1.380928,Monthly
...,...,...,...,...,...,...,...,...,...,...
995,996,User_996,44,Male,Location_96,2023-03-26,0.529300,Food,-1.380928,Daily
996,997,User_997,52,Female,Location_97,2023-03-30,1.403208,Food,0.019608,Daily
997,998,User_998,35,Male,Location_98,2023-08-04,0.889961,Electronics,0.019608,Weekly
998,999,User_999,53,Female,Location_99,2023-04-26,-0.837048,Home & Garden,0.719875,Weekly


In [16]:
# 3. Descriptive statistics
# Number of users in each purchase category. (2 pts)
purchase_category_counts = data['PurchaseCategory'].value_counts()
print("每个购买类别的用户数:\n", purchase_category_counts)

# Mean purchase amount per gender. (2 pts)
# NOTE(review): the cleaning cell replaces PurchaseAmount with z-scores, so these
# means are in standard-deviation units rather than the original amounts.
gender_purchase_amount_mean = data.groupby('Gender')['PurchaseAmount'].mean()
print("不同性别的平均购买金额:\n", gender_purchase_amount_mean)

# Number of users per age group. (2 pts)
# With right=False the bins are left-closed: [18, 26), [26, 36), ..., [56, 66), [66, inf).
# NOTE(review): the last bin starts at 66, so the '65+' label actually covers ages >= 66;
# age 65 is counted under '56-65'.
bins = [18, 26, 36, 46, 56, 66, np.inf]
labels = ['18-25', '26-35', '36-45', '46-55', '56-65', '65+']
# assign() returns a new frame with the AgeGroup column instead of writing into a
# frame that may be a filtered slice (avoids SettingWithCopyWarning).
data = data.assign(
    AgeGroup=pandas.cut(data['Age'], bins=bins, labels=labels, right=False)
)
# sort_index() orders the counts by the categorical bin order, not by frequency.
age_group_counts = data['AgeGroup'].value_counts().sort_index()
print("不同年龄段的用户数:\n", age_group_counts)


每个购买类别的用户数:
 PurchaseCategory
Clothing         214
Electronics      213
Home & Garden    203
Food             197
Books            173
Name: count, dtype: int64
不同性别的平均购买金额:
 Gender
Female   -0.001367
Male      0.001458
Name: PurchaseAmount, dtype: float64
不同年龄段的用户数:
 AgeGroup
18-25    167
26-35    175
36-45    191
46-55    191
56-65    176
65+      100
Name: count, dtype: int64
