In [None]:
#安装依赖库
!pip install pandas numpy matplotlib seaborn scikit-learn


In [None]:
# 导入所需的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [None]:
# Load the two raw CSV files into DataFrames.
ecommerce_data_path = './data/ecommerce_data.csv'
ecommerce_behavior_path = './data/ecommerce_behavior.csv'

ecommerce_data = pd.read_csv(filepath_or_buffer=ecommerce_data_path)
ecommerce_behavior = pd.read_csv(filepath_or_buffer=ecommerce_behavior_path)

# Preview the first rows of each frame: a header line followed by the head.
print("E-commerce Data:", ecommerce_data.head(), sep="\n")
print("\nE-Commerce Behavior Data:", ecommerce_behavior.head(), sep="\n")


In [None]:
# Impute missing values in both datasets (E-commerce Data first, then
# E-Commerce Behavior).
#
# The filled Series is assigned back to its column rather than calling
# `frame[col].fillna(..., inplace=True)`: that chained in-place form operates
# on an intermediate Series, emits a FutureWarning on recent pandas releases,
# and has no effect on the frame once Copy-on-Write is enabled.
for frame in (ecommerce_data, ecommerce_behavior):
    # 'CustomerID' and 'Country': fill with the most frequent value (mode).
    for column in ('CustomerID', 'Country'):
        frame[column] = frame[column].fillna(frame[column].mode()[0])
    # 'Quantity': fill with the column mean.
    frame['Quantity'] = frame['Quantity'].fillna(frame['Quantity'].mean())


In [None]:
# Helper that removes outliers using the IQR (interquartile range) rule.
def remove_outliers(df, column_name, factor=1.5):
    """Return the rows of ``df`` whose ``column_name`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles of the column and
    ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column_name : str
        Name of the numeric column used to detect outliers.
    factor : float, default 1.5
        Multiplier applied to the IQR to position the fences.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the retained rows, with their original index
        labels. Rows whose ``column_name`` value is NaN are not retained,
        because NaN never satisfies the range comparison.
    """
    q1 = df[column_name].quantile(0.25)
    q3 = df[column_name].quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - factor * iqr
    upper_fence = q3 + factor * iqr
    within_fences = df[column_name].between(lower_fence, upper_fence)
    # Return an explicit copy so callers can assign new columns to the result
    # without triggering SettingWithCopyWarning.
    return df.loc[within_fences].copy()

# Drop 'Quantity' outliers from the E-commerce Data frame.
ecommerce_data = remove_outliers(df=ecommerce_data, column_name='Quantity')

# Drop 'Quantity' outliers from the E-Commerce Behavior frame.
ecommerce_behavior = remove_outliers(df=ecommerce_behavior, column_name='Quantity')


In [None]:
# Standardize the numeric columns of each dataset with StandardScaler.
#
# Each dataset gets its own fitted scaler: re-fitting one shared scaler on the
# second dataset would discard the statistics learned from the first, so that
# dataset could no longer be inverse-transformed or have the same scaling
# applied to new rows.
#
# NOTE(review): 'Quantity' and 'UnitPrice' are overwritten with standardized
# values here, and the later feature-construction cell multiplies these two
# columns into 'TotalPurchaseAmount'. That product is therefore computed on
# z-scores rather than raw amounts -- confirm whether the feature should be
# built before scaling.
numeric_columns = ['Quantity', 'UnitPrice']

# Standardize the numeric columns of E-commerce Data.
ecommerce_scaler = StandardScaler()
ecommerce_data[numeric_columns] = ecommerce_scaler.fit_transform(ecommerce_data[numeric_columns])

# Standardize the numeric columns of E-Commerce Behavior.
behavior_scaler = StandardScaler()
ecommerce_behavior[numeric_columns] = behavior_scaler.fit_transform(ecommerce_behavior[numeric_columns])

# Keep the original `scaler` name available; as before, it ends up holding the
# parameters fitted on the behavior dataset.
scaler = behavior_scaler

# Show the data after standardization.
print("\nE-commerce Data after Standardization:")
print(ecommerce_data.head())

print("\nE-Commerce Behavior Data after Standardization:")
print(ecommerce_behavior.head())


In [None]:
# One-hot encode the categorical 'Country' column of E-commerce Data.
# drop_first=True omits the indicator column of the first category.
# NOTE(review): the two frames are encoded independently, so their resulting
# indicator columns may differ if their sets of countries differ -- confirm
# this is acceptable downstream.
ecommerce_data = pd.get_dummies(data=ecommerce_data, columns=['Country'], drop_first=True)

# One-hot encode the categorical 'Country' column of E-Commerce Behavior.
ecommerce_behavior = pd.get_dummies(data=ecommerce_behavior, columns=['Country'], drop_first=True)

# Show the encoded frames: a header line followed by the head of each.
print("\nE-commerce Data with One-Hot Encoding:", ecommerce_data.head(), sep="\n")
print("\nE-Commerce Behavior Data with One-Hot Encoding:", ecommerce_behavior.head(), sep="\n")


In [None]:
# Feature construction: 'TotalPurchaseAmount' = 'Quantity' * 'UnitPrice'.
# NOTE(review): the standardization cell earlier in this notebook has already
# replaced 'Quantity' and 'UnitPrice' with standardized values, so this
# product is taken over z-scores, not raw quantities and prices -- confirm
# whether the feature should be computed before scaling.

# E-commerce Data.
ecommerce_data['TotalPurchaseAmount'] = ecommerce_data['Quantity'].mul(ecommerce_data['UnitPrice'])

# E-Commerce Behavior.
ecommerce_behavior['TotalPurchaseAmount'] = ecommerce_behavior['Quantity'].mul(ecommerce_behavior['UnitPrice'])

# Show the new feature next to the customer identifier.
print(
    "\nE-commerce Data with new features:",
    ecommerce_data.loc[:, ['CustomerID', 'TotalPurchaseAmount']].head(),
    sep="\n",
)
print(
    "\nE-Commerce Behavior Data with new features:",
    ecommerce_behavior.loc[:, ['CustomerID', 'TotalPurchaseAmount']].head(),
    sep="\n",
)


In [None]:
# Split each dataset into training and test sets (80% / 20%, fixed seed 42).
# NOTE(review): the target 'TotalPurchaseAmount' is computed from 'Quantity'
# and 'UnitPrice', and both columns remain in the feature matrix -- confirm
# this is intended for the downstream model.

# E-commerce Data: target column, then every other column as features.
y_ecommerce = ecommerce_data['TotalPurchaseAmount']
X_ecommerce = ecommerce_data.drop(columns=['TotalPurchaseAmount'])
(
    X_train_ecommerce,
    X_test_ecommerce,
    y_train_ecommerce,
    y_test_ecommerce,
) = train_test_split(X_ecommerce, y_ecommerce, test_size=0.2, random_state=42)

# E-Commerce Behavior: target column, then every other column as features.
y_behavior = ecommerce_behavior['TotalPurchaseAmount']
X_behavior = ecommerce_behavior.drop(columns=['TotalPurchaseAmount'])
(
    X_train_behavior,
    X_test_behavior,
    y_train_behavior,
    y_test_behavior,
) = train_test_split(X_behavior, y_behavior, test_size=0.2, random_state=42)

# Report the shapes of the resulting splits.
print(f'E-commerce Data Training set size: {X_train_ecommerce.shape}, Test set size: {X_test_ecommerce.shape}')
print(f'E-commerce Behavior Training set size: {X_train_behavior.shape}, Test set size: {X_test_behavior.shape}')


In [None]:
# Write both processed frames to new CSV files, without the index column.
ecommerce_data.to_csv(path_or_buf='./data/processed_ecommerce_data.csv', index=False)
ecommerce_behavior.to_csv(path_or_buf='./data/processed_ecommerce_behavior.csv', index=False)

print("Processed data saved successfully!")
