In [2]:
# 导入所需的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [10]:
# 设置chunk_size
chunk_size = 50000  # 每次读取50000行数据

# 读取E-commerce Data
ecommerce_data_path = './data/ecommerce_data.csv'
ecommerce_data_cols = ['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country']

# 创建一个空DataFrame来保存处理后的数据
ecommerce_data_processed = pd.DataFrame()

# 逐块读取E-commerce Data
chunks = pd.read_csv(ecommerce_data_path, encoding='ISO-8859-1', usecols=ecommerce_data_cols, chunksize=chunk_size)

for chunk in chunks:
    # 填补缺失值（不使用 inplace=True）
    chunk['CustomerID'] = chunk['CustomerID'].fillna(chunk['CustomerID'].mode()[0])
    chunk['Country'] = chunk['Country'].fillna(chunk['Country'].mode()[0])
    chunk['Quantity'] = chunk['Quantity'].fillna(chunk['Quantity'].mean())

    # 数据处理：计算总购买金额
    chunk['TotalPurchaseAmount'] = chunk['Quantity'] * chunk['UnitPrice']
    
    # 将当前块数据追加到最终的DataFrame
    ecommerce_data_processed = pd.concat([ecommerce_data_processed, chunk], ignore_index=True)

# 查看处理后的E-commerce Data
print("Processed E-commerce Data:")
print(ecommerce_data_processed.head())


Processed E-commerce Data:
  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   

      InvoiceDate  UnitPrice  CustomerID         Country  TotalPurchaseAmount  
0  12/1/2010 8:26       2.55     17850.0  United Kingdom                15.30  
1  12/1/2010 8:26       3.39     17850.0  United Kingdom                20.34  
2  12/1/2010 8:26       2.75     17850.0  United Kingdom                22.00  
3  12/1/2010 8:26       3.39     17850.0  United Kingdom                20.34  
4  12/1/2010 8:26       3.39     17850.0  United Kingdom                20.34  


In [12]:
# 读取E-Commerce Behavior Data
ecommerce_behavior_path = './data/ecommerce_behavior.csv'
ecommerce_behavior_cols = ['event_time', 'event_type', 'product_id', 'category_id', 'category_code', 'brand', 'price', 'user_id', 'user_session']

# 创建一个空DataFrame来保存处理后的数据
ecommerce_behavior_processed = pd.DataFrame()

# 逐块读取E-Commerce Behavior Data
chunks = pd.read_csv(ecommerce_behavior_path, encoding='ISO-8859-1', usecols=ecommerce_behavior_cols, chunksize=chunk_size)

for chunk in chunks:
    # 填补缺失值（不使用 inplace=True）
    chunk['user_id'] = chunk['user_id'].fillna(chunk['user_id'].mode()[0])
    chunk['price'] = chunk['price'].fillna(chunk['price'].mean())
    chunk['event_type'] = chunk['event_type'].fillna(chunk['event_type'].mode()[0])

    # 数据处理：计算总购买金额
    chunk['TotalPurchaseAmount'] = chunk['price'] * 1  # 假设每次事件都对应一次购买
    
    # 将当前块数据追加到最终的DataFrame
    ecommerce_behavior_processed = pd.concat([ecommerce_behavior_processed, chunk], ignore_index=True)

# 查看处理后的E-commerce Behavior Data
print("Processed E-commerce Behavior Data:")
print(ecommerce_behavior_processed.head())


Processed E-commerce Behavior Data:
                event_time event_type  product_id          category_id  \
0  2019-10-01 00:00:00 UTC       view    44600062  2103807459595387724   
1  2019-10-01 00:00:00 UTC       view     3900821  2053013552326770905   
2  2019-10-01 00:00:01 UTC       view    17200506  2053013559792632471   
3  2019-10-01 00:00:01 UTC       view     1307067  2053013558920217191   
4  2019-10-01 00:00:04 UTC       view     1004237  2053013555631882655   

                         category_code     brand    price    user_id  \
0                                  NaN  shiseido    35.79  541312140   
1  appliances.environment.water_heater      aqua    33.20  554748717   
2           furniture.living_room.sofa       NaN   543.10  519107250   
3                   computers.notebook    lenovo   251.74  550050854   
4               electronics.smartphone     apple  1081.98  535871217   

                           user_session  TotalPurchaseAmount  
0  72d76fde-8bb3-4e00-8

In [13]:
# 使用IQR方法剔除异常值的函数
def remove_outliers(df, column_name):
    Q1 = df[column_name].quantile(0.25)
    Q3 = df[column_name].quantile(0.75)
    IQR = Q3 - Q1
    return df[(df[column_name] >= (Q1 - 1.5 * IQR)) & (df[column_name] <= (Q3 + 1.5 * IQR))]

# 对E-commerce Data处理异常值
ecommerce_data_processed = remove_outliers(ecommerce_data_processed, 'Quantity')

# 对E-commerce Behavior数据处理异常值
ecommerce_behavior_processed = remove_outliers(ecommerce_behavior_processed, 'price')


In [14]:
# 初始化标准化对象
scaler = StandardScaler()

# 对E-commerce Data的数值列进行标准化
ecommerce_data_processed[['Quantity', 'UnitPrice']] = scaler.fit_transform(ecommerce_data_processed[['Quantity', 'UnitPrice']])

# 对E-commerce Behavior数据的数值列进行标准化
ecommerce_behavior_processed[['price']] = scaler.fit_transform(ecommerce_behavior_processed[['price']])

# 查看数据标准化后的结果
print("\nE-commerce Data after Standardization:")
print(ecommerce_data_processed.head())

print("\nE-Commerce Behavior Data after Standardization:")
print(ecommerce_behavior_processed.head())



E-commerce Data after Standardization:
  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER  0.313548   
1    536365     71053                  WHITE METAL LANTERN  0.313548   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER  0.753492   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE  0.313548   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.  0.313548   

      InvoiceDate  UnitPrice  CustomerID         Country  TotalPurchaseAmount  
0  12/1/2010 8:26  -0.024046     17850.0  United Kingdom                15.30  
1  12/1/2010 8:26  -0.015847     17850.0  United Kingdom                20.34  
2  12/1/2010 8:26  -0.022094     17850.0  United Kingdom                22.00  
3  12/1/2010 8:26  -0.015847     17850.0  United Kingdom                20.34  
4  12/1/2010 8:26  -0.015847     17850.0  United Kingdom                20.34  

E-Commerce Behavior Data after Standardization

In [15]:
# 对E-commerce Data的类别型数据进行One-Hot编码
ecommerce_data_processed = pd.get_dummies(ecommerce_data_processed, columns=['Country'], drop_first=True)

# 对E-commerce Behavior数据的类别型数据进行One-Hot编码
ecommerce_behavior_processed = pd.get_dummies(ecommerce_behavior_processed, columns=['event_type'], drop_first=True)

# 查看编码后的数据
print("\nE-commerce Data with One-Hot Encoding:")
print(ecommerce_data_processed.head())

print("\nE-Commerce Behavior Data with One-Hot Encoding:")
print(ecommerce_behavior_processed.head())



E-commerce Data with One-Hot Encoding:
  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER  0.313548   
1    536365     71053                  WHITE METAL LANTERN  0.313548   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER  0.753492   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE  0.313548   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.  0.313548   

      InvoiceDate  UnitPrice  CustomerID  TotalPurchaseAmount  \
0  12/1/2010 8:26  -0.024046     17850.0                15.30   
1  12/1/2010 8:26  -0.015847     17850.0                20.34   
2  12/1/2010 8:26  -0.022094     17850.0                22.00   
3  12/1/2010 8:26  -0.015847     17850.0                20.34   
4  12/1/2010 8:26  -0.015847     17850.0                20.34   

   Country_Austria  Country_Bahrain  ...  Country_RSA  Country_Saudi Arabia  \
0            False            False  ...        False    

In [16]:
# 对E-commerce Data进行特征构造：用户的总购买金额
ecommerce_data_processed['TotalPurchaseAmount'] = ecommerce_data_processed['Quantity'] * ecommerce_data_processed['UnitPrice']

# 对E-commerce Behavior数据进行特征构造：用户的总购买金额
ecommerce_behavior_processed['TotalPurchaseAmount'] = ecommerce_behavior_processed['price'] * 1  # 假设每次事件都对应一次购买

# 查看新特征
print("\nE-commerce Data with new features:")
print(ecommerce_data_processed[['CustomerID', 'TotalPurchaseAmount']].head())

print("\nE-Commerce Behavior Data with new features:")
print(ecommerce_behavior_processed[['user_id', 'TotalPurchaseAmount']].head())



E-commerce Data with new features:
   CustomerID  TotalPurchaseAmount
0     17850.0            -0.007540
1     17850.0            -0.004969
2     17850.0            -0.016648
3     17850.0            -0.004969
4     17850.0            -0.004969

E-Commerce Behavior Data with new features:
     user_id  TotalPurchaseAmount
0  541312140            -0.904860
1  554748717            -0.919062
2  519107250             1.876886
3  550050854             0.279264
6  555447699             0.987820


In [17]:
# 对E-commerce Data进行数据集划分
X_ecommerce = ecommerce_data_processed.drop(['TotalPurchaseAmount'], axis=1)  # 特征列
y_ecommerce = ecommerce_data_processed['TotalPurchaseAmount']  # 目标列
X_train_ecommerce, X_test_ecommerce, y_train_ecommerce, y_test_ecommerce = train_test_split(X_ecommerce, y_ecommerce, test_size=0.2, random_state=42)

# 对E-commerce Behavior数据进行数据集划分
X_behavior = ecommerce_behavior_processed.drop(['TotalPurchaseAmount'], axis=1)  # 特征列
y_behavior = ecommerce_behavior_processed['TotalPurchaseAmount']  # 目标列
X_train_behavior, X_test_behavior, y_train_behavior, y_test_behavior = train_test_split(X_behavior, y_behavior, test_size=0.2, random_state=42)

# 输出划分后的数据集大小
print(f'E-commerce Data Training set size: {X_train_ecommerce.shape}, Test set size: {X_test_ecommerce.shape}')
print(f'E-commerce Behavior Training set size: {X_train_behavior.shape}, Test set size: {X_test_behavior.shape}')


E-commerce Data Training set size: (386632, 44), Test set size: (96658, 44)
E-commerce Behavior Training set size: (31021748, 10), Test set size: (7755438, 10)


In [20]:
# 保存处理后的数据为新的CSV文件
ecommerce_data_processed.to_csv('./data/processed_ecommerce_data.csv', index=False)
ecommerce_behavior_processed.to_csv('./data/processed_ecommerce_behavior.csv', index=False)

print("Processed data saved successfully!")


Processed data saved successfully!
