In [6]:
import pandas as pd
import numpy as np

# 1. Load the three raw CSV files (two product tables and the survey responses)
raw_paths = (
    'amazon_eco-friendly_products.csv',
    'amazon_eco_products_数据爬取原始生成.csv',
    'Form-responses-on-sustainability-products.csv',
)
df1, df2, df3 = (pd.read_csv(path, encoding='utf-8') for path in raw_paths)

# 2. Unify column names (English names, to make later merging easier)
def rename_amazon(df):
    """Return ``df`` with raw product column names mapped onto the shared schema.

    A column is renamed only when the source name is present in ``df`` and
    the target name is not. Renaming onto a label that already exists would
    leave two columns with the same name, after which ``df['<name>']``
    returns a DataFrame instead of a Series and the later cleaning steps fail.

    Args:
        df: Product table that may use the raw column names listed in
            ``rename_dict``.

    Returns:
        A new DataFrame with the applicable columns renamed. ``df`` itself is
        left unchanged.
    """
    # Identity entries do not rename anything; they are kept to list the
    # columns that make up the shared schema.
    rename_dict = {
        'id': 'asin',
        'name': 'product_name',
        'category': 'category',
        'material': 'material',
        'brand': 'brand',
        'price': 'price',
        'rating': 'rating',
        'reviewsCount': 'reviews',
        'description': 'description',
        'url': 'url',
    }
    present = set(df.columns)
    applicable = {
        old: new
        for old, new in rename_dict.items()
        if old != new and old in present and new not in present
    }
    return df.rename(columns=applicable)

# Apply the shared column naming to both product tables
df1, df2 = (rename_amazon(frame) for frame in (df1, df2))

# 3. Normalize prices: strip "$" and comma separators, then convert to float
for df in [df1, df2]:
    if 'price' in df.columns:
        # Raw string keeps the pattern free of Python string escapes; inside a
        # character class "$" is a literal and needs no backslash.
        cleaned_price = df['price'].astype(str).str.replace(r'[$,]', '', regex=True)
        # Values that still are not numbers (including missing ones) become NaN.
        df['price'] = pd.to_numeric(cleaned_price, errors='coerce')

# 4. Normalize the rating and review-count columns to numeric values
for df in [df1, df2]:
    if 'rating' in df.columns:
        df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
    if 'reviews' in df.columns:
        # Remove comma separators first so that "1,234" yields 1234 rather
        # than only the leading "1".
        review_text = df['reviews'].astype(str).str.replace(',', '', regex=False)
        # Keep the first run of digits; expand=False returns a Series, and
        # rows without any digit become NaN.
        first_number = review_text.str.extract(r'(\d+)', expand=False)
        df['reviews'] = pd.to_numeric(first_number, errors='coerce')

# 5. Normalize text columns (trim surrounding whitespace, lowercase)
for df in [df1, df2]:
    for col in ['product_name', 'category', 'material', 'brand', 'description']:
        if col in df.columns:
            normalized = df[col].astype(str).str.strip().str.lower()
            # astype(str) turns missing values into the literal text 'nan';
            # put real missing values back so isna()/fillna() still detect them.
            df[col] = normalized.where(df[col].notna())

# 6. Rename the survey columns to short English identifiers for later analysis.
# Keys are the question texts used as column headers; values are the new names.
survey_column_names = {
    'Do you consider your effect on the environment as a consumer before purchasing general day to day products?': 'env_consideration',
    'Do you actively seek out information on how sustainable products are made?': 'seek_info',
    'Do you recycle product packaging?': 'recycle_packaging',
    'What barriers, if any, prevent you from purchasing sustainable products more often?': 'purchase_barriers',
    'Are you willing to pay a premium for sustainable products compared to non-sustainable products?': 'pay_premium',
    'On a scale of 1 to 10, how likely are you to purchase a product if it is labeled as sustainable or eco-friendly?': 'purchase_likelihood',
    'What types of sustainable products do you currently use or have purchased in the past?': 'used_products',
    'How do you usually learn about sustainable products? ': 'info_channel',
    'Given the following options, what will you choose?': 'choice_option',
    'What changes, if any, would encourage you to purchase more sustainable products in the future?': 'encourage_purchase',
    'What do you usually carry on your shopping days?': 'shopping_bag',
    'What factors influence your decision to choose one sustainable product over another?': 'product_choice_factor',
    'Today, we produce about 400 million tones of plastic waste every year.  Around the world, one million plastic drinking bottles are purchased every minute, while 500 billion single-use plastic bags are used worldwide every year.\n\nAfter reading this information, from a scale of 1 to 5 how likely are you to buy non-reusable plastic water bottles ?': 'plastic_bottle_likelihood',
    'How important are certificates and labels in your decision to purchase sustainable products?': 'cert_label_importance',
    'Are you willing to change your consumption habits to reduce your environmental impact?': 'willing_change_habit',
    'Are you interested in learning more about sustainable products and their benefits?': 'interest_learn',
}
df3 = df3.rename(columns=survey_column_names)

# 7. Convert the numeric survey answers to numbers
for col in ['purchase_likelihood', 'plastic_bottle_likelihood']:
    # Skip a column that is absent (for example when the question header did
    # not match the rename mapping) instead of raising KeyError.
    if col in df3.columns:
        # Answers that cannot be parsed as numbers become NaN.
        df3[col] = pd.to_numeric(df3[col], errors='coerce')

# 8. Normalize the categorical survey answers (trim whitespace, lowercase)
for col in ['env_consideration', 'seek_info', 'recycle_packaging', 'pay_premium', 'cert_label_importance', 'willing_change_habit', 'interest_learn']:
    if col in df3.columns:
        normalized = df3[col].astype(str).str.strip().str.lower()
        # astype(str) turns missing answers into the literal text 'nan';
        # put real missing values back so isna()/fillna() still detect them.
        df3[col] = normalized.where(df3[col].notna())

# 9. Write the format-unified datasets to disk
unified_outputs = (
    (df1, 'amazon_eco-friendly_products_unified.csv'),
    (df2, 'amazon_eco_products_数据爬取原始生成_unified.csv'),
    (df3, 'Form-responses-on-sustainability-products_unified.csv'),
)
for frame, target in unified_outputs:
    frame.to_csv(target, index=False, encoding='utf-8-sig')

print('数据整合与格式统一完成！')

数据整合与格式统一完成！


In [7]:
import pandas as pd
import numpy as np

# 1. Load the format-unified CSV files
unified_paths = (
    'amazon_eco-friendly_products_unified.csv',
    'amazon_eco_products_数据爬取原始生成_unified.csv',
    'Form-responses-on-sustainability-products_unified.csv',
)
df1, df2, df3 = (pd.read_csv(path, encoding='utf-8') for path in unified_paths)

# 2. Fill missing values in the two product tables
numeric_fields = ('price', 'rating', 'reviews')
text_fields = ('product_name', 'category', 'material', 'brand', 'description')
for frame in (df1, df2):
    # Numeric gaps are filled with the column median
    for field in numeric_fields:
        if field not in frame.columns:
            continue
        frame[field] = frame[field].fillna(frame[field].median())
    # Text gaps are filled with the placeholder '未知'
    for field in text_fields:
        if field not in frame.columns:
            continue
        frame[field] = frame[field].fillna('未知')

# Fill missing values in the survey data
for col in df3.columns:
    column = df3[col]
    # Detect text columns by dtype family rather than comparing with 'O', so
    # columns held in a pandas string dtype are also treated as text and are
    # never passed to median(), which raises TypeError on strings.
    is_text = (
        pd.api.types.is_object_dtype(column)
        or pd.api.types.is_string_dtype(column)
    )
    if is_text:
        # Text gaps are filled with the placeholder '未知'
        df3[col] = column.fillna('未知')
    else:
        # Every other column is filled with its median
        df3[col] = column.fillna(column.median())

# 3. Outlier handling: drop rows with a negative price or a price above the
#    99th percentile
def _drop_price_outliers(frame):
    """Return ``frame`` without rows whose price is negative or extreme.

    Args:
        frame: Product table; returned unchanged when it has no 'price'
            column or when no percentile can be computed.

    Returns:
        An independent copy holding only rows with
        ``0 <= price <= 99th percentile``.
    """
    if 'price' not in frame.columns:
        return frame
    price = frame['price']
    cap = price.quantile(0.99)
    if pd.isna(cap):
        # No usable prices at all; filtering would discard every row.
        return frame
    # "<=" keeps rows sitting exactly on the threshold, so a table whose
    # prices are all equal is not emptied. copy() detaches the result from
    # the original frame so later column assignments are unambiguous.
    return frame[(price >= 0) & (price <= cap)].copy()


# Rebind the module-level names: assigning to a loop variable would leave
# df1 and df2 themselves unfiltered.
df1 = _drop_price_outliers(df1)
df2 = _drop_price_outliers(df2)

# 4. Text standardization (trim surrounding whitespace, lowercase)
product_text_fields = ['product_name', 'category', 'material', 'brand', 'description']
for frame in (df1, df2):
    available = [name for name in product_text_fields if name in frame.columns]
    for name in available:
        as_text = frame[name].astype(str)
        frame[name] = as_text.str.strip().str.lower()

# Same treatment for every object-dtype column of the survey table
survey_text_fields = df3.select_dtypes(include='object').columns
for name in survey_text_fields:
    as_text = df3[name].astype(str)
    df3[name] = as_text.str.strip().str.lower()

# 5. Remove fully duplicated rows from each dataset, modifying it in place
for frame in (df1, df2, df3):
    frame.drop_duplicates(inplace=True)

# 6. Enforce numeric dtypes; values that cannot be parsed become NaN
for frame in (df1, df2):
    product_numeric = [name for name in ('price', 'rating', 'reviews') if name in frame.columns]
    for name in product_numeric:
        frame[name] = pd.to_numeric(frame[name], errors='coerce')

survey_numeric = [
    name
    for name in ('purchase_likelihood', 'plastic_bottle_likelihood')
    if name in df3.columns
]
for name in survey_numeric:
    df3[name] = pd.to_numeric(df3[name], errors='coerce')

# 7. Write the cleaned datasets to disk
final_outputs = (
    (df1, 'amazon_eco-friendly_products_final.csv'),
    (df2, 'amazon_eco_products_数据爬取原始生成_final.csv'),
    (df3, 'Form-responses-on-sustainability-products_final.csv'),
)
for frame, target in final_outputs:
    frame.to_csv(target, index=False, encoding='utf-8-sig')

print('三份数据集已完成进一步数据预处理并保存！')

三份数据集已完成进一步数据预处理并保存！
