In [12]:
import kagglehub
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import shutil

In [None]:
# Load the data.
# The raw CSV is first copied on disk to clean_path; both files are then read.
dirty_path, clean_path = "dirty_cafe_sales.csv", "clean_cafe_sales.csv"
shutil.copyfile(dirty_path, clean_path)

# dirty_df is read from the raw file, clean_df from the copy that was just made,
# so at this point both frames hold the same content.
dirty_df, clean_df = (pd.read_csv(csv_path) for csv_path in (dirty_path, clean_path))

dirty_df.head()



Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,ERROR,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11


In [16]:
# Preview the first rows of the working copy (read from the copied CSV file).
clean_df.head()

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4,1.0,ERROR,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2,2.0,4.0,Digital Wallet,In-store,2023-06-11


In [27]:
def check_data_quality(df, keywords=None):
    """
    Print a per-column report of placeholder markers and NaN values.

    For every column of ``df`` the report shows how many cells contain each
    marker (by default 'UNKNOWN' and 'ERROR') and how many cells are NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.
    keywords : iterable of str, optional
        Markers to count. Defaults to ``("UNKNOWN", "ERROR")``. Matching is
        case-insensitive and by literal substring on the string form of each
        cell.

    Returns
    -------
    None
        The report is only printed to stdout.
    """
    if keywords is None:
        keywords = ("UNKNOWN", "ERROR")
    # Cell text is upper-cased before matching, so the markers must be as well.
    markers = [str(kw).upper() for kw in keywords]

    result = {}
    for col in df.columns:
        # Work on the string form so numeric and mixed-type columns can be searched.
        col_str = df[col].astype(str).str.upper()
        counts = {
            # regex=False: the marker is plain text, not a pattern.
            # na=False: a missing cell never counts as a match.
            kw: int(col_str.str.contains(kw, regex=False, na=False).sum())
            for kw in markers
        }
        counts["NaN"] = int(df[col].isna().sum())
        result[col] = counts

    # Print the report
    print("📊 数据质量检查报告")
    print("-" * 50)
    for col, counts in result.items():
        cells = [f"{name}: {count:3d}" for name, count in counts.items()]
        # str(col): column labels are not guaranteed to be strings (e.g. integer headers),
        # and the 's' format code only accepts str.
        print(f"{str(col):20s} | " + " | ".join(cells))
    print("-" * 50)

    



In [28]:
# Quality report for the raw data as loaded from dirty_path.
check_data_quality(dirty_df)

📊 数据质量检查报告
--------------------------------------------------
Transaction ID       | UNKNOWN:   0 | ERROR:   0 | NaN:   0
Item                 | UNKNOWN: 344 | ERROR: 292 | NaN: 333
Quantity             | UNKNOWN: 171 | ERROR: 170 | NaN: 138
Price Per Unit       | UNKNOWN: 164 | ERROR: 190 | NaN: 179
Total Spent          | UNKNOWN: 165 | ERROR: 164 | NaN: 173
Payment Method       | UNKNOWN: 293 | ERROR: 306 | NaN: 2579
Location             | UNKNOWN: 338 | ERROR: 358 | NaN: 3265
Transaction Date     | UNKNOWN: 159 | ERROR: 142 | NaN: 159
--------------------------------------------------


In [29]:
# Coerce the three numeric columns; entries that cannot be parsed as numbers
# (e.g. 'ERROR', 'UNKNOWN') become NaN.
numeric_cols = ['Total Spent', 'Quantity', 'Price Per Unit']
clean_df[numeric_cols] = clean_df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Repair missing Total Spent values with Quantity * Price Per Unit.
# Rows where Quantity or Price Per Unit is NaN yield NaN here and stay unfilled.
computed_total = clean_df['Quantity'] * clean_df['Price Per Unit']
clean_df['Total Spent'] = clean_df['Total Spent'].fillna(computed_total)




In [30]:
# Re-run the quality report on the working copy after the numeric coercion and Total Spent fill.
check_data_quality(clean_df)

📊 数据质量检查报告
--------------------------------------------------
Transaction ID       | UNKNOWN:   0 | ERROR:   0 | NaN:   0
Item                 | UNKNOWN: 344 | ERROR: 292 | NaN: 333
Quantity             | UNKNOWN:   0 | ERROR:   0 | NaN: 479
Price Per Unit       | UNKNOWN:   0 | ERROR:   0 | NaN: 533
Total Spent          | UNKNOWN:   0 | ERROR:   0 | NaN:  40
Payment Method       | UNKNOWN: 293 | ERROR: 306 | NaN: 2579
Location             | UNKNOWN: 338 | ERROR: 358 | NaN: 3265
Transaction Date     | UNKNOWN: 159 | ERROR: 142 | NaN: 159
--------------------------------------------------


In [31]:
# All rows whose Transaction ID occurs more than once (keep=False marks every occurrence).
duplicate_ids = clean_df[clean_df['Transaction ID'].duplicated(keep=False)]

if not duplicate_ids.empty:
    print("⚠️ 检测到重复的 Transaction ID：")
    print(duplicate_ids[['Transaction ID', 'Item', 'Total Spent']])

    # Keep only the first record for each Transaction ID. The result is assigned back
    # to clean_df so the de-duplication actually applies to the cells that follow.
    rows_before = len(clean_df)
    clean_df = clean_df.drop_duplicates(subset=['Transaction ID'], keep='first')
    # Report the number of rows actually removed, not the number of distinct duplicated IDs.
    print(f"✅ 已删除重复 Transaction ID，共删除 {rows_before - len(clean_df)} 条。")
else:
    print("✅ 没有发现重复的 Transaction ID。")

✅ 没有发现重复的 Transaction ID。


In [33]:
# Preview the working copy after numeric coercion and the Total Spent fill.
clean_df.head()

Unnamed: 0,Transaction ID,Item,Quantity,Price Per Unit,Total Spent,Payment Method,Location,Transaction Date
0,TXN_1961373,Coffee,2.0,2.0,4.0,Credit Card,Takeaway,2023-09-08
1,TXN_4977031,Cake,4.0,3.0,12.0,Cash,In-store,2023-05-16
2,TXN_4271903,Cookie,4.0,1.0,4.0,Credit Card,In-store,2023-07-19
3,TXN_7034554,Salad,2.0,5.0,10.0,UNKNOWN,UNKNOWN,2023-04-27
4,TXN_3160411,Coffee,2.0,2.0,4.0,Digital Wallet,In-store,2023-06-11
