In [None]:
# E-commerce Data Cleaning and Preparation
## Project Overview
"""
This notebook focuses on cleaning and preparing e-commerce data for analysis. We'll handle missing values, data type conversions, and create a clean dataset for further analysis.

### Steps:
1. Data Loading and Initial Exploration
2. Data Quality Assessment
3. Handling Missing Values
4. Data Type Conversions
5. Feature Engineering
6. Data Validation
7. Saving Cleaned Data
  """ 
# Import libraries
import sys
from pathlib import Path

import pandas as pd

# Add shared modules to path
# NOTE(review): this is an absolute, machine-specific path, while the error
# message below points the reader at '../../shared' — confirm which location
# is intended. Also note that the directory appended here is '.../shared'
# itself, whereas the import below is `shared.utils.helpers`; presumably the
# package resolves some other way (e.g. via the working directory) — TODO confirm.
sys.path.append('C:/Data Science Projects/data-engineering-portfolio/shared')

# Fix for missing 'shared' module
# If 'shared' is a local folder, ensure it exists and is in the correct path.
# If it's a package, install it using pip.
try:
    # Project-local helpers: validate_data is called on each DataFrame further
    # down; VisualizationHelper is used right below to configure plotting.
    from shared.utils.helpers import validate_data, VisualizationHelper
except ModuleNotFoundError:
    # Try installing if it's a pip package (uncommon for 'shared')
    # %pip install shared
    print("‚ö†Ô∏è 'shared' module not found. Please ensure '../../shared' exists and contains 'utils/helpers.py'.")
    # Re-raise the original error so execution stops here with the hint above,
    # rather than continuing without validate_data / VisualizationHelper.
    raise

# Setup plotting
VisualizationHelper.setup_plotting()
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load raw data
print("üìÇ Loading raw data...")

# Display label -> source CSV. Insertion order drives both the read order and
# the order of the shape report below.
raw_sources = {
    "Customers": '../data/raw/ecommerce_customers.csv',
    "Products": '../data/raw/ecommerce_products.csv',
    "Transactions": '../data/raw/ecommerce_transactions.csv',
}
raw_frames = {label: pd.read_csv(csv_path) for label, csv_path in raw_sources.items()}

# Expose the frames under the names the rest of the notebook uses.
customers = raw_frames["Customers"]
products = raw_frames["Products"]
transactions = raw_frames["Transactions"]

# Report (rows, columns) for each table.
for label, frame in raw_frames.items():
    print(f"{label}: {frame.shape}")
# Initial data validation
print("üîç Initial Data Validation")
print("=" * 50)

# Run the shared validator over each raw table, in load order.
for raw_frame, table_label in (
    (customers, "Customers"),
    (products, "Products"),
    (transactions, "Transactions"),
):
    validate_data(raw_frame, table_label)

print("üîÑ Converting data types...")

# Fail fast, before touching any frame, if the raw schema is not what this
# notebook expects. A bare `KeyError: 'signup_date'` gives no hint about which
# table is wrong or what the column is actually called; this message names the
# table, the missing column(s) and the columns that are available.
expected_columns = (
    ("customers", customers, ["signup_date"]),
    ("products", products, ["price", "cost"]),
    ("transactions", transactions, ["date", "quantity"]),
)
for table_name, frame, needed in expected_columns:
    missing = [column for column in needed if column not in frame.columns]
    if missing:
        raise KeyError(
            f"{table_name} is missing expected column(s) {missing}; "
            f"available columns: {list(frame.columns)}"
        )

# Convert dates (unparseable values raise, so bad dates are not silently lost)
customers['signup_date'] = pd.to_datetime(customers['signup_date'])
transactions['date'] = pd.to_datetime(transactions['date'])

# Ensure numeric columns are proper types; errors='coerce' turns values that
# cannot be parsed into NaN so they surface in the missing-value handling.
products['price'] = pd.to_numeric(products['price'], errors='coerce')
products['cost'] = pd.to_numeric(products['cost'], errors='coerce')
transactions['quantity'] = pd.to_numeric(transactions['quantity'], errors='coerce')

print("‚úÖ Data types converted")
print("üîß Handling missing values...")

# Report the total number of null cells per table before any cleaning
print("\nMissing values before cleaning:")
for table_label, raw_frame in (
    ("Customers:", customers),
    ("Products:", products),
    ("Transactions:", transactions),
):
    print(table_label, raw_frame.isnull().sum().sum())

# Customers and transactions: any row containing a null is dropped.
# Products: all rows are kept; only 'category' and 'price' receive defaults
# (the median price is taken over the raw products table). Nulls in other
# product columns are left as they are.
median_price = products['price'].median()
product_defaults = {'category': 'Unknown', 'price': median_price}

customers_clean = customers.dropna()
products_clean = products.fillna(product_defaults)
transactions_clean = transactions.dropna()

print("\n‚úÖ Missing values handled")
print("üßπ Removing duplicates...")


def _drop_exact_duplicates(frame, label):
    """Drop fully identical rows from `frame`, printing the row count before and after.

    `label` is the table name used in the printed report. Returns the
    de-duplicated frame; the input frame is not modified.
    """
    print(f"{label} before: {len(frame)}")
    deduplicated = frame.drop_duplicates()
    print(f"{label} after: {len(deduplicated)}")
    return deduplicated


customers_clean = _drop_exact_duplicates(customers_clean, "Customers")

products_clean = _drop_exact_duplicates(products_clean, "Products")

transactions_clean = _drop_exact_duplicates(transactions_clean, "Transactions")
# Feature engineering
print("üéØ Creating new features...")

# Customer tenure: whole days between signup and the most recent transaction
# date in the cleaned transactions.
latest_date = transactions_clean['date'].max()
# .assign() builds a new frame instead of writing a column into the result of
# dropna()/drop_duplicates(), which can trigger SettingWithCopyWarning.
customers_clean = customers_clean.assign(
    tenure_days=(latest_date - customers_clean['signup_date']).dt.days
)

# Price segments for products. The top bin is open-ended and the lowest edge is
# inclusive so that prices above 500 or exactly 0 still get a segment instead
# of silently becoming NaN. Prices in (0, 500] are labelled exactly as before.
price_bins = [0, 50, 100, 200, float('inf')]
price_labels = ['Budget', 'Mid-range', 'Premium', 'Luxury']

# Profit margin as a percentage of price. A zero price is masked to NaN in the
# denominator so the margin is NaN rather than +/-inf.
product_price = products_clean['price']
nonzero_price = product_price.where(product_price != 0)

products_clean = products_clean.assign(
    price_segment=pd.cut(
        product_price,
        bins=price_bins,
        labels=price_labels,
        include_lowest=True,
    ),
    profit_margin=((product_price - products_clean['cost']) / nonzero_price) * 100,
)

print("‚úÖ New features created")
# Create enriched transactions dataset
print("üîó Creating enriched transactions dataset...")

customer_columns = ['customer_id', 'location', 'tier', 'tenure_days']
product_columns = ['product_id', 'product_name', 'category', 'price', 'price_segment', 'profit_margin']

# Left joins keep every transaction even when the customer or product is
# unknown. validate='many_to_one' makes pandas raise a MergeError if a
# customer_id / product_id appears more than once on the lookup side:
# drop_duplicates() only removes fully identical rows, so a repeated key would
# otherwise duplicate transactions and inflate revenue without any error.
# NOTE(review): assumes transactions has no 'price' column of its own; if it
# does, the merge suffixes both to price_x / price_y — confirm against the raw schema.
enriched_transactions = transactions_clean.merge(
    customers_clean[customer_columns],
    on='customer_id',
    how='left',
    validate='many_to_one',
).merge(
    products_clean[product_columns],
    on='product_id',
    how='left',
    validate='many_to_one',
)

# Calculate business metrics
enriched_transactions['revenue'] = enriched_transactions['quantity'] * enriched_transactions['price']
# profit = quantity * price * margin%, with profit_margin stored as a percentage
enriched_transactions['profit'] = enriched_transactions['quantity'] * (enriched_transactions['price'] * (enriched_transactions['profit_margin'] / 100))

# Add time-based features
enriched_transactions['month'] = enriched_transactions['date'].dt.to_period('M')
enriched_transactions['day_of_week'] = enriched_transactions['date'].dt.day_name()
# dayofweek is 0 for Monday ... 6 for Sunday, so >= 5 means Saturday or Sunday
enriched_transactions['is_weekend'] = enriched_transactions['date'].dt.dayofweek >= 5
print(f"‚úÖ Enriched transactions created: {enriched_transactions.shape}")
# Data validation after cleaning
print("üîç Final Data Validation")
print("=" * 50)

# Re-run the shared validator on the cleaned / enriched tables.
for cleaned_frame, table_label in (
    (customers_clean, "Cleaned Customers"),
    (products_clean, "Cleaned Products"),
    (enriched_transactions, "Enriched Transactions"),
):
    validate_data(cleaned_frame, table_label)

# Check data quality metrics
print("\nüìä Data Quality Summary:")
print(f"Total Customers: {len(customers_clean)}")
print(f"Total Products: {len(products_clean)}")
print(f"Total Transactions: {len(enriched_transactions)}")

# Revenue total, formatted with thousands separators and two decimals.
revenue_total = enriched_transactions['revenue'].sum()
print(f"Total Revenue: ${revenue_total:,.2f}")

# Earliest and latest transaction dates in the enriched table.
transaction_dates = enriched_transactions['date']
print(f"Date Range: {transaction_dates.min()} to {transaction_dates.max()}")
# Save cleaned data
print("üíæ Saving cleaned data...")

# to_csv() does not create missing parent directories, so make sure the output
# folder exists (no-op when it is already there).
processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

# index=False: the row index carries no information worth persisting.
customers_clean.to_csv(processed_dir / 'customers_clean.csv', index=False)
products_clean.to_csv(processed_dir / 'products_clean.csv', index=False)
enriched_transactions.to_csv(processed_dir / 'enriched_transactions.csv', index=False)
print("‚úÖ Cleaned data saved to data/processed/")

# Create a basic summary for quick analysis: one row of headline numbers
# (row counts, revenue total/mean, first and last transaction date).
revenue = enriched_transactions['revenue']
transaction_dates = enriched_transactions['date']

summary_stats = {
    'total_customers': len(customers_clean),
    'total_products': len(products_clean),
    'total_transactions': len(enriched_transactions),
    'total_revenue': revenue.sum(),
    'avg_transaction_value': revenue.mean(),
    'date_range_start': transaction_dates.min(),
    'date_range_end': transaction_dates.max(),
}

# Wrap the dict in a list so it becomes a single-row DataFrame, then persist it.
summary_df = pd.DataFrame([summary_stats])
summary_df.to_csv('../data/processed/summary_statistics.csv', index=False)

# Closing banner and pointers to the follow-up work, printed one line per entry.
closing_lines = (
    "\nüéâ Data cleaning completed successfully!",
    "\nNext steps:",
    "1. Run 02_eda.ipynb for exploratory data analysis",
    "2. Check the processed data in data/processed/",
    "3. Proceed to feature engineering and modeling",
)
for closing_line in closing_lines:
    print(closing_line)


INFO:utils.helpers:Plotting style configured: seaborn, notebook


INFO:utils.helpers:Plotting style configured: seaborn, notebook


üìÇ Loading raw data...
Customers: (500, 8)
Products: (1000, 10)
Transactions: (1000, 7)
üîç Initial Data Validation
üîÑ Converting data types...


INFO:utils.helpers:Plotting style configured: seaborn, notebook


üìÇ Loading raw data...
Customers: (500, 8)
Products: (1000, 10)
Transactions: (1000, 7)
üîç Initial Data Validation
üîÑ Converting data types...


KeyError: 'signup_date'