In [None]:
import os
import sys

# Resolve the directory one level above the current working directory and put it
# at the front of sys.path, but only when it is not already listed there.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)


In [1]:
# Imports used by this notebook: standard library, third-party, then project code.
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Add the project root to the Python path so we can import our module.
# The path is made absolute and inserted at the front only if it is missing:
# a relative entry such as '../' depends on the working directory in effect when
# the import system resolves it, and appending it unconditionally adds one more
# duplicate entry every time this cell is re-run.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Import our custom cleaning class.
# NOTE(review): the saved output of this cell is an ImportError saying the module
# src/data/clean_data.py was found but has no name 'RealEstateDataCleaner'.
# Confirm the class name in that file; if the module was edited after it was first
# imported, restart the kernel so the stale copy is dropped.
from src.data.clean_data import RealEstateDataCleaner

# Set up plotting
plt.style.use('default')
sns.set_palette("husl")
print("Libraries imported successfully!")

ImportError: cannot import name 'RealEstateDataCleaner' from 'src.data.clean_data' (c:\Users\raoux\Documents\Dev\DS\real-estate-ml-project\notebooks\..\src\data\clean_data.py)

In [None]:
# Build the cleaner object used by the cells below
cleaner = RealEstateDataCleaner()

# IMPORTANT: point this at your actual raw data file before running
data_path = "../data/raw/kc_house_data.csv"  # CHANGE THIS!

# Read the file through the cleaner; a None result is treated as a failed load
raw_data = cleaner.load_data(data_path)

if raw_data is None:
    print("Failed to load data. Check your file path!")
else:
    # Quick look at what was loaded: first rows, then (rows, columns)
    print("Data loaded successfully!")
    print("\nFirst 5 rows:")
    print(raw_data.head())
    print(f"\nDataset shape: {raw_data.shape}")

In [None]:
# Generate and display comprehensive quality report.
# Both calls go through the `cleaner` instance created in the data-loading cell.
# Presumably the second call prints what the first one computed; confirm in
# src/data/clean_data.py.
cleaner.generate_data_quality_report()
# This is the cell's last expression, so the notebook also echoes its return
# value if that value is not None.
cleaner.print_data_quality_report()

In [None]:
# Bar chart of the share of missing values per column, taken from the cleaner's report
if not cleaner.cleaning_report['missing_values']:
    print("No missing data found!")
else:
    # One row per report entry, holding its missing-value count
    missing_data = pd.DataFrame.from_dict(
        cleaner.cleaning_report['missing_values'],
        orient='index',
        columns=['Missing_Count'],
    )
    # Express each count as a percentage of the report's total row count
    missing_share = missing_data['Missing_Count'] / cleaner.cleaning_report['total_rows']
    missing_data['Missing_Percentage'] = missing_share * 100

    # Draw on an explicit Axes instead of relying on pyplot's implicit current figure
    fig, ax = plt.subplots(figsize=(12, 6))
    missing_data['Missing_Percentage'].plot(kind='bar', color='red', alpha=0.7, ax=ax)
    ax.set_title('Missing Data by Column')
    ax.set_ylabel('Percentage Missing')
    ax.set_xlabel('Columns')
    plt.setp(ax.get_xticklabels(), rotation=45)
    fig.tight_layout()
    plt.show()

    print("Missing Data Summary:")
    print(missing_data)

In [None]:
# Step 1: run the cleaner's missing-value handling with the 'default' strategy
print("Step 1: Handling missing values...")
cleaned_data = cleaner.handle_missing_values(strategy='default')
print(f"Data shape after cleaning: {cleaned_data.shape}")

# Step 2: drop duplicates; the result is read back from cleaner.cleaned_data
print("\nStep 2: Removing duplicates...")
cleaner.remove_duplicates()
print(f"Data shape after removing duplicates: {cleaner.cleaned_data.shape}")

# Per-column count of values that are still missing after both steps
remaining_missing = cleaner.cleaned_data.isna().sum()
print("\nRemaining missing values:")
print(remaining_missing.loc[remaining_missing > 0])

# The counts are non-negative, so "no non-zero count" means nothing is missing
if not remaining_missing.any():
    print("✅ All missing values handled successfully!")

In [None]:
# Step 3: ask the cleaner for outliers using the 'iqr' method
print("Step 3: Detecting outliers...")
outliers_info = cleaner.detect_outliers(method='iqr')

# One summary line per entry: its outlier count and percentage
print("\nOUTLIER SUMMARY:")
print("-" * 50)
for col in outliers_info:
    info = outliers_info[col]
    print(f"{col}: {info['count']} outliers ({info['percentage']:.2f}%)")

In [None]:
# Write the cleaned data to disk through the cleaner
output_path = "../data/processed/cleaned_housing_data.csv"

success = cleaner.save_cleaned_data(output_path)

if not success:
    print("❌ Failed to save cleaned data")
else:
    print(f"✅ Cleaned data saved to: {output_path}")

    # Read the file back to check what actually landed on disk
    saved_data = pd.read_csv(output_path)
    print(f"Saved data shape: {saved_data.shape}")
    print(f"Missing values in saved data: {saved_data.isnull().sum().sum()}")

In [None]:
# Alternative: run everything through the cleaner's single pipeline call
print("Testing complete pipeline method...")

# Use a fresh cleaner so this run does not share state with the one above
pipeline_cleaner = RealEstateDataCleaner()

# IMPORTANT: UPDATE THESE PATHS
input_file = "../data/raw/kc_house_data.csv"   # CHANGE THIS!
output_file = "../data/processed/pipeline_cleaned_data.csv"

# Pipeline settings gathered in one place and passed as keyword arguments
pipeline_options = {
    "input_path": input_file,
    "output_path": output_file,
    "missing_strategy": 'default',
    "remove_outliers": False,
}
success = pipeline_cleaner.clean_data_pipeline(**pipeline_options)

# Report the outcome based on the truthiness of the pipeline's return value
print("✅ Pipeline completed successfully!" if success else "❌ Pipeline failed!")