# Data Preprocessing and Cleaning

## Import required libraries

In [None]:
import pandas as pd
import os
from datetime import datetime

In [None]:


class SimpleCSVAnalyzer:
    """Simple CSV file analyzer - Final Version with fixes"""
    
    def __init__(self):
        self.data_folder = r"C:\Users\sit\Downloads\Capstone2\Data"
        self.csv_files = {}
        self.final_data = None
    
    # Step 1: Load ResaleFlat CSV Files
    def load_all_csv_files(self):
        print("Step 1: Loading ResaleFlat CSV files...")
        print("=" * 50)
        os.chdir(self.data_folder)
        all_files = os.listdir(self.data_folder)

        resale_flat_files = [f for f in all_files if f.endswith('.csv') and f.startswith('ResaleFlat')]
        if not resale_flat_files:
            print("‚ùå No CSV files starting with 'ResaleFlat' found!")
            return

        print(f"Found {len(resale_flat_files)} ResaleFlat CSV files:")
        for i, file in enumerate(resale_flat_files, 1):
            print(f"   {i}. {file}")
        
        for file_name in resale_flat_files:
            try:
                data = pd.read_csv(file_name)
                # ‚úÖ Force headers uppercase immediately
                data.columns = [col.strip().upper() for col in data.columns]
                self.csv_files[file_name] = data
                print(f"‚úÖ Loaded: {file_name} | Rows: {len(data):,} | Cols: {len(data.columns)}")
            except Exception as e:
                print(f"‚ùå Failed to load {file_name}: {e}")
    
    # Step 4: Fix AGE Columns
    def fix_age_columns(self):
        print("\nStep 4: Fixing AGE columns...")
        print("=" * 50)
        current_year = datetime.now().year

        for file_name, data in self.csv_files.items():
            if 'REMAINING_LEASE' in data.columns:
                # Directly compute AGE from numeric REMAINING_LEASE
                # Clean and convert REMAINING_LEASE to numeric
                data['REMAINING_LEASE'] = pd.to_numeric(data['REMAINING_LEASE'], errors='coerce')
                data['AGE'] = (99 - data['REMAINING_LEASE']).fillna(0).astype(int)
                print(f"   ‚úÖ Created AGE from REMAINING_LEASE in {file_name}")

            elif 'LEASE_COMMENCE_DATE' in data.columns:
                # Clean and convert LEASE_COMMENCE_DATE to numeric
                data['LEASE_COMMENCE_DATE'] = pd.to_numeric(data['LEASE_COMMENCE_DATE'], errors='coerce')               
                data['AGE'] = (current_year - data['LEASE_COMMENCE_DATE']).astype(int)
                print(f"   ‚úÖ Created AGE from LEASE_COMMENCE_DATE in {file_name}")

            else:
                print(f"   ‚ö†Ô∏è No lease columns found in {file_name}")
    
    # Step 6: Clean Text Columns
    def clean_text_columns(self):
        print("\nStep 6: Converting text to uppercase...")
        print("=" * 50)
        text_columns = ['TOWN', 'FLAT_TYPE', 'STREET_NAME', 'FLAT_MODEL']
        for file_name, data in self.csv_files.items():
            for col in text_columns:
                if col in data.columns:
                    data[col] = data[col].astype(str).str.upper()
                    data[col] = data[col].replace('NAN', pd.NA)
                    print(f"   ‚úÖ Converted {col} to uppercase in {file_name}")

    # Step 7: Combine All Files
    def combine_all_files(self):
        print("\nStep 7: Combining all files...")
        print("=" * 50)
        dfs = []
        for file_name, data in self.csv_files.items():
            data['SOURCE_FILE'] = file_name
            dfs.append(data)
        self.final_data = pd.concat(dfs, ignore_index=True)
     
       # Convert to int for RESALE_PRICE and FLOOR_AREA_SQM
        self.final_data['RESALE_PRICE'] = self.final_data['RESALE_PRICE'].fillna(0).astype(int)
        self.final_data['FLOOR_AREA_SQM'] = self.final_data['FLOOR_AREA_SQM'].fillna(0).astype(int)
        print(f"‚úÖ Combined successfully! Rows: {len(self.final_data):,}, Cols: {len(self.final_data.columns)}")

    # Step 8: Split MONTH Field
    def split_month_field(self):
        print("\nStep 8: Splitting MONTH field...")
        print("=" * 50)
        if self.final_data is None:
            print("‚ùå No data available")
            return
        if 'MONTH' not in self.final_data.columns:
            print("‚ùå MONTH column not found")
            return

        print("Sample MONTH values:", self.final_data['MONTH'].dropna().head(5).tolist())
        try:
            self.final_data['MONTH'] = pd.to_datetime(self.final_data['MONTH'], errors='coerce')
            self.final_data['YEAR'] = self.final_data['MONTH'].dt.year
            self.final_data['MONTH_NUM'] = self.final_data['MONTH'].dt.month
            print("‚úÖ Successfully split MONTH into YEAR and MONTH_NUM")
            print(self.final_data[['MONTH', 'YEAR', 'MONTH_NUM']].head())
        except Exception as e:
            print(f"‚ùå Error splitting MONTH field: {e}")

    # Step 9: Create Filtered Dataset
    def create_filtered_dataset(self):
        print("\nStep 9: Creating filtered dataset...")
        print("=" * 50)
        if self.final_data is None:
            print("‚ùå No data available")
            return
        exclude_cols = ['STREET_NAME', 'SOURCE_FILE', 'LEASE_COMMENCE_DATE','MONTH','REMAINING_LEASE','REMAINING_LEASE_YEARS','BLOCK']
        keep_cols = [c for c in self.final_data.columns if c not in exclude_cols]
        self.final_data = self.final_data[keep_cols].copy()
        print(f"‚úÖ Filtered dataset created. Cols left: {len(self.final_data.columns)}")

    # Step 10: Show Sample Data
    def show_sample_data(self):
        print("\nStep 10: Sample of filtered data...")
        print("=" * 50)
        if self.final_data is not None:
            print(self.final_data.head(3))
            print("\nColumns:", list(self.final_data.columns))

    # Step 11: Check for Duplicate Data
    def check_duplicate_data(self):
        print("\nStep 11: Checking for duplicate rows...")
        print("=" * 50)
        if self.final_data is None:
            print("‚ùå No data available")
            return

        # Step 1: Detect duplicates
        duplicate_rows = self.final_data[self.final_data.duplicated()]
        num_duplicates = len(duplicate_rows)

        if num_duplicates > 0:
            print(f"üîç Found {num_duplicates:,} duplicate rows.")
            print("Here are a few examples:")
            print(duplicate_rows.head(3))

            # Step 2: Drop duplicates
            self.final_data = self.final_data.drop_duplicates()
            print(f"üßπ Dropped duplicates. Remaining rows: {len(self.final_data):,}")

            # Step 3: Re-check for any remaining duplicates
            remaining_duplicates = self.final_data[self.final_data.duplicated()]
            if len(remaining_duplicates) == 0:
                print("‚úÖ No duplicate rows remain after cleanup.")
            else:
                print(f"‚ö†Ô∏è Still found {len(remaining_duplicates):,} duplicates after drop.")
                print(remaining_duplicates.head(3))
        else:
            print("‚úÖ No duplicate rows found.")


    # Step 12: Export Final Data
    def save_final_data(self, output_filename="final_resale_data.csv"):
        print("\nStep 11: Saving final dataset...")
        print("=" * 50)
        if self.final_data is not None:
            output_path = os.path.join(self.data_folder, output_filename)
            self.final_data.to_csv(output_path, index=False)
            print(f"‚úÖ Saved final dataset to {output_path}")
        else:
            print("‚ùå No data to save")
            

    # Run All Steps
    def run_complete_analysis(self):
        print("üöÄ Starting Complete CSV Analysis")
        print("=" * 70)
        self.load_all_csv_files()
        if not self.csv_files:
            return
        self.fix_age_columns()
        self.clean_text_columns()
        self.combine_all_files()
        self.split_month_field()
        self.create_filtered_dataset()
        self.show_sample_data()
        self.check_duplicate_data()
    #    self.save_final_data()  # Uncomment if you want auto-save
        print("\nüéâ Analysis Complete! ")

def main():
    analyzer = SimpleCSVAnalyzer()
    analyzer.run_complete_analysis()
    return analyzer

if __name__ == "__main__":
    analyzer = main()


In [2]:
# EDA on Singapore HDB Resale Price Dataset
# Objective: Analyze numerical and categorical features vs RESALE_PRICE
# and detect outliers with recommendations.

# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the dataset
df = pd.read_csv('C:\Users\sit\Downloads\Capstone2\Sample\hdb_data.csv')

# Display the first few rows
print("Sample Data:")
display(df.head())

# Define numerical and categorical columns
num_attribs = ["AGE", "FLOOR_AREA_SQM", "YEAR", "MONTH_NUM"]
cat_attribs = ["TOWN", "FLAT_TYPE", "STOREY_RANGE", "FLAT_MODEL"]

# Basic info and stats
print("\nData Info:")
df.info()

print("\nDescriptive statistics for numerical attributes:")
display(df[num_attribs + ["RESALE_PRICE"]].describe())

print("\nUnique values in categorical attributes:")
for col in cat_attribs:
    print(f"{col}: {df[col].nunique()} unique values")

# ======================
# 1. Scatterplots for Numerical Features vs RESALE_PRICE
# ======================
print("\nScatterplots: Numerical attributes vs RESALE_PRICE")

plt.figure(figsize=(16, 12))
for i, col in enumerate(num_attribs):
    plt.subplot(2, 2, i+1)
    sns.scatterplot(data=df.sample(5000, random_state=42), x=col, y="RESALE_PRICE", alpha=0.3)
    plt.title(f"{col} vs RESALE_PRICE")
plt.tight_layout()
plt.show()

# ======================
# 2. Boxplots for Categorical Features vs RESALE_PRICE
# ======================
print("\nBoxplots: Categorical attributes vs RESALE_PRICE")

plt.figure(figsize=(16, 12))
for i, col in enumerate(cat_attribs):
    plt.subplot(2, 2, i+1)
    sns.boxplot(data=df.sample(5000, random_state=42), x=col, y="RESALE_PRICE")
    plt.xticks(rotation=45)
    plt.title(f"{col} vs RESALE_PRICE")
plt.tight_layout()
plt.show()

# ======================
# 3. Outlier Detection
# ======================

# Use IQR method on RESALE_PRICE
Q1 = df["RESALE_PRICE"].quantile(0.25)
Q3 = df["RESALE_PRICE"].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers = df[(df["RESALE_PRICE"] < lower_bound) | (df["RESALE_PRICE"] > upper_bound)]
print(f"\nNumber of outliers detected in RESALE_PRICE: {outliers.shape[0]}")

# Display summary stats for outliers
print("\nSummary statistics for outliers:")
display(outliers.describe())

# Outlier detection for numerical features vs RESALE_PRICE (using boxplot stats)
for col in num_attribs:
    Q1_col = df[col].quantile(0.25)
    Q3_col = df[col].quantile(0.75)
    IQR_col = Q3_col - Q1_col
    lower_col = Q1_col - 1.5 * IQR_col
    upper_col = Q3_col + 1.5 * IQR_col
    col_outliers = df[(df[col] < lower_col) | (df[col] > upper_col)]
    print(f"\nOutliers in {col}: {col_outliers.shape[0]} rows")

# ======================
# Recommendations for Outliers
# ======================
print("""
Recommendations on Outliers:

1. RESALE_PRICE Outliers:
   - Some extreme resale prices could be data entry errors or very unique properties.
   - Consider:
       * Investigate and verify outlier records if possible.
       * Remove extreme outliers if they skew the model.
       * Alternatively, use robust regression models less sensitive to outliers.
       * Apply transformations (e.g., log) on RESALE_PRICE to reduce skewness.

2. Numerical Attributes Outliers:
   - For AGE and FLOOR_AREA_SQM, extreme values may be valid but rare.
   - Check if outliers correspond to valid flats (e.g., very old age or very large floor area).
   - You may cap values at reasonable percentiles (e.g., 1st and 99th percentiles) if they harm model performance.

3. Categorical Attributes:
   - Rare categories with very few data points might be grouped as 'Others' to reduce noise.

4. General:
   - Use visualizations and domain knowledge to decide on outlier treatment.
   - Consider feature scaling and transformation for numerical columns before modeling.
""")


SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (3011902535.py, line 12)