In [1]:
import os
import pandas as pd

# ================= CONFIGURATION =================
# Paths are relative to the working directory the script is run from.
# main() reads INPUT_FILE, lists IMAGE_DIR, and writes OUTPUT_FILE.
INPUT_FILE = "train(1).xlsx"       # Excel workbook holding the original table (needs an 'id' column)
IMAGE_DIR = "data/house_images"    # Folder of images, matched by name as image_<id>.jpg
OUTPUT_FILE = "cleaned_dataset.csv" # CSV written with only the rows that have an image
# =================================================

def _image_filename(prop_id):
    """Return the image filename expected for a property id.

    Integer-valued floats (e.g. ``123.0``, which pandas produces when an
    integer column is upcast to float) are rendered without the trailing
    ``.0`` so they still match files named ``image_123.jpg``.
    """
    if isinstance(prop_id, float) and prop_id.is_integer():
        prop_id = int(prop_id)
    return f"image_{prop_id}.jpg"


def main():
    """Filter the Excel dataset down to rows that have a matching image.

    Reads ``INPUT_FILE``, lists the files in ``IMAGE_DIR``, keeps every row
    whose ``id`` maps to an existing ``image_<id>.jpg`` file, and writes the
    result to ``OUTPUT_FILE`` as CSV (without the index).

    Raises:
        KeyError: if the workbook has no ``id`` column. Without this check
            every row would silently fail to match and an empty dataset
            would be written.
    """
    print(f"--- Loading Data from {INPUT_FILE} ---")
    df = pd.read_excel(INPUT_FILE, engine='openpyxl')
    original_count = len(df)
    print(f"Original Rows: {original_count}")

    if 'id' not in df.columns:
        raise KeyError(
            f"Column 'id' not found in {INPUT_FILE}; "
            f"available columns: {list(df.columns)}"
        )

    print(f"\n--- Scanning Image Folder ({IMAGE_DIR}) ---")
    # A set gives O(1) membership tests when matching rows below.
    existing_images = set(os.listdir(IMAGE_DIR))
    print(f"Found {len(existing_images)} images.")

    # Build the expected filename from the 'id' column directly instead of
    # using a row-wise apply: with axis=1 each row is coerced to a single
    # dtype, which can turn integer ids into floats ("image_123.0.jpg").
    print("\n--- Matching Data to Images ---")
    expected_files = df['id'].map(_image_filename)
    has_image = expected_files.isin(existing_images)
    clean_df = df[has_image].copy()

    # Save
    clean_df.to_csv(OUTPUT_FILE, index=False)

    # NOTE: the final row count can exceed the image count when several
    # rows share the same id (they all map to the same image file).
    print("-" * 30)
    print("Summary:")
    print(f"Original Rows: {original_count}")
    print(f"Valid Images:  {len(existing_images)}")
    print(f"Final Dataset: {len(clean_df)} rows")
    print("-" * 30)
    print(f"✅ Success! Saved clean data to '{OUTPUT_FILE}'")
    print("You are ready to train!")

# Run the cleaning step only when executed as a script (or notebook cell),
# not when this module is imported.
if __name__ == "__main__":
    main()

--- Loading Data from train(1).xlsx ---
Original Rows: 16209

--- Scanning Image Folder (data/house_images) ---
Found 2027 images.

--- Matching Data to Images ---
------------------------------
Summary:
Original Rows: 16209
Valid Images:  2027
Final Dataset: 2057 rows
------------------------------
✅ Success! Saved clean data to 'cleaned_dataset.csv'
You are ready to train!
