In [1]:
import pandas as pd
import os

# ================= CONFIGURATION =================
# Source spreadsheet; main() reads it with pandas and uses its 'id' and
# 'price' columns.
RAW_EXCEL = "train(1).xlsx"
# Folder whose file names are compared against "image_<id>.jpg".
IMAGE_FOLDER = "data/house_images"
# Destination CSV for the filtered rows; an existing file is replaced.
OUTPUT_CSV = "cleaned_dataset.csv" # Overwriting the old small one
# =================================================

def _image_filename(house_id):
    """Return the image file name expected for *house_id*."""
    # pandas yields floats for integer ids whenever the column contains NaN
    # (or a row is upcast), so 123.0 must still map to "image_123.jpg".
    if isinstance(house_id, float) and house_id.is_integer():
        house_id = int(house_id)
    return f"image_{house_id}.jpg"


def main():
    """Rebuild the dataset CSV from the Excel rows that have an image.

    Reads ``RAW_EXCEL``, keeps the rows that have both an ``id`` and a
    ``price`` and whose ``image_<id>.jpg`` is present in ``IMAGE_FOLDER``,
    writes them to ``OUTPUT_CSV`` (replacing any existing file) and prints a
    summary.

    Returns:
        None. If the Excel file, the image folder, or a required column is
        missing, an error is printed and nothing is written.
    """
    print("--- üîÑ UPDATING DATASET LIST ---")

    # 1. Load the big Excel file
    if not os.path.exists(RAW_EXCEL):
        print(f"‚ùå Error: {RAW_EXCEL} not found.")
        return

    print(f"Reading {RAW_EXCEL}...")
    df = pd.read_excel(RAW_EXCEL, engine='openpyxl')
    print(f"Found {len(df)} total rows in Excel.")

    # Without these columns every row would be skipped and the output file
    # would be silently replaced by an empty one.
    missing_columns = [col for col in ('id', 'price') if col not in df.columns]
    if missing_columns:
        print(f"Error: {RAW_EXCEL} has no column(s): {', '.join(missing_columns)}.")
        return

    # 2. Filter: Keep only houses where we have an image
    print("Checking image folder...")
    if not os.path.isdir(IMAGE_FOLDER):
        print(f"Error: image folder {IMAGE_FOLDER} not found.")
        return
    existing_files = set(os.listdir(IMAGE_FOLDER))

    # We need both an ID and a Price
    candidates = df[df['id'].notna() & df['price'].notna()]

    # Selecting with a boolean mask (instead of collecting iterrows() rows)
    # keeps the original column dtypes and the header even when nothing matches.
    has_image = candidates['id'].map(
        lambda house_id: _image_filename(house_id) in existing_files
    ).astype(bool)
    new_df = candidates[has_image]
    missing_count = len(candidates) - len(new_df)

    # 3. Save the new list
    new_df.to_csv(OUTPUT_CSV, index=False)

    print("\n" + "="*30)
    print("‚úÖ SUCCESS! New dataset created.")
    print("üìÑ Old Dataset Size:  ~2,000")
    print(f"üöÄ New Dataset Size:  {len(new_df)}")
    print(f"üóëÔ∏è Missing Images:    {missing_count}")
    print(f"Saved to: {OUTPUT_CSV}")
    print("="*30)
    print("üëâ NOW run 'python train_ai_v2.py'")

# Run the dataset rebuild only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

--- üîÑ UPDATING DATASET LIST ---
Reading train(1).xlsx...
Found 16209 total rows in Excel.
Checking image folder...

‚úÖ SUCCESS! New dataset created.
üìÑ Old Dataset Size:  ~2,000
üöÄ New Dataset Size:  12892
üóëÔ∏è Missing Images:    3317
Saved to: cleaned_dataset.csv
üëâ NOW run 'python train_ai_v2.py'
