In [None]:
import os
import pandas as pd
import requests
import time
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# ------------------------- CONFIGURATION -------------------------
# Excel workbook holding the property rows; read by main().
INPUT_FILE = "train(1).xlsx"

# Directory that receives the downloaded image_<id>.jpg files.
OUTPUT_DIR = "data/house_images"

# Half the side of the requested bounding box, added to and subtracted from
# each coordinate (the request uses bboxSR 4326, so the unit is degrees).
# A smaller value gives a tighter close-up of the house.
DELTA = 0.0006

# Pixel dimensions of every requested image.
IMAGE_WIDTH = 600
IMAGE_HEIGHT = 600
# ------------------------------------------------------------------

def get_session():
    """Build a ``requests`` session whose HTTP and HTTPS traffic is retried.

    The retry policy allows up to 5 retries for connection errors, uses a
    backoff factor of 1 between attempts, and also treats the listed
    server-error / rate-limit status codes as retryable.

    Returns:
        A ``requests.Session`` with one shared retrying adapter mounted for
        both URL schemes.
    """
    retry_policy = Retry(
        connect=5,
        backoff_factor=1,
        status_forcelist=[500, 502, 503, 504, 429],
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    http = requests.Session()
    # The same adapter instance serves both schemes, as in a single mount pair.
    for prefix in ('http://', 'https://'):
        http.mount(prefix, retrying_adapter)
    return http

def get_bbox(lat, lng, delta):
    """Return a ``"min_x,min_y,max_x,max_y"`` box centred on a point.

    Args:
        lat: Latitude of the centre (the y axis of the box).
        lng: Longitude of the centre (the x axis of the box).
        delta: Distance from the centre to each edge, in the same unit as
            the coordinates.

    Returns:
        The four edges joined by commas, x (longitude) before y (latitude).
    """
    west, east = lng - delta, lng + delta
    south, north = lat - delta, lat + delta
    return f"{west},{south},{east},{north}"

def fetch_image(session, lat, long, property_id, save_dir):
    """Download the imagery tile around one property and save it as a JPEG.

    The image is written to ``<save_dir>/image_<property_id>.jpg``. A file
    that already exists and is larger than 1 KB is treated as complete and
    the call returns without touching the network, which lets an interrupted
    run be resumed.

    The body is streamed into a temporary ``.part`` file and only moved onto
    the final path once fully written, so an interrupted download can never
    leave a truncated file that the resume check would later accept.

    Args:
        session: Object with a ``requests.Session``-style ``get`` method.
        lat: Latitude of the property.
        long: Longitude of the property.
        property_id: Identifier used to build the output file name.
        save_dir: Existing directory the image is saved into.

    Returns:
        None. Success and failure are reported on stdout; request and file
        errors are caught and printed rather than raised.
    """
    file_name = f"image_{property_id}.jpg"
    file_path = os.path.join(save_dir, file_name)

    # --- RESUME LOGIC ---
    if os.path.exists(file_path):
        # If file exists and is not empty (larger than 1KB), skip it
        if os.path.getsize(file_path) > 1024:
            return
        # Otherwise it is empty/corrupted: delete it and download again.
        try:
            os.remove(file_path)
        except OSError:
            # Best effort only; a successful download overwrites it anyway.
            pass
    # --------------------

    bbox = get_bbox(lat, long, DELTA)
    base_url = "https://services.arcgisonline.com/arcgis/rest/services/World_Imagery/MapServer/export"
    params = {
        "bbox": bbox,
        "bboxSR": "4326",
        "size": f"{IMAGE_WIDTH},{IMAGE_HEIGHT}",
        "f": "image",
        "format": "jpg"
    }
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    tmp_path = file_path + ".part"

    try:
        # The context manager closes the streamed response on every path,
        # so the connection is always handed back to the session's pool.
        with session.get(base_url, params=params, headers=headers,
                         stream=True, timeout=30) as response:
            status = response.status_code
            if status == 200:
                with open(tmp_path, 'wb') as f:
                    for chunk in response.iter_content(8192):
                        f.write(chunk)
                # Publish the file only after the whole body was written.
                os.replace(tmp_path, file_path)
    except Exception as e:
        print(f"[FAIL] ID {property_id}: {e}")
        return
    finally:
        # After a successful os.replace the temp file no longer exists;
        # in every other case remove whatever partial data was left behind.
        try:
            os.remove(tmp_path)
        except OSError:
            pass

    if status == 200:
        print(f"[SUCCESS] Downloaded: {file_name}")
    elif status == 403:
        # The current image is not retried within this call; because no file
        # was saved, a later run will request it again.
        print("[PAUSE] Server blocked us. Waiting 60 seconds...")
        time.sleep(60)
    else:
        print(f"[ERROR] ID {property_id}: HTTP {status}")

def main():
    """Load the training spreadsheet and download one image per row.

    Steps:
      1. Read ``INPUT_FILE`` and stop early (with a printed message) if the
         file is missing, unreadable, has no ``price`` column, or has no
         recognisable latitude/longitude columns.
      2. Create ``OUTPUT_DIR`` if needed and call ``fetch_image`` for every
         row that has both coordinates, pausing 0.5 s between rows.

    Coordinate columns are matched as ``lat``/``Lat``/``Latitude`` and
    ``long``/``Long``/``Longitude`` (first match wins). The ``id`` column is
    used for file names when present, otherwise the row's index label.

    Returns:
        None. Progress and errors are printed to stdout.
    """
    # --- STEP 1: LOAD & VERIFY ---
    print(f"--- 1. Loading {INPUT_FILE} ---")
    if not os.path.exists(INPUT_FILE):
        print(f"‚ùå ERROR: Could not find '{INPUT_FILE}'. Is it in this folder?")
        return

    try:
        df = pd.read_excel(INPUT_FILE, engine='openpyxl')
        print(f"‚úÖ Loaded {len(df)} rows.")
    except Exception as e:
        print(f"‚ùå ERROR Reading Excel: {e}")
        return

    # CHECK FOR PRICE
    if 'price' in df.columns:
        print("‚úÖ GREAT NEWS: 'price' column found! This is the correct Training Set.")
    else:
        print("‚ö†Ô∏è WARNING: No 'price' column found. Checking columns...")
        print(df.columns.tolist())
        print("üõë STOPPING: You cannot train without a price column.")
        return

    # CHECK FOR COORDINATES
    # Handle different capitalization (Lat vs lat). Resolving the columns once
    # up front means a sheet without coordinates is reported instead of every
    # row being skipped silently.
    lat_col = next((c for c in ('lat', 'Lat', 'Latitude') if c in df.columns), None)
    long_col = next((c for c in ('long', 'Long', 'Longitude') if c in df.columns), None)
    if lat_col is None or long_col is None:
        print("ERROR: No latitude/longitude columns found. Checking columns...")
        print(df.columns.tolist())
        return

    # --- STEP 2: START DOWNLOAD ---
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)
        print(f"Created folder: {OUTPUT_DIR}")

    session = get_session()
    print("\n--- 2. Starting Download (This will take time) ---")

    # Iterate the columns directly rather than with iterrows(): iterrows()
    # builds one Series per row and may upcast an integer id to float, which
    # would turn the file name into e.g. "image_123.0.jpg".
    ids = df['id'] if 'id' in df.columns else df.index
    for prop_id, lat, long in zip(ids, df[lat_col], df[long_col]):
        try:
            if pd.isna(lat) or pd.isna(long):
                continue

            fetch_image(session, lat, long, prop_id, OUTPUT_DIR)
            time.sleep(0.5) # Polite delay

        except KeyboardInterrupt:
            print("\nstopped by user.")
            break
        except Exception as e:
            # Keep going: one bad row should not abort the whole run.
            print(f"Error: {e}")

# Start the download only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()

--- 1. Loading train(1).xlsx ---
‚úÖ Loaded 16209 rows.
‚úÖ GREAT NEWS: 'price' column found! This is the correct Training Set.
Created folder: data/house_images

--- 2. Starting Download (This will take time) ---
[SUCCESS] Downloaded: image_9117000170.jpg
[SUCCESS] Downloaded: image_6700390210.jpg
[SUCCESS] Downloaded: image_7212660540.jpg
[SUCCESS] Downloaded: image_8562780200.jpg
[SUCCESS] Downloaded: image_7760400350.jpg
[SUCCESS] Downloaded: image_464001025.jpg
[SUCCESS] Downloaded: image_3432500486.jpg
[SUCCESS] Downloaded: image_1126059095.jpg
[SUCCESS] Downloaded: image_3876500290.jpg
[SUCCESS] Downloaded: image_1865400075.jpg
[SUCCESS] Downloaded: image_2558690150.jpg
[SUCCESS] Downloaded: image_7154200070.jpg
[SUCCESS] Downloaded: image_4139490210.jpg
[SUCCESS] Downloaded: image_9264901040.jpg
[SUCCESS] Downloaded: image_4273000095.jpg
[SUCCESS] Downloaded: image_8691410310.jpg
[SUCCESS] Downloaded: image_6601200020.jpg
[SUCCESS] Downloaded: image_8663260030.jpg
[SUCCESS] Dow