In [None]:
import os
import pandas as pd
import requests
import time
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# ================= CONFIGURATION =================
# Spreadsheet that lists the properties, and the folder that receives the images.
INPUT_FILE = "train(1).xlsx"
OUTPUT_DIR = "data/house_images"

# Margin subtracted from / added to each coordinate to form the bounding box
# (controls the zoom), and the image size sent as the `size` request parameter.
DELTA = 0.0006
IMAGE_WIDTH, IMAGE_HEIGHT = 600, 600
# ===============================================

def get_session():
    """Return a requests session that retries failed requests.

    A single adapter carrying the retry policy (5 connect retries, backoff
    factor 1, forced retry on 429/500/502/503/504) is mounted for both the
    http:// and https:// prefixes.
    """
    retry_policy = Retry(
        connect=5,
        backoff_factor=1,
        status_forcelist=[500, 502, 503, 504, 429],
    )
    shared_adapter = HTTPAdapter(max_retries=retry_policy)

    http = requests.Session()
    for prefix in ('http://', 'https://'):
        http.mount(prefix, shared_adapter)
    return http

def get_bbox(lat, lng, delta):
    """Return the box around (lat, lng) as the string "min_x,min_y,max_x,max_y".

    x values come from `lng` and y values from `lat`; each side lies `delta`
    away from the centre point.
    """
    west, east = lng - delta, lng + delta
    south, north = lat - delta, lat + delta
    return ",".join(f"{edge}" for edge in (west, south, east, north))

def fetch_image(session, lat, long, property_id, save_dir):
    """Download the image for one property unless it is already on disk.

    The image is saved as ``image_<property_id>.jpg`` inside `save_dir`.
    The body is streamed into a ``.part`` file first and only renamed to the
    final name once fully written, so an interrupted download can never be
    mistaken for a finished image by the resume check on a later run.

    Args:
        session: Object providing ``get(url, params=..., headers=...,
            stream=..., timeout=...)``, e.g. a ``requests.Session``.
        lat: Latitude of the property.
        long: Longitude of the property.
        property_id: Identifier used to build the file name.
        save_dir: Existing directory that receives the image.

    Returns:
        "SKIPPED" if a file larger than 1024 bytes already exists,
        "DOWNLOADED" if a new image was written, otherwise "ERROR".
    """
    file_name = f"image_{property_id}.jpg"
    file_path = os.path.join(save_dir, file_name)

    # --- RESUME LOGIC ---
    if os.path.exists(file_path):
        if os.path.getsize(file_path) > 1024:
            return "SKIPPED"
        # Tiny files are treated as broken leftovers and fetched again.
        try:
            os.remove(file_path)
        except OSError:
            pass  # Best effort: os.replace below overwrites it anyway.

    bbox = get_bbox(lat, long, DELTA)
    base_url = "https://services.arcgisonline.com/arcgis/rest/services/World_Imagery/MapServer/export"
    params = {
        "bbox": bbox,
        "bboxSR": "4326",
        "size": f"{IMAGE_WIDTH},{IMAGE_HEIGHT}",
        "f": "image",
        "format": "jpg"
    }

    temp_path = file_path + ".part"
    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        # The context manager releases the streamed connection on every path.
        with session.get(base_url, params=params, headers=headers, stream=True, timeout=10) as response:
            if response.status_code == 200:
                with open(temp_path, 'wb') as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)
                # Atomic rename: the final name only ever holds a complete file.
                os.replace(temp_path, file_path)
                return "DOWNLOADED"
            blocked = response.status_code == 403

        # Back off only after the response has been closed.
        if blocked:
            print(f"‚ö†Ô∏è Server blocked us. Waiting 60 seconds...")
            time.sleep(60)
        return "ERROR"

    except Exception as e:
        # Deliberate catch-all boundary: one bad download must not stop the batch.
        print(f"‚ùå Connection Error: {e}")
        return "ERROR"
    finally:
        # Drop a partial download (also runs on Ctrl-C); after a successful
        # rename the temp file no longer exists and this is a no-op.
        try:
            os.remove(temp_path)
        except OSError:
            pass

def main():
    """Download an image for every row of INPUT_FILE into OUTPUT_DIR.

    Reads the spreadsheet, creates the output folder if needed, then calls
    `fetch_image` for each row that has both 'lat' and 'long'. Progress is
    printed periodically and a summary is printed at the end. Ctrl-C stops
    the loop early but still prints the summary.
    """
    print(f"--- üöÄ Starting Smart Resume ---")

    # 1. Load Data
    if not os.path.exists(INPUT_FILE):
        print(f"‚ùå ERROR: {INPUT_FILE} not found.")
        return

    df = pd.read_excel(INPUT_FILE, engine='openpyxl')
    total_images = len(df)
    print(f"üìÑ Found {total_images} houses in the list.")

    # 2. Check Existing Folder
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Quick scan of what we have (counts every entry in the folder).
    existing = len(os.listdir(OUTPUT_DIR))
    # Clamp so a folder holding more files than rows never prints a negative.
    remaining = max(total_images - existing, 0)
    print(f"üìÇ You already have {existing} images.")
    print(f"‚¨áÔ∏è Need to download about {remaining} more...")
    print("-" * 30)

    session = get_session()

    # Counters
    downloaded = 0
    skipped = 0
    errors = 0

    # 3. Start Loop
    for index, row in df.iterrows():
        try:
            lat = row.get('lat')
            long = row.get('long')
            prop_id = row.get('id')

            # Rows without coordinates cannot be fetched; they are passed over.
            if pd.isna(lat) or pd.isna(long):
                continue

            status = fetch_image(session, lat, long, prop_id, OUTPUT_DIR)

            if status == "SKIPPED":
                skipped += 1
                if skipped % 1000 == 0:
                    print(f"‚è© Skipped {skipped} existing files...")
            elif status == "DOWNLOADED":
                downloaded += 1
                if downloaded % 50 == 0:
                    print(f"‚úÖ Downloaded {downloaded} new images... (Total files: {existing + downloaded})")
                time.sleep(0.2)  # Small delay to be polite
            else:
                errors += 1

        except KeyboardInterrupt:
            print("\nüõë Stopped by user.")
            break
        except Exception as e:
            # Keep the batch running, but make the failure visible and counted
            # instead of silently dropping the row.
            errors += 1
            print(f"Row {index} failed: {e}")

    print("\n" + "="*30)
    print("FINISHED!")
    print(f"New Downloads: {downloaded}")
    print(f"Skipped:       {skipped}")
    print(f"Errors:        {errors}")
    print("="*30)

# Run the download only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

--- üöÄ Starting Smart Resume ---
üìÑ Found 16209 houses in the list.
üìÇ You already have 12589 images.
‚¨áÔ∏è Need to download about 3620 more...
------------------------------
‚è© Skipped 1000 existing files...
‚è© Skipped 2000 existing files...
‚è© Skipped 3000 existing files...
‚è© Skipped 4000 existing files...
‚è© Skipped 5000 existing files...
‚è© Skipped 6000 existing files...
‚è© Skipped 7000 existing files...
‚è© Skipped 8000 existing files...
‚è© Skipped 9000 existing files...
‚è© Skipped 10000 existing files...
‚è© Skipped 11000 existing files...
‚è© Skipped 12000 existing files...
