In [1]:
import os
import time
import requests
import pandas as pd
import mercantile

from PIL import Image
from io import BytesIO
from tqdm import tqdm


In [5]:
# Paths and parameters used by the image-extraction cells.
# NOTE(review): these are absolute, machine-specific paths; adjust them before
# running the notebook on another machine.

CSV_PATH = r"C:\Users\dhanu\OneDrive\Desktop\cdc project\train_1_tab.csv"
CSV_PATH_TEST = r"C:\Users\dhanu\OneDrive\Desktop\cdc project\test_2_tab.csv"
TRAIN_OUTPUT_DIR = r"C:\Users\dhanu\OneDrive\Desktop\cdc project\images\train"
TEST_OUTPUT_DIR = r"C:\Users\dhanu\OneDrive\Desktop\cdc project\images\test"
ZOOM = 18                      # XYZ zoom level passed to mercantile.tile
IMG_SIZE = (224, 224)          # (width, height) each tile is resized to
SLEEP = 0.1                    # pause in seconds between consecutive rows

# Esri World Imagery tile server
TILE_URL = (
    "https://services.arcgisonline.com/ArcGIS/rest/services/"
    "World_Imagery/MapServer/tile/{z}/{y}/{x}"
)

# Create BOTH output folders up front: saving an image into a folder that
# does not exist fails, and the test folder was previously never created.
os.makedirs(TRAIN_OUTPUT_DIR, exist_ok=True)
os.makedirs(TEST_OUTPUT_DIR, exist_ok=True)



In [3]:
# helper function for downloading the images from the server

def download_image(lat, lon):
    """
    Fetch the single imagery tile that contains the given coordinate.

    Parameters
    ----------
    lat : latitude of the point.
    lon : longitude of the point.

    Returns
    -------
    A PIL image in RGB mode, resized to IMG_SIZE.

    Raises
    ------
    Whatever `requests` raises on a network failure or timeout (10 s), and
    the error from `raise_for_status()` when the server answers with an
    HTTP error status.
    """

    # mercantile takes longitude first, then latitude
    xyz = mercantile.tile(lon, lat, ZOOM)
    tile_url = TILE_URL.format(z=xyz.z, x=xyz.x, y=xyz.y)

    resp = requests.get(tile_url, timeout=10)
    resp.raise_for_status()

    # Decode the returned bytes, force 3-channel RGB, then shrink/stretch to
    # the size the CNN expects
    raw_bytes = BytesIO(resp.content)
    return Image.open(raw_bytes).convert("RGB").resize(IMG_SIZE)

In [6]:
# Load the train and test tables from their CSV files (default parsing options)
df_train, df_test = (pd.read_csv(csv_file) for csv_file in (CSV_PATH, CSV_PATH_TEST))

In [43]:
def main():
    """
    Download one satellite tile per row of `df_train` into TRAIN_OUTPUT_DIR.

    Each image is saved as "<id>.png". Rows whose file already exists are
    skipped, so the cell can be re-run to resume an interrupted download.
    When a download fails, the error is printed and a black placeholder image
    is saved instead; the number of such failures is reported at the end.

    Side effect: the "id" column of `df_train` is converted to str in place.
    """

    # convert int dtype to str dtype to save the image with its "id" as filename
    df_train["id"] = df_train["id"].astype(str)

    # make sure the destination exists even if the config cell was not re-run
    os.makedirs(TRAIN_OUTPUT_DIR, exist_ok=True)

    total = len(df_train)
    print(f"... Download begin for {total} rows ...")

    failed = 0

    for _, row in tqdm(df_train.iterrows(), total=total):

        # for each row, collect the id, lat and long values
        house_id = row["id"]
        lat = row["lat"]
        lon = row["long"]

        # path the image is saved at
        image_path = os.path.join(TRAIN_OUTPUT_DIR, f"{house_id}.png")

        # if already downloaded, ignore
        if os.path.exists(image_path):
            continue

        try:
            # download the image
            img = download_image(lat, lon)

        except Exception as e:

            print(f"[ERROR] Download failed for {house_id}: {e}")
            failed += 1
            # Mitigate : black image
            # NOTE: the placeholder is written to image_path, so a later run
            # skips this id; delete the file to retry the download.
            img = Image.new("RGB", IMG_SIZE, (0, 0, 0))

        # Write to a temporary name and rename afterwards, so an interrupted
        # run never leaves a truncated PNG that the exists-check would skip.
        tmp_path = image_path + ".part"
        img.save(tmp_path, format="PNG")
        os.replace(tmp_path, image_path)

        time.sleep(SLEEP)

    if failed:
        print(f"Finished with {failed} failed download(s) saved as black placeholder images.")
    else:
        print("All images downloaded successfully!")


if __name__ == "__main__":
    main()

... Download begin for 16209 rows ...


100%|██████████| 16209/16209 [2:02:48<00:00,  2.20it/s]  

All images downloaded successfully!





In [16]:
def main():
    """
    Download one satellite tile per row of `df_test` into TEST_OUTPUT_DIR.

    Each image is saved as "<id>.png". Rows whose file already exists are
    skipped, so the cell can be re-run to resume an interrupted download.
    When a download fails, the error is printed and a black placeholder image
    is saved instead; the number of such failures is reported at the end.

    Side effect: the "id" column of `df_test` is converted to str in place.
    """

    # convert int dtype to str dtype to save the image with its "id" as filename
    df_test["id"] = df_test["id"].astype(str)

    # the test folder is not created anywhere else, so create it here;
    # otherwise the first save fails on a fresh machine
    os.makedirs(TEST_OUTPUT_DIR, exist_ok=True)

    total = len(df_test)
    print(f"... Download begin for {total} rows ...")

    failed = 0

    for _, row in tqdm(df_test.iterrows(), total=total):

        # for each row, collect the id, lat and long values
        house_id = row["id"]
        lat = row["lat"]
        lon = row["long"]

        # path the image is saved at
        image_path = os.path.join(TEST_OUTPUT_DIR, f"{house_id}.png")

        # if already downloaded, ignore
        if os.path.exists(image_path):
            continue

        try:
            # download the image
            img = download_image(lat, lon)

        except Exception as e:

            print(f"[ERROR] Download failed for {house_id}: {e}")
            failed += 1
            # Mitigate : black image
            # NOTE: the placeholder is written to image_path, so a later run
            # skips this id; delete the file to retry the download.
            img = Image.new("RGB", IMG_SIZE, (0, 0, 0))

        # Write to a temporary name and rename afterwards, so an interrupted
        # run never leaves a truncated PNG that the exists-check would skip.
        tmp_path = image_path + ".part"
        img.save(tmp_path, format="PNG")
        os.replace(tmp_path, image_path)

        time.sleep(SLEEP)

    if failed:
        print(f"Finished with {failed} failed download(s) saved as black placeholder images.")
    else:
        print("All images downloaded successfully!")


if __name__ == "__main__":
    main()

... Download begin for 5404 rows ...


100%|██████████| 5404/5404 [37:28<00:00,  2.40it/s]  

All images downloaded successfully!



