## Download PBFs

In [0]:
%run "./download_pbf" $region=Malta $pbfPath=/Volumes/timo/geospatial/osm/malta-latest.osm.pbf

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.15 µs


  warn('Could not open file <%s> for safe execution.' % fname)


Fetching Geofabrik dataset index...


Downloading OSM data for region 'malta' from https://download.geofabrik.de/europe/malta-latest.osm.pbf...
Download complete! File saved at: /Volumes/timo/geospatial/osm/malta-latest.osm.pbf


In [0]:
path_malta = "/Volumes/timo/geospatial/osm/malta-latest.osm.pbf"
filesize_mb = round(dbutils.fs.ls(path_malta)[0].size / 1048 ** 2, 1)
print(F"File size: {filesize_mb} MB")

File size: 5.8 MB


In [0]:
%run "./download_pbf" $region=France $pbfPath=/Volumes/timo/geospatial/osm/france-latest.osm.pbf

Fetching Geofabrik dataset index...


In [0]:
path_france = "/Volumes/timo/geospatial/osm/france-latest.osm.pbf"
filesize_mb = round(dbutils.fs.ls(path_france)[0].size / 1048 ** 2, 1)
print(F"File size: {filesize_mb} MB")

Downloading OSM data for region 'france' from https://download.geofabrik.de/europe/france-latest.osm.pbf...


In [0]:
%run "./download_pbf" $region=north-america $pbfPath=/Volumes/timo/geospatial/osm/north-america-latest.osm.pbf

Fetching Geofabrik dataset index...


Downloading OSM data for region 'north-america' from https://download.geofabrik.de/north-america-latest.osm.pbf...


In [0]:
path_north_america = "/Volumes/timo/geospatial/osm/north-america-latest.osm.pbf"
filesize_mb = round(dbutils.fs.ls(path_north_america)[0].size / 1048 ** 2, 1)
print(F"File size: {filesize_mb} MB")

## Run Benchmark

In [0]:
%run "./load_datasource"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
import time
import pandas as pd

# Create a global list to store logs
execution_logs = []

def measure_execution_time(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        execution_time = end_time - start_time

        # Determine the human-readable format of execution time
        if execution_time < 60:
            readable_time = f"{execution_time:.2f} seconds"
        elif execution_time < 3600:
            readable_time = f"{execution_time / 60:.2f} minutes"
        else:
            readable_time = f"{execution_time / 3600:.2f} hours"

        # Log the execution time
        print(f"Execution time: {readable_time}")

        # Log details in the global execution_logs list
        execution_logs.append({
            "Path": args[0] if len(args) > 0 else kwargs.get("pbfPath", "Unknown"),
            "Table Name": args[1] if len(args) > 1 else kwargs.get("tableName", "Unknown"),
            "Execution Time": readable_time
        })

        return result
    return wrapper
  
@measure_execution_time
def load_and_save_geospatial_data(pbfPath, tableName):
    (
        spark.read.format("pbf")
        .option("path", pbfPath)
        .option("geometryType", "WKT")
        .option("emptyTagFilter", True)
        .option("keyFilter", "building")
        .load()
        .write
        .mode("overwrite")
        .saveAsTable(tableName)
    )

In [0]:
load_and_save_geospatial_data("/Volumes/timo/geospatial/osm/andorra-latest.osm.pbf", 'timo.geospatial.buildings_andorra')

Execution time: 12.07 seconds


In [0]:
load_and_save_geospatial_data(path_malta, 'timo.geospatial.buildings_malta')

Execution time: 5.97 seconds


In [0]:
load_and_save_geospatial_data(path_france, 'timo.geospatial.buildings_france')

In [0]:
load_and_save_geospatial_data(path_north_america, 'timo.geospatial.buildings_north_america')

# Results

In [0]:
# Convert logs to a pandas DataFrame
execution_logs_df = pd.DataFrame(execution_logs)

# Display the DataFrame
display(execution_logs_df)

Path,Table Name,Execution Time
/Volumes/timo/geospatial/osm/andorra-latest.osm.pbf,timo.geospatial.buildings_andorra,12.07 seconds
/Volumes/timo/geospatial/osm/malta-latest.osm.pbf,timo.geospatial.buildings_malta,5.97 seconds


| Path                                                      | Table Name                              | Execution Time |
|-----------------------------------------------------------|-----------------------------------------|----------------|
| /Volumes/timo/geospatial/osm/andorra-latest.osm.pbf       | timo.geospatial.buildings_andorra       | 25.45 seconds  |
| /Volumes/timo/geospatial/osm/malta-latest.osm.pbf         | timo.geospatial.buildings_malta         | 6.95 seconds   |
| /Volumes/timo/geospatial/osm/france-latest.osm.pbf        | timo.geospatial.buildings_france        | 42.39 minutes  |
| /Volumes/timo/geospatial/osm/north-america-latest.osm.pbf | timo.geospatial.buildings_north_america | 1.25 hours     |