In [96]:
import pyarrow.dataset as ds
import pyarrow.compute as pc
import pyarrow as pa
import polars as pl
import pyarrow.fs as fs

In [97]:
# Bucket/prefix of the Overture Maps release being read (places theme, type=place).
s3_path = "overturemaps-us-west-2/release/2025-02-19.0/theme=places/type=place/"

# Anonymous (unsigned) S3 access against the us-west-2 region.
overture_fs = fs.S3FileSystem(region="us-west-2", anonymous=True)

# Opening the dataset does not load any rows yet; they are read when it is scanned.
dataset = ds.dataset(s3_path, filesystem=overture_fs)

In [98]:
## Area of interest, ordered as (min x, min y, max x, max y)
bbox = (-2.1099092231, 52.3556570611, -1.6529557739, 52.5474833593)
min_x, min_y, max_x, max_y = bbox

# A row is kept when its own bbox overlaps the area of interest on both axes:
# it must start before the area ends and end after the area starts.
bbox_filter = (
    (pc.field("bbox", "xmin") < max_x)
    & (pc.field("bbox", "xmax") > min_x)
    & (pc.field("bbox", "ymin") < max_y)
    & (pc.field("bbox", "ymax") > min_y)
)

In [99]:
# Scan the dataset, applying the bounding-box predicate while reading.
batches = dataset.to_batches(filter=bbox_filter)

## Keep only the record batches that still contain rows after filtering
non_empty_batches = list(filter(lambda record_batch: record_batch.num_rows > 0, batches))

In [100]:
# Assemble the filtered batches into a single PyArrow Table.
# The schema is passed explicitly so that an empty batch list (no place falls
# inside the bounding box) produces an empty table instead of raising.
table = pa.Table.from_batches(non_empty_batches, schema=dataset.schema)

In [101]:
## Polars cannot ingest Arrow MAP columns, so every map is rewritten as a
## list of {key, value} structs before the table is handed to Polars.
def _map_free_type(data_type: pa.DataType) -> pa.DataType:
    """Return `data_type` with every Map, at any nesting depth, replaced by List[Struct{key, value}]."""
    if pa.types.is_map(data_type):
        # Recurse into the key/item types as well: a map whose values hold
        # another map would otherwise still contain a Map type.
        return pa.list_(
            pa.struct([
                pa.field("key", _map_free_type(data_type.key_type)),
                pa.field("value", _map_free_type(data_type.item_type)),
            ])
        )
    if pa.types.is_struct(data_type):
        # with_type keeps each child's name, nullability and metadata.
        return pa.struct([
            child.with_type(_map_free_type(child.type)) for child in data_type
        ])
    if pa.types.is_large_list(data_type):
        return pa.large_list(_map_free_type(data_type.value_type))
    if pa.types.is_fixed_size_list(data_type):
        return pa.list_(_map_free_type(data_type.value_type), data_type.list_size)
    if pa.types.is_list(data_type):
        return pa.list_(_map_free_type(data_type.value_type))
    # Any other type cannot contain a map and is returned unchanged.
    return data_type


def convert_map_to_list_of_structs(table: pa.Table) -> pa.Table:
    """Convert all Map types in the table to List[Struct].

    Maps are located recursively inside structs, lists (including large and
    fixed-size lists) and other maps.

    Args:
        table: The Arrow table whose columns may contain Map types.

    Returns:
        A table with the same column names in the same order, cast to a schema
        in which each Map has become a list of ``{key, value}`` structs.
        Field nullability/metadata and schema-level metadata are carried over.
    """
    new_fields = [
        field.with_type(_map_free_type(field.type)) for field in table.schema
    ]
    # pa.schema() on its own would drop the schema-level key/value metadata.
    target_schema = pa.schema(new_fields, metadata=table.schema.metadata)
    return table.cast(target_schema)

In [102]:
# Rewrite every Map type in the filtered table as List[Struct{key, value}];
# the result is what gets loaded into Polars.
converted_table = convert_map_to_list_of_structs(table)

In [104]:
# Load the Map-free Arrow table into a Polars DataFrame.
bhx_places_pl = pl.from_arrow(converted_table)

In [58]:
# Demonstration: passing the original table (which still has Map columns)
# straight to Polars fails. The error is caught and displayed so the notebook
# still runs top to bottom, and `bhx_places_pl` is deliberately not reassigned.
try:
    pl.from_arrow(table)
except (Exception, pl.exceptions.PanicException) as exc:
    # PanicException is listed explicitly because it may not derive from Exception.
    print(f"{type(exc).__name__}: {exc}")

PanicException: Arrow datatype Map(Field { name: "common", dtype: Struct([Field { name: "key", dtype: Utf8, is_nullable: false, metadata: None }, Field { name: "value", dtype: Utf8, is_nullable: true, metadata: None }]), is_nullable: false, metadata: None }, false) not supported by Polars. You probably need to activate that data-type feature.

In [88]:
# Show the Polars schema of the converted frame (Map columns now appear as List(Struct)).
print(bhx_places_pl.schema)

Schema({'id': String, 'geometry': Binary, 'bbox': Struct({'xmin': Float32, 'xmax': Float32, 'ymin': Float32, 'ymax': Float32}), 'version': Int32, 'sources': List(Struct({'property': String, 'dataset': String, 'record_id': String, 'update_time': String, 'confidence': Float64})), 'names': Struct({'primary': String, 'common': List(Struct({'key': String, 'value': String})), 'rules': List(Struct({'variant': String, 'language': String, 'value': String, 'between': List(Float64), 'side': String}))}), 'categories': Struct({'primary': String, 'alternate': List(String)}), 'confidence': Float64, 'websites': List(String), 'socials': List(String), 'emails': List(String), 'phones': List(String), 'brand': Struct({'wikidata': String, 'names': Struct({'primary': String, 'common': List(Struct({'key': String, 'value': String})), 'rules': List(Struct({'variant': String, 'language': String, 'value': String, 'between': List(Float64), 'side': String}))})}), 'addresses': List(Struct({'freeform': String, 'local

In [86]:
# Preview the first rows of the places frame.
bhx_places_pl.head()

id,geometry,bbox,version,sources,names,categories,confidence,websites,socials,emails,phones,brand,addresses
str,binary,struct[4],i32,list[struct[5]],struct[3],struct[2],f64,list[str],list[str],list[str],list[str],struct[2],list[struct[5]]
"""08f195c7638149610316504a155408…","b""\x00\x00\x00\x00\x01\xc0\x00\xe0\xeb\x90\x06r\xe1@J5I\xfa\x1f\x15o""","{-2.109824,-2.109824,52.416313,52.416321}",0,"[{"""",""meta"",""511570855528771"",""2025-01-06T08:00:00.000Z"",0.983745}]","{""The Four Stones Coffee and Brunch"",null,null}","{""restaurant"",[""italian_restaurant"", ""british_restaurant""]}",0.983745,"[""http://www.thefourstones.co.uk/""]","[""https://www.facebook.com/511570855528771""]",,"[""+441562883260""]",,"[{""Adam's Hill"",null,""DY9 9PS"",null,""GB""}]"
"""08f195c76381496103ec859b3f38fa…","b""\x00\x00\x00\x00\x01\xc0\x00\xe1\x02\xff\x8e\xc0\xf9@J5J\xfdTT\x15""","{-2.109869,-2.109869,52.416344,52.416351}",0,"[{"""",""Microsoft"",""1125899910316703"",""2022-05-28T05:59:12.167Z"",0.77}]","{""The Four Stones Restaurant"",null,null}","{""breakfast_and_brunch_restaurant"",[""british_restaurant"", ""cafe"", … ""restaurant""]}",0.77,"[""http://www.thefourstones.co.uk/""]",,,"[""01562883260""]",,"[{""Adam's Hill, Clent"",""Stourbridge"",""DY9 9PS"","""",""GB""}]"
"""08f195c760c5bc2a03b7a56aa062f0…","b""\x00\x00\x00\x00\x01\xc0\x00\xe00\xdd\x17\x9d\xab@J7\x87\xa1I\x91D""","{-2.109468,-2.109468,52.433826,52.433834}",0,"[{"""",""meta"",""167039030569271"",""2025-01-06T08:00:00.000Z"",0.983745}]","{""Old Halesonians Association"",null,null}","{""sports_club_and_league"",[""active_life""]}",0.983745,"[""http://www.oldhalesrfc.co.uk/""]","[""https://www.facebook.com/167039030569271""]",,"[""+441562883036""]",,"[{""Wassell Gr La"",null,""DY9 9JP"",null,""GB""}]"
"""08f195c76458032b0351bc9d8081f5…","b""\x00\x00\x00\x00\x01\xc0\x00\xe0{\x1aR\x91\x9d@J9\xecg\x9c\xc7L""","{-2.10961,-2.10961,52.452526,52.452534}",0,"[{"""",""meta"",""145611086053605"",""2025-01-06T08:00:00.000Z"",0.897317}]","{""Solarfit Blinds"",null,null}","{""windows_installation"",[""linen"", ""home_improvement_store""]}",0.897317,"[""http://www.solarfitblinds.com/""]","[""https://www.facebook.com/145611086053605""]",,"[""+447815187181""]",,"[{""2 Perrins Lane"",""Stourbridge"",""DY9 8XR"",""ENG"",""GB""}]"
"""08f195c0d925427203b8204a41c614…","b""\x00\x00\x00\x00\x01\xc0\x00\xe0G@0q\xb8@J:V\x92\xb3\xccK""","{-2.109511,-2.109511,52.455761,52.455769}",0,"[{"""",""meta"",""101832158518065"",""2025-01-06T08:00:00.000Z"",0.868235}]","{""Waste in Time Lye"",null,null}","{""garbage_collection_service"",[""real_estate_service"", ""professional_services""]}",0.868235,"[""http://waste-in-time.co.uk/""]","[""https://www.facebook.com/101832158518065""]",,"[""+447935312913""]",,"[{""7 Star St"",null,""DY9 8TU"",null,""GB""}]"


In [93]:
# Primary categories that are treated as health-related
health_categories = [
    "hospital", "doctor", "hospice", "health_and_medical", "surgeon",
    "personal_care_service", "psychiatrist", "rehabilitation_center",
    "walk_in_clinic", "medical_center", "wellness_program", "surgical_center",
    "public_health_clinic", "physical_therapy", "occupational_medicine",
    "occupational_therapy"
]

# A place is kept only when its `confidence` is strictly above this value.
MIN_CONFIDENCE = 0.5

# Field of an address struct -> name of the flat output column.
ADDRESS_COLUMNS = {
    "freeform": "street",
    "locality": "locality",
    "postcode": "postcode",
    "region": "region",
    "country": "country",
}

# List-of-string contact columns that are reduced to one string per place.
CONTACT_COLUMNS = ["socials", "websites", "phones", "emails"]

# Only the first entry of each place's `addresses` list is used.
first_address = pl.col("addresses").list.first()

# Process and clean the dataset
uk_health_data = (
    bhx_places_pl
    # Lift the nested fields needed downstream into flat columns
    .with_columns(
        *[
            first_address.struct.field(struct_field).alias(column)
            for struct_field, column in ADDRESS_COLUMNS.items()
        ],
        pl.col("categories").struct.field("primary").alias("category"),
        pl.col("names").struct.field("primary").alias("name"),
        # These two keep one entry per source record, so they stay as lists.
        pl.col("sources").list.eval(pl.element().struct.field("dataset")).alias("dataset_source"),
        pl.col("sources").list.eval(pl.element().struct.field("update_time")).alias("update_time"),
    )

    # Filter for UK locations (based on the first address)
    .filter(pl.col("country") == "GB")

    # Filter for relevant health categories
    .filter(pl.col("category").is_in(health_categories))

    # Filter for confidence above the threshold
    .filter(pl.col("confidence") > MIN_CONFIDENCE)

    # Flatten the contact lists by taking their first entry, the same way the
    # address is handled. Chaining .explode() over several list columns instead
    # yields every combination of their entries, so a place with two websites
    # and two phones would become four rows sharing one uuid. Places with at
    # most one entry per list come out the same either way.
    .with_columns(pl.col(CONTACT_COLUMNS).list.first())

    # Rename and select final columns
    .rename({"id": "uuid", "phones": "phone_number"})
    .select([
        "uuid", "name", "category", "websites", "socials", "emails",
        "phone_number", "street", "locality", "postcode", "region",
        "country", "dataset_source", "update_time", "geometry"
    ])
)


print("✅ Processed UK health data successfully!")


✅ Processed UK health data successfully!


In [94]:
# Preview the first rows of the cleaned UK health places.
uk_health_data.head()

uuid,name,category,websites,socials,emails,phone_number,street,locality,postcode,region,country,dataset_source,update_time,geometry
str,str,str,str,str,str,str,str,str,str,str,str,list[str],list[str],binary
"""08f195c0d9254c2e03538ef7e1f18c…","""Ruselhall Hospital""","""hospital""",,"""https://www.facebook.com/58611…",,,,"""Stourbridge""",,,"""GB""","[""meta""]","[""2025-01-06T08:00:00.000Z""]","b""\x00\x00\x00\x00\x01\xc0\x00\xe0\xdd\xb8\xad[\x17@J:e\x12\xa9O\xf0"""
"""08f195c0dd75b4b503e549ca694105…","""Brook Dudley at Cranstoun ROUT…","""doctor""","""https://www.brook.org.uk/servi…",,,"""01384881830""","""The Blvd, Brierley Hill""","""Merry Hill Shopping Centre""","""DY5 1QX""","""""","""GB""","[""Microsoft""]","[""2024-11-26T23:58:36.140Z""]","b""\x00\x00\x00\x00\x01\xc0\x00\xe0\x7f(\x09l;@J=3\x15=\xe3T"""
"""08f195c0dcb46c5c03d0096b59103e…","""Keelinge House Surgery""","""hospital""","""http://www.keelingehousesurger…","""https://www.facebook.com/74781…",,"""+44138477194""","""174 Stourbridge Road""","""Dudley""","""DY1 2ER""","""ENG""","""GB""","[""meta""]","[""2025-01-06T08:00:00.000Z""]","b""\x00\x00\x00\x00\x01\xc0\x00\xe0\x80sW\xe6q@J@\x18\xe7W\x92\x8e"""
"""08f195c76915a66c0374fe3e4afc3d…","""Mercian Surgical Supply Co""","""hospital""","""https://www.merciansurgical.co…","""https://www.facebook.com/11922…",,"""+448448791133""","""10, Topaz Business Park, Topaz…",,"""B61 0GD""",,"""GB""","[""meta"", ""Microsoft""]","[""2025-01-06T08:00:00.000Z"", ""2019-04-15T22:22:57.440Z""]","b""\x00\x00\x00\x00\x01\xc0\x00a2!\xbb;\xab@J-\xb6\xfa\xd3pU"""
"""08f195c7691423150373e0e4f53c6e…","""Property Services Central Ltd.""","""personal_care_service""","""""",,,"""01527831335""","""350 Birmingham Road""","""Bromsgrove""","""B61 0HJ""","""""","""GB""","[""Microsoft""]","[""2017-01-21T04:05:59.453Z""]","b""\x00\x00\x00\x00\x01\xc0\x00_\x17D\xc0#q@J-\xd1\xfc\xb7v#"""
