In [2]:
import geopandas as gpd
import pandas as pd
import pyarrow.parquet as pq

In [4]:
# function to load shapefiles and add osm_type column
def load_health_data(file_path, osm_type, encoding="utf-8"):
    """ Load Health facility shapefile and assign an OSM type."""
    try:
        gdf = gpd.read_file(file_path, encoding=encoding)
    except UnicodeDecodeError:
        print(f"⚠️ UTF-8 decoding failed for {file_path}. Trying ISO-8859-1...")
        gdf = gpd.read_file(file_path, encoding="ISO-8859-1")
        
    gdf["osm_type"] = osm_type
    return gdf

## Load USA Health Data

us_health_node_raw = load_health_data("../data-raw/hdx/United States/United States-node.shp", "node")
us_health_way_raw = load_health_data("../data-raw/hdx/United States/United States-node.shp", "way")

# Combine both USA datasets hdx
us_health_combined = pd.concat([us_health_node_raw, us_health_way_raw], ignore_index=True)

⚠️ UTF-8 decoding failed for ../data-raw/hdx/United States/United States-node.shp. Trying ISO-8859-1...
⚠️ UTF-8 decoding failed for ../data-raw/hdx/United States/United States-node.shp. Trying ISO-8859-1...


In [5]:
us_health_combined.head()

Unnamed: 0,osm_id,amenity,healthcare,name,operator,source,speciality,operator_t,contact_nu,operationa,...,addr_house,addr_stree,addr_postc,addr_city,changeset_,changeset_.1,changeset_.2,changeset_.3,geometry,osm_type
0,354398914,doctors,,Van Horne Family Medical Clinic,Virginia Gay Hospital,,,,,,...,205,Main Street,52346,Van Horne,watmildon,watmildon,watmildon,watmildon,POINT (-92.09074 42.00889),node
1,354399084,doctors,,Hopkinton Family Medical Center,Regional Medical Center,,,,,,...,122,1st Street Southwest,52310,Monticello,b-jazz-bot,b-jazz-bot,b-jazz-bot,b-jazz-bot,POINT (-91.24876 42.3437),node
2,354399343,doctors,,Covenant Clinic,Wheaton Franciscan Healthcare,,,,,,...,1094,220th Street,50648,Jesup,watmildon,watmildon,watmildon,watmildon,POINT (-92.06211 42.46773),node
3,1667420596,pharmacy,pharmacy,Rite Aid,,,,,,,...,2170,Frederick Douglass Boulevard,10026,,CjMalone,CjMalone,CjMalone,CjMalone,POINT (-73.95453 40.80515),node
4,354400197,doctors,doctor,"Genesis Health Group, Clinton Internal Medicine",,,internal_medicine,,+1 563 242 7522,,...,221,Main Avenue,52732,Clinton,thetornado76,thetornado76,thetornado76,thetornado76,POINT (-90.18055 41.87299),node


In [6]:
# Rename columns
us_health_data = us_health_combined.rename(columns={
    "operator_t": "operator_type",
    "operationa": "operational",
    "contact_nu": "contact_number",
    "opening_ho": "opening_hours",
    "beds": "num_beds",
    "staff_doct": "staff_docter",
    "staff_nurs": "staff_nursery",
    "health_ame": "health_amenities",
    "water_sour": "water_source",
    "electricit": "electricity",
    "addr_stree": "street",
    "addr_postc": "postcode",
    "addr_city": "city",
    "addr_house": "house_number"
})

# Drop unnecessary columns
cols_to_drop = ["operational", "water_source", "insurance", "staff_docter", "staff_nursery", 
                "health_amenities", "wheelchair", "emergency", "electricity", 
                "is_in_heal", "is_in_heal.1", "changeset_.1", "dispensing", 
                "url", "changeset_", "changeset_.2", "changeset_.3", "contact_number"]

us_health_data = us_health_data.drop(columns=cols_to_drop, errors="ignore")

# Create full address column
us_health_data["iso3c"] = "USA"
us_health_data["downloaded_from"] = "https://data.humdata.org/organization/healthsites"

us_health_data["address"] = us_health_data[["house_number", "street", "postcode", "city", "iso3c"]].apply(
    lambda x: ", ".join(x.dropna().astype(str)), axis=1
)

In [7]:
us_health_data.head()

Unnamed: 0,osm_id,amenity,healthcare,name,operator,source,speciality,operator_type,opening_hours,num_beds,house_number,street,postcode,city,geometry,osm_type,iso3c,downloaded_from,address
0,354398914,doctors,,Van Horne Family Medical Clinic,Virginia Gay Hospital,,,,"Mo-Th 08:00-12:00,13:00-17:00; Fr 08:00-12:00",,205,Main Street,52346,Van Horne,POINT (-92.09074 42.00889),node,USA,https://data.humdata.org/organization/healthsites,"205, Main Street, 52346, Van Horne, USA"
1,354399084,doctors,,Hopkinton Family Medical Center,Regional Medical Center,,,,"Mo,Tu,Th 08:00-12:00,01:00-17:00; Fr 08:00-12:00",,122,1st Street Southwest,52310,Monticello,POINT (-91.24876 42.3437),node,USA,https://data.humdata.org/organization/healthsites,"122, 1st Street Southwest, 52310, Monticello, USA"
2,354399343,doctors,,Covenant Clinic,Wheaton Franciscan Healthcare,,,,Mo-Fr 08:30-17:00,,1094,220th Street,50648,Jesup,POINT (-92.06211 42.46773),node,USA,https://data.humdata.org/organization/healthsites,"1094, 220th Street, 50648, Jesup, USA"
3,1667420596,pharmacy,pharmacy,Rite Aid,,,,,Mo-Fr 08:00-21:00; Sa 09:00-18:00; Su 10:00-17...,,2170,Frederick Douglass Boulevard,10026,,POINT (-73.95453 40.80515),node,USA,https://data.humdata.org/organization/healthsites,"2170, Frederick Douglass Boulevard, 10026, USA"
4,354400197,doctors,doctor,"Genesis Health Group, Clinton Internal Medicine",,,internal_medicine,,Mo-Fr 08:00-17:00,,221,Main Avenue,52732,Clinton,POINT (-90.18055 41.87299),node,USA,https://data.humdata.org/organization/healthsites,"221, Main Avenue, 52732, Clinton, USA"


In [8]:
# Load UK health data
uk_health_node_raw = load_health_data("../data-raw/hdx/United Kingdom/United Kingdom-node.shp", "node")
uk_health_way_raw = load_health_data("../data-raw/hdx/United Kingdom/United Kingdom-way.shp", "way")

# Combine UK datasets
uk_health_combined = pd.concat([uk_health_node_raw, uk_health_way_raw], ignore_index=True)

# Rename columns
uk_health_data = uk_health_combined.rename(columns={
    "operator_t": "operator_type",
    "operationa": "operational",
    "contact_nu": "contact_number",
    "opening_ho": "opening_hours",
    "beds": "num_beds",
    "staff_doct": "staff_docter",
    "staff_nurs": "staff_nursery",
    "health_ame": "health_amenities",
    "water_sour": "water_source",
    "electricit": "electricity",
    "addr_stree": "street",
    "addr_postc": "postcode",
    "addr_city": "city",
    "addr_house": "house_number"
})

# Drop unnecessary columns
uk_health_data = uk_health_data.drop(columns=cols_to_drop, errors="ignore")

# Create full address column
uk_health_data["iso3c"] = "GBR"
uk_health_data["downloaded_from"] = "https://data.humdata.org/organization/healthsites"

uk_health_data["address"] = uk_health_data[["house_number", "street", "postcode", "city", "iso3c"]].apply(
    lambda x: ", ".join(x.dropna().astype(str)), axis=1
)


⚠️ UTF-8 decoding failed for ../data-raw/hdx/United Kingdom/United Kingdom-node.shp. Trying ISO-8859-1...


In [9]:
# Load Nigeria health data (GeoJSON format)
ng_health_raw = gpd.read_file("../data-raw/hdx/nigeria.geojson")

# Rename columns
ng_health_data = ng_health_raw.rename(columns={
    "addr_street": "street",
    "addr_postcode": "postcode",
    "addr_city": "city",
    "addr_housenumber": "house_number",
    "beds": "num_beds"
})

# Drop unnecessary columns
cols_to_drop_ng = ["operational_status", "water_source", "insurance", "staff_doctors", "staff_nurses", 
                   "health_amenity_type", "wheelchair", "emergency", "electricity", 
                   "is_in_health_area", "is_in_health_zone", "changeset_id", "dispensing", 
                   "url", "changeset_id", "changeset_version", "changeset_timestamp", 
                   "uuid", "completeness"]

ng_health_data = ng_health_data.drop(columns=cols_to_drop_ng, errors="ignore")

# Create full address column
ng_health_data["iso3c"] = "NGA"
ng_health_data["downloaded_from"] = "https://data.humdata.org/organization/healthsites"

ng_health_data["address"] = ng_health_data[["house_number", "street", "postcode", "city", "iso3c"]].apply(
    lambda x: ", ".join(x.dropna().astype(str)), axis=1
)

# Select and reorder columns
selected_columns = ["osm_id", "amenity", "healthcare", "name", "operator", "source", "speciality", 
                    "operator_type", "opening_hours", "num_beds", "house_number", "street", 
                    "postcode", "city", "osm_type", "geometry", "iso3c", "downloaded_from", "address"]

ng_health_data = ng_health_data[selected_columns]
ng_health_data["osm_id"] = ng_health_data["osm_id"].astype(str)


In [10]:
# Combine all datasets
health_data = pd.concat([ng_health_data, us_health_data, uk_health_data], ignore_index=True)

In [12]:
# Save the cleaned data
health_data.to_parquet("../data/hdx_health_data_ng_us_uk_beta.parquet", engine="pyarrow")

In [15]:
len(health_data)

150924

In [16]:
len(us_health_data)

117352

In [17]:
len(uk_health_data)

26738

In [18]:
len(ng_health_data)

6834