In [None]:
import pandas as pd
import numpy as np
import wbddh
import geopandas as gpd
import matplotlib.pyplot as plt
import warnings
from rio_tiler.io import COGReader
from rio_tiler.utils import create_cutline
from rasterio.features import bounds as featureBounds

# Ignore every warning category for the rest of the session (presumably to keep the demo output readable).
# NOTE(review): this blanket filter also hides warnings that may matter; narrow or remove it when debugging.
warnings.filterwarnings("ignore")

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
## If the required packages are not installed, uncomment the line below to install them with pip

#%pip install -r requirements.txt

## Introduction to Data Catalog (DDH) APIs

The Data Catalog (DDH) is the World Bank’s central data hub, hosting more than 16,000 datasets—including indicators, microdata, geospatial assets, and reproducibility packages. DDH underpins many Bank platforms by offering direct, cloud-native access to data that applications can consume seamlessly.

In this session, we showcase some of the most commonly used API endpoints to help you identify the data you need and use it in your workflows. 

DDH serves both internal and external audiences, so you'll notice separate API hosts for each. The internal host requires user authentication, and additional documentation will be provided to guide you through the process. 

We've developed a [swagger page](https://ddh-openapi.worldbank.org/docs/index.html) that provides more information on all the DDH endpoints. 

### Authentication

In [None]:
## https://ddh-openapi.worldbank.org -- host for external use

## Setting the API host to the external (public) one. Internal users can switch to the internal host, which requires authentication.

wbddh.set_api_host("https://ddh-openapi.worldbank.org")

### Search

You can search the entire data catalog either by passing keywords (as you would in the UI) or by applying filters to specific fields. DDH has its own metadata schema, so be sure to take a look at a dataset response. 

In [None]:
## Fetch the catalog-wide dataset listing from DDH.
## The endpoint is paginated; by default a request returns 50 records.

datasets_response = wbddh.get("datasets")
ds_all = datasets_response.json()

In [None]:
## Preview the first rows of the listing. Each record carries basic dataset
## information: the dataset unique ID, name, description, and related dates.

ds_all_table = pd.DataFrame(ds_all['data'])
ds_all_table.head()

In [None]:
## Keyword search: query the "search" endpoint for datasets matching "gdp"

params = {
    "qname": "dataset",
    "param": "gdp",
}

search_response = wbddh.get("search", params=params)
ds_gdp = search_response.json()

In [None]:
## Preview the first rows of the keyword-search results
gdp_hits = pd.DataFrame(ds_gdp['data'])
gdp_hits.head()

In [None]:
## Filtered search: keep datasets whose geographical coverage includes country code 'AF'

params = {
    "qname": "dataset",
    "filter": "geographical_extent/coverage/any(i: i/code eq 'AF')",
}

search_response = wbddh.get("search", params=params)
ds_con = search_response.json()

In [None]:
## Preview the first rows of the country-filtered results
country_hits = pd.DataFrame(ds_con['data'])
country_hits.head()

In [None]:
## Search for a dataset by its unique identifier

## 0066940 - Space2Stats Monthly & Annual Black Marble Nighttime Lights
## `show_resources` is passed so that the response also lists the dataset's resources
## (read from `ds['resources']` in a later cell).

ds = wbddh.get("datasets/0066940", params={"show_resources": True}).json()

In [None]:
## Display the parsed JSON response for the dataset
ds

In [None]:
## List every resource attached to the dataset: name, unique ID and file format

for res in ds['resources']:
    print(f"{res['name']} : {res['resource_unique_id']} \n File Format - {res['format']}")

In [None]:
## Getting data schema for a resource
## The resource ID is fixed, so the endpoint path is a plain string literal.

wbddh.get("resources/DR0095688/metadata").json()

In [None]:
## Getting data for a resource
## You can use the filter parameter to filter the data. The syntax for the filter parameter is based on OData filter syntax.
## NOTE(review): the filter below uses `=` rather than OData's `eq` operator -- confirm the expected syntax against the DDH API docs.

rs_params = {"filter": "ISO_A3='IND'"}

rs = wbddh.get("resources/DR0095688/data", params=rs_params).json()

In [None]:
## Tabulate the records returned under the response's 'value' key
rs_table = pd.DataFrame(rs['value'])
rs_table

### Accessing geospatial data

While the endpoint above provides access to tabular data, the following snippets will showcase how to retrieve geospatial data directly from DDH storage. 

P.S. The full suite of geospatial services will be made available later this year. 

#### Vector Data

Some vector data formats, such as CSV, GeoJSON, GeoParquet, and GPKG, can be read directly from storage. Here we'll use a dataset with a GeoJSON resource.  

In [None]:
## Read a Power Plant dataset from Uzbekistan (dataset ID 0041474), including its resources.
## NOTE: this rebinds `ds`, replacing the dataset response loaded earlier in the notebook.

ds = wbddh.get("datasets/0041474", params={"show_resources": True}).json()

In [None]:
## List every resource attached to the dataset: name, unique ID and file URL
for res in ds['resources']:
    print(f"{res['name']} : {res['resource_unique_id']} \n {res['url']}")

In [None]:
## Load the "Existing Power Plants in Uzbekistan" layer into a GeoDataFrame.
## The file URL is taken from the first resource in the dataset response.

first_resource = ds['resources'][0]
uz_pp = gpd.read_file(first_resource['url'])

In [None]:
## Visualize the power plants in Uzbekistan: build the map, then write it to an HTML file
uz_pp_map = uz_pp.explore()
uz_pp_map.save("uz_pp.html")

In [None]:
## If gdf.explore fails, you can also use leafmap to visualize the data 

# %pip install leafmap
# import leafmap

# m = leafmap.Map()
# m.add_gdf(uz_pp, layer_name="My Points")
# m

#### Raster Data

Some datasets on DDH have been made Cloud Optimized GeoTIFF (COG) compatible, so you can query the data for your AOI instead of downloading the entire file to your computer. In this example, we have a maize mask for Malawi, and we'll subset it to a small region for our analysis. 

In [None]:
## GeoJSON Feature describing the area of interest: a rectangular polygon (Malawi).
## The ring is closed, i.e. its first and last vertices are identical.

feat = {
    "type": "Feature",
    "properties": {},
    "geometry": {
        "type": "Polygon",
        "coordinates": [
            [
                [33.64022163514227, -13.095832877158898],
                [33.64022163514227, -13.509532545665095],
                [34.134782313766124, -13.509532545665095],
                [34.134782313766124, -13.095832877158898],
                [33.64022163514227, -13.095832877158898],
            ]
        ],
    },
}

In [None]:
# Bounding box of the polygon feature
bbox = featureBounds(feat)

# Location of the maize-mask Cloud Optimized GeoTIFF to read from
maize_mask_url = "https://datacatalogfiles.worldbank.org/ddh-published/0037935/1/DR0046011/mwi_maize_mask_cog_2016.tif"

# Open the dataset with COGReader for the duration of the `with` block
with COGReader(maize_mask_url) as cog:
    # Create a WKT cutline from the feature geometry (declared as EPSG:4326)
    cutline = create_cutline(cog.dataset, feat, geometry_crs="epsg:4326")

    # Read only the bbox part of the data, passing the cutline via the VRT options to mask it
    vrt_options = {'cutline': cutline}
    data_, mask_ = cog.part(bbox, vrt_options=vrt_options)

In [None]:
## Visualize the masked data
## (index 0 along the first axis -- presumably the first band; confirm against the reader's output layout)

first_layer = data_[0]
plt.imshow(first_layer);

## Exercise

- Search for the "Burkina Faso Administrative Boundaries" dataset on DDH and retrieve metadata for all its resources. 
- Use the resource IDs to get the file URL for the "District boundary" GeoJSON file. 
- Now search for the "Burkina Faso - Electricity Transmission Network" dataset and get the file link for its GeoJSON file. 
- Plot the two layers overlaid on top of each other.

Hint:
- When plotting the first GeoDataFrame, assign the result to a variable `ax`
- For the second GeoDataFrame's plot, pass the parameter `ax=ax`. For instance, the plot for gdf2 will look like `gdf2.plot(ax=ax)`
- Double check that the two GeoDataFrames have the same CRS. 
    - Check with `print(gdf1.crs == gdf2.crs)`
    - If not, you can set gdf2 crs as `gdf2 = gdf2.set_crs(gdf1.crs, allow_override=True)`. Note that this only relabels the CRS; if the coordinates are genuinely in a different CRS, reproject instead with `gdf2 = gdf2.to_crs(gdf1.crs)`