# DFI Quick Start Guide - API Basics; A Small Query

This notebook will guide you through the basics by querying a
[small 25 million record dataset](https://www.microsoft.com/en-us/research/publication/geolife-gps-trajectory-dataset-user-guide/)
in the Data Flow Index from [General System](https://www.generalsystem.com).

OpenAPI specification documentation is available at
<https://api.dataflowindex.io/docs/api>.

Please refer to <https://github.com/thegeneralsystem/dfi-client-examples> for
the most up-to-date companion documentation.

Additional resources and help are available at <https://support.generalsystem.com>.

## Get ready

In [None]:
# Install Python modules if they are not already present.
!python3 -m pip install requests tabulate pydeck

In [None]:
# Import required modules.
import json
from typing import List

import requests
from tabulate import tabulate

In [None]:
import pandas as pd

# This tutorial uses PyDeck to visualise the data on a map.
# If you want to visualise data, please install PyDeck following the instructions:
#     https://deckgl.readthedocs.io/en/latest/installation.html
# You do not need a Mapbox API key (skip this step).
# You DO need to enable pydeck for Jupyter (follow this step in the guide).
import pydeck as pdk

In [None]:
# Configure access to the DFI API.
#
# The DFI demonstration servers require an API token. Enrol free of charge at
# <https://eap.generalsystem.com>; once enrolled, redeem your API token from
# <https://tokens.dataflowindex.io/>.

import os
from getpass import getpass

# Prompt for the token interactively so it is never stored in the notebook.
api_token = getpass("Enter your API token: ")

# Headers sent with every request: bearer authorisation, and JSON for both
# the request body and the response.
headers = {
    "Authorization": "Bearer " + api_token,
    "accept": "application/json",
    "content-type": "application/json",
}

# Connection settings shared by the requests calls in this notebook.
base_url = "https://api.dataflowindex.io"
query_timeout = 60  # passed as `timeout=` to requests (seconds)

#### In this tutorial we will be querying a small Geolife data set

Original source data: https://www.microsoft.com/en-us/research/publication/geolife-gps-trajectory-dataset-user-guide/

| total records | 24.9 million |
| ------------- | -------------- |
| distinct uuids | 18,670 |

#### Hardware
- The dataset runs on a single server hosted on AWS.
- The server has 2 vCPUs, 8 GB RAM and 1 x 75 GB NVMe SSD.

#### Note: this is a shared DFI instance; you cannot add data to it or delete data from it.

In [None]:
# Get the list of instances associated with your API key.
r = requests.get(f"{base_url}/instances", headers=headers, timeout=query_timeout)
# Report the server's reply and stop on a failed request (for example a
# rejected token), using the same check as the other GET cells.
if r.status_code != 200:
    print(f"Status code: {r.status_code}")
    print(f"Response:\n{r.text}")
    r.raise_for_status()
print(r.json())

In [None]:
# Choose the DFI instance to query. The `instance` query parameter is built
# as "<namespace>.<instance name>".
namespace, instance_name = "gs_eap_demo", "eap-2"
params = {"instance": ".".join((namespace, instance_name))}

## Query the data

In [None]:
# Measure how much data this DFI instance holds: total records, total
# entities, and the rounded mean number of records per entity.

# /count responds with the number of records.
r = requests.get(
    base_url + "/count",
    params=params,
    headers=headers,
    timeout=query_timeout,
)
if r.status_code != 200:
    # Print the server's reply before raising so the cause is visible.
    print(f"Status code: {r.status_code}")
    print(f"Response:\n{r.text}")
    r.raise_for_status()
total_histories = r.json()
print(f"Total records: {total_histories}")

# /entities responds with a collection of entities; its length is the entity count.
r = requests.get(
    base_url + "/entities",
    params=params,
    headers=headers,
    timeout=query_timeout,
)
if r.status_code != 200:
    print(f"Status code: {r.status_code}")
    print(f"Response:\n{r.text}")
    r.raise_for_status()

total_entities = len(r.json())
print(f"Total entities: {total_entities}")
if total_entities:  # skip the average for an empty instance (avoids dividing by zero)
    print(f"Average histories per entity: {round(total_histories / total_entities)}")

In [None]:
# A set of interesting polygons has been prepared for querying the datasets.
# Much of this data is in Beijing, China.
r = requests.get(
    f"{base_url}/namespaces/{namespace}/polygons",
    headers=headers,
    timeout=query_timeout,
)
if r.status_code != 200:
    # Print the server's reply before raising so the cause is visible.
    print(f"Status code: {r.status_code}")
    print(f"Response:\n{r.text}")
    r.raise_for_status()

# One table row per polygon: its name and its "count" field (shown as "vertices").
data = [[entry["name"], entry["count"]] for entry in r.json()["polygons"]]
print(tabulate(data, headers=["name", "vertices"], tablefmt="pretty"))

In [None]:
# This is a helper function that allows us to display a polygon on a map.
def show_polygon(query_polygon: str) -> pdk.Deck:
    """Fetch a stored polygon by name and draw it on a map.

    Args:
        query_polygon: Name of the polygon, looked up under the
            module-level ``namespace``.

    Returns:
        A ``pdk.Deck`` holding a single GeoJSON layer with the polygon.

    Raises:
        requests.HTTPError: If the polygon request returns an error status.
    """
    polygon_url = f"{base_url}/namespaces/{namespace}/polygons/" + query_polygon
    response = requests.get(polygon_url, headers=headers, timeout=query_timeout)
    response.raise_for_status()

    # Wrap the returned vertices as the single ring of a GeoJSON Polygon feature.
    polygon_feature = {
        "type": "Feature",
        "properties": {},
        "geometry": {"coordinates": [response.json()["vertices"]], "type": "Polygon"},
    }
    feature_collection = {"type": "FeatureCollection", "features": [polygon_feature]}

    # Flat, semi-transparent yellow fill with no outline stroke.
    layer_options = {
        "opacity": 0.2,
        "stroked": False,
        "filled": True,
        "extruded": False,
        "wireframe": True,
        "get_elevation": "0",
        "get_fill_color": "[255, 255, 0]",
        "get_line_color": [255, 255, 255],
        "pickable": True,
    }
    polygon_layer = pdk.Layer("GeoJsonLayer", feature_collection, **layer_options)

    # Initial camera: centred on (116.4, 39.9), looking straight down.
    camera = pdk.ViewState(
        longitude=116.4,
        latitude=39.9,
        zoom=10,
        min_zoom=5,
        max_zoom=15,
        pitch=0,
        bearing=0,
    )
    return pdk.Deck(layers=[polygon_layer], initial_view_state=camera)

In [None]:
# Show one of the stored polygons on a map.
# The call is left as the final bare expression of the cell so that the
# pdk.Deck it returns becomes the cell's output.
show_polygon("ch_beijing_110107")

In [None]:
# Count how many records there are inside a polygon within a time window.
polygon = "ch_beijing_110107"
time_params = {
    "startTime": "2008-01-01T08:00:00.001Z",
    "endTime": "2011-01-30T20:00:00.001Z",
}
r = requests.get(
    f"{base_url}/polygon/{namespace}.{polygon}/count",
    params=params | time_params,
    headers=headers,
    timeout=query_timeout,
)

print(f"Status code: {r.status_code}")
# Stop on an error response instead of reporting the error body as a count.
r.raise_for_status()
print(f"There are {r.json()} records in the polygon")

In [None]:
# Find the unique entities that have records inside a polygon within a time window.
polygon = "ch_beijing_110107"
time_params = {
    "startTime": "2008-01-01T08:00:00.001Z",
    "endTime": "2011-01-30T20:00:00.001Z",
}
# The instance selector and the time window are sent together as query parameters.
r = requests.get(
    f"{base_url}/polygon/{namespace}.{polygon}/entities",
    params={**params, **time_params},
    headers=headers,
    timeout=query_timeout,
)

print(f"Status code: {r.status_code}")
r.raise_for_status()
entities = r.json()

print(f"There are {len(entities)} entities in the polygon")
print("Here are the first 3 entity ids:", entities[:3])

In [None]:
def show_history(history: List[List[float]]) -> pdk.Deck:
    """Plot a list of points on a map.

    Args:
        history: Rows of two values each, read as ``[longitude, latitude]``.

    Returns:
        A ``pdk.Deck`` holding a single scatterplot layer with the points.
    """
    points = pd.DataFrame(history, columns=["Longitude", "Latitude"])

    # Red, mostly opaque dots positioned from the two DataFrame columns.
    layer_options = {
        "get_position": ["Longitude", "Latitude"],
        "auto_highlight": True,
        "elevation_scale": 500,
        "pickable": True,
        "elevation_range": [0, 300],
        "extruded": True,
        "filled": True,
        "opacity": 0.8,
        "radius_scale": 6,
        "radius_min_pixels": 1,
        "radius_max_pixels": 100,
        "line_width_min_pixels": 1,
        "get_fill_color": [255, 0, 0],
        "get_line_color": [255, 0, 0],
        "coverage": 1,
    }
    scatter_layer = pdk.Layer("ScatterplotLayer", points, **layer_options)

    # Initial camera: centred on (116.4, 39.9), looking straight down.
    camera = pdk.ViewState(
        longitude=116.4,
        latitude=39.9,
        zoom=10,
        min_zoom=5,
        max_zoom=15,
        pitch=0,
        bearing=0,
    )
    return pdk.Deck(layers=[scatter_layer], initial_view_state=camera)

In [None]:
# Fetch every record of a single entity and plot it on a map.
entity = "dacc1e4d-aeef-4d19-8c4f-834c9be949a7"
r = requests.get(
    f"{base_url}/entities/{entity}/history",
    params=params,
    headers=headers,
    timeout=query_timeout,
)
print(f"Status code: {r.status_code}")
r.raise_for_status()

# Keep the first two values of each record's "coordinate" field;
# show_history reads them as longitude and latitude.
history = [[point[0], point[1]] for point in (item["coordinate"] for item in r.json())]
show_history(history)

In [None]:
# Fetch every record inside a polygon within a time window and plot it on a map.
polygon = "ch_beijing_110107"
time_params = {
    "startTime": "2008-01-01T08:00:00.001Z",
    "endTime": "2011-01-30T20:00:00.001Z",
}
# The instance selector and the time window are sent together as query parameters.
r = requests.get(
    f"{base_url}/polygon/{namespace}.{polygon}/history",
    params={**params, **time_params},
    headers=headers,
    timeout=query_timeout,
)
print(f"Status code: {r.status_code}")
r.raise_for_status()

# Keep the first two values of each record's "coordinate" field;
# show_history reads them as longitude and latitude.
history = [[point[0], point[1]] for point in (item["coordinate"] for item in r.json())]
show_history(history)

In [None]:
# A polygon query can also carry the polygon's vertices directly in the request body.
# Vertices must be listed in counter-clockwise order as mandated in the GeoJSON standard.
ring = [[-1.1, 1.1], [-1.1, -1.1], [1.1, -1.1], [1.1, 1.1]]
ring.append(list(ring[0]))  # repeat the first vertex to close the ring
payload = {"vertices": ring}
r = requests.post(
    f"{base_url}/polygon/count",
    params=params,
    headers=headers,
    json=payload,
    timeout=query_timeout,
)

print(f"Status code: {r.status_code}")
r.raise_for_status()
print(f"Records found: {r.text}")

# Adding polygons

In [None]:
# Polygons can be defined, named and stored for later use.
# Polygons are used in "points in polygon" queries. As polygon definitions may
# be large and complex, they can be stored and referred to by name in queries.
# A polygon could be, for instance, the boundary of a country and be several MBs in size.
# Here we create a new polygon.
# Vertices must be listed in counter-clockwise order as mandated in the GeoJSON standard.
payload = {
    "name": "my-first-polygon",
    "vertices": [[-1.1, +1.1], [-1.1, -1.1], [+1.1, -1.1], [+1.1, +1.1], [-1.1, +1.1]],
}
r = requests.post(f"{base_url}/polygons", json=payload, headers=headers, timeout=query_timeout)
print(f"Status code: {r.status_code}")
# Show the server's explanation if the request was rejected. The failure is
# reported without raising, so the rest of the notebook can still be run.
if not r.ok:
    print(f"Response:\n{r.text}")

In [None]:
# List the stored polygons returned by /polygons.
r = requests.get(
    f"{base_url}/polygons",
    headers=headers,
    timeout=query_timeout,
)
if r.status_code != 200:
    # Print the server's reply before raising so the cause is visible.
    print(f"Status code: {r.status_code}")
    print(f"Response:\n{r.text}")
    r.raise_for_status()

# One table row per polygon: its name and its "count" field (shown as "vertices").
data = [[entry["name"], entry["count"]] for entry in r.json()["polygons"]]
print(tabulate(data, headers=["name", "vertices"], tablefmt="pretty"))

### Bounding box query methods
The user supplies a bounding box by giving its minimum and maximum longitude and latitude (`minLng`, `minLat`, `maxLng`, `maxLat`). The DFI will find all points (observations) that lie within it. There are three types of query:

* `count` - Computes how many points lie within the bounding box
* `history` - Returns the details of the points that lie within the bounding box
* `entities` - Returns the list of unique sensor IDs that lie within the bounding box

All queries optionally support a time range and can restrict the search to a given list of sensor IDs.

In [None]:
# Count the records that lie inside a bounding box.
payload = {"minLng": 115.5, "minLat": 39.5, "maxLng": 116.5, "maxLat": 40.5}
r = requests.post(
    f"{base_url}/bounding-box/count",
    json=payload,
    headers=headers,
    params=params,
    timeout=query_timeout,
)
print(f"Status code: {r.status_code}")
r.raise_for_status()
# Formatted like the other bounding-box cells (a single space after the colon).
print(f"Results: {r.text}")

In [None]:
# Fetch the records that lie inside a (smaller) bounding box and print the raw response.
payload = dict(minLng=115.5, minLat=39.5, maxLng=115.8, maxLat=39.6)
r = requests.post(
    f"{base_url}/bounding-box/history",
    params=params,
    headers=headers,
    json=payload,
    timeout=query_timeout,
)
print(f"Status code: {r.status_code}")
r.raise_for_status()
print(f"Results: {r.text}")

In [None]:
# Fetch the entities that have records inside a bounding box and print the raw response.
payload = dict(minLng=115.5, minLat=39.5, maxLng=116.5, maxLat=40.5)
r = requests.post(
    f"{base_url}/bounding-box/entities",
    params=params,
    headers=headers,
    json=payload,
    timeout=query_timeout,
)
print(f"Status code: {r.status_code}")
r.raise_for_status()
print(f"Results: {r.text}")