# Batch ingestion and Hybrid Tables

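Segments are transferred as tar archives and can be downloaded from the controller. This notebook downloads a completed realtime segment, rewrites its metadata to target the offline table, and pushes the result back to the controller as an offline segment of the same hybrid table.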

In [13]:
# all imports
import os
import requests
import shutil
import fileinput
import tarfile
import json
import io
import re
import pandas as pd

In [14]:
# some helpers
def query_sql(query):
    """Send a SQL query to the Pinot broker and return the decoded JSON response.

    Args:
        query: SQL statement; sent as the ``sql`` field of the JSON request body.

    Returns:
        The broker's response body, parsed from JSON.
    """
    payload = {"sql": query}
    broker_response = requests.post('http://pinot-broker.pinot:8099/query/sql', json=payload)
    return broker_response.json()

def query_sql_dataframe(query):
    """Run a SQL query via ``query_sql`` and return the result rows as a DataFrame.

    Args:
        query: SQL statement, passed through unchanged to ``query_sql``.

    Returns:
        pandas.DataFrame whose columns are taken from
        ``resultTable.dataSchema.columnNames`` and whose rows are taken from
        ``resultTable.rows`` of the response.

    Raises:
        KeyError: if the response carries no ``resultTable``. The message
            includes the response's ``exceptions`` entry (if any) so a failed
            query is diagnosable instead of surfacing as a bare
            ``KeyError: 'resultTable'``.
    """
    result = query_sql(query)
    result_table = result.get('resultTable')
    if result_table is None:
        # Same exception type as the plain dict lookup would raise, but with context.
        raise KeyError(
            f"resultTable missing from query response; exceptions: {result.get('exceptions')}"
        )
    return pd.DataFrame(
        columns=result_table['dataSchema']['columnNames'],
        data=result_table['rows'],
    )

In [15]:
# Fetch the 'trips' schema from the controller; the decoded JSON is the cell's displayed value.
schema_response = requests.get('http://pinot-controller.pinot:9000/schemas/trips')
schema_response.json()

{'schemaName': 'trips',
 'dimensionFieldSpecs': [{'name': 'rider_name',
   'dataType': 'STRING',
   'defaultNullValue': ''},
  {'name': 'driver_name', 'dataType': 'STRING', 'defaultNullValue': ''},
  {'name': 'license_plate', 'dataType': 'STRING', 'defaultNullValue': ''},
  {'name': 'start_location', 'dataType': 'STRING', 'defaultNullValue': ''},
  {'name': 'start_zip_code', 'dataType': 'STRING', 'defaultNullValue': ''},
  {'name': 'end_location', 'dataType': 'STRING', 'defaultNullValue': ''},
  {'name': 'end_zip_code', 'dataType': 'STRING', 'defaultNullValue': ''},
  {'name': 'rider_is_premium', 'dataType': 'INT', 'defaultNullValue': 0}],
 'metricFieldSpecs': [{'name': 'count',
   'dataType': 'LONG',
   'defaultNullValue': 1},
  {'name': 'payment_amount', 'dataType': 'FLOAT'},
  {'name': 'payment_tip_amount', 'dataType': 'FLOAT'},
  {'name': 'trip_wait_time_millis', 'dataType': 'LONG'},
  {'name': 'rider_rating', 'dataType': 'INT'},
  {'name': 'driver_rating', 'dataType': 'INT'}],
 'd

In [16]:
# Pieces of the table definition that are shared by both halves of the hybrid table.
# They are kept in named dicts so the REALTIME-only keys can be added to them below.
segments_config = {
    "timeColumnName": "trip_start_time_millis",
    "timeType": "MILLISECONDS",
    "retentionTimeUnit": "DAYS",
    "retentionTimeValue": "60",
    "schemaName": "trips",
    "replication": "1",
}
table_index_config = {
    "loadMode": "MMAP",
    "invertedIndexColumns": [
        "rider_name",
        "driver_name",
        "start_location",
        "end_location",
    ],
}

# The config starts out as the OFFLINE variant.
table_config = {
    "tableName": "trips_hybrid",
    "tableType": "OFFLINE",
    "segmentsConfig": segments_config,
    "tenants": {},
    "tableIndexConfig": table_index_config,
    "metadata": {"customConfigs": {}},
}

tables_url = 'http://pinot-controller.pinot:9000/tables'

# create offline table
offline_reply = requests.post(tables_url, json=table_config)
print(offline_reply.json())

# create realtime table: the same dict is switched to REALTIME and extended in place
# with the replica setting and the Kafka stream settings before it is posted again
kafka_stream_config = {
    "streamType": "kafka",
    "stream.kafka.consumer.type": "simple",
    "stream.kafka.topic.name": "trips",
    "stream.kafka.decoder.class.name": "org.apache.pinot.plugin.stream.kafka.KafkaJSONMessageDecoder",
    "stream.kafka.consumer.factory.class.name": "org.apache.pinot.plugin.stream.kafka20.KafkaConsumerFactory",
    "stream.kafka.zk.broker.url": "pinot-kafka-zookeeper:2181",
    "stream.kafka.broker.list": "pinot-kafka:9092",
    "realtime.segment.flush.threshold.time": "12h",
    "realtime.segment.flush.threshold.size": "5000",
    "stream.kafka.consumer.prop.auto.offset.reset": "smallest",
}
table_config.update(tableType="REALTIME")
segments_config["replicasPerPartition"] = "1"
table_index_config["streamConfigs"] = kafka_stream_config
realtime_reply = requests.post(tables_url, json=table_config)
print(realtime_reply.json())

{'code': 409, 'error': 'Table trips_hybrid_OFFLINE already exists'}
{'status': 'Table trips_hybrid_REALTIME succesfully added'}


In [17]:
# Ask the controller for the segment listing of the hybrid table.
response = requests.get('http://pinot-controller.pinot:9000/segments/trips_hybrid').json()

# Keep the segment list of the entry carrying a "REALTIME" key
# (the last such entry wins; the list stays empty when there is none).
realtime_segments = []
realtime_lists = [entry["REALTIME"] for entry in response if "REALTIME" in entry]
if realtime_lists:
    realtime_segments = realtime_lists[-1]
pd.DataFrame(realtime_segments, columns=["REALTIME"])

Unnamed: 0,REALTIME
0,trips_hybrid__0__10__20210409T1914Z
1,trips_hybrid__0__11__20210409T1916Z
2,trips_hybrid__0__12__20210409T1919Z
3,trips_hybrid__0__13__20210409T1920Z
4,trips_hybrid__0__14__20210409T1920Z
5,trips_hybrid__0__15__20210409T1920Z
6,trips_hybrid__0__16__20210409T1920Z
7,trips_hybrid__0__17__20210409T1920Z
8,trips_hybrid__0__18__20210409T1920Z
9,trips_hybrid__0__19__20210409T1920Z


In [7]:
# Get the download URL of the first realtime segment whose status is DONE.
# Segments in any other status are skipped instead of silently leaving download_url
# empty, which would only fail later, in the download cell, with an unrelated-looking error.
segment_name = None
segment_meta = None
download_url = ""
for candidate_name in realtime_segments:
    candidate_meta = requests.get(
        f'http://pinot-controller.pinot:9000/segments/trips_hybrid/{candidate_name}/metadata'
    ).json()
    if candidate_meta.get("segment.realtime.status") == "DONE":
        segment_name = candidate_name
        segment_meta = candidate_meta
        download_url = candidate_meta["segment.realtime.download.url"]
        break

if not download_url:
    # Also covers an empty segment list, which previously raised a bare IndexError.
    raise RuntimeError("No realtime segment of trips_hybrid with status DONE and a download URL was found")
print(download_url)

http://pinot-controller-0.pinot-controller-headless.pinot.svc.cluster.local:9000/segments/trips_hybrid/trips_hybrid__0__10__20210409T1914Z


In [8]:
segment_realtime_tar = "/tmp/trips_segment_realtime.tar.gz"

# cleanup old downloads (best effort: a missing file is not an error)
try:
    os.remove(segment_realtime_tar)
except OSError:
    pass

# Download the segment to a local file.
# The context manager releases the streamed connection even if writing fails, and
# raise_for_status() stops an HTTP error body from being saved as if it were the tar.
with requests.get(download_url, stream=True) as response:
    print(response)
    response.raise_for_status()
    with open(segment_realtime_tar, 'wb') as out_file:
        shutil.copyfileobj(response.raw, out_file)

<Response [200]>


In [9]:
segment_offline_basedir = "/tmp/trips_segment_realtime"

# start from a clean extraction directory
shutil.rmtree(segment_offline_basedir, ignore_errors=True)

# unpack the downloaded realtime segment tar
with tarfile.open(segment_realtime_tar, 'r:gz') as realtime_archive:
    realtime_archive.extractall(path=segment_offline_basedir)

# rewrite metadata.properties so every reference to the REALTIME table names the OFFLINE table
segment_offline_dir = f"{segment_offline_basedir}/{segment_name}"
metadata_file = f"{segment_offline_dir}/v3/metadata.properties"
with open(metadata_file, 'r') as metadata_in:
    metadata_contents = metadata_in.read()
with open(metadata_file, 'w') as metadata_out:
    metadata_out.write(metadata_contents.replace('trips_hybrid_REALTIME', 'trips_hybrid_OFFLINE'))
del metadata_contents

# pack the modified directory into a new offline segment tar, keeping the segment name as archive root
segment_offline_tar = "/tmp/trips_segment_offline.tar.gz"
with tarfile.open(segment_offline_tar, 'w:gz') as offline_archive:
    offline_archive.add(segment_offline_dir, arcname=segment_name)

### Segment URI Push
Let the controller fetch the segment tar from a blob store ([docs](https://docs.pinot.apache.org/basics/data-import/batch-ingestion#segment-uri-push)).

In [10]:
# Segment URI push, kept for reference only (disabled): it would POST to the controller
# with UPLOAD_TYPE=URI and DOWNLOAD_URI set to download_url.
# NOTE(review): download_url is the controller's download link for the realtime segment,
# not the rewritten offline tar under /tmp — presumably why this is disabled; confirm
# before enabling.
# response = requests.post('http://pinot-controller.pinot:9000/v2/segments?table=trips_hybrid', headers={
#     'UPLOAD_TYPE': 'URI',
#     'DOWNLOAD_URI': download_url
# })
# print(response)
# print(response.json())

### Segment Tar Push
Push the segment tar directly to the controller ([docs](https://docs.pinot.apache.org/basics/data-import/batch-ingestion#segment-tar-push)).

In [11]:
# POST segment as multipart/form-data for key 'segment'.
# The tar is opened in a with-block so the file handle is closed once the upload
# has finished (or failed) instead of being leaked for the lifetime of the kernel.
with open(segment_offline_tar, 'rb') as segment_file:
    response = requests.post('http://pinot-controller.pinot:9000/v2/segments?table=trips_hybrid', files={
        'segment': segment_file
    })
print(response)
print(response.json())

<Response [200]>
{'status': 'Successfully uploaded segment: trips_hybrid__0__10__20210409T1914Z of table: trips_hybrid_OFFLINE'}


### Show external view for hybrid table

In [12]:
externalview = requests.get('http://pinot-controller.pinot:9000/tables/trips_hybrid/externalview').json()

# segment name -> {table type -> [short server names]}; becomes one row per segment below
externalview_data = {}

# reduces a full server instance id to its short 'pinot-server-N' part
server_name_regex = re.compile('pinot-server-[0-9]+')

for table_type, externalview_per_type in externalview.items():
    # Treat a missing (null) per-type view as empty rather than crashing on .items().
    for segment, segment_servers in (externalview_per_type or {}).items():
        # Register the segment even when it has no servers so it still gets a row.
        servers_by_type = externalview_data.setdefault(segment, {})
        for server in segment_servers:
            match = server_name_regex.search(server)
            # Fall back to the raw instance id when it does not follow the expected pattern.
            server_name = match.group() if match else server
            servers_by_type.setdefault(table_type, []).append(server_name)

pd.DataFrame(externalview_data).transpose()

Unnamed: 0,OFFLINE,REALTIME
trips_hybrid__0__0__20210407T1824Z,[pinot-server-0],
trips_hybrid__0__10__20210409T1914Z,[pinot-server-2],[pinot-server-1]
trips_hybrid__0__1__20210407T1824Z,[pinot-server-1],[pinot-server-1]
trips_hybrid__0__11__20210409T1916Z,,[pinot-server-1]
trips_hybrid__0__12__20210409T1919Z,,[pinot-server-1]
trips_hybrid__0__13__20210409T1920Z,,[pinot-server-1]
trips_hybrid__0__14__20210409T1920Z,,[pinot-server-1]
trips_hybrid__0__15__20210409T1920Z,,[pinot-server-1]
trips_hybrid__0__16__20210409T1920Z,,[pinot-server-1]
trips_hybrid__0__17__20210409T1920Z,,[pinot-server-1]


### Example Query: Top 5 drivers

In [12]:
# "Top 5" requires an explicit ordering: without ORDER BY, LIMIT 5 returns an arbitrary
# five of the drivers that pass the HAVING filter rather than those with the most trips.
query_sql_dataframe("""
    SELECT driver_name, sum(count) as trips
    FROM trips_hybrid
    GROUP BY driver_name
    HAVING trips > 1
    ORDER BY sum(count) DESC
    LIMIT 5
    """)

Unnamed: 0,driver_name,trips
0,Michael Turner,2.0
1,Jackie Nunez,2.0
2,Charles Cruz,2.0
3,Jason Cole,2.0
4,William Collins,2.0
