# Batch ingestion in Pinot

Segments are transferred as tar archives and can be downloaded from the controller.

In [13]:
# all imports
import requests
import json
import io
import re
import pandas as pd

In [19]:
# some helpers
def query_sql(query):
    """Run a SQL query against the Pinot broker and return the decoded JSON reply.

    Args:
        query: SQL statement; sent as the ``sql`` field of the JSON request body.

    Returns:
        The broker's JSON response decoded into Python objects.

    Raises:
        requests.HTTPError: if the broker answers with a 4xx/5xx status code.
    """
    response = requests.post('http://pinot-broker.pinot:8099/query/sql', json={
        "sql" : query
    })
    # Fail loudly on an HTTP error instead of handing an error payload
    # (or a JSON decoding failure) to the caller as if it were a result.
    response.raise_for_status()
    return response.json()

def query_sql_dataframe(query):
    """Run a SQL query via ``query_sql`` and return its result table as a DataFrame.

    Args:
        query: SQL statement, passed through unchanged to ``query_sql``.

    Returns:
        A ``pandas.DataFrame`` whose columns come from
        ``resultTable.dataSchema.columnNames`` and whose rows come from
        ``resultTable.rows`` in the response.

    Raises:
        RuntimeError: if the response contains no ``resultTable``
            (for example because the query failed).
    """
    result = query_sql(query)
    table = result.get('resultTable')
    if table is None:
        # Without this guard a failed query surfaces as an opaque KeyError;
        # report whatever error detail the response carries instead.
        raise RuntimeError(
            f"Pinot query returned no resultTable: {result.get('exceptions', result)}"
        )
    return pd.DataFrame(columns=table['dataSchema']['columnNames'], data=table['rows'])

In [15]:
# Look up the 'trips' schema on the controller; the decoded JSON is the cell's output.
schema_response = requests.get('http://pinot-controller.pinot:9000/schemas/trips')
schema_response.json()

{'schemaName': 'trips',
 'dimensionFieldSpecs': [{'name': 'rider_name',
   'dataType': 'STRING',
   'defaultNullValue': ''},
  {'name': 'driver_name', 'dataType': 'STRING', 'defaultNullValue': ''},
  {'name': 'license_plate', 'dataType': 'STRING', 'defaultNullValue': ''},
  {'name': 'start_location', 'dataType': 'STRING', 'defaultNullValue': ''},
  {'name': 'start_zip_code', 'dataType': 'STRING', 'defaultNullValue': ''},
  {'name': 'end_location', 'dataType': 'STRING', 'defaultNullValue': ''},
  {'name': 'end_zip_code', 'dataType': 'STRING', 'defaultNullValue': ''},
  {'name': 'rider_is_premium', 'dataType': 'INT', 'defaultNullValue': 0}],
 'metricFieldSpecs': [{'name': 'count',
   'dataType': 'LONG',
   'defaultNullValue': 1},
  {'name': 'payment_amount', 'dataType': 'FLOAT'},
  {'name': 'payment_tip_amount', 'dataType': 'FLOAT'},
  {'name': 'trip_wait_time_millis', 'dataType': 'LONG'},
  {'name': 'rider_rating', 'dataType': 'INT'},
  {'name': 'driver_rating', 'dataType': 'INT'}],
 'd

In [16]:
# Columns listed under "invertedIndexColumns" in the table's index config.
inverted_index_columns = ["rider_name", "driver_name", "start_location", "end_location"]

# Segment settings: time column, retention, schema name and replication.
segments_config = {
    "timeColumnName": "trip_start_time_millis",
    "timeType": "MILLISECONDS",
    "retentionTimeUnit": "DAYS",
    "retentionTimeValue": "60",
    "schemaName": "trips",
    "replication": "1",
}

# Definition of the OFFLINE table 'trips_batch'.
table_config = {
    "tableName": "trips_batch",
    "tableType": "OFFLINE",
    "segmentsConfig": segments_config,
    "tenants": {},
    "tableIndexConfig": {
        "loadMode": "MMAP",
        "invertedIndexColumns": inverted_index_columns,
    },
    "metadata": {"customConfigs": {}},
}

# Submit the table config to the controller and print its JSON reply.
create_table_response = requests.post(
    'http://pinot-controller.pinot:9000/tables',
    json=table_config,
)
print(create_table_response.json())

{'code': 409, 'error': 'Table trips_batch_OFFLINE already exists'}


In [22]:
# Register a segment for 'trips_batch' by URI: the headers carry the upload
# type and the location of the segment archive.
segment_upload_headers = {
    'UPLOAD_TYPE': 'URI',
    'DOWNLOAD_URI': 'https://adm-pinot.s3.eu-central-1.amazonaws.com/trips_batch.tar.gz',
}
response = requests.post(
    'http://pinot-controller.pinot:9000/v2/segments?table=trips_batch',
    headers=segment_upload_headers,
)
# Show the HTTP status first, then the decoded JSON body.
print(response)
print(response.json())

<Response [200]>
{'status': 'Successfully uploaded segment: trips_batch_1__0__0__20210330T2132Z of table: trips_batch'}


### Example Query: Top 5 drivers

In [23]:
# "Top 5" needs an explicit ordering: without ORDER BY, LIMIT 5 returns an
# arbitrary five of the qualifying drivers rather than those with the most trips.
query_sql_dataframe("""
    SELECT driver_name, sum(count) as trips
    FROM trips_batch
    GROUP BY driver_name
    HAVING trips > 1
    ORDER BY sum(count) DESC
    LIMIT 5
    """)

Unnamed: 0,driver_name,trips
0,Jonathan Johnson,2.0
1,Lorraine Johnson,2.0
2,James Smith,2.0
3,Mildred Johnson,2.0
4,James Davis,2.0
