## Segmentation in Pinot

Table contents in Pinot are expected to grow without bound and thus need to be distributed across multiple nodes. The dataset is split into segments, which are comparable to shards/partitions in a classical RDBMS. Segmentation is time-based, meaning that the rows in a given segment are close to each other in time.
Segments store all columns of a table and organize the data in a columnar layout, which enables high encoding efficiency and optional pre-aggregation of metrics. In addition to values, segments store indices and other lookup-related data structures such as dictionaries. By default, values are stored using dictionary encoding, meaning that each value is represented as a dictionary ID that references the corresponding dictionary entry. This way, values can be stored with the minimum number of bits required, which depends on the cardinality of the column.

In [None]:
# all imports
import requests
import json
import io
import re
import pandas as pd
from kafka import KafkaConsumer

In [None]:
# some helpers
def server_name_from_instance(instance):
    """Return the ``pinot-server-<N>`` substring of an instance identifier.

    Raises AttributeError (``.group()`` on ``None``) when the identifier
    contains no such substring.
    """
    found = re.search('pinot-server-[0-9]+', instance)
    return found.group()

def query_sql(query):
    """Send ``query`` to the broker's SQL endpoint and return the decoded JSON reply.

    The query text is echoed to stdout before the request is made, and the
    request is sent with ``"trace": "true"``.
    """
    print("query: " + query)
    request_params = {"sql": query, "trace": "true"}
    response = requests.get('http://pinot-broker.pinot:8099/query/sql', params=request_params)
    return response.json()

def query_result_to_dataframe(result):
    """Build a DataFrame from the ``resultTable`` section of a query response.

    Column labels come from ``resultTable.dataSchema.columnNames`` and the
    rows from ``resultTable.rows``.
    """
    result_table = result['resultTable']
    column_names = result_table['dataSchema']['columnNames']
    rows = result_table['rows']
    return pd.DataFrame(data=rows, columns=column_names)

def extract_query_statistics_from_result(result):
    """Pick the execution-statistics entries out of a query response.

    Returns a new dict holding only the statistics keys listed below, in that
    order. Raises KeyError if the response lacks any of them.
    """
    wanted_fields = (
        "numServersQueried",
        "numServersResponded",
        "numSegmentsQueried",
        "numSegmentsProcessed",
        "numSegmentsMatched",
        "numConsumingSegmentsQueried",
        "numDocsScanned",
        "numEntriesScannedInFilter",
        "numEntriesScannedPostFilter",
        "numGroupsLimitReached",
        "totalDocs",
        "timeUsedMs",
    )
    statistics = {}
    for field in wanted_fields:
        statistics[field] = result[field]
    return statistics

def extract_query_statistics_from_result_dataframe(result):
    """Return the query statistics of ``result`` as a one-column DataFrame.

    The frame has a single ``value`` column, indexed by statistic name.
    """
    statistics = extract_query_statistics_from_result(result)
    return pd.DataFrame({"value": statistics})

# def extract_segment_trace_from_result(result):
#     trace_data = []
#     for server, server_trace_json in result["traceInfo"].items():
#         server_name = server_name_from_instance(server)
#         server_trace = json.loads(server_trace_json)
#         for trace_dict in server_trace:
#             for segment, segment_trace in trace_dict.items():
#                 if re.match('[0-9]+_[0-9]+', segment) != None:
#                     trace_data.append({"segment":segment, "server": server_name})
    
#     print(trace_data)
#     trace_data.sort(key=lambda L: (int(re.search('[0-9]+_([0-9]+)', L["segment"]).group(1)), L))
#     return pd.DataFrame(trace_data)

# Matches a "__<digits>__<digits>__" run and captures the second number,
# which the helpers in this notebook use as the segment's ordinal.
ordinal_pattern = re.compile(r'__[0-9]+__([0-9]+)__')
def sort_by_ascending_ordinal(segments):
    """Sort a list of segment names in place by ascending ordinal.

    Names with the same ordinal are ordered alphabetically. Returns None.
    Raises AttributeError for a name that ``ordinal_pattern`` does not match.
    """
    def ordinal_then_name(segment_name):
        ordinal = int(ordinal_pattern.search(segment_name).group(1))
        return ordinal, segment_name

    segments.sort(key=ordinal_then_name)

def segment_metadata_for_table(table):
    """Collect the controller's metadata for every segment of ``table``.

    The segment listing is fetched first; then one metadata request is made
    per listed segment. The result maps ``"<segment>_<table type>"`` to the
    decoded JSON metadata of that segment.
    """
    listing = requests.get(f'http://pinot-controller.pinot:9000/segments/{table}').json()

    def listed_segments():
        # Flatten the listing into (table type, segment name) pairs.
        for entry in listing:
            for table_type, type_segments in entry.items():
                for segment in type_segments:
                    yield table_type, segment

    metadata_by_segment = {}
    for table_type, segment in listed_segments():
        metadata = requests.get(f'http://pinot-controller.pinot:9000/segments/{table}/{segment}/metadata').json()
        metadata_by_segment[f"{segment}_{table_type}"] = metadata

    return metadata_by_segment

def segment_metadata_of_nth_segment(segment_metadata, n, table_type="REALTIME"):
    """Return the metadata of the n-th segment of the given table type.

    ``segment_metadata`` is a dict whose keys end in ``"_<table type>"``.
    Keys with the requested suffix are ordered with
    ``sort_by_ascending_ordinal`` and the entry at index ``n`` is returned.
    Raises IndexError when fewer than ``n + 1`` such segments exist.
    """
    suffix = "_" + table_type
    matching_names = [name for name in segment_metadata if name.endswith(suffix)]
    sort_by_ascending_ordinal(matching_names)
    nth_name = matching_names[n]
    return segment_metadata[nth_name]


def start_time_of_nth_segment(segment_metadata, n, table_type="REALTIME"):
    """Return the ``segment.start.time`` metadata value of the n-th segment."""
    nth_metadata = segment_metadata_of_nth_segment(segment_metadata, n, table_type)
    return nth_metadata["segment.start.time"]

In [None]:
# consumer = KafkaConsumer(group_id='test', bootstrap_servers=['pinot-kafka.pinot:9092'])
# consumer.topics()

In [None]:
# requests.get('http://pinot-controller.pinot:9000/schemas/trips').json()

In [None]:
# Template for the two REALTIME tables created below. "tableName" is left
# empty here and filled in right before each POST to the controller.
table_config = {
  "tableName": "",
  "tableType": "REALTIME",
  "segmentsConfig": {
    "timeColumnName": "trip_start_time_millis",
    "timeType": "MILLISECONDS",
    "retentionTimeUnit": "DAYS",
    "retentionTimeValue": "60",
    "schemaName": "trips",
    # Both replication values are raised to "3" for the second table.
    "replication": "1",
    "replicasPerPartition": "1"
  },
  "tenants": {},
  "tableIndexConfig": {
    "loadMode": "MMAP",
    "invertedIndexColumns": [
        "rider_name",
        "driver_name",
        "start_location",
        "end_location"
    ],
    # Both tables read from the same Kafka topic ("trips").
    "streamConfigs": {
      "streamType": "kafka",
      "stream.kafka.consumer.type": "simple",
      "stream.kafka.topic.name": "trips",
      "stream.kafka.decoder.class.name": "org.apache.pinot.plugin.stream.kafka.KafkaJSONMessageDecoder",
      "stream.kafka.consumer.factory.class.name": "org.apache.pinot.plugin.stream.kafka20.KafkaConsumerFactory",
      "stream.kafka.zk.broker.url": "pinot-kafka-zookeeper:2181",
      "stream.kafka.broker.list": "pinot-kafka:9092",
      "realtime.segment.flush.threshold.time": "12h",
      # The size threshold is lowered to "50000" for the second table.
      "realtime.segment.flush.threshold.size": "80000",
      "stream.kafka.consumer.prop.auto.offset.reset": "smallest"
    }
  },
  "metadata": {
    "customConfigs": {}
  }
}

# First table: the template as-is (replication 1, flush threshold size 80000).
table_config["tableName"] = "trips_segmentation_1"
print(requests.post('http://pinot-controller.pinot:9000/tables', json=table_config).json())

# Second table: the same dict is mutated in place, so every other setting is
# shared; only the name, the replication (3) and the flush threshold size
# (50000) differ. After this cell, `table_config` holds the second table's
# settings.
table_config["tableName"] = "trips_segmentation_2"
table_config["segmentsConfig"]["replication"] = "3"
table_config["segmentsConfig"]["replicasPerPartition"] = "3"
table_config["tableIndexConfig"]["streamConfigs"]["realtime.segment.flush.threshold.size"] = "50000"
print(requests.post('http://pinot-controller.pinot:9000/tables', json=table_config).json())

In [None]:
# response = requests.get('http://pinot-controller.pinot:9000/segments/trips_segmentation_1').json()
# segments_1 = response[0]['REALTIME']
# sort_by_ascending_ordinal(segments_1)
# pd.DataFrame(segments_1, columns=['trips_segmentation_1'])

In [None]:
# response = requests.get('http://pinot-controller.pinot:9000/segments/trips_segmentation_2').json()
# segments_2 = response[0]['REALTIME']
# sort_by_ascending_ordinal(segments_2)
# pd.DataFrame(segments_2, columns=['trips_segmentation_2'])

In [None]:
# Fetch the metadata of every segment of the first table and keep it for the
# queries below; the DataFrame (one column per segment) is the cell's output.
segment_metadata_1 = segment_metadata_for_table("trips_segmentation_1")
pd.DataFrame(segment_metadata_1)

In [None]:
# Fetch the metadata of every segment of the second table and keep it for the
# queries below; the DataFrame (one column per segment) is the cell's output.
segment_metadata_2 = segment_metadata_for_table("trips_segmentation_2")
pd.DataFrame(segment_metadata_2)

In [None]:
# Top 5 drivers by trip count in trips_segmentation_1, restricted to rows whose
# trip_start_time_millis is strictly below the "segment.start.time" of the
# REALTIME segment at sorted index 1 (the second segment in ordinal order).
# NOTE(review): the original comment said "get data from first 2 segments";
# with a strict `<` on the start time of index 1 this presumably only reaches
# data older than the second segment -- confirm which was intended.
query_for_trips_segmentation_1 = f"select driver_name, sum(count) as trips_count from trips_segmentation_1 where trip_start_time_millis < {start_time_of_nth_segment(segment_metadata_1, 1)} group by driver_name order by trips_count desc limit 5"

# `query_result` is reused by the statistics cell that follows.
query_result = query_sql(query_for_trips_segmentation_1)
query_result_to_dataframe(query_result)

In [None]:
# Show the execution statistics of the most recent query_sql() result.
extract_query_statistics_from_result_dataframe(query_result)

In [None]:
# Top 5 drivers by trip count in trips_segmentation_2, restricted to rows whose
# trip_start_time_millis is strictly below the "segment.start.time" of the
# REALTIME segment at sorted index 2 (the third segment in ordinal order).
# NOTE(review): the original comment said "get data from first 3 segments";
# with a strict `<` on the start time of index 2 this presumably only reaches
# data older than the third segment -- confirm which was intended.
query_for_trips_segmentation_2 = f"select driver_name, sum(count) as trips_count from trips_segmentation_2 where trip_start_time_millis < {start_time_of_nth_segment(segment_metadata_2, 2)} group by driver_name order by trips_count desc limit 5"

# `query_result` is overwritten here and reused by the statistics cell that follows.
query_result = query_sql(query_for_trips_segmentation_2)
query_result_to_dataframe(query_result)

In [None]:
# Show the execution statistics of the most recent query_sql() result.
extract_query_statistics_from_result_dataframe(query_result)

## Query Routing / Processing

Brokers are responsible for maintaining routing tables, which map the segments of a table to the servers that host them. This allows brokers to efficiently scatter incoming queries across servers.

In [None]:
# some helpers
def routing_table_for_query(query):
    """Ask the broker's debug endpoint for the routing table of ``query``.

    The query text is echoed to stdout first. Returns the decoded JSON reply.
    """
    print("query: " + query)
    endpoint = 'http://pinot-broker.pinot:8099/debug/routingTable/sql'
    reply = requests.get(endpoint, params={"query": query})
    return reply.json()

def routing_table_for_table(table):
    """Ask the broker's debug endpoint for the routing table of ``table``.

    Returns the decoded JSON reply.
    """
    endpoint = f'http://pinot-broker.pinot:8099/debug/routingTable/{table}'
    reply = requests.get(endpoint)
    return reply.json()

def external_view_for_table(table):
    """Fetch the external view of ``table`` from the controller.

    Returns the decoded JSON reply.
    """
    endpoint = f'http://pinot-controller.pinot:9000/tables/{table}/externalview'
    reply = requests.get(endpoint)
    return reply.json()

def routing_table_for_query_dataframe(query):
    """Return the broker's routing table for ``query`` as a DataFrame.

    Each row holds a ``segment`` name and the short name of the ``server``
    (as produced by ``server_name_from_instance``) it is routed to. Rows are
    ordered by ascending segment ordinal, ties broken by segment name.

    Raises AttributeError if a server or segment name does not match the
    expected naming patterns.
    """
    rt = routing_table_for_query(query)

    # Keyed by segment so each segment yields exactly one row; if a segment is
    # listed under several servers, the last one seen wins.
    server_by_segment = {}
    for server, server_segments in rt.items():
        server_name = server_name_from_instance(server)
        for segment in server_segments:
            server_by_segment[segment] = server_name

    # Sort the segment *names* rather than the row dicts: the previous sort key
    # used the row dict as tie-breaker, and dicts do not support "<", so two
    # segments sharing an ordinal (e.g. from different partitions) raised
    # TypeError.
    ordered_segments = list(server_by_segment)
    sort_by_ascending_ordinal(ordered_segments)

    return pd.DataFrame(
        [{"segment": segment, "server": server_by_segment[segment]} for segment in ordered_segments]
    )

def routing_table_for_table_dataframe(table):
    """Return the broker's routing table for ``table`` as a DataFrame.

    Each row holds a ``segment`` name plus one column per table type
    (``REALTIME`` / ``OFFLINE``) found in the routing table, containing the
    short name of the server the segment is routed to. Rows are ordered by
    ascending segment ordinal, ties broken by segment name.

    Raises AttributeError if a routing-table key contains neither
    ``REALTIME`` nor ``OFFLINE``, or if a server or segment name does not
    match the expected naming patterns.
    """
    rt = routing_table_for_table(table)

    # segment name -> {table type: server name}
    servers_by_segment = {}
    for table_name_type, table_rt in rt.items():
        table_type = re.search('REALTIME|OFFLINE', table_name_type).group()
        for server, server_segments in table_rt.items():
            server_name = server_name_from_instance(server)
            for segment in server_segments:
                servers_by_segment.setdefault(segment, {})[table_type] = server_name

    # Sort the segment *names* rather than the row dicts: the previous sort key
    # used the row dict as tie-breaker, and dicts do not support "<", so two
    # segments sharing an ordinal (e.g. from different partitions) raised
    # TypeError.
    ordered_segments = list(servers_by_segment)
    sort_by_ascending_ordinal(ordered_segments)

    return pd.DataFrame(
        [{"segment": segment, **servers_by_segment[segment]} for segment in ordered_segments]
    )

def external_view_for_table_dataframe(table):
    """Return the external view of ``table`` as a DataFrame.

    The frame has one row per segment and one column per table type present
    in the external view; each cell holds the list of short server names
    hosting that segment. The per-server segment state reported by the
    controller is not included.

    Raises AttributeError if a server name does not match the expected
    ``pinot-server-<N>`` pattern.
    """
    ev = external_view_for_table(table)

    # segment name -> {table type: [server names]}
    ev_data = {}
    for table_type, ev_per_type in ev.items():
        # Table types with no external view come back as null; skip them.
        if ev_per_type is None:
            continue

        for segment, segment_servers in ev_per_type.items():
            # Register the segment even when it lists no servers, so it still
            # shows up in the resulting frame.
            servers_by_type = ev_data.setdefault(segment, {})
            for server in segment_servers:
                servers_by_type.setdefault(table_type, []).append(server_name_from_instance(server))

    # DataFrame(dict of dicts) puts segments in columns; transpose to get one
    # row per segment.
    return pd.DataFrame(ev_data).transpose()

In [None]:
# Show which servers host each segment of the first table (replication "1").
external_view_for_table_dataframe("trips_segmentation_1")

In [None]:
# Show which servers host each segment of the second table (replication "3").
external_view_for_table_dataframe("trips_segmentation_2")

In [None]:
# Routing table for the first query. The table name inside the query text is
# rewritten to its "_REALTIME"-suffixed form before it is sent.
# NOTE(review): presumably the debug endpoint needs the type-suffixed name -- confirm.
routing_table_for_query_dataframe(query_for_trips_segmentation_1.replace("trips_segmentation_1", "trips_segmentation_1_REALTIME"))

In [None]:
# routing_table_for_table_dataframe("trips_segmentation_1")

In [None]:
# Routing table for the second query. The table name inside the query text is
# rewritten to its "_REALTIME"-suffixed form before it is sent.
# NOTE(review): presumably the debug endpoint needs the type-suffixed name -- confirm.
routing_table_for_query_dataframe(query_for_trips_segmentation_2.replace("trips_segmentation_2", "trips_segmentation_2_REALTIME"))

In [None]:
# routing_table_for_table_dataframe("trips_segmentation_2")