In [27]:
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.flight as flight
import numpy as np
import pandas as pd
import time
import threading

# Implement a Flight server in Python

This server has a few goals

* Clients can send ("put") datasets, to be kept in memory by the server
* Clients can request a list of cached datasets ("list-tables")
* Clients can request ("get") a cached table

Note that this server is very simple and does not show some of the more sophisticated "query planning" capabilities of Arrow Flight, nor does it show parallel or multi-part access. My goal is to show you that

* It's easy to write a Flight service in Python
* The performance of Flight is **very, very good**

In [28]:
flight.FlightServerBase

pyarrow._flight.FlightServerBase

In [29]:
class DemoServer(flight.FlightServerBase):
    
    def __init__(self, location):
        self._cache = {}
        super().__init__(location)
    
    def list_actions(self, context):
        return [flight.ActionType('list-tables', 'List stored tables'),
                flight.ActionType('drop-table', 'Drop a stored table')]

    # -----------------------------------------------------------------
    # Implement actions
    
    def do_action(self, context, action):
        handlers = {
            'list-tables': self._list_tables,
            'drop-table': self._drop_table
        }        
        handler = handlers.get(action.type)
        if not handler:
            raise NotImplementedError   
        return handlers[action.type](action)
        
    def _drop_table(self, action):
        del self._cache[action.body]
        
    def _list_tables(self, action):
        return iter([flight.Result(cache_key) 
                     for cache_key in sorted(self._cache.keys())])

    # -----------------------------------------------------------------
    # Implement puts
    
    def do_put(self, context, descriptor, reader, writer):
        self._cache[descriptor.command] = reader.read_all()
        
    # -----------------------------------------------------------------
    # Implement gets

    def do_get(self, context, ticket):
        table = self._cache[ticket.ticket]
        return flight.RecordBatchStream(table)

Some helper utilities, you can ignore this part

In [None]:
import contextlib
import socket
def find_free_port():
    # Find a free port
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    with contextlib.closing(sock) as sock:
        sock.bind(('', 0))
        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        port = sock.getsockname()[1]
    return port

## Start server in background, connect client

In [30]:
port = 1337
location = flight.Location.for_grpc_tcp("localhost", find_free_port())
location

server = DemoServer(location)

thread = threading.Thread(target=lambda: server.serve(), daemon=True)
thread.start()

client = flight.connect(location)
client.wait_for_available()

### Ask server for supported actions

In [31]:
client.list_actions()

[ActionType(type='list-tables', description='List stored tables'),
 ActionType(type='drop-table', description='Drop a stored table')]

### Implement convenience functions for invoking server's RPC methods

In [32]:
# Call "list-tables" RPC and return results as Python list
def list_tables(client):
    action = flight.Action('list-tables', b'')
    return [x.body.to_pybytes().decode('utf8') for x in client.do_action(action)]    

# Send a pyarrow.Table to the server to be cached
def cache_table_in_server(name, table):
    desc = flight.FlightDescriptor.for_command(name.encode('utf8'))
    put_writer, put_meta_reader = client.do_put(desc, table.schema)
    put_writer.write(table)
    put_writer.close()
    
# Request a pyarrow.Table by name
def get_table(name):
    reader = client.do_get(flight.Ticket(name.encode('utf8')))
    return reader.read_all()

list_tables(client)

[]

In [33]:
table = pa.table([pa.array([1,2,3,4,5])], names=['f0'])
cache_table_in_server('table1', table)

In [34]:
list_tables(client)

['table1']

In [35]:
cache_table_in_server('table2', table)
cache_table_in_server('table3', table)
cache_table_in_server('table4', table)

In [36]:
list_tables(client)

['table1', 'table2', 'table3', 'table4']

In [37]:
get_table('table1')

pyarrow.Table
f0: int64

### Now let's make a much bigger table and test performance

In [None]:
# fec = pd.read_csv('/home/wesm/code/pydata-book/datasets/fec/P00000001-ALL.csv',
#                 low_memory=False)
# table = pa.table(fec)
# pq.write_table(table, 'fec-2012.parquet')

In [38]:
fec_table = pq.read_table('fec-2012.parquet')

In [39]:
fec_table = pa.concat_tables([fec_table] * 10)

In [40]:
# How big is it?
out = pa.BufferOutputStream()
with pa.ipc.RecordBatchStreamWriter(out, fec_table.schema) as writer:
    writer.write(fec_table)
len(out.getvalue())

1821475008

In [41]:
print(f'Table is {1780273284 / (1 << 30)} gigabytes')

Table is 1.658008698374033 gigabytes


In [42]:
%%time
cache_table_in_server('fec_table', fec_table)

CPU times: user 217 ms, sys: 973 ms, total: 1.19 s
Wall time: 875 ms


In [None]:
list_tables(client)

In [43]:
%%time 

fec_table_received = get_table('fec_table')

CPU times: user 234 ms, sys: 1.01 s, total: 1.25 s
Wall time: 913 ms


### ~1.5 gigabytes/sec end-to-end over TCP, not bad