In [1]:
from pyspark.sql import SparkSession

def _spark_context():
    '''
    Return the SparkSession used by this notebook.

    The session is configured for the ``local`` master under the application
    name ``syllabus`` and obtained through ``getOrCreate``.
    '''
    builder = SparkSession.builder
    builder = builder.master('local')
    builder = builder.appName('syllabus')
    session = builder.getOrCreate()
    return session

# Module-level session handle; DFLoader.from_file reads it through this name.
# The bare expression below makes the notebook render the session object.
SPARK = _spark_context()
SPARK

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/06/30 04:06:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
import json

from pyspark.sql import DataFrame
from pyspark.sql import functions as F
from pyspark.sql import types as T

from pygments import highlight
from pygments.lexers import JsonLexer
from pygments.formatters import TerminalTrueColorFormatter

from IPython.display import JSON
from IPython.core.display_functions import display

def ppd(d, indent=2):
    '''
    Print ``d`` as syntax-highlighted JSON.

    ``d`` is serialised with ``json.dumps`` using ``indent`` (``None`` gives a
    single line), coloured with the JSON lexer and the true-colour terminal
    formatter in the ``material`` style, and printed with surrounding
    whitespace stripped.
    '''
    serialized = json.dumps(d, indent=indent)
    colored = highlight(
        code=serialized,
        lexer=JsonLexer(),
        formatter=TerminalTrueColorFormatter(style='material'),
    )
    print(colored.strip())

def ppj(j, indent=2):
    '''
    Pretty-print a JSON document supplied as a string.

    The string is parsed with ``json.loads`` and the result is handed to
    ``ppd`` with the same ``indent``.
    '''
    parsed = json.loads(j)
    ppd(parsed, indent=indent)


def count_nulls(df: DataFrame) -> int:
    '''
    Return the total number of null cells across all columns of ``df``.

    For each column a ``count`` over a ``when(isNull)`` expression yields that
    column's null count; the per-column counts are added together and the
    single resulting value is collected.

    A DataFrame with no columns has no cells, so 0 is returned directly.
    '''
    if not df.columns:
        # sum([]) is the plain int 0, which is not a column expression that
        # select() can evaluate; short-circuit instead.
        return 0
    per_column = [F.count(F.when(F.col(c).isNull(), c)) for c in df.columns]
    return df.select(sum(per_column)).collect()[0][0]

def count_cells(df: DataFrame) -> int:
    '''
    Return the number of cells in ``df``: row count times column count.
    '''
    n_rows = df.count()
    n_columns = len(df.columns)
    return n_rows * n_columns

class DFLoader:
    '''Helpers for loading in-memory records into a Spark DataFrame.'''

    @staticmethod
    def from_file(records: list, fpath: str = 'f.ndjson', schema: dict = None) -> DataFrame:
        '''
        Write ``records`` to a newline-delimited JSON file and read it back with Spark.

        Parameters:
            records: iterable of JSON-serialisable objects, one per output line.
                It is only iterated once, so a generator works as well as a list.
            fpath: path of the file to (over)write; defaults to ``f.ndjson``
                relative to the current working directory.
            schema: optional schema description in the form accepted by
                ``StructType.fromJson``. When falsy (``None`` or empty), Spark
                infers the schema from the file.

        Returns:
            The loaded DataFrame.

        Side effects:
            Besides writing ``fpath``, displays the frame (via pandas), prints
            the cell and null counts, and pretty-prints the resulting schema.
        '''
        with open(fpath, 'w') as ostream:
            for record in records:
                print(json.dumps(record), file=ostream)

        reader = SPARK.read
        if schema:
            df = reader.json(fpath, schema=T.StructType.fromJson(schema))
        else:
            df = reader.json(fpath)

        display(df.toPandas())
        print('cells', count_cells(df), '/', 'nulls', count_nulls(df))
        ppj(df.schema.json())
        return df

In [8]:
# Column "a" mixes strings with an integer, and "c"/"d" each appear in only one
# record, so the schema has to be inferred from inconsistent rows.
records = [
    { "a": "a", "c": "d" },
    { "a": "b" },
    { "a": "c" },
    { "a": 1, "d": "z" }
]
df = DFLoader.from_file(records)
# from_file already prints the null total; the bare call makes it the cell output.
count_nulls(df)

Unnamed: 0,a,c,d
0,a,d,
1,b,,
2,c,,
3,1,,z


cells 12 / nulls 6
[38;2;137;221;255m{[39m
[38;2;238;255;255m  [39m[38;2;255;83;112m"fields"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;137;221;255m[[39m
[38;2;238;255;255m    [39m[38;2;137;221;255m{[39m
[38;2;238;255;255m      [39m[38;2;255;83;112m"metadata"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;137;221;255m{},[39m
[38;2;238;255;255m      [39m[38;2;255;83;112m"name"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141m"a"[39m[38;2;137;221;255m,[39m
[38;2;238;255;255m      [39m[38;2;255;83;112m"nullable"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;137;221;255mtrue[39m[38;2;137;221;255m,[39m
[38;2;238;255;255m      [39m[38;2;255;83;112m"type"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141m"string"[39m
[38;2;238;255;255m    [39m[38;2;137;221;255m},[39m
[38;2;238;255;255m    [39m[38;2;137;221;255m{[39m
[38;2;238;255;255m      [39m[38;2;255;83;112

6

In [16]:
# Column "a" holds an integer in the first record and strings (including the
# numeric string "123") in the others.
records = [
    { "a": 1, "c": "d" },
    { "a": "b" },
    { "a": "c" },
    { "a": "123", "d": "z" }
]
df = DFLoader.from_file(records)

Unnamed: 0,a,c,d
0,1,d,
1,b,,
2,c,,
3,123,,z


cells 12 / nulls 6
[38;2;137;221;255m{[39m
[38;2;238;255;255m  [39m[38;2;255;83;112m"fields"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;137;221;255m[[39m
[38;2;238;255;255m    [39m[38;2;137;221;255m{[39m
[38;2;238;255;255m      [39m[38;2;255;83;112m"metadata"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;137;221;255m{},[39m
[38;2;238;255;255m      [39m[38;2;255;83;112m"name"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141m"a"[39m[38;2;137;221;255m,[39m
[38;2;238;255;255m      [39m[38;2;255;83;112m"nullable"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;137;221;255mtrue[39m[38;2;137;221;255m,[39m
[38;2;238;255;255m      [39m[38;2;255;83;112m"type"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141m"string"[39m
[38;2;238;255;255m    [39m[38;2;137;221;255m},[39m
[38;2;238;255;255m    [39m[38;2;137;221;255m{[39m
[38;2;238;255;255m      [39m[38;2;255;83;112

In [6]:
# Every record now carries an integer "id"; "a" still mixes integers and strings.
records = [
    { "id": 123, "a": 1, "c": "d" },
    { "id": 122, "a": "b" },
    { "id": 111, "a": "c" },
    { "id": 234, "a": 1, "d": "z" }
]
df = DFLoader.from_file(records)

Unnamed: 0,a,c,d,id
0,1,d,,123
1,b,,,122
2,c,,,111
3,1,,z,234


cells 16 / nulls 6
[38;2;137;221;255m{[39m
[38;2;238;255;255m  [39m[38;2;255;83;112m"fields"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;137;221;255m[[39m
[38;2;238;255;255m    [39m[38;2;137;221;255m{[39m
[38;2;238;255;255m      [39m[38;2;255;83;112m"metadata"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;137;221;255m{},[39m
[38;2;238;255;255m      [39m[38;2;255;83;112m"name"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141m"a"[39m[38;2;137;221;255m,[39m
[38;2;238;255;255m      [39m[38;2;255;83;112m"nullable"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;137;221;255mtrue[39m[38;2;137;221;255m,[39m
[38;2;238;255;255m      [39m[38;2;255;83;112m"type"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141m"string"[39m
[38;2;238;255;255m    [39m[38;2;137;221;255m},[39m
[38;2;238;255;255m    [39m[38;2;137;221;255m{[39m
[38;2;238;255;255m      [39m[38;2;255;83;112

In [19]:
# Wide layout: one record per id with the attributes as top-level keys.
# NOTE: this binding is overwritten by the assignment below before it is used;
# it is kept only for side-by-side comparison with the long layout.
records = [
    { "id": 123, "a": 1, "c": "d" },
    { "id": 122, "a": "b" },
    { "id": 111, "a": "c" },
    { "id": 234, "a": 1, "d": "z" }
]

# Long layout: the same data as one record per (id, key, value) triple.
# This is the list that actually gets loaded.
records = [
    { "id": 123, "key": "a", "value": 1},
    { "id": 123, "key": "c", "value": "d" },

    { "id": 122, "key": "a", "value": "b" },
    { "id": 111, "key": "a", "value": "c" },

    { "id": 234, "key": "a", "value": 1 },
    { "id": 234, "key": "d", "value": "z" },
]
df = DFLoader.from_file(records)

Unnamed: 0,id,key,value
0,123,a,1
1,123,c,d
2,122,a,b
3,111,a,c
4,234,a,1
5,234,d,z


cells 18 / nulls 0
[38;2;137;221;255m{[39m
[38;2;238;255;255m  [39m[38;2;255;83;112m"fields"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;137;221;255m[[39m
[38;2;238;255;255m    [39m[38;2;137;221;255m{[39m
[38;2;238;255;255m      [39m[38;2;255;83;112m"metadata"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;137;221;255m{},[39m
[38;2;238;255;255m      [39m[38;2;255;83;112m"name"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141m"id"[39m[38;2;137;221;255m,[39m
[38;2;238;255;255m      [39m[38;2;255;83;112m"nullable"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;137;221;255mtrue[39m[38;2;137;221;255m,[39m
[38;2;238;255;255m      [39m[38;2;255;83;112m"type"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141m"long"[39m
[38;2;238;255;255m    [39m[38;2;137;221;255m},[39m
[38;2;238;255;255m    [39m[38;2;137;221;255m{[39m
[38;2;238;255;255m      [39m[38;2;255;83;112m

In [7]:
# Sample records: "id" and "a" are always present, "c" and "d" only sometimes.
records = [
    { "id": 123, "a": "1", "c": "d" },
    { "id": 122, "a": "b" },
    { "id": 111, "a": "c" },
    { "id": 234, "a": "z", "d": "z" }
]

# JSON Schema declaring only "id" and "a"; additionalProperties is False, so
# any other key in a record (here "c" and "d") is a validation error.
json_schema = {
    "type" : "object",
    "properties" : {
        "id" : {"type" : "integer"},
        "a": {"type": "string"},
    },
    "required": ["id"],
    "additionalProperties": False
}

print("--- records")
ppd(records)
print("--- schema")
ppd(json_schema)

--- records
[38;2;137;221;255m[[39m
[38;2;238;255;255m  [39m[38;2;137;221;255m{[39m
[38;2;238;255;255m    [39m[38;2;255;83;112m"id"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;247;140;108m123[39m[38;2;137;221;255m,[39m
[38;2;238;255;255m    [39m[38;2;255;83;112m"a"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141m"1"[39m[38;2;137;221;255m,[39m
[38;2;238;255;255m    [39m[38;2;255;83;112m"c"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141m"d"[39m
[38;2;238;255;255m  [39m[38;2;137;221;255m},[39m
[38;2;238;255;255m  [39m[38;2;137;221;255m{[39m
[38;2;238;255;255m    [39m[38;2;255;83;112m"id"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;247;140;108m122[39m[38;2;137;221;255m,[39m
[38;2;238;255;255m    [39m[38;2;255;83;112m"a"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141m"b"[39m
[38;2;238;255;255m  [39m[38;2;137;221;255m},[39m
[38;2;238;255;

In [8]:
from typing import Dict, List
from jsonschema.exceptions import ValidationError
from jsonschema.validators import Draft3Validator
from jsonschema import validate

def translate(df: list, json_schema: dict) -> dict:
    '''
    Yield a ``(record, error)`` pair for every schema violation found in ``df``.

    This is a generator: each record is validated against ``json_schema`` with
    a Draft 3 validator, and its errors are emitted ordered by their string
    form. Records without errors produce nothing.
    '''
    validator = Draft3Validator(json_schema)
    for item in df:
        problems = list(validator.iter_errors(item))
        problems.sort(key=str)
        for problem in problems:
            yield item, problem

def parse_unexpected_keys(instance: dict, e: "ValidationError") -> List[str]:
    '''
    Return the property names rejected by an ``additionalProperties`` error.

    "Additional properties are not allowed ('a' was unexpected)" -> ['a']
    "Additional properties are not allowed ('a', 'c' were unexpected)" -> ['a', 'c']

    The names are computed from the error's own ``instance`` and ``schema``
    instead of being cut out of the message text. Slicing the message with
    ``lstrip``/``rstrip`` (which strip character sets, not prefixes/suffixes)
    breaks on keys containing ``', '`` or quote characters, and returns junk
    for errors that are not about additional properties.

    Parameters:
        instance: unused; kept so existing callers keep working.
        e: the validation error to inspect.

    Returns:
        The unexpected property names, sorted; ``[]`` when ``e`` was not
        raised by the ``additionalProperties`` keyword.
    '''
    import re  # local import: only needed for patternProperties matching

    if e.validator != 'additionalProperties':
        return []

    declared = e.schema.get('properties', {})
    # Properties matching any patternProperties regex are not "additional".
    patterns = '|'.join(e.schema.get('patternProperties', {}))
    return sorted(
        (
            key for key in e.instance
            if key not in declared
            and not (patterns and re.search(patterns, key))
        ),
        key=str,
    )
            
# For each failing record, show the record, the error message, and the keys
# extracted from that error.
for record, error in translate(records, json_schema):
    # print(error)
    ppd({
        'instance': record,
        'error': error.message,
        'keys': parse_unexpected_keys(record, error)
    })

[38;2;137;221;255m{[39m
[38;2;238;255;255m  [39m[38;2;255;83;112m"instance"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;137;221;255m{[39m
[38;2;238;255;255m    [39m[38;2;255;83;112m"id"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;247;140;108m123[39m[38;2;137;221;255m,[39m
[38;2;238;255;255m    [39m[38;2;255;83;112m"a"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141m"1"[39m[38;2;137;221;255m,[39m
[38;2;238;255;255m    [39m[38;2;255;83;112m"c"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141m"d"[39m
[38;2;238;255;255m  [39m[38;2;137;221;255m},[39m
[38;2;238;255;255m  [39m[38;2;255;83;112m"error"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141m"Additional properties are not allowed ('c' was unexpected)"[39m[38;2;137;221;255m,[39m
[38;2;238;255;255m  [39m[38;2;255;83;112m"keys"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;137;221;255

In [9]:
from dataclasses import dataclass, field

@dataclass
class UnexpectedKeys:
    '''
    Validates instances against ``schema`` and remembers every property name
    that was rejected as an additional (undeclared) property.

    Attributes:
        schema: the JSON Schema to validate against.
        unique_keys: set of unexpected property names collected so far by
            ``check``.

    NOTE(review): validation uses ``Draft3Validator``, while the schemas in
    this notebook use the array form of ``required`` - confirm that this draft
    enforces it the way the notebook expects.
    '''
    schema: dict
    unique_keys: set = field(default_factory=set)

    def __post_init__(self):
        # Built once per instance. Not a dataclass field, so it does not show
        # up in the generated repr.
        self.validator = Draft3Validator(self.schema)

    @property
    def keys(self):
        '''The unexpected keys collected so far, as a sorted list.'''
        return sorted(self.unique_keys)

    @staticmethod
    def parse_unexpected_keys(e: "ValidationError") -> List[str]:
        '''
        Return the property names rejected by an ``additionalProperties`` error.

        "Additional properties are not allowed ('a' was unexpected)" -> ['a']
        "Additional properties are not allowed ('a', 'c' were unexpected)" -> ['a', 'c']

        The names are computed from the error's ``instance`` and ``schema``
        rather than cut out of the message text: ``lstrip``/``rstrip`` strip
        character sets (not prefixes/suffixes), keys may themselves contain
        ``', '`` or quotes, and other kinds of error have unrelated messages.
        Errors not raised by ``additionalProperties`` yield ``[]``.
        '''
        import re  # local import: only needed for patternProperties matching

        if e.validator != 'additionalProperties':
            return []

        declared = e.schema.get('properties', {})
        # Properties matching any patternProperties regex are not "additional".
        patterns = '|'.join(e.schema.get('patternProperties', {}))
        return sorted(
            (
                key for key in e.instance
                if key not in declared
                and not (patterns and re.search(patterns, key))
            ),
            key=str,
        )

    def check(self, instance: dict):
        '''
        Validate ``instance`` and yield each error, ordered by string form.

        This is a generator: ``unique_keys`` is only updated as the errors are
        consumed, so callers that just want the side effect must drain it
        (e.g. ``list(uk.check(record))``).
        '''
        for error in sorted(self.validator.iter_errors(instance), key=str):
            self.unique_keys.update(UnexpectedKeys.parse_unexpected_keys(error))
            yield error

In [10]:
# Run every record through the checker, printing each failing record next to
# its validation error.
k = UnexpectedKeys(json_schema)
for record in records:
    for error in k.check(record):
        print('--- record')
        ppd(record)
        print('--- error', error, sep='\n')
# Bare expression: renders the dataclass repr, including the collected keys.
k

--- record
[38;2;137;221;255m{[39m
[38;2;238;255;255m  [39m[38;2;255;83;112m"id"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;247;140;108m123[39m[38;2;137;221;255m,[39m
[38;2;238;255;255m  [39m[38;2;255;83;112m"a"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141m"1"[39m[38;2;137;221;255m,[39m
[38;2;238;255;255m  [39m[38;2;255;83;112m"c"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141m"d"[39m
[38;2;137;221;255m}[39m
--- error
Additional properties are not allowed ('c' was unexpected)

Failed validating 'additionalProperties' in schema:
    {'additionalProperties': False,
     'properties': {'a': {'type': 'string'}, 'id': {'type': 'integer'}},
     'required': ['id'],
     'type': 'object'}

On instance:
    {'a': '1', 'c': 'd', 'id': 123}
--- record
[38;2;137;221;255m{[39m
[38;2;238;255;255m  [39m[38;2;255;83;112m"id"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;247;140;108m234[39m

UnexpectedKeys(schema={'type': 'object', 'properties': {'id': {'type': 'integer'}, 'a': {'type': 'string'}}, 'required': ['id'], 'additionalProperties': False}, unique_keys={'d', 'c'})

In [11]:
# The set of unexpected keys accumulated by the check() calls so far.
k.unique_keys

{'c', 'd'}

In [12]:
def split_dict(d: dict, keys: list):
    '''
    Partition ``d`` into two new dicts according to membership in ``keys``.

    Returns ``(kept, extras)``: ``extras`` holds the entries whose key is in
    ``keys``, ``kept`` holds everything else. Both preserve the order in which
    the entries appear in ``d``; ``d`` itself is not modified.
    '''
    kept, extras = {}, {}
    for name, value in d.items():
        bucket = extras if name in keys else kept
        bucket[name] = value
    return kept, extras

def translate(df: list, json_schema: dict) -> list:
    '''
    Move properties the schema does not allow into a ``custom`` field.

    This is a generator. For each record in ``df``:

    * if the record has unexpected properties, one output record is yielded
      per unexpected property, each consisting of the allowed properties plus
      ``"custom": {"key": <name>, "value": <value>}``;
    * otherwise a single record is yielded with ``"custom": {}``.

    Parameters:
        df: iterable of record dicts (the records themselves are not modified).
        json_schema: schema used to decide which properties are unexpected.
    '''
    uk = UnexpectedKeys(json_schema)
    # Iterate the argument, not the notebook-level `records` global.
    for record in df:
        # Gather keys from every error: errors are ordered by their string
        # form, so the additional-properties error is not necessarily first.
        unexpected = [
            key
            for error in uk.check(record)
            for key in UnexpectedKeys.parse_unexpected_keys(error)
        ]
        d, extras = split_dict(record, unexpected)
        if extras:
            for k, v in extras.items():
                yield {**d, "custom": {"key": k, "value": v}}
        else:
            yield {**d, "custom": {}}

In [36]:
# Print the input records and the translated output one per line
# (indent=None keeps each JSON object on a single line) for easy comparison.
print('--- records')
for record in records:
    ppd(record, indent=None)

print('--- translated')
for record in translate(records, json_schema):
    ppd(record, indent=None)

--- records
[38;2;137;221;255m{[39m[38;2;255;83;112m"id"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;247;140;108m123[39m[38;2;137;221;255m,[39m[38;2;238;255;255m [39m[38;2;255;83;112m"a"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141m"1"[39m[38;2;137;221;255m,[39m[38;2;238;255;255m [39m[38;2;255;83;112m"c"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141m"d"[39m[38;2;137;221;255m}[39m
[38;2;137;221;255m{[39m[38;2;255;83;112m"id"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;247;140;108m122[39m[38;2;137;221;255m,[39m[38;2;238;255;255m [39m[38;2;255;83;112m"a"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141m"b"[39m[38;2;137;221;255m}[39m
[38;2;137;221;255m{[39m[38;2;255;83;112m"id"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;247;140;108m111[39m[38;2;137;221;255m,[39m[38;2;238;255;255m [39m[38;2;255;83;112m"a"[39m[38;2;137;221

In [38]:
# Extend the schema with an optional "custom" object holding a string key and
# a string value, which is the shape translate() adds to each record.
json_schema = {
    "type" : "object",
    "properties" : {
        "id" : {"type" : "integer"},
        "a": {"type": "string"},
        "custom": {
            "type": "object",
            "properties": {
                "key": {"type": "string"},
                "value": {"type": "string"},
            }
        }
    },
    "required": ["id"],
    "additionalProperties": False
}

# Each translated record is printed and then checked against the new schema.
for record in translate(records, json_schema):
    ppd(record, indent=None)
    validate(record, json_schema)

[38;2;137;221;255m{[39m[38;2;255;83;112m"id"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;247;140;108m123[39m[38;2;137;221;255m,[39m[38;2;238;255;255m [39m[38;2;255;83;112m"a"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141m"1"[39m[38;2;137;221;255m,[39m[38;2;238;255;255m [39m[38;2;255;83;112m"custom"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;137;221;255m{[39m[38;2;255;83;112m"key"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141m"c"[39m[38;2;137;221;255m,[39m[38;2;238;255;255m [39m[38;2;255;83;112m"value"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141m"d"[39m[38;2;137;221;255m}}[39m
[38;2;137;221;255m{[39m[38;2;255;83;112m"id"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;247;140;108m122[39m[38;2;137;221;255m,[39m[38;2;238;255;255m [39m[38;2;255;83;112m"a"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141

In [39]:
# Same schema as before, except that "custom" is now listed under "required"
# alongside "id".
json_schema = {
    "type" : "object",
    "properties" : {
        "id" : {"type" : "integer"},
        "a": {"type": "string"},
        "custom": {
            "type": "object",
            "properties": {
                "key": {"type": "string"},
                "value": {"type": "string"}
            }
        }
    },
    "required": ["id", "custom"],
    "additionalProperties": False
}
ppd(json_schema)

[38;2;137;221;255m{[39m
[38;2;238;255;255m  [39m[38;2;255;83;112m"type"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141m"object"[39m[38;2;137;221;255m,[39m
[38;2;238;255;255m  [39m[38;2;255;83;112m"properties"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;137;221;255m{[39m
[38;2;238;255;255m    [39m[38;2;255;83;112m"id"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;137;221;255m{[39m
[38;2;238;255;255m      [39m[38;2;255;83;112m"type"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141m"integer"[39m
[38;2;238;255;255m    [39m[38;2;137;221;255m},[39m
[38;2;238;255;255m    [39m[38;2;255;83;112m"a"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;137;221;255m{[39m
[38;2;238;255;255m      [39m[38;2;255;83;112m"type"[39m[38;2;137;221;255m:[39m[38;2;238;255;255m [39m[38;2;195;232;141m"string"[39m
[38;2;238;255;255m    [39m[38;2;137;221;255m},[39m
[38;2;238;255;255

In [None]:
# The first record now has two undeclared keys ("c" and "z"), to show how a
# record with several unexpected properties is translated.
records = [
    { "id": 123, "a": "1", "c": "d", "z": "zz" },
    { "id": 122, "a": "b" },
    { "id": 111, "a": "c" },
    { "id": 234, "a": "z", "d": "z" }
]

print('--- records')
for record in records:
    ppd(record, indent=None)

print('--- translated')
for record in translate(records, json_schema):
    ppd(record, indent=None)

In [None]:
# from_file only iterates its input once, so the translate() generator can be
# passed directly; the schema is inferred because none is given.
df = DFLoader.from_file(translate(records, json_schema))

In [None]:
# Show id and a next to the key/value fields pulled out of the custom column.
display(
    df.select(
        df.id,
        df.a,
        df.custom.key,
        df.custom.value,
    ).toPandas()
)

In [None]:
def translate2(df: list, json_schema: dict) -> list:
    '''
    Move properties the schema does not allow into a ``custom`` list.

    This is a generator yielding exactly one output record per input record:
    the allowed properties plus ``"custom"``, a list with one
    ``{"key": <name>, "value": <value>}`` entry per unexpected property
    (empty when the record has none).

    Parameters:
        df: iterable of record dicts (the records themselves are not modified).
        json_schema: schema used to decide which properties are unexpected.
    '''
    uk = UnexpectedKeys(json_schema)
    # Iterate the argument, not the notebook-level `records` global.
    for record in df:
        # Gather keys from every error: errors are ordered by their string
        # form, so the additional-properties error is not necessarily first.
        unexpected = [
            key
            for error in uk.check(record)
            for key in UnexpectedKeys.parse_unexpected_keys(error)
        ]
        d, extras = split_dict(record, unexpected)
        custom = [{"key": k, "value": v} for k, v in extras.items()]
        yield {**d, "custom": custom}

In [None]:
# Records where the first entry has two undeclared keys ("c" and "z");
# translate2 is expected to fold both into a single output record.
records = [
    { "id": 123, "a": "1", "c": "d", "z": "zz" },
    { "id": 122, "a": "b" },
    { "id": 111, "a": "c" },
    { "id": 234, "a": "z", "d": "z" }
]

print('--- records')
for record in records:
    ppd(record, indent=None)

print('--- translated')
for record in translate2(records, json_schema):
    ppd(record, indent=None)

In [None]:
# Load the translate2 output (custom is a list per record) with an inferred schema.
print('--- translate2')
df = DFLoader.from_file(translate2(records, json_schema))

In [None]:
# Show id and a next to the key and value fields taken from the custom column.
df.select(df.id, df.a, df.custom.key, df.custom.value).show()

In [None]:
def select_rows_with_custom_key(df: DataFrame, key: str) -> DataFrame:
    '''
    Return the rows of ``df`` whose ``custom.key`` array contains ``key``.

    The result keeps four columns: ``id``, ``a``, ``custom.key`` and
    ``custom.value``.
    '''
    has_key = F.array_contains(df.custom.key,  key)
    matching = df.where(has_key)
    return matching.select(df.id, df.a, df.custom.key, df.custom.value)

In [None]:
# check() is a generator, so list() drains it; draining is what fills
# k.unique_keys with the unexpected keys of every record.
k = UnexpectedKeys(json_schema)
for record in records:
    list(k.check(record))

# For each collected key, show the rows that carry it in their custom column.
for key in k.unique_keys:
    print("--- custom key", key)
    display(select_rows_with_custom_key(df, key).toPandas())

In [None]:
# TODO (unfinished experiment): select_rows_with_custom_key(df, 'c').select(F.filter(df.custom, ...))

In [None]:
# Explicit Spark schema in the JSON form that StructType.fromJson accepts:
# string "a", long "id", and "custom" as an array of {key, value} string structs.
# NOTE(review): the array-of-struct shape appears to correspond to translate2's
# output rather than translate's - confirm against the cell that loads with it.
schema = {
  "fields": [
    {
      "metadata": {},
      "name": "a",
      "nullable": True,
      "type": "string"
    },
    {
      "metadata": {},
      "name": "custom",
      "nullable": True,
      "type": {
        "containsNull": True,
        "elementType": {
          "fields": [
            {
              "metadata": {},
              "name": "key",
              "nullable": True,
              "type": "string"
            },
            {
              "metadata": {},
              "name": "value",
              "nullable": True,
              "type": "string"
            }
          ],
          "type": "struct"
        },
        "type": "array"
      }
    },
    {
      "metadata": {},
      "name": "id",
      "nullable": True,
      "type": "long"
    }
  ],
  "type": "struct"
}
ppd(schema)

In [None]:
# `schema` declares "custom" as an array of {key, value} structs. That is the
# shape translate2 emits (one list per record); translate emits a single
# object per row, which does not fit this schema.
df = DFLoader.from_file(translate2(records, json_schema), schema=schema)

In [None]:
# Repeat the per-key lookup against the frame loaded with the explicit schema.
# Relies on `k` having been populated by an earlier cell.
for key in k.unique_keys:
    print("--- custom key", key)
    display(select_rows_with_custom_key(df, key).toPandas())

In [None]:
# Collect the values column for rows carrying the custom key 'c'.
# NOTE(review): the backticks presumably make Spark treat "custom.value" as a
# single column name rather than field access on a struct - confirm.
c = select_rows_with_custom_key(df, 'c')
c.select('`custom.value`').collect()