In [9]:
from pyspark.sql import SparkSession

def _spark_context():
    """Build (or reuse) the SparkSession used by this notebook.

    The session is configured with the 'local' master and the application
    name 'syllabus'; ``getOrCreate`` hands back an already-running session
    when one exists instead of starting a second one.
    """
    builder = SparkSession.builder.master('local').appName('syllabus')
    return builder.getOrCreate()

# Module-level session shared by later cells (DFLoader.from_file reads through it).
SPARK = _spark_context()
# Bare expression on the last line so the notebook renders the session as the cell output.
SPARK

In [12]:
from pyspark.sql import DataFrame

In [36]:
import json

from pygments import highlight
from pygments.lexers import JsonLexer
from pygments.formatters import TerminalTrueColorFormatter

def ppj(j, indent=2):
    """Pretty-print a JSON string to stdout with terminal colours.

    Args:
        j: A JSON document as a string; it is parsed and re-serialised so
            the output is consistently indented.
        indent: Indentation width passed to ``json.dumps``.

    Returns:
        None. The highlighted text is printed, not returned.
    """
    pretty = json.dumps(json.loads(j), indent=indent)
    coloured = highlight(pretty, JsonLexer(), TerminalTrueColorFormatter())
    print(coloured)
   
def ppd(d, indent=2):
    """Pretty-print a JSON-serialisable Python object with terminal colours.

    Args:
        d: Any object ``json.dumps`` can serialise (dict, list, ...).
        indent: Indentation width of the printed output.

    Returns:
        None. The highlighted text is printed by ``ppj``.
    """
    # Serialise compactly and let ppj do the indenting. Indenting only the
    # intermediate string had no effect, because ppj re-parses it and
    # re-indents with its own default, so ``indent`` was silently ignored.
    ppj(json.dumps(d), indent=indent)

class DFLoader:
    """Helpers for turning in-memory records into Spark DataFrames."""

    @staticmethod
    def from_file(records: list, fpath: str) -> DataFrame:
        """Write ``records`` to ``fpath`` as newline-delimited JSON and load it with Spark.

        Args:
            records: JSON-serialisable objects; each one is written on its
                own line of the file.
            fpath: Path of the file to write (opened in 'w' mode, so any
                existing content is replaced) and then read back.

        Returns:
            The DataFrame produced by ``SPARK.read.json(fpath)``.

        Side effects:
            Shows the DataFrame with ``df.show()`` and pretty-prints its
            schema as JSON.
        """
        with open(fpath, 'w') as ostream:
            for record in records:
                print(json.dumps(record), file=ostream, end='\n')
        df = SPARK.read.json(fpath)
        df.show()
        # ppj prints the highlighted schema itself and returns None, so
        # wrapping it in print() would emit a stray "None" line afterwards.
        ppj(df.schema.json())
        return df
    
    

In [37]:
# Fixture: every value is a string; keys "c" and "d" each appear in only one record.
records = [
    {"a": "b", "c": "d"},
    {"a": "b"},
    {"a": "c"},
    {"a": "c", "d": "z"},
]

In [38]:
# Round-trip the records through f.ndjson; from_file also shows the frame and its schema.
df = DFLoader.from_file(records, 'f.ndjson')

+---+----+----+
|  a|   c|   d|
+---+----+----+
|  b|   d|null|
|  b|null|null|
|  c|null|null|
|  c|null|   z|
+---+----+----+

{
[38;2;187;187;187m  [39m[38;2;0;128;0;01m"fields"[39;00m:[38;2;187;187;187m [39m[
[38;2;187;187;187m    [39m{
[38;2;187;187;187m      [39m[38;2;0;128;0;01m"metadata"[39;00m:[38;2;187;187;187m [39m{},
[38;2;187;187;187m      [39m[38;2;0;128;0;01m"name"[39;00m:[38;2;187;187;187m [39m[38;2;186;33;33m"a"[39m,
[38;2;187;187;187m      [39m[38;2;0;128;0;01m"nullable"[39;00m:[38;2;187;187;187m [39m[38;2;0;128;0;01mtrue[39;00m,
[38;2;187;187;187m      [39m[38;2;0;128;0;01m"type"[39;00m:[38;2;187;187;187m [39m[38;2;186;33;33m"string"[39m
[38;2;187;187;187m    [39m},
[38;2;187;187;187m    [39m{
[38;2;187;187;187m      [39m[38;2;0;128;0;01m"metadata"[39;00m:[38;2;187;187;187m [39m{},
[38;2;187;187;187m      [39m[38;2;0;128;0;01m"name"[39;00m:[38;2;187;187;187m [39m[38;2;186;33;33m"c"[39m,
[38;2;187;187;187m      

In [26]:
# Fixture: column "a" is an integer in the first record and a string in the
# others, to see how schema inference handles the mixed types.
records = [
    {"a": 1, "c": "d"},
    {"a": "b"},
    {"a": "c"},
    {"a": "c", "d": "z"},
]

In [27]:
# Round-trip the records through f.ndjson; from_file also shows the frame and its schema.
df = DFLoader.from_file(records, 'f.ndjson')

+---+----+----+
|  a|   c|   d|
+---+----+----+
|  1|   d|null|
|  b|null|null|
|  c|null|null|
|  c|null|   z|
+---+----+----+

StructType([StructField('a', StringType(), True), StructField('c', StringType(), True), StructField('d', StringType(), True)])


In [28]:
# Fixture: column "a" is an integer in the first and last records and a
# string in the two middle ones.
records = [
    {"a": 1, "c": "d"},
    {"a": "b"},
    {"a": "c"},
    {"a": 1, "d": "z"},
]
# Round-trip the records through f.ndjson; from_file also shows the frame and its schema.
df = DFLoader.from_file(records, 'f.ndjson')

+---+----+----+
|  a|   c|   d|
+---+----+----+
|  1|   d|null|
|  b|null|null|
|  c|null|null|
|  1|null|   z|
+---+----+----+

StructType([StructField('a', StringType(), True), StructField('c', StringType(), True), StructField('d', StringType(), True)])
