In [1]:
import pprint
from pprint import pformat
import os
import datetime

from operator import and_
from collections import defaultdict

import six
import luigi
import pyspark.sql.functions as sqlfn

import json
import itertools as it

from pyspark.sql.types import MapType, ArrayType, FloatType, StringType, NumericType

if six.PY3:
    from functools import reduce  # make flake8 happy

In [None]:
# pprint.pprint(dict(os.environ), width=1)
def log(obj, msg=""):
    """Print an optional header, then the type and pretty-printed form of ``obj``.

    :param obj: any object; rendered with ``pformat(obj, indent=1, width=1)``.
    :param msg: header line printed first; skipped when empty.
    """
    if msg:
        print(msg)
    # width=1 asks pformat for the narrowest layout it can produce.
    rendered = pformat(obj, indent=1, width=1)
    print("type: {}\ndata: {}".format(type(obj), rendered))

# Compare how the raw os.environ object and a plain dict copy of it are rendered.
# NOTE(review): the environment may hold credentials -- clear this cell's
# output before sharing the notebook.
log(obj=os.environ, msg="os.environ")
print()
log(obj=dict(os.environ), msg="dict(os.environ)")

In [3]:
import os
import sys

from pyspark.sql import SparkSession, SQLContext

# Derive the names needed to ship the current conda environment to the
# executors as a zip archive (the archive itself is built by the one-off
# shell command further down in this cell).
TMP_ENV_BASEDIR = "tmpenv"  # Reserved directory to store environment archive

# Assumes the running interpreter lives at <env_dir>/bin/python, so stripping
# two path components from sys.executable gives the environment root.
env_dir = os.path.dirname(os.path.dirname(sys.executable))
env_name = os.path.split(env_dir)[1]

# "<basedir>/<env>.zip#<basedir>": presumably the part after '#' is the alias
# under which the archive is unpacked on the cluster -- confirm against the
# spark.yarn.dist.archives documentation.
env_archive = "{basedir}/{env}.zip#{basedir}".format(
    env=env_name,
    basedir=TMP_ENV_BASEDIR,
)

# Interpreter path, relative to that alias, that PySpark should use.
os.environ["PYSPARK_PYTHON"] = "{}/{}/bin/python".format(TMP_ENV_BASEDIR, env_name)

# Run the command below only once, to build the environment archive:
# !rm -rf {TMP_ENV_BASEDIR} && mkdir {TMP_ENV_BASEDIR} && cd {TMP_ENV_BASEDIR} && rsync -a {env_dir} . && zip -rq {env_name}.zip {env_name}


In [4]:
# Create Spark session with prj conda environment and JVM extensions.
#
# Changes compared to the historical configuration:
# * master "yarn-client" has been deprecated since Spark 2.0 and is no longer
#   accepted by Spark 3; the supported form is master "yarn" plus an explicit
#   client deploy mode, which is what "yarn-client" used to be rewritten to.
# * "spark.yarn.executor.memoryOverhead" was renamed to
#   "spark.executor.memoryOverhead" in Spark 2.3. The SQL functions used
#   further down (filter, map_from_arrays, map_from_entries) already need
#   Spark 2.4+, so the new key is always understood here.
spark = (
    SparkSession.builder
    .master("yarn")
    .config("spark.submit.deployMode", "client")
    .appName("dmprj-spark_sql_functions_test")
    # YARN queue and executor sizing
    .config("spark.yarn.queue", "dev.regular")
    .config("spark.executor.instances", "1")
    .config("spark.executor.memory", "2G")
    .config("spark.executor.cores", "2")
    .config("spark.executor.memoryOverhead", "1G")
    # dynamic allocation
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.executorIdleTimeout", "300s")
    .config("spark.dynamicAllocation.maxExecutors", "128")
    # network / shuffle tolerance
    .config("spark.network.timeout", "800s")
    .config("spark.reducer.maxReqsInFlight", "10")
    .config("spark.shuffle.io.retryWait", "60s")
    .config("spark.shuffle.io.maxRetries", "10")
    .config("spark.sql.shuffle.partitions", "2")
    # driver sizing
    .config("spark.driver.memory", "2G")
    .config("spark.driver.maxResultSize", "1G")
    # serialization
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "1024m")
    # Hive dynamic partitioning
    .config("hive.exec.dynamic.partition", "true")
    .config("hive.exec.dynamic.partition.mode", "nonstrict")
    .config("hive.exec.max.dynamic.partitions", "100000")
    .config("hive.exec.max.dynamic.partitions.pernode", "10000")
    # JVM extensions and the zipped conda environment computed in the previous cell
    .config("spark.jars", "hdfs:/lib/prj-transformers-assembly-1.4.0.jar")
    .config("spark.yarn.dist.archives", env_archive)
    .getOrCreate()
)

# Legacy SQLContext handle bound to the same SparkContext as `spark`.
sql_ctx = SQLContext(spark.sparkContext)

In [67]:
# end of env. setup

In [None]:
import os

from pprint import pformat

import luigi
import pyspark.sql.functions as sqlfn

from pyspark.sql.types import MapType, ArrayType, FloatType, DoubleType, StringType, StructType, IntegralType
from pyspark.sql.utils import CapturedException

from luigi.contrib.hdfs import HdfsTarget

from dmprj.apps.utils.common import add_days
from dmprj.apps.utils.common.hive import format_table, select_clause
from dmprj.apps.utils.common.luigix import HiveExternalTask
from dmprj.apps.utils.control.luigix.task import ControlApp
from dmprj.apps.utils.control.client.exception import FailedStatusException, MissingDepsStatusException

from dmprj.apps.utils.common import unfreeze_json_param
from dmprj.apps.utils.common.fs import HdfsClient
from dmprj.apps.utils.common.hive import FindPartitionsEngine
from dmprj.apps.utils.common.spark import prjUDFLibrary
from dmprj.apps.utils.common.luigix import HiveTableSchemaTarget

from dmprj.apps.utils.common.hive import select_clause
from dmprj.apps.utils.common.spark import prjUDFLibrary, insert_into_hive
from dmprj.apps.utils.common.luigix import HiveExternalTask, HiveGenericTarget
from dmprj.apps.utils.control.luigix import ControlApp, ControlDynamicOutputPySparkTask

In [None]:
# Register the project UDFs on the session; presumably this is what makes
# functions such as is_uint32 available to the SQL cells below -- confirm.
# The cell used to reference `CustomUDFLibrary`, a name this notebook never
# imports or defines (NameError on a fresh kernel); `prjUDFLibrary` is the UDF
# library actually imported above.
# NOTE(review): assumes prjUDFLibrary exposes register_all_udf() -- confirm.
prjUDFLibrary(spark).register_all_udf()

In [9]:
def show(df, nlines=10, truncate=False):
    """Print the schema and leading rows of ``df``, then hand ``df`` back.

    Returning the frame lets a cell write ``name = show(<expression>)`` to
    both inspect and keep the result.

    :param df: object exposing ``printSchema()`` and ``show(n, truncate)``.
    :param nlines: number of rows passed to ``df.show``.
    :param truncate: truncation setting passed to ``df.show``.
    :return: the very same ``df`` object.
    """
    df.printSchema()
    df.show(n=nlines, truncate=truncate)
    return df

In [10]:
# experiments with Spark SQL functions
# https://spark.apache.org/docs/latest/api/sql/index.html

In [36]:
# Filter array values: keep only the elements that are neither NULL nor NaN.
df_0 = show(
    spark.sql(
        "select filter(array("
        "cast(1 as float), "
        "cast(null as float), "
        "cast('NaN' as float), "
        "cast(4 as float)"
        "), _x -> not isnull(_x) and not isnan(_x)) as arrcol"
    )
)

root
 |-- arrcol: array (nullable = false)
 |    |-- element: float (containsNull = true)

+----------+
|arrcol    |
+----------+
|[1.0, 4.0]|
+----------+



In [37]:
# Create a map column from parallel key/value arrays; the values deliberately
# include a NULL and a NaN, and the keys mix numeric and non-numeric strings.
df_1 = show(spark.sql(
    "select map_from_arrays("
    "array('a', 'b', 'c', '101', '-1', '303'), "
    "array("
    "cast(1 as double), cast(null as double), cast('NaN' as double), "
    "1.01, 2.02, 3.03"
    ")"
    ") as mapcol"
))

root
 |-- mapcol: map (nullable = false)
 |    |-- key: string
 |    |-- value: double (valueContainsNull = true)

+----------------------------------------------------------------+
|mapcol                                                          |
+----------------------------------------------------------------+
|[a -> 1.0, b ->, c -> NaN, 101 -> 1.01, -1 -> 2.02, 303 -> 3.03]|
+----------------------------------------------------------------+



In [38]:
# Convert the map into an array of (key, value) structs via the
# user_dmdesc.map_key_values function.
df_2 = show(
    df_1.selectExpr("user_dmdesc.map_key_values(mapcol) as arrtuples")
)

root
 |-- arrtuples: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- key: string (nullable = true)
 |    |    |-- value: double (nullable = true)

+----------------------------------------------------------------+
|arrtuples                                                       |
+----------------------------------------------------------------+
|[[a, 1.0], [b,], [101, 1.01], [c, NaN], [-1, 2.02], [303, 3.03]]|
+----------------------------------------------------------------+



In [39]:
# Drop invalid tuples: the value must be neither NULL nor NaN and the key must
# pass is_uint32 (going by the recorded output, 'a' and '-1' do not).
df_3 = show(
    df_2.selectExpr(
        "filter(arrtuples, _x -> "
        "not isnull(_x['value']) "
        "and not isnan(_x['value']) "
        "and is_uint32(_x['key'])"
        ") as arrtuples"
    )
)

root
 |-- arrtuples: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- key: string (nullable = true)
 |    |    |-- value: double (nullable = true)

+--------------------------+
|arrtuples                 |
+--------------------------+
|[[101, 1.01], [303, 3.03]]|
+--------------------------+



In [42]:
# Convert the array of (key, value) structs back to a map, narrowing the
# values from double to float with an explicit cast.
# The literal pieces are concatenated without separators, so the trailing
# space after "map_from_entries(arrtuples)" is required to keep
# ") as map<...>" readable instead of relying on the lexer to accept ")as".
df_4 = show(df_3.selectExpr(
    "cast("
    "map_from_entries(arrtuples) "
    "as map<string,float>) as mapcol"
))

root
 |-- mapcol: map (nullable = true)
 |    |-- key: string
 |    |-- value: float (valueContainsNull = true)

+--------------------------+
|mapcol                    |
+--------------------------+
|[101 -> 1.01, 303 -> 3.03]|
+--------------------------+



In [7]:
# Quick check of how a float-cast integer literal is rendered.
spark.sql(
    "select cast(0 as float)"
).show()

+----------------+
|CAST(0 AS FLOAT)|
+----------------+
|             0.0|
+----------------+



In [43]:
# Stop the Spark session created at the top of the notebook; cells that use
# `spark` cannot be re-run after this without recreating the session.
spark.stop()