In [1]:
import pprint
from pprint import pformat

import os
import datetime

from operator import and_
from collections import defaultdict

import six
import luigi
import pyspark.sql.functions as sqlfn

import json
import itertools as it

from pyspark.sql.types import MapType, ArrayType, FloatType, StringType, NumericType

if six.PY3:
    from functools import reduce  # make flake8 happy

In [None]:
# pprint.pprint(dict(os.environ), width=1)
def log(obj, msg=""):
    """Print an optional heading, then the type and pretty-printed form of ``obj``.

    :param obj: any object; rendered with ``pformat(obj, indent=1, width=1)``,
        i.e. containers are broken onto as many lines as possible.
    :param msg: heading printed on its own line first; skipped when empty.
    """
    if msg:
        print(msg)
    rendered = pformat(obj, indent=1, width=1)
    report = "type: {}\ndata: {}".format(type(obj), rendered)
    print(report)

# Dump the process environment twice: once as the os.environ mapping object and once
# as a plain dict copy, to compare how pformat renders the two types.
# NOTE(review): this prints every environment variable; clear the cell output before
# sharing the notebook if any of them hold credentials.
log(os.environ, msg="os.environ")
print()
log(dict(os.environ), msg="dict(os.environ)")

In [4]:
import os
import sys

from pyspark.sql import SparkSession, SQLContext

# Pack executable Grinder conda environment into zip
TMP_ENV_BASEDIR = "tmpenv"  # Reserved directory to store environment archive

# Two dirname() hops from the interpreter path give the environment root
# (assumes the usual <env_dir>/bin/python layout — TODO confirm for non-conda setups).
_python_bin_dir = os.path.dirname(sys.executable)
env_dir = os.path.dirname(_python_bin_dir)
env_name = os.path.basename(env_dir)

# "<archive>#<alias>": the part after '#' is passed along with the archive path to
# spark.yarn.dist.archives; PYSPARK_PYTHON below points inside that alias directory.
env_archive = "{basedir}/{env}.zip#{basedir}".format(env=env_name, basedir=TMP_ENV_BASEDIR)
os.environ["PYSPARK_PYTHON"] = "{}/{}/bin/python".format(TMP_ENV_BASEDIR, env_name)

# The archive has to be built only once; uncomment and run the line below the first time.
# !rm -rf {TMP_ENV_BASEDIR} && mkdir {TMP_ENV_BASEDIR} && cd {TMP_ENV_BASEDIR} && rsync -a {env_dir} . && zip -rq {env_name}.zip {env_name}


In [None]:
# Create Spark session with prj conda environment and JVM extensions
# `spark-submit ... --driver-java-options "-Dlog4j.configuration=file:/home/vlk/driver_log4j.properties"`
# spark.driver.extraJavaOptions
# NOTE(review): the log4j path below is an absolute path in one user's home directory;
# it has to exist on the machine that runs this notebook.

spark = (
    SparkSession.builder
    # The "yarn-client" master string is deprecated since Spark 2.0 (Spark 2.x rewrites it
    # to master=yarn + deployMode=client with a warning) and is rejected by Spark 3.x.
    # Spell out the equivalent pair explicitly.
    .master("yarn")
    .config("spark.submit.deployMode", "client")
    .appName("TRG-73014-test")
    .config("spark.yarn.queue", "dev.other.regular")
    # Executors
    .config("spark.executor.instances", "2")
    .config("spark.executor.memory", "4G")
    .config("spark.executor.cores", "4")
    .config("spark.executor.memoryOverhead", "2G")
    .config("spark.sql.shuffle.partitions", "1024")
    # Driver
    .config("spark.driver.memory", "4G")
    .config("spark.driver.maxResultSize", "1G")
    .config("spark.driver.extraJavaOptions", "-Dlog4j.configuration=file:/home/vlk/driver_log4j.properties")
    # Scheduling / dynamic allocation
    .config("spark.speculation", "true")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "2")
    .config("spark.dynamicAllocation.maxExecutors", "256")
    .config("spark.dynamicAllocation.executorIdleTimeout", "300s")
    # Network / shuffle
    .config("spark.network.timeout", "800s")
    .config("spark.reducer.maxReqsInFlight", "10")
    .config("spark.shuffle.io.retryWait", "60s")
    .config("spark.shuffle.io.maxRetries", "10")
    # Serialization
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "1024m")
    # Hive dynamic partitioning
    .config("hive.exec.dynamic.partition", "true")
    .config("hive.exec.dynamic.partition.mode", "nonstrict")
    .config("hive.exec.max.dynamic.partitions", "1000000")
    .config("hive.exec.max.dynamic.partitions.pernode", "100000")
    # Ship the zipped Python environment prepared in the previous cell
    .config("spark.yarn.dist.archives", env_archive)
    .getOrCreate()
)
#     .config("spark.jars", "hdfs:/lib/prj-transformers-assembly-dev-1.5.0.jar")
sql_ctx = SQLContext(spark.sparkContext)
(spark, sql_ctx)

In [5]:
# end of env. setup

In [None]:
import os
import numpy as np

from pprint import pformat

import luigi
import pyspark.sql.functions as sqlfn

from pyspark.storagelevel import StorageLevel
from pyspark.sql import DataFrame, SQLContext
from pyspark.sql.types import (
    MapType, ArrayType, FloatType, DoubleType, StringType, StructType, IntegralType, IntegerType
)
from pyspark.sql.utils import CapturedException
from pyspark.ml.wrapper import JavaWrapper

from luigi.contrib.hdfs import HdfsTarget

from dmprj.apps.utils.common import add_days
from dmprj.apps.utils.common.hive import format_table, select_clause
from dmprj.apps.utils.common.luigix import HiveExternalTask
from dmprj.apps.utils.control.luigix.task import ControlApp
from dmprj.apps.utils.control.client.exception import FailedStatusException, MissingDepsStatusException

from dmprj.apps.utils.common import unfreeze_json_param
from dmprj.apps.utils.common.fs import HdfsClient
from dmprj.apps.utils.common.hive import FindPartitionsEngine
from dmprj.apps.utils.common.spark import prjUDFLibrary
from dmprj.apps.utils.common.luigix import HiveTableSchemaTarget

from dmprj.apps.utils.common.hive import select_clause
from dmprj.apps.utils.common.spark import prjUDFLibrary, insert_into_hive
from dmprj.apps.utils.common.luigix import HiveExternalTask, HiveGenericTarget
from dmprj.apps.utils.control.luigix import ControlApp, ControlDynamicOutputPySparkTask

In [None]:
# Register UDFs from the transformers assembly jar (later cells call
# `user_dmdesc.map_key_values` and `is_uint32`, presumably registered here — confirm).
# The import cell only brings `prjUDFLibrary` into scope; `CustomUDFLibrary` is not defined
# anywhere in this notebook, so the previous call raised NameError on a fresh kernel.
# NOTE(review): assumes prjUDFLibrary accepts (spark, jar_path) and exposes
# register_all_udf() exactly like the name it replaces — verify against dmprj.
prjUDFLibrary(spark, "hdfs:/lib/prj-transformers-assembly-dev-1.5.0.jar").register_all_udf()

In [8]:
def show(df, message="dataframe", nlines=20, truncate=False):
    """Print a header with the row count, then the schema and the first rows of ``df``.

    Calls ``df.count()``, ``df.printSchema()`` and ``df.show(nlines, truncate)`` in that order.

    :param df: object exposing ``count``, ``printSchema`` and ``show`` (a Spark DataFrame here).
    :param message: label used in the printed header.
    :param nlines: number of rows passed to ``df.show``.
    :param truncate: truncate flag passed to ``df.show``.
    :return: ``df`` itself, so the call can wrap an expression that is being assigned.
    """
    row_count = df.count()
    header = "\n{}, rows: {}:".format(message, row_count)
    print(header)
    df.printSchema()
    df.show(nlines, truncate)
    return df

## experiments

In [27]:
# Test rows for three map-typed columns:
# map<i32,float> friends_info,
# map<i32,i64> os_info,
# map<i32,i32> interests
# The rows mix populated maps, NULL maps, empty maps and maps holding a NULL value.
test_rows = [
    ["a", {1: 2.0}, {1: 22}, {1: 222}],
    ["b", {2: 3.0}, {2: 33}, {2: 333}],
    ["c", None, {3: None}, {}],
    ["d", {}, None, {4: None}],
    ["e", {5: None}, {}, None],
    ["f", None, None, None],
]
test_schema = "uid:string,friends_info:map<int,float>,os_info:map<int,bigint>,interests:map<int,int>"

test_frame = spark.createDataFrame(test_rows, schema=test_schema)
df = show(test_frame.persist(StorageLevel.MEMORY_ONLY))

# Drop any view left over from a previous run before registering the new one.
spark.catalog.dropGlobalTempView("test_features")
df.createGlobalTempView("test_features")  # global_temp


dataframe, rows: 6:
root
 |-- uid: string (nullable = true)
 |-- friends_info: map (nullable = true)
 |    |-- key: integer
 |    |-- value: float (valueContainsNull = true)
 |-- os_info: map (nullable = true)
 |    |-- key: integer
 |    |-- value: long (valueContainsNull = true)
 |-- interests: map (nullable = true)
 |    |-- key: integer
 |    |-- value: integer (valueContainsNull = true)

+---+------------+---------+----------+
|uid|friends_info|os_info  |interests |
+---+------------+---------+----------+
|a  |[1 -> 2.0]  |[1 -> 22]|[1 -> 222]|
|b  |[2 -> 3.0]  |[2 -> 33]|[2 -> 333]|
|c  |null        |[3 ->]   |[]        |
|d  |[]          |null     |[4 ->]    |
|e  |[5 ->]      |[]       |null      |
|f  |null        |null     |null      |
+---+------------+---------+----------+



In [10]:
# experiments with Spark SQL functions
# https://spark.apache.org/docs/latest/api/sql/index.html

In [29]:
# concat (merge) maps
# As the output of this cell shows, plain map_concat yields NULL as soon as one of the
# input maps is NULL, keeps equal keys from different inputs side by side (rows a, b)
# and produces float values for the merged map.
expr = """
map_concat(friends_info, os_info, interests) as merged
"""
concat_query = "select uid, {} from global_temp.test_features".format(expr)
show(spark.sql(concat_query))


dataframe, rows: 6:
root
 |-- uid: string (nullable = true)
 |-- merged: map (nullable = true)
 |    |-- key: integer
 |    |-- value: float (valueContainsNull = true)

+---+---------------------------------+
|uid|merged                           |
+---+---------------------------------+
|a  |[1 -> 2.0, 1 -> 22.0, 1 -> 222.0]|
|b  |[2 -> 3.0, 2 -> 33.0, 2 -> 333.0]|
|c  |null                             |
|d  |null                             |
|e  |null                             |
|f  |null                             |
+---+---------------------------------+



DataFrame[uid: string, merged: map<int,float>]

In [30]:
# transform maps with key bins
# https://github.com/klout/brickhouse/blob/863a370820f64a7825c337f708116149978c097a/src/main/java/brickhouse/udf/collect/MapKeyValuesUDF.java#L72
def shifted_map_expr(column, value_type, key_offset, alias):
    """Build a SQL expression that rebuilds map ``column`` with every key shifted by ``key_offset``.

    The generated expression replaces a NULL map by an empty ``map<int,value_type>``
    (parsed from the JSON literal '{}'), turns the map into an array of key/value
    structs with ``user_dmdesc.map_key_values``, rewrites each struct inside
    ``transform`` and reassembles the map with ``map_from_entries``.

    :param column: name of the map column to read.
    :param value_type: SQL type name of the map values, e.g. "float".
    :param key_offset: integer added to every key; with 0 the key is passed through
        unchanged (no "+ 0" is emitted).
    :param alias: name of the resulting column.
    :return: expression text suitable for ``DataFrame.selectExpr``.
    """
    key_expr = "_tup['key']"
    if key_offset:
        key_expr = "{} + {}".format(key_expr, key_offset)
    # '{{}}' is the escaped form of the literal JSON text '{}' inside a format template.
    template = (
        "map_from_entries(transform("
        "user_dmdesc.map_key_values(coalesce({column}, from_json('{{}}', 'map<int,{value_type}>'))), "
        "_tup -> ({key_expr}, _tup['value'])"
        ")) as {alias}"
    )
    return template.format(column=column, value_type=value_type, key_expr=key_expr, alias=alias)


# One call per feature map: (column, value type, key offset, output alias).
show(df.selectExpr(
    "uid",
    shifted_map_expr("friends_info", "float", 0, "fi"),
    shifted_map_expr("os_info", "long", 1000, "oi"),
    shifted_map_expr("interests", "int", 2000, "it"),
))
# map<i32,float> friends_info,
# map<i32,i64> os_info,
# map<i32,i32> interests


dataframe, rows: 6:
root
 |-- uid: string (nullable = true)
 |-- fi: map (nullable = true)
 |    |-- key: integer
 |    |-- value: float (valueContainsNull = true)
 |-- oi: map (nullable = true)
 |    |-- key: integer
 |    |-- value: long (valueContainsNull = true)
 |-- it: map (nullable = true)
 |    |-- key: integer
 |    |-- value: integer (valueContainsNull = true)

+---+----------+------------+-------------+
|uid|fi        |oi          |it           |
+---+----------+------------+-------------+
|a  |[1 -> 2.0]|[1001 -> 22]|[2001 -> 222]|
|b  |[2 -> 3.0]|[1002 -> 33]|[2002 -> 333]|
|c  |[]        |[1003 ->]   |[]           |
|d  |[]        |[]          |[2004 ->]    |
|e  |[5 ->]    |[]          |[]           |
|f  |[]        |[]          |[]           |
+---+----------+------------+-------------+



DataFrame[uid: string, fi: map<int,float>, oi: map<int,bigint>, it: map<int,int>]

In [32]:
# transform and merge maps
# Every input map is null-guarded (NULL -> empty map) and its keys are moved by
# +0 / +1000 / +2000 before map_concat; in the output of this cell no row is NULL
# and the keys of rows a and b no longer coincide.
shifted_merge_expr = """
map_concat(
        map_from_entries(transform(
            user_dmdesc.map_key_values(coalesce(friends_info, from_json('{}', 'map<int,float>'))), _tup -> (_tup['key'], _tup['value'])
        )), 
        map_from_entries(transform(
            user_dmdesc.map_key_values(coalesce(os_info, from_json('{}', 'map<int,long>'))), _tup -> (_tup['key'] + 1000, _tup['value'])
          )), 
        map_from_entries(transform(
            user_dmdesc.map_key_values(coalesce(interests, from_json('{}', 'map<int,int>'))), _tup -> (_tup['key'] + 2000, _tup['value'])
          ))
) as merged
"""
show(df.selectExpr("uid", shifted_merge_expr))


dataframe, rows: 6:
root
 |-- uid: string (nullable = true)
 |-- merged: map (nullable = true)
 |    |-- key: integer
 |    |-- value: float (valueContainsNull = true)

+---+---------------------------------------+
|uid|merged                                 |
+---+---------------------------------------+
|a  |[1 -> 2.0, 1001 -> 22.0, 2001 -> 222.0]|
|b  |[2 -> 3.0, 1002 -> 33.0, 2002 -> 333.0]|
|c  |[1003 ->]                              |
|d  |[2004 ->]                              |
|e  |[5 ->]                                 |
|f  |[]                                     |
+---+---------------------------------------+



DataFrame[uid: string, merged: map<int,float>]

In [36]:
# filter not null array values
# The lambda keeps only elements that are neither NULL nor NaN.
not_null_filter_sql = (
    "select filter("
    "array("
    "cast(1 as float), cast(null as float), cast('NaN' as float), cast(4 as float)"
    "), _x -> not isnull(_x) and not isnan(_x)"
    ") as arrcol"
)
df_0 = show(spark.sql(not_null_filter_sql))

root
 |-- arrcol: array (nullable = false)
 |    |-- element: float (containsNull = true)

+----------+
|arrcol    |
+----------+
|[1.0, 4.0]|
+----------+



In [37]:
# create map from two arrays
# Keys are strings (letters, a negative number, positive numbers); values are doubles
# including a NULL and a NaN, to feed the clean-up steps in the following cells.
map_from_arrays_sql = (
    "select map_from_arrays("
    "array('a', 'b', 'c', '101', '-1', '303'), "
    "array(cast(1 as double), cast(null as double), cast('NaN' as double), 1.01, 2.02, 3.03)"
    ") as mapcol"
)
df_1 = show(spark.sql(map_from_arrays_sql))

root
 |-- mapcol: map (nullable = false)
 |    |-- key: string
 |    |-- value: double (valueContainsNull = true)

+----------------------------------------------------------------+
|mapcol                                                          |
+----------------------------------------------------------------+
|[a -> 1.0, b ->, c -> NaN, 101 -> 1.01, -1 -> 2.02, 303 -> 3.03]|
+----------------------------------------------------------------+



In [38]:
# convert map to array
# The result is an array of struct<key, value>; note in the output of this cell that
# the element order differs from the order in which the map was built.
map_to_tuples_expr = "user_dmdesc.map_key_values(mapcol) as arrtuples"
df_2 = show(df_1.selectExpr(map_to_tuples_expr))

root
 |-- arrtuples: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- key: string (nullable = true)
 |    |    |-- value: double (nullable = true)

+----------------------------------------------------------------+
|arrtuples                                                       |
+----------------------------------------------------------------+
|[[a, 1.0], [b,], [101, 1.01], [c, NaN], [-1, 2.02], [303, 3.03]]|
+----------------------------------------------------------------+



In [39]:
# drop invalid tuples
# A tuple survives only if its value is neither NULL nor NaN and its key passes is_uint32
# (not a Spark built-in, presumably a project UDF; in the output of this cell it rejects
# the keys 'a' and '-1').
valid_tuples_expr = (
    "filter(arrtuples, _x -> "
    "not isnull(_x['value']) and not isnan(_x['value']) and is_uint32(_x['key'])"
    ") as arrtuples"
)
df_3 = show(df_2.selectExpr(valid_tuples_expr))

root
 |-- arrtuples: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- key: string (nullable = true)
 |    |    |-- value: double (nullable = true)

+--------------------------+
|arrtuples                 |
+--------------------------+
|[[101, 1.01], [303, 3.03]]|
+--------------------------+



In [42]:
# convert array of tuples to map
# map_from_entries rebuilds the map; the surrounding cast narrows the values from
# double to float. The fragments are joined without a space before "as" — the
# resulting ")as" parsed successfully when this cell was run.
tuples_to_map_expr = (
    "cast("
    "map_from_entries(arrtuples)"
    "as map<string,float>) as mapcol"
)
df_4 = show(df_3.selectExpr(tuples_to_map_expr))

root
 |-- mapcol: map (nullable = true)
 |    |-- key: string
 |    |-- value: float (valueContainsNull = true)

+--------------------------+
|mapcol                    |
+--------------------------+
|[101 -> 1.01, 303 -> 3.03]|
+--------------------------+



In [21]:
# Check what from_json returns for a JSON object literal typed as map<int,float>;
# the empty object gives an empty (non-NULL) map in the output of this cell.
# js = '{"1":2}'
js = '{}'
from_json_query = "select from_json('{}', 'map<int,float>')".format(js)
show(spark.sql(from_json_query))


dataframe, rows: 1:
root
 |-- entries: map (nullable = true)
 |    |-- key: integer
 |    |-- value: float (valueContainsNull = true)

+-------+
|entries|
+-------+
|[]     |
+-------+



DataFrame[entries: map<int,float>]

In [7]:
# Quick check of how an integer literal cast to float is displayed.
zero_as_float = spark.sql("select cast(0 as float)")
zero_as_float.show()

+----------------+
|CAST(0 AS FLOAT)|
+----------------+
|             0.0|
+----------------+



In [14]:
# Clean-up: remove the global temp view registered for the experiments and release
# the persisted test DataFrame. unpersist() is the last expression of the cell, so
# its return value is what the cell displays.
spark.catalog.dropGlobalTempView("test_features")
df.unpersist()

DataFrame[uid: string, friends_info: map<int,float>, os_info: map<int,bigint>, interests: map<int,int>]

In [15]:
# Shut the Spark session down; every cell that uses `spark` needs the session cell re-run after this.
spark.stop()