# catalyst udf/udaf tests (spark2.4)

In [1]:
import pprint
from pprint import pformat

import os

In [None]:
# pprint.pprint(dict(os.environ), width=1)
def log(msg, obj=None):
    """Print `msg`, optionally followed by the type and pretty-printed value of `obj`.

    Args:
        msg: text printed first (or alone when `obj` is None).
        obj: optional object to describe; `None` means "message only".
    """
    if obj is not None:
        # width=1 makes pformat break containers onto one element per line.
        rendered = pformat(obj, indent=1, width=1)
        print("{} obj type: {}, obj data:\n{}".format(msg, type(obj), rendered))
        return
    print(msg)

# Dump the process environment twice: once as the raw `os.environ` mapping and once as a plain dict.
# NOTE(review): the environment may contain credentials/tokens; clear this cell's output before sharing.
log("os.environ:", os.environ)
log("\ndict(os.environ):", dict(os.environ))

In [None]:
import os
import sys

# Pack executable conda environment into zip
TMP_ENV_BASEDIR = "tmpenv"  # Reserved directory to store environment archive
# The environment root is two directory levels above the running interpreter (<env>/bin/python).
env_dir = os.path.dirname(os.path.dirname(sys.executable))
env_name = os.path.basename(env_dir)
# "<archive path>#<alias>" form; the alias after `#` reuses the base directory name.
env_archive = "{basedir}/{env}.zip#{basedir}".format(basedir=TMP_ENV_BASEDIR, env=env_name)
# Relative interpreter path inside the archive alias.
# NOTE(review): presumably resolved on the executors relative to the unpacked archive — confirm.
os.environ["PYSPARK_PYTHON"] = "{}/{}/bin/python".format(TMP_ENV_BASEDIR, env_name)

log("env_dir:", env_dir)

# you need this only the first time!
# (re)creates TMP_ENV_BASEDIR, copies the environment into it and zips it as {env_name}.zip
# !rm -rf {TMP_ENV_BASEDIR} && mkdir {TMP_ENV_BASEDIR} && cd {TMP_ENV_BASEDIR} && rsync -a {env_dir} . && zip -rq {env_name}.zip {env_name}

# The archive spec is passed to the session builder in a later cell:
# session builder.config("spark.yarn.dist.archives", env_archive)
log("env_archive: ", env_archive)

In [4]:
from pyspark.sql import SparkSession, SQLContext

# Spark session for this notebook: conda environment archive + JVM/driver options.
# CLI form of the driver log4j option, for reference:
# `spark-submit ... --driver-java-options "-Dlog4j.configuration=file:/home/vlk/driver_log4j.properties"`

jira_ticket = "N-102772"

queue = "root.dev.one.priority"

# "spark.driver.extraJavaOptions", "-Xss10M"
# catalyst SO while building parts. filter expression

sssp = (1 * 4) * 2 * 2 * 4 # 1 GB of data
# sssp = (300 * 4) * 2 * 2 # 300 GB

# Settings are applied to the builder in exactly this order.
spark_conf = [
    ("spark.yarn.queue", queue),
    ("spark.sql.shuffle.partitions", sssp),
    ("spark.yarn.dist.archives", env_archive),
    ("spark.executor.instances", "2"),
    ("spark.executor.cores", "6"),
    ("spark.executor.memory", "6G"),
    ("spark.executor.memoryOverhead", "6G"),
    ("spark.driver.memory", "4G"),
    ("spark.driver.maxResultSize", "2G"),
    ("spark.driver.extraJavaOptions", "-Dlog4j.configuration=file:/home/vlk/driver2_log4j.properties"),
    ("spark.speculation", "true"),
    ("spark.dynamicAllocation.enabled", "true"),
    ("spark.dynamicAllocation.minExecutors", "2"),
    ("spark.dynamicAllocation.maxExecutors", "256"),
    ("spark.dynamicAllocation.executorIdleTimeout", "300s"),
    ("spark.network.timeout", "800s"),
    ("spark.reducer.maxReqsInFlight", "10"),
    ("spark.shuffle.io.retryWait", "60s"),
    ("spark.shuffle.io.maxRetries", "10"),
    ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
    ("spark.kryoserializer.buffer.max", "1024m"),
    ("spark.hadoop.hive.exec.dynamic.partition", "true"),
    ("spark.hadoop.hive.exec.dynamic.partition.mode", "nonstrict"),
    ("spark.hadoop.hive.exec.max.dynamic.partitions", "1000000"),
    ("spark.hadoop.hive.exec.max.dynamic.partitions.pernode", "100000"),
    ("spark.hadoop.hive.metastore.client.socket.timeout", "60s"),
    ("spark.ui.enabled", "true"),
    ("spark.sql.sources.partitionColumnTypeInference.enabled", "false"),
]

session_builder = SparkSession.builder.master("yarn-client").appName("{}-test-ipynb".format(jira_ticket))
for conf_key, conf_value in spark_conf:
    session_builder = session_builder.config(conf_key, conf_value)
spark = session_builder.getOrCreate()
# .config("spark.driver.extraJavaOptions", "-Xss10M -Dlog4j.configuration=file:/home/vlk/driver_log4j.properties")
# .config("spark.jars", "hdfs:/lib/transformers-assembly-SNAPSHOT.jar")

sql_ctx = SQLContext(spark.sparkContext)
(spark, sql_ctx)

(<pyspark.sql.session.SparkSession at 0x7f03a4ca9ed0>,
 <pyspark.sql.context.SQLContext at 0x7f044c5cd8d0>)

In [None]:
# Print the session's HTML summary as plain text (calls the private `_repr_html_` hook directly).
print(spark._repr_html_())

In [None]:
# end of env. setup

In [None]:
import datetime

from operator import and_
from collections import defaultdict

import six
import luigi
import pyspark
import pyspark.sql.functions as sqlfn

import json
import itertools as it

if six.PY3:
    from functools import reduce  # make flake8 happy

import os
import numpy as np

from pprint import pformat

from pyspark.storagelevel import StorageLevel

from pyspark.sql import DataFrame, SQLContext
from pyspark.sql.types import (
    MapType, ArrayType, FloatType, DoubleType, StringType, StructType, IntegralType, IntegerType, NumericType
)
from pyspark.sql.utils import CapturedException

from pyspark.ml.wrapper import JavaWrapper

# from luigi.contrib.hdfs import HdfsTarget

from dmgrinder.common.hive import HiveMetastoreClient, HiveThriftSASLContext
from dmgrinder.apps.utils.common import add_days
from dmgrinder.apps.utils.common import unfreeze_json_param
from dmgrinder.apps.utils.common.hive import format_table, select_clause
from dmgrinder.apps.utils.common.hive import FindPartitionsEngine
from dmgrinder.apps.utils.common.external_program import AvoidLuigiFlatTaskRunner
from dmgrinder.apps.utils.common.fs import HdfsClient
from dmgrinder.apps.utils.common.luigix import HiveTableSchemaTarget
from dmgrinder.apps.utils.common.luigix import HiveExternalTask, HiveGenericTarget
from dmgrinder.apps.utils.control.client.logs import ControlLoggingMixin
from dmgrinder.apps.utils.control.luigix import ControlApp, ControlDynamicOutputPySparkTask

from dmgrinder.apps.utils.common.spark import (
    GrinderUDFLibrary,
    union_all,
    join_filter,
    read_orc_table,
    configured_join,
    insert_into_hive,
    stratified_sample,
    column_values_count,
)

In [None]:
# Register every UDF/UDAF from the assembly jar so the SQL expressions used below resolve.
# The class is imported as `GrinderUDFLibrary`; the bare name `UDFLibrary` is not defined
# anywhere in this notebook and raises NameError on a fresh kernel.
# NOTE(review): assumes GrinderUDFLibrary takes (spark, jar_path) like the original call — confirm.
GrinderUDFLibrary(spark, "hdfs:/lib/transformers-assembly-dev-SNAPSHOT.jar").register_all_udf()

In [29]:
def _test_row(i):
    """Build one synthetic row for integer `i`, in the column order of the DDL schema below."""
    score = float(i)
    return (
        str(i),
        "an{}".format(i),
        "positive",
        "ut{}".format(i % 3),
        i,
        score,
        [score / float(x+1) for x in range(7)],
        {str(k): score / float(k+1) for k in range(7)},
    )


# Synthetic test dataset: one row for each i in 1..123000.
df = spark.createDataFrame(
    data=[_test_row(i) for i in range(1, 123001)],
    schema=(
        "uid:string,audience_name:string,category:string,uid_type:string,"
        "action_dt:long,score:float,score_list:array<float>,score_map:map<string,float>"
    ),
)

In [30]:
# Location of the test dataset: <home>/<jira_ticket>/test_dataset under the user's HDFS home.
hdfs_home_dir = "hdfs:/user/vlk/"
hdfs_dir = os.path.join(hdfs_home_dir, jira_ticket, "test_dataset")

In [31]:
# Persist the synthetic dataset as parquet: 12 partitions keyed by `uid`, replacing any previous copy.
# NOTE(review): the option key looks like the Hadoop file-output-committer algorithm setting — confirm
# it is honoured when passed as a writer option.
df.repartition(12, "uid").write.option("mapreduce.fileoutputcommitter.algorithm.version", "2").parquet(
    hdfs_dir, mode="overwrite"
)

In [44]:
# Re-bind `df` to the dataset read back from HDFS (caching is intentionally left disabled),
# then print its schema and the first 10 rows.
df = spark.read.parquet(hdfs_dir)  # .persist()
df.printSchema()
df.show(10)

root
 |-- uid: string (nullable = true)
 |-- audience_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- uid_type: string (nullable = true)
 |-- action_dt: long (nullable = true)
 |-- score: float (nullable = true)
 |-- score_list: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- score_map: map (nullable = true)
 |    |-- key: string
 |    |-- value: float (valueContainsNull = true)

+---+-------------+--------+--------+---------+-----+--------------------+--------------------+
|uid|audience_name|category|uid_type|action_dt|score|          score_list|           score_map|
+---+-------------+--------+--------+---------+-----+--------------------+--------------------+
| 20|         an20|positive|     ut2|       20| 20.0|[20.0, 10.0, 6.66...|[0 -> 20.0, 1 -> ...|
| 27|         an27|positive|     ut0|       27| 27.0|[27.0, 13.5, 9.0,...|[0 -> 27.0, 1 -> ...|
| 36|         an36|positive|     ut0|       36| 36.0|[36.0, 18.0, 12.0...|[0 -> 3

In [None]:
def show(df):
    """Print the schema, the query plan and the rows of `df` (cell values not truncated).

    Returns:
        The same `df` object, so the call can end a cell or be chained.
    """
    for report in (df.printSchema, df.explain):
        report()
    df.show(truncate=False)
    return df

### gmin, gmax, gsum, gavg

In [46]:
# Aggregate expressions under test: gmin/gmax/gsum/gavg over scalar, array and map columns.
exprs = list(map(sqlfn.expr, [
    "gmin(score) as score",
    "gmax(score_list)",
    "gmin(score_map)",
    "gsum(score)",
    "gsum(score_list)",
    "gsum(score_map)",
    "gavg(score)",
    "gavg(score_list)",
    "gavg(score_map)",
]))

_df = (
    df.where("uid_type in ('ut2', 'ut1')")
    .groupBy("uid_type")
    .agg(*exprs)
    .where("score < 10")
)

In [47]:
# Print schema, plan and rows of the aggregation result; the returned DataFrame is the cell value.
show(_df)

root
 |-- uid_type: string (nullable = true)
 |-- score: float (nullable = true)
 |-- gmax(score_list): array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- gmin(score_map): map (nullable = true)
 |    |-- key: string
 |    |-- value: float (valueContainsNull = true)
 |-- gsum(score): float (nullable = true)
 |-- gsum(score_list): array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- gsum(score_map): map (nullable = true)
 |    |-- key: string
 |    |-- value: float (valueContainsNull = true)
 |-- gavg(score): float (nullable = true)
 |-- gavg(score_list): array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- gavg(score_map): map (nullable = true)
 |    |-- key: string
 |    |-- value: float (valueContainsNull = true)

+--------+-----+-----------------------------------------------------------------------+--------------------------------------------------------------------------------------------+------------+----------

DataFrame[uid_type: string, score: float, gmax(score_list): array<float>, gmin(score_map): map<string,float>, gsum(score): float, gsum(score_list): array<float>, gsum(score_map): map<string,float>, gavg(score): float, gavg(score_list): array<float>, gavg(score_map): map<string,float>]

### most_freq(column, index=null, threshold=null, prefer=null)

In [50]:
# most_freq applied to float, string and long columns.
exprs = list(map(sqlfn.expr, [
    "most_freq(score) as score",
    "most_freq(uid)",
    "most_freq(action_dt)",
]))

_df = (
    df.where("uid_type in ('ut2', 'ut1')")
    .groupBy("uid_type")
    .agg(*exprs)
    .where("score != 10")
)

In [51]:
# Print schema, plan and rows of the most_freq result; the returned DataFrame is the cell value.
show(_df)

root
 |-- uid_type: string (nullable = true)
 |-- score: float (nullable = true)
 |-- most_freq(uid): string (nullable = true)
 |-- most_freq(action_dt): long (nullable = true)

+--------+--------+--------------+--------------------+
|uid_type|score   |most_freq(uid)|most_freq(action_dt)|
+--------+--------+--------------+--------------------+
|ut2     |107603.0|22472         |44723               |
|ut1     |787.0   |32071         |21226               |
+--------+--------+--------------+--------------------+

== Physical Plan ==
*(2) Filter (isnotnull(score#1189) && NOT (score#1189 = 10.0))
+- ObjectHashAggregate(keys=[uid_type#1023], functions=[generic_most_freq(score#1025, null, null, null, 0, 0), generic_most_freq(uid#1020, null, null, null, 0, 0), generic_most_freq(action_dt#1024L, null, null, null, 0, 0)])
   +- Exchange hashpartitioning(uid_type#1023, 64)
      +- ObjectHashAggregate(keys=[uid_type#1023], functions=[partial_generic_most_freq(score#1025, null, null, null, 0, 0), p

DataFrame[uid_type: string, score: float, most_freq(uid): string, most_freq(action_dt): bigint]

### map_values_ordered(map_column, keys_list)
Сигнатура: `(map<string, float>, array<string>) => array<float>`

In [53]:
"""
+---+-------------+--------+--------+---------+-----+--------------------+--------------------+
|uid|audience_name|category|uid_type|action_dt|score|          score_list|           score_map|
+---+-------------+--------+--------+---------+-----+--------------------+--------------------+
| 20|         an20|positive|     ut2|       20| 20.0|[20.0, 10.0, 6.66...|[0 -> 20.0, 1 -> ...|
"""
# Pick the map values for keys '1', '2', '3', in that key order.
exprs = list(map(sqlfn.expr, [
    "uid",
    "score",
    "map_values_ordered(score_map, array('1', '2', '3'))",
]))

_df = (
    df.where("uid_type in ('ut2', 'ut1')")
    .select(*exprs)
    .where("score != 10")
)
show(_df)

root
 |-- uid: string (nullable = true)
 |-- score: float (nullable = true)
 |-- UDF:map_values_ordered(score_map, array(1, 2, 3)): array (nullable = true)
 |    |-- element: float (containsNull = true)

+---+-----+-------------------------------------------------+
|uid|score|UDF:map_values_ordered(score_map, array(1, 2, 3))|
+---+-----+-------------------------------------------------+
|20 |20.0 |[10.0, 6.6666665, 5.0]                           |
|70 |70.0 |[35.0, 23.333334, 17.5]                          |
|121|121.0|[60.5, 40.333332, 30.25]                         |
|131|131.0|[65.5, 43.666668, 32.75]                         |
|140|140.0|[70.0, 46.666668, 35.0]                          |
|152|152.0|[76.0, 50.666668, 38.0]                          |
|158|158.0|[79.0, 52.666668, 39.5]                          |
|200|200.0|[100.0, 66.666664, 50.0]                         |
|202|202.0|[101.0, 67.333336, 50.5]                         |
|218|218.0|[109.0, 72.666664, 54.5]                 

DataFrame[uid: string, score: float, UDF:map_values_ordered(score_map, array(1, 2, 3)): array<float>]

### is_uint32(str_column)
Сигнатура: `string => boolean`

In [54]:
# is_uint32 applied to the string `uid` column.
exprs = list(map(sqlfn.expr, [
    "uid",
    "score",
    "is_uint32(uid)",
]))

_df = (
    df.where("uid_type in ('ut2', 'ut1')")
    .select(*exprs)
    .where("score != 10")
)
show(_df)

root
 |-- uid: string (nullable = true)
 |-- score: float (nullable = true)
 |-- UDF:is_uint32(uid): boolean (nullable = true)

+---+-----+------------------+
|uid|score|UDF:is_uint32(uid)|
+---+-----+------------------+
|20 |20.0 |true              |
|70 |70.0 |true              |
|121|121.0|true              |
|131|131.0|true              |
|140|140.0|true              |
|152|152.0|true              |
|158|158.0|true              |
|200|200.0|true              |
|202|202.0|true              |
|218|218.0|true              |
|223|223.0|true              |
|269|269.0|true              |
|277|277.0|true              |
|295|295.0|true              |
|314|314.0|true              |
|347|347.0|true              |
|365|365.0|true              |
|370|370.0|true              |
|388|388.0|true              |
|389|389.0|true              |
+---+-----+------------------+
only showing top 20 rows

== Physical Plan ==
*(1) Project [uid#1020, score#1025, UDF:is_uint32(uid#1020) AS UDF:is_uint32(uid)#

DataFrame[uid: string, score: float, UDF:is_uint32(uid): boolean]

### hash_to_uint32(str_column)
Сигнатура: `string => string`

In [55]:
"""
+---+-------------+--------+--------+---------+-----+--------------------+--------------------+
|uid|audience_name|category|uid_type|action_dt|score|          score_list|           score_map|
+---+-------------+--------+--------+---------+-----+--------------------+--------------------+
| 20|         an20|positive|     ut2|       20| 20.0|[20.0, 10.0, 6.66...|[0 -> 20.0, 1 -> ...|
"""
# hash_to_uint32 applied to the string `audience_name` column.
exprs = list(map(sqlfn.expr, [
    "uid",
    "score",
    "hash_to_uint32(audience_name)",
]))

_df = (
    df.where("uid_type in ('ut2', 'ut1')")
    .select(*exprs)
    .where("score != 10")
)
show(_df)

root
 |-- uid: string (nullable = true)
 |-- score: float (nullable = true)
 |-- UDF:hash_to_uint32(audience_name): string (nullable = true)

+---+-----+---------------------------------+
|uid|score|UDF:hash_to_uint32(audience_name)|
+---+-----+---------------------------------+
|20 |20.0 |3074177046                       |
|70 |70.0 |1572117424                       |
|121|121.0|1394285225                       |
|131|131.0|1804965547                       |
|140|140.0|2520080435                       |
|152|152.0|4274212572                       |
|158|158.0|1658192223                       |
|200|200.0|3196484522                       |
|202|202.0|51751845                         |
|218|218.0|2839725946                       |
|223|223.0|365801713                        |
|269|269.0|1146275242                       |
|277|277.0|1657397741                       |
|295|295.0|910553476                        |
|314|314.0|200282540                        |
|347|347.0|3551783777         

DataFrame[uid: string, score: float, UDF:hash_to_uint32(audience_name): string]

### uid64(str_column)
Сигнатура: `string => string`

In [56]:
"""
+---+-------------+--------+--------+---------+-----+--------------------+--------------------+
|uid|audience_name|category|uid_type|action_dt|score|          score_list|           score_map|
+---+-------------+--------+--------+---------+-----+--------------------+--------------------+
| 20|         an20|positive|     ut2|       20| 20.0|[20.0, 10.0, 6.66...|[0 -> 20.0, 1 -> ...|
"""
# uid64 applied to the md5 hex digest of `audience_name`.
exprs = list(map(sqlfn.expr, [
    "uid",
    "score",
    "uid64(md5(audience_name))",
]))

_df = (
    df.where("uid_type in ('ut2', 'ut1')")
    .select(*exprs)
    .where("score != 10")
)
show(_df)

root
 |-- uid: string (nullable = true)
 |-- score: float (nullable = true)
 |-- UDF:uid64(md5(cast(audience_name as binary))): string (nullable = true)

+---+-----+---------------------------------------------+
|uid|score|UDF:uid64(md5(cast(audience_name as binary)))|
+---+-----+---------------------------------------------+
|20 |20.0 |10450430168314814339                         |
|70 |70.0 |771192226233195387                           |
|121|121.0|18367159758826262061                         |
|131|131.0|2887395016517913779                          |
|140|140.0|8805933726758575420                          |
|152|152.0|3018354615061546692                          |
|158|158.0|4321784850796573240                          |
|200|200.0|8817945939647107112                          |
|202|202.0|17848873196591168689                         |
|218|218.0|17551919422123190489                         |
|223|223.0|15761808761620872783                         |
|269|269.0|11441267015519271768   

DataFrame[uid: string, score: float, UDF:uid64(md5(cast(audience_name as binary))): string]

### map_join(map_column, items_separator, kv_separator)
Сигнатура: `(map<string, float>, string, string) => string`

In [58]:
"""
+---+-------------+--------+--------+---------+-----+--------------------+--------------------+
|uid|audience_name|category|uid_type|action_dt|score|          score_list|           score_map|
+---+-------------+--------+--------+---------+-----+--------------------+--------------------+
| 20|         an20|positive|     ut2|       20| 20.0|[20.0, 10.0, 6.66...|[0 -> 20.0, 1 -> ...|
"""
# map_join with ';' between items and ',' between key and value.
exprs = list(map(sqlfn.expr, [
    "uid",
    "score",
    "map_join(score_map, ';', ',')",
]))

_df = (
    df.where("uid_type in ('ut2', 'ut1')")
    .select(*exprs)
    .where("score != 10")
)
show(_df)

root
 |-- uid: string (nullable = true)
 |-- score: float (nullable = true)
 |-- UDF:map_join(score_map, ;, ,): string (nullable = true)

+---+-----+-------------------------------------------------------------------+
|uid|score|UDF:map_join(score_map, ;, ,)                                      |
+---+-----+-------------------------------------------------------------------+
|20 |20.0 |4,4.0;5,3.3333333;6,2.857143;1,10.0;0,20.0;2,6.6666665;3,5.0       |
|70 |70.0 |4,14.0;5,11.666667;6,10.0;1,35.0;0,70.0;2,23.333334;3,17.5         |
|121|121.0|4,24.2;5,20.166666;6,17.285715;1,60.5;0,121.0;2,40.333332;3,30.25  |
|131|131.0|4,26.2;5,21.833334;6,18.714285;1,65.5;0,131.0;2,43.666668;3,32.75  |
|140|140.0|4,28.0;5,23.333334;6,20.0;1,70.0;0,140.0;2,46.666668;3,35.0        |
|152|152.0|4,30.4;5,25.333334;6,21.714285;1,76.0;0,152.0;2,50.666668;3,38.0   |
|158|158.0|4,31.6;5,26.333334;6,22.571428;1,79.0;0,158.0;2,52.666668;3,39.5   |
|200|200.0|4,40.0;5,33.333332;6,28.571428;1,100.0;0,200.0;2,66

DataFrame[uid: string, score: float, UDF:map_join(score_map, ;, ,): string]

### uid2user(uid, uid_type)
Сигнатура: `(string, string) => string`

In [59]:
"""
+---+-------------+--------+--------+---------+-----+--------------------+--------------------+
|uid|audience_name|category|uid_type|action_dt|score|          score_list|           score_map|
+---+-------------+--------+--------+---------+-----+--------------------+--------------------+
| 20|         an20|positive|     ut2|       20| 20.0|[20.0, 10.0, 6.66...|[0 -> 20.0, 1 -> ...|
"""
# uid2user with the constant uid type 'HID'.
exprs = list(map(sqlfn.expr, [
    "uid",
    "score",
    "uid2user(uid, 'HID')",
]))

_df = (
    df.where("uid_type in ('ut2', 'ut1')")
    .select(*exprs)
    .where("score != 10")
)
show(_df)

root
 |-- uid: string (nullable = true)
 |-- score: float (nullable = true)
 |-- UDF:uid2user(uid, HID): string (nullable = true)

+---+-----+----------------------+
|uid|score|UDF:uid2user(uid, HID)|
+---+-----+----------------------+
|20 |20.0 |hid:20                |
|70 |70.0 |hid:70                |
|121|121.0|hid:121               |
|131|131.0|hid:131               |
|140|140.0|hid:140               |
|152|152.0|hid:152               |
|158|158.0|hid:158               |
|200|200.0|hid:200               |
|202|202.0|hid:202               |
|218|218.0|hid:218               |
|223|223.0|hid:223               |
|269|269.0|hid:269               |
|277|277.0|hid:277               |
|295|295.0|hid:295               |
|314|314.0|hid:314               |
|347|347.0|hid:347               |
|365|365.0|hid:365               |
|370|370.0|hid:370               |
|388|388.0|hid:388               |
|389|389.0|hid:389               |
+---+-----+----------------------+
only showing top 20 rows

== 

DataFrame[uid: string, score: float, UDF:uid2user(uid, HID): string]

### coomul(vec1, vec2); semidiff(vec1, vec2); semisum(vec1, vec2); matmul(vec1, vec2)
Сигнатура: `(array<numeric>, array<numeric>) => array<numeric>`

In [64]:
"""
+---+-------------+--------+--------+---------+-----+--------------------+--------------------+
|uid|audience_name|category|uid_type|action_dt|score|          score_list|           score_map|
+---+-------------+--------+--------+---------+-----+--------------------+--------------------+
| 20|         an20|positive|     ut2|       20| 20.0|[20.0, 10.0, 6.66...|[0 -> 20.0, 1 -> ...|
"""
# Element-wise vector functions on `score_list`; matmul gets two arrays extended with array(1, 2).
exprs = list(map(sqlfn.expr, [
    "uid",
    "score",
    "coomul(score_list, score_list)",
    "semidiff(score_list, score_list)",
    "semisum(score_list, score_list)",
    "matmul(concat(score_list, array(1, 2)), concat(array(1, 2), score_list))",
]))

_df = (
    df.where("uid_type in ('ut2', 'ut1')")
    .select(*exprs)
    .where("score != 10")
)
show(_df)

root
 |-- uid: string (nullable = true)
 |-- score: float (nullable = true)
 |-- genericvectorcoomul(score_list, score_list): array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- genericvectorsemidiff(score_list, score_list): array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- genericvectorsemisum(score_list, score_list): array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- genericvectormatmul(concat(score_list, array(1, 2)), concat(array(1, 2), score_list)): array (nullable = true)
 |    |-- element: float (containsNull = true)

+---+-----+---------------------------------------------------------------------------+---------------------------------------------+-------------------------------------------------------------+---------------------------------------------------------------------------------------------------+
|uid|score|genericvectorcoomul(score_list, score_list)                                |genericvect

DataFrame[uid: string, score: float, genericvectorcoomul(score_list, score_list): array<float>, genericvectorsemidiff(score_list, score_list): array<float>, genericvectorsemisum(score_list, score_list): array<float>, genericvectormatmul(concat(score_list, array(1, 2)), concat(array(1, 2), score_list)): array<float>]

### isinf(x); isfinite(x)
Сигнатура: `(Atomic) => Boolean`

In [77]:
"""
+---+-------------+--------+--------+---------+-----+--------------------+--------------------+
|uid|audience_name|category|uid_type|action_dt|score|          score_list|           score_map|
+---+-------------+--------+--------+---------+-----+--------------------+--------------------+
| 20|         an20|positive|     ut2|       20| 20.0|[20.0, 10.0, 6.66...|[0 -> 20.0, 1 -> ...|
"""
# Show the input schema first, then apply isinf/isfinite to a string column and to a double-cast score.
df.printSchema()
exprs = list(map(sqlfn.expr, [
    "uid",
    "score",
    "isinf(uid)",
    "isfinite(uid)",
    "isinf(double(score))",  # bug? if score is float: codegen failure: java.lang.VerifyError: Expecting a stackmap frame at branch target
    "isfinite(double(score))",
]))

_df = (
    df.where("uid_type in ('ut2', 'ut1')")
    .select(*exprs)
    .where("score != 10")
)
show(_df)

root
 |-- uid: string (nullable = true)
 |-- audience_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- uid_type: string (nullable = true)
 |-- action_dt: long (nullable = true)
 |-- score: float (nullable = true)
 |-- score_list: array (nullable = true)
 |    |-- element: float (containsNull = true)
 |-- score_map: map (nullable = true)
 |    |-- key: string
 |    |-- value: float (valueContainsNull = true)

root
 |-- uid: string (nullable = true)
 |-- score: float (nullable = true)
 |-- genericisinf(CAST(uid AS DOUBLE)): boolean (nullable = false)
 |-- genericisfinite(CAST(uid AS DOUBLE)): boolean (nullable = false)
 |-- genericisinf(CAST(score AS DOUBLE)): boolean (nullable = false)
 |-- genericisfinite(CAST(score AS DOUBLE)): boolean (nullable = false)

+---+-----+---------------------------------+------------------------------------+-----------------------------------+--------------------------------------+
|uid|score|genericisinf(CAST(uid AS DOUBLE))|gen

DataFrame[uid: string, score: float, genericisinf(CAST(uid AS DOUBLE)): boolean, genericisfinite(CAST(uid AS DOUBLE)): boolean, genericisinf(CAST(score AS DOUBLE)): boolean, genericisfinite(CAST(score AS DOUBLE)): boolean]

In [74]:
# Release any cached copy of `df`.
# NOTE(review): the `.persist()` on the parquet read is commented out, so this is likely a no-op — confirm.
df.unpersist()

DataFrame[uid: string, audience_name: string, category: string, uid_type: string, action_dt: bigint, score: float, score_list: array<float>, score_map: map<string,float>]

In [78]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()