In [1]:
import pprint
from pprint import pformat

import os
import datetime

from operator import and_
from collections import defaultdict

import six
import luigi
import pyspark
import pyspark.sql.functions as sqlfn

import json
import itertools as it

from pyspark.sql.types import MapType, ArrayType, FloatType, StringType, NumericType

if six.PY3:
    from functools import reduce  # make flake8 happy

In [2]:
# pprint.pprint(dict(os.environ), width=1)
def log(msg, obj=None):
    """Print a message, optionally followed by a pretty-printed object.

    :param msg: text to print; printed on its own when ``obj`` is ``None``.
    :param obj: optional object; when given, its type and its ``pformat``
        rendering (``indent=1, width=1``) are appended to ``msg``.
    """
    if obj is not None:
        rendered = pformat(obj, indent=1, width=1)
        msg = "{} obj type: {}, obj data:\n{}".format(msg, type(obj), rendered)
    print(msg)

# NOTE: both calls print the complete process environment into the cell output;
# clear the output before sharing if the environment may hold credentials.
log(msg="os.environ:", obj=os.environ)
log(msg="\ndict(os.environ):", obj=dict(os.environ))

## Hive Metastore listing

In [None]:
from prj.common.hive import HiveMetastoreClient
# Metastore client instance; later cells query it through the name `hmc`.
hmc = HiveMetastoreClient()

In [None]:
# Sample the metastore: the first 3 databases (sorted by name) and the first
# 3 tables of each. Every table yields one record keyed by "<db>.<table>".
dbs = list(hmc.get_all_databases())  # type: list
dbs.sort()
print("{} databases ...".format(len(dbs)))

all_items = []
for db in dbs[:3]:
    tables = list(hmc.get_all_tables(db))
    tables.sort()
    for table in tables[:3]:
        columns = hmc.get_columns(table, db)
        partition_names = hmc.get_partition_names(table, db)

        obj = {"{}.{}".format(db, table): dict(columns=columns, partitioned=partition_names)}
        # Compact JSON with sorted keys, one line per table.
        print(json.dumps(obj, sort_keys=True, separators=(",", ":")))
        all_items.append(obj)

print("collected {} records".format(len(all_items)))

In [None]:
# Persist the sampled records as a single compact JSON document (sorted keys).
with open("databases_tables_fields.json", "w") as f:
    f.write(json.dumps(all_items, sort_keys=True, separators=(",", ":")))

print("DONE")

In [None]:
# Walk every database and gather the distinct table names, plus the distinct
# names returned by get_columns / get_partition_names for each table.
all_tables_set, all_columns_set = set(), set()
for db in dbs:
    tables = sorted(hmc.get_all_tables(db))
    all_tables_set.update(tables)
    for table in tables:
        columns = hmc.get_columns(table, db)
        partition_names = hmc.get_partition_names(table, db)
        all_columns_set.update(columns, partition_names)

In [None]:
# log() takes (msg, obj): the label goes first, the data second, so that the
# table list is the part that gets pretty-printed.
log("all tables", sorted(all_tables_set))

In [None]:
# log() takes (msg, obj): the label goes first, the data second, so that the
# column list is the part that gets pretty-printed.
log("all columns", sorted(all_columns_set))

In [None]:
# Pass the result as `obj` (with a label as `msg`) so it is pretty-printed
# instead of being printed raw as the message.
log("md_mobile.mobile_app columns:", hmc.get_columns("mobile_app", "md_mobile"))

In [None]:
# Pass the result as `obj` (with a label as `msg`) so it is pretty-printed
# instead of being printed raw as the message.
log("md_mobile.mobile_app partition names:", hmc.get_partition_names("mobile_app", "md_mobile"))

## Exception serialization / deserialization (dill)

In [None]:
import dill
import pyspark
from pyspark.sql.utils import AnalysisException, CapturedException
from prj.apps.utils.common.fs import HdfsClient


In [None]:
# Fixtures for the dill round-trip experiment in the following cells.
hdfs = HdfsClient()
# Scratch file for the serialized object (user-specific HDFS location).
hdfs_file_path = "hdfs:/user/vlk/TRG-80367/temp/obj.dill"
# Sample exception to serialize; built with keyword arguments only.
exception = AnalysisException(desc="test description", stackTrace="foo bar")


In [None]:
# Serialize the sample exception to the HDFS scratch file with dill.
# NOTE(review): the file is opened with mode "w"; dill emits bytes, so a binary
# mode may be required on Python 3 — confirm against HdfsClient.open.
with hdfs.open(hdfs_file_path, "w") as f:
    dill.dump(exception, f)


In [None]:
# Try to read the exception back. The load is known to fail with
#   TypeError: __init__() takes exactly 3 arguments (1 given)
# (the local FS behaves the same). The error is the point of this cell, so it
# is caught and displayed rather than left to abort a "Run All".
signal = None
try:
    with hdfs.open(hdfs_file_path, "r") as f:
        signal = dill.load(f)
except TypeError as err:
    log("dill.load failed:", repr(err))

# Stays None when the load failed.
log(signal)

In [None]:
import dill.detect
# Serialize the sample exception with dill's tracing enabled, to inspect how
# the object is being pickled.
with dill.detect.trace(True):
    dill.dumps(exception)
#     signal = dill.load(f)


## Spark playground

In [None]:
import os
import sys

# Pack executable prj conda environment into zip
TMP_ENV_BASEDIR = "tmpenv"  # Reserved directory to store environment archive

# The environment root sits two directory levels above the running interpreter,
# e.g. /data/dmprj-anaconda/conda-dmprj-1.34.11
env_dir = os.path.dirname(os.path.dirname(sys.executable))
log("env_dir:", env_dir)

env_name = os.path.basename(env_dir)

# PYSPARK_PYTHON points at the interpreter inside the archive directory.
os.environ["PYSPARK_PYTHON"] = "/".join((TMP_ENV_BASEDIR, env_name, "bin", "python"))

# you need this only first time!
# !rm -rf {TMP_ENV_BASEDIR} && mkdir {TMP_ENV_BASEDIR} && cd {TMP_ENV_BASEDIR} && rsync -a {env_dir} . && zip -rq {env_name}.zip {env_name}

# "<zip path>#<alias>", handed to the session builder as
# .config("spark.yarn.dist.archives", env_archive)
env_archive = "{0}/{1}.zip#{0}".format(TMP_ENV_BASEDIR, env_name)
log("env_archive: ", env_archive)

In [None]:
from pyspark.sql import SparkSession, SQLContext

# Create Spark session with prj conda environment and JVM extensions
# `spark-submit ... --driver-java-options "-Dlog4j.configuration=file:/home/vlk/driver_log4j.properties"`
# spark.driver.extraJavaOptions

queue = "root.priority"

# "spark.driver.extraJavaOptions", "-Xss10M"
# catalyst SO while building parts. filter expression

# 1 GB of data
sssp = (1 * 4) * 2 * 2 * 4  # = 64 shuffle partitions

spark = (
    SparkSession.builder
    # The "yarn-client" master URL is deprecated since Spark 2.0; use "yarn"
    # together with an explicit client deploy mode instead.
    .master("yarn")
    .config("spark.submit.deployMode", "client")
    .appName("TRG-94433-test-ipynb")
    .config("spark.yarn.queue", queue)
    .config("spark.sql.shuffle.partitions", sssp)
    # Executor / driver sizing
    .config("spark.executor.instances", "2")
    .config("spark.executor.cores", "6")
    .config("spark.executor.memory", "24G")
    .config("spark.executor.memoryOverhead", "8G")
    .config("spark.driver.memory", "4G")
    .config("spark.driver.maxResultSize", "1G")
    # NOTE: user-specific absolute path to the driver log4j configuration.
    .config("spark.driver.extraJavaOptions", "-Dlog4j.configuration=file:/home/vlk/driver2_log4j.properties")
    .config("spark.speculation", "true")
    # Dynamic allocation
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "2")
    .config("spark.dynamicAllocation.maxExecutors", "256")
    .config("spark.dynamicAllocation.executorIdleTimeout", "300s")
    # Network / shuffle resilience
    .config("spark.network.timeout", "800s")
    .config("spark.reducer.maxReqsInFlight", "10")
    .config("spark.shuffle.io.retryWait", "60s")
    .config("spark.shuffle.io.maxRetries", "10")
    # Serialization
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "1024m")
    # Hive dynamic partitioning and metastore access
    .config("spark.hadoop.hive.exec.dynamic.partition", "true")
    .config("spark.hadoop.hive.exec.dynamic.partition.mode", "nonstrict")
    .config("spark.hadoop.hive.exec.max.dynamic.partitions", "1000000")
    .config("spark.hadoop.hive.exec.max.dynamic.partitions.pernode", "100000")
    .config("spark.hadoop.hive.metastore.client.socket.timeout", "60s")
    .config("spark.ui.enabled", "true")
    .config("spark.sql.sources.partitionColumnTypeInference.enabled", "false")
    # Conda environment archive built in the environment-packing cell.
    .config("spark.yarn.dist.archives", env_archive)
    .getOrCreate()
)
# .config("spark.driver.extraJavaOptions", "-Xss10M -Dlog4j.configuration=file:/home/vlk/driver_log4j.properties")
# .config("spark.jars", "hdfs:/lib/dm/prj-transformers-assembly-dev-1.5.1.jar")

sql_ctx = SQLContext(spark.sparkContext)
(spark, sql_ctx)

In [None]:
# end of env. setup

In [None]:
import os
import numpy as np

from pprint import pformat

import luigi
import pyspark.sql.functions as sqlfn

from pyspark.storagelevel import StorageLevel

from pyspark.sql import DataFrame, SQLContext
from pyspark.sql.types import (
    MapType, ArrayType, FloatType, DoubleType, StringType, StructType, IntegralType, IntegerType
)
from pyspark.sql.utils import CapturedException

from pyspark.ml.wrapper import JavaWrapper

# from luigi.contrib.hdfs import HdfsTarget

from dmprj.common.hive import HiveMetastoreClient, HiveThriftSASLContext

from dmprj.apps.utils.common import add_days
from dmprj.apps.utils.common import unfreeze_json_param
from dmprj.apps.utils.common.hive import format_table, select_clause
from dmprj.apps.utils.common.hive import FindPartitionsEngine
from dmprj.apps.utils.common.hive import select_clause
from dmprj.apps.utils.common.external_program import AvoidLuigiFlatTaskRunner
from dmprj.apps.utils.common.fs import HdfsClient

from dmprj.apps.utils.common.spark import CustomUDFLibrary, insert_into_hive

from dmprj.apps.utils.common.luigix import HiveTableSchemaTarget
from dmprj.apps.utils.common.luigix import HiveExternalTask, HiveGenericTarget

from dmprj.apps.utils.control.client.logs import ControlLoggingMixin
from dmprj.apps.utils.control.luigix import ControlApp, ControlDynamicOutputPySparkTask


In [None]:
# CustomUDFLibrary(spark, "hdfs:/lib/dm/prj-transformers-assembly-dev-1.5.1.jar").register_all_udf()

## pattern_feature

In [6]:
# Load a single daily partition (dt = '2023-01-09') of the banner pattern table.
banner_pattern_df = spark.sql("select * from ods_targetdb_data.banner_pattern where dt = '2023-01-09'")

In [7]:
# Preview the loaded rows; long values are truncated in this output.
banner_pattern_df.show()

+---+-------------------+-------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+---------+-------------+------------+----------+
| id|        create_dttm|        update_dttm|status|                name|         description|              format|           interface|            settings|editor_id|mt_mapping_id|feature_type|        dt|
+---+-------------------+-------------------+------+--------------------+--------------------+--------------------+--------------------+--------------------+---------+-------------+------------+----------+
|  1|2018-11-09 19:43:53|2020-01-09 16:37:51|active|             default|Ссылка для всех п...|[[role -> primary...|                null|                null|  2071885|         null|        null|2023-01-09|
|  2|2018-12-08 11:37:02|2022-07-28 11:52:19|active|multiformat_squar...|Мультиформат, ква...|[[role -> primary...|{"projectionFacto...|                null|  1002998|         

In [10]:
# Keep only the `format` column (renamed to format_configs) and drop null rows.
format_configs_df = (
    banner_pattern_df
    .selectExpr("`format` as format_configs")
    .where("format_configs is not NULL")
)

In [12]:
# Show the format configs in full, without truncating long values.
format_configs_df.show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [14]:
# Spark SQL higher-order expression: folds the `format` array into a
# MAP<string,int>, starting from an empty map. Every element that has a non-null
# role, field = 'url' and required = 'true' contributes an entry (role -> 1);
# all other elements leave the accumulator unchanged.
# NOTE(review): user_dmdesc.combine is presumably a map-merging UDF — confirm.
expr = """
aggregate(
  `format`,
  CAST(MAP() AS MAP<string,int>),
  (acc, x) ->
    IF(x['role'] is not NULL AND x['field']='url' AND x['required']='true',
      user_dmdesc.combine(acc, MAP(x['role'], 1)),
      acc
    )
) AS url_required_roles
"""

In [18]:
# Evaluate the expression and list the rows with the largest role maps first.
(
    banner_pattern_df
    .selectExpr(expr)
    .orderBy(sqlfn.size("url_required_roles").desc())
    .show(truncate=False)
)

+-------------------------------------------------------------------------+
|url_required_roles                                                       |
+-------------------------------------------------------------------------+
|[header_click -> 1, url_slide_3 -> 1, url_slide_2 -> 1, url_slide_1 -> 1]|
|[url_slide_2 -> 1, deeplink_slide_2 -> 1]                                |
|[url_slide_5 -> 1, deeplink_slide_5 -> 1]                                |
|[deeplink_slide_6 -> 1, url_slide_6 -> 1]                                |
|[url_slide_3 -> 1, deeplink_slide_3 -> 1]                                |
|[vk -> 1, ok -> 1]                                                       |
|[deeplink_url -> 1, primary -> 1]                                        |
|[deeplink_url -> 1, primary -> 1]                                        |
|[deeplink_url -> 1, primary -> 1]                                        |
|[deeplink_url -> 1, primary -> 1]                                        |
|[link_1 -> 

In [20]:
# Template for the role-map aggregation over the `format` array. Fill it with
# str.format():
#   {field}    - value compared against x['field']
#   {required} - value compared against x['required'] ('true' / 'false')
#   {name}     - alias of the resulting MAP<string,int> column
# NOTE(review): user_dmdesc.combine is presumably a map-merging UDF — confirm.
expr_p = """
aggregate(
  `format`,
  CAST(MAP() AS MAP<string,int>),
  (acc, x) ->
    IF(x['role'] is not NULL AND x['field']='{field}' AND x['required']='{required}',
      user_dmdesc.combine(acc, MAP(x['role'], 1)),
      acc
    )
) AS {name}
"""

In [22]:
# One role-map column per (field, required) combination, generated in the order
# url, textblock, content x required, optional; the column names follow the
# pattern "<field>_<required|optional>_roles".
(
    banner_pattern_df.selectExpr(*[
        expr_p.format(field=field, required=flag, name="{}_{}_roles".format(field, kind))
        for field in ("url", "textblock", "content")
        for flag, kind in (("true", "required"), ("false", "optional"))
    ])
    .orderBy(sqlfn.size(sqlfn.col("content_optional_roles")).desc())
    .show(truncate=False)
)

+-------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------

In [24]:
# Render the quoted, comma-separated index labels '0' .. '31' that are pasted
# into the SQL array(...) literal of the feature_type decoding query.
",".join(map("'{}'".format, range(32)))

"'0','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26','27','28','29','30','31'"

In [None]:
# Decode the `feature_type` bitmask into a map that keeps only the set bits:
#   1. bin(feature_type) is split into single characters; array_remove drops the
#      empty strings produced by splitting on ''.
#   2. reverse() puts the last binary digit first, so the array position matches
#      the index labels '0'..'31' it is zipped with.
#   3. zip_with builds (_i, _b) structs, with _b cast to int.
#   4. filter keeps the structs whose _b is non-zero; map_from_entries turns the
#      remaining structs into a map _i -> _b.
ftypes_df = (
    banner_pattern_df
        .where("feature_type is not null")  # .orderBy(sqlfn.col("feature_type").desc())
        .selectExpr(
            """
    map_from_entries(
        filter(
            zip_with(
                array('0','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26','27','28','29','30','31'),
                reverse(
                    array_remove(
                        split(bin(feature_type), ''),
                        ''
                    )
                ),
                (_i, _b) -> (_i, cast(_b as int) as _b)
            ),
            rec -> rec['_b'] != 0

        )
    )
            """
        )
)

# Print up to 500 rows without truncating the maps.
ftypes_df.show(500, truncate=False)
# ftypes_df.printSchema()

# Scratch notes: SQL fragments kept for reference only. The string is assigned
# to `t` but not used by any visible cell.
t = """
filter(array(0, null, 2, 3, null), x -> x IS NOT NULL)

            zip_with(
                array('0','1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','23','24','25','26','27','28','29','30','31'),
                reverse(split(bin(feature_type), '')),
                (i, b) -> (i, b)
            )
"""

In [23]:
# Stop the Spark session once the experiments are done.
spark.stop()