In [1]:
import pprint
from pprint import pformat
import os
import datetime

from operator import and_
from collections import defaultdict

import six
import luigi
import pyspark.sql.functions as sqlfn

import json
import itertools as it

from pyspark.sql.types import MapType, ArrayType, FloatType, StringType, NumericType

if six.PY3:
    from functools import reduce  # make flake8 happy

In [None]:
# pprint.pprint(dict(os.environ), width=1)
def log(obj, msg=""):
    """Print an optional caption, then the type of ``obj`` and its pretty-printed form.

    Args:
        obj: any object; rendered with ``pformat(indent=1, width=1)`` so every
            container element lands on its own line.
        msg: caption printed on a separate line first; skipped when empty.
    """
    if msg:
        print(msg)
    rendered = pformat(obj, indent=1, width=1)
    print("type: {}\ndata: {}".format(type(obj), rendered))

# Compare how the live os.environ mapping and a plain-dict copy of it are rendered.
# NOTE(review): this prints every environment variable of the kernel process into the
# notebook output; clear the output before sharing if the environment holds credentials.
log(os.environ, msg="os.environ")
print()  # blank separator line between the two dumps
log(dict(os.environ), msg="dict(os.environ)")

In [3]:
import os
import sys

from pyspark.sql import SparkSession, SQLContext

# Pack executable prj conda environment into zip
# The environment root is taken as two directory levels above the running interpreter
# (assumes the interpreter sits at <env>/bin/python, matching PYSPARK_PYTHON below).
TMP_ENV_BASEDIR = "tmpenv"  # Reserved directory to store environment archive
env_dir = os.path.split(os.path.split(sys.executable)[0])[0]
env_name = os.path.split(env_dir)[1]
# "<archive>#<name>" form; this value is handed to spark.yarn.dist.archives when the session is built.
env_archive = "{0}/{1}.zip#{0}".format(TMP_ENV_BASEDIR, env_name)
# Interpreter path inside the archive directory, relative to the working directory.
os.environ["PYSPARK_PYTHON"] = "/".join((TMP_ENV_BASEDIR, env_name, "bin", "python"))

# Run the command below only once, the first time, to build the environment archive:
# !rm -rf {TMP_ENV_BASEDIR} && mkdir {TMP_ENV_BASEDIR} && cd {TMP_ENV_BASEDIR} && rsync -a {env_dir} . && zip -rq {env_name}.zip {env_name}


In [4]:
# Create Spark session with prj conda environment and JVM extensions.
#
# The master is "yarn" with an explicit client deploy mode: the "yarn-client" master URL
# has been deprecated since Spark 2.0 and is rejected by Spark 3.
# getOrCreate() returns an already running session if there is one; in that case the
# builder options below are not guaranteed to take effect.
spark = (
    SparkSession.builder
    .master("yarn")
    .config("spark.submit.deployMode", "client")
    .appName("dmprj-uf_export_test")
    # YARN queue and executor sizing.
    .config("spark.yarn.queue", "dev.regular")
    .config("spark.executor.instances", "10")
    .config("spark.executor.memory", "8G")
    .config("spark.executor.cores", "6")
    # NOTE(review): newer Spark releases name this key spark.executor.memoryOverhead;
    # the spelling is kept as-is here - confirm against the cluster's Spark version.
    .config("spark.yarn.executor.memoryOverhead", "2G")
    # Dynamic allocation, capped at 128 executors.
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.executorIdleTimeout", "300s")
    .config("spark.dynamicAllocation.maxExecutors", "128")
    # Network / shuffle tolerance settings.
    .config("spark.network.timeout", "800s")
    .config("spark.reducer.maxReqsInFlight", "10")
    .config("spark.shuffle.io.retryWait", "60s")
    .config("spark.shuffle.io.maxRetries", "10")
    .config("spark.sql.shuffle.partitions", "1024")
    # Driver sizing and serialization.
    .config("spark.driver.memory", "4G")
    .config("spark.driver.maxResultSize", "2G")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "1024m")
    # Hive dynamic-partition settings.
    .config("hive.exec.dynamic.partition", "true")
    .config("hive.exec.dynamic.partition.mode", "nonstrict")
    .config("hive.exec.max.dynamic.partitions", "100000")
    .config("hive.exec.max.dynamic.partitions.pernode", "10000")
    # JVM extensions jar and the zipped conda environment prepared earlier in the notebook.
    .config("spark.jars", "hdfs:/lib/dm/prj-transformers-assembly-1.4.0.jar")
    .config("spark.yarn.dist.archives", env_archive)
    .getOrCreate()
)

# SQL entry point used by the later cells (sql_ctx.sql(...)).
sql_ctx = SQLContext(spark.sparkContext)

In [67]:
# end of env. setup

In [None]:
import os

from pprint import pformat

import luigi
import pyspark.sql.functions as sqlfn

from pyspark.sql.types import MapType, ArrayType, FloatType, DoubleType, StringType, StructType, IntegralType
from pyspark.sql.utils import CapturedException

from luigi.contrib.hdfs import HdfsTarget

from dmprj.apps.utils.common import add_days
from dmprj.apps.utils.common.hive import format_table, select_clause
from dmprj.apps.utils.common.luigix import HiveExternalTask
from dmprj.apps.utils.control.luigix.task import ControlApp
from dmprj.apps.utils.control.client.exception import FailedStatusException, MissingDepsStatusException

from dmprj.apps.utils.common import unfreeze_json_param
from dmprj.apps.utils.common.fs import HdfsClient
from dmprj.apps.utils.common.hive import FindPartitionsEngine
from dmprj.apps.utils.common.luigix import HiveTableSchemaTarget

from dmprj.apps.utils.common.hive import select_clause
from dmprj.apps.utils.common.spark import CustomUDFLibrary, insert_into_hive
from dmprj.apps.utils.common.luigix import HiveExternalTask, HiveGenericTarget
from dmgrinder.apps.utils.control.luigix import ControlApp, ControlDynamicOutputPySparkTask

In [None]:
# Register the project's custom UDFs on the active Spark session.
# NOTE(review): presumably this is what makes the SQL functions used in the config
# expressions below (uid2user, map_values_ordered, user_dmdesc.collect) resolvable - confirm.
CustomUDFLibrary(spark).register_all_udf()

In [75]:
# experiments with UF export app

In [70]:
# JSON text of the export-app config used for the experiment: the same job settings as
# app_config_orig, but restricted to two audiences (all_trg and soc_med_trg) with a
# shortened bucketing CASE expression. Parsed with json.loads further down.
app_config = """
{
    "feature_name": "rng_installs_predict",
    "target_dt": "2021-10-12",
    "target_hdfs_basedir": "hdfs:/export/target/universal_features/",
    "min_target_rows": 1000000,
    "source_db": "ds_auditories",
    "source_table": "clal_adhoc_audience",
    "source_partition_conf": {
      "audience_name": [
        "rng_installs__all_trg_72510",
        "rng_installs__soc_med_trg_72510"
      ],
      "category": "positive",
      "uid_type": [
        "GAID"
      ]
    },
    "max_dt_diff": 0,
    "export_columns": {
      "uid": "uid",
      "uid_type": "uid_type",
      "bu_link_id": {
        "expr": "CASE WHEN audience_name = 'rng_installs__all_trg_72510' THEN 1 ELSE 0 END",
        "link_type": "AggBannerTopic"
      },
      "feature": "map_values_ordered(user_dmdesc.collect(audience_name, cast((CASE WHEN audience_name = 'rng_installs__all_trg_72510' AND scores_raw[0] < 0.25 THEN 1 WHEN audience_name = 'rng_installs__all_trg_72510' AND scores_raw[0] < 0.63 THEN 8 WHEN audience_name = 'rng_installs__soc_med_trg_72510' AND scores_raw[0] < 0.24 THEN 1 WHEN audience_name = 'rng_installs__soc_med_trg_72510' AND scores_raw[0] < 0.31 THEN 2 WHEN audience_name = 'rng_installs__soc_med_trg_72510' AND scores_raw[0] < 1.01 THEN 10 ELSE 0 END) as float)), array('rng_installs__all_trg_72510', 'rng_installs__soc_med_trg_72510'))"
    },
    "shuffle_partitions": 1000,
    "force": false,
    "status_urls": [
      "http://dmcontrol.host/api/compose/jobrun/1182405/"
    ],
    "output_urls": [
      "http://dmcontrol.host/api/compose/job/1142695/result/"
    ],
    "input_urls": [
      null
    ],
    "log_url": "http://dmcontrol.host/api/compose/task/937183/log/",
    "ctid": "prod_937183"
  }  
"""

In [10]:
# JSON text of the full export-app config: 25 audiences, each bucketed by score thresholds
# in the "feature" CASE expression. It is not referenced by the cells visible in this part
# of the notebook (only app_config is parsed).
# NOTE(review): the "feature" value spans several physical lines. json.loads in its default
# strict mode rejects raw newlines inside a JSON string, so if these breaks are real (and not
# an export artifact) this text needs strict=False or rejoined lines before it can be parsed.
app_config_orig = """
{
    "feature_name": "rng_installs_predict",
    "target_dt": "2021-10-12",
    "target_hdfs_basedir": "hdfs:/export/target/universal_features/",
    "min_target_rows": 1000000,
    "source_db": "ds_auditories",
    "source_table": "clal_adhoc_audience",
    "source_partition_conf": {
      "audience_name": [
        "rng_installs__all_trg_72510",
        "rng_installs__auto_trg_72510",
        "rng_installs__finance_trg_72510",
        "rng_installs__estate_trg_72510",
        "rng_installs__soc_med_trg_72510",
        "rng_installs__travel_trg_72510",
        "rng_installs__sport_trg_72510",
        "rng_installs__leisure_trg_72510",
        "rng_installs__dating_trg_72510",
        "rng_installs__games_trg_72510",
        "rng_installs__other_trg_72510",
        "rng_installs__education_trg_72510",
        "rng_installs__work_trg_72510",
        "rng_installs__fmcg_trg_72510",
        "rng_installs__potency_trg_72510",
        "rng_installs__medicine_trg_72510",
        "rng_installs__allclothes_trg_72510",
        "rng_installs__jewelry_trg_72510",
        "rng_installs__telecom_trg_72510",
        "rng_installs__photovideo_trg_72510",
        "rng_installs__forbsn_trg_72510",
        "rng_installs__animals_trg_72510",
        "rng_installs__technics_trg_72510",
        "rng_installs__kids_e_comm_trg_72510",
        "rng_installs__casino_trg_72510"
      ],
      "category": "positive",
      "uid_type": [
        "GAID"
      ]
    },
    "max_dt_diff": 0,
    "export_columns": {
      "uid": "uid",
      "uid_type": "uid_type",
      "bu_link_id": {
        "expr": "CASE WHEN audience_name = 'rng_installs__auto_trg_72510' THEN 1 WHEN audience_name = 'rng_installs__finance_trg_72510' THEN 2 WHEN audience_name = 'rng_installs__estate_trg_72510' THEN 3 WHEN audience_name = 'rng_installs__soc_med_trg_72510' THEN 4 WHEN audience_name = 'rng_installs__travel_trg_72510' THEN 5 WHEN audience_name = 'rng_installs__sport_trg_72510' THEN 6 WHEN audience_name = 'rng_installs__leisure_trg_72510' THEN 7 WHEN audience_name = 'rng_installs__dating_trg_72510' THEN 8 WHEN audience_name = 'rng_installs__games_trg_72510' THEN 9 WHEN audience_name = 'rng_installs__other_trg_72510' THEN 10 WHEN audience_name = 'rng_installs__education_trg_72510' THEN 11 WHEN audience_name = 'rng_installs__work_trg_72510' THEN 12 WHEN audience_name = 'rng_installs__fmcg_trg_72510' THEN 13 WHEN audience_name = 'rng_installs__potency_trg_72510' THEN 14 WHEN audience_name = 'rng_installs__medicine_trg_72510' THEN 15 WHEN audience_name = 'rng_installs__allclothes_trg_72510' THEN 16 WHEN audience_name = 'rng_installs__jewelry_trg_72510' THEN 17 WHEN audience_name = 'rng_installs__telecom_trg_72510' THEN 18 WHEN audience_name = 'rng_installs__photovideo_trg_72510' THEN 19 WHEN audience_name = 'rng_installs__forbsn_trg_72510' THEN 20 WHEN audience_name = 'rng_installs__animals_trg_72510' THEN 21 WHEN audience_name = 'rng_installs__technics_trg_72510' THEN 22 WHEN audience_name = 'rng_installs__kids_e_comm_trg_72510' THEN 23 WHEN audience_name = 'rng_installs__casino_trg_72510' THEN 24 ELSE 0 END",
        "link_type": "AggBannerTopic"
      },
      "feature": "map_values_ordered(user_dmdesc.collect(audience_name, cast((CASE WHEN audience_name = 'rng_installs__all_trg_72510' AND scores_raw[0] < 0.25 THEN 1 WHEN audience_name = 'rng_installs__all_trg_72510' AND scores_raw[0] < 0.32 THEN 2 WHEN audience_name = 'rng_installs__all_trg_72510' AND scores_raw[0] < 0.37 THEN 3 WHEN audience_name = 'rng_installs__all_trg_72510' AND scores_raw[0] < 0.42 THEN 4 WHEN audience_name = 'rng_installs__all_trg_72510' AND scores_raw[0] < 0.47 THEN 5 WHEN audience_name = 'rng_installs__all_trg_72510' AND scores_raw[0] < 0.51 THEN 6 WHEN audience_name = 'rng_installs__all_trg_72510' AND scores_raw[0] < 0.57 THEN 7 WHEN audience_name = 'rng_installs__all_trg_72510' AND scores_raw[0] < 0.63 THEN 8 WHEN audience_name = 'rng_installs__all_trg_72510' AND scores_raw[0] < 0.72 THEN 9 WHEN audience_name = 'rng_installs__all_trg_72510' AND scores_raw[0] < 1.01 THEN 10 WHEN audience_name = 'rng_installs__soc_med_trg_72510' AND scores_raw[0] < 0.24 THEN 1 WHEN audience_name = 'rng_installs__soc_med_trg_72510' AND scores_raw[0] < 0.31 THEN 2 WHEN audience_name = 'rng_installs__soc_med_trg_72510' AND scores_raw[0] < 0.37 THEN 3 WHEN audience_name = 'rng_installs__soc_med_trg_72510' AND scores_raw[0] < 0.42 THEN 4 WHEN audience_name = 'rng_installs__soc_med_trg_72510' AND scores_raw[0] < 0.47 THEN 5 WHEN audience_name = 'rng_installs__soc_med_trg_72510' AND scores_raw[0] < 0.51 THEN 6 WHEN audience_name = 'rng_installs__soc_med_trg_72510' AND scores_raw[0] < 0.57 THEN 7 WHEN audience_name = 'rng_installs__soc_med_trg_72510' AND scores_raw[0] < 0.63 THEN 8 WHEN audience_name = 'rng_installs__soc_med_trg_72510' AND scores_raw[0] < 0.71 THEN 9 WHEN audience_name = 'rng_installs__soc_med_trg_72510' AND scores_raw[0] < 1.01 THEN 10 WHEN audience_name = 'rng_installs__kids_e_comm_trg_72510' AND scores_raw[0] < 0.31 THEN 1 WHEN audience_name = 'rng_installs__kids_e_comm_trg_72510' AND scores_raw[0] < 0.42 THEN 2 WHEN audience_name = 
'rng_installs__kids_e_comm_trg_72510' AND scores_raw[0] < 0.51 THEN 3 WHEN audience_name = 'rng_installs__kids_e_comm_trg_72510' AND scores_raw[0] < 0.59 THEN 4 WHEN audience_name = 'rng_installs__kids_e_comm_trg_72510' AND scores_raw[0] < 0.67 THEN 5 WHEN audience_name = 'rng_installs__kids_e_comm_trg_72510' AND scores_raw[0] < 0.75 THEN 6 WHEN audience_name = 'rng_installs__kids_e_comm_trg_72510' AND scores_raw[0] < 0.83 THEN 7 WHEN audience_name = 'rng_installs__kids_e_comm_trg_72510' AND scores_raw[0] < 0.89 THEN 8 WHEN audience_name = 'rng_installs__kids_e_comm_trg_72510' AND scores_raw[0] < 0.94 THEN 9 WHEN audience_name = 'rng_installs__kids_e_comm_trg_72510' AND scores_raw[0] < 1.01 THEN 10 WHEN audience_name = 'rng_installs__other_trg_72510' AND scores_raw[0] < 0.24 THEN 1 WHEN audience_name = 'rng_installs__other_trg_72510' AND scores_raw[0] < 0.36 THEN 2 WHEN audience_name = 'rng_installs__other_trg_72510' AND scores_raw[0] < 0.48 THEN 3 WHEN audience_name = 'rng_installs__other_trg_72510' AND scores_raw[0] < 0.59 THEN 4 WHEN audience_name = 'rng_installs__other_trg_72510' AND scores_raw[0] < 0.7 THEN 5 WHEN audience_name = 'rng_installs__other_trg_72510' AND scores_raw[0] < 0.8 THEN 6 WHEN audience_name = 'rng_installs__other_trg_72510' AND scores_raw[0] < 0.88 THEN 7 WHEN audience_name = 'rng_installs__other_trg_72510' AND scores_raw[0] < 0.94 THEN 8 WHEN audience_name = 'rng_installs__other_trg_72510' AND scores_raw[0] < 0.98 THEN 9 WHEN audience_name = 'rng_installs__other_trg_72510' AND scores_raw[0] < 1.01 THEN 10 WHEN audience_name = 'rng_installs__leisure_trg_72510' AND scores_raw[0] < 0.25 THEN 1 WHEN audience_name = 'rng_installs__leisure_trg_72510' AND scores_raw[0] < 0.39 THEN 2 WHEN audience_name = 'rng_installs__leisure_trg_72510' AND scores_raw[0] < 0.51 THEN 3 WHEN audience_name = 'rng_installs__leisure_trg_72510' AND scores_raw[0] < 0.62 THEN 4 WHEN audience_name = 'rng_installs__leisure_trg_72510' AND scores_raw[0] < 0.71 THEN 5 WHEN 
audience_name = 'rng_installs__leisure_trg_72510' AND scores_raw[0] < 0.78 THEN 6 WHEN audience_name = 'rng_installs__leisure_trg_72510' AND scores_raw[0] < 0.84 THEN 7 WHEN audience_name = 'rng_installs__leisure_trg_72510' AND scores_raw[0] < 0.9 THEN 8 WHEN audience_name = 'rng_installs__leisure_trg_72510' AND scores_raw[0] < 0.95 THEN 9 WHEN audience_name = 'rng_installs__leisure_trg_72510' AND scores_raw[0] < 1.01 THEN 10 WHEN audience_name = 'rng_installs__medicine_trg_72510' AND scores_raw[0] < 0.2 THEN 1 WHEN audience_name = 'rng_installs__medicine_trg_72510' AND scores_raw[0] < 0.3 THEN 2 WHEN audience_name = 'rng_installs__medicine_trg_72510' AND scores_raw[0] < 0.4 THEN 3 WHEN audience_name = 'rng_installs__medicine_trg_72510' AND scores_raw[0] < 0.49 THEN 4 WHEN audience_name = 'rng_installs__medicine_trg_72510' AND scores_raw[0] < 0.59 THEN 5 WHEN audience_name = 'rng_installs__medicine_trg_72510' AND scores_raw[0] < 0.66 THEN 6 WHEN audience_name = 'rng_installs__medicine_trg_72510' AND scores_raw[0] < 0.73 THEN 7 WHEN audience_name = 'rng_installs__medicine_trg_72510' AND scores_raw[0] < 0.8 THEN 8 WHEN audience_name = 'rng_installs__medicine_trg_72510' AND scores_raw[0] < 0.88 THEN 9 WHEN audience_name = 'rng_installs__medicine_trg_72510' AND scores_raw[0] < 1.01 THEN 10 WHEN audience_name = 'rng_installs__fmcg_trg_72510' AND scores_raw[0] < 0.22 THEN 1 WHEN audience_name = 'rng_installs__fmcg_trg_72510' AND scores_raw[0] < 0.33 THEN 2 WHEN audience_name = 'rng_installs__fmcg_trg_72510' AND scores_raw[0] < 0.43 THEN 3 WHEN audience_name = 'rng_installs__fmcg_trg_72510' AND scores_raw[0] < 0.5 THEN 4 WHEN audience_name = 'rng_installs__fmcg_trg_72510' AND scores_raw[0] < 0.57 THEN 5 WHEN audience_name = 'rng_installs__fmcg_trg_72510' AND scores_raw[0] < 0.65 THEN 6 WHEN audience_name = 'rng_installs__fmcg_trg_72510' AND scores_raw[0] < 0.71 THEN 7 WHEN audience_name = 'rng_installs__fmcg_trg_72510' AND scores_raw[0] < 0.79 THEN 8 WHEN audience_name = 
'rng_installs__fmcg_trg_72510' AND scores_raw[0] < 0.85 THEN 9 WHEN audience_name = 'rng_installs__fmcg_trg_72510' AND scores_raw[0] < 1.01 THEN 10 WHEN audience_name = 'rng_installs__games_trg_72510' AND scores_raw[0] < 0.43 THEN 1 WHEN audience_name = 'rng_installs__games_trg_72510' AND scores_raw[0] < 0.56 THEN 2 WHEN audience_name = 'rng_installs__games_trg_72510' AND scores_raw[0] < 0.65 THEN 3 WHEN audience_name = 'rng_installs__games_trg_72510' AND scores_raw[0] < 0.73 THEN 4 WHEN audience_name = 'rng_installs__games_trg_72510' AND scores_raw[0] < 0.79 THEN 5 WHEN audience_name = 'rng_installs__games_trg_72510' AND scores_raw[0] < 0.84 THEN 6 WHEN audience_name = 'rng_installs__games_trg_72510' AND scores_raw[0] < 0.89 THEN 7 WHEN audience_name = 'rng_installs__games_trg_72510' AND scores_raw[0] < 0.93 THEN 8 WHEN audience_name = 'rng_installs__games_trg_72510' AND scores_raw[0] < 0.96 THEN 9 WHEN audience_name = 'rng_installs__games_trg_72510' AND scores_raw[0] < 1.01 THEN 10 WHEN audience_name = 'rng_installs__auto_trg_72510' AND scores_raw[0] < 0.22 THEN 1 WHEN audience_name = 'rng_installs__auto_trg_72510' AND scores_raw[0] < 0.32 THEN 2 WHEN audience_name = 'rng_installs__auto_trg_72510' AND scores_raw[0] < 0.42 THEN 3 WHEN audience_name = 'rng_installs__auto_trg_72510' AND scores_raw[0] < 0.51 THEN 4 WHEN audience_name = 'rng_installs__auto_trg_72510' AND scores_raw[0] < 0.6 THEN 5 WHEN audience_name = 'rng_installs__auto_trg_72510' AND scores_raw[0] < 0.68 THEN 6 WHEN audience_name = 'rng_installs__auto_trg_72510' AND scores_raw[0] < 0.77 THEN 7 WHEN audience_name = 'rng_installs__auto_trg_72510' AND scores_raw[0] < 0.85 THEN 8 WHEN audience_name = 'rng_installs__auto_trg_72510' AND scores_raw[0] < 0.92 THEN 9 WHEN audience_name = 'rng_installs__auto_trg_72510' AND scores_raw[0] < 1.01 THEN 10 WHEN audience_name = 'rng_installs__dating_trg_72510' AND scores_raw[0] < 0.38 THEN 1 WHEN audience_name = 'rng_installs__dating_trg_72510' AND scores_raw[0] < 
0.56 THEN 2 WHEN audience_name = 'rng_installs__dating_trg_72510' AND scores_raw[0] < 0.69 THEN 3 WHEN audience_name = 'rng_installs__dating_trg_72510' AND scores_raw[0] < 0.78 THEN 4 WHEN audience_name = 'rng_installs__dating_trg_72510' AND scores_raw[0] < 0.85 THEN 5 WHEN audience_name = 'rng_installs__dating_trg_72510' AND scores_raw[0] < 0.9 THEN 6 WHEN audience_name = 'rng_installs__dating_trg_72510' AND scores_raw[0] < 0.94 THEN 7 WHEN audience_name = 'rng_installs__dating_trg_72510' AND scores_raw[0] < 0.97 THEN 8 WHEN audience_name = 'rng_installs__dating_trg_72510' AND scores_raw[0] < 0.98 THEN 9 WHEN audience_name = 'rng_installs__dating_trg_72510' AND scores_raw[0] < 1.01 THEN 10 WHEN audience_name = 'rng_installs__sport_trg_72510' AND scores_raw[0] < 0.2 THEN 1 WHEN audience_name = 'rng_installs__sport_trg_72510' AND scores_raw[0] < 0.38 THEN 2 WHEN audience_name = 'rng_installs__sport_trg_72510' AND scores_raw[0] < 0.55 THEN 3 WHEN audience_name = 'rng_installs__sport_trg_72510' AND scores_raw[0] < 0.67 THEN 4 WHEN audience_name = 'rng_installs__sport_trg_72510' AND scores_raw[0] < 0.78 THEN 5 WHEN audience_name = 'rng_installs__sport_trg_72510' AND scores_raw[0] < 0.85 THEN 6 WHEN audience_name = 'rng_installs__sport_trg_72510' AND scores_raw[0] < 0.91 THEN 7 WHEN audience_name = 'rng_installs__sport_trg_72510' AND scores_raw[0] < 0.95 THEN 8 WHEN audience_name = 'rng_installs__sport_trg_72510' AND scores_raw[0] < 0.98 THEN 9 WHEN audience_name = 'rng_installs__sport_trg_72510' AND scores_raw[0] < 1.01 THEN 10 WHEN audience_name = 'rng_installs__finance_trg_72510' AND scores_raw[0] < 0.29 THEN 1 WHEN audience_name = 'rng_installs__finance_trg_72510' AND scores_raw[0] < 0.43 THEN 2 WHEN audience_name = 'rng_installs__finance_trg_72510' AND scores_raw[0] < 0.54 THEN 3 WHEN audience_name = 'rng_installs__finance_trg_72510' AND scores_raw[0] < 0.63 THEN 4 WHEN audience_name = 'rng_installs__finance_trg_72510' AND scores_raw[0] < 0.7 THEN 5 WHEN 
audience_name = 'rng_installs__finance_trg_72510' AND scores_raw[0] < 0.77 THEN 6 WHEN audience_name = 'rng_installs__finance_trg_72510' AND scores_raw[0] < 0.84 THEN 7 WHEN audience_name = 'rng_installs__finance_trg_72510' AND scores_raw[0] < 0.89 THEN 8 WHEN audience_name = 'rng_installs__finance_trg_72510' AND scores_raw[0] < 0.94 THEN 9 WHEN audience_name = 'rng_installs__finance_trg_72510' AND scores_raw[0] < 1.01 THEN 10 WHEN audience_name = 'rng_installs__forbsn_trg_72510' AND scores_raw[0] < 0.04 THEN 1 WHEN audience_name = 'rng_installs__forbsn_trg_72510' AND scores_raw[0] < 0.1 THEN 2 WHEN audience_name = 'rng_installs__forbsn_trg_72510' AND scores_raw[0] < 0.18 THEN 3 WHEN audience_name = 'rng_installs__forbsn_trg_72510' AND scores_raw[0] < 0.28 THEN 4 WHEN audience_name = 'rng_installs__forbsn_trg_72510' AND scores_raw[0] < 0.43 THEN 5 WHEN audience_name = 'rng_installs__forbsn_trg_72510' AND scores_raw[0] < 0.59 THEN 6 WHEN audience_name = 'rng_installs__forbsn_trg_72510' AND scores_raw[0] < 0.69 THEN 7 WHEN audience_name = 'rng_installs__forbsn_trg_72510' AND scores_raw[0] < 0.82 THEN 8 WHEN audience_name = 'rng_installs__forbsn_trg_72510' AND scores_raw[0] < 0.92 THEN 9 WHEN audience_name = 'rng_installs__forbsn_trg_72510' AND scores_raw[0] < 1.01 THEN 10 WHEN audience_name = 'rng_installs__travel_trg_72510' AND scores_raw[0] < 0.19 THEN 1 WHEN audience_name = 'rng_installs__travel_trg_72510' AND scores_raw[0] < 0.32 THEN 2 WHEN audience_name = 'rng_installs__travel_trg_72510' AND scores_raw[0] < 0.46 THEN 3 WHEN audience_name = 'rng_installs__travel_trg_72510' AND scores_raw[0] < 0.58 THEN 4 WHEN audience_name = 'rng_installs__travel_trg_72510' AND scores_raw[0] < 0.67 THEN 5 WHEN audience_name = 'rng_installs__travel_trg_72510' AND scores_raw[0] < 0.76 THEN 6 WHEN audience_name = 'rng_installs__travel_trg_72510' AND scores_raw[0] < 0.83 THEN 7 WHEN audience_name = 'rng_installs__travel_trg_72510' AND scores_raw[0] < 0.89 THEN 8 WHEN audience_name = 
'rng_installs__travel_trg_72510' AND scores_raw[0] < 0.94 THEN 9 WHEN audience_name = 'rng_installs__travel_trg_72510' AND scores_raw[0] < 1.01 THEN 10 WHEN audience_name = 'rng_installs__allclothes_trg_72510' AND scores_raw[0] < 0.24 THEN 1 WHEN audience_name = 'rng_installs__allclothes_trg_72510' AND scores_raw[0] < 0.3 THEN 2 WHEN audience_name = 'rng_installs__allclothes_trg_72510' AND scores_raw[0] < 0.36 THEN 3 WHEN audience_name = 'rng_installs__allclothes_trg_72510' AND scores_raw[0] < 0.41 THEN 4 WHEN audience_name = 'rng_installs__allclothes_trg_72510' AND scores_raw[0] < 0.45 THEN 5 WHEN audience_name = 'rng_installs__allclothes_trg_72510' AND scores_raw[0] < 0.5 THEN 6 WHEN audience_name = 'rng_installs__allclothes_trg_72510' AND scores_raw[0] < 0.56 THEN 7 WHEN audience_name = 'rng_installs__allclothes_trg_72510' AND scores_raw[0] < 0.62 THEN 8 WHEN audience_name = 'rng_installs__allclothes_trg_72510' AND scores_raw[0] < 0.71 THEN 9 WHEN audience_name = 'rng_installs__allclothes_trg_72510' AND scores_raw[0] < 1.01 THEN 10 WHEN audience_name = 'rng_installs__technics_trg_72510' AND scores_raw[0] < 0.11 THEN 1 WHEN audience_name = 'rng_installs__technics_trg_72510' AND scores_raw[0] < 0.17 THEN 2 WHEN audience_name = 'rng_installs__technics_trg_72510' AND scores_raw[0] < 0.25 THEN 3 WHEN audience_name = 'rng_installs__technics_trg_72510' AND scores_raw[0] < 0.35 THEN 4 WHEN audience_name = 'rng_installs__technics_trg_72510' AND scores_raw[0] < 0.46 THEN 5 WHEN audience_name = 'rng_installs__technics_trg_72510' AND scores_raw[0] < 0.57 THEN 6 WHEN audience_name = 'rng_installs__technics_trg_72510' AND scores_raw[0] < 0.7 THEN 7 WHEN audience_name = 'rng_installs__technics_trg_72510' AND scores_raw[0] < 0.78 THEN 8 WHEN audience_name = 'rng_installs__technics_trg_72510' AND scores_raw[0] < 0.88 THEN 9 WHEN audience_name = 'rng_installs__technics_trg_72510' AND scores_raw[0] < 1.01 THEN 10 WHEN audience_name = 'rng_installs__estate_trg_72510' AND 
scores_raw[0] < 0.13 THEN 1 WHEN audience_name = 'rng_installs__estate_trg_72510' AND scores_raw[0] < 0.24 THEN 2 WHEN audience_name = 'rng_installs__estate_trg_72510' AND scores_raw[0] < 0.34 THEN 3 WHEN audience_name = 'rng_installs__estate_trg_72510' AND scores_raw[0] < 0.44 THEN 4 WHEN audience_name = 'rng_installs__estate_trg_72510' AND scores_raw[0] < 0.53 THEN 5 WHEN audience_name = 'rng_installs__estate_trg_72510' AND scores_raw[0] < 0.63 THEN 6 WHEN audience_name = 'rng_installs__estate_trg_72510' AND scores_raw[0] < 0.7 THEN 7 WHEN audience_name = 'rng_installs__estate_trg_72510' AND scores_raw[0] < 0.78 THEN 8 WHEN audience_name = 'rng_installs__estate_trg_72510' AND scores_raw[0] < 0.87 THEN 9 WHEN audience_name = 'rng_installs__estate_trg_72510' AND scores_raw[0] < 1.01 THEN 10 WHEN audience_name = 'rng_installs__potency_trg_72510' AND scores_raw[0] < 0.25 THEN 1 WHEN audience_name = 'rng_installs__potency_trg_72510' AND scores_raw[0] < 0.32 THEN 2 WHEN audience_name = 'rng_installs__potency_trg_72510' AND scores_raw[0] < 0.37 THEN 3 WHEN audience_name = 'rng_installs__potency_trg_72510' AND scores_raw[0] < 0.42 THEN 4 WHEN audience_name = 'rng_installs__potency_trg_72510' AND scores_raw[0] < 0.47 THEN 5 WHEN audience_name = 'rng_installs__potency_trg_72510' AND scores_raw[0] < 0.51 THEN 6 WHEN audience_name = 'rng_installs__potency_trg_72510' AND scores_raw[0] < 0.57 THEN 7 WHEN audience_name = 'rng_installs__potency_trg_72510' AND scores_raw[0] < 0.63 THEN 8 WHEN audience_name = 'rng_installs__potency_trg_72510' AND scores_raw[0] < 0.72 THEN 9 WHEN audience_name = 'rng_installs__potency_trg_72510' AND scores_raw[0] < 1.01 THEN 10 WHEN audience_name = 'rng_installs__work_trg_72510' AND scores_raw[0] < 0.16 THEN 1 WHEN audience_name = 'rng_installs__work_trg_72510' AND scores_raw[0] < 0.3 THEN 2 WHEN audience_name = 'rng_installs__work_trg_72510' AND scores_raw[0] < 0.44 THEN 3 WHEN audience_name = 'rng_installs__work_trg_72510' AND scores_raw[0] < 
0.55 THEN 4 WHEN audience_name = 'rng_installs__work_trg_72510' AND scores_raw[0] < 0.66 THEN 5 WHEN audience_name = 'rng_installs__work_trg_72510' AND scores_raw[0] < 0.73 THEN 6 WHEN audience_name = 'rng_installs__work_trg_72510' AND scores_raw[0] < 0.8 THEN 7 WHEN audience_name = 'rng_installs__work_trg_72510' AND scores_raw[0] < 0.87 THEN 8 WHEN audience_name = 'rng_installs__work_trg_72510' AND scores_raw[0] < 0.93 THEN 9 WHEN audience_name = 'rng_installs__work_trg_72510' AND scores_raw[0] < 1.01 THEN 10 WHEN audience_name = 'rng_installs__photovideo_trg_72510' AND scores_raw[0] < 0.24 THEN 1 WHEN audience_name = 'rng_installs__photovideo_trg_72510' AND scores_raw[0] < 0.31 THEN 2 WHEN audience_name = 'rng_installs__photovideo_trg_72510' AND scores_raw[0] < 0.36 THEN 3 WHEN audience_name = 'rng_installs__photovideo_trg_72510' AND scores_raw[0] < 0.41 THEN 4 WHEN audience_name = 'rng_installs__photovideo_trg_72510' AND scores_raw[0] < 0.46 THEN 5 WHEN audience_name = 'rng_installs__photovideo_trg_72510' AND scores_raw[0] < 0.51 THEN 6 WHEN audience_name = 'rng_installs__photovideo_trg_72510' AND scores_raw[0] < 0.56 THEN 7 WHEN audience_name = 'rng_installs__photovideo_trg_72510' AND scores_raw[0] < 0.62 THEN 8 WHEN audience_name = 'rng_installs__photovideo_trg_72510' AND scores_raw[0] < 0.71 THEN 9 WHEN audience_name = 'rng_installs__photovideo_trg_72510' AND scores_raw[0] < 1.01 THEN 10 WHEN audience_name = 'rng_installs__telecom_trg_72510' AND scores_raw[0] < 0.24 THEN 1 WHEN audience_name = 'rng_installs__telecom_trg_72510' AND scores_raw[0] < 0.31 THEN 2 WHEN audience_name = 'rng_installs__telecom_trg_72510' AND scores_raw[0] < 0.36 THEN 3 WHEN audience_name = 'rng_installs__telecom_trg_72510' AND scores_raw[0] < 0.41 THEN 4 WHEN audience_name = 'rng_installs__telecom_trg_72510' AND scores_raw[0] < 0.45 THEN 5 WHEN audience_name = 'rng_installs__telecom_trg_72510' AND scores_raw[0] < 0.5 THEN 6 WHEN audience_name = 'rng_installs__telecom_trg_72510' AND 
scores_raw[0] < 0.56 THEN 7 WHEN audience_name = 'rng_installs__telecom_trg_72510' AND scores_raw[0] < 0.62 THEN 8 WHEN audience_name = 'rng_installs__telecom_trg_72510' AND scores_raw[0] < 0.71 THEN 9 WHEN audience_name = 'rng_installs__telecom_trg_72510' AND scores_raw[0] < 1.01 THEN 10 WHEN audience_name = 'rng_installs__jewelry_trg_72510' AND scores_raw[0] < 0.24 THEN 1 WHEN audience_name = 'rng_installs__jewelry_trg_72510' AND scores_raw[0] < 0.31 THEN 2 WHEN audience_name = 'rng_installs__jewelry_trg_72510' AND scores_raw[0] < 0.36 THEN 3 WHEN audience_name = 'rng_installs__jewelry_trg_72510' AND scores_raw[0] < 0.41 THEN 4 WHEN audience_name = 'rng_installs__jewelry_trg_72510' AND scores_raw[0] < 0.45 THEN 5 WHEN audience_name = 'rng_installs__jewelry_trg_72510' AND scores_raw[0] < 0.5 THEN 6 WHEN audience_name = 'rng_installs__jewelry_trg_72510' AND scores_raw[0] < 0.56 THEN 7 WHEN audience_name = 'rng_installs__jewelry_trg_72510' AND scores_raw[0] < 0.63 THEN 8 WHEN audience_name = 'rng_installs__jewelry_trg_72510' AND scores_raw[0] < 0.71 THEN 9 WHEN audience_name = 'rng_installs__jewelry_trg_72510' AND scores_raw[0] < 1.01 THEN 10 WHEN audience_name = 'rng_installs__education_trg_72510' AND scores_raw[0] < 0.05 THEN 1 WHEN audience_name = 'rng_installs__education_trg_72510' AND scores_raw[0] < 0.1 THEN 2 WHEN audience_name = 'rng_installs__education_trg_72510' AND scores_raw[0] < 0.14 THEN 3 WHEN audience_name = 'rng_installs__education_trg_72510' AND scores_raw[0] < 0.21 THEN 4 WHEN audience_name = 'rng_installs__education_trg_72510' AND scores_raw[0] < 0.5 THEN 5 WHEN audience_name = 'rng_installs__education_trg_72510' AND scores_raw[0] < 0.66 THEN 6 WHEN audience_name = 'rng_installs__education_trg_72510' AND scores_raw[0] < 0.77 THEN 7 WHEN audience_name = 'rng_installs__education_trg_72510' AND scores_raw[0] < 0.9 THEN 8 WHEN audience_name = 'rng_installs__education_trg_72510' AND scores_raw[0] < 0.91 THEN 9 WHEN audience_name = 
'rng_installs__education_trg_72510' AND scores_raw[0] < 1.01 THEN 10 WHEN audience_name = 'rng_installs__animals_trg_72510' AND scores_raw[0] < 0.24 THEN 1 WHEN audience_name = 'rng_installs__animals_trg_72510' AND scores_raw[0] < 0.31 THEN 2 WHEN audience_name = 'rng_installs__animals_trg_72510' AND scores_raw[0] < 0.36 THEN 3 WHEN audience_name = 'rng_installs__animals_trg_72510' AND scores_raw[0] < 0.41 THEN 4 WHEN audience_name = 'rng_installs__animals_trg_72510' AND scores_raw[0] < 0.45 THEN 5 WHEN audience_name = 'rng_installs__animals_trg_72510' AND scores_raw[0] < 0.5 THEN 6 WHEN audience_name = 'rng_installs__animals_trg_72510' AND scores_raw[0] < 0.56 THEN 7 WHEN audience_name = 'rng_installs__animals_trg_72510' AND scores_raw[0] < 0.62 THEN 8 WHEN audience_name = 'rng_installs__animals_trg_72510' AND scores_raw[0] < 0.71 THEN 9 WHEN audience_name = 'rng_installs__animals_trg_72510' AND scores_raw[0] < 1.01 THEN 10 WHEN audience_name = 'rng_installs__casino_trg_72510' AND scores_raw[0] < 0.24 THEN 1 WHEN audience_name = 'rng_installs__casino_trg_72510' AND scores_raw[0] < 0.31 THEN 2 WHEN audience_name = 'rng_installs__casino_trg_72510' AND scores_raw[0] < 0.36 THEN 3 WHEN audience_name = 'rng_installs__casino_trg_72510' AND scores_raw[0] < 0.41 THEN 4 WHEN audience_name = 'rng_installs__casino_trg_72510' AND scores_raw[0] < 0.45 THEN 5 WHEN audience_name = 'rng_installs__casino_trg_72510' AND scores_raw[0] < 0.5 THEN 6 WHEN audience_name = 'rng_installs__casino_trg_72510' AND scores_raw[0] < 0.55 THEN 7 WHEN audience_name = 'rng_installs__casino_trg_72510' AND scores_raw[0] < 0.62 THEN 8 WHEN audience_name = 'rng_installs__casino_trg_72510' AND scores_raw[0] < 0.7 THEN 9 WHEN audience_name = 'rng_installs__casino_trg_72510' AND scores_raw[0] < 1.01 THEN 10 ELSE 0 END) as float)), array('rng_installs__all_trg_72510', 'rng_installs__auto_trg_72510', 'rng_installs__finance_trg_72510', 'rng_installs__estate_trg_72510', 'rng_installs__soc_med_trg_72510', 
'rng_installs__travel_trg_72510', 'rng_installs__sport_trg_72510', 'rng_installs__leisure_trg_72510', 'rng_installs__dating_trg_72510', 'rng_installs__games_trg_72510', 'rng_installs__other_trg_72510', 'rng_installs__education_trg_72510', 'rng_installs__work_trg_72510', 'rng_installs__fmcg_trg_72510', 'rng_installs__potency_trg_72510', 'rng_installs__medicine_trg_72510', 'rng_installs__allclothes_trg_72510', 'rng_installs__jewelry_trg_72510', 'rng_installs__telecom_trg_72510', 'rng_installs__photovideo_trg_72510', 'rng_installs__forbsn_trg_72510', 'rng_installs__animals_trg_72510', 'rng_installs__technics_trg_72510', 'rng_installs__kids_e_comm_trg_72510', 'rng_installs__casino_trg_72510'))"
    },
    "shuffle_partitions": 1000,
    "force": false,
    "status_urls": [
      "http://dmcontrol.host/api/compose/jobrun/1182405/"
    ],
    "output_urls": [
      "http://dmcontrol.host/api/compose/job/1142695/result/"
    ],
    "input_urls": [
      null
    ],
    "log_url": "http://dmcontrol.host/api/compose/task/937183/log/",
    "ctid": "prod_937183"
  }
"""

In [71]:
def prepare_config(config):
    """Resolve the Hive partitions to read and store them in the task config.

    Reads ``source_db``, ``source_table`` and ``target_dt`` from ``config`` and
    removes ``source_partition_conf`` and ``max_dt_diff`` from it (both are
    consumed here). Partitions are looked up with ``FindPartitionsEngine.find``
    between ``add_days(target_dt, -max_dt_diff)`` and ``target_dt``.

    :param config: task configuration dict; it is modified in place.
    :return: the same ``config`` object with ``source_partitions`` set.
    :raises MissingDepsStatusException: if the lookup result is empty (falsy).
    """
    finder = FindPartitionsEngine(raise_on_invalid_table=False)
    database, table_name = config["source_db"], config["source_table"]
    partition_spec = config.pop("source_partition_conf")
    newest_dt = config["target_dt"]
    oldest_dt = add_days(newest_dt, -config.pop("max_dt_diff"))

    found = finder.find(database, table_name, partition_spec, oldest_dt, newest_dt)
    if found:
        config["source_partitions"] = found
        return config

    # Nothing to export from: report the missing dependency to the caller.
    raise MissingDepsStatusException(
        "Not found source partitions in Hive table {}.{}".format(database, table_name)
    )


In [72]:
# Parse the raw JSON job description into a dict.
app_config_dict = json.loads(app_config)
# log(app_config_dict["export_columns"]["feature"])
# prepare_config modifies app_config_dict in place and returns that same dict,
# so task_config and app_config_dict refer to one object afterwards.
task_config = prepare_config(app_config_dict)

In [30]:
# log((len(task_config["source_partitions"]), task_config["source_partitions"],))

In [90]:
# Source coordinates taken from the prepared task config.
source_db, source_table, source_partitions = (
    task_config[name] for name in ("source_db", "source_table", "source_partitions")
)

# select_clause presumably renders the SELECT over the chosen partitions
# (its result is passed straight to sql_ctx.sql); the DataFrame is marked for caching.
source_query = select_clause(
    database=source_db,
    table=source_table,
    partition_dicts=source_partitions,
)
df = sql_ctx.sql(source_query).persist()

In [None]:
# Print the schema of df and its first rows without truncating cell values.
df.printSchema()
df.show(truncate=False)

In [76]:
# Column expressions to export, as configured for the task.
export_columns = task_config["export_columns"]
# Optional entry: None when "bu_link_id" is not present in the config.
bu_link_id = export_columns.get("bu_link_id")

In [77]:
# Grouping keys: the "user" key is always present, "bu_link_id" only when configured.
user_key = sqlfn.expr(
    "uid2user({}, {})".format(export_columns["uid"], export_columns["uid_type"])
).alias("user")

key_columns = [user_key]
if bu_link_id:
    # The configured expression is cast to string before being used as a key.
    key_columns.append(sqlfn.expr(bu_link_id["expr"]).cast("string").alias("bu_link_id"))
# log(key_columns)

In [78]:
# Keep only rows where every key expression evaluates to a non-NULL value.
for key in key_columns:
    df = df.filter(key.isNotNull())

In [79]:
# df.show()

In [80]:
# One row per key tuple; the aggregate expression itself comes from the task config.
feature_agg = sqlfn.expr(export_columns["feature"]).alias("feature")
df = df.groupBy(*key_columns).agg(feature_agg)

In [81]:
# Mark df for caching (default storage level) so repeated actions on it
# do not recompute its lineage.
df = df.persist()

In [None]:
# Print the schema of df and its first rows without truncating cell values.
df.printSchema()
df.show(truncate=False)

In [None]:
# Spark data type of the "feature" column.
feature_type = df.schema["feature"].dataType
# Drop every row that has a NULL in any column (same as DataFrame.dropna()).
df = df.na.drop()
df.show(truncate=False)

In [85]:
# SQL predicate template for an array column; `{col}` is filled in with str.format.
# It requires a positive size and no element that is NULL or NaN.
arr_filter = "size({col}) > 0 and not exists({col}, _x -> isnull(_x) or isnan(_x))"

In [86]:
# array<float>
# Apply the array predicate template to the "feature" column.
feature_filter = arr_filter.format(col="feature")
df = df.filter(feature_filter)
df.show()

+----+----------+-------+
|user|bu_link_id|feature|
+----+----------+-------+
+----+----------+-------+



In [89]:
# Upper bound on the number of elements allowed in the "feature" collection.
max_collection_size = 100
size_filter = "size(feature) <= {}".format(max_collection_size)
df = df.filter(size_filter)
df.show()

+----+----------+-------+
|user|bu_link_id|feature|
+----+----------+-------+
+----+----------+-------+



In [None]:
# check exported CSV

In [7]:
# Directory with the exported CSV files to inspect; it is read with an "hdfs:" prefix.
csv_path = "/export/target/universal_features/rng_installs_predict/2021-10-13/"

In [8]:
# Load every *.csv file under csv_path: semicolon-delimited, first line is the header.
csv_reader = spark.read.option("header", True).option("delimiter", ";")
csv_df = csv_reader.csv("hdfs:" + csv_path + "*.csv")

In [9]:
# Print the column names and types read from the CSV header.
csv_df.printSchema()

root
 |-- user: string (nullable = true)
 |-- link_type:AggBannerTopic: string (nullable = true)
 |-- num:rng_installs_predict: string (nullable = true)



In [10]:
# Show the first rows of the exported data without truncating cell values.
csv_df.show(truncate=False)

+-----------------------------------------+------------------------+------------------------+
|user                                     |link_type:AggBannerTopic|num:rng_installs_predict|
+-----------------------------------------+------------------------+------------------------+
|gaid:d75da36c-7e58-4380-b105-69268dd10024|18                      |4.0                     |
|gaid:cef4142f-79a7-42b9-b577-944577ed4b63|4                       |7.0                     |
|gaid:19ca186c-8680-4a6a-93ff-b37032ae4686|7                       |6.0                     |
|gaid:bc053943-498f-49da-b3b3-c72d9dc826ba|18                      |2.0                     |
|gaid:44e461ad-76da-482a-b6f2-e7c069820dac|12                      |1.0                     |
|gaid:ffc7d51a-46f5-4e50-a89d-19c948bebc9c|8                       |1.0                     |
|gaid:1f20eaf2-2124-482c-98c2-787789523ef5|21                      |4.0                     |
|gaid:1e555bec-bc28-4037-9fec-8ef76ed12dea|9                

In [18]:
# Give the exported columns plain names so they can be referenced in a SQL expression.
renamed_csv = csv_df.select(
    sqlfn.col("user"),
    sqlfn.col("link_type:AggBannerTopic").alias("banner"),
    sqlfn.col("num:rng_installs_predict").alias("score"),
)

# Collapse to one row per user; user_dmdesc.collect is not defined in this notebook
# (presumably a registered aggregate function -- TODO confirm).
collect_feature = sqlfn.expr(
    "user_dmdesc.collect(banner, cast(score as float)) as feature"
)
user_feature = renamed_csv.groupby("user").agg(collect_feature).persist()

In [19]:
# Print the schema of the per-user aggregate.
user_feature.printSchema()

root
 |-- user: string (nullable = true)
 |-- feature: map (nullable = true)
 |    |-- key: string
 |    |-- value: float (valueContainsNull = true)



In [20]:
# Show the first rows of the per-user aggregate without truncating cell values.
user_feature.show(truncate=False)

+-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user                                     |feature                                                                                                                                                                                                                                                                        |
+-----------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|gaid:000006e8-a85b-413f-adda-a9487783f178|[22 -> 1.

In [21]:
# Add a `size` column holding size(feature) for each user.
user_feature_size = user_feature.selectExpr("user", "feature", "size(feature) as size")

In [23]:
# Number of users whose feature has exactly 25 entries.
user_feature_size.where("size = 25").count()

136449244

In [24]:
# Number of users whose feature has fewer than 25 entries.
user_feature_size.where("size < 25").count()

0

In [25]:
# Number of users whose feature has more than 25 entries.
user_feature_size.where("size > 25").count()

0

In [54]:
# Check how Spark SQL evaluates and displays cast(0 as float).
spark.sql("select cast(0 as float)").show()

+----------------+
|CAST(0 AS FLOAT)|
+----------------+
|             0.0|
+----------------+



In [92]:
# Stop the Spark session created at the top of the notebook.
spark.stop()