In [1]:
from pprint import pformat
from timeit import default_timer as time
import os

# notes
- dm3 RAM limit = 16 GB
- cluster nodes RAM limit = 32 GB
- HdfsClient который через process.Popen(hadoop ...) жрет 300 мегабайт оперативки
```sh
# ps -xo uid,rss,pid,cmd #my processes only
ps -eo uid,rss,pid,cmd | grep FsShell #all processes
978 308844 21882 /etc/alternatives/jre_1.8.0/bin/java -Dproc_fs -Djava.net.preferIPv4Stack=true -Dyarn.log.dir=/usr/lib/hadoop/logs -Dyarn.log.file=hadoop.log -Dyarn.home.dir=/usr/lib/hadoop-yarn -Dyarn.root.logger=INFO,console -Djava.library.path=/usr/lib/hadoop/lib/native -Dhadoop.log.dir=/usr/lib/hadoop/logs -Dhadoop.log.file=hadoop.log -Dhadoop.home.dir=/usr/lib/hadoop -Dhadoop.id.str=jenkins-dm -Dhadoop.root.logger=INFO,console -Dhadoop.policy.file=hadoop-policy.xml -Dhadoop.security.logger=INFO,NullAppender org.apache.hadoop.fs.FsShell -stat /data/dm/Admining/logs/graph_filter/events/2021-05-26/_SUCCESS
```
- WebHdfsClient не взлетает, ему не хватает пакета hdfs, который по простому не ставится (pypi недоступен на dm3).

Поэтому используем схему с запуском команд hdfs параллельно на экзекуторах спарка, затолкав параметры команд в датафрейм или RDD

In [2]:
def log(obj, msg=""):
    if msg: print(msg)
    print("{}, value: `{}`".format(type(obj), pformat(obj)))

In [None]:
log(os.environ, "os.environ:")
log(dict(os.environ), "\ndict(od.environ):")

# ThreadPool mover

In [4]:
from prj.apps.utils.common.fs import HdfsClient
from luigi.contrib.hdfs import HdfsClient as LuigiHdfsClient, WebHdfsClient, SnakebiteHdfsClient

In [None]:
# !pip install hdfs
# !pip install -i http://nexus.k8s.trg.cloud.devmail.ru/repository/pypi-proxy/simple --trusted-host nexus.k8s.trg.cloud.devmail.ru hdfs
# !pip install -i http://pkg.trgqa.devmail.ru:8081/repository/pypi-proxy/simple/ --trusted-host pkg.trgqa.devmail.ru hdfs


In [5]:
hdfs = HdfsClient()
# hdfs = WebHdfsClient()
# hdfs = SnakebiteHdfsClient()

In [46]:
from multiprocessing.pool import ThreadPool

class MultithreadedHdfsOps(object):

    def __init__(self, hdfs_client, max_threads=128):
        self.hdfs = hdfs_client
        self.pool = ThreadPool(max_threads)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.pool.close()
        self.pool.join()

    def move(self, source, dest, callback=None):
        return self.pool.apply_async(self._move, args=(source, dest, ), callback=callback)
    
    def touchz(self, path):
        return self.pool.apply_async(self._touchz, args=(path, ))

    def _move(self, src, trg):
        return self.hdfs.move(src, trg)

    def _touchz(self, path):
        return self.hdfs.touchz(path)

In [7]:
home_dir = "hdfs:/user/vlk/"
experiment_root_dir = os.path.join(home_dir, "files_mover", "")

In [8]:
start = time()

hdfs.mkdir(
    path=os.path.join(experiment_root_dir, "files_src"), 
    parents=True, 
    raise_if_exists=False, 
    remove_if_exists=True  # only for prj version
)

print("mkdir in {} seconds".format(time() - start))  # mkdir in 6.10173201561 seconds

mkdir in 6.42792797089 seconds


In [47]:
start = time()
pool_size=48
num_files=48
files = [
    os.path.join(experiment_root_dir, "files_src", "file_n{}.txt".format("%05d" % i))
    for i in range(1, num_files+1)
]
print("ready to touchz {} files, prepared in {} seconds".format(len(files), time() - start))

ready to touchz 48 files, prepared in 0.000410079956055 seconds


In [48]:
start = time()
results = []  # multiprocessing.pool.ApplyResult object
with MultithreadedHdfsOps(hdfs, max_threads=pool_size) as mth_hdfs:
    for fn in files:
        results.append(mth_hdfs.touchz(fn))
print("# processed {} files in {} seconds, f/s: {}, pool size: {}, files: {}".format(
    len(files), 
    time() - start, 
    len(files) / float(time() - start),
    pool_size,
    num_files
))
errors = [r.get() for r in results if not r.successful()]
print("errors: {}".format(pformat(errors)))
# processed 40 files in 10.1468760967 seconds, f/s: 3.942, pool size: 40, files: 40

# processed 48 files in 13.1580049992 seconds, f/s: 3.64796804215, pool size: 48, files: 48
errors: []


# Spark RDD mover

In [5]:
import os
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from timeit import default_timer as time

In [6]:
from prj.apps.utils.common.fs import HdfsClient
# from luigi.contrib.hdfs.hadoopcli_clients import HdfsClient

In [18]:
import os
import sys
from pyspark.sql import SparkSession
from prj.sparkml.scoring import InverseVariabilityTransformer
from prj.sparkml.postprocessing import ScoreEqualizeTransformer
TMP_ENV_BASEDIR = "tmpenv"  # Reserved directory to store environment archive
env_dir = os.path.dirname(os.path.dirname(sys.executable))
env_name = os.path.basename(env_dir)
env_archive = "{basedir}/{env}.zip#{basedir}".format(basedir=TMP_ENV_BASEDIR, env=env_name)
os.environ["PYSPARK_PYTHON"] = "{}/{}/bin/python".format(TMP_ENV_BASEDIR, env_name)
# Pack executable prj conda environment into zip
# !rm -rf {TMP_ENV_BASEDIR} && mkdir {TMP_ENV_BASEDIR} && cd {TMP_ENV_BASEDIR} && rsync -a {env_dir} . && zip -rq {env_name}.zip {env_name}


In [80]:
cores = 48
executors = cores / 6
num_files = cores
home_dir = "hdfs:/user/vlk/"
experiment_root_dir = os.path.join(home_dir, "files_mover", "")

In [None]:
!export JAVA_HOME=/etc/alternatives/jre_1.8.0 && export PATH=${JAVA_HOME}/bin:$PATH
!java -version

notes = """
spark = SparkSession.builder\
    .master("yarn-client")\
    .appName("DM-AWESOME")\
    .config("spark.yarn.queue", "default")\
    .config("spark.executor.instances", "2")\
    .config("spark.executor.memory", "2G")\
    .config("spark.executor.cores", "2")\
    .config("spark.yarn.executor.memoryOverhead", "2G")\
    .config("spark.dynamicAllocation.enabled", "true")\
    .config("spark.dynamicAllocation.executorIdleTimeout", "300s")\
    .config("spark.dynamicAllocation.maxExecutors", "200")\
    .config("spark.network.timeout", "800s")\
    .config("spark.reducer.maxReqsInFlight", "10")\
    .config("spark.shuffle.io.retryWait", "60s")\
    .config("spark.shuffle.io.maxRetries", "10")\
    .config("spark.sql.shuffle.partitions", "2000")\
    .config("spark.driver.memory", "2G")\
    .config("spark.driver.maxResultSize", "2G")\
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")\
    .config("spark.kryoserializer.buffer.max", "1024m")\
    .config("hive.exec.dynamic.partition", "true")\
    .config("hive.exec.dynamic.partition.mode", "nonstrict")\
    .config("hive.exec.max.dynamic.partitions", "10000")\
    .config("hive.exec.max.dynamic.partitions.pernode", "10000")\
    .config("spark.jars", "hdfs:/lib/dm/prj-transformers-assembly-dev-0.6.0.jar")\
    .config("spark.yarn.dist.archives", env_archive)\
    .getOrCreate()
"""

In [8]:
# spark = SparkSession.builder.master("yarn-client").appName("DM-AWESOME").getOrCreate()

In [9]:
# spark.stop()

In [22]:
conf = {
    "spark.app.name": "DM-7687-export_hdfs_test",
    "spark.yarn.queue": "dev.priority",
    "spark.master": "yarn-client",
    "spark.submit.deployMode": "cluster",
    "spark.memory.offHeap.enabled": "false",
    "spark.memory.offHeap.size": "1G",
    "spark.default.parallelism": cores,
    "spark.sql.shuffle.partitions": cores,
    "spark.driver.memory": "1g",
    "spark.driver.memoryOverhead": "1g",
    "spark.executor.cores": cores/executors,
    "spark.executor.instances": executors,
    "spark.executor.memory": "4g",
    "spark.dynamicAllocation.enabled": "true",
    "spark.dynamicAllocation.maxExecutors": executors*2,
    "spark.dynamicAllocation.minExecutors": executors/2,
    "spark.dynamicAllocation.cachedExecutorIdleTimeout": "30s",
    "spark.speculation": "true",
    "hive.exec.dynamic.partition": "true",
    "hive.exec.dynamic.partition.mode": "nonstrict",
    "spark.yarn.maxAppAttempts": "1",
    "spark.jars": "hdfs:/lib/dm/prj-transformers-assembly-dev-1.1.0.jar",
    "spark.yarn.dist.archives": env_archive,
    "spark.ui.enabled": "true",
    #spark.eventLog.dir=hdfs://hacluster/user/spark/applicationHistory
    #spark.driver.extraClassPath=/etc/hive/conf
    #spark.executor.extraClassPath=/etc/hive/conf
    "spark.eventLog.enabled": "true",
    #spark.executorEnv.PYTHONPATH=/usr/lib/spark/python/lib/py4j-current-src.zip:/usr/lib/spark/python/:<CPS>{{PWD}}/pyspark.zip<CPS>{{PWD}}/py4j-0.10.7-src.zip
    "spark.hadoop.mapred.output.compress": "false",
    "spark.hadoop.mapred.output.compression.codec": "org.apache.hadoop.io.compress.GzipCodec",
    #spark.history.fs.logDirectory=hdfs://hacluster/user/spark/applicationHistory
    #spark.jars.ivySettings=/usr/local/etc/ivysettings.xml
    #spark.jars.repositories=http://artifactory.hp.rbdev.mail.ru/artifactory/list/maven-mirror
    "spark.logConf": "true",
    "spark.network.timeout": "240s",
    #spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS=rbhp-control1.rbdev.mail.ru,rbhp-control2.rbdev.mail.ru
    #spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES=http://rm.hadoop.rbdev.mail.ru/proxy/application_1626104146526_35341,http://rm.hadoop.rbdev.mail.ru/proxy/application_1626104146526_35341
    #spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.RM_HA_URLS=rm.hadoop.rbdev.mail.ru,rm.hadoop.rbdev.mail.ru
    #spark.port.maxRetries=64
    "spark.rdd.compress": "true",
    #spark.security.credentials.hbase.enabled=false
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    #spark.serializer.objectStreamReset=100
    "spark.shuffle.service.enabled": "true",
    "spark.sql.broadcastTimeout": "2400",
    "spark.sql.catalogImplementation": "hive",
    "spark.sql.hive.metastore.jars": "/usr/lib/hive/lib/*:/usr/lib/hadoop/client/*",
    "spark.sql.hive.metastore.version": "2.1.1",
    "spark.sql.orc.compression.codec": "zlib",
    #spark.sql.warehouse.dir=hdfs://hacluster/user/hive/warehouse
    #spark.ui.filters=org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter
    #spark.ui.port=0
    "spark.yarn.am.waitTime": "600s",
    #spark.yarn.app.container.log.dir=/data/disk2/yarn/logs/application_1626104146526_35341/container_e973_1626104146526_35341_01_000001
    #spark.yarn.app.id=application_1626104146526_35341
    #spark.yarn.historyServer.address=http://rbhp-control5.rbdev.mail.ru:18080
    #spark.yarn.isPython=true
}

In [23]:
start = time()
spark = SparkSession.builder.config(conf=SparkConf().setAll(conf.items())).getOrCreate()
print("got spark session in {} seconds".format(time() - start))

got spark session in 143.72496891 seconds


In [79]:
def actual_cores(spark):
    return (
        len([executor.host() for executor in spark._jsc.sc().statusTracker().getExecutorInfos()]) * 
        int(spark.sparkContext.getConf().get("spark.executor.cores", "4"))
    )

In [98]:
num_files = 1000  # cores
files = [
    (
        os.path.join(experiment_root_dir, "files_src", "file_n{}.txt".format("%05d" % i)),
        os.path.join(experiment_root_dir, "files_trg", "file_n{}.txt".format("%05d" % i))
    )
    for i in range(1, num_files+1)
]
# log(files)

In [97]:
hdfs = HdfsClient()
hdfs.remove(
    os.path.join(experiment_root_dir, "files_trg", ""), 
    recursive=True, 
    skip_trash=True, 
    force=True
)

In [99]:
start = time()
results = []  # multiprocessing.pool.ApplyResult object
with MultithreadedHdfsOps(hdfs, max_threads=cores) as mth_hdfs:
    for fn, _ in files:
        results.append(mth_hdfs.touchz(fn))
print("# processed {} files in {} seconds, f/s: {}, pool size: {}".format(
    len(files), 
    time() - start, 
    len(files) / float(time() - start),
    cores
))
errors = [r.get() for r in results if not r.successful()]
print("errors: {}".format(pformat(errors)))

# processed 1000 files in 422.594970942 seconds, f/s: 2.36633197073, pool size: 48
errors: []


In [100]:
from luigi.contrib.hdfs.error import HDFSCliError
def move_files(split, iter):
    hdfs = HdfsClient()
    for src, trg in iter:
        retcode = 0
        try:
            out = hdfs.move(src, trg)
        except HDFSCliError as e:
            out = e.message
            retcode = e.returncode
        yield (
            src,
            trg,
            retcode,
            out,
        )

In [101]:
start = time()
rdd = spark.sparkContext.parallelize(files, num_files)
print("Cores before job: {}".format(actual_cores(spark)))
results = rdd.mapPartitionsWithIndex(move_files).collect()
print("# processed {} files in {} seconds, f/s: {}, cores at the end: {}".format(
    len(files), 
    time() - start, 
    len(files) / float(time() - start),
    actual_cores(spark)
))

Cores before job: 30
# processed 1000 files in 258.257082939 seconds, f/s: 3.87211062714, cores at the end: 102


In [102]:
success = []
fail = []
for src, trg, retcode, out in results:
    if retcode == 0:
        success.append(src)
    else:
        fail.append(src)
        log("Failed to move file: `{}`, retcode: {},\nmessage: `{}`".format(src, retcode, out))

In [103]:
if len(success) != len(files):
    log("Operation failed, files:\n{}".format(pformat(fail)))

In [59]:
# log(results)
errs = """
('hdfs:/user/vlk/files_mover/files_src/file_n00001.txt',
  'hdfs:/user/vlk/files_mover/files_trg/file_n00001.txt',
  0,
  None
),

('hdfs:/user/vlk/files_mover/files_src/file_n00001.txt',
  'hdfs:/user/vlk/files_mover/files_trg/file_n00001.txt',
  1,
  "Command ['hadoop', 'fs', '-mv', 'hdfs:/user/vlk/files_mover/files_src/file_n00001.txt', 'hdfs:/user/vlk/files_mover/files_trg/file_n00001.txt'] failed [exit code 1]\n---stdout---\n\n---stderr---\nmv: `hdfs:///user/vlk/files_mover/files_trg/file_n00001.txt': File exists\n------------"
)
  
"""

In [None]:
print("results: {} {}\n{}".format(type(results), type(results[0]), pformat(results)))

In [None]:
sc = spark._jsc.sc()
executors = [executor.host() for executor in sc.statusTracker().getExecutorInfos()]
log(executors)

In [75]:
executor_cores = spark.sparkContext.getConf().get("spark.executor.cores", "4")
log(executor_cores)

<type 'unicode'>, value: `u'6'`


actual cores: 30


In [105]:
spark.stop()

In [33]:

import datetime

from operator import and_
from collections import defaultdict

import six
import luigi
import pyspark.sql.functions as sqlfn

from pyspark.sql.types import MapType, ArrayType, FloatType, StringType, NumericType
from luigi.contrib.hdfs import HdfsTarget

from prj.apps.utils.common import DT_FORMAT
from prj.apps.utils.common.fs import HdfsClient
from prj.apps.utils.common.hive import select_clause, find_partitions

if six.PY3:
    from functools import reduce  # make flake8 happy

DEFAULT_MAX_DT_DIFF = 60

In [62]:
import json
import itertools as it

import six

from luigi.contrib.hive import run_hive_cmd

from dmcore.utils.common import to_str
from prj.common.hive import HiveMetastoreClient


In [85]:

def expand_partitions(**partition_conf):
    """Convert a partition configuration to a list of partition dicts via Cartesian product.

    :param partition_conf: key is a partition field name, value is one or several partition values.
        All values are converted to strings to meet Hive metastore response format.

    :return: a list of dicts, partition descriptions.
    :rtype: list[dict[str,str]].

    :Example:

    >>> expand_partitions(dt="2020-01-01", uid_type=["VKID", "OKID", "EMAIL"])
    [
        {"dt": "2020-01-01", uid_type: "VKID"},
        {"dt": "2020-01-01", uid_type: "OKID"},
        {"dt": "2020-01-01", uid_type: "EMAIL"}
    ]
    >>> expand_partitions()  # One empty partition spec
    [{}]
    >>> expand_partitions(foo="foo", bar=[])  # Empty Cartesian product
    []
    """
    return [
        dict(zip(partition_conf.keys(), map(to_str, values)))
        for values in it.product(*(v if isinstance(v, (list, tuple, set)) else [v] for v in partition_conf.values()))
    ]


In [86]:

def find_partitions(database, table, partition_conf, min_dt=None, max_dt=None, match_mode="all"):
    """Get a list of Hive table partitions with a given configuration.

    .. note:: A given Hive table is assumed to have "dt" partitioning column.

    :param str database: a database name.
    :param str table: a table name.
    :param partition_conf: a dictionary with partitions configuration, where key is a partition column name, and value
        is either a partition column value, or a list of values. A special partitioning column is "dt", for which a key
        is optional.
        If "dt" is missing, then the last date value that matches ``partition_conf`` is selected.
        If "dt" has a single value, then only partitions that match the rest ``partition_conf`` for this date are
        selected.
        If "dt" has multiple values, then only partitions that match the rest ``partition_conf`` for at least one of
        the dates in a list are selected.
        There are supported several matching modes, see ``match_mode`` parameter.
    :type partition_conf: dict[str, typing.Union[str, list[str]]].

    :param str min_dt: (optional) a date, for example, "2018-10-01". It is used only when ``dt`` is not set.
        In this case the last date value >= ``min_dt`` that matches ``partition_conf`` is selected.
    :param str max_dt: (optional) a date, for example, "2018-10-10". It is used only when ``dt`` is not set.
        In this case the last date value <= ``max_dt`` that matches ``partition_conf`` is selected.

    :param str match_mode: specifies, how a date is considered to match ``partition_conf`` (here we consider this
        configuration without "dt" key). Possible values are:

        - "all" (default), a date matches ``partition_conf`` when and only when all partitions from ``partition_conf``
            Cartesian product exist for that date.
        - "any", a date matches ``partition_conf``, if there exists at least one partition from ``partition_conf``
            Cartesian product for that date.

    :return: a list of dicts, partition descriptions.
    :rtype: list[dict[str,str]].

    :Example:

    >>> find_partitions("ds_auditories", "xlal_sample", {"audience_name": "12345", "uid_type": ["VKID", "OKID"]})
    [
        {"audience_name": "12345", "category": "positive", "dt": "2020-04-20", "uid_type": "VKID"},
        {"audience_name": "12345", "category": "positive", "dt": "2020-04-20", "uid_type": "OKID"},
        {"audience_name": "12345", "category": "negative", "dt": "2020-04-20", "uid_type": "VKID"},
        {"audience_name": "12345", "category": "negative", "dt": "2020-04-20", "uid_type": "OKID"}
    ]
    >>> find_partitions("cdm_scoring", "specified_socdem", {"uid_type": "IDFA", "dt": ["1999-01-01", "2020-11-17"]})
    [{"dt": "2020-11-17", "uid_type": "IDFA"}]
    >>> find_partitions("ds_auditories", "xlal_sample", {"audience_name": "12345", "uid_type": []})
    []
    """
    if match_mode not in {"all", "any"}:
        raise ValueError("Unsupported match_mode='{}'".format(match_mode))

    hive_client = HiveMetastoreClient()

    if "dt" not in hive_client.get_partition_names(database=database, table=table):
        raise TypeError("A table {}.{} has no 'dt' partitioning column".format(database, table))

    dt = partition_conf.get("dt")
    existing_partitions = hive_client.get_partitions(database=database, table=table)
    log(existing_partitions, "table partitions:")
    expanded_conf_partitions = expand_partitions(**{k: v for k, v in six.iteritems(partition_conf) if k != "dt"})
    log(expanded_conf_partitions, "expanded partitions:")

    if len(expanded_conf_partitions) == 0:
        log(None, "expanded partitions list is empty:")
        return []

    if dt is None:
        dts_sets = (
            set(ep["dt"] for ep in existing_partitions if set(p.items()).issubset(ep.items()))
            for p in expanded_conf_partitions
        )
        dts_sets = list(dts_sets)
        log(dts_sets, "list of dt sets:")
        
        if match_mode == "all":
            dts = set.intersection(*dts_sets)
        else:
            dts = set.union(*dts_sets)

        if min_dt or max_dt:
            dts = [dt for dt in dts if (min_dt is None or min_dt <= dt) and (max_dt is None or dt <= max_dt)]

        if not dts:
            log(None, "dt list not found:")
            return []
        else:
            dts = [max(dts)]

    else:
        dts = [dt] if isinstance(dt, six.string_types) else dt

    log(dts, "list of dt:")
    result_partitions = []

    for dt in dts:
        dt_partitions = []

        for p in expanded_conf_partitions:
            p = dict(dt=dt, **p)
            matched_existing_partitions = [ep for ep in existing_partitions if set(p.items()).issubset(ep.items())]
            log(p, "needed partition:")
            log(matched_existing_partitions, "matched partitions:")

            if (match_mode == "all") and not matched_existing_partitions:
                dt_partitions = []
                break

            dt_partitions.extend(matched_existing_partitions)

        result_partitions.extend(dt_partitions)

    return result_partitions


In [97]:
class ExportUniversalFeatureApp(object):

    config_example = {
        "target_dt": "2021-06-04",
        "max_dt_diff": 30,  # optional
        "source": {
            "db": "ds_scoring",
            "table": "dm_universal_feature",
            "partitions": [
                {"feature_name": "user_app_cats_installed", "uid_type": "GAID"},
                {"feature_name": "user_app_cats_installed", "uid_type": "IDFA"},
                {"feature_name": "dm8792_showed_urls", "uid_type": "VKID"},
            ],
        },
    }

    def prepare_config(self, config):
        # replace declared partitions list with list of existing partitions
        def make_partition_conf(plist):
            p_conf = defaultdict(set)
            for p in plist:
                for k, v in six.iteritems(p):
                    p_conf[k].add(v)
            return p_conf
    
        def dt_minus(dt, days):
            date = datetime.datetime.strptime(dt, DT_FORMAT).date()
            delta = datetime.timedelta(days=int(days))
            return (date - delta).isoformat()
    
        target_dt = config["target_dt"]
        min_dt = dt_minus(target_dt, config.get("max_dt_diff", DEFAULT_MAX_DT_DIFF))
        log(min_dt, "min dt:")
        source_conf = config["source"]
        partition_conf = make_partition_conf(source_conf.get("partitions", []))
        log(partition_conf, "partition conf dict:")
        source_conf["partitions"] = find_partitions(
            source_conf["db"], source_conf["table"], partition_conf, min_dt=min_dt, max_dt=target_dt, match_mode="any"
        )
    
        return config


In [None]:
app = ExportUniversalFeatureApp()
cfg = app.prepare_config(app.config_example)
log(cfg, "actual cfg:")

In [5]:
import os
import sys

from pyspark.sql import SparkSession

from prj.sparkml.scoring import InverseVariabilityTransformer
from prj.sparkml.postprocessing import ScoreEqualizeTransformer

TMP_ENV_BASEDIR = "tmpenv"  # Reserved directory to store environment archive
env_dir = os.path.dirname(os.path.dirname(sys.executable))
env_name = os.path.basename(env_dir)
env_archive = "{basedir}/{env}.zip#{basedir}".format(basedir=TMP_ENV_BASEDIR, env=env_name)
os.environ["PYSPARK_PYTHON"] = "{}/{}/bin/python".format(TMP_ENV_BASEDIR, env_name)

In [None]:
# Pack executable prj conda environment into zip
!rm -rf {TMP_ENV_BASEDIR} && mkdir {TMP_ENV_BASEDIR} && cd {TMP_ENV_BASEDIR} && rsync -a {env_dir} . && zip -rq {env_name}.zip {env_name}


In [None]:
!ls -la {TMP_ENV_BASEDIR}

In [7]:
# Create Spark session with prj conda environment and JVM extensions
spark = SparkSession.builder\
    .master("yarn-client")\
    .appName("DM-AWESOME")\
    .config("spark.yarn.queue", "default")\
    .config("spark.executor.instances", "2")\
    .config("spark.executor.memory", "2G")\
    .config("spark.executor.cores", "2")\
    .config("spark.yarn.executor.memoryOverhead", "2G")\
    .config("spark.dynamicAllocation.enabled", "true")\
    .config("spark.dynamicAllocation.executorIdleTimeout", "300s")\
    .config("spark.dynamicAllocation.maxExecutors", "200")\
    .config("spark.network.timeout", "800s")\
    .config("spark.reducer.maxReqsInFlight", "10")\
    .config("spark.shuffle.io.retryWait", "60s")\
    .config("spark.shuffle.io.maxRetries", "10")\
    .config("spark.sql.shuffle.partitions", "2000")\
    .config("spark.driver.memory", "2G")\
    .config("spark.driver.maxResultSize", "2G")\
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")\
    .config("spark.kryoserializer.buffer.max", "1024m")\
    .config("hive.exec.dynamic.partition", "true")\
    .config("hive.exec.dynamic.partition.mode", "nonstrict")\
    .config("hive.exec.max.dynamic.partitions", "10000")\
    .config("hive.exec.max.dynamic.partitions.pernode", "10000")\
    .config("spark.jars", "hdfs:/lib/dm/prj-transformers-assembly-dev-0.6.0.jar")\
    .config("spark.yarn.dist.archives", env_archive)\
    .getOrCreate()

In [8]:
spark.stop()

In [4]:
spark.sql("CREATE TEMPORARY FUNCTION gmin as 'prj.hive.udaf.features.GenericMinUDAF'")
spark.sql("CREATE TEMPORARY FUNCTION gmax as 'prj.hive.udaf.features.GenericMaxUDAF'")
spark.sql("CREATE TEMPORARY FUNCTION gavg as 'prj.hive.udaf.features.GenericAvgUDAF'")
spark.sql("CREATE TEMPORARY FUNCTION gsum as 'prj.hive.udaf.features.GenericSumUDAF'")
spark.sql("CREATE TEMPORARY FUNCTION most_freq AS 'prj.hive.udaf.features.GenericMostFreqUDAF'")

DataFrame[]

In [7]:
from pyspark.sql import functions as sqlfn
text = """
            |a  |OKID    |3_days            |ANTIFRAUD_SHOW|[43.0, 50.0, 19.0] |1.2100051013643267
            |c  |VID     |3_days            |ANTIFRAUD_SHOW|[49.0, 30.0, 10.0] |0.9048109505117552
            |b  |VKID    |3_days            |ANTIFRAUD_SHOW|[40.0, 20.0, 15.0] |2.8761312813466950
            |d  |VID     |3_days            |FOO_BAR       |[23.0,,]           |0.0476731294622796     
        """
df = (
    spark.createDataFrame([
            tuple([item.strip() for item in row.split("|") if item.strip()]) for row in text.strip().split("\n")
    ]).toDF("uid", "uid_type", "aggregation_period", "audience_name", "counts", "expected")
).withColumn(
            "shows_counts",
            sqlfn.expr(r"split(regexp_replace(counts, '(\\[|\\])', ''), ',')")
).selectExpr(
            "uid", "uid_type", "aggregation_period", "audience_name",
            "cast(shows_counts as array<double>) as shows_counts",
            "cast(expected as double) as expected"
)
df.printSchema()
df.show()

root
 |-- uid: string (nullable = true)
 |-- uid_type: string (nullable = true)
 |-- aggregation_period: string (nullable = true)
 |-- audience_name: string (nullable = true)
 |-- shows_counts: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- expected: double (nullable = true)

+---+--------+------------------+--------------+------------------+------------------+
|uid|uid_type|aggregation_period| audience_name|      shows_counts|          expected|
+---+--------+------------------+--------------+------------------+------------------+
|  a|    OKID|            3_days|ANTIFRAUD_SHOW|[43.0, 50.0, 19.0]|1.2100051013643267|
|  c|     VID|            3_days|ANTIFRAUD_SHOW|[49.0, 30.0, 10.0]|0.9048109505117552|
|  b|    VKID|            3_days|ANTIFRAUD_SHOW|[40.0, 20.0, 15.0]| 2.876131281346695|
|  d|     VID|            3_days|       FOO_BAR|          [23.0,,]|0.0476731294622796|
+---+--------+------------------+--------------+------------------+-----------------

In [8]:
from prj.sparkml.scoring import InverseVariabilityTransformer
model = InverseVariabilityTransformer(
    inputCol="shows_counts",
    groupColumns="audience_name, uid_type, aggregation_period",
    weightValue=0.01
).fit(df)
result = model.setOutputCol("score").transform(df)
result.printSchema()
result.show()

root
 |-- uid: string (nullable = true)
 |-- uid_type: string (nullable = true)
 |-- aggregation_period: string (nullable = true)
 |-- audience_name: string (nullable = true)
 |-- shows_counts: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- expected: double (nullable = true)
 |-- score: double (nullable = true)

+---+--------+------------------+--------------+------------------+------------------+-------------------+
|uid|uid_type|aggregation_period| audience_name|      shows_counts|          expected|              score|
+---+--------+------------------+--------------+------------------+------------------+-------------------+
|  a|    OKID|            3_days|ANTIFRAUD_SHOW|[43.0, 50.0, 19.0]|1.2100051013643267| 1.2100051013643267|
|  c|     VID|            3_days|ANTIFRAUD_SHOW|[49.0, 30.0, 10.0]|0.9048109505117552| 0.9048109505117552|
|  b|    VKID|            3_days|ANTIFRAUD_SHOW|[40.0, 20.0, 15.0]| 2.876131281346695|  2.876131281346695|
|  d|     VID|

In [9]:
df = spark.createDataFrame([
                ("a", 0.3,  0.1,  0.0),
                ("b", 0.7,  3.14, 0.27968233),
                ("c", 13.0, 26.0, 0.74796144),
                ("d", 17.0, 28.0, 1.0),
                ("e", 27.0, 15.0, 0.4982242)
            ]).toDF("uid", "score_raw_train", "score_raw", "expected")
df.printSchema()
df.show()

root
 |-- uid: string (nullable = true)
 |-- score_raw_train: double (nullable = true)
 |-- score_raw: double (nullable = true)
 |-- expected: double (nullable = true)

+---+---------------+---------+----------+
|uid|score_raw_train|score_raw|  expected|
+---+---------------+---------+----------+
|  a|            0.3|      0.1|       0.0|
|  b|            0.7|     3.14|0.27968233|
|  c|           13.0|     26.0|0.74796144|
|  d|           17.0|     28.0|       1.0|
|  e|           27.0|     15.0| 0.4982242|
+---+---------------+---------+----------+



In [12]:
from prj.sparkml.postprocessing import ScoreEqualizeTransformer
model = ScoreEqualizeTransformer(
  inputCol="score_raw_train",
  groupColumns="",
  sampleSize=100000,
  numBins=10000,
  noiseValue=1e-4,
  epsValue=1e-3,  
  randomValue=0.5
).fit(df)
result = model.setInputCol("score_raw").setOutputCol("score").transform(df)
result.printSchema()
result.show()

root
 |-- uid: string (nullable = true)
 |-- score_raw_train: double (nullable = true)
 |-- score_raw: double (nullable = true)
 |-- expected: double (nullable = true)
 |-- score: double (nullable = true)

+---+---------------+---------+----------+------------------+
|uid|score_raw_train|score_raw|  expected|             score|
+---+---------------+---------+----------+------------------+
|  a|            0.3|      0.1|       0.0|               0.0|
|  b|            0.7|     3.14|0.27968233|0.2796823308046389|
|  c|           13.0|     26.0|0.74796144| 0.747961435953159|
|  d|           17.0|     28.0|       1.0|               1.0|
|  e|           27.0|     15.0| 0.4982242| 0.498224197874749|
+---+---------------+---------+----------+------------------+

