# TRG-73014 UniversalFeatures export compare
Сравнение результатов оригинального экспорта и нового (Spark3, Python3), rbhp-dm3 conda3

In [1]:
import os
import sys
from pprint import pformat

In [None]:
# Pay attention to log(sys.version_info), you have to `conda-pack` the same version of python

def log(obj, msg=""):
    if msg: print(msg)
    print("type: {}\ndata: {}".format(type(obj), pformat(obj, indent=1, width=1)))

log(os.environ, "os.environ")
print()
log(dict(os.environ), "dict(os.environ)")
print()
log(sys.version_info)

In [3]:
_env_example = """
export SPARK_HOME=/usr/lib/spark3
export PYTHONPATH=/usr/lib/spark3/python/lib/py4j-current-src.zip:/usr/lib/spark3/python
export PYSPARK_PYTHON=/usr/bin/python3
export PYSPARK_DRIVER_PYTHON=/usr/bin/python3
export PATH=/etc/alternatives/jre_1.8.0/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/data/anaconda3/bin:/data/anaconda2/bin
"""

for path in [
    "/data/anaconda2/bin",
    "/usr/lib/spark/python", 
    "/usr/lib/spark/python/lib/py4j-current-src.zip"
]:
    try:
        sys.path.remove(path)
    except:
        pass

sys.path.append("/usr/lib/spark3/python/lib/py4j-current-src.zip")
sys.path.append("/usr/lib/spark3/python")
os.environ["PATH"] = f"/usr/lib/spark3/:/usr/lib/spark3/python/lib/py4j-current-src.zip:/data/anaconda3/bin:{os.environ.get('PATH')}"

os.environ["HTTPS_PROXY"] = "http://rbhp-proxy.i:3128"
os.environ["SPARK_HOME"] = "/usr/lib/spark3"
del os.environ["PYSPARK_PYTHON"]
del os.environ["PYTHONPATH"]

In [4]:
# prepare env on host

# https://databricks.com/blog/2020/12/22/how-to-manage-python-dependencies-in-pyspark.html
# https://conda.github.io/conda-pack/spark.html
# !export HTTPS_PROXY=http://rbhp-proxy.i:3128
_script="""
 export HTTPS_PROXY=http://rbhp-proxy.i:3128
 which conda
 conda -V
 conda create -y -n pyspark_conda_env python=3.7
 conda activate pyspark_conda_env
 conda install conda-pack
 conda pack -f -o pyspark_conda_env.tar.gz
"""

In [5]:
# https://conda.github.io/conda-pack/spark.html
# !export PYSPARK_PYTHON=./environment/bin/python
os.environ["PYSPARK_PYTHON"] = "./environment/bin/python"
# $ PYSPARK_PYTHON=./environment/bin/python \
# spark-submit \
# --master yarn \
# --deploy-mode cluster \
# --conf spark.yarn.appMasterEnv.PYSPARK_PYTHON=./environment/bin/python
# --archives pyspark_conda_env.tar.gz#environment \

# !export PYSPARK_DRIVER_PYTHON=/home/vlk/.conda/envs/pyspark_conda_env/bin/python
os.environ["PYSPARK_DRIVER_PYTHON"] = "/home/vlk/.conda/envs/pyspark_conda_env/bin/python"
# $ PYSPARK_DRIVER_PYTHON=`which python` \
# PYSPARK_PYTHON=./environment/bin/python \
# spark-submit \
# --master yarn \
# --deploy-mode client \
# --conf spark.yarn.appMasterEnv.PYSPARK_PYTHON=./environment/bin/python \
# --archives pyspark_conda_env.tar.gz#environment \

# .config("spark.yarn.dist.archives", "pyspark_conda_env.tar.gz#environment")
# .config("spark.yarn.appMasterEnv.PYSPARK_PYTHON", "./environment/bin/python")

In [6]:
# only for prj conda env
# Pack executable prj conda environment into zip

def _create_env_archive():
    import os
    import sys

    TMP_ENV_BASEDIR = "tmpenv"  # Reserved directory to store environment archive
    env_dir = os.path.dirname(os.path.dirname(sys.executable))
    env_name = os.path.basename(env_dir)
    os.environ["PYSPARK_PYTHON"] = "{}/{}/bin/python".format(TMP_ENV_BASEDIR, env_name)
    # you need this only once time!
    # !rm -rf {TMP_ENV_BASEDIR} && mkdir {TMP_ENV_BASEDIR} && cd {TMP_ENV_BASEDIR} && rsync -a {env_dir} . && zip -rq {env_name}.zip {env_name}
    return "{basedir}/{env}.zip#{basedir}".format(basedir=TMP_ENV_BASEDIR, env=env_name)
    
# env_archive = _create_env_archive()
# log(env_archive)

# .config("spark.yarn.dist.archives", env_archive)

In [7]:
import math
from pyspark.sql import SparkSession, SQLContext

In [None]:
# Create Spark session with [prj] conda environment and JVM extensions

# `spark-submit ... --driver-java-options "-Dlog4j.configuration=file:/home/vlk/driver_log4j.properties"`
# "spark.driver.extraJavaOptions", "-Xss10M"
# catalyst SO while building parts. filter expression
ejo = "-Dlog4j.configuration=file:/home/vlk/driver2_log4j.properties"

queue = "root.default"

# 1000 GB => 4000 part. => 8000 real part.
# 4 TB of data
# sssp = 4 * 4 * 1024
# 50 GB
# sssp = 50.0 * 4 * 2
# 10 GB
sssp = 10.0 * 4 * 2

spark = (
SparkSession.builder
    .master("yarn")
    .appName("TRG-73014-test-ipynb")
    .config("spark.yarn.dist.archives", "pyspark_conda_env.tar.gz#environment")
    .config("spark.yarn.appMasterEnv.PYSPARK_PYTHON", "./environment/bin/python")
    .config("spark.yarn.queue", queue)
    .config("spark.sql.shuffle.partitions", int(math.ceil(sssp)))
    .config("spark.driver.extraJavaOptions", ejo)
    .config("spark.executor.instances", "2")
    .config("spark.executor.memory", "8G")
    .config("spark.executor.cores", "6")
    .config("spark.executor.memoryOverhead", "2G")
    .config("spark.driver.memory", "4G")
    .config("spark.driver.maxResultSize", "1G")
    .config("spark.speculation", "true")
    .config("spark.dynamicAllocation.enabled", "true")
    .config("spark.dynamicAllocation.minExecutors", "2")
    .config("spark.dynamicAllocation.maxExecutors", "512")
    .config("spark.dynamicAllocation.executorIdleTimeout", "300s")
    .config("spark.network.timeout", "800s")
    .config("spark.reducer.maxReqsInFlight", "10")
    .config("spark.shuffle.io.retryWait", "60s")
    .config("spark.shuffle.io.maxRetries", "10")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "1024m")
    .config("spark.hadoop.hive.exec.dynamic.partition", "true")
    .config("spark.hadoop.hive.exec.dynamic.partition.mode", "nonstrict")
    .config("spark.hadoop.hive.exec.max.dynamic.partitions", "1000000")
    .config("spark.hadoop.hive.exec.max.dynamic.partitions.pernode", "100000")
    .config("spark.hadoop.hive.metastore.client.socket.timeout", "300s")
    .config("spark.ui.enabled", "true")
    .config("spark.sql.sources.partitionColumnTypeInference.enabled", "false")    
    .getOrCreate()
)
#     .config("spark.jars", "hdfs:/lib/dm/prj-transformers-assembly-dev-1.5.2.jar")

sql_ctx = SQLContext(spark.sparkContext)
(spark, sql_ctx)

In [6]:
# end of env. setup

In [9]:
import os
import numpy as np

import pprint
from pprint import pformat

import sys
import datetime

from operator import and_
from collections import defaultdict

import six
import pyspark.sql.functions as sqlfn

import json
import itertools as it

if six.PY3:
    from functools import reduce  # make flake8 happy

from pyspark.storagelevel import StorageLevel
from pyspark.sql import DataFrame, SQLContext
from pyspark.sql.types import (
    MapType, ArrayType, FloatType, DoubleType, StringType, StructType, IntegralType, IntegerType, NumericType
)
from pyspark.sql.utils import CapturedException
from pyspark.ml.wrapper import JavaWrapper

# import luigi
# from luigi.contrib.hdfs import HdfsTarget

In [6]:
# CustomUDFLibrary(spark, "hdfs:/lib/dm/prj-transformers-assembly-dev-1.5.0.jar").register_all_udf()

In [10]:
def show(df, message="dataframe", nlines=20, truncate=False, heavy=True):
    if heavy:
        print("\n{}:".format(message))
        df.printSchema()
        return df

    print("\n{}, rows: {}:".format(message, df.count()))
    df.printSchema()
    df.show(nlines, truncate)
    return df

## UniversalFeatures compare emulation

In [8]:
# gdfs head -n 3 /export/target/universal_features/dm_extended_features_v2/2021-12-31/part-00000-85c2b8eb-591a-4d44-9ff7-a6c198a7191e-c000.csv
old_file_text = """
user;num:ext_feature2
vk:601804359;0:0.133850812912,1:4723.9375,2:0.869369387627,3:0.10000000149,4:0.0500000007451,5:0.0500000007451,6:0.0500000007451,10:3.40000009537,11:5.40000009537,12:2.59999990463,13:15385.1669922,14:8117.16650391,15:203079.578125,16:242712.25,17:320989.59375,18:1333384.875,19:13013.25,20:21976.9160156,21:13660.1669922,22:1720.16662598,23:26525.25,24:0.0434782616794,25:0.0434782616794,26:0.0434782616794,27:0.0869565233588,28:0.0434782616794,29:0.0869565233588,30:0.0434782616794,31:1583.72814941,32:44.6000022888,33:114.233337402,34:393.619049072,35:2361.46655273,36:0.0625,37:0.883668005466,38:23392.9667969,1330:3,1335:3,1343:3,1347:3,1097:3,1243:3,1548:3,1533:3
vk:72293765;0:0.374163866043,1:405.89855957,2:0.920303463936,3:0.0774647891521,4:0.0211267601699,5:0.0140845067799,6:0.0845070406795,7:0.866715550423,8:0.709127545357,9:0.510675251484,10:5.32258081436,11:1.13709676266,12:3.30645155907,13:15810.6582031,14:8899.97558594,15:197297.53125,16:223353.984375,17:317720.75,18:1335009.125,19:12406.796875,20:21937.2792969,21:9697.89746094,22:1478.60449219,23:23335.1816406,24:0.0206896550953,25:0.00689655169845,26:0.00689655169845,27:0.110344827175,28:0.00689655169845,29:0.0206896550953,30:0.00689655169845,31:1944.44030762,32:45.0097999573,33:135.024597168,34:742.001220703,35:2077.11474609,36:0.253623187542,37:0.919775247574,38:21882.328125,1547:452,2120:3,1097:2421,1122:25,2127:1,2130:3,2131:3,2132:3,2138:208,2145:498,2146:8,2147:3,2066:5,2070:9,2187:2,2188:18,2157:1,2212:7,2213:5,2290:5,2218:1,2224:1,2231:6,2232:35,2233:53,2237:2,2241:2,2245:78,1057:12,2248:7,2259:1,2261:8,1243:73,1247:1651,1062:132,2278:1,2283:1,2284:10,2285:1,2287:1,2288:1,1266:144,2291:5,2295:5,2296:3,2305:16,2307:1,2310:2,2316:19,2318:2,2319:14,2321:2,1302:6,2333:60,2334:12,2335:33,1319:734,2345:24,2346:2,2348:2,2350:1,1330:2593,1333:15,2358:6,1335:2593,1337:15,1338:491,2363:2,1340:243,1343:1874,1346:33,1347:1723,1348:452,2359:2,2384:1,2413:24,2415:2,2417:119,2418:2,2419:20,2426:233,2428:2,2429:69,2430:80,2431:1,1426:33,1548:1769,1453:73,2379:2,2012:1,2016:1,2021:3,2022:35,2023:2,2024:1,2029:1,2036:3,2217:1,1533:1271
ok:571813119726;1547:20,1161:10,1345:20,1335:20,1343:20,1051:10,1330:20.01,1061:10.0,1253:20.0,1348:20.0
"""
# gdfs head -n 3 /data/dm/prj/dev/apps/export/universal_features/dm_extended_features_v2/2022-01-09/part-00000-1929771c-58d1-4a7a-938b-775fc92beaae-c000.csv
new_file_text = """
user;num:ext_feature2
ok:571813119726;1547:20.00001,1161:10.0,1345:20.0,1335:20.0,1343:20.0,1051:10.0,1330:20.0,1061:10.0,1253:20.0,1348:20.0
ok:590558313987;12:0.0,1547:48.0,8:0.40449977,4:0.2,1221:10.0,11:1.0,9:0.40449977,1323:14.0,26:0.125,37:0.24138813,24:0.125,1265:23.0,5:0.2,10:0.0,1345:48.0,1335:48.0,6:0.2,1320:1.0,36:0.0,1343:38.0,1:2792.0,25:0.125,1062:32.0,1051:16.0,0:0.6542008,27:0.125,2:0.24138813,1330:48.0,30:0.125,7:0.40449977,29:0.125,1338:10.0,3:0.2,28:0.125,1348:48.0
"""

In [9]:
def drop_empty_lines(text):
    lines = text.split("\n")
    lines = [x.strip() for x in lines if len(x.strip()) > 0]
    return "\n".join(lines)

In [10]:
def write_text(text, file_path):
    with open(file_path, 'w') as f:
        f.write(text)

In [11]:
def save_to_hdfs(text, hdfs_dir, file_name):
    from backports import tempfile
    from prj.apps.utils.common.fs import HdfsClient
    hdfs = HdfsClient()
    success_filename = "_SUCCESS"

    with tempfile.TemporaryDirectory() as tmp_local_dir:
        write_text(drop_empty_lines(text), os.path.join(tmp_local_dir, file_name))
        hdfs.mkdir(hdfs_dir, remove_if_exists=True)
        hdfs.put(tmp_local_dir, hdfs_dir, content_only=True)
    hdfs.touchz(os.path.join(hdfs_dir, success_filename))

In [12]:
save_to_hdfs(old_file_text, "hdfs:/user/vlk/test/TRG-73014/dm_extended_features_v2/old/", "features.csv")
save_to_hdfs(new_file_text, "hdfs:/user/vlk/test/TRG-73014/dm_extended_features_v2/new/", "features.csv")

## End of emulation, pay attention!

In [None]:
# path="hdfs:/user/vlk/test/TRG-73014/dm_extended_features_v2/old/"
# path="hdfs:/export/target/universal_features/dm_extended_features_v2/2021-12-31/"

# path = "hdfs:/export/target/universal_features/dm_user_af_shows/2022-02-08/*.csv"
# user;num:mean_shows_per_day;num:visit_probability;num:total_shows_cnt;num:mean;num:std;num:hours

path = "hdfs:/export/target/universal_features/dm_user_app_cats_pc_cri_wscore/2022-02-10/*.csv"
# user;num:dm_user_app_cats_pc_cri_wscore
# gaid:43d2023e-60f0-4459-ac6b-d4fe45e03d9f;1739705835:0.165162265301,268232120:0.708328306675,2508174941:0.742438793182,3987996473:0.184657230973

old_lines_df = show(
    df=spark.read.csv(
        path=path,
        header=True,
        sep=";"
    ).persist(StorageLevel.MEMORY_ONLY),
    message="prod",
    heavy=False
)

In [None]:
# path="hdfs:/user/vlk/test/TRG-73014/dm_extended_features_v2/new/"
# path="hdfs:/data/dm/prj/dev/apps/export/universal_features/dm_extended_features_v2/2022-01-09/"
# path = "hdfs:/data/dm/prj/dev/apps/export/universal_features/dm_user_af_shows/2022-02-08/*.csv"

path = "hdfs:/data/dm/prj/dev/apps/export/universal_features/dm_user_app_cats_pc_cri_wscore/2022-02-10/*.csv"

new_lines_df = show(
    df=spark.read.csv(
        path=path,
        header=True,
        sep=";"
    ).persist(StorageLevel.MEMORY_ONLY),
    message="dev",
    heavy=False
)

In [30]:
# debug only!

# gaid:00079fe7-7e6a-4103-9c2f-cdabe3ab4023
# 24.9355,1.0,773,7.5997,0.8192,0.0194,0.0323,0.0323,0.0944,0.0815,0.0388,0.0595,0.0595,0.0957,0.0375,0.0375,0.0543,0.0298,0.0789,0.0246,0.0142,0.0479,0.0285,0.0181,0.0129,0.0712,0.0142,0.0168,0.0
# std = 0.8192
def make_fake_diff(df):
    return (df.selectExpr(
        "user",
        "`num:mean_shows_per_day`",
        "`num:visit_probability`",
        "`num:total_shows_cnt`",
        "`num:mean`",
        "`num:hours`"
        )
        .where("user = 'gaid:00079fe7-7e6a-4103-9c2f-cdabe3ab4023'")
        .withColumn("num:std", sqlfn.lit("0.8191"))
    )

# new_lines_df = make_fake_diff(new_lines_df)

In [None]:
def _dm_user_app_cats_pc_cri_wscore_join():
    # num:dm_user_app_cats_pc_cri_wscore
    return show(
        df=old_lines_df.alias("a")
        .join(new_lines_df.alias("b"), ["user"])
        .selectExpr(
            "user",
            "a.`num:dm_user_app_cats_pc_cri_wscore` as a_f",
            "b.`num:dm_user_app_cats_pc_cri_wscore` as b_f"
        ).persist(StorageLevel.MEMORY_ONLY),
        message="joined",
        heavy=False
    )

joined = _dm_user_app_cats_pc_cri_wscore_join()

In [31]:
def _dm_user_af_shows_join():
    # set of features, dm_user_af_shows
    features = [
        "num:mean_shows_per_day",
        "num:visit_probability",
        "num:total_shows_cnt",
        "num:mean",
        "num:std",
        "num:hours"
    ]

    return show(
        df=old_lines_df.alias("a")
        .join(new_lines_df.alias("b"), ["user"])
        .selectExpr(
            "user",
            "concat_ws(',', {}) as a_f".format(", ".join(["a.`{}`".format(x) for x in features])),
            "concat_ws(',', {}) as b_f".format(", ".join(["b.`{}`".format(x) for x in features]))
        ).persist(StorageLevel.MEMORY_ONLY),
        message="joined",
        heavy=False
    )

# joined = _dm_user_af_shows_join()


joined, rows: 1:
root
 |-- user: string (nullable = true)
 |-- a_f: string (nullable = false)
 |-- b_f: string (nullable = false)

+-----------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|user                                     |a_f                                                                                                                                                                                               |b_f                                                                                                                                                                                               

In [18]:
# actually for normalization you should for each key compute (min,max) and then map found ranges to (0,1)
def _format_map(item):
    k, v = item.split(":")    
    v = str(int(float(v) * 100000))  # 5 fraction digits for float32
    return "{}:{}".format(k, v[:6])  # only first 6 digits to compare

def _normalize_maps(text):
    items = text.split(",")
    formatted_items = [_format_map(x) for x in items]
    return sorted(formatted_items, key=lambda x: int(x.split(":")[0]))

def _diff_maps(a, b):
    a_items = _normalize_maps(a)
    b_items = _normalize_maps(b)
    if a_items == b_items:
        return None, None

    if len(a_items) != len(b_items):
        return -1, 100500.0  # key = -1 if len is different

    for i, (ai, bi) in enumerate(zip(a_items, b_items)):
        ak, av = ai.split(":")
        bk, bv = bi.split(":")
        if ak != bk:
            return int(ak), float(bk)  # diff = key(b) if keys is different

        diff = abs(int(av) - int(bv))
        if diff > 1:
            return int(ak), float(diff) / 100000.0  # key(a), abs_diff(val(a), val(b))

    return None, None  # if a == b

In [32]:
# actually for normalization you should for each key compute (min,max) and then map found ranges to (0,1)
def _format_num(item):
    # works ok only for numbers shape like 1.23456
    # 5 fraction digits for float32 
    # only first 6 digits to compare
    return str(int(float(item) * 100000))[:6]

def _normalize_arrays(text):
    return [_format_num(x) for x in text.split(",")]

def _diff_arrays(a, b):
    a_items = _normalize_arrays(a) # array of strings
    b_items = _normalize_arrays(b)
    if a_items == b_items:
        return None, None

    if len(a_items) != len(b_items):
        return -1, 100500.0  # key = -1 if len is different

    for i, (ai, bi) in enumerate(zip(a_items, b_items)):
        diff = abs(int(ai) - int(bi))
        if diff > 1:
            return i, float(diff) / 100000.0  # idx, abs_diff(a[i], b[i])

    return None, None  # if a == b

In [19]:
def transform(rows):
    for row in rows:
        key, diff = _diff_maps(row["a_f"], row["b_f"])
        # key, diff = _diff_arrays(row["a_f"], row["b_f"])
        yield row["user"], key, diff, row["a_f"], row["b_f"]

diff = spark.createDataFrame(
    joined.rdd.mapPartitions(transform, preservesPartitioning=True),
    schema="user:string,key:int,diff:float,a_f:string,b_f:string"
)

In [20]:
dir_path = "hdfs:/user/vlk/test/TRG-73014/dm_user_app_cats_pc_cri_wscore/old_new_diff/"

diff.where(
    "diff is not null"
).write.option(
    "mapreduce.fileoutputcommitter.algorithm.version", "2"
).parquet(
    dir_path, 
    mode="overwrite"
)

diff = spark.read.parquet(dir_path).persist(StorageLevel.MEMORY_ONLY)
show(diff, "diff", heavy=False)


diff, rows: 0:
root
 |-- user: string (nullable = true)
 |-- key: integer (nullable = true)
 |-- diff: float (nullable = true)
 |-- a_f: string (nullable = true)
 |-- b_f: string (nullable = true)

+----+---+----+---+---+
|user|key|diff|a_f|b_f|
+----+---+----+---+---+
+----+---+----+---+---+



DataFrame[user: string, key: int, diff: float, a_f: string, b_f: string]

In [15]:
# diff.unpersist()
spark.stop()