From 4f861bb612b5a6c4f4a091ed9f64073054c2921d Mon Sep 17 00:00:00 2001 From: Sean Gorsky Date: Fri, 9 Feb 2024 19:52:50 -0500 Subject: [PATCH 01/18] delete autotuner and predictions tests --- tests/api/test_predictions.py | 68 -- tests/asyncapi/test_asyncpredictions.py | 50 - tests/data/application_1678162862227_0001 | 464 --------- tests/data/emr-cluster-report.json | 354 ------- tests/data/predictions_response.json | 465 --------- tests/test_awsdatabricks.py | 1091 --------------------- tests/test_awsemr.py | 197 ---- 7 files changed, 2689 deletions(-) delete mode 100644 tests/api/test_predictions.py delete mode 100644 tests/asyncapi/test_asyncpredictions.py delete mode 100644 tests/data/application_1678162862227_0001 delete mode 100644 tests/data/emr-cluster-report.json delete mode 100644 tests/data/predictions_response.json delete mode 100644 tests/test_awsdatabricks.py delete mode 100644 tests/test_awsemr.py diff --git a/tests/api/test_predictions.py b/tests/api/test_predictions.py deleted file mode 100644 index 15bb900..0000000 --- a/tests/api/test_predictions.py +++ /dev/null @@ -1,68 +0,0 @@ -import json - -import respx -from httpx import Response - -from sync.api import predictions -from sync.config import CONFIG -from sync.models import Platform - -# auth route will only be called in the first test -mock_router = respx.mock(base_url=CONFIG.api_url, assert_all_called=False) -mock_router.post("/v1/auth/token").mock( - return_value=Response( - 200, - json={ - "result": {"access_token": "notarealtoken", "expires_at_utc": "2022-09-01T20:54:48Z"} - }, - ) -) - - -@mock_router -def test_create_prediction(): - prediction_id = "2c33df4a-c491-4602-9a8a-1353ddec4376" - mock_router.post("/v1/autotuner/predictions").mock( - return_value=Response(202, json={"result": {"prediction_id": prediction_id}}) - ) - - response = predictions.create_prediction( - Platform.AWS_EMR, {}, "https://hello.s3.awsamazon.com/world" - ) - - assert response.result == prediction_id - - -@mock_router -def test_get_prediction_status(): - prediction_id = "2c33df4a-c491-4602-9a8a-1353ddec4376" - mock_router.get(f"/v1/autotuner/predictions/{prediction_id}/status").mock( - side_effect=lambda r: Response(200, json={"result": {"status": "SUCCESS"}}) - if prediction_id in r.url.path - else Response(404) - ) - - response = predictions.get_status(prediction_id) - - assert response.result == "SUCCESS" - - -@mock_router -def test_get_prediction(): - prediction_id = "e26c36fa-3b50-4d42-a412-19db210591a4" - with open("tests/data/predictions_response.json") as predictions_fobj: - prediction = [ - p - for p in json.loads(predictions_fobj.read())["result"] - if p["prediction_id"] == prediction_id - ][0] - - mock_router.get(f"/v1/autotuner/predictions/{prediction_id}").mock( - side_effect=lambda r: Response(200, json={"result": prediction}) - if prediction_id in r.url.path - else Response(404) - ) - - response = predictions.get_prediction(prediction_id) - - assert response.result["prediction_id"] == prediction_id diff --git a/tests/asyncapi/test_asyncpredictions.py b/tests/asyncapi/test_asyncpredictions.py deleted file mode 100644 index 2625583..0000000 --- a/tests/asyncapi/test_asyncpredictions.py +++ /dev/null @@ -1,50 +0,0 @@ -import json - -import pytest -import respx -from httpx import Response - -from sync.asyncapi import predictions -from sync.config import CONFIG -from sync.models import Platform - -# auth route will only be called in the first test -mock_router = respx.mock(base_url=CONFIG.api_url, assert_all_called=False) -mock_router.post("/v1/auth/token").mock( - return_value=Response( - 200, - json={ - "result": {"access_token": "notarealtoken", "expires_at_utc": "2022-09-01T20:54:48Z"} - }, - ) -) - - -@pytest.mark.asyncio -@mock_router -async def test_generate_prediction(): - prediction_id = "e26c36fa-3b50-4d42-a412-19db210591a4" - mock_router.post("/v1/autotuner/predictions").mock( - return_value=Response(202, json={"result": {"prediction_id": prediction_id}}) - ) - mock_router.get(f"/v1/autotuner/predictions/{prediction_id}/status").mock( - return_value=Response(200, json={"result": {"status": "SUCCESS"}}) - ) - - with open("tests/data/predictions_response.json") as predictions_fobj: - prediction = [ - p - for p in json.loads(predictions_fobj.read())["result"] - if p["prediction_id"] == prediction_id - ][0] - mock_router.get(f"/v1/autotuner/predictions/{prediction_id}").mock( - side_effect=lambda r: Response(200, json={"result": prediction}) - if prediction_id in r.url.path - else Response(404) - ) - - response = await predictions.generate_prediction( - Platform.AWS_EMR, {}, "https://hello.s3.awsamazon.com/world" - ) - - assert response.result["prediction_id"] == prediction_id diff --git a/tests/data/application_1678162862227_0001 b/tests/data/application_1678162862227_0001 deleted file mode 100644 index 461e65b..0000000 --- a/tests/data/application_1678162862227_0001 +++ /dev/null @@ -1,464 +0,0 @@ -{"Event":"SparkListenerLogStart","Spark Version":"3.0.1-amzn-0"} -{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"driver","Host":"ip-172-31-102-115.ec2.internal","Port":36183},"Maximum Memory":1078827417,"Timestamp":1678162962526,"Maximum Onheap Memory":1078827417,"Maximum Offheap Memory":0} -{"Event":"SparkListenerEnvironmentUpdate","JVM Information":{"Java Home":"/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre","Java Version":"1.8.0_362 (Amazon.com Inc.)","Scala Version":"version 2.12.10"},"Spark Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.scheduler.mode":"FIFO","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"},"Hadoop Properties":{"hadoop.service.shutdown.timeout":"30s","yarn.resourcemanager.amlauncher.thread-count":"50","yarn.sharedcache.enabled":"false","fs.s3a.connection.maximum":"15","yarn.nodemanager.numa-awareness.numactl.cmd":"/usr/bin/numactl","fs.s3a.impl":"org.apache.hadoop.fs.s3a.S3AFileSystem","yarn.app.mapreduce.am.scheduler.heartbeat.interval-ms":"1000","yarn.timeline-service.timeline-client.number-of-async-entities-to-merge":"10","hadoop.security.kms.client.timeout":"60","hadoop.http.authentication.kerberos.principal":"HTTP/_HOST@LOCALHOST","mapreduce.jobhistory.loadedjob.tasks.max":"-1","mapreduce.framework.name":"yarn","yarn.sharedcache.uploader.server.thread-count":"50","yarn.nodemanager.log-aggregation.roll-monitoring-interval-seconds.min":"3600","yarn.nodemanager.linux-container-executor.nonsecure-mode.user-pattern":"^[_.A-Za-z0-9][-@_.A-Za-z0-9]{0,255}?[$]?$","tfile.fs.output.buffer.size":"262144","yarn.app.mapreduce.am.job.task.listener.thread-count":"60","yarn.nodemanager.node-attributes.resync-interval-ms":"120000","hadoop.security.groups.cache.background.reload.threads":"3","yarn.resourcemanager.webapp.cross-origin.enabled":"true","fs.AbstractFileSystem.ftp.impl":"org.apache.hadoop.fs.ftp.FtpFs","hadoop.registry.secure":"false","hadoop.shell.safely.delete.limit.num.files":"100","dfs.bytes-per-checksum":"512","fs.s3.buffer.dir":"/mnt/s3,/mnt1/s3","mapreduce.job.acl-view-job":" ","fs.s3a.s3guard.ddb.background.sleep":"25ms","fs.s3a.retry.limit":"${fs.s3a.attempts.maximum}","mapreduce.jobhistory.loadedjobs.cache.size":"5","fs.s3a.s3guard.ddb.table.create":"false","yarn.log-aggregation.enable-local-cleanup":"false","dfs.namenode.handler.count":"64","yarn.nodemanager.amrmproxy.enabled":"false","yarn.timeline-service.entity-group-fs-store.with-user-dir":"false","mapreduce.input.fileinputformat.split.minsize":"0","yarn.resourcemanager.container.liveness-monitor.interval-ms":"600000","dfs.namenode.replication.max-streams":"20","yarn.resourcemanager.client.thread-count":"64","io.seqfile.compress.blocksize":"1000000","mapreduce.tasktracker.http.threads":"60","fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","yarn.sharedcache.checksum.algo.impl":"org.apache.hadoop.yarn.sharedcache.ChecksumSHA256Impl","yarn.nodemanager.amrmproxy.interceptor-class.pipeline":"org.apache.hadoop.yarn.server.nodemanager.amrmproxy.DefaultRequestInterceptor","dfs.datanode.data.dir":"/mnt/hdfs,/mnt1/hdfs","dfs.replication":"1","yarn.timeline-service.entity-group-fs-store.leveldb-cache-read-cache-size":"10485760","mapreduce.reduce.shuffle.fetch.retry.interval-ms":"1000","mapreduce.task.profile.maps":"0-2","yarn.scheduler.include-port-in-node-name":"false","yarn.nodemanager.admin-env":"MALLOC_ARENA_MAX=$MALLOC_ARENA_MAX","yarn.resourcemanager.node-removal-untracked.timeout-ms":"60000","mapreduce.am.max-attempts":"2","hadoop.security.kms.client.failover.sleep.base.millis":"100","mapreduce.jobhistory.webapp.https.address":"0.0.0.0:19890","yarn.node-labels.fs-store.impl.class":"org.apache.hadoop.yarn.nodelabels.FileSystemNodeLabelsStore","yarn.nodemanager.collector-service.address":"${yarn.nodemanager.hostname}:8048","fs.trash.checkpoint.interval":"0","mapreduce.job.map.output.collector.class":"org.apache.hadoop.mapred.MapTask$MapOutputBuffer","yarn.resourcemanager.node-ip-cache.expiry-interval-secs":"-1","hadoop.http.authentication.signature.secret.file":"*********(redacted)","hadoop.jetty.logs.serve.aliases":"true","yarn.resourcemanager.placement-constraints.handler":"disabled","yarn.timeline-service.handler-thread-count":"10","yarn.resourcemanager.max-completed-applications":"1000","dfs.hosts.exclude":"/emr/instance-controller/lib/dfs.hosts.exclude","yarn.resourcemanager.placement-constraints.algorithm.class":"org.apache.hadoop.yarn.server.resourcemanager.scheduler.constraint.algorithm.DefaultPlacementAlgorithm","yarn.sharedcache.webapp.address":"0.0.0.0:8788","yarn.resourcemanager.delegation.token.renew-interval":"*********(redacted)","yarn.sharedcache.nm.uploader.replication.factor":"10","hadoop.security.groups.negative-cache.secs":"30","yarn.app.mapreduce.task.container.log.backups":"0","mapreduce.reduce.skip.proc-count.auto-incr":"true","hadoop.security.group.mapping.ldap.posix.attr.gid.name":"gidNumber","ipc.client.fallback-to-simple-auth-allowed":"false","yarn.nodemanager.resource.memory.enforced":"true","yarn.client.failover-proxy-provider":"org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider","yarn.timeline-service.http-authentication.simple.anonymous.allowed":"true","ha.health-monitor.check-interval.ms":"1000","yarn.acl.reservation-enable":"false","yarn.resourcemanager.store.class":"org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore","yarn.app.mapreduce.am.hard-kill-timeout-ms":"10000","fs.s3a.etag.checksum.enabled":"false","yarn.nodemanager.container-metrics.enable":"false","yarn.timeline-service.client.fd-clean-interval-secs":"60","yarn.resourcemanager.nodemanagers.heartbeat-interval-ms":"250","hadoop.common.configuration.version":"3.0.0","fs.s3a.s3guard.ddb.table.capacity.read":"500","yarn.nodemanager.remote-app-log-dir-suffix":"logs","yarn.nodemanager.windows-container.cpu-limit.enabled":"false","yarn.nodemanager.runtime.linux.docker.privileged-containers.allowed":"false","file.blocksize":"67108864","hadoop.registry.zk.retry.ceiling.ms":"60000","mapreduce.reduce.env":"HADOOP_MAPRED_HOME=/usr/lib/hadoop-mapreduce","yarn.scheduler.configuration.leveldb-store.path":"${hadoop.tmp.dir}/yarn/system/confstore","yarn.sharedcache.store.in-memory.initial-delay-mins":"10","mapreduce.jobhistory.principal":"jhs/_HOST@REALM.TLD","mapreduce.map.skip.proc-count.auto-incr":"true","fs.s3a.committer.name":"file","mapreduce.task.profile.reduces":"0-2","hadoop.zk.num-retries":"1000","yarn.webapp.xfs-filter.enabled":"true","seq.io.sort.mb":"100","yarn.scheduler.configuration.max.version":"100","yarn.timeline-service.webapp.https.address":"${yarn.timeline-service.hostname}:8190","yarn.resourcemanager.scheduler.address":"ip-172-31-102-115.ec2.internal:8030","yarn.node-labels.enabled":"false","yarn.resourcemanager.webapp.ui-actions.enabled":"true","mapreduce.task.timeout":"600000","yarn.sharedcache.client-server.thread-count":"50","hadoop.security.groups.shell.command.timeout":"0s","hadoop.security.crypto.cipher.suite":"AES/CTR/NoPadding","yarn.nodemanager.elastic-memory-control.oom-handler":"org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.DefaultOOMHandler","yarn.resourcemanager.connect.max-wait.ms":"900000","fs.defaultFS":"hdfs://ip-172-31-102-115.ec2.internal:8020","yarn.minicluster.use-rpc":"false","yarn.app.mapreduce.am.env":"HADOOP_MAPRED_HOME=/usr/lib/hadoop-mapreduce","fs.s3.impl":"com.amazon.ws.emr.hadoop.fs.EmrFileSystem","fs.har.impl.disable.cache":"true","yarn.webapp.ui2.enable":"false","io.compression.codec.bzip2.library":"system-native","fs.s3a.change.detection.source":"etag","yarn.nodemanager.distributed-scheduling.enabled":"false","mapreduce.shuffle.connection-keep-alive.timeout":"5","yarn.resourcemanager.webapp.https.address":"${yarn.resourcemanager.hostname}:8090","mapreduce.jobhistory.address":"ip-172-31-102-115.ec2.internal:10020","yarn.resourcemanager.nm-tokens.master-key-rolling-interval-secs":"*********(redacted)","yarn.is.minicluster":"false","yarn.nodemanager.address":"${yarn.nodemanager.hostname}:8041","hadoop.proxyuser.livy.groups":"*","fs.abfss.impl":"org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem","fs.AbstractFileSystem.s3a.impl":"org.apache.hadoop.fs.s3a.S3A","mapreduce.task.combine.progress.records":"10000","yarn.resourcemanager.epoch.range":"0","yarn.resourcemanager.am.max-attempts":"2","yarn.nodemanager.linux-container-executor.cgroups.hierarchy":"/hadoop-yarn","fs.AbstractFileSystem.wasbs.impl":"org.apache.hadoop.fs.azure.Wasbs","yarn.timeline-service.entity-group-fs-store.cache-store-class":"org.apache.hadoop.yarn.server.timeline.MemoryTimelineStore","yarn.nodemanager.runtime.linux.docker.default-rw-mounts":"/mnt/yarn:/mnt/yarn,/mnt1/yarn:/mnt1/yarn,/mnt/s3:/mnt/s3,/mnt1/s3:/mnt1/s3","fs.ftp.transfer.mode":"BLOCK_TRANSFER_MODE","ipc.server.log.slow.rpc":"false","yarn.resourcemanager.node-labels.provider.fetch-interval-ms":"1800000","yarn.router.webapp.https.address":"0.0.0.0:8091","yarn.nodemanager.webapp.cross-origin.enabled":"false","fs.wasb.impl":"org.apache.hadoop.fs.azure.NativeAzureFileSystem","yarn.resourcemanager.auto-update.containers":"false","yarn.app.mapreduce.am.job.committer.cancel-timeout":"60000","yarn.scheduler.configuration.zk-store.parent-path":"/confstore","yarn.nodemanager.default-container-executor.log-dirs.permissions":"750","yarn.app.attempt.diagnostics.limit.kc":"64","fs.s3a.change.detection.mode":"server","hadoop.proxyuser.presto.hosts":"*","ftp.bytes-per-checksum":"512","yarn.nodemanager.resource.memory-mb":"12288","io.compression.codecs":"org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.SnappyCodec,com.hadoop.compression.lzo.LzoCodec,com.hadoop.compression.lzo.LzopCodec","fs.AbstractFileSystem.abfs.impl":"org.apache.hadoop.fs.azurebfs.Abfs","yarn.timeline-service.writer.flush-interval-seconds":"60","fs.s3a.fast.upload.active.blocks":"4","yarn.resourcemanager.submission-preprocessor.enabled":"false","hadoop.security.credential.clear-text-fallback":"true","yarn.nodemanager.collector-service.thread-count":"5","fs.azure.secure.mode":"false","mapreduce.jobhistory.joblist.cache.size":"20000","fs.ftp.host":"0.0.0.0","yarn.timeline-service.writer.async.queue.capacity":"100","yarn.resourcemanager.fs.state-store.num-retries":"0","yarn.resourcemanager.nodemanager-connect-retries":"10","yarn.nodemanager.log-aggregation.num-log-files-per-app":"30","hadoop.security.kms.client.encrypted.key.cache.low-watermark":"0.3f","fs.s3a.committer.magic.enabled":"false","yarn.timeline-service.client.max-retries":"30","dfs.ha.fencing.ssh.connect-timeout":"30000","yarn.log-aggregation-enable":"true","yarn.system-metrics-publisher.enabled":"true","mapreduce.reduce.markreset.buffer.percent":"0.0","fs.AbstractFileSystem.viewfs.impl":"org.apache.hadoop.fs.viewfs.ViewFs","mapreduce.task.io.sort.factor":"48","yarn.nodemanager.amrmproxy.client.thread-count":"25","ha.failover-controller.new-active.rpc-timeout.ms":"60000","yarn.nodemanager.container-localizer.java.opts":"-Xmx256m","mapreduce.jobhistory.datestring.cache.size":"200000","mapreduce.job.acl-modify-job":" ","dfs.namenode.https-address":"ip-172-31-102-115.ec2.internal:9871","yarn.nodemanager.windows-container.memory-limit.enabled":"false","yarn.timeline-service.webapp.address":"${yarn.timeline-service.hostname}:8188","yarn.app.mapreduce.am.job.committer.commit-window":"10000","yarn.nodemanager.container-manager.thread-count":"64","yarn.minicluster.fixed.ports":"false","hadoop.tags.system":"YARN,HDFS,NAMENODE,DATANODE,REQUIRED,SECURITY,KERBEROS,PERFORMANCE,CLIENT\n ,SERVER,DEBUG,DEPRECATED,COMMON,OPTIONAL","yarn.cluster.max-application-priority":"0","yarn.timeline-service.ttl-enable":"true","mapreduce.jobhistory.recovery.store.fs.uri":"${hadoop.tmp.dir}/mapred/history/recoverystore","hadoop.caller.context.signature.max.size":"40","hadoop.proxyuser.hive.groups":"*","yarn.client.load.resource-types.from-server":"false","ha.zookeeper.session-timeout.ms":"10000","mapreduce.map.java.opts":"-Xmx1229m","tfile.io.chunk.size":"1048576","fs.s3a.s3guard.ddb.table.capacity.write":"100","yarn.dispatcher.print-events-info.threshold":"5000","mapreduce.job.speculative.slowtaskthreshold":"1.0","io.serializations":"org.apache.hadoop.io.serializer.WritableSerialization, org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization, org.apache.hadoop.io.serializer.avro.AvroReflectSerialization","hadoop.security.kms.client.failover.sleep.max.millis":"2000","hadoop.security.group.mapping.ldap.directory.search.timeout":"10000","yarn.scheduler.configuration.store.max-logs":"1000","yarn.nodemanager.node-attributes.provider.fetch-interval-ms":"600000","fs.swift.impl":"org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystem","yarn.nodemanager.local-cache.max-files-per-directory":"8192","hadoop.http.cross-origin.enabled":"false","dfs.namenode.rpc-address":"ip-172-31-102-115.ec2.internal:8020","hadoop.zk.acl":"world:anyone:rwcda","mapreduce.map.sort.spill.percent":"0.80","yarn.timeline-service.entity-group-fs-store.scan-interval-seconds":"60","dfs.datanode.fsdataset.volume.choosing.policy":"org.apache.hadoop.hdfs.server.datanode.fsdataset.AvailableSpaceVolumeChoosingPolicy","yarn.node-attribute.fs-store.impl.class":"org.apache.hadoop.yarn.server.resourcemanager.nodelabels.FileSystemNodeAttributeStore","fs.s3a.retry.interval":"500ms","yarn.timeline-service.client.best-effort":"false","yarn.resourcemanager.webapp.delegation-token-auth-filter.enabled":"*********(redacted)","hadoop.security.group.mapping.ldap.posix.attr.uid.name":"uidNumber","fs.AbstractFileSystem.swebhdfs.impl":"org.apache.hadoop.fs.SWebHdfs","yarn.nodemanager.elastic-memory-control.timeout-sec":"5","mapreduce.ifile.readahead":"true","yarn.timeline-service.leveldb-timeline-store.ttl-interval-ms":"300000","yarn.timeline-service.reader.webapp.address":"${yarn.timeline-service.webapp.address}","yarn.resourcemanager.placement-constraints.algorithm.pool-size":"1","yarn.timeline-service.hbase.coprocessor.jar.hdfs.location":"/hbase/coprocessor/hadoop-yarn-server-timelineservice.jar","hadoop.security.kms.client.encrypted.key.cache.num.refill.threads":"2","yarn.resourcemanager.scheduler.class":"org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler","yarn.app.mapreduce.am.command-opts":"-Xmx2458m","mapreduce.cluster.local.dir":"/mnt/mapred,/mnt1/mapred","hadoop.proxyuser.hue.hosts":"*","io.mapfile.bloom.error.rate":"0.005","fs.client.resolve.topology.enabled":"false","hadoop.proxyuser.hue.groups":"*","yarn.nodemanager.runtime.linux.allowed-runtimes":"default,docker","yarn.sharedcache.store.class":"org.apache.hadoop.yarn.server.sharedcachemanager.store.InMemorySCMStore","ha.failover-controller.graceful-fence.rpc-timeout.ms":"5000","ftp.replication":"3","hadoop.security.uid.cache.secs":"14400","mapreduce.job.maxtaskfailures.per.tracker":"3","fs.s3a.metadatastore.impl":"org.apache.hadoop.fs.s3a.s3guard.NullMetadataStore","io.skip.checksum.errors":"false","yarn.app.mapreduce.client-am.ipc.max-retries-on-timeouts":"3","yarn.timeline-service.webapp.xfs-filter.xframe-options":"SAMEORIGIN","fs.s3a.connection.timeout":"200000","yarn.nodemanager.linux-container-executor.group":"yarn","mapreduce.job.max.split.locations":"15","yarn.resourcemanager.nm-container-queuing.max-queue-length":"15","hadoop.registry.zk.session.timeout.ms":"60000","yarn.federation.cache-ttl.secs":"300","mapreduce.jvm.system-properties-to-log":"os.name,os.version,java.home,java.runtime.version,java.vendor,java.version,java.vm.name,java.class.path,java.io.tmpdir,user.dir,user.name","yarn.resourcemanager.opportunistic-container-allocation.nodes-used":"10","yarn.timeline-service.entity-group-fs-store.active-dir":"/tmp/entity-file-history/active","mapreduce.shuffle.transfer.buffer.size":"131072","yarn.timeline-service.client.retry-interval-ms":"1000","yarn.timeline-service.flowname.max-size":"0","yarn.http.policy":"HTTP_ONLY","fs.s3a.socket.send.buffer":"8192","fs.AbstractFileSystem.abfss.impl":"org.apache.hadoop.fs.azurebfs.Abfss","yarn.sharedcache.uploader.server.address":"0.0.0.0:8046","yarn.resourcemanager.delegation-token.max-conf-size-bytes":"*********(redacted)","hadoop.http.authentication.token.validity":"*********(redacted)","mapreduce.shuffle.max.connections":"0","yarn.minicluster.yarn.nodemanager.resource.memory-mb":"4096","mapreduce.job.emit-timeline-data":"false","yarn.nodemanager.resource.system-reserved-memory-mb":"-1","hadoop.kerberos.min.seconds.before.relogin":"60","mapreduce.jobhistory.move.thread-count":"3","yarn.resourcemanager.admin.client.thread-count":"1","yarn.dispatcher.drain-events.timeout":"300000","fs.s3a.buffer.dir":"${hadoop.tmp.dir}/s3a","hadoop.ssl.enabled.protocols":"TLSv1,SSLv2Hello,TLSv1.1,TLSv1.2","mapreduce.jobhistory.admin.address":"0.0.0.0:10033","yarn.log-aggregation-status.time-out.ms":"600000","fs.s3a.assumed.role.sts.endpoint.region":"us-west-1","mapreduce.shuffle.port":"13562","yarn.resourcemanager.max-log-aggregation-diagnostics-in-memory":"10","yarn.nodemanager.health-checker.interval-ms":"600000","yarn.router.clientrm.interceptor-class.pipeline":"org.apache.hadoop.yarn.server.router.clientrm.DefaultClientRequestInterceptor","yarn.resourcemanager.zk-appid-node.split-index":"0","ftp.blocksize":"67108864","yarn.nodemanager.runtime.linux.sandbox-mode.local-dirs.permissions":"read","yarn.router.rmadmin.interceptor-class.pipeline":"org.apache.hadoop.yarn.server.router.rmadmin.DefaultRMAdminRequestInterceptor","yarn.nodemanager.log-container-debug-info.enabled":"true","yarn.client.max-cached-nodemanagers-proxies":"0","yarn.nodemanager.linux-container-executor.cgroups.delete-delay-ms":"20","yarn.nodemanager.delete.debug-delay-sec":"0","yarn.nodemanager.pmem-check-enabled":"true","yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage":"90.0","mapreduce.app-submission.cross-platform":"false","yarn.resourcemanager.work-preserving-recovery.scheduling-wait-ms":"10000","yarn.nodemanager.container-retry-minimum-interval-ms":"1000","hadoop.security.groups.cache.secs":"300","yarn.scheduler.increment-allocation-mb":"32","yarn.federation.enabled":"false","fs.azure.local.sas.key.mode":"false","ipc.maximum.data.length":"67108864","mapreduce.shuffle.max.threads":"0","yarn.router.pipeline.cache-max-size":"25","yarn.resourcemanager.nm-container-queuing.load-comparator":"QUEUE_LENGTH","hadoop.security.authorization":"false","yarn.app.mapreduce.am.jhs.backup.enabled":"true","mapreduce.job.complete.cancel.delegation.tokens":"*********(redacted)","fs.s3a.paging.maximum":"5000","nfs.exports.allowed.hosts":"* rw","yarn.nodemanager.amrmproxy.ha.enable":"false","mapreduce.jobhistory.http.policy":"HTTP_ONLY","yarn.sharedcache.store.in-memory.check-period-mins":"720","mapreduce.reduce.java.opts":"-Xmx2458m","hadoop.security.group.mapping.ldap.ssl":"false","yarn.client.application-client-protocol.poll-interval-ms":"200","yarn.scheduler.configuration.leveldb-store.compaction-interval-secs":"86400","yarn.timeline-service.writer.class":"org.apache.hadoop.yarn.server.timelineservice.storage.HBaseTimelineWriterImpl","ha.zookeeper.parent-znode":"/hadoop-ha","yarn.resourcemanager.submission-preprocessor.file-refresh-interval-ms":"60000","dfs.namenode.safemode.extension":"5000","yarn.nodemanager.log-aggregation.policy.class":"org.apache.hadoop.yarn.server.nodemanager.containermanager.logaggregation.AllContainerLogAggregationPolicy","mapreduce.reduce.shuffle.merge.percent":"0.66","hadoop.security.group.mapping.ldap.search.filter.group":"(objectClass=group)","yarn.resourcemanager.placement-constraints.scheduler.pool-size":"1","yarn.nodemanager.resourcemanager.minimum.version":"NONE","mapreduce.job.speculative.speculative-cap-running-tasks":"0.1","yarn.admin.acl":"*","dfs.namenode.replication.max-streams-hard-limit":"40","yarn.nodemanager.recovery.supervised":"true","yarn.sharedcache.admin.thread-count":"1","yarn.resourcemanager.ha.automatic-failover.enabled":"true","mapreduce.reduce.skip.maxgroups":"0","mapreduce.reduce.shuffle.connect.timeout":"180000","yarn.resourcemanager.address":"ip-172-31-102-115.ec2.internal:8032","ipc.client.ping":"true","mapreduce.task.local-fs.write-limit.bytes":"-1","mapred.output.committer.class":"org.apache.hadoop.mapred.DirectFileOutputCommitter","fs.adl.oauth2.access.token.provider.type":"*********(redacted)","mapreduce.shuffle.ssl.file.buffer.size":"65536","yarn.resourcemanager.ha.automatic-failover.embedded":"true","yarn.nodemanager.resource-plugins.gpu.docker-plugin":"nvidia-docker-v1","hadoop.ssl.enabled":"false","fs.s3a.multipart.purge":"false","yarn.scheduler.configuration.store.class":"file","yarn.resourcemanager.nm-container-queuing.queue-limit-stdev":"1.0f","mapreduce.job.end-notification.max.attempts":"5","mapreduce.output.fileoutputformat.compress.codec":"org.apache.hadoop.io.compress.DefaultCodec","yarn.nodemanager.container-monitor.procfs-tree.smaps-based-rss.enabled":"false","ipc.client.bind.wildcard.addr":"false","yarn.resourcemanager.webapp.rest-csrf.enabled":"false","ha.health-monitor.connect-retry-interval.ms":"1000","yarn.nodemanager.keytab":"/etc/krb5.keytab","hadoop.security.key.provider.path":"kms://http@ip-172-31-102-115.ec2.internal:9600/kms","mapreduce.jobhistory.keytab":"/etc/security/keytab/jhs.service.keytab","fs.s3a.threads.max":"10","mapreduce.reduce.shuffle.input.buffer.percent":"0.70","hadoop.security.token.service.use_ip":"*********(redacted)","yarn.nodemanager.runtime.linux.docker.allowed-container-networks":"emr-docker-bridge,host,bridge","yarn.nodemanager.node-labels.resync-interval-ms":"120000","hadoop.tmp.dir":"/mnt/var/lib/hadoop/tmp","mapreduce.job.maps":"36","mapreduce.jobhistory.webapp.rest-csrf.custom-header":"X-XSRF-Header","mapreduce.job.end-notification.max.retry.interval":"5000","yarn.log-aggregation.retain-check-interval-seconds":"-1","yarn.resourcemanager.resource-tracker.client.thread-count":"64","yarn.nodemanager.containers-launcher.class":"org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainersLauncher","yarn.rm.system-metrics-publisher.emit-container-events":"false","yarn.timeline-service.leveldb-timeline-store.start-time-read-cache-size":"10000","yarn.resourcemanager.ha.automatic-failover.zk-base-path":"/yarn-leader-election","io.seqfile.local.dir":"${hadoop.tmp.dir}/io/local","fs.s3a.s3guard.ddb.throttle.retry.interval":"100ms","fs.AbstractFileSystem.wasb.impl":"org.apache.hadoop.fs.azure.Wasb","mapreduce.client.submit.file.replication":"10","mapreduce.jobhistory.minicluster.fixed.ports":"false","fs.s3a.multipart.threshold":"2147483647","yarn.resourcemanager.webapp.xfs-filter.xframe-options":"SAMEORIGIN","mapreduce.jobhistory.done-dir":"${yarn.app.mapreduce.am.staging-dir}/history/done","dfs.namenode.name.dir":"/mnt/namenode,/mnt1/namenode","ipc.client.idlethreshold":"4000","yarn.nodemanager.linux-container-executor.cgroups.strict-resource-usage":"false","yarn.nodemanager.runtime.linux.docker.default-ro-mounts":"/etc/passwd:/etc/passwd,/usr/lib:/docker/usr/lib,/usr/share:/docker/usr/share","mapreduce.reduce.input.buffer.percent":"0.0","yarn.nodemanager.runtime.linux.docker.userremapping-gid-threshold":"1","yarn.nodemanager.webapp.rest-csrf.enabled":"false","fs.ftp.host.port":"21","ipc.ping.interval":"60000","yarn.resourcemanager.history-writer.multi-threaded-dispatcher.pool-size":"10","yarn.resourcemanager.admin.address":"${yarn.resourcemanager.hostname}:8033","file.client-write-packet-size":"65536","ipc.client.kill.max":"10","mapreduce.reduce.speculative":"true","hadoop.security.key.default.bitlength":"256","mapreduce.job.reducer.unconditional-preempt.delay.sec":"300","yarn.nodemanager.disk-health-checker.interval-ms":"120000","yarn.nodemanager.log.deletion-threads-count":"4","yarn.webapp.filter-entity-list-by-user":"false","yarn.web-proxy.address":"ip-172-31-102-115.ec2.internal:20888","ipc.client.connection.maxidletime":"10000","mapreduce.task.io.sort.mb":"200","yarn.nodemanager.localizer.client.thread-count":"20","io.erasurecode.codec.rs.rawcoders":"rs_native,rs_java","io.erasurecode.codec.rs-legacy.rawcoders":"rs-legacy_java","yarn.sharedcache.admin.address":"0.0.0.0:8047","yarn.resourcemanager.placement-constraints.algorithm.iterator":"SERIAL","yarn.nodemanager.localizer.cache.cleanup.interval-ms":"600000","hadoop.security.crypto.codec.classes.aes.ctr.nopadding":"org.apache.hadoop.crypto.OpensslAesCtrCryptoCodec, org.apache.hadoop.crypto.JceAesCtrCryptoCodec","mapreduce.job.cache.limit.max-resources-mb":"0","fs.s3a.connection.ssl.enabled":"true","yarn.nodemanager.process-kill-wait.ms":"5000","mapreduce.job.hdfs-servers":"${fs.defaultFS}","hadoop.workaround.non.threadsafe.getpwuid":"true","fs.df.interval":"60000","yarn.dispatcher.exit-on-error":"true","fs.s3a.multiobjectdelete.enable":"true","yarn.sharedcache.cleaner.resource-sleep-ms":"0","yarn.nodemanager.disk-health-checker.min-healthy-disks":"0.25","hadoop.shell.missing.defaultFs.warning":"false","io.file.buffer.size":"65536","dfs.permissions.superusergroup":"hadoop","hadoop.security.group.mapping.ldap.search.attr.member":"member","hadoop.security.random.device.file.path":"/dev/urandom","hadoop.security.sensitive-config-keys":"*********(redacted)","fs.s3a.s3guard.ddb.max.retries":"9","hadoop.rpc.socket.factory.class.default":"org.apache.hadoop.net.StandardSocketFactory","yarn.intermediate-data-encryption.enable":"false","yarn.resourcemanager.connect.retry-interval.ms":"30000","yarn.nodemanager.container.stderr.pattern":"{*stderr*,*STDERR*}","fs.s3bfs.impl":"org.apache.hadoop.fs.s3.S3FileSystem","yarn.scheduler.minimum-allocation-mb":"32","yarn.app.mapreduce.am.staging-dir":"/tmp/hadoop-yarn/staging","mapreduce.reduce.shuffle.read.timeout":"180000","hadoop.http.cross-origin.max-age":"1800","io.erasurecode.codec.xor.rawcoders":"xor_native,xor_java","fs.s3a.connection.establish.timeout":"5000","mapreduce.job.running.map.limit":"0","yarn.minicluster.control-resource-monitoring":"false","hadoop.ssl.require.client.cert":"false","hadoop.kerberos.kinit.command":"kinit","yarn.federation.state-store.class":"org.apache.hadoop.yarn.server.federation.store.impl.MemoryFederationStateStore","mapreduce.reduce.log.level":"INFO","hadoop.security.dns.log-slow-lookups.threshold.ms":"1000","mapreduce.job.ubertask.enable":"false","adl.http.timeout":"-1","yarn.resourcemanager.placement-constraints.retry-attempts":"3","hadoop.caller.context.enabled":"false","hadoop.security.group.mapping.ldap.num.attempts":"3","yarn.nodemanager.vmem-pmem-ratio":"5","hadoop.rpc.protection":"authentication","ha.health-monitor.rpc-timeout.ms":"45000","yarn.nodemanager.remote-app-log-dir":"/var/log/hadoop-yarn/apps","hadoop.zk.timeout-ms":"10000","fs.s3a.s3guard.cli.prune.age":"86400000","yarn.nodemanager.resource.pcores-vcores-multiplier":"1.0","yarn.nodemanager.runtime.linux.sandbox-mode":"disabled","yarn.app.mapreduce.am.containerlauncher.threadpool-initial-size":"10","fs.s3a.committer.threads":"8","hadoop.zk.retry-interval-ms":"1000","hadoop.security.crypto.buffer.size":"8192","yarn.nodemanager.node-labels.provider.fetch-interval-ms":"600000","mapreduce.jobhistory.recovery.store.leveldb.path":"${hadoop.tmp.dir}/mapred/history/recoverystore","yarn.client.failover-retries-on-socket-timeouts":"0","yarn.nodemanager.resource.memory.enabled":"false","fs.azure.authorization.caching.enable":"true","hadoop.security.instrumentation.requires.admin":"false","yarn.nodemanager.delete.thread-count":"4","mapreduce.job.finish-when-all-reducers-done":"true","hadoop.registry.jaas.context":"Client","yarn.timeline-service.leveldb-timeline-store.path":"${hadoop.tmp.dir}/yarn/timeline","io.map.index.interval":"128","yarn.resourcemanager.nm-container-queuing.max-queue-wait-time-ms":"100","fs.abfs.impl":"org.apache.hadoop.fs.azurebfs.AzureBlobFileSystem","mapreduce.job.counters.max":"120","mapreduce.jobhistory.webapp.rest-csrf.enabled":"false","yarn.timeline-service.store-class":"org.apache.hadoop.yarn.server.timeline.EntityGroupFSTimelineStore","mapreduce.jobhistory.move.interval-ms":"180000","fs.s3a.change.detection.version.required":"true","yarn.nodemanager.localizer.fetch.thread-count":"20","yarn.resourcemanager.scheduler.client.thread-count":"64","hadoop.ssl.hostname.verifier":"DEFAULT","yarn.timeline-service.leveldb-state-store.path":"${hadoop.tmp.dir}/yarn/timeline","mapreduce.job.classloader":"false","mapreduce.task.profile.map.params":"${mapreduce.task.profile.params}","ipc.client.connect.timeout":"20000","hadoop.security.auth_to_local.mechanism":"hadoop","yarn.timeline-service.app-collector.linger-period.ms":"60000","yarn.nm.liveness-monitor.expiry-interval-ms":"600000","yarn.resourcemanager.reservation-system.planfollower.time-step":"1000","yarn.nodemanager.runtime.linux.docker.enable-userremapping.allowed":"true","hadoop.proxyuser.hadoop.groups":"*","yarn.webapp.api-service.enable":"true","yarn.nodemanager.recovery.enabled":"true","mapreduce.job.end-notification.retry.interval":"1000","fs.du.interval":"600000","fs.ftp.impl":"org.apache.hadoop.fs.ftp.FTPFileSystem","yarn.nodemanager.container.stderr.tail.bytes":"4096","hadoop.security.group.mapping.ldap.read.timeout.ms":"60000","mapreduce.map.env":"HADOOP_MAPRED_HOME=/usr/lib/hadoop-mapreduce","hadoop.security.groups.cache.warn.after.ms":"5000","file.bytes-per-checksum":"512","mapreduce.outputcommitter.factory.scheme.s3a":"org.apache.hadoop.fs.s3a.commit.S3ACommitterFactory","hadoop.security.groups.cache.background.reload":"false","yarn.nodemanager.container-monitor.enabled":"true","yarn.nodemanager.elastic-memory-control.enabled":"false","net.topology.script.number.args":"100","mapreduce.task.merge.progress.records":"10000","yarn.nodemanager.localizer.address":"${yarn.nodemanager.hostname}:8040","yarn.timeline-service.keytab":"/etc/krb5.keytab","mapreduce.reduce.shuffle.fetch.retry.timeout-ms":"30000","yarn.resourcemanager.rm.container-allocation.expiry-interval-ms":"600000","mapreduce.fileoutputcommitter.algorithm.version":"2","yarn.resourcemanager.work-preserving-recovery.enabled":"true","mapreduce.map.skip.maxrecords":"0","yarn.sharedcache.root-dir":"/sharedcache","fs.s3a.retry.throttle.limit":"${fs.s3a.attempts.maximum}","hadoop.http.authentication.type":"simple","mapreduce.job.jvm.numtasks":"20","mapreduce.job.cache.limit.max-resources":"0","mapreduce.task.userlog.limit.kb":"0","yarn.resourcemanager.scheduler.monitor.enable":"false","ipc.client.connect.max.retries":"10","hadoop.registry.zk.retry.times":"5","dfs.namenode.http-address":"ip-172-31-102-115.ec2.internal:9870","yarn.nodemanager.resource-monitor.interval-ms":"3000","yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices":"auto","mapreduce.job.sharedcache.mode":"disabled","yarn.app.mapreduce.am.jhs.backup-dir":"file:///var/log/hadoop-mapreduce/history","yarn.nodemanager.webapp.rest-csrf.custom-header":"X-XSRF-Header","mapreduce.shuffle.listen.queue.size":"128","yarn.scheduler.configuration.mutation.acl-policy.class":"org.apache.hadoop.yarn.server.resourcemanager.scheduler.DefaultConfigurationMutationACLPolicy","mapreduce.map.cpu.vcores":"1","yarn.log-aggregation.file-formats":"TFile","yarn.timeline-service.client.fd-retain-secs":"300","hadoop.user.group.static.mapping.overrides":"dr.who=;","fs.azure.sas.expiry.period":"90d","mapreduce.jobhistory.recovery.store.class":"org.apache.hadoop.mapreduce.v2.hs.HistoryServerFileSystemStateStoreService","yarn.resourcemanager.fail-fast":"${yarn.fail-fast}","yarn.resourcemanager.proxy-user-privileges.enabled":"false","yarn.router.webapp.interceptor-class.pipeline":"org.apache.hadoop.yarn.server.router.webapp.DefaultRequestInterceptorREST","yarn.nodemanager.resource.memory.cgroups.soft-limit-percentage":"90.0","mapreduce.job.reducer.preempt.delay.sec":"0","hadoop.util.hash.type":"murmur","yarn.nodemanager.disk-validator":"basic","yarn.app.mapreduce.client.job.max-retries":"3","mapreduce.reduce.shuffle.retry-delay.max.ms":"60000","hadoop.security.group.mapping.ldap.connection.timeout.ms":"60000","mapreduce.task.profile.params":"-agentlib:hprof=cpu=samples,heap=sites,force=n,thread=y,verbose=n,file=%s","yarn.app.mapreduce.shuffle.log.backups":"0","yarn.nodemanager.container-diagnostics-maximum-size":"10000","hadoop.registry.zk.retry.interval.ms":"1000","yarn.nodemanager.linux-container-executor.cgroups.delete-timeout-ms":"1000","fs.AbstractFileSystem.file.impl":"org.apache.hadoop.fs.local.LocalFs","yarn.nodemanager.log-aggregation.roll-monitoring-interval-seconds":"-1","mapreduce.jobhistory.cleaner.interval-ms":"86400000","hadoop.registry.zk.quorum":"ip-172-31-102-115.ec2.internal:2181","mapreduce.output.fileoutputformat.compress":"false","yarn.resourcemanager.am-rm-tokens.master-key-rolling-interval-secs":"*********(redacted)","fs.s3a.assumed.role.session.duration":"30m","hadoop.security.group.mapping.ldap.conversion.rule":"none","hadoop.proxyuser.livy.hosts":"*","hadoop.ssl.server.conf":"ssl-server.xml","fs.s3a.retry.throttle.interval":"1000ms","seq.io.sort.factor":"100","yarn.sharedcache.cleaner.initial-delay-mins":"10","mapreduce.client.completion.pollinterval":"5000","hadoop.ssl.keystores.factory.class":"org.apache.hadoop.security.ssl.FileBasedKeyStoresFactory","yarn.app.mapreduce.am.resource.cpu-vcores":"1","yarn.timeline-service.enabled":"false","yarn.nodemanager.runtime.linux.docker.capabilities":"CHOWN,DAC_OVERRIDE,FSETID,FOWNER,MKNOD,NET_RAW,SETGID,SETUID, SETFCAP,SETPCAP,NET_BIND_SERVICE,SYS_CHROOT,KILL,AUDIT_WRITE","yarn.acl.enable":"false","yarn.timeline-service.entity-group-fs-store.done-dir":"/tmp/entity-file-history/done/","mapreduce.tasktracker.map.tasks.maximum":"1","hadoop.security.group.mapping.ldap.num.attempts.before.failover":"3","mapreduce.task.profile":"false","dfs.webhdfs.enabled":"true","yarn.resourcemanager.fs.state-store.uri":"${hadoop.tmp.dir}/yarn/system/rmstore","mapreduce.jobhistory.always-scan-user-dir":"false","yarn.nodemanager.opportunistic-containers-use-pause-for-preemption":"false","yarn.nodemanager.linux-container-executor.nonsecure-mode.local-user":"hadoop","mapred.output.direct.EmrFileSystem":"true","yarn.timeline-service.reader.class":"org.apache.hadoop.yarn.server.timelineservice.storage.HBaseTimelineReaderImpl","yarn.resourcemanager.configuration.provider-class":"org.apache.hadoop.yarn.LocalConfigurationProvider","yarn.nodemanager.runtime.linux.docker.userremapping-uid-threshold":"1","yarn.resourcemanager.configuration.file-system-based-store":"/yarn/conf","mapreduce.job.cache.limit.max-single-resource-mb":"0","yarn.nodemanager.runtime.linux.docker.stop.grace-period":"10","yarn.resourcemanager.resource-profiles.source-file":"resource-profiles.json","yarn.nodemanager.resource.percentage-physical-cpu-limit":"100","mapreduce.jobhistory.client.thread-count":"10","tfile.fs.input.buffer.size":"262144","mapreduce.client.progressmonitor.pollinterval":"1000","yarn.nodemanager.log-dirs":"/var/log/hadoop-yarn/containers","hadoop.security.auth_to_local":"\n RULE:[1:$1@$0](.*@)s/@.*///L\n RULE:[2:$1@$0](.*@)s/@.*///L\n DEFAULT\n ","fs.automatic.close":"true","yarn.nodemanager.hostname":"0.0.0.0","yarn.nodemanager.resource.memory.cgroups.swappiness":"0","fs.s3n.impl":"com.amazon.ws.emr.hadoop.fs.EmrFileSystem","ftp.stream-buffer-size":"4096","yarn.fail-fast":"false","yarn.timeline-service.app-aggregation-interval-secs":"15","hadoop.security.group.mapping.ldap.search.filter.user":"(&(objectClass=user)(sAMAccountName={0}))","yarn.nodemanager.container-localizer.log.level":"INFO","yarn.timeline-service.address":"${yarn.timeline-service.hostname}:10200","dfs.namenode.replication.work.multiplier.per.iteration":"10","mapreduce.job.ubertask.maxmaps":"9","fs.s3a.threads.keepalivetime":"60","dfs.datanode.available-space-volume-choosing-policy.balanced-space-preference-fraction":"1.0","mapreduce.jobhistory.webapp.rest-csrf.methods-to-ignore":"GET,OPTIONS,HEAD","mapreduce.task.files.preserve.failedtasks":"false","yarn.app.mapreduce.client.job.retry-interval":"2000","ha.failover-controller.graceful-fence.connection.retries":"1","yarn.resourcemanager.delegation.token.max-lifetime":"*********(redacted)","yarn.timeline-service.client.drain-entities.timeout.ms":"2000","yarn.nodemanager.resource-plugins.fpga.vendor-plugin.class":"org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.IntelFpgaOpenclPlugin","yarn.timeline-service.entity-group-fs-store.summary-store":"org.apache.hadoop.yarn.server.timeline.RollingLevelDBTimelineStore","mapreduce.reduce.cpu.vcores":"1","hadoop.proxyuser.oozie.groups":"*","mapreduce.job.encrypted-intermediate-data.buffer.kb":"128","fs.client.resolve.remote.symlinks":"true","yarn.nodemanager.webapp.https.address":"0.0.0.0:8044","hadoop.http.cross-origin.allowed-origins":"*","mapreduce.job.encrypted-intermediate-data":"false","yarn.timeline-service.entity-group-fs-store.retain-seconds":"604800","yarn.resourcemanager.metrics.runtime.buckets":"60,300,1440","yarn.timeline-service.generic-application-history.max-applications":"10000","mapreduce.tasktracker.reduce.tasks.maximum":"1","yarn.nodemanager.local-dirs":"/mnt/yarn,/mnt1/yarn","mapreduce.shuffle.connection-keep-alive.enable":"false","yarn.node-labels.configuration-type":"distributed","fs.s3a.path.style.access":"false","yarn.nodemanager.aux-services.mapreduce_shuffle.class":"org.apache.hadoop.mapred.ShuffleHandler","yarn.sharedcache.store.in-memory.staleness-period-mins":"10080","fs.adl.impl":"org.apache.hadoop.fs.adl.AdlFileSystem","yarn.resourcemanager.nodemanager.minimum.version":"NONE","mapreduce.jobhistory.webapp.xfs-filter.xframe-options":"SAMEORIGIN","yarn.app.mapreduce.am.staging-dir.erasurecoding.enabled":"false","net.topology.impl":"org.apache.hadoop.net.NetworkTopology","io.map.index.skip":"0","fs.AbstractFileSystem.s3.impl":"org.apache.hadoop.fs.s3.EMRFSDelegate","yarn.timeline-service.reader.webapp.https.address":"${yarn.timeline-service.webapp.https.address}","fs.ftp.data.connection.mode":"ACTIVE_LOCAL_DATA_CONNECTION_MODE","mapreduce.job.userlog.retain.hours":"48","mapreduce.job.local-fs.single-disk-limit.check.kill-limit-exceed":"true","yarn.scheduler.maximum-allocation-vcores":"128","hadoop.http.cross-origin.allowed-headers":"X-Requested-With,Content-Type,Accept,Origin","yarn.nodemanager.log-aggregation.compression-type":"none","yarn.timeline-service.version":"1.5","yarn.ipc.rpc.class":"org.apache.hadoop.yarn.ipc.HadoopYarnProtoRPC","mapreduce.reduce.maxattempts":"4","hadoop.security.dns.log-slow-lookups.enabled":"false","mapreduce.job.committer.setup.cleanup.needed":"true","mapreduce.job.running.reduce.limit":"0","ipc.maximum.response.length":"134217728","yarn.resourcemanager.webapp.rest-csrf.methods-to-ignore":"GET,OPTIONS,HEAD","mapreduce.job.token.tracking.ids.enabled":"*********(redacted)","hadoop.caller.context.max.size":"128","yarn.nodemanager.runtime.linux.docker.host-pid-namespace.allowed":"false","yarn.nodemanager.runtime.linux.docker.delayed-removal.allowed":"false","hadoop.registry.system.acls":"sasl:yarn@, sasl:mapred@, sasl:hdfs@","yarn.nodemanager.recovery.dir":"${hadoop.tmp.dir}/yarn-nm-recovery","fs.s3a.fast.upload.buffer":"disk","mapreduce.jobhistory.intermediate-done-dir":"${yarn.app.mapreduce.am.staging-dir}/history/done_intermediate","yarn.app.mapreduce.shuffle.log.separate":"true","yarn.log-aggregation.debug.filesize":"104857600","fs.s3a.max.total.tasks":"5","fs.s3a.readahead.range":"64K","hadoop.http.authentication.simple.anonymous.allowed":"true","fs.s3a.attempts.maximum":"20","hadoop.registry.zk.connection.timeout.ms":"15000","yarn.resourcemanager.delegation-token-renewer.thread-count":"*********(redacted)","yarn.nodemanager.health-checker.script.timeout-ms":"1200000","yarn.timeline-service.leveldb-timeline-store.start-time-write-cache-size":"10000","yarn.nodemanager.emit-container-events":"true","yarn.log.server.url":"http://ip-172-31-102-115.ec2.internal:19888/jobhistory/logs","yarn.resourcemanager.resource-profiles.enabled":"false","yarn.timeline-service.hbase-schema.prefix":"prod.","fs.azure.authorization":"false","mapreduce.map.log.level":"INFO","yarn.resourcemanager.decommissioning-nodes-watcher.poll-interval-secs":"20","hadoop.job.history.user.location":"none","mapreduce.output.fileoutputformat.compress.type":"BLOCK","yarn.resourcemanager.leveldb-state-store.path":"${hadoop.tmp.dir}/yarn/system/rmstore","yarn.timeline-service.webapp.rest-csrf.custom-header":"X-XSRF-Header","mapreduce.ifile.readahead.bytes":"4194304","yarn.sharedcache.app-checker.class":"org.apache.hadoop.yarn.server.sharedcachemanager.RemoteAppChecker","yarn.nodemanager.linux-container-executor.nonsecure-mode.limit-users":"true","yarn.nodemanager.resource.detect-hardware-capabilities":"false","mapreduce.cluster.acls.enabled":"false","mapreduce.job.speculative.retry-after-no-speculate":"1000","hadoop.security.group.mapping.ldap.search.group.hierarchy.levels":"0","yarn.resourcemanager.fs.state-store.retry-interval-ms":"1000","hadoop.proxyuser.hadoop.hosts":"*","yarn.resourcemanager.nodes.exclude-path":"/emr/instance-controller/lib/yarn.nodes.exclude.xml","file.stream-buffer-size":"4096","yarn.resourcemanager.application-timeouts.monitor.interval-ms":"3000","mapreduce.map.output.compress.codec":"org.apache.hadoop.io.compress.SnappyCodec","mapreduce.map.speculative":"true","mapreduce.job.speculative.retry-after-speculate":"15000","yarn.nodemanager.linux-container-executor.cgroups.mount":"false","yarn.app.mapreduce.am.container.log.backups":"0","yarn.app.mapreduce.am.log.level":"INFO","mapreduce.job.reduce.slowstart.completedmaps":"0.05","yarn.timeline-service.http-authentication.type":"simple","hadoop.security.group.mapping.ldap.search.attr.group.name":"cn","yarn.nodemanager.resource-plugins.fpga.allowed-fpga-devices":"auto","yarn.timeline-service.client.internal-timers-ttl-secs":"420","hadoop.http.logs.enabled":"true","fs.s3a.block.size":"32M","yarn.sharedcache.client-server.address":"0.0.0.0:8045","yarn.nodemanager.logaggregation.threadpool-size-max":"100","yarn.resourcemanager.hostname":"172.31.102.115","yarn.resourcemanager.delegation.key.update-interval":"86400000","mapreduce.reduce.shuffle.fetch.retry.enabled":"${yarn.nodemanager.recovery.enabled}","mapreduce.map.memory.mb":"1536","mapreduce.task.skip.start.attempts":"2","fs.AbstractFileSystem.hdfs.impl":"org.apache.hadoop.fs.Hdfs","yarn.nodemanager.disk-health-checker.enable":"true","ipc.client.tcpnodelay":"true","ipc.client.rpc-timeout.ms":"0","yarn.nodemanager.webapp.rest-csrf.methods-to-ignore":"GET,OPTIONS,HEAD","ipc.client.low-latency":"false","mapreduce.input.lineinputformat.linespermap":"1","yarn.router.interceptor.user.threadpool-size":"5","ipc.client.connect.max.retries.on.timeouts":"5","yarn.timeline-service.leveldb-timeline-store.read-cache-size":"104857600","fs.AbstractFileSystem.har.impl":"org.apache.hadoop.fs.HarFs","mapreduce.job.split.metainfo.maxsize":"10000000","yarn.am.liveness-monitor.expiry-interval-ms":"600000","yarn.resourcemanager.container-tokens.master-key-rolling-interval-secs":"*********(redacted)","yarn.timeline-service.entity-group-fs-store.app-cache-size":"10","fs.s3a.socket.recv.buffer":"8192","yarn.application.classpath":"\n $HADOOP_CONF_DIR,\n $HADOOP_COMMON_HOME/*,$HADOOP_COMMON_HOME/lib/*,\n $HADOOP_HDFS_HOME/*,$HADOOP_HDFS_HOME/lib/*,\n $HADOOP_MAPRED_HOME/*,$HADOOP_MAPRED_HOME/lib/*,\n $HADOOP_YARN_HOME/*,$HADOOP_YARN_HOME/lib/*,\n /usr/lib/hadoop-lzo/lib/*,\n /usr/share/aws/emr/emrfs/conf,\n /usr/share/aws/emr/emrfs/lib/*,\n /usr/share/aws/emr/emrfs/auxlib/*,\n /usr/share/aws/emr/lib/*,\n /usr/share/aws/emr/ddb/lib/emr-ddb-hadoop.jar,\n /usr/share/aws/emr/goodies/lib/emr-hadoop-goodies.jar,\n /usr/lib/spark/yarn/lib/datanucleus-api-jdo.jar,\n /usr/lib/spark/yarn/lib/datanucleus-core.jar,\n /usr/lib/spark/yarn/lib/datanucleus-rdbms.jar,\n /usr/share/aws/emr/cloudwatch-sink/lib/*,\n /usr/share/aws/aws-java-sdk/*\n ","yarn.resourcemanager.resource-tracker.address":"ip-172-31-102-115.ec2.internal:8025","yarn.nodemanager.node-labels.provider.fetch-timeout-ms":"1200000","mapreduce.job.heap.memory-mb.ratio":"0.8","yarn.resourcemanager.leveldb-state-store.compaction-interval-secs":"3600","yarn.resourcemanager.webapp.rest-csrf.custom-header":"X-XSRF-Header","yarn.scheduler.configuration.fs.path":"file://${hadoop.tmp.dir}/yarn/system/schedconf","dfs.datanode.max.transfer.threads":"4096","mapreduce.client.output.filter":"FAILED","hadoop.http.filter.initializers":"org.apache.hadoop.security.HttpCrossOriginFilterInitializer,org.apache.hadoop.yarn.server.security.http.RMAuthenticationFilterInitializer,org.apache.hadoop.http.lib.StaticUserWebFilter","mapreduce.reduce.memory.mb":"3072","mapreduce.admin.user.env":"LD_LIBRARY_PATH=$HADOOP_COMMON_HOME/lib/native:/usr/lib/hadoop-lzo/lib/native","yarn.timeline-service.hostname":"ip-172-31-102-115.ec2.internal","file.replication":"1","yarn.nodemanager.container-metrics.unregister-delay-ms":"10000","yarn.nodemanager.container-metrics.period-ms":"-1","mapreduce.fileoutputcommitter.task.cleanup.enabled":"false","hadoop.proxyuser.oozie.hosts":"*","yarn.nodemanager.log.retain-seconds":"10800","yarn.timeline-service.entity-group-fs-store.cleaner-interval-seconds":"3600","yarn.resourcemanager.keytab":"/etc/krb5.keytab","hadoop.security.group.mapping.providers.combined":"true","mapreduce.reduce.merge.inmem.threshold":"1000","yarn.timeline-service.recovery.enabled":"false","fs.azure.saskey.usecontainersaskeyforallaccess":"true","yarn.sharedcache.nm.uploader.thread-count":"20","mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","yarn.resourcemanager.nodemanager-graceful-decommission-timeout-secs":"3600","mapreduce.shuffle.ssl.enabled":"false","yarn.timeline-service.hbase.coprocessor.app-final-value-retention-milliseconds":"259200000","fs.s3a.committer.staging.abort.pending.uploads":"true","yarn.nodemanager.opportunistic-containers-max-queue-length":"0","yarn.resourcemanager.state-store.max-completed-applications":"${yarn.resourcemanager.max-completed-applications}","mapreduce.job.speculative.minimum-allowed-tasks":"10","yarn.nodemanager.node-labels.provider":"config","yarn.log-aggregation.retain-seconds":"172800","yarn.nodemanager.disk-health-checker.min-free-space-per-disk-mb":"0","mapreduce.jobhistory.max-age-ms":"604800000","hadoop.http.cross-origin.allowed-methods":"GET,POST,HEAD","yarn.resourcemanager.opportunistic-container-allocation.enabled":"false","mapreduce.jobhistory.webapp.address":"ip-172-31-102-115.ec2.internal:19888","hadoop.system.tags":"YARN,HDFS,NAMENODE,DATANODE,REQUIRED,SECURITY,KERBEROS,PERFORMANCE,CLIENT\n ,SERVER,DEBUG,DEPRECATED,COMMON,OPTIONAL","yarn.log-aggregation.file-controller.TFile.class":"org.apache.hadoop.yarn.logaggregation.filecontroller.tfile.LogAggregationTFileController","yarn.client.nodemanager-connect.max-wait-ms":"180000","yarn.resourcemanager.webapp.address":"${yarn.resourcemanager.hostname}:8088","mapreduce.jobhistory.recovery.enable":"false","mapreduce.reduce.shuffle.parallelcopies":"20","fs.AbstractFileSystem.webhdfs.impl":"org.apache.hadoop.fs.WebHdfs","fs.trash.interval":"0","yarn.nodemanager.node-labels.provider.configured-node-partition":"CORE","yarn.app.mapreduce.client.max-retries":"3","hadoop.security.authentication":"simple","mapreduce.task.profile.reduce.params":"${mapreduce.task.profile.params}","dfs.datanode.du.reserved":"536870912","yarn.app.mapreduce.am.resource.mb":"3072","mapreduce.input.fileinputformat.list-status.num-threads":"1","io.compression.codec.lzo.class":"com.hadoop.compression.lzo.LzoCodec","yarn.nodemanager.container-executor.class":"org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor","io.mapfile.bloom.size":"1048576","yarn.timeline-service.ttl-ms":"604800000","yarn.resourcemanager.nm-container-queuing.min-queue-length":"5","yarn.nodemanager.resource.cpu-vcores":"8","mapreduce.job.reduces":"17","fs.s3a.multipart.size":"100M","mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","yarn.scheduler.minimum-allocation-vcores":"1","mapreduce.job.speculative.speculative-cap-total-tasks":"0.01","hadoop.ssl.client.conf":"ssl-client.xml","mapreduce.job.queuename":"default","mapreduce.job.encrypted-intermediate-data-key-size-bits":"128","fs.s3a.metadatastore.authoritative":"false","yarn.nodemanager.webapp.xfs-filter.xframe-options":"SAMEORIGIN","ha.health-monitor.sleep-after-disconnect.ms":"1000","yarn.app.mapreduce.shuffle.log.limit.kb":"0","hadoop.security.group.mapping":"org.apache.hadoop.security.JniBasedUnixGroupsMappingWithFallback","yarn.client.application-client-protocol.poll-timeout-ms":"-1","mapreduce.jobhistory.jhist.format":"binary","yarn.resourcemanager.ha.enabled":"false","hadoop.http.staticuser.user":"dr.who","mapreduce.task.exit.timeout.check-interval-ms":"20000","mapreduce.jobhistory.intermediate-user-done-dir.permissions":"770","mapreduce.task.exit.timeout":"60000","yarn.nodemanager.linux-container-executor.resources-handler.class":"org.apache.hadoop.yarn.server.nodemanager.util.DefaultLCEResourcesHandler","mapreduce.reduce.shuffle.memory.limit.percent":"0.25","yarn.resourcemanager.reservation-system.enable":"false","mapreduce.map.output.compress":"true","ha.zookeeper.acl":"world:anyone:rwcda","hadoop.proxyuser.presto.groups":"*","ipc.server.max.connections":"0","yarn.nodemanager.runtime.linux.docker.default-container-network":"host","yarn.router.webapp.address":"0.0.0.0:8089","yarn.scheduler.maximum-allocation-mb":"54272","yarn.resourcemanager.scheduler.monitor.policies":"org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity.ProportionalCapacityPreemptionPolicy","yarn.sharedcache.cleaner.period-mins":"1440","yarn.nodemanager.resource-plugins.gpu.docker-plugin.nvidia-docker-v1.endpoint":"http://localhost:3476/v1.0/docker/cli","yarn.app.mapreduce.am.container.log.limit.kb":"0","ipc.client.connect.retry.interval":"1000","yarn.timeline-service.http-cross-origin.enabled":"true","fs.wasbs.impl":"org.apache.hadoop.fs.azure.NativeAzureFileSystem$Secure","hadoop.proxyuser.httpfs.groups":"*","yarn.federation.subcluster-resolver.class":"org.apache.hadoop.yarn.server.federation.resolver.DefaultSubClusterResolverImpl","yarn.resourcemanager.zk-state-store.parent-path":"/rmstore","mapreduce.jobhistory.cleaner.enable":"true","yarn.timeline-service.client.fd-flush-interval-secs":"10","hadoop.security.kms.client.encrypted.key.cache.expiry":"43200000","hadoop.proxyuser.httpfs.hosts":"*","yarn.client.nodemanager-client-async.thread-pool-max-size":"500","mapreduce.map.maxattempts":"4","yarn.resourcemanager.nm-container-queuing.sorting-nodes-interval-ms":"1000","fs.s3a.committer.staging.tmp.path":"tmp/staging","yarn.nodemanager.sleep-delay-before-sigkill.ms":"250","yarn.resourcemanager.nm-container-queuing.min-queue-wait-time-ms":"10","mapreduce.job.end-notification.retry.attempts":"0","yarn.nodemanager.resource.count-logical-processors-as-cores":"false","mapred.output.direct.NativeS3FileSystem":"true","hadoop.registry.zk.root":"/registry","adl.feature.ownerandgroup.enableupn":"false","yarn.resourcemanager.zk-max-znode-size.bytes":"1048576","mapreduce.job.reduce.shuffle.consumer.plugin.class":"org.apache.hadoop.mapreduce.task.reduce.Shuffle","yarn.resourcemanager.delayed.delegation-token.removal-interval-ms":"*********(redacted)","yarn.nodemanager.localizer.cache.target-size-mb":"10240","fs.s3a.committer.staging.conflict-mode":"fail","mapreduce.client.libjars.wildcard":"true","fs.s3a.committer.staging.unique-filenames":"true","yarn.nodemanager.node-attributes.provider.fetch-timeout-ms":"1200000","fs.s3a.list.version":"2","ftp.client-write-packet-size":"65536","fs.AbstractFileSystem.adl.impl":"org.apache.hadoop.fs.adl.Adl","hadoop.proxyuser.hive.hosts":"*","yarn.node-labels.fs-store.root-dir":"file:///mnt/var/lib/hadoop-yarn/nodelabels","hadoop.security.key.default.cipher":"AES/CTR/NoPadding","yarn.client.failover-retries":"0","fs.s3a.multipart.purge.age":"86400","mapreduce.job.local-fs.single-disk-limit.check.interval-ms":"5000","net.topology.node.switch.mapping.impl":"org.apache.hadoop.net.ScriptBasedMapping","yarn.nodemanager.amrmproxy.address":"0.0.0.0:8049","ipc.server.listen.queue.size":"128","map.sort.class":"org.apache.hadoop.util.QuickSort","fs.viewfs.rename.strategy":"SAME_MOUNTPOINT","hadoop.security.kms.client.authentication.retry-count":"1","fs.permissions.umask-mode":"022","fs.s3a.assumed.role.credentials.provider":"org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider","yarn.nodemanager.vmem-check-enabled":"true","yarn.nodemanager.numa-awareness.enabled":"false","yarn.nodemanager.recovery.compaction-interval-secs":"3600","yarn.app.mapreduce.client-am.ipc.max-retries":"3","yarn.federation.registry.base-dir":"yarnfederation/","mapreduce.job.max.map":"-1","mapreduce.job.local-fs.single-disk-limit.bytes":"-1","mapreduce.job.ubertask.maxreduces":"1","hadoop.security.kms.client.encrypted.key.cache.size":"500","hadoop.security.java.secure.random.algorithm":"SHA1PRNG","ha.failover-controller.cli-check.rpc-timeout.ms":"20000","mapreduce.jobhistory.jobname.limit":"50","mapreduce.application.classpath":"\n $HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,\n $HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*,\n /usr/lib/hadoop-lzo/lib/*,\n /usr/share/aws/emr/emrfs/conf,\n /usr/share/aws/emr/emrfs/lib/*,\n /usr/share/aws/emr/emrfs/auxlib/*,\n /usr/share/aws/emr/lib/*,\n /usr/share/aws/emr/ddb/lib/emr-ddb-hadoop.jar,\n /usr/share/aws/emr/goodies/lib/emr-hadoop-goodies.jar,\n /usr/share/aws/emr/cloudwatch-sink/lib/*,\n /usr/share/aws/aws-java-sdk/*\n ","yarn.client.nodemanager-connect.retry-interval-ms":"10000","yarn.timeline-service.state-store-class":"org.apache.hadoop.yarn.server.timeline.recovery.LeveldbTimelineStateStore","yarn.nodemanager.env-whitelist":"JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME,PATH","yarn.sharedcache.nested-level":"3","yarn.timeline-service.webapp.rest-csrf.methods-to-ignore":"GET,OPTIONS,HEAD","fs.azure.user.agent.prefix":"unknown","yarn.resourcemanager.zk-delegation-token-node.split-index":"*********(redacted)","yarn.nodemanager.numa-awareness.read-topology":"false","yarn.nodemanager.webapp.address":"${yarn.nodemanager.hostname}:8042","rpc.metrics.quantile.enable":"false","yarn.registry.class":"org.apache.hadoop.registry.client.impl.FSRegistryOperationsService","mapreduce.jobhistory.admin.acl":"*","yarn.resourcemanager.system-metrics-publisher.dispatcher.pool-size":"10","yarn.scheduler.queue-placement-rules":"user-group","hadoop.http.authentication.kerberos.keytab":"${user.home}/hadoop.keytab","yarn.resourcemanager.recovery.enabled":"false","yarn.timeline-service.webapp.rest-csrf.enabled":"false","dfs.datanode.available-space-volume-choosing-policy.balanced-space-threshold":"10737418240"},"System Properties":{"java.io.tmpdir":"/tmp","line.separator":"\n","path.separator":":","sun.management.compiler":"HotSpot 64-Bit Tiered Compilers","SPARK_SUBMIT":"true","sun.cpu.endian":"little","java.specification.maintenance.version":"4","java.specification.version":"1.8","java.vm.specification.name":"Java Virtual Machine Specification","java.vendor":"Amazon.com Inc.","java.vm.specification.version":"1.8","user.home":"/home/hadoop","file.encoding.pkg":"sun.io","sun.nio.ch.bugLevel":"","sun.arch.data.model":"64","sun.boot.library.path":"/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre/lib/amd64","user.dir":"/mnt/var/lib/hadoop/steps/s-1EF238MZKOWWR","java.library.path":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native:/usr/java/packages/lib/amd64:/usr/lib64:/lib64:/lib:/usr/lib","sun.cpu.isalist":"","os.arch":"amd64","java.vm.version":"25.362-b08","jetty.git.hash":"84700530e645e812b336747464d6fbbf370c9a20","java.endorsed.dirs":"/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre/lib/endorsed","java.runtime.version":"1.8.0_362-b08","java.vm.info":"mixed mode","java.ext.dirs":"/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre/lib/ext:/usr/java/packages/lib/ext","java.runtime.name":"OpenJDK Runtime Environment","EMR_RELEASE_LABEL":"emr-6.2.0","file.separator":"/","java.class.version":"52.0","EMR_CLUSTER_ID":"j-14QV64S2PV1Y2","java.specification.name":"Java Platform API Specification","sun.boot.class.path":"/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre/lib/resources.jar:/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre/lib/rt.jar:/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre/lib/sunrsasign.jar:/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre/lib/jsse.jar:/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre/lib/jce.jar:/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre/lib/charsets.jar:/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre/lib/jfr.jar:/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre/classes","file.encoding":"UTF-8","user.timezone":"UTC","java.specification.vendor":"Oracle Corporation","sun.java.launcher":"SUN_STANDARD","os.version":"4.14.200-155.322.amzn2.x86_64","sun.os.patch.level":"unknown","java.vm.specification.vendor":"Oracle Corporation","user.country":"US","sun.jnu.encoding":"UTF-8","user.language":"en","java.vendor.url":"https://aws.amazon.com/corretto/","java.awt.printerjob":"sun.print.PSPrinterJob","java.awt.graphicsenv":"sun.awt.X11GraphicsEnvironment","awt.toolkit":"sun.awt.X11.XToolkit","os.name":"Linux","java.vm.vendor":"Amazon.com Inc.","java.vendor.url.bug":"https://github.com/corretto/corretto-8/issues/","user.name":"hadoop","java.vm.name":"OpenJDK 64-Bit Server VM","sun.java.command":"org.apache.spark.deploy.SparkSubmit /home/hadoop/index_data_etl_1GB.py","java.home":"/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre","java.version":"1.8.0_362","EMR_STEP_ID":"s-1EF238MZKOWWR","sun.io.unicode.encoding":"UnicodeLittle"},"Classpath Entries":{"/usr/share/aws/aws-java-sdk/aws-java-sdk-macie-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/cats-kernel_2.12-2.0.0-M4.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-mediaconvert-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/jersey-container-servlet-core-2.30.jar":"System Classpath","/usr/lib/spark/jars/jackson-core-asl-1.9.13.jar":"System Classpath","/usr/lib/spark/jars/jackson-databind-2.10.0.jar":"System Classpath","/usr/lib/spark/jars/spark-kvstore_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-elastictranscoder-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-appsync-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/json4s-ast_2.12-3.6.6.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-groundstation-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/scala-xml_2.12-1.2.0.jar":"System Classpath","/usr/lib/spark/conf/":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-efs-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/dnsjava-2.1.7.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/emrfs-hadoop-assembly-2.44.0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-mediaconnect-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-macie2-1.11.880.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/jcl-over-slf4j-1.7.21.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-codestar-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-organizations-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-directory-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/aopalliance-repackaged-2.6.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-health-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-ssooidc-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spark-tags_2.12-3.0.1-amzn-0-tests.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-forecastquery-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-frauddetector-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hadoop-common-3.2.1-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-machinelearning-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/osgi-resource-locator-1.0.3.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-backup-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/libfb303-0.9.3.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-servicequotas-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-cloudsearch-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/guice-4.0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-workspaces-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-honeycode-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-comprehend-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/datanucleus-rdbms-4.1.19.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-budgets-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/jakarta.inject-2.6.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-route53-1.11.880.jar":"System Classpath","/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-iotsitewise-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/machinist_2.12-0.6.8.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-iot-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-ebs-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/commons-daemon-1.0.13.jar":"System Classpath","/usr/lib/spark/jars/jsr305-3.0.0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-dlm-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-ses-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hive-serde-2.3.7-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/HikariCP-2.5.1.jar":"System Classpath","/usr/lib/spark/jars/jdo-api-3.0.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-signer-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spark-network-common_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/lib/spark/jars/netlib-native_system-osx-x86_64-1.1-natives.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-ioteventsdata-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-applicationinsights-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-imagebuilder-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-iam-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/JTransforms-3.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-elasticache-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-ecs-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/lz4-java-1.7.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-quicksight-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/kerb-server-1.0.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-cloudformation-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/commons-math3-3.4.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-snowball-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-docdb-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-resourcegroupstaggingapi-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/jackson-datatype-jsr310-2.10.3.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-dms-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/okhttp-3.12.6.jar":"System Classpath","/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar":"System Classpath","/usr/lib/spark/jars/parquet-encoding-1.10.1-spark-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-sagemaker-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/stax2-api-3.1.4.jar":"System Classpath","/usr/lib/spark/jars/libthrift-0.12.0.jar":"System Classpath","/usr/lib/spark/jars/kerby-config-1.0.1.jar":"System Classpath","/usr/lib/spark/jars/httpclient-4.5.9.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-codeguruprofiler-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/jpam-1.1.jar":"System Classpath","/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-ssoadmin-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/kryo-shaded-4.0.2.jar":"System Classpath","/usr/lib/spark/jars/json-smart-2.3.jar":"System Classpath","/usr/lib/spark/jars/commons-cli-1.2.jar":"System Classpath","/usr/lib/spark/jars/scala-compiler-2.12.10.jar":"System Classpath","/usr/lib/spark/jars/commons-beanutils-1.9.4.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-migrationhubconfig-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-codebuild-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/curator-client-2.13.0.jar":"System Classpath","/usr/lib/spark/jars/slf4j-api-1.7.30.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-stepfunctions-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-autoscaling-1.11.880.jar":"System Classpath","/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-cognitoidp-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/curator-framework-2.13.0.jar":"System Classpath","/usr/share/aws/emr/security/conf":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-guardduty-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/commons-configuration2-2.1.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-autoscalingplans-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/istack-commons-runtime-3.0.8.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-route53resolver-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/parquet-common-1.10.1-spark-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/hk2-locator-2.6.1.jar":"System Classpath","/usr/lib/spark/jars/commons-lang-2.6.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-codecommit-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-cognitosync-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/log4j-1.2.17.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-transfer-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-comprehendmedical-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-connectparticipant-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/curator-recipes-2.13.0.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/javax.inject-1.jar":"System Classpath","/usr/lib/spark/jars/jersey-client-2.30.jar":"System Classpath","/usr/lib/spark/jars/spark-mllib_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/lib/spark/jars/breeze-macros_2.12-1.0.jar":"System Classpath","/usr/lib/spark/jars/arrow-vector-0.15.1.jar":"System Classpath","/usr/lib/spark/jars/json4s-scalap_2.12-3.6.6.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-braket-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-events-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/kerb-core-1.0.1.jar":"System Classpath","/usr/lib/spark/jars/kerby-asn1-1.0.1.jar":"System Classpath","/usr/lib/spark/jars/py4j-0.10.9.jar":"System Classpath","/usr/lib/spark/jars/paranamer-2.8.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-appconfig-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hive-llap-common-2.3.7.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-networkmanager-1.11.880.jar":"System Classpath","/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar":"System Classpath","/usr/lib/spark/jars/jakarta.annotation-api-1.3.5.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/ion-java-1.0.2.jar":"System Classpath","/usr/lib/spark/jars/janino-3.0.16.jar":"System Classpath","/usr/lib/spark/jars/hadoop-auth-3.2.1-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/threeten-extra-1.5.0.jar":"System Classpath","/usr/lib/spark/jars/commons-collections-3.2.2.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/fluent-hc-4.5.6.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-secretsmanager-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/commons-logging-1.1.3.jar":"System Classpath","/usr/lib/spark/jars/jackson-jaxrs-json-provider-2.10.0.jar":"System Classpath","/usr/lib/spark/jars/objenesis-2.5.1.jar":"System Classpath","/usr/lib/spark/jars/chill-java-0.9.5.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-importexport-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-greengrass-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/arrow-format-0.15.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-chime-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-sqs-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/nimbus-jose-jwt-4.41.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-neptune-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/metrics-jvm-4.1.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-schemas-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/orc-core-1.5.10.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-marketplacecommerceanalytics-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hive-exec-2.3.7-amzn-2-core.jar":"System Classpath","/usr/lib/spark/jars/re2j-1.1.jar":"System Classpath","/usr/lib/spark/jars/hive-common-2.3.7-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-pinpointsmsvoice-1.11.880.jar":"System Classpath","/usr/lib/hadoop-lzo/lib/hadoop-lzo-0.4.19.jar":"System Classpath","/usr/lib/spark/jars/commons-codec-1.10.jar":"System Classpath","/usr/lib/spark/jars/compress-lzf-1.0.3.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-pi-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/bcprov-jdk15on-1.60.jar":"System Classpath","/usr/lib/spark/jars/jersey-common-2.30.jar":"System Classpath","/usr/lib/spark/jars/javax.jdo-3.2.0-m3.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-opsworkscm-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-config-1.11.880.jar":"System Classpath","/usr/share/aws/emr/security/lib/*":"System Classpath","/usr/lib/spark/jars/kerb-util-1.0.1.jar":"System Classpath","/usr/lib/spark/jars/netlib-native_ref-linux-i686-1.1-natives.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-costandusagereport-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/netlib-native_system-linux-armhf-1.1-natives.jar":"System Classpath","/usr/lib/spark/jars/jta-1.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-cloudtrail-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/commons-net-3.1.jar":"System Classpath","/usr/lib/spark/jars/netlib-native_system-linux-i686-1.1-natives.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-opsworks-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/slf4j-log4j12-1.7.30.jar":"System Classpath","/usr/lib/spark/jars/jackson-module-scala_2.12-2.10.0.jar":"System Classpath","/usr/lib/spark/jars/json-1.8.jar":"System Classpath","/usr/lib/spark/jars/macro-compat_2.12-1.1.1.jar":"System Classpath","/usr/lib/spark/jars/hadoop-mapreduce-client-common-3.2.1-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/stream-2.9.6.jar":"System Classpath","/usr/lib/spark/jars/jackson-annotations-2.10.0.jar":"System Classpath","/usr/lib/spark/jars/aircompressor-0.10.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-glue-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-kinesisvideosignalingchannels-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spark-launcher_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/lib/spark/jars/metrics-json-4.1.1.jar":"System Classpath","/usr/lib/spark/jars/httpcore-4.4.11.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-forecast-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-resourcegroups-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-appstream-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/parquet-format-2.4.0.jar":"System Classpath","/usr/lib/spark/jars/ehcache-3.3.1.jar":"System Classpath","/usr/lib/spark/jars/native_ref-java-1.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-synthetics-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spark-catalyst_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-cloudwatchmetrics-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-pricing-1.11.880.jar":"System Classpath","/etc/hadoop/conf/":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-sns-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-savingsplans-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-redshiftdataapi-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spark-mllib-local_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-codegurureviewer-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/native_system-java-1.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-logs-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hadoop-mapreduce-client-jobclient-3.2.1-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-alexaforbusiness-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/remotetea-oncrpc-1.1.2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-inspector-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-pinpoint-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-iotevents-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/woodstox-core-5.0.3.jar":"System Classpath","/usr/lib/spark/jars/commons-io-2.4.jar":"System Classpath","/usr/lib/spark/jars/htrace-core4-4.1.0-incubating.jar":"System Classpath","/usr/lib/spark/jars/jline-2.14.6.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-rekognition-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-qldbsession-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-dynamodb-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-wafv2-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-lex-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/accessors-smart-1.2.jar":"System Classpath","/usr/lib/spark/jars/jakarta.activation-api-1.2.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-elasticinference-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hive-shims-2.3.7-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-augmentedairuntime-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spark-network-shuffle_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/lib/spark/jars/commons-lang3-3.9.jar":"System Classpath","/usr/lib/spark/jars/activation-1.1.1.jar":"System Classpath","/usr/lib/spark/jars/hadoop-yarn-server-web-proxy-3.2.1-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/jackson-core-2.10.0.jar":"System Classpath","/usr/lib/spark/jars/netlib-native_ref-win-x86_64-1.1-natives.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-iotanalytics-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/netlib-native_ref-linux-armhf-1.1-natives.jar":"System Classpath","/usr/lib/spark/jars/flatbuffers-java-1.9.0.jar":"System Classpath","/usr/lib/spark/jars/shapeless_2.12-2.3.3.jar":"System Classpath","/usr/lib/spark/jars/jetty-rewrite-9.3.27.v20190418.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-timestreamquery-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-simpleworkflow-1.11.880.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/slf4j-api-1.7.21.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-codegen-maven-plugin-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-sso-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hive-shims-0.23-2.3.7-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-apigatewaymanagementapi-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/metrics-core-4.1.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-iot1clickprojects-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-datasync-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-athena-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/leveldbjni-all-1.8.jar":"System Classpath","/usr/lib/spark/jars/spire-platform_2.12-0.17.0-M1.jar":"System Classpath","/usr/lib/spark/jars/gmetric4j-1.0.10.jar":"System Classpath","/usr/lib/spark/jars/javax.inject-1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-batch-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-iotjobsdataplane-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/jersey-server-2.30.jar":"System Classpath","/usr/lib/spark/jars/kerby-xdr-1.0.1.jar":"System Classpath","/usr/lib/spark/jars/javax.servlet-api-3.1.0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-s3outposts-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-elasticloadbalancingv2-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spark-sql_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/lib/spark/jars/antlr4-runtime-4.7.1.jar":"System Classpath","/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-mediapackagevod-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/arrow-memory-0.15.1.jar":"System Classpath","/usr/lib/spark/jars/gson-2.2.4.jar":"System Classpath","/usr/lib/spark/jars/hadoop-yarn-registry-3.2.1-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-lexmodelbuilding-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-simpledb-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-core-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spire-macros_2.12-0.17.0-M1.jar":"System Classpath","/usr/lib/spark/jars/okio-1.15.0.jar":"System Classpath","/usr/lib/spark/jars/hadoop-annotations-3.2.1-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-devicefarm-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/netlib-native_system-win-x86_64-1.1-natives.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-kinesisanalyticsv2-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-mobile-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-textract-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/transaction-api-1.1.jar":"System Classpath","/usr/lib/spark/jars/kerby-util-1.0.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-elasticbeanstalk-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-servermigration-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/avro-mapred-1.8.2-hadoop2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-mechanicalturkrequester-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/protobuf-java-2.5.0.jar":"System Classpath","/usr/lib/spark/jars/emr-spark-goodies.jar":"System Classpath","/usr/lib/spark/jars/spark-ganglia-lgpl_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/lib/spark/jars/kerb-client-1.0.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-clouddirectory-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/jaxb-api-2.2.11.jar":"System Classpath","/usr/share/aws/aws-java-sdk/jmespath-java-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-kafka-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/netlib-native_ref-linux-x86_64-1.1-natives.jar":"System Classpath","/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-appflow-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/jersey-media-jaxb-2.30.jar":"System Classpath","/usr/lib/spark/jars/spark-graphx_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-personalize-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-cloud9-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spark-unsafe_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/lib/spark/jars/commons-compress-1.8.1.jar":"System Classpath","/usr/lib/spark/jars/kerby-pkix-1.0.1.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/lombok-1.18.4.jar":"System Classpath","/usr/lib/spark/jars/datanucleus-core-4.1.17.jar":"System Classpath","/usr/lib/spark/jars/hive-metastore-2.3.7-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/netty-all-4.1.47.Final.jar":"System Classpath","/usr/lib/spark/jars/spark-tags_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-personalizeevents-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/minlog-1.3.0.jar":"System Classpath","/usr/lib/spark/jars/hadoop-yarn-api-3.2.1-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/jackson-mapper-asl-1.9.13.jar":"System Classpath","/usr/lib/spark/jars/zjsonpatch-0.3.0.jar":"System Classpath","/usr/lib/spark/jars/jackson-dataformat-yaml-2.10.0.jar":"System Classpath","/usr/lib/spark/jars/netlib-native_system-linux-x86_64-1.1-natives.jar":"System Classpath","/usr/lib/spark/jars/orc-shims-1.5.10.jar":"System Classpath","/usr/lib/spark/jars/okhttp-2.7.5.jar":"System Classpath","/usr/lib/spark/jars/jniloader-1.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-acmpca-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-servicecatalog-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hadoop-yarn-common-3.2.1-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/spark-hive_2.12-3.0.1-amzn-0.jar":"System Classpath","/docker/usr/share/aws/emr/emrfs/conf":"System Classpath","/usr/lib/spark/jars/spark-repl_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/lib/spark/jars/orc-mapreduce-1.5.10.jar":"System Classpath","/usr/lib/spark/jars/avro-1.8.2.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/secret-agent-interface-1.3.0.jar":"System Classpath","/usr/lib/spark/jars/oro-2.0.8.jar":"System Classpath","/usr/lib/spark/jars/automaton-1.11-8.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-mq-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spark-sketch_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/lib/spark/jars/token-provider-1.0.1.jar":"System Classpath","/docker/usr/share/aws/emr/security/conf":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-appmesh-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-ec2-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/logging-interceptor-3.12.6.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-fsx-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-iot1clickdevices-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/xz-1.5.jar":"System Classpath","/usr/lib/spark/jars/json4s-core_2.12-3.6.6.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-securityhub-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-iotsecuretunneling-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-codedeploy-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hive-shims-scheduler-2.3.7-amzn-2.jar":"System Classpath","/docker/usr/share/aws/emr/emrfs/auxlib/*":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-glacier-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/commons-pool-1.5.4.jar":"System Classpath","/usr/lib/spark/jars/kerb-common-1.0.1.jar":"System Classpath","/usr/lib/spark/jars/bonecp-0.8.0.RELEASE.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-codestarnotifications-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hk2-utils-2.6.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-eks-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-cloudhsmv2-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hadoop-yarn-server-common-3.2.1-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-ram-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-marketplaceentitlement-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/netlib-native_ref-osx-x86_64-1.1-natives.jar":"System Classpath","/usr/lib/spark/jars/spark-hive-thriftserver_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-marketplacemeteringservice-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-workmail-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/scala-collection-compat_2.12-2.1.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-test-utils-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-s3-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/commons-dbcp-1.4.jar":"System Classpath","/usr/lib/spark/jars/jersey-hk2-2.30.jar":"System Classpath","/usr/lib/spark/jars/metrics-jmx-4.1.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-marketplacecatalog-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-servicediscovery-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hadoop-yarn-client-3.2.1-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-mediatailor-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/derby-10.12.1.1.jar":"System Classpath","/usr/lib/spark/jars/json4s-jackson_2.12-3.6.6.jar":"System Classpath","/usr/lib/spark/jars/scala-library-2.12.10.jar":"System Classpath","/usr/lib/spark/jars/hive-jdbc-2.3.7-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-redshift-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/commons-crypto-1.1.0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-lakeformation-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/metrics-graphite-4.1.1.jar":"System Classpath","/usr/lib/spark/jars/JLargeArrays-1.5.jar":"System Classpath","/usr/lib/spark/jars/zookeeper-3.4.14.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-codestarconnections-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-cloudfront-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/antlr-runtime-3.5.2.jar":"System Classpath","/usr/lib/spark/jars/generex-1.0.2.jar":"System Classpath","/usr/lib/spark/jars/parquet-jackson-1.10.1-spark-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/jackson-module-paranamer-2.10.0.jar":"System Classpath","/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/jmespath-java-1.11.852.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-xray-1.11.880.jar":"System Classpath","/usr/share/aws/emr/emrfs/auxlib/*":"System Classpath","/usr/lib/spark/jars/commons-text-1.6.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/mockito-core-1.10.19.jar":"System Classpath","/docker/usr/lib/hadoop/hadoop-aws.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-elasticsearch-1.11.880.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/objenesis-2.1.jar":"System Classpath","/docker/usr/share/aws/emr/emrfs/lib/*":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-api-gateway-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/parquet-column-1.10.1-spark-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/RoaringBitmap-0.7.45.jar":"System Classpath","/usr/lib/spark/jars/jackson-module-jaxb-annotations-2.10.0.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/bcprov-ext-jdk15on-1.66.jar":"System Classpath","/docker/usr/share/aws/aws-java-sdk/*":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-storagegateway-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-sts-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/avro-ipc-1.8.2.jar":"System Classpath","/usr/lib/spark/jars/jackson-jaxrs-base-2.10.0.jar":"System Classpath","/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-apigatewayv2-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/breeze_2.12-1.0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-directconnect-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spire-util_2.12-0.17.0-M1.jar":"System Classpath","/usr/lib/spark/jars/jcl-over-slf4j-1.7.30.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-globalaccelerator-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/datanucleus-api-jdo-4.2.4.jar":"System Classpath","/usr/lib/spark/jars/snappy-java-1.1.7.5.jar":"System Classpath","/usr/lib/spark/jars/dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-outposts-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/guice-servlet-4.0.jar":"System Classpath","/usr/lib/spark/jars/spark-yarn_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-support-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/jaxb-runtime-2.3.2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-eventbridge-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/bcpkix-jdk15on-1.60.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-qldb-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-cognitoidentity-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-cloudwatch-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-translate-1.11.880.jar":"System Classpath","/usr/lib/hadoop-lzo/lib/hadoop-lzo.jar":"System Classpath","/usr/lib/spark/jars/joda-time-2.10.5.jar":"System Classpath","/usr/lib/spark/jars/arpack_combined_all-0.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-detective-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/aopalliance-1.0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-kendra-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/jsp-api-2.1.jar":"System Classpath","/usr/lib/spark/jars/javassist-3.25.0-GA.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-identitystore-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-workdocs-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-kinesis-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-timestreamwrite-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-kinesisvideo-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-dataexchange-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-dax-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spire_2.12-0.17.0-M1.jar":"System Classpath","/usr/lib/spark/jars/core-1.1.2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-worklink-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/netlib-native_system-win-i686-1.1-natives.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-robomaker-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/kerb-admin-1.0.1.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/aws-glue-sdk-1.12.0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-lambda-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-iotthingsgraph-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-opensdk-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-models-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/jakarta.validation-api-2.0.2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-sesv2-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hive-cli-2.3.7-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-acm-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-lightsail-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-accessanalyzer-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-datapipeline-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hadoop-mapreduce-client-core-3.2.1-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-mediastoredata-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-ecr-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-codepipeline-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-computeoptimizer-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/scala-reflect-2.12.10.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-personalizeruntime-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/geronimo-jcache_1.0_spec-1.0-alpha-1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-shield-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hive-beeline-2.3.7-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/velocity-1.5.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-fms-1.11.880.jar":"System Classpath","/docker/usr/share/aws/emr/security/lib/*":"System Classpath","/usr/lib/spark/jars/chill_2.12-0.9.5.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-transcribe-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hive-shims-common-2.3.7-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/super-csv-2.2.0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-applicationautoscaling-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/jodd-core-3.5.2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-licensemanager-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-waf-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/univocity-parsers-2.9.0.jar":"System Classpath","/usr/lib/spark/jars/zstd-jni-1.4.4-3.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-pinpointemail-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-mediapackage-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hadoop-hdfs-client-3.2.1-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-gamelift-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/ivy-2.4.0.jar":"System Classpath","/usr/lib/spark/jars/hive-vector-code-gen-2.3.7-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-codeartifact-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/kerb-simplekdc-1.0.1.jar":"System Classpath","/usr/lib/spark/jars/scala-parser-combinators_2.12-1.1.2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-kms-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hk2-api-2.6.1.jar":"System Classpath","/usr/lib/hadoop/hadoop-aws.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-costexplorer-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hive-storage-api-2.7.1.jar":"System Classpath","/usr/lib/spark/jars/snakeyaml-1.24.jar":"System Classpath","/usr/lib/spark/jars/jakarta.ws.rs-api-2.1.6.jar":"System Classpath","/usr/lib/spark/jars/jul-to-slf4j-1.7.30.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-polly-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-connect-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-rds-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/ST4-4.0.4.jar":"System Classpath","/usr/lib/spark/jars/opencsv-2.3.jar":"System Classpath","/usr/lib/spark/jars/stax-api-1.0.1.jar":"System Classpath","/usr/lib/spark/jars/parquet-hadoop-1.10.1-spark-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-amplify-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/javolution-5.5.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-managedblockchain-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-emr-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-migrationhub-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-s3control-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/xbean-asm7-shaded-4.15.jar":"System Classpath","/usr/lib/spark/jars/algebra_2.12-2.0.0-M2.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/annotations-16.0.2.jar":"System Classpath","/usr/lib/spark/jars/spark-streaming_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/lib/spark/jars/kerb-crypto-1.0.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-ssm-1.11.880.jar":"System Classpath","/usr/share/aws/emr/emrfs/conf/":"System Classpath","/usr/lib/spark/jars/jersey-container-servlet-2.30.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-serverlessapplicationrepository-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/kerb-identity-1.0.1.jar":"System Classpath","/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-sagemakerruntime-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/guava-14.0.1.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/aopalliance-1.0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-cloudhsm-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spark-core_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-ec2instanceconnect-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-code-generator-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hadoop-client-3.2.1-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/shims-0.7.45.jar":"System Classpath","/usr/lib/spark/jars/commons-compiler-3.0.16.jar":"System Classpath","/usr/lib/spark/jars/jcip-annotations-1.0-1.jar":"System Classpath","/usr/lib/spark/jars/pyrolite-4.30.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-medialive-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-rdsdata-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-elasticloadbalancing-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-ivs-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-discovery-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/jakarta.xml.bind-api-2.3.2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-mediastore-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-workmailmessageflow-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/netlib-native_ref-win-i686-1.1-natives.jar":"System Classpath","/docker/usr/lib/hadoop-lzo/lib/*":"System Classpath"}} -{"Event":"SparkListenerApplicationStart","App Name":"index_data_etl_1GB","App ID":"application_1678162862227_0001","Timestamp":1678162946352,"User":"hadoop"} -{"Event":"SparkListenerExecutorAdded","Timestamp":1678162968878,"Executor ID":"7","Executor Info":{"Host":"ip-172-31-102-249.ec2.internal","Total Cores":4,"Log Urls":{"stdout":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000009/hadoop/stdout?start=-4096","stderr":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000009/hadoop/stderr?start=-4096"},"Attributes":{"NM_HTTP_ADDRESS":"ip-172-31-102-249.ec2.internal:8042","USER":"hadoop","LOG_FILES":"stderr,stdout","NM_HTTP_PORT":"8042","CLUSTER_ID":"","NM_PORT":"8041","HTTP_SCHEME":"http://","NM_HOST":"ip-172-31-102-249.ec2.internal","CONTAINER_ID":"container_1678162862227_0001_01_000009"},"Resources":{}}} -{"Event":"SparkListenerExecutorAdded","Timestamp":1678162968886,"Executor ID":"1","Executor Info":{"Host":"ip-172-31-102-249.ec2.internal","Total Cores":4,"Log Urls":{"stdout":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000002/hadoop/stdout?start=-4096","stderr":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000002/hadoop/stderr?start=-4096"},"Attributes":{"NM_HTTP_ADDRESS":"ip-172-31-102-249.ec2.internal:8042","USER":"hadoop","LOG_FILES":"stderr,stdout","NM_HTTP_PORT":"8042","CLUSTER_ID":"","NM_PORT":"8041","HTTP_SCHEME":"http://","NM_HOST":"ip-172-31-102-249.ec2.internal","CONTAINER_ID":"container_1678162862227_0001_01_000002"},"Resources":{}}} -{"Event":"SparkListenerExecutorAdded","Timestamp":1678162968889,"Executor ID":"8","Executor Info":{"Host":"ip-172-31-102-249.ec2.internal","Total Cores":4,"Log Urls":{"stdout":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000010/hadoop/stdout?start=-4096","stderr":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000010/hadoop/stderr?start=-4096"},"Attributes":{"NM_HTTP_ADDRESS":"ip-172-31-102-249.ec2.internal:8042","USER":"hadoop","LOG_FILES":"stderr,stdout","NM_HTTP_PORT":"8042","CLUSTER_ID":"","NM_PORT":"8041","HTTP_SCHEME":"http://","NM_HOST":"ip-172-31-102-249.ec2.internal","CONTAINER_ID":"container_1678162862227_0001_01_000010"},"Resources":{}}} -{"Event":"SparkListenerExecutorAdded","Timestamp":1678162968899,"Executor ID":"3","Executor Info":{"Host":"ip-172-31-102-249.ec2.internal","Total Cores":4,"Log Urls":{"stdout":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000005/hadoop/stdout?start=-4096","stderr":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000005/hadoop/stderr?start=-4096"},"Attributes":{"NM_HTTP_ADDRESS":"ip-172-31-102-249.ec2.internal:8042","USER":"hadoop","LOG_FILES":"stderr,stdout","NM_HTTP_PORT":"8042","CLUSTER_ID":"","NM_PORT":"8041","HTTP_SCHEME":"http://","NM_HOST":"ip-172-31-102-249.ec2.internal","CONTAINER_ID":"container_1678162862227_0001_01_000005"},"Resources":{}}} -{"Event":"SparkListenerExecutorAdded","Timestamp":1678162968934,"Executor ID":"2","Executor Info":{"Host":"ip-172-31-102-249.ec2.internal","Total Cores":4,"Log Urls":{"stdout":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000003/hadoop/stdout?start=-4096","stderr":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000003/hadoop/stderr?start=-4096"},"Attributes":{"NM_HTTP_ADDRESS":"ip-172-31-102-249.ec2.internal:8042","USER":"hadoop","LOG_FILES":"stderr,stdout","NM_HTTP_PORT":"8042","CLUSTER_ID":"","NM_PORT":"8041","HTTP_SCHEME":"http://","NM_HOST":"ip-172-31-102-249.ec2.internal","CONTAINER_ID":"container_1678162862227_0001_01_000003"},"Resources":{}}} -{"Event":"SparkListenerExecutorAdded","Timestamp":1678162968941,"Executor ID":"5","Executor Info":{"Host":"ip-172-31-102-249.ec2.internal","Total Cores":4,"Log Urls":{"stdout":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000007/hadoop/stdout?start=-4096","stderr":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000007/hadoop/stderr?start=-4096"},"Attributes":{"NM_HTTP_ADDRESS":"ip-172-31-102-249.ec2.internal:8042","USER":"hadoop","LOG_FILES":"stderr,stdout","NM_HTTP_PORT":"8042","CLUSTER_ID":"","NM_PORT":"8041","HTTP_SCHEME":"http://","NM_HOST":"ip-172-31-102-249.ec2.internal","CONTAINER_ID":"container_1678162862227_0001_01_000007"},"Resources":{}}} -{"Event":"SparkListenerExecutorAdded","Timestamp":1678162968952,"Executor ID":"4","Executor Info":{"Host":"ip-172-31-102-249.ec2.internal","Total Cores":4,"Log Urls":{"stdout":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000006/hadoop/stdout?start=-4096","stderr":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000006/hadoop/stderr?start=-4096"},"Attributes":{"NM_HTTP_ADDRESS":"ip-172-31-102-249.ec2.internal:8042","USER":"hadoop","LOG_FILES":"stderr,stdout","NM_HTTP_PORT":"8042","CLUSTER_ID":"","NM_PORT":"8041","HTTP_SCHEME":"http://","NM_HOST":"ip-172-31-102-249.ec2.internal","CONTAINER_ID":"container_1678162862227_0001_01_000006"},"Resources":{}}} -{"Event":"SparkListenerExecutorAdded","Timestamp":1678162969012,"Executor ID":"6","Executor Info":{"Host":"ip-172-31-102-249.ec2.internal","Total Cores":4,"Log Urls":{"stdout":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000008/hadoop/stdout?start=-4096","stderr":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000008/hadoop/stderr?start=-4096"},"Attributes":{"NM_HTTP_ADDRESS":"ip-172-31-102-249.ec2.internal:8042","USER":"hadoop","LOG_FILES":"stderr,stdout","NM_HTTP_PORT":"8042","CLUSTER_ID":"","NM_PORT":"8041","HTTP_SCHEME":"http://","NM_HOST":"ip-172-31-102-249.ec2.internal","CONTAINER_ID":"container_1678162862227_0001_01_000008"},"Resources":{}}} -{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Port":42615},"Maximum Memory":2415289958,"Timestamp":1678162969091,"Maximum Onheap Memory":2415289958,"Maximum Offheap Memory":0} -{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Port":41251},"Maximum Memory":2415289958,"Timestamp":1678162969097,"Maximum Onheap Memory":2415289958,"Maximum Offheap Memory":0} -{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Port":46355},"Maximum Memory":2415289958,"Timestamp":1678162969125,"Maximum Onheap Memory":2415289958,"Maximum Offheap Memory":0} -{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Port":36581},"Maximum Memory":2415289958,"Timestamp":1678162969143,"Maximum Onheap Memory":2415289958,"Maximum Offheap Memory":0} -{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Port":41805},"Maximum Memory":2415289958,"Timestamp":1678162969185,"Maximum Onheap Memory":2415289958,"Maximum Offheap Memory":0} -{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Port":40771},"Maximum Memory":2415289958,"Timestamp":1678162969222,"Maximum Onheap Memory":2415289958,"Maximum Offheap Memory":0} -{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Port":37423},"Maximum Memory":2415289958,"Timestamp":1678162969237,"Maximum Onheap Memory":2415289958,"Maximum Offheap Memory":0} -{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Port":34425},"Maximum Memory":2415289958,"Timestamp":1678162969295,"Maximum Onheap Memory":2415289958,"Maximum Offheap Memory":0} -{"Event":"SparkListenerJobStart","Job ID":0,"Submission Time":1678162971172,"Stage Infos":[{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"parquet at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":1,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"1\",\"name\":\"mapPartitions\"}","Callsite":"parquet at NativeMethodAccessorImpl.java:0","Parent IDs":[0],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":0,"Name":"ParallelCollectionRDD","Scope":"{\"id\":\"0\",\"name\":\"parallelize\"}","Callsite":"parquet at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:755)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]}],"Stage IDs":[0],"Properties":{"spark.rdd.scope":"{\"id\":\"2\",\"name\":\"collect\"}","spark.rdd.scope.noOverride":"true"}} -{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"parquet at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":1,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"1\",\"name\":\"mapPartitions\"}","Callsite":"parquet at NativeMethodAccessorImpl.java:0","Parent IDs":[0],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":0,"Name":"ParallelCollectionRDD","Scope":"{\"id\":\"0\",\"name\":\"parallelize\"}","Callsite":"parquet at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:755)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162971221,"Accumulables":[]},"Properties":{"spark.rdd.scope":"{\"id\":\"2\",\"name\":\"collect\"}","spark.rdd.scope.noOverride":"true"}} -{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":0,"Index":0,"Attempt":0,"Launch Time":1678162971381,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":0,"Index":0,"Attempt":0,"Launch Time":1678162971381,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162973008,"Failed":false,"Killed":false,"Accumulables":[{"ID":7,"Name":"internal.metrics.resultSerializationTime","Update":6,"Value":6,"Internal":true,"Count Failed Values":true},{"ID":6,"Name":"internal.metrics.jvmGCTime","Update":105,"Value":105,"Internal":true,"Count Failed Values":true},{"ID":5,"Name":"internal.metrics.resultSize","Update":6448,"Value":6448,"Internal":true,"Count Failed Values":true},{"ID":4,"Name":"internal.metrics.executorCpuTime","Update":123884829,"Value":123884829,"Internal":true,"Count Failed Values":true},{"ID":3,"Name":"internal.metrics.executorRunTime","Update":1010,"Value":1010,"Internal":true,"Count Failed Values":true},{"ID":2,"Name":"internal.metrics.executorDeserializeCpuTime","Update":397259197,"Value":397259197,"Internal":true,"Count Failed Values":true},{"ID":1,"Name":"internal.metrics.executorDeserializeTime","Update":523,"Value":523,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":523,"Executor Deserialize CPU Time":397259197,"Executor Run Time":1010,"Executor CPU Time":123884829,"Peak Execution Memory":0,"Result Size":6448,"JVM GC Time":105,"Result Serialization Time":6,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"parquet at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":1,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"1\",\"name\":\"mapPartitions\"}","Callsite":"parquet at NativeMethodAccessorImpl.java:0","Parent IDs":[0],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":0,"Name":"ParallelCollectionRDD","Scope":"{\"id\":\"0\",\"name\":\"parallelize\"}","Callsite":"parquet at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:755)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162971221,"Completion Time":1678162973025,"Accumulables":[{"ID":2,"Name":"internal.metrics.executorDeserializeCpuTime","Value":397259197,"Internal":true,"Count Failed Values":true},{"ID":5,"Name":"internal.metrics.resultSize","Value":6448,"Internal":true,"Count Failed Values":true},{"ID":4,"Name":"internal.metrics.executorCpuTime","Value":123884829,"Internal":true,"Count Failed Values":true},{"ID":7,"Name":"internal.metrics.resultSerializationTime","Value":6,"Internal":true,"Count Failed Values":true},{"ID":1,"Name":"internal.metrics.executorDeserializeTime","Value":523,"Internal":true,"Count Failed Values":true},{"ID":3,"Name":"internal.metrics.executorRunTime","Value":1010,"Internal":true,"Count Failed Values":true},{"ID":6,"Name":"internal.metrics.jvmGCTime","Value":105,"Internal":true,"Count Failed Values":true}]}} -{"Event":"SparkListenerJobEnd","Job ID":0,"Completion Time":1678162973033,"Job Result":{"Result":"JobSucceeded"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":0,"description":"createOrReplaceTempView at NativeMethodAccessorImpl.java:0","details":"org.apache.spark.sql.Dataset.createOrReplaceTempView(Dataset.scala:3312)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","physicalPlanDescription":"== Parsed Logical Plan ==\nCreateViewCommand `alexa_top_1m`, false, true, LocalTempView\n +- Relation[rank#0,site#1] csv\n\n== Analyzed Logical Plan ==\n\nCreateViewCommand `alexa_top_1m`, false, true, LocalTempView\n +- Relation[rank#0,site#1] csv\n\n== Optimized Logical Plan ==\nCreateViewCommand `alexa_top_1m`, false, true, LocalTempView\n +- Relation[rank#0,site#1] csv\n\n== Physical Plan ==\nExecute CreateViewCommand\n +- CreateViewCommand `alexa_top_1m`, false, true, LocalTempView\n +- Relation[rank#0,site#1] csv\n","sparkPlanInfo":{"nodeName":"Execute CreateViewCommand","simpleString":"Execute CreateViewCommand","children":[],"metadata":{},"metrics":[]},"time":1678162975023} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerEffectiveSQLConf","executionId":0,"effectiveSQLConf":{"spark.sql.adaptive.coalescePartitions.initialPartitionNum":"","spark.sql.streaming.disabledV2Writers":"","spark.sql.streaming.noDataMicroBatches.enabled":"true","spark.sql.files.maxPartitionBytes":"128MB","spark.sql.thriftserver.ui.retainedStatements":"200","spark.sql.ui.retainedExecutions":"1000","spark.sql.execution.arrow.pyspark.fallback.enabled":"","spark.sql.sources.parallelPartitionDiscovery.threshold":"32","spark.sql.cbo.joinReorder.enabled":"false","spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.int96AsTimestamp":"true","spark.sql.thriftserver.ui.retainedSessions":"200","spark.sql.hive.verifyPartitionPath":"false","spark.sql.autoBroadcastJoinThreshold":"10MB","spark.sql.orc.mergeSchema":"false","spark.sql.adaptive.enabled":"true","spark.sql.adaptive.skewJoin.enabled":"true","spark.sql.optimizer.sizeBasedJoinReorder.enabled":"true","spark.sql.streaming.ui.retainedQueries":"100","spark.sql.orc.enableVectorizedReader":"true","spark.sql.streaming.continuous.epochBacklogQueueSize":"10000","spark.sql.parquet.binaryAsString":"false","spark.sql.pivotMaxValues":"10000","spark.sql.adaptive.localShuffleReader.enabled":"true","spark.sql.storeAssignmentPolicy":"ANSI","spark.sql.redaction.options.regex":"(?i)url","spark.sql.cbo.joinReorder.dp.star.filter":"false","spark.sql.parser.quotedRegexColumnNames":"false","spark.sql.files.ignoreMissingFiles":"false","spark.sql.streaming.ui.enabled":"true","spark.sql.execution.arrow.sparkr.enabled":"false","spark.sql.orc.filterPushdown":"true","spark.sql.thriftserver.scheduler.pool":"","spark.sql.catalog.spark_catalog":"","spark.sql.datetime.java8API.enabled":"false","spark.sql.execution.pandas.udf.buffer.size":"","spark.sql.function.eltOutputAsString":"false","spark.sql.cbo.joinReorder.dp.threshold":"12","spark.sql.statsImprovements.enabled":"true","spark.sql.parquet.columnarReaderBatchSize":"4096","spark.sql.parquet.outputTimestampType":"INT96","spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes":"256MB","spark.sql.statistics.size.autoUpdate.enabled":"false","spark.sql.hive.filesourcePartitionFileCacheSize":"262144000","spark.sql.streaming.ui.retainedProgressUpdates":"100","spark.sql.inMemoryColumnarStorage.compressed":"true","spark.sql.pyspark.jvmStacktrace.enabled":"false","spark.sql.cbo.starSchemaDetection":"false","spark.sql.columnNameOfCorruptRecord":"_corrupt_record","spark.sql.adaptive.advisoryPartitionSizeInBytes":"","spark.sql.avro.compression.codec":"snappy","spark.sql.streaming.metricsEnabled":"false","spark.sql.ui.pruneCachedInMemoryRelation":"true","spark.sql.event.truncate.length":"2147483647","spark.sql.groupByAliases":"true","spark.sql.hive.metastorePartitionPruning":"true","spark.sql.execution.arrow.enabled":"false","spark.sql.optimizer.dynamicPartitionPruning.enabled":"true","spark.sql.extendedEventInfo":"true","spark.sql.sources.bucketing.maxBuckets":"100000","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.jsonGenerator.ignoreNullFields":"true","spark.sql.optimizer.flattenScalarSubqueriesWithAggregates.enabled":"true","spark.sql.debug.maxToStringFields":"25","spark.sql.files.maxRecordsPerFile":"0","spark.sql.hive.thriftServer.singleSession":"false","spark.sql.parquet.mergeSchema":"false","spark.sql.adaptive.skewJoin.skewedPartitionFactor":"5","spark.sql.queryExecutionListeners":"","spark.sql.streaming.numRecentProgressUpdates":"100","spark.sql.variable.substitute":"true","spark.sql.adaptive.shuffle.mapOutputStats.useDataSize":"true","spark.sql.function.concatBinaryAsString":"false","spark.sql.avro.deflate.level":"-1","spark.sql.repl.eagerEval.maxNumRows":"20","spark.sql.ansi.enabled":"false","spark.sql.legacy.allowHashOnMapType":"false","spark.sql.bloomFilterJoin.maxFilterBytes":"2097152","spark.sql.orderByOrdinal":"true","spark.sql.streaming.stopTimeout":"0","spark.sql.adaptive.coalescePartitions.enabled":"true","spark.sql.session.timeZone":"UTC","spark.sql.parquet.respectSummaryFiles":"false","spark.sql.parquet.enableVectorizedReader":"true","spark.sql.cbo.enabled":"false","spark.sql.sources.bucketing.enabled":"true","spark.sql.csv.filterPushdown.enabled":"true","spark.sql.repl.eagerEval.truncate":"20","spark.sql.cbo.planStats.enabled":"false","spark.sql.optimizer.excludedRules":"","spark.sql.sources.partitionColumnTypeInference.enabled":"true","spark.sql.sortMergeJoinExec.extendedCodegen.enabled":"true","spark.sql.execution.arrow.fallback.enabled":"true","spark.sql.streaming.forceDeleteTempCheckpointLocation":"false","spark.sql.maxPlanStringLength":"2147483632","spark.sql.hive.manageFilesourcePartitions":"true","spark.sql.parquet.int96TimestampConversion":"false","spark.sql.sources.partitionOverwriteMode":"STATIC","spark.sql.streaming.checkpointLocation":"","spark.sql.broadcastTimeout":"300","spark.sql.execution.arrow.pyspark.enabled":"","spark.sql.repl.eagerEval.enabled":"false","spark.sql.mapKeyDedupPolicy":"EXCEPTION","spark.sql.parquet.filterPushdown":"true","spark.sql.maven.additionalRemoteRepositories":"https://maven-central.storage-download.googleapis.com/maven2/","spark.sql.orc.compression.codec":"snappy","spark.sql.statistics.histogram.enabled":"false","spark.sql.streaming.stopActiveRunOnRestart":"true","spark.sql.orc.columnarReaderBatchSize":"4096","spark.sql.redaction.string.regex":"","spark.sql.inMemoryColumnarStorage.batchSize":"10000","spark.sql.parquet.writeLegacyFormat":"false","spark.sql.statistics.fallBackToHdfs":"false","spark.sql.defaultCatalog":"spark_catalog","spark.sql.streaming.fileSource.cleaner.numThreads":"1","spark.sql.legacy.sessionInitWithConfigDefaults":"false","spark.sql.adaptive.coalescePartitions.minPartitionNum":"","spark.sql.execution.arrow.maxRecordsPerBatch":"10000","spark.sql.stringHashComputationWithoutCopyMemory.enabled":"true","spark.sql.streaming.streamingQueryListeners":"","spark.sql.files.ignoreCorruptFiles":"false","spark.sql.inMemoryColumnarStorage.enableVectorizedReader":"true","spark.sql.extensions":"","spark.sql.sources.default":"parquet","spark.sql.ui.extendedInfo":"true","spark.sql.bloomFilterJoin.enabled":"true","spark.sql.parquet.compression.codec":"snappy","spark.sql.optimizer.distinctBeforeIntersect.enabled":"true","spark.sql.parquet.recordLevelFilter.enabled":"false","spark.sql.shuffle.partitions":"200","spark.sql.groupByOrdinal":"true"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerQueryExecutionMetrics","executionId":0,"timePerRule":{"PruneFileSourcePartitions":65031,"ReassignLambdaVariableID":74591,"PushPredicateThroughNonJoin":31582,"Analyzer$HandleNullInputsForUDF":19631,"Analyzer$ResolveSubqueryColumnAliases":5119,"ResolveTimeZone":13477,"Analyzer$ResolveNamespace":5947,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":9689,"RewriteCorrelatedScalarSubquery":59190,"RemoveLiteralFromGroupExpressions":34657,"PushProjectionThroughUnion":59799,"EliminateSubqueryAliases":43229,"ResolveCatalogs":9023,"PushLeftSemiLeftAntiThroughJoin":59095,"FlattenScalarSubqueriesWithAggregates":73889,"LikeSimplification":128631,"CollapseRepartition":71716,"ResolveHints$ResolveCoalesceHints":5149,"Analyzer$ExtractGenerator":56142,"RewriteIntersectAll":33920,"ResolveHints$ResolveJoinStrategyHints":6880,"TypeCoercion$MapZipWithCoercion":15994,"NullPropagation":75242,"PullupCorrelatedPredicates":39261,"UpdateOuterReferences":9590,"ExtractPythonUDFs":90498,"Analyzer$WindowsSubstitution":8868,"CombineUnions":89241,"ExtractGroupingPythonUDFFromAggregate":40453,"ReorderAssociativeOperator":158229,"CleanupDynamicPruningFilters":75059,"ResolveHints$RemoveAllHints":7474,"SimplifyBinaryComparison":67857,"ResolveTableValuedFunctions":8978,"EliminateSerialization":57335,"TypeCoercion$BooleanEquality":13069,"ReplaceIntersectWithSemiJoin":29699,"ConstantPropagation":122845,"CostBasedJoinReorder":21772,"Analyzer$ResolveReferences":79250,"CTESubstitution":389476,"RemoveRedundantAliases":62454,"TypeCoercion$ImplicitTypeCasts":18742,"RewriteExceptAll":36159,"UpdateAttributeNullability":68886,"PropagateEmptyRelation":79072,"SimplifyCasts":126285,"EliminateMapObjects":67859,"CombineLimits":59089,"DetectAmbiguousSelfJoin":34215,"ReplaceExpressions":71910,"ResolveInlineTables":5552,"OptimizeIn":76727,"CollapseWindow":68907,"TypeCoercion$IfCoercion":17121,"ResolveSessionCatalog":14402,"PartitionPruning":58093,"BooleanSimplification":77570,"TypeCoercion$PromoteStrings":16082,"Analyzer$ResolveAliases":6025,"DecimalAggregates":41811,"PruneFilters":85111,"Analyzer$ResolveMissingReferences":5237,"TransposeWindow":75740,"Analyzer$ResolveRelations":12484,"EliminateUnions":26302,"RewritePredicateSubquery":34612,"ObjectSerializerPruning":30512,"LimitPushDown":58347,"SimplifyCaseConversionExpressions":69891,"Analyzer$ResolveNaturalAndUsingJoin":5548,"EliminateView":54668,"CombineTypedFilters":29742,"OptimizeLimitZero":41606,"CheckCartesianProducts":33966,"ExtractPythonUDFFromAggregate":39520,"Analyzer$ExtractWindowExpressions":11498,"ReplaceExceptWithAntiJoin":31764,"ResolveLambdaVariables":11667,"FallBackFileSourceV2":5248,"Analyzer$ResolveTables":8621,"SubstituteUnresolvedOrdinals":6411,"TypeCoercion$CaseWhenCoercion":17399,"DecimalPrecision":25499,"EliminateSorts":36162,"PushDownLeftSemiAntiJoin":59408,"ExtractPythonUDFFromJoinCondition":43077,"TypeCoercion$StackCoercion":16290,"Analyzer$ResolveAggAliasInGroupBy":5493,"TypeCoercion$StringLiteralCoercion":16011,"FoldablePropagation":114280,"V2ScanRelationPushDown":63452,"EliminateDistinct":9832,"InferFiltersFromConstraints":58309,"Analyzer$PullOutNondeterministic":11905,"Analyzer$ResolveFunctions":13399,"ReplaceNullWithFalseInPredicate":65222,"ResolveHigherOrderFunctions":14448,"Analyzer$ResolvePivot":6080,"CollapseProject":108089,"Analyzer$ResolveNewInstance":11369,"ColumnPruning":287750,"Analyzer$ResolveWindowOrder":15958,"TypeCoercion$ConcatCoercion":14857,"PushDownPredicates":176247,"TimeWindowing":11171,"Optimizer$OptimizeSubqueries":199735,"RewriteNonCorrelatedExists":86837,"TypeCoercion$Division":16636,"ComputeCurrentTime":111116,"ResolveCreateNamedStruct":16593,"TypeCoercion$EltCoercion":15296,"ConvertToLocalRelation":68286,"RemoveRepetitionFromGroupExpressions":31985,"ReplaceDistinctWithAggregate":30058,"PreprocessTableCreation":11763,"ResolveSQLOnFile":5162,"Analyzer$ResolveSubquery":5673,"CombineConcats":13144,"Analyzer$ResolveGroupingAnalytics":12499,"Analyzer$ResolveBinaryArithmetic":15679,"RemoveDispensableExpressions":127535,"Analyzer$ResolveAlterTableChanges":8775,"ResolveEncodersInScalaAgg":14326,"TypeCoercion$IntegralDivision":15617,"Analyzer$ResolveWindowFrame":11699,"Analyzer$ResolveDeserializer":50044,"RewriteDistinctAggregates":44577,"RemoveNoopOperators":125186,"Analyzer$ResolveAggregateFunctions":5410,"NormalizeFloatingNumbers":33502,"ReorderJoin":64222,"Analyzer$ResolveUpCast":8679,"Analyzer$ResolveGenerate":6741,"TypeCoercion$WidenSetOperationTypes":6000,"EliminateOuterJoin":67393,"SimplifyExtractValueOps":70804,"OptimizeMetadataOnlyQuery":14153,"EliminateResolvedHint":89547,"Analyzer$ResolveInsertInto":5349,"ReplaceExceptWithFilter":54416,"CleanupAliases":18110,"GetCurrentDatabase":177455,"SchemaPruning":351703,"Analyzer$ResolveOutputRelation":5540,"BloomFilterJoinRule":48922,"Analyzer$ResolveRandomSeed":5887,"TypeCoercion$WindowFrameCoercion":16730,"ConstantFolding":67504,"TypeCoercion$DateTimeOperations":14465,"TypeCoercion$InConversion":19141,"FindDataSourceTable":7874,"SimplifyConditionals":70378,"DataSourceAnalysis":6744,"TypeCoercion$FunctionArgumentConversion":15823,"Analyzer$GlobalAggregates":5394,"Analyzer$LookupFunctions":90831,"CombineFilters":88454,"ReplaceDeduplicateWithAggregate":33529,"PreprocessTableInsertion":5546},"numRunsPerRule":{"PruneFileSourcePartitions":1,"ReassignLambdaVariableID":1,"PushPredicateThroughNonJoin":1,"Analyzer$HandleNullInputsForUDF":1,"Analyzer$ResolveSubqueryColumnAliases":1,"ResolveTimeZone":1,"Analyzer$ResolveNamespace":1,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":1,"RewriteCorrelatedScalarSubquery":2,"RemoveLiteralFromGroupExpressions":1,"PushProjectionThroughUnion":2,"EliminateSubqueryAliases":1,"ResolveCatalogs":1,"PushLeftSemiLeftAntiThroughJoin":2,"FlattenScalarSubqueriesWithAggregates":1,"LikeSimplification":2,"CollapseRepartition":2,"ResolveHints$ResolveCoalesceHints":1,"Analyzer$ExtractGenerator":1,"RewriteIntersectAll":1,"ResolveHints$ResolveJoinStrategyHints":1,"TypeCoercion$MapZipWithCoercion":1,"NullPropagation":2,"PullupCorrelatedPredicates":1,"UpdateOuterReferences":1,"ExtractPythonUDFs":1,"Analyzer$WindowsSubstitution":1,"CombineUnions":3,"ExtractGroupingPythonUDFFromAggregate":1,"ReorderAssociativeOperator":2,"CleanupDynamicPruningFilters":1,"ResolveHints$RemoveAllHints":1,"SimplifyBinaryComparison":2,"ResolveTableValuedFunctions":1,"EliminateSerialization":2,"TypeCoercion$BooleanEquality":1,"ReplaceIntersectWithSemiJoin":1,"ConstantPropagation":2,"CostBasedJoinReorder":1,"Analyzer$ResolveReferences":1,"CTESubstitution":1,"RemoveRedundantAliases":2,"TypeCoercion$ImplicitTypeCasts":1,"RewriteExceptAll":1,"UpdateAttributeNullability":1,"PropagateEmptyRelation":2,"SimplifyCasts":2,"EliminateMapObjects":1,"CombineLimits":2,"DetectAmbiguousSelfJoin":1,"ReplaceExpressions":1,"ResolveInlineTables":1,"OptimizeIn":2,"CollapseWindow":2,"TypeCoercion$IfCoercion":1,"ResolveSessionCatalog":1,"PartitionPruning":1,"BooleanSimplification":2,"TypeCoercion$PromoteStrings":1,"Analyzer$ResolveAliases":1,"DecimalAggregates":1,"PruneFilters":3,"Analyzer$ResolveMissingReferences":1,"TransposeWindow":2,"Analyzer$ResolveRelations":1,"EliminateUnions":1,"RewritePredicateSubquery":1,"ObjectSerializerPruning":1,"LimitPushDown":2,"SimplifyCaseConversionExpressions":2,"Analyzer$ResolveNaturalAndUsingJoin":1,"EliminateView":1,"CombineTypedFilters":1,"OptimizeLimitZero":1,"CheckCartesianProducts":2,"ExtractPythonUDFFromAggregate":1,"Analyzer$ExtractWindowExpressions":1,"ReplaceExceptWithAntiJoin":1,"ResolveLambdaVariables":1,"FallBackFileSourceV2":1,"Analyzer$ResolveTables":1,"SubstituteUnresolvedOrdinals":1,"TypeCoercion$CaseWhenCoercion":1,"DecimalPrecision":1,"EliminateSorts":1,"PushDownLeftSemiAntiJoin":2,"ExtractPythonUDFFromJoinCondition":1,"TypeCoercion$StackCoercion":1,"Analyzer$ResolveAggAliasInGroupBy":1,"TypeCoercion$StringLiteralCoercion":1,"FoldablePropagation":2,"V2ScanRelationPushDown":1,"EliminateDistinct":1,"InferFiltersFromConstraints":1,"Analyzer$PullOutNondeterministic":1,"Analyzer$ResolveFunctions":1,"ReplaceNullWithFalseInPredicate":2,"ResolveHigherOrderFunctions":1,"Analyzer$ResolvePivot":1,"CollapseProject":3,"Analyzer$ResolveNewInstance":1,"ColumnPruning":4,"Analyzer$ResolveWindowOrder":1,"TypeCoercion$ConcatCoercion":1,"PushDownPredicates":4,"TimeWindowing":1,"Optimizer$OptimizeSubqueries":3,"RewriteNonCorrelatedExists":1,"TypeCoercion$Division":1,"ComputeCurrentTime":1,"ResolveCreateNamedStruct":1,"TypeCoercion$EltCoercion":1,"ConvertToLocalRelation":2,"RemoveRepetitionFromGroupExpressions":1,"ReplaceDistinctWithAggregate":1,"PreprocessTableCreation":1,"ResolveSQLOnFile":1,"Analyzer$ResolveSubquery":1,"CombineConcats":2,"Analyzer$ResolveGroupingAnalytics":1,"Analyzer$ResolveBinaryArithmetic":1,"RemoveDispensableExpressions":2,"Analyzer$ResolveAlterTableChanges":1,"ResolveEncodersInScalaAgg":1,"TypeCoercion$IntegralDivision":1,"Analyzer$ResolveWindowFrame":1,"Analyzer$ResolveDeserializer":1,"RewriteDistinctAggregates":1,"RemoveNoopOperators":4,"Analyzer$ResolveAggregateFunctions":1,"NormalizeFloatingNumbers":1,"ReorderJoin":2,"Analyzer$ResolveUpCast":1,"Analyzer$ResolveGenerate":1,"TypeCoercion$WidenSetOperationTypes":1,"EliminateOuterJoin":2,"SimplifyExtractValueOps":2,"OptimizeMetadataOnlyQuery":1,"EliminateResolvedHint":1,"Analyzer$ResolveInsertInto":1,"ReplaceExceptWithFilter":1,"CleanupAliases":1,"GetCurrentDatabase":1,"SchemaPruning":1,"Analyzer$ResolveOutputRelation":1,"BloomFilterJoinRule":1,"Analyzer$ResolveRandomSeed":1,"TypeCoercion$WindowFrameCoercion":1,"ConstantFolding":2,"TypeCoercion$DateTimeOperations":1,"TypeCoercion$InConversion":1,"FindDataSourceTable":1,"SimplifyConditionals":2,"DataSourceAnalysis":1,"TypeCoercion$FunctionArgumentConversion":1,"Analyzer$GlobalAggregates":1,"Analyzer$LookupFunctions":1,"CombineFilters":3,"ReplaceDeduplicateWithAggregate":1,"PreprocessTableInsertion":1},"numEffectiveRunsPerRule":{},"timeEffectiveRunsPerRule":{},"counters":{},"timers":{}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":0,"time":1678162975117} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":1,"description":"createOrReplaceTempView at NativeMethodAccessorImpl.java:0","details":"org.apache.spark.sql.Dataset.createOrReplaceTempView(Dataset.scala:3312)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","physicalPlanDescription":"== Parsed Logical Plan ==\nCreateViewCommand `ccindex`, false, true, LocalTempView\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nCreateViewCommand `ccindex`, false, true, LocalTempView\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nCreateViewCommand `ccindex`, false, true, LocalTempView\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Physical Plan ==\nExecute CreateViewCommand\n +- CreateViewCommand `ccindex`, false, true, LocalTempView\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n","sparkPlanInfo":{"nodeName":"Execute CreateViewCommand","simpleString":"Execute CreateViewCommand","children":[],"metadata":{},"metrics":[]},"time":1678162975192} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerEffectiveSQLConf","executionId":1,"effectiveSQLConf":{"spark.sql.adaptive.coalescePartitions.initialPartitionNum":"","spark.sql.streaming.disabledV2Writers":"","spark.sql.streaming.noDataMicroBatches.enabled":"true","spark.sql.files.maxPartitionBytes":"128MB","spark.sql.thriftserver.ui.retainedStatements":"200","spark.sql.ui.retainedExecutions":"1000","spark.sql.execution.arrow.pyspark.fallback.enabled":"","spark.sql.sources.parallelPartitionDiscovery.threshold":"32","spark.sql.cbo.joinReorder.enabled":"false","spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.int96AsTimestamp":"true","spark.sql.thriftserver.ui.retainedSessions":"200","spark.sql.hive.verifyPartitionPath":"false","spark.sql.autoBroadcastJoinThreshold":"10MB","spark.sql.orc.mergeSchema":"false","spark.sql.adaptive.enabled":"true","spark.sql.adaptive.skewJoin.enabled":"true","spark.sql.optimizer.sizeBasedJoinReorder.enabled":"true","spark.sql.streaming.ui.retainedQueries":"100","spark.sql.orc.enableVectorizedReader":"true","spark.sql.streaming.continuous.epochBacklogQueueSize":"10000","spark.sql.parquet.binaryAsString":"false","spark.sql.pivotMaxValues":"10000","spark.sql.adaptive.localShuffleReader.enabled":"true","spark.sql.storeAssignmentPolicy":"ANSI","spark.sql.redaction.options.regex":"(?i)url","spark.sql.cbo.joinReorder.dp.star.filter":"false","spark.sql.parser.quotedRegexColumnNames":"false","spark.sql.files.ignoreMissingFiles":"false","spark.sql.streaming.ui.enabled":"true","spark.sql.execution.arrow.sparkr.enabled":"false","spark.sql.orc.filterPushdown":"true","spark.sql.thriftserver.scheduler.pool":"","spark.sql.catalog.spark_catalog":"","spark.sql.datetime.java8API.enabled":"false","spark.sql.execution.pandas.udf.buffer.size":"","spark.sql.function.eltOutputAsString":"false","spark.sql.cbo.joinReorder.dp.threshold":"12","spark.sql.statsImprovements.enabled":"true","spark.sql.parquet.columnarReaderBatchSize":"4096","spark.sql.parquet.outputTimestampType":"INT96","spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes":"256MB","spark.sql.statistics.size.autoUpdate.enabled":"false","spark.sql.hive.filesourcePartitionFileCacheSize":"262144000","spark.sql.streaming.ui.retainedProgressUpdates":"100","spark.sql.inMemoryColumnarStorage.compressed":"true","spark.sql.pyspark.jvmStacktrace.enabled":"false","spark.sql.cbo.starSchemaDetection":"false","spark.sql.columnNameOfCorruptRecord":"_corrupt_record","spark.sql.adaptive.advisoryPartitionSizeInBytes":"","spark.sql.avro.compression.codec":"snappy","spark.sql.streaming.metricsEnabled":"false","spark.sql.ui.pruneCachedInMemoryRelation":"true","spark.sql.event.truncate.length":"2147483647","spark.sql.groupByAliases":"true","spark.sql.hive.metastorePartitionPruning":"true","spark.sql.execution.arrow.enabled":"false","spark.sql.optimizer.dynamicPartitionPruning.enabled":"true","spark.sql.extendedEventInfo":"true","spark.sql.sources.bucketing.maxBuckets":"100000","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.jsonGenerator.ignoreNullFields":"true","spark.sql.optimizer.flattenScalarSubqueriesWithAggregates.enabled":"true","spark.sql.debug.maxToStringFields":"25","spark.sql.files.maxRecordsPerFile":"0","spark.sql.hive.thriftServer.singleSession":"false","spark.sql.parquet.mergeSchema":"false","spark.sql.adaptive.skewJoin.skewedPartitionFactor":"5","spark.sql.queryExecutionListeners":"","spark.sql.streaming.numRecentProgressUpdates":"100","spark.sql.variable.substitute":"true","spark.sql.adaptive.shuffle.mapOutputStats.useDataSize":"true","spark.sql.function.concatBinaryAsString":"false","spark.sql.avro.deflate.level":"-1","spark.sql.repl.eagerEval.maxNumRows":"20","spark.sql.ansi.enabled":"false","spark.sql.legacy.allowHashOnMapType":"false","spark.sql.bloomFilterJoin.maxFilterBytes":"2097152","spark.sql.orderByOrdinal":"true","spark.sql.streaming.stopTimeout":"0","spark.sql.adaptive.coalescePartitions.enabled":"true","spark.sql.session.timeZone":"UTC","spark.sql.parquet.respectSummaryFiles":"false","spark.sql.parquet.enableVectorizedReader":"true","spark.sql.cbo.enabled":"false","spark.sql.sources.bucketing.enabled":"true","spark.sql.csv.filterPushdown.enabled":"true","spark.sql.repl.eagerEval.truncate":"20","spark.sql.cbo.planStats.enabled":"false","spark.sql.optimizer.excludedRules":"","spark.sql.sources.partitionColumnTypeInference.enabled":"true","spark.sql.sortMergeJoinExec.extendedCodegen.enabled":"true","spark.sql.execution.arrow.fallback.enabled":"true","spark.sql.streaming.forceDeleteTempCheckpointLocation":"false","spark.sql.maxPlanStringLength":"2147483632","spark.sql.hive.manageFilesourcePartitions":"true","spark.sql.parquet.int96TimestampConversion":"false","spark.sql.sources.partitionOverwriteMode":"STATIC","spark.sql.streaming.checkpointLocation":"","spark.sql.broadcastTimeout":"300","spark.sql.execution.arrow.pyspark.enabled":"","spark.sql.repl.eagerEval.enabled":"false","spark.sql.mapKeyDedupPolicy":"EXCEPTION","spark.sql.parquet.filterPushdown":"true","spark.sql.maven.additionalRemoteRepositories":"https://maven-central.storage-download.googleapis.com/maven2/","spark.sql.orc.compression.codec":"snappy","spark.sql.statistics.histogram.enabled":"false","spark.sql.streaming.stopActiveRunOnRestart":"true","spark.sql.orc.columnarReaderBatchSize":"4096","spark.sql.redaction.string.regex":"","spark.sql.inMemoryColumnarStorage.batchSize":"10000","spark.sql.parquet.writeLegacyFormat":"false","spark.sql.statistics.fallBackToHdfs":"false","spark.sql.defaultCatalog":"spark_catalog","spark.sql.streaming.fileSource.cleaner.numThreads":"1","spark.sql.legacy.sessionInitWithConfigDefaults":"false","spark.sql.adaptive.coalescePartitions.minPartitionNum":"","spark.sql.execution.arrow.maxRecordsPerBatch":"10000","spark.sql.stringHashComputationWithoutCopyMemory.enabled":"true","spark.sql.streaming.streamingQueryListeners":"","spark.sql.files.ignoreCorruptFiles":"false","spark.sql.inMemoryColumnarStorage.enableVectorizedReader":"true","spark.sql.extensions":"","spark.sql.sources.default":"parquet","spark.sql.ui.extendedInfo":"true","spark.sql.bloomFilterJoin.enabled":"true","spark.sql.parquet.compression.codec":"snappy","spark.sql.optimizer.distinctBeforeIntersect.enabled":"true","spark.sql.parquet.recordLevelFilter.enabled":"false","spark.sql.shuffle.partitions":"200","spark.sql.groupByOrdinal":"true"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerQueryExecutionMetrics","executionId":1,"timePerRule":{"PruneFileSourcePartitions":71400,"ReassignLambdaVariableID":74086,"PushPredicateThroughNonJoin":31467,"Analyzer$HandleNullInputsForUDF":22196,"Analyzer$ResolveSubqueryColumnAliases":5965,"ResolveTimeZone":12842,"Analyzer$ResolveNamespace":8155,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":11584,"RewriteCorrelatedScalarSubquery":56173,"RemoveLiteralFromGroupExpressions":29963,"PushProjectionThroughUnion":57958,"EliminateSubqueryAliases":45359,"ResolveCatalogs":9207,"PushLeftSemiLeftAntiThroughJoin":61969,"FlattenScalarSubqueriesWithAggregates":48938,"LikeSimplification":124594,"CollapseRepartition":71007,"ResolveHints$ResolveCoalesceHints":5501,"Analyzer$ExtractGenerator":40418,"RewriteIntersectAll":26828,"ResolveHints$ResolveJoinStrategyHints":7353,"TypeCoercion$MapZipWithCoercion":14020,"NullPropagation":73646,"PullupCorrelatedPredicates":39717,"UpdateOuterReferences":11232,"ExtractPythonUDFs":75995,"Analyzer$WindowsSubstitution":9154,"CombineUnions":88239,"ExtractGroupingPythonUDFFromAggregate":37191,"ReorderAssociativeOperator":139635,"CleanupDynamicPruningFilters":72665,"ResolveHints$RemoveAllHints":14804,"SimplifyBinaryComparison":71547,"ResolveTableValuedFunctions":7542,"EliminateSerialization":54822,"TypeCoercion$BooleanEquality":10925,"ReplaceIntersectWithSemiJoin":29423,"ConstantPropagation":58549,"CostBasedJoinReorder":18171,"Analyzer$ResolveReferences":66766,"CTESubstitution":232204,"RemoveRedundantAliases":58406,"TypeCoercion$ImplicitTypeCasts":14614,"RewriteExceptAll":32468,"UpdateAttributeNullability":88326,"PropagateEmptyRelation":78263,"SimplifyCasts":141663,"EliminateMapObjects":71028,"CombineLimits":60820,"DetectAmbiguousSelfJoin":22750,"ReplaceExpressions":70683,"ResolveInlineTables":5349,"OptimizeIn":81946,"CollapseWindow":70449,"TypeCoercion$IfCoercion":12885,"ResolveSessionCatalog":10621,"PartitionPruning":69955,"BooleanSimplification":76390,"TypeCoercion$PromoteStrings":13763,"Analyzer$ResolveAliases":5773,"DecimalAggregates":37589,"PruneFilters":87158,"Analyzer$ResolveMissingReferences":5411,"TransposeWindow":76661,"Analyzer$ResolveRelations":12350,"EliminateUnions":12390,"RewritePredicateSubquery":32092,"ObjectSerializerPruning":29148,"LimitPushDown":58792,"SimplifyCaseConversionExpressions":66246,"Analyzer$ResolveNaturalAndUsingJoin":5896,"EliminateView":38153,"CombineTypedFilters":31469,"OptimizeLimitZero":42393,"CheckCartesianProducts":32436,"ExtractPythonUDFFromAggregate":36890,"Analyzer$ExtractWindowExpressions":12035,"ReplaceExceptWithAntiJoin":30105,"ResolveLambdaVariables":9909,"FallBackFileSourceV2":3658,"Analyzer$ResolveTables":8810,"SubstituteUnresolvedOrdinals":6224,"TypeCoercion$CaseWhenCoercion":13331,"DecimalPrecision":23639,"EliminateSorts":38901,"PushDownLeftSemiAntiJoin":65334,"ExtractPythonUDFFromJoinCondition":37561,"TypeCoercion$StackCoercion":12301,"Analyzer$ResolveAggAliasInGroupBy":5399,"TypeCoercion$StringLiteralCoercion":7782,"FoldablePropagation":109453,"V2ScanRelationPushDown":52250,"EliminateDistinct":10525,"InferFiltersFromConstraints":50509,"Analyzer$PullOutNondeterministic":10982,"Analyzer$ResolveFunctions":11245,"ReplaceNullWithFalseInPredicate":69271,"ResolveHigherOrderFunctions":13090,"Analyzer$ResolvePivot":5998,"CollapseProject":104742,"Analyzer$ResolveNewInstance":10224,"ColumnPruning":272433,"Analyzer$ResolveWindowOrder":15719,"TypeCoercion$ConcatCoercion":13842,"PushDownPredicates":210176,"TimeWindowing":12054,"Optimizer$OptimizeSubqueries":205284,"RewriteNonCorrelatedExists":67806,"TypeCoercion$Division":88255,"ComputeCurrentTime":116027,"ResolveCreateNamedStruct":12962,"TypeCoercion$EltCoercion":12812,"ConvertToLocalRelation":60269,"RemoveRepetitionFromGroupExpressions":28848,"ReplaceDistinctWithAggregate":28582,"PreprocessTableCreation":6172,"ResolveSQLOnFile":3849,"Analyzer$ResolveSubquery":5838,"CombineConcats":13589,"Analyzer$ResolveGroupingAnalytics":14805,"Analyzer$ResolveBinaryArithmetic":10992,"RemoveDispensableExpressions":144866,"Analyzer$ResolveAlterTableChanges":7892,"ResolveEncodersInScalaAgg":8512,"TypeCoercion$IntegralDivision":17610,"Analyzer$ResolveWindowFrame":10534,"Analyzer$ResolveDeserializer":46628,"RewriteDistinctAggregates":45360,"RemoveNoopOperators":114961,"Analyzer$ResolveAggregateFunctions":5722,"NormalizeFloatingNumbers":33213,"ReorderJoin":63610,"Analyzer$ResolveUpCast":8480,"Analyzer$ResolveGenerate":6100,"TypeCoercion$WidenSetOperationTypes":6144,"EliminateOuterJoin":56424,"SimplifyExtractValueOps":63752,"OptimizeMetadataOnlyQuery":17254,"EliminateResolvedHint":3505095,"Analyzer$ResolveInsertInto":5618,"ReplaceExceptWithFilter":53754,"CleanupAliases":16136,"GetCurrentDatabase":173342,"SchemaPruning":89692,"Analyzer$ResolveOutputRelation":5294,"BloomFilterJoinRule":50934,"Analyzer$ResolveRandomSeed":5902,"TypeCoercion$WindowFrameCoercion":11960,"ConstantFolding":67181,"TypeCoercion$DateTimeOperations":10934,"TypeCoercion$InConversion":18371,"FindDataSourceTable":4928,"SimplifyConditionals":72778,"DataSourceAnalysis":4525,"TypeCoercion$FunctionArgumentConversion":15045,"Analyzer$GlobalAggregates":5439,"Analyzer$LookupFunctions":48926,"CombineFilters":86396,"ReplaceDeduplicateWithAggregate":31154,"PreprocessTableInsertion":3962},"numRunsPerRule":{"PruneFileSourcePartitions":1,"ReassignLambdaVariableID":1,"PushPredicateThroughNonJoin":1,"Analyzer$HandleNullInputsForUDF":1,"Analyzer$ResolveSubqueryColumnAliases":1,"ResolveTimeZone":1,"Analyzer$ResolveNamespace":1,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":1,"RewriteCorrelatedScalarSubquery":2,"RemoveLiteralFromGroupExpressions":1,"PushProjectionThroughUnion":2,"EliminateSubqueryAliases":1,"ResolveCatalogs":1,"PushLeftSemiLeftAntiThroughJoin":2,"FlattenScalarSubqueriesWithAggregates":1,"LikeSimplification":2,"CollapseRepartition":2,"ResolveHints$ResolveCoalesceHints":1,"Analyzer$ExtractGenerator":1,"RewriteIntersectAll":1,"ResolveHints$ResolveJoinStrategyHints":1,"TypeCoercion$MapZipWithCoercion":1,"NullPropagation":2,"PullupCorrelatedPredicates":1,"UpdateOuterReferences":1,"ExtractPythonUDFs":1,"Analyzer$WindowsSubstitution":1,"CombineUnions":3,"ExtractGroupingPythonUDFFromAggregate":1,"ReorderAssociativeOperator":2,"CleanupDynamicPruningFilters":1,"ResolveHints$RemoveAllHints":1,"SimplifyBinaryComparison":2,"ResolveTableValuedFunctions":1,"EliminateSerialization":2,"TypeCoercion$BooleanEquality":1,"ReplaceIntersectWithSemiJoin":1,"ConstantPropagation":2,"CostBasedJoinReorder":1,"Analyzer$ResolveReferences":1,"CTESubstitution":1,"RemoveRedundantAliases":2,"TypeCoercion$ImplicitTypeCasts":1,"RewriteExceptAll":1,"UpdateAttributeNullability":1,"PropagateEmptyRelation":2,"SimplifyCasts":2,"EliminateMapObjects":1,"CombineLimits":2,"DetectAmbiguousSelfJoin":1,"ReplaceExpressions":1,"ResolveInlineTables":1,"OptimizeIn":2,"CollapseWindow":2,"TypeCoercion$IfCoercion":1,"ResolveSessionCatalog":1,"PartitionPruning":1,"BooleanSimplification":2,"TypeCoercion$PromoteStrings":1,"Analyzer$ResolveAliases":1,"DecimalAggregates":1,"PruneFilters":3,"Analyzer$ResolveMissingReferences":1,"TransposeWindow":2,"Analyzer$ResolveRelations":1,"EliminateUnions":1,"RewritePredicateSubquery":1,"ObjectSerializerPruning":1,"LimitPushDown":2,"SimplifyCaseConversionExpressions":2,"Analyzer$ResolveNaturalAndUsingJoin":1,"EliminateView":1,"CombineTypedFilters":1,"OptimizeLimitZero":1,"CheckCartesianProducts":2,"ExtractPythonUDFFromAggregate":1,"Analyzer$ExtractWindowExpressions":1,"ReplaceExceptWithAntiJoin":1,"ResolveLambdaVariables":1,"FallBackFileSourceV2":1,"Analyzer$ResolveTables":1,"SubstituteUnresolvedOrdinals":1,"TypeCoercion$CaseWhenCoercion":1,"DecimalPrecision":1,"EliminateSorts":1,"PushDownLeftSemiAntiJoin":2,"ExtractPythonUDFFromJoinCondition":1,"TypeCoercion$StackCoercion":1,"Analyzer$ResolveAggAliasInGroupBy":1,"TypeCoercion$StringLiteralCoercion":1,"FoldablePropagation":2,"V2ScanRelationPushDown":1,"EliminateDistinct":1,"InferFiltersFromConstraints":1,"Analyzer$PullOutNondeterministic":1,"Analyzer$ResolveFunctions":1,"ReplaceNullWithFalseInPredicate":2,"ResolveHigherOrderFunctions":1,"Analyzer$ResolvePivot":1,"CollapseProject":3,"Analyzer$ResolveNewInstance":1,"ColumnPruning":4,"Analyzer$ResolveWindowOrder":1,"TypeCoercion$ConcatCoercion":1,"PushDownPredicates":4,"TimeWindowing":1,"Optimizer$OptimizeSubqueries":3,"RewriteNonCorrelatedExists":1,"TypeCoercion$Division":1,"ComputeCurrentTime":1,"ResolveCreateNamedStruct":1,"TypeCoercion$EltCoercion":1,"ConvertToLocalRelation":2,"RemoveRepetitionFromGroupExpressions":1,"ReplaceDistinctWithAggregate":1,"PreprocessTableCreation":1,"ResolveSQLOnFile":1,"Analyzer$ResolveSubquery":1,"CombineConcats":2,"Analyzer$ResolveGroupingAnalytics":1,"Analyzer$ResolveBinaryArithmetic":1,"RemoveDispensableExpressions":2,"Analyzer$ResolveAlterTableChanges":1,"ResolveEncodersInScalaAgg":1,"TypeCoercion$IntegralDivision":1,"Analyzer$ResolveWindowFrame":1,"Analyzer$ResolveDeserializer":1,"RewriteDistinctAggregates":1,"RemoveNoopOperators":4,"Analyzer$ResolveAggregateFunctions":1,"NormalizeFloatingNumbers":1,"ReorderJoin":2,"Analyzer$ResolveUpCast":1,"Analyzer$ResolveGenerate":1,"TypeCoercion$WidenSetOperationTypes":1,"EliminateOuterJoin":2,"SimplifyExtractValueOps":2,"OptimizeMetadataOnlyQuery":1,"EliminateResolvedHint":1,"Analyzer$ResolveInsertInto":1,"ReplaceExceptWithFilter":1,"CleanupAliases":1,"GetCurrentDatabase":1,"SchemaPruning":1,"Analyzer$ResolveOutputRelation":1,"BloomFilterJoinRule":1,"Analyzer$ResolveRandomSeed":1,"TypeCoercion$WindowFrameCoercion":1,"ConstantFolding":2,"TypeCoercion$DateTimeOperations":1,"TypeCoercion$InConversion":1,"FindDataSourceTable":1,"SimplifyConditionals":2,"DataSourceAnalysis":1,"TypeCoercion$FunctionArgumentConversion":1,"Analyzer$GlobalAggregates":1,"Analyzer$LookupFunctions":1,"CombineFilters":3,"ReplaceDeduplicateWithAggregate":1,"PreprocessTableInsertion":1},"numEffectiveRunsPerRule":{},"timeEffectiveRunsPerRule":{},"counters":{},"timers":{}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":1,"time":1678162975198} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":2,"description":"createOrReplaceTempView at NativeMethodAccessorImpl.java:0","details":"org.apache.spark.sql.Dataset.createOrReplaceTempView(Dataset.scala:3312)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","physicalPlanDescription":"== Parsed Logical Plan ==\nCreateViewCommand `site_summary`, false, true, LocalTempView\n +- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Project [Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L]\n : +- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Repartition 50000, false\n : +- Filter (content_languages#27 = eng)\n : +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n +- Aggregate [Site#485], [Site#485, avg(cast(levenshtein_distance#215 as bigint)) AS Levenshtein_Distance#411]\n +- Filter (((subset#33 = warc) AND (levenshtein_distance#215 <= 2)) AND isnotnull(Site#485))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nCreateViewCommand `site_summary`, false, true, LocalTempView\n +- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Project [Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L]\n : +- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Repartition 50000, false\n : +- Filter (content_languages#27 = eng)\n : +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n +- Aggregate [Site#485], [Site#485, avg(cast(levenshtein_distance#215 as bigint)) AS Levenshtein_Distance#411]\n +- Filter (((subset#33 = warc) AND (levenshtein_distance#215 <= 2)) AND isnotnull(Site#485))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nCreateViewCommand `site_summary`, false, true, LocalTempView\n +- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Project [Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L]\n : +- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Repartition 50000, false\n : +- Filter (content_languages#27 = eng)\n : +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n +- Aggregate [Site#485], [Site#485, avg(cast(levenshtein_distance#215 as bigint)) AS Levenshtein_Distance#411]\n +- Filter (((subset#33 = warc) AND (levenshtein_distance#215 <= 2)) AND isnotnull(Site#485))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Physical Plan ==\nExecute CreateViewCommand\n +- CreateViewCommand `site_summary`, false, true, LocalTempView\n +- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Project [Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L]\n : +- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Repartition 50000, false\n : +- Filter (content_languages#27 = eng)\n : +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n +- Aggregate [Site#485], [Site#485, avg(cast(levenshtein_distance#215 as bigint)) AS Levenshtein_Distance#411]\n +- Filter (((subset#33 = warc) AND (levenshtein_distance#215 <= 2)) AND isnotnull(Site#485))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n","sparkPlanInfo":{"nodeName":"Execute CreateViewCommand","simpleString":"Execute CreateViewCommand","children":[],"metadata":{},"metrics":[]},"time":1678162976364} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerEffectiveSQLConf","executionId":2,"effectiveSQLConf":{"spark.sql.adaptive.coalescePartitions.initialPartitionNum":"","spark.sql.streaming.disabledV2Writers":"","spark.sql.streaming.noDataMicroBatches.enabled":"true","spark.sql.files.maxPartitionBytes":"128MB","spark.sql.thriftserver.ui.retainedStatements":"200","spark.sql.ui.retainedExecutions":"1000","spark.sql.execution.arrow.pyspark.fallback.enabled":"","spark.sql.sources.parallelPartitionDiscovery.threshold":"32","spark.sql.cbo.joinReorder.enabled":"false","spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.int96AsTimestamp":"true","spark.sql.thriftserver.ui.retainedSessions":"200","spark.sql.hive.verifyPartitionPath":"false","spark.sql.autoBroadcastJoinThreshold":"10MB","spark.sql.orc.mergeSchema":"false","spark.sql.adaptive.enabled":"true","spark.sql.adaptive.skewJoin.enabled":"true","spark.sql.optimizer.sizeBasedJoinReorder.enabled":"true","spark.sql.streaming.ui.retainedQueries":"100","spark.sql.orc.enableVectorizedReader":"true","spark.sql.streaming.continuous.epochBacklogQueueSize":"10000","spark.sql.parquet.binaryAsString":"false","spark.sql.pivotMaxValues":"10000","spark.sql.adaptive.localShuffleReader.enabled":"true","spark.sql.storeAssignmentPolicy":"ANSI","spark.sql.redaction.options.regex":"(?i)url","spark.sql.cbo.joinReorder.dp.star.filter":"false","spark.sql.parser.quotedRegexColumnNames":"false","spark.sql.files.ignoreMissingFiles":"false","spark.sql.streaming.ui.enabled":"true","spark.sql.execution.arrow.sparkr.enabled":"false","spark.sql.orc.filterPushdown":"true","spark.sql.thriftserver.scheduler.pool":"","spark.sql.catalog.spark_catalog":"","spark.sql.datetime.java8API.enabled":"false","spark.sql.execution.pandas.udf.buffer.size":"","spark.sql.function.eltOutputAsString":"false","spark.sql.cbo.joinReorder.dp.threshold":"12","spark.sql.statsImprovements.enabled":"true","spark.sql.parquet.columnarReaderBatchSize":"4096","spark.sql.parquet.outputTimestampType":"INT96","spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes":"256MB","spark.sql.statistics.size.autoUpdate.enabled":"false","spark.sql.hive.filesourcePartitionFileCacheSize":"262144000","spark.sql.streaming.ui.retainedProgressUpdates":"100","spark.sql.inMemoryColumnarStorage.compressed":"true","spark.sql.pyspark.jvmStacktrace.enabled":"false","spark.sql.cbo.starSchemaDetection":"false","spark.sql.columnNameOfCorruptRecord":"_corrupt_record","spark.sql.adaptive.advisoryPartitionSizeInBytes":"","spark.sql.avro.compression.codec":"snappy","spark.sql.streaming.metricsEnabled":"false","spark.sql.ui.pruneCachedInMemoryRelation":"true","spark.sql.event.truncate.length":"2147483647","spark.sql.groupByAliases":"true","spark.sql.hive.metastorePartitionPruning":"true","spark.sql.execution.arrow.enabled":"false","spark.sql.optimizer.dynamicPartitionPruning.enabled":"true","spark.sql.extendedEventInfo":"true","spark.sql.sources.bucketing.maxBuckets":"100000","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.jsonGenerator.ignoreNullFields":"true","spark.sql.optimizer.flattenScalarSubqueriesWithAggregates.enabled":"true","spark.sql.debug.maxToStringFields":"25","spark.sql.files.maxRecordsPerFile":"0","spark.sql.hive.thriftServer.singleSession":"false","spark.sql.parquet.mergeSchema":"false","spark.sql.adaptive.skewJoin.skewedPartitionFactor":"5","spark.sql.queryExecutionListeners":"","spark.sql.streaming.numRecentProgressUpdates":"100","spark.sql.variable.substitute":"true","spark.sql.adaptive.shuffle.mapOutputStats.useDataSize":"true","spark.sql.function.concatBinaryAsString":"false","spark.sql.avro.deflate.level":"-1","spark.sql.repl.eagerEval.maxNumRows":"20","spark.sql.ansi.enabled":"false","spark.sql.legacy.allowHashOnMapType":"false","spark.sql.bloomFilterJoin.maxFilterBytes":"2097152","spark.sql.orderByOrdinal":"true","spark.sql.streaming.stopTimeout":"0","spark.sql.adaptive.coalescePartitions.enabled":"true","spark.sql.session.timeZone":"UTC","spark.sql.parquet.respectSummaryFiles":"false","spark.sql.parquet.enableVectorizedReader":"true","spark.sql.cbo.enabled":"false","spark.sql.sources.bucketing.enabled":"true","spark.sql.csv.filterPushdown.enabled":"true","spark.sql.repl.eagerEval.truncate":"20","spark.sql.cbo.planStats.enabled":"false","spark.sql.optimizer.excludedRules":"","spark.sql.sources.partitionColumnTypeInference.enabled":"true","spark.sql.sortMergeJoinExec.extendedCodegen.enabled":"true","spark.sql.execution.arrow.fallback.enabled":"true","spark.sql.streaming.forceDeleteTempCheckpointLocation":"false","spark.sql.maxPlanStringLength":"2147483632","spark.sql.hive.manageFilesourcePartitions":"true","spark.sql.parquet.int96TimestampConversion":"false","spark.sql.sources.partitionOverwriteMode":"STATIC","spark.sql.streaming.checkpointLocation":"","spark.sql.broadcastTimeout":"300","spark.sql.execution.arrow.pyspark.enabled":"","spark.sql.repl.eagerEval.enabled":"false","spark.sql.mapKeyDedupPolicy":"EXCEPTION","spark.sql.parquet.filterPushdown":"true","spark.sql.maven.additionalRemoteRepositories":"https://maven-central.storage-download.googleapis.com/maven2/","spark.sql.orc.compression.codec":"snappy","spark.sql.statistics.histogram.enabled":"false","spark.sql.streaming.stopActiveRunOnRestart":"true","spark.sql.orc.columnarReaderBatchSize":"4096","spark.sql.redaction.string.regex":"","spark.sql.inMemoryColumnarStorage.batchSize":"10000","spark.sql.parquet.writeLegacyFormat":"false","spark.sql.statistics.fallBackToHdfs":"false","spark.sql.defaultCatalog":"spark_catalog","spark.sql.streaming.fileSource.cleaner.numThreads":"1","spark.sql.legacy.sessionInitWithConfigDefaults":"false","spark.sql.adaptive.coalescePartitions.minPartitionNum":"","spark.sql.execution.arrow.maxRecordsPerBatch":"10000","spark.sql.stringHashComputationWithoutCopyMemory.enabled":"true","spark.sql.streaming.streamingQueryListeners":"","spark.sql.files.ignoreCorruptFiles":"false","spark.sql.inMemoryColumnarStorage.enableVectorizedReader":"true","spark.sql.extensions":"","spark.sql.sources.default":"parquet","spark.sql.ui.extendedInfo":"true","spark.sql.bloomFilterJoin.enabled":"true","spark.sql.parquet.compression.codec":"snappy","spark.sql.optimizer.distinctBeforeIntersect.enabled":"true","spark.sql.parquet.recordLevelFilter.enabled":"false","spark.sql.shuffle.partitions":"200","spark.sql.groupByOrdinal":"true"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerQueryExecutionMetrics","executionId":2,"timePerRule":{"PruneFileSourcePartitions":66381,"ReassignLambdaVariableID":70148,"PushPredicateThroughNonJoin":27184,"Analyzer$HandleNullInputsForUDF":10733,"Analyzer$ResolveSubqueryColumnAliases":4403,"ResolveTimeZone":5218,"Analyzer$ResolveNamespace":5187,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":8587,"RewriteCorrelatedScalarSubquery":60794,"RemoveLiteralFromGroupExpressions":31869,"PushProjectionThroughUnion":67645,"EliminateSubqueryAliases":42747,"ResolveCatalogs":7759,"PushLeftSemiLeftAntiThroughJoin":58897,"FlattenScalarSubqueriesWithAggregates":46105,"LikeSimplification":132448,"CollapseRepartition":74103,"ResolveHints$ResolveCoalesceHints":5328,"Analyzer$ExtractGenerator":20339,"RewriteIntersectAll":30932,"ResolveHints$ResolveJoinStrategyHints":6268,"TypeCoercion$MapZipWithCoercion":5879,"NullPropagation":72248,"PullupCorrelatedPredicates":39287,"UpdateOuterReferences":6580,"ExtractPythonUDFs":66273,"Analyzer$WindowsSubstitution":6094,"CombineUnions":91820,"ExtractGroupingPythonUDFFromAggregate":37912,"ReorderAssociativeOperator":117356,"CleanupDynamicPruningFilters":69722,"ResolveHints$RemoveAllHints":6153,"SimplifyBinaryComparison":70524,"ResolveTableValuedFunctions":6603,"EliminateSerialization":60535,"TypeCoercion$BooleanEquality":5757,"ReplaceIntersectWithSemiJoin":30875,"ConstantPropagation":61768,"CostBasedJoinReorder":16825,"Analyzer$ResolveReferences":37755,"CTESubstitution":207391,"RemoveRedundantAliases":54150,"TypeCoercion$ImplicitTypeCasts":6265,"RewriteExceptAll":32275,"UpdateAttributeNullability":54708,"PropagateEmptyRelation":79096,"SimplifyCasts":133314,"EliminateMapObjects":66297,"CombineLimits":60819,"DetectAmbiguousSelfJoin":29601,"ReplaceExpressions":88966,"ResolveInlineTables":4838,"OptimizeIn":73721,"CollapseWindow":73641,"TypeCoercion$IfCoercion":6135,"ResolveSessionCatalog":10877,"PartitionPruning":60103,"BooleanSimplification":73623,"TypeCoercion$PromoteStrings":5960,"Analyzer$ResolveAliases":5026,"DecimalAggregates":40021,"PruneFilters":88564,"Analyzer$ResolveMissingReferences":4572,"TransposeWindow":73821,"Analyzer$ResolveRelations":9740,"EliminateUnions":9352,"RewritePredicateSubquery":30648,"ObjectSerializerPruning":28182,"LimitPushDown":61656,"SimplifyCaseConversionExpressions":68506,"Analyzer$ResolveNaturalAndUsingJoin":4721,"EliminateView":36605,"CombineTypedFilters":34011,"OptimizeLimitZero":40821,"CheckCartesianProducts":30621,"ExtractPythonUDFFromAggregate":35692,"Analyzer$ExtractWindowExpressions":10664,"ReplaceExceptWithAntiJoin":29443,"ResolveLambdaVariables":8194,"FallBackFileSourceV2":3581,"Analyzer$ResolveTables":7088,"SubstituteUnresolvedOrdinals":5454,"TypeCoercion$CaseWhenCoercion":6200,"DecimalPrecision":19637,"EliminateSorts":36370,"PushDownLeftSemiAntiJoin":62483,"ExtractPythonUDFFromJoinCondition":37659,"TypeCoercion$StackCoercion":5918,"Analyzer$ResolveAggAliasInGroupBy":4955,"TypeCoercion$StringLiteralCoercion":5693,"FoldablePropagation":98340,"V2ScanRelationPushDown":54051,"EliminateDistinct":9954,"InferFiltersFromConstraints":93261,"Analyzer$PullOutNondeterministic":6558,"Analyzer$ResolveFunctions":9935,"ReplaceNullWithFalseInPredicate":68230,"ResolveHigherOrderFunctions":7034,"Analyzer$ResolvePivot":5392,"CollapseProject":107268,"Analyzer$ResolveNewInstance":10805,"ColumnPruning":264366,"Analyzer$ResolveWindowOrder":7117,"TypeCoercion$ConcatCoercion":8094,"PushDownPredicates":177057,"TimeWindowing":9074,"Optimizer$OptimizeSubqueries":195440,"RewriteNonCorrelatedExists":70426,"TypeCoercion$Division":5419,"ComputeCurrentTime":108663,"ResolveCreateNamedStruct":7419,"TypeCoercion$EltCoercion":8233,"ConvertToLocalRelation":61653,"RemoveRepetitionFromGroupExpressions":29686,"ReplaceDistinctWithAggregate":29344,"PreprocessTableCreation":7448,"ResolveSQLOnFile":3999,"Analyzer$ResolveSubquery":4681,"CombineConcats":13294,"Analyzer$ResolveGroupingAnalytics":11159,"Analyzer$ResolveBinaryArithmetic":7987,"RemoveDispensableExpressions":128186,"Analyzer$ResolveAlterTableChanges":6687,"ResolveEncodersInScalaAgg":8044,"TypeCoercion$IntegralDivision":5987,"Analyzer$ResolveWindowFrame":5246,"Analyzer$ResolveDeserializer":275008,"RewriteDistinctAggregates":40617,"RemoveNoopOperators":115093,"Analyzer$ResolveAggregateFunctions":5186,"NormalizeFloatingNumbers":28962,"ReorderJoin":64090,"Analyzer$ResolveUpCast":7051,"Analyzer$ResolveGenerate":5203,"TypeCoercion$WidenSetOperationTypes":4882,"EliminateOuterJoin":61775,"SimplifyExtractValueOps":133858,"OptimizeMetadataOnlyQuery":15136,"EliminateResolvedHint":89577,"Analyzer$ResolveInsertInto":4860,"ReplaceExceptWithFilter":50973,"CleanupAliases":42713,"GetCurrentDatabase":199404,"SchemaPruning":87363,"Analyzer$ResolveOutputRelation":5027,"BloomFilterJoinRule":45192,"Analyzer$ResolveRandomSeed":4711,"TypeCoercion$WindowFrameCoercion":6365,"ConstantFolding":70169,"TypeCoercion$DateTimeOperations":5835,"TypeCoercion$InConversion":6996,"FindDataSourceTable":5154,"SimplifyConditionals":70263,"DataSourceAnalysis":4883,"TypeCoercion$FunctionArgumentConversion":5819,"Analyzer$GlobalAggregates":4443,"Analyzer$LookupFunctions":21525,"CombineFilters":84970,"ReplaceDeduplicateWithAggregate":32340,"PreprocessTableInsertion":4597},"numRunsPerRule":{"PruneFileSourcePartitions":1,"ReassignLambdaVariableID":1,"PushPredicateThroughNonJoin":1,"Analyzer$HandleNullInputsForUDF":1,"Analyzer$ResolveSubqueryColumnAliases":1,"ResolveTimeZone":1,"Analyzer$ResolveNamespace":1,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":1,"RewriteCorrelatedScalarSubquery":2,"RemoveLiteralFromGroupExpressions":1,"PushProjectionThroughUnion":2,"EliminateSubqueryAliases":1,"ResolveCatalogs":1,"PushLeftSemiLeftAntiThroughJoin":2,"FlattenScalarSubqueriesWithAggregates":1,"LikeSimplification":2,"CollapseRepartition":2,"ResolveHints$ResolveCoalesceHints":1,"Analyzer$ExtractGenerator":1,"RewriteIntersectAll":1,"ResolveHints$ResolveJoinStrategyHints":1,"TypeCoercion$MapZipWithCoercion":1,"NullPropagation":2,"PullupCorrelatedPredicates":1,"UpdateOuterReferences":1,"ExtractPythonUDFs":1,"Analyzer$WindowsSubstitution":1,"CombineUnions":3,"ExtractGroupingPythonUDFFromAggregate":1,"ReorderAssociativeOperator":2,"CleanupDynamicPruningFilters":1,"ResolveHints$RemoveAllHints":1,"SimplifyBinaryComparison":2,"ResolveTableValuedFunctions":1,"EliminateSerialization":2,"TypeCoercion$BooleanEquality":1,"ReplaceIntersectWithSemiJoin":1,"ConstantPropagation":2,"CostBasedJoinReorder":1,"Analyzer$ResolveReferences":1,"CTESubstitution":1,"RemoveRedundantAliases":2,"TypeCoercion$ImplicitTypeCasts":1,"RewriteExceptAll":1,"UpdateAttributeNullability":1,"PropagateEmptyRelation":2,"SimplifyCasts":2,"EliminateMapObjects":1,"CombineLimits":2,"DetectAmbiguousSelfJoin":1,"ReplaceExpressions":1,"ResolveInlineTables":1,"OptimizeIn":2,"CollapseWindow":2,"TypeCoercion$IfCoercion":1,"ResolveSessionCatalog":1,"PartitionPruning":1,"BooleanSimplification":2,"TypeCoercion$PromoteStrings":1,"Analyzer$ResolveAliases":1,"DecimalAggregates":1,"PruneFilters":3,"Analyzer$ResolveMissingReferences":1,"TransposeWindow":2,"Analyzer$ResolveRelations":1,"EliminateUnions":1,"RewritePredicateSubquery":1,"ObjectSerializerPruning":1,"LimitPushDown":2,"SimplifyCaseConversionExpressions":2,"Analyzer$ResolveNaturalAndUsingJoin":1,"EliminateView":1,"CombineTypedFilters":1,"OptimizeLimitZero":1,"CheckCartesianProducts":2,"ExtractPythonUDFFromAggregate":1,"Analyzer$ExtractWindowExpressions":1,"ReplaceExceptWithAntiJoin":1,"ResolveLambdaVariables":1,"FallBackFileSourceV2":1,"Analyzer$ResolveTables":1,"SubstituteUnresolvedOrdinals":1,"TypeCoercion$CaseWhenCoercion":1,"DecimalPrecision":1,"EliminateSorts":1,"PushDownLeftSemiAntiJoin":2,"ExtractPythonUDFFromJoinCondition":1,"TypeCoercion$StackCoercion":1,"Analyzer$ResolveAggAliasInGroupBy":1,"TypeCoercion$StringLiteralCoercion":1,"FoldablePropagation":2,"V2ScanRelationPushDown":1,"EliminateDistinct":1,"InferFiltersFromConstraints":1,"Analyzer$PullOutNondeterministic":1,"Analyzer$ResolveFunctions":1,"ReplaceNullWithFalseInPredicate":2,"ResolveHigherOrderFunctions":1,"Analyzer$ResolvePivot":1,"CollapseProject":3,"Analyzer$ResolveNewInstance":1,"ColumnPruning":4,"Analyzer$ResolveWindowOrder":1,"TypeCoercion$ConcatCoercion":1,"PushDownPredicates":4,"TimeWindowing":1,"Optimizer$OptimizeSubqueries":3,"RewriteNonCorrelatedExists":1,"TypeCoercion$Division":1,"ComputeCurrentTime":1,"ResolveCreateNamedStruct":1,"TypeCoercion$EltCoercion":1,"ConvertToLocalRelation":2,"RemoveRepetitionFromGroupExpressions":1,"ReplaceDistinctWithAggregate":1,"PreprocessTableCreation":1,"ResolveSQLOnFile":1,"Analyzer$ResolveSubquery":1,"CombineConcats":2,"Analyzer$ResolveGroupingAnalytics":1,"Analyzer$ResolveBinaryArithmetic":1,"RemoveDispensableExpressions":2,"Analyzer$ResolveAlterTableChanges":1,"ResolveEncodersInScalaAgg":1,"TypeCoercion$IntegralDivision":1,"Analyzer$ResolveWindowFrame":1,"Analyzer$ResolveDeserializer":1,"RewriteDistinctAggregates":1,"RemoveNoopOperators":4,"Analyzer$ResolveAggregateFunctions":1,"NormalizeFloatingNumbers":1,"ReorderJoin":2,"Analyzer$ResolveUpCast":1,"Analyzer$ResolveGenerate":1,"TypeCoercion$WidenSetOperationTypes":1,"EliminateOuterJoin":2,"SimplifyExtractValueOps":2,"OptimizeMetadataOnlyQuery":1,"EliminateResolvedHint":1,"Analyzer$ResolveInsertInto":1,"ReplaceExceptWithFilter":1,"CleanupAliases":1,"GetCurrentDatabase":1,"SchemaPruning":1,"Analyzer$ResolveOutputRelation":1,"BloomFilterJoinRule":1,"Analyzer$ResolveRandomSeed":1,"TypeCoercion$WindowFrameCoercion":1,"ConstantFolding":2,"TypeCoercion$DateTimeOperations":1,"TypeCoercion$InConversion":1,"FindDataSourceTable":1,"SimplifyConditionals":2,"DataSourceAnalysis":1,"TypeCoercion$FunctionArgumentConversion":1,"Analyzer$GlobalAggregates":1,"Analyzer$LookupFunctions":1,"CombineFilters":3,"ReplaceDeduplicateWithAggregate":1,"PreprocessTableInsertion":1},"numEffectiveRunsPerRule":{},"timeEffectiveRunsPerRule":{},"counters":{},"timers":{}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":2,"time":1678162976405} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":3,"description":"createOrReplaceTempView at NativeMethodAccessorImpl.java:0","details":"org.apache.spark.sql.Dataset.createOrReplaceTempView(Dataset.scala:3312)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","physicalPlanDescription":"== Parsed Logical Plan ==\nCreateViewCommand `language_count_tmp`, false, true, LocalTempView\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nCreateViewCommand `language_count_tmp`, false, true, LocalTempView\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nCreateViewCommand `language_count_tmp`, false, true, LocalTempView\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Physical Plan ==\nExecute CreateViewCommand\n +- CreateViewCommand `language_count_tmp`, false, true, LocalTempView\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n","sparkPlanInfo":{"nodeName":"Execute CreateViewCommand","simpleString":"Execute CreateViewCommand","children":[],"metadata":{},"metrics":[]},"time":1678162976740} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerEffectiveSQLConf","executionId":3,"effectiveSQLConf":{"spark.sql.adaptive.coalescePartitions.initialPartitionNum":"","spark.sql.streaming.disabledV2Writers":"","spark.sql.streaming.noDataMicroBatches.enabled":"true","spark.sql.files.maxPartitionBytes":"128MB","spark.sql.thriftserver.ui.retainedStatements":"200","spark.sql.ui.retainedExecutions":"1000","spark.sql.execution.arrow.pyspark.fallback.enabled":"","spark.sql.sources.parallelPartitionDiscovery.threshold":"32","spark.sql.cbo.joinReorder.enabled":"false","spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.int96AsTimestamp":"true","spark.sql.thriftserver.ui.retainedSessions":"200","spark.sql.hive.verifyPartitionPath":"false","spark.sql.autoBroadcastJoinThreshold":"10MB","spark.sql.orc.mergeSchema":"false","spark.sql.adaptive.enabled":"true","spark.sql.adaptive.skewJoin.enabled":"true","spark.sql.optimizer.sizeBasedJoinReorder.enabled":"true","spark.sql.streaming.ui.retainedQueries":"100","spark.sql.orc.enableVectorizedReader":"true","spark.sql.streaming.continuous.epochBacklogQueueSize":"10000","spark.sql.parquet.binaryAsString":"false","spark.sql.pivotMaxValues":"10000","spark.sql.adaptive.localShuffleReader.enabled":"true","spark.sql.storeAssignmentPolicy":"ANSI","spark.sql.redaction.options.regex":"(?i)url","spark.sql.cbo.joinReorder.dp.star.filter":"false","spark.sql.parser.quotedRegexColumnNames":"false","spark.sql.files.ignoreMissingFiles":"false","spark.sql.streaming.ui.enabled":"true","spark.sql.execution.arrow.sparkr.enabled":"false","spark.sql.orc.filterPushdown":"true","spark.sql.thriftserver.scheduler.pool":"","spark.sql.catalog.spark_catalog":"","spark.sql.datetime.java8API.enabled":"false","spark.sql.execution.pandas.udf.buffer.size":"","spark.sql.function.eltOutputAsString":"false","spark.sql.cbo.joinReorder.dp.threshold":"12","spark.sql.statsImprovements.enabled":"true","spark.sql.parquet.columnarReaderBatchSize":"4096","spark.sql.parquet.outputTimestampType":"INT96","spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes":"256MB","spark.sql.statistics.size.autoUpdate.enabled":"false","spark.sql.hive.filesourcePartitionFileCacheSize":"262144000","spark.sql.streaming.ui.retainedProgressUpdates":"100","spark.sql.inMemoryColumnarStorage.compressed":"true","spark.sql.pyspark.jvmStacktrace.enabled":"false","spark.sql.cbo.starSchemaDetection":"false","spark.sql.columnNameOfCorruptRecord":"_corrupt_record","spark.sql.adaptive.advisoryPartitionSizeInBytes":"","spark.sql.avro.compression.codec":"snappy","spark.sql.streaming.metricsEnabled":"false","spark.sql.ui.pruneCachedInMemoryRelation":"true","spark.sql.event.truncate.length":"2147483647","spark.sql.groupByAliases":"true","spark.sql.hive.metastorePartitionPruning":"true","spark.sql.execution.arrow.enabled":"false","spark.sql.optimizer.dynamicPartitionPruning.enabled":"true","spark.sql.extendedEventInfo":"true","spark.sql.sources.bucketing.maxBuckets":"100000","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.jsonGenerator.ignoreNullFields":"true","spark.sql.optimizer.flattenScalarSubqueriesWithAggregates.enabled":"true","spark.sql.debug.maxToStringFields":"25","spark.sql.files.maxRecordsPerFile":"0","spark.sql.hive.thriftServer.singleSession":"false","spark.sql.parquet.mergeSchema":"false","spark.sql.adaptive.skewJoin.skewedPartitionFactor":"5","spark.sql.queryExecutionListeners":"","spark.sql.streaming.numRecentProgressUpdates":"100","spark.sql.variable.substitute":"true","spark.sql.adaptive.shuffle.mapOutputStats.useDataSize":"true","spark.sql.function.concatBinaryAsString":"false","spark.sql.avro.deflate.level":"-1","spark.sql.repl.eagerEval.maxNumRows":"20","spark.sql.ansi.enabled":"false","spark.sql.legacy.allowHashOnMapType":"false","spark.sql.bloomFilterJoin.maxFilterBytes":"2097152","spark.sql.orderByOrdinal":"true","spark.sql.streaming.stopTimeout":"0","spark.sql.adaptive.coalescePartitions.enabled":"true","spark.sql.session.timeZone":"UTC","spark.sql.parquet.respectSummaryFiles":"false","spark.sql.parquet.enableVectorizedReader":"true","spark.sql.cbo.enabled":"false","spark.sql.sources.bucketing.enabled":"true","spark.sql.csv.filterPushdown.enabled":"true","spark.sql.repl.eagerEval.truncate":"20","spark.sql.cbo.planStats.enabled":"false","spark.sql.optimizer.excludedRules":"","spark.sql.sources.partitionColumnTypeInference.enabled":"true","spark.sql.sortMergeJoinExec.extendedCodegen.enabled":"true","spark.sql.execution.arrow.fallback.enabled":"true","spark.sql.streaming.forceDeleteTempCheckpointLocation":"false","spark.sql.maxPlanStringLength":"2147483632","spark.sql.hive.manageFilesourcePartitions":"true","spark.sql.parquet.int96TimestampConversion":"false","spark.sql.sources.partitionOverwriteMode":"STATIC","spark.sql.streaming.checkpointLocation":"","spark.sql.broadcastTimeout":"300","spark.sql.execution.arrow.pyspark.enabled":"","spark.sql.repl.eagerEval.enabled":"false","spark.sql.mapKeyDedupPolicy":"EXCEPTION","spark.sql.parquet.filterPushdown":"true","spark.sql.maven.additionalRemoteRepositories":"https://maven-central.storage-download.googleapis.com/maven2/","spark.sql.orc.compression.codec":"snappy","spark.sql.statistics.histogram.enabled":"false","spark.sql.streaming.stopActiveRunOnRestart":"true","spark.sql.orc.columnarReaderBatchSize":"4096","spark.sql.redaction.string.regex":"","spark.sql.inMemoryColumnarStorage.batchSize":"10000","spark.sql.parquet.writeLegacyFormat":"false","spark.sql.statistics.fallBackToHdfs":"false","spark.sql.defaultCatalog":"spark_catalog","spark.sql.streaming.fileSource.cleaner.numThreads":"1","spark.sql.legacy.sessionInitWithConfigDefaults":"false","spark.sql.adaptive.coalescePartitions.minPartitionNum":"","spark.sql.execution.arrow.maxRecordsPerBatch":"10000","spark.sql.stringHashComputationWithoutCopyMemory.enabled":"true","spark.sql.streaming.streamingQueryListeners":"","spark.sql.files.ignoreCorruptFiles":"false","spark.sql.inMemoryColumnarStorage.enableVectorizedReader":"true","spark.sql.extensions":"","spark.sql.sources.default":"parquet","spark.sql.ui.extendedInfo":"true","spark.sql.bloomFilterJoin.enabled":"true","spark.sql.parquet.compression.codec":"snappy","spark.sql.optimizer.distinctBeforeIntersect.enabled":"true","spark.sql.parquet.recordLevelFilter.enabled":"false","spark.sql.shuffle.partitions":"200","spark.sql.groupByOrdinal":"true"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerQueryExecutionMetrics","executionId":3,"timePerRule":{"PruneFileSourcePartitions":65698,"ReassignLambdaVariableID":72929,"PushPredicateThroughNonJoin":27568,"Analyzer$HandleNullInputsForUDF":11536,"Analyzer$ResolveSubqueryColumnAliases":4719,"ResolveTimeZone":6693,"Analyzer$ResolveNamespace":6222,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":9443,"RewriteCorrelatedScalarSubquery":64538,"RemoveLiteralFromGroupExpressions":30597,"PushProjectionThroughUnion":58963,"EliminateSubqueryAliases":45538,"ResolveCatalogs":8264,"PushLeftSemiLeftAntiThroughJoin":52788,"FlattenScalarSubqueriesWithAggregates":50271,"LikeSimplification":139306,"CollapseRepartition":72379,"ResolveHints$ResolveCoalesceHints":6157,"Analyzer$ExtractGenerator":21895,"RewriteIntersectAll":31148,"ResolveHints$ResolveJoinStrategyHints":7363,"TypeCoercion$MapZipWithCoercion":6160,"NullPropagation":78835,"PullupCorrelatedPredicates":41093,"UpdateOuterReferences":6617,"ExtractPythonUDFs":58387,"Analyzer$WindowsSubstitution":7236,"CombineUnions":93165,"ExtractGroupingPythonUDFFromAggregate":32353,"ReorderAssociativeOperator":145006,"CleanupDynamicPruningFilters":79817,"ResolveHints$RemoveAllHints":4600,"SimplifyBinaryComparison":72121,"ResolveTableValuedFunctions":7525,"EliminateSerialization":54484,"TypeCoercion$BooleanEquality":9530,"ReplaceIntersectWithSemiJoin":28664,"ConstantPropagation":60357,"CostBasedJoinReorder":20227,"Analyzer$ResolveReferences":61919,"CTESubstitution":193253,"RemoveRedundantAliases":64210,"TypeCoercion$ImplicitTypeCasts":5236,"RewriteExceptAll":33860,"UpdateAttributeNullability":48302,"PropagateEmptyRelation":82086,"SimplifyCasts":136203,"EliminateMapObjects":70884,"CombineLimits":58072,"DetectAmbiguousSelfJoin":27514,"ReplaceExpressions":75795,"ResolveInlineTables":5329,"OptimizeIn":73976,"CollapseWindow":65179,"TypeCoercion$IfCoercion":5652,"ResolveSessionCatalog":6313,"PartitionPruning":67272,"BooleanSimplification":73811,"TypeCoercion$PromoteStrings":8307,"Analyzer$ResolveAliases":6290,"DecimalAggregates":38128,"PruneFilters":85436,"Analyzer$ResolveMissingReferences":5596,"TransposeWindow":78446,"Analyzer$ResolveRelations":13022,"EliminateUnions":6843,"RewritePredicateSubquery":30493,"ObjectSerializerPruning":31619,"LimitPushDown":55578,"SimplifyCaseConversionExpressions":66025,"Analyzer$ResolveNaturalAndUsingJoin":5701,"EliminateView":42753,"CombineTypedFilters":28066,"OptimizeLimitZero":41950,"CheckCartesianProducts":35178,"ExtractPythonUDFFromAggregate":35801,"Analyzer$ExtractWindowExpressions":11652,"ReplaceExceptWithAntiJoin":30669,"ResolveLambdaVariables":10018,"FallBackFileSourceV2":2887,"Analyzer$ResolveTables":8417,"SubstituteUnresolvedOrdinals":6298,"TypeCoercion$CaseWhenCoercion":5879,"DecimalPrecision":20119,"EliminateSorts":32814,"PushDownLeftSemiAntiJoin":54294,"ExtractPythonUDFFromJoinCondition":46473,"TypeCoercion$StackCoercion":5589,"Analyzer$ResolveAggAliasInGroupBy":5343,"TypeCoercion$StringLiteralCoercion":5125,"FoldablePropagation":109287,"V2ScanRelationPushDown":51124,"EliminateDistinct":10658,"InferFiltersFromConstraints":54387,"Analyzer$PullOutNondeterministic":5500,"Analyzer$ResolveFunctions":10377,"ReplaceNullWithFalseInPredicate":75563,"ResolveHigherOrderFunctions":8499,"Analyzer$ResolvePivot":5738,"CollapseProject":104480,"Analyzer$ResolveNewInstance":9065,"ColumnPruning":274474,"Analyzer$ResolveWindowOrder":8374,"TypeCoercion$ConcatCoercion":7072,"PushDownPredicates":177907,"TimeWindowing":10415,"Optimizer$OptimizeSubqueries":217826,"RewriteNonCorrelatedExists":75551,"TypeCoercion$Division":5955,"ComputeCurrentTime":110775,"ResolveCreateNamedStruct":7962,"TypeCoercion$EltCoercion":6648,"ConvertToLocalRelation":60200,"RemoveRepetitionFromGroupExpressions":30127,"ReplaceDistinctWithAggregate":29815,"PreprocessTableCreation":5793,"ResolveSQLOnFile":3199,"Analyzer$ResolveSubquery":5693,"CombineConcats":14125,"Analyzer$ResolveGroupingAnalytics":11038,"Analyzer$ResolveBinaryArithmetic":7237,"RemoveDispensableExpressions":132171,"Analyzer$ResolveAlterTableChanges":5382,"ResolveEncodersInScalaAgg":8176,"TypeCoercion$IntegralDivision":5093,"Analyzer$ResolveWindowFrame":5950,"Analyzer$ResolveDeserializer":29706,"RewriteDistinctAggregates":40688,"RemoveNoopOperators":112524,"Analyzer$ResolveAggregateFunctions":6194,"NormalizeFloatingNumbers":30746,"ReorderJoin":62428,"Analyzer$ResolveUpCast":7072,"Analyzer$ResolveGenerate":6937,"TypeCoercion$WidenSetOperationTypes":6212,"EliminateOuterJoin":58649,"SimplifyExtractValueOps":75463,"OptimizeMetadataOnlyQuery":18795,"EliminateResolvedHint":94161,"Analyzer$ResolveInsertInto":6027,"ReplaceExceptWithFilter":53028,"CleanupAliases":10402,"GetCurrentDatabase":179739,"SchemaPruning":87947,"Analyzer$ResolveOutputRelation":5170,"BloomFilterJoinRule":45692,"Analyzer$ResolveRandomSeed":5312,"TypeCoercion$WindowFrameCoercion":5673,"ConstantFolding":70574,"TypeCoercion$DateTimeOperations":4960,"TypeCoercion$InConversion":10076,"FindDataSourceTable":4298,"SimplifyConditionals":68775,"DataSourceAnalysis":3529,"TypeCoercion$FunctionArgumentConversion":5731,"Analyzer$GlobalAggregates":4168,"Analyzer$LookupFunctions":25019,"CombineFilters":84098,"ReplaceDeduplicateWithAggregate":30851,"PreprocessTableInsertion":3551},"numRunsPerRule":{"PruneFileSourcePartitions":1,"ReassignLambdaVariableID":1,"PushPredicateThroughNonJoin":1,"Analyzer$HandleNullInputsForUDF":1,"Analyzer$ResolveSubqueryColumnAliases":1,"ResolveTimeZone":1,"Analyzer$ResolveNamespace":1,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":1,"RewriteCorrelatedScalarSubquery":2,"RemoveLiteralFromGroupExpressions":1,"PushProjectionThroughUnion":2,"EliminateSubqueryAliases":1,"ResolveCatalogs":1,"PushLeftSemiLeftAntiThroughJoin":2,"FlattenScalarSubqueriesWithAggregates":1,"LikeSimplification":2,"CollapseRepartition":2,"ResolveHints$ResolveCoalesceHints":1,"Analyzer$ExtractGenerator":1,"RewriteIntersectAll":1,"ResolveHints$ResolveJoinStrategyHints":1,"TypeCoercion$MapZipWithCoercion":1,"NullPropagation":2,"PullupCorrelatedPredicates":1,"UpdateOuterReferences":1,"ExtractPythonUDFs":1,"Analyzer$WindowsSubstitution":1,"CombineUnions":3,"ExtractGroupingPythonUDFFromAggregate":1,"ReorderAssociativeOperator":2,"CleanupDynamicPruningFilters":1,"ResolveHints$RemoveAllHints":1,"SimplifyBinaryComparison":2,"ResolveTableValuedFunctions":1,"EliminateSerialization":2,"TypeCoercion$BooleanEquality":1,"ReplaceIntersectWithSemiJoin":1,"ConstantPropagation":2,"CostBasedJoinReorder":1,"Analyzer$ResolveReferences":1,"CTESubstitution":1,"RemoveRedundantAliases":2,"TypeCoercion$ImplicitTypeCasts":1,"RewriteExceptAll":1,"UpdateAttributeNullability":1,"PropagateEmptyRelation":2,"SimplifyCasts":2,"EliminateMapObjects":1,"CombineLimits":2,"DetectAmbiguousSelfJoin":1,"ReplaceExpressions":1,"ResolveInlineTables":1,"OptimizeIn":2,"CollapseWindow":2,"TypeCoercion$IfCoercion":1,"ResolveSessionCatalog":1,"PartitionPruning":1,"BooleanSimplification":2,"TypeCoercion$PromoteStrings":1,"Analyzer$ResolveAliases":1,"DecimalAggregates":1,"PruneFilters":3,"Analyzer$ResolveMissingReferences":1,"TransposeWindow":2,"Analyzer$ResolveRelations":1,"EliminateUnions":1,"RewritePredicateSubquery":1,"ObjectSerializerPruning":1,"LimitPushDown":2,"SimplifyCaseConversionExpressions":2,"Analyzer$ResolveNaturalAndUsingJoin":1,"EliminateView":1,"CombineTypedFilters":1,"OptimizeLimitZero":1,"CheckCartesianProducts":2,"ExtractPythonUDFFromAggregate":1,"Analyzer$ExtractWindowExpressions":1,"ReplaceExceptWithAntiJoin":1,"ResolveLambdaVariables":1,"FallBackFileSourceV2":1,"Analyzer$ResolveTables":1,"SubstituteUnresolvedOrdinals":1,"TypeCoercion$CaseWhenCoercion":1,"DecimalPrecision":1,"EliminateSorts":1,"PushDownLeftSemiAntiJoin":2,"ExtractPythonUDFFromJoinCondition":1,"TypeCoercion$StackCoercion":1,"Analyzer$ResolveAggAliasInGroupBy":1,"TypeCoercion$StringLiteralCoercion":1,"FoldablePropagation":2,"V2ScanRelationPushDown":1,"EliminateDistinct":1,"InferFiltersFromConstraints":1,"Analyzer$PullOutNondeterministic":1,"Analyzer$ResolveFunctions":1,"ReplaceNullWithFalseInPredicate":2,"ResolveHigherOrderFunctions":1,"Analyzer$ResolvePivot":1,"CollapseProject":3,"Analyzer$ResolveNewInstance":1,"ColumnPruning":4,"Analyzer$ResolveWindowOrder":1,"TypeCoercion$ConcatCoercion":1,"PushDownPredicates":4,"TimeWindowing":1,"Optimizer$OptimizeSubqueries":3,"RewriteNonCorrelatedExists":1,"TypeCoercion$Division":1,"ComputeCurrentTime":1,"ResolveCreateNamedStruct":1,"TypeCoercion$EltCoercion":1,"ConvertToLocalRelation":2,"RemoveRepetitionFromGroupExpressions":1,"ReplaceDistinctWithAggregate":1,"PreprocessTableCreation":1,"ResolveSQLOnFile":1,"Analyzer$ResolveSubquery":1,"CombineConcats":2,"Analyzer$ResolveGroupingAnalytics":1,"Analyzer$ResolveBinaryArithmetic":1,"RemoveDispensableExpressions":2,"Analyzer$ResolveAlterTableChanges":1,"ResolveEncodersInScalaAgg":1,"TypeCoercion$IntegralDivision":1,"Analyzer$ResolveWindowFrame":1,"Analyzer$ResolveDeserializer":1,"RewriteDistinctAggregates":1,"RemoveNoopOperators":4,"Analyzer$ResolveAggregateFunctions":1,"NormalizeFloatingNumbers":1,"ReorderJoin":2,"Analyzer$ResolveUpCast":1,"Analyzer$ResolveGenerate":1,"TypeCoercion$WidenSetOperationTypes":1,"EliminateOuterJoin":2,"SimplifyExtractValueOps":2,"OptimizeMetadataOnlyQuery":1,"EliminateResolvedHint":1,"Analyzer$ResolveInsertInto":1,"ReplaceExceptWithFilter":1,"CleanupAliases":1,"GetCurrentDatabase":1,"SchemaPruning":1,"Analyzer$ResolveOutputRelation":1,"BloomFilterJoinRule":1,"Analyzer$ResolveRandomSeed":1,"TypeCoercion$WindowFrameCoercion":1,"ConstantFolding":2,"TypeCoercion$DateTimeOperations":1,"TypeCoercion$InConversion":1,"FindDataSourceTable":1,"SimplifyConditionals":2,"DataSourceAnalysis":1,"TypeCoercion$FunctionArgumentConversion":1,"Analyzer$GlobalAggregates":1,"Analyzer$LookupFunctions":1,"CombineFilters":3,"ReplaceDeduplicateWithAggregate":1,"PreprocessTableInsertion":1},"numEffectiveRunsPerRule":{},"timeEffectiveRunsPerRule":{},"counters":{},"timers":{}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":3,"time":1678162976746} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":4,"description":"createOrReplaceTempView at NativeMethodAccessorImpl.java:0","details":"org.apache.spark.sql.Dataset.createOrReplaceTempView(Dataset.scala:3312)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","physicalPlanDescription":"== Parsed Logical Plan ==\nCreateViewCommand `language_charset_loglikelihood`, false, true, LocalTempView\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (cast(2 as double) * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(cast(total_pages#546L as double) as double)))) + if ((n_pages_charset#547L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if ((n_pages_languages#548L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) cast(0 as double) else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / cast(((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)) as double))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(cast(n_pages_languages#548L as double) as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(cast(n_pages_charset#547L as double) as double)) AS prob_lang_given_cs#567]\n +- SubqueryAlias language_count_tmp\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nCreateViewCommand `language_charset_loglikelihood`, false, true, LocalTempView\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (cast(2 as double) * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(cast(total_pages#546L as double) as double)))) + if ((n_pages_charset#547L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if ((n_pages_languages#548L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) cast(0 as double) else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / cast(((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)) as double))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(cast(n_pages_languages#548L as double) as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(cast(n_pages_charset#547L as double) as double)) AS prob_lang_given_cs#567]\n +- SubqueryAlias language_count_tmp\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nCreateViewCommand `language_charset_loglikelihood`, false, true, LocalTempView\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (cast(2 as double) * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(cast(total_pages#546L as double) as double)))) + if ((n_pages_charset#547L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if ((n_pages_languages#548L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) cast(0 as double) else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / cast(((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)) as double))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(cast(n_pages_languages#548L as double) as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(cast(n_pages_charset#547L as double) as double)) AS prob_lang_given_cs#567]\n +- SubqueryAlias language_count_tmp\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Physical Plan ==\nExecute CreateViewCommand\n +- CreateViewCommand `language_charset_loglikelihood`, false, true, LocalTempView\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (cast(2 as double) * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(cast(total_pages#546L as double) as double)))) + if ((n_pages_charset#547L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if ((n_pages_languages#548L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) cast(0 as double) else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / cast(((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)) as double))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(cast(n_pages_languages#548L as double) as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(cast(n_pages_charset#547L as double) as double)) AS prob_lang_given_cs#567]\n +- SubqueryAlias language_count_tmp\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n","sparkPlanInfo":{"nodeName":"Execute CreateViewCommand","simpleString":"Execute CreateViewCommand","children":[],"metadata":{},"metrics":[]},"time":1678162977269} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerEffectiveSQLConf","executionId":4,"effectiveSQLConf":{"spark.sql.adaptive.coalescePartitions.initialPartitionNum":"","spark.sql.streaming.disabledV2Writers":"","spark.sql.streaming.noDataMicroBatches.enabled":"true","spark.sql.files.maxPartitionBytes":"128MB","spark.sql.thriftserver.ui.retainedStatements":"200","spark.sql.ui.retainedExecutions":"1000","spark.sql.execution.arrow.pyspark.fallback.enabled":"","spark.sql.sources.parallelPartitionDiscovery.threshold":"32","spark.sql.cbo.joinReorder.enabled":"false","spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.int96AsTimestamp":"true","spark.sql.thriftserver.ui.retainedSessions":"200","spark.sql.hive.verifyPartitionPath":"false","spark.sql.autoBroadcastJoinThreshold":"10MB","spark.sql.orc.mergeSchema":"false","spark.sql.adaptive.enabled":"true","spark.sql.adaptive.skewJoin.enabled":"true","spark.sql.optimizer.sizeBasedJoinReorder.enabled":"true","spark.sql.streaming.ui.retainedQueries":"100","spark.sql.orc.enableVectorizedReader":"true","spark.sql.streaming.continuous.epochBacklogQueueSize":"10000","spark.sql.parquet.binaryAsString":"false","spark.sql.pivotMaxValues":"10000","spark.sql.adaptive.localShuffleReader.enabled":"true","spark.sql.storeAssignmentPolicy":"ANSI","spark.sql.redaction.options.regex":"(?i)url","spark.sql.cbo.joinReorder.dp.star.filter":"false","spark.sql.parser.quotedRegexColumnNames":"false","spark.sql.files.ignoreMissingFiles":"false","spark.sql.streaming.ui.enabled":"true","spark.sql.execution.arrow.sparkr.enabled":"false","spark.sql.orc.filterPushdown":"true","spark.sql.thriftserver.scheduler.pool":"","spark.sql.catalog.spark_catalog":"","spark.sql.datetime.java8API.enabled":"false","spark.sql.execution.pandas.udf.buffer.size":"","spark.sql.function.eltOutputAsString":"false","spark.sql.cbo.joinReorder.dp.threshold":"12","spark.sql.statsImprovements.enabled":"true","spark.sql.parquet.columnarReaderBatchSize":"4096","spark.sql.parquet.outputTimestampType":"INT96","spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes":"256MB","spark.sql.statistics.size.autoUpdate.enabled":"false","spark.sql.hive.filesourcePartitionFileCacheSize":"262144000","spark.sql.streaming.ui.retainedProgressUpdates":"100","spark.sql.inMemoryColumnarStorage.compressed":"true","spark.sql.pyspark.jvmStacktrace.enabled":"false","spark.sql.cbo.starSchemaDetection":"false","spark.sql.columnNameOfCorruptRecord":"_corrupt_record","spark.sql.adaptive.advisoryPartitionSizeInBytes":"","spark.sql.avro.compression.codec":"snappy","spark.sql.streaming.metricsEnabled":"false","spark.sql.ui.pruneCachedInMemoryRelation":"true","spark.sql.event.truncate.length":"2147483647","spark.sql.groupByAliases":"true","spark.sql.hive.metastorePartitionPruning":"true","spark.sql.execution.arrow.enabled":"false","spark.sql.optimizer.dynamicPartitionPruning.enabled":"true","spark.sql.extendedEventInfo":"true","spark.sql.sources.bucketing.maxBuckets":"100000","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.jsonGenerator.ignoreNullFields":"true","spark.sql.optimizer.flattenScalarSubqueriesWithAggregates.enabled":"true","spark.sql.debug.maxToStringFields":"25","spark.sql.files.maxRecordsPerFile":"0","spark.sql.hive.thriftServer.singleSession":"false","spark.sql.parquet.mergeSchema":"false","spark.sql.adaptive.skewJoin.skewedPartitionFactor":"5","spark.sql.queryExecutionListeners":"","spark.sql.streaming.numRecentProgressUpdates":"100","spark.sql.variable.substitute":"true","spark.sql.adaptive.shuffle.mapOutputStats.useDataSize":"true","spark.sql.function.concatBinaryAsString":"false","spark.sql.avro.deflate.level":"-1","spark.sql.repl.eagerEval.maxNumRows":"20","spark.sql.ansi.enabled":"false","spark.sql.legacy.allowHashOnMapType":"false","spark.sql.bloomFilterJoin.maxFilterBytes":"2097152","spark.sql.orderByOrdinal":"true","spark.sql.streaming.stopTimeout":"0","spark.sql.adaptive.coalescePartitions.enabled":"true","spark.sql.session.timeZone":"UTC","spark.sql.parquet.respectSummaryFiles":"false","spark.sql.parquet.enableVectorizedReader":"true","spark.sql.cbo.enabled":"false","spark.sql.sources.bucketing.enabled":"true","spark.sql.csv.filterPushdown.enabled":"true","spark.sql.repl.eagerEval.truncate":"20","spark.sql.cbo.planStats.enabled":"false","spark.sql.optimizer.excludedRules":"","spark.sql.sources.partitionColumnTypeInference.enabled":"true","spark.sql.sortMergeJoinExec.extendedCodegen.enabled":"true","spark.sql.execution.arrow.fallback.enabled":"true","spark.sql.streaming.forceDeleteTempCheckpointLocation":"false","spark.sql.maxPlanStringLength":"2147483632","spark.sql.hive.manageFilesourcePartitions":"true","spark.sql.parquet.int96TimestampConversion":"false","spark.sql.sources.partitionOverwriteMode":"STATIC","spark.sql.streaming.checkpointLocation":"","spark.sql.broadcastTimeout":"300","spark.sql.execution.arrow.pyspark.enabled":"","spark.sql.repl.eagerEval.enabled":"false","spark.sql.mapKeyDedupPolicy":"EXCEPTION","spark.sql.parquet.filterPushdown":"true","spark.sql.maven.additionalRemoteRepositories":"https://maven-central.storage-download.googleapis.com/maven2/","spark.sql.orc.compression.codec":"snappy","spark.sql.statistics.histogram.enabled":"false","spark.sql.streaming.stopActiveRunOnRestart":"true","spark.sql.orc.columnarReaderBatchSize":"4096","spark.sql.redaction.string.regex":"","spark.sql.inMemoryColumnarStorage.batchSize":"10000","spark.sql.parquet.writeLegacyFormat":"false","spark.sql.statistics.fallBackToHdfs":"false","spark.sql.defaultCatalog":"spark_catalog","spark.sql.streaming.fileSource.cleaner.numThreads":"1","spark.sql.legacy.sessionInitWithConfigDefaults":"false","spark.sql.adaptive.coalescePartitions.minPartitionNum":"","spark.sql.execution.arrow.maxRecordsPerBatch":"10000","spark.sql.stringHashComputationWithoutCopyMemory.enabled":"true","spark.sql.streaming.streamingQueryListeners":"","spark.sql.files.ignoreCorruptFiles":"false","spark.sql.inMemoryColumnarStorage.enableVectorizedReader":"true","spark.sql.extensions":"","spark.sql.sources.default":"parquet","spark.sql.ui.extendedInfo":"true","spark.sql.bloomFilterJoin.enabled":"true","spark.sql.parquet.compression.codec":"snappy","spark.sql.optimizer.distinctBeforeIntersect.enabled":"true","spark.sql.parquet.recordLevelFilter.enabled":"false","spark.sql.shuffle.partitions":"200","spark.sql.groupByOrdinal":"true"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerQueryExecutionMetrics","executionId":4,"timePerRule":{"PruneFileSourcePartitions":52779,"ReassignLambdaVariableID":77158,"PushPredicateThroughNonJoin":27490,"Analyzer$HandleNullInputsForUDF":11453,"Analyzer$ResolveSubqueryColumnAliases":3717,"ResolveTimeZone":5751,"Analyzer$ResolveNamespace":4009,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":7515,"RewriteCorrelatedScalarSubquery":57042,"RemoveLiteralFromGroupExpressions":27107,"PushProjectionThroughUnion":57237,"EliminateSubqueryAliases":40188,"ResolveCatalogs":5636,"PushLeftSemiLeftAntiThroughJoin":53699,"FlattenScalarSubqueriesWithAggregates":49177,"LikeSimplification":128831,"CollapseRepartition":68445,"ResolveHints$ResolveCoalesceHints":5088,"Analyzer$ExtractGenerator":18674,"RewriteIntersectAll":26895,"ResolveHints$ResolveJoinStrategyHints":5206,"TypeCoercion$MapZipWithCoercion":10749,"NullPropagation":68432,"PullupCorrelatedPredicates":32968,"UpdateOuterReferences":9446,"ExtractPythonUDFs":90523,"Analyzer$WindowsSubstitution":7244,"CombineUnions":80883,"ExtractGroupingPythonUDFFromAggregate":34582,"ReorderAssociativeOperator":123666,"CleanupDynamicPruningFilters":71701,"ResolveHints$RemoveAllHints":5703,"SimplifyBinaryComparison":71739,"ResolveTableValuedFunctions":5955,"EliminateSerialization":54844,"TypeCoercion$BooleanEquality":5911,"ReplaceIntersectWithSemiJoin":35390,"ConstantPropagation":55503,"CostBasedJoinReorder":20713,"Analyzer$ResolveReferences":34660,"CTESubstitution":198488,"RemoveRedundantAliases":68191,"TypeCoercion$ImplicitTypeCasts":6121,"RewriteExceptAll":28552,"UpdateAttributeNullability":310797,"PropagateEmptyRelation":74681,"SimplifyCasts":151208,"EliminateMapObjects":64973,"CombineLimits":57570,"DetectAmbiguousSelfJoin":30309,"ReplaceExpressions":65610,"ResolveInlineTables":4574,"OptimizeIn":77708,"CollapseWindow":65602,"TypeCoercion$IfCoercion":5046,"ResolveSessionCatalog":7602,"PartitionPruning":69556,"BooleanSimplification":72893,"TypeCoercion$PromoteStrings":6246,"Analyzer$ResolveAliases":4086,"DecimalAggregates":37860,"PruneFilters":82018,"Analyzer$ResolveMissingReferences":3812,"TransposeWindow":71711,"Analyzer$ResolveRelations":6804,"EliminateUnions":7164,"RewritePredicateSubquery":32105,"ObjectSerializerPruning":27349,"LimitPushDown":54587,"SimplifyCaseConversionExpressions":62967,"Analyzer$ResolveNaturalAndUsingJoin":3940,"EliminateView":35223,"CombineTypedFilters":27166,"OptimizeLimitZero":35928,"CheckCartesianProducts":34683,"ExtractPythonUDFFromAggregate":38954,"Analyzer$ExtractWindowExpressions":10337,"ReplaceExceptWithAntiJoin":26779,"ResolveLambdaVariables":7231,"FallBackFileSourceV2":3034,"Analyzer$ResolveTables":5367,"SubstituteUnresolvedOrdinals":4926,"TypeCoercion$CaseWhenCoercion":5366,"DecimalPrecision":8998,"EliminateSorts":33679,"PushDownLeftSemiAntiJoin":56788,"ExtractPythonUDFFromJoinCondition":40538,"TypeCoercion$StackCoercion":5406,"Analyzer$ResolveAggAliasInGroupBy":3571,"TypeCoercion$StringLiteralCoercion":4899,"FoldablePropagation":931302,"V2ScanRelationPushDown":48737,"EliminateDistinct":10167,"InferFiltersFromConstraints":5579378,"Analyzer$PullOutNondeterministic":5700,"Analyzer$ResolveFunctions":8826,"ReplaceNullWithFalseInPredicate":64887,"ResolveHigherOrderFunctions":7325,"Analyzer$ResolvePivot":4892,"CollapseProject":101596,"Analyzer$ResolveNewInstance":7800,"ColumnPruning":296080,"Analyzer$ResolveWindowOrder":7558,"TypeCoercion$ConcatCoercion":7330,"PushDownPredicates":141892,"TimeWindowing":9305,"Optimizer$OptimizeSubqueries":196707,"RewriteNonCorrelatedExists":61209,"TypeCoercion$Division":5465,"ComputeCurrentTime":105508,"ResolveCreateNamedStruct":7342,"TypeCoercion$EltCoercion":7100,"ConvertToLocalRelation":60075,"RemoveRepetitionFromGroupExpressions":26826,"ReplaceDistinctWithAggregate":26252,"PreprocessTableCreation":6298,"ResolveSQLOnFile":3825,"Analyzer$ResolveSubquery":4146,"CombineConcats":15370,"Analyzer$ResolveGroupingAnalytics":8799,"Analyzer$ResolveBinaryArithmetic":6498,"RemoveDispensableExpressions":128736,"Analyzer$ResolveAlterTableChanges":6112,"ResolveEncodersInScalaAgg":8292,"TypeCoercion$IntegralDivision":5388,"Analyzer$ResolveWindowFrame":5516,"Analyzer$ResolveDeserializer":26368,"RewriteDistinctAggregates":40294,"RemoveNoopOperators":111773,"Analyzer$ResolveAggregateFunctions":4547,"NormalizeFloatingNumbers":29874,"ReorderJoin":58615,"Analyzer$ResolveUpCast":6122,"Analyzer$ResolveGenerate":4358,"TypeCoercion$WidenSetOperationTypes":4077,"EliminateOuterJoin":54802,"SimplifyExtractValueOps":64890,"OptimizeMetadataOnlyQuery":17618,"EliminateResolvedHint":91363,"Analyzer$ResolveInsertInto":3172,"ReplaceExceptWithFilter":47303,"CleanupAliases":13154,"GetCurrentDatabase":155827,"SchemaPruning":98186,"Analyzer$ResolveOutputRelation":3776,"BloomFilterJoinRule":48361,"Analyzer$ResolveRandomSeed":4369,"TypeCoercion$WindowFrameCoercion":5685,"ConstantFolding":63447,"TypeCoercion$DateTimeOperations":5330,"TypeCoercion$InConversion":6676,"FindDataSourceTable":4768,"SimplifyConditionals":66769,"DataSourceAnalysis":4680,"TypeCoercion$FunctionArgumentConversion":5557,"Analyzer$GlobalAggregates":3684,"Analyzer$LookupFunctions":21801,"CombineFilters":76190,"ReplaceDeduplicateWithAggregate":29437,"PreprocessTableInsertion":3461},"numRunsPerRule":{"PruneFileSourcePartitions":1,"ReassignLambdaVariableID":1,"PushPredicateThroughNonJoin":1,"Analyzer$HandleNullInputsForUDF":1,"Analyzer$ResolveSubqueryColumnAliases":1,"ResolveTimeZone":1,"Analyzer$ResolveNamespace":1,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":1,"RewriteCorrelatedScalarSubquery":2,"RemoveLiteralFromGroupExpressions":1,"PushProjectionThroughUnion":2,"EliminateSubqueryAliases":1,"ResolveCatalogs":1,"PushLeftSemiLeftAntiThroughJoin":2,"FlattenScalarSubqueriesWithAggregates":1,"LikeSimplification":2,"CollapseRepartition":2,"ResolveHints$ResolveCoalesceHints":1,"Analyzer$ExtractGenerator":1,"RewriteIntersectAll":1,"ResolveHints$ResolveJoinStrategyHints":1,"TypeCoercion$MapZipWithCoercion":1,"NullPropagation":2,"PullupCorrelatedPredicates":1,"UpdateOuterReferences":1,"ExtractPythonUDFs":1,"Analyzer$WindowsSubstitution":1,"CombineUnions":3,"ExtractGroupingPythonUDFFromAggregate":1,"ReorderAssociativeOperator":2,"CleanupDynamicPruningFilters":1,"ResolveHints$RemoveAllHints":1,"SimplifyBinaryComparison":2,"ResolveTableValuedFunctions":1,"EliminateSerialization":2,"TypeCoercion$BooleanEquality":1,"ReplaceIntersectWithSemiJoin":1,"ConstantPropagation":2,"CostBasedJoinReorder":1,"Analyzer$ResolveReferences":1,"CTESubstitution":1,"RemoveRedundantAliases":2,"TypeCoercion$ImplicitTypeCasts":1,"RewriteExceptAll":1,"UpdateAttributeNullability":1,"PropagateEmptyRelation":2,"SimplifyCasts":2,"EliminateMapObjects":1,"CombineLimits":2,"DetectAmbiguousSelfJoin":1,"ReplaceExpressions":1,"ResolveInlineTables":1,"OptimizeIn":2,"CollapseWindow":2,"TypeCoercion$IfCoercion":1,"ResolveSessionCatalog":1,"PartitionPruning":1,"BooleanSimplification":2,"TypeCoercion$PromoteStrings":1,"Analyzer$ResolveAliases":1,"DecimalAggregates":1,"PruneFilters":3,"Analyzer$ResolveMissingReferences":1,"TransposeWindow":2,"Analyzer$ResolveRelations":1,"EliminateUnions":1,"RewritePredicateSubquery":1,"ObjectSerializerPruning":1,"LimitPushDown":2,"SimplifyCaseConversionExpressions":2,"Analyzer$ResolveNaturalAndUsingJoin":1,"EliminateView":1,"CombineTypedFilters":1,"OptimizeLimitZero":1,"CheckCartesianProducts":2,"ExtractPythonUDFFromAggregate":1,"Analyzer$ExtractWindowExpressions":1,"ReplaceExceptWithAntiJoin":1,"ResolveLambdaVariables":1,"FallBackFileSourceV2":1,"Analyzer$ResolveTables":1,"SubstituteUnresolvedOrdinals":1,"TypeCoercion$CaseWhenCoercion":1,"DecimalPrecision":1,"EliminateSorts":1,"PushDownLeftSemiAntiJoin":2,"ExtractPythonUDFFromJoinCondition":1,"TypeCoercion$StackCoercion":1,"Analyzer$ResolveAggAliasInGroupBy":1,"TypeCoercion$StringLiteralCoercion":1,"FoldablePropagation":2,"V2ScanRelationPushDown":1,"EliminateDistinct":1,"InferFiltersFromConstraints":1,"Analyzer$PullOutNondeterministic":1,"Analyzer$ResolveFunctions":1,"ReplaceNullWithFalseInPredicate":2,"ResolveHigherOrderFunctions":1,"Analyzer$ResolvePivot":1,"CollapseProject":3,"Analyzer$ResolveNewInstance":1,"ColumnPruning":4,"Analyzer$ResolveWindowOrder":1,"TypeCoercion$ConcatCoercion":1,"PushDownPredicates":4,"TimeWindowing":1,"Optimizer$OptimizeSubqueries":3,"RewriteNonCorrelatedExists":1,"TypeCoercion$Division":1,"ComputeCurrentTime":1,"ResolveCreateNamedStruct":1,"TypeCoercion$EltCoercion":1,"ConvertToLocalRelation":2,"RemoveRepetitionFromGroupExpressions":1,"ReplaceDistinctWithAggregate":1,"PreprocessTableCreation":1,"ResolveSQLOnFile":1,"Analyzer$ResolveSubquery":1,"CombineConcats":2,"Analyzer$ResolveGroupingAnalytics":1,"Analyzer$ResolveBinaryArithmetic":1,"RemoveDispensableExpressions":2,"Analyzer$ResolveAlterTableChanges":1,"ResolveEncodersInScalaAgg":1,"TypeCoercion$IntegralDivision":1,"Analyzer$ResolveWindowFrame":1,"Analyzer$ResolveDeserializer":1,"RewriteDistinctAggregates":1,"RemoveNoopOperators":4,"Analyzer$ResolveAggregateFunctions":1,"NormalizeFloatingNumbers":1,"ReorderJoin":2,"Analyzer$ResolveUpCast":1,"Analyzer$ResolveGenerate":1,"TypeCoercion$WidenSetOperationTypes":1,"EliminateOuterJoin":2,"SimplifyExtractValueOps":2,"OptimizeMetadataOnlyQuery":1,"EliminateResolvedHint":1,"Analyzer$ResolveInsertInto":1,"ReplaceExceptWithFilter":1,"CleanupAliases":1,"GetCurrentDatabase":1,"SchemaPruning":1,"Analyzer$ResolveOutputRelation":1,"BloomFilterJoinRule":1,"Analyzer$ResolveRandomSeed":1,"TypeCoercion$WindowFrameCoercion":1,"ConstantFolding":2,"TypeCoercion$DateTimeOperations":1,"TypeCoercion$InConversion":1,"FindDataSourceTable":1,"SimplifyConditionals":2,"DataSourceAnalysis":1,"TypeCoercion$FunctionArgumentConversion":1,"Analyzer$GlobalAggregates":1,"Analyzer$LookupFunctions":1,"CombineFilters":3,"ReplaceDeduplicateWithAggregate":1,"PreprocessTableInsertion":1},"numEffectiveRunsPerRule":{},"timeEffectiveRunsPerRule":{},"counters":{},"timers":{}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":4,"time":1678162977276} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":5,"description":"save at NativeMethodAccessorImpl.java:0","details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, loglikelihood#565, prob_cs_given_lang#566, prob_lang_given_cs#567]\n +- Filter ((n_pages#545L >= cast(100 as bigint)) AND NOT Language#279 LIKE %,%)\n +- SubqueryAlias language_charset_loglikelihood\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (cast(2 as double) * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(cast(total_pages#546L as double) as double)))) + if ((n_pages_charset#547L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if ((n_pages_languages#548L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) cast(0 as double) else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / cast(((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)) as double))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(cast(n_pages_languages#548L as double) as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(cast(n_pages_charset#547L as double) as double)) AS prob_lang_given_cs#567]\n +- SubqueryAlias language_count_tmp\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, loglikelihood#565, prob_cs_given_lang#566, prob_lang_given_cs#567]\n +- Filter ((n_pages#545L >= cast(100 as bigint)) AND NOT Language#279 LIKE %,%)\n +- SubqueryAlias language_charset_loglikelihood\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (cast(2 as double) * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(cast(total_pages#546L as double) as double)))) + if ((n_pages_charset#547L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if ((n_pages_languages#548L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) cast(0 as double) else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / cast(((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)) as double))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(cast(n_pages_languages#548L as double) as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(cast(n_pages_charset#547L as double) as double)) AS prob_lang_given_cs#567]\n +- SubqueryAlias language_count_tmp\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (2.0 * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(total_pages#546L as double)))) + if ((n_pages_charset#547L = n_pages#545L)) 0.0 else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(total_pages#546L as double))))) + if ((n_pages_languages#548L = n_pages#545L)) 0.0 else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(total_pages#546L as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) 0.0 else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / ((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(n_pages_languages#548L as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(n_pages_charset#547L as double)) AS prob_lang_given_cs#567]\n +- Filter (n_pages#545L >= 100)\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Project [n_pages#545L, Language#279, Charset#311, _w2#558L, total_pages#546L, n_pages_charset#547L]\n +- Filter NOT Contains(Language#279, ,)\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Project [n_pages#545L, Language#279, Charset#311, _w1#557L, _w2#558L, total_pages#546L]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Project [content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- Filter (((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(content_languages#27)) AND isnotnull(content_charset#26))\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- AdaptiveSparkPlan isFinalPlan=false\n +- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true, 0\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (2.0 * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(total_pages#546L as double)))) + if ((n_pages_charset#547L = n_pages#545L)) 0.0 else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(total_pages#546L as double))))) + if ((n_pages_languages#548L = n_pages#545L)) 0.0 else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(total_pages#546L as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) 0.0 else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / ((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(n_pages_languages#548L as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(n_pages_charset#547L as double)) AS prob_lang_given_cs#567]\n +- Filter (n_pages#545L >= 100)\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Sort [Language#279 ASC NULLS FIRST], false, 0\n +- Project [n_pages#545L, Language#279, Charset#311, _w2#558L, total_pages#546L, n_pages_charset#547L]\n +- Filter NOT Contains(Language#279, ,)\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Sort [Charset#311 ASC NULLS FIRST], false, 0\n +- Project [n_pages#545L, Language#279, Charset#311, _w1#557L, _w2#558L, total_pages#546L]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Exchange SinglePartition, true, [id=#93]\n +- HashAggregate(keys=[Charset#311, Language#279], functions=[count(1)], output=[n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L])\n +- Exchange hashpartitioning(Charset#311, Language#279, 1000), true, [id=#90]\n +- HashAggregate(keys=[Charset#311, Language#279], functions=[partial_count(1)], output=[Charset#311, Language#279, count#1053L])\n +- Project [content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- Filter (((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(content_languages#27)) AND isnotnull(content_charset#26))\n +- InMemoryTableScan [content_charset#26, content_languages#27, subset#33], [isnotnull(subset#33), (subset#33 = warc), isnotnull(content_languages#27), isnotnull(content_charset#26)]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=false","children":[{"nodeName":"Sort","simpleString":"Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true, 0","children":[{"nodeName":"Project","simpleString":"Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (2.0 * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(total_pages#546L as double)))) + if ((n_pages_charset#547L = n_pages#545L)) 0.0 else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(total_pages#546L as double))))) + if ((n_pages_languages#548L = n_pages#545L)) 0.0 else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(total_pages#546L as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) 0.0 else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / ((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(n_pages_languages#548L as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(n_pages_charset#547L as double)) AS prob_lang_given_cs#567]","children":[{"nodeName":"Filter","simpleString":"Filter (n_pages#545L >= 100)","children":[{"nodeName":"Window","simpleString":"Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]","children":[{"nodeName":"Sort","simpleString":"Sort [Language#279 ASC NULLS FIRST], false, 0","children":[{"nodeName":"Project","simpleString":"Project [n_pages#545L, Language#279, Charset#311, _w2#558L, total_pages#546L, n_pages_charset#547L]","children":[{"nodeName":"Filter","simpleString":"Filter NOT Contains(Language#279, ,)","children":[{"nodeName":"Window","simpleString":"Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]","children":[{"nodeName":"Sort","simpleString":"Sort [Charset#311 ASC NULLS FIRST], false, 0","children":[{"nodeName":"Project","simpleString":"Project [n_pages#545L, Language#279, Charset#311, _w1#557L, _w2#558L, total_pages#546L]","children":[{"nodeName":"Window","simpleString":"Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]","children":[{"nodeName":"Exchange","simpleString":"Exchange SinglePartition, true, [id=#93]","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Charset#311, Language#279], functions=[count(1)])","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Charset#311, Language#279, 1000), true, [id=#90]","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Charset#311, Language#279], functions=[partial_count(1)])","children":[{"nodeName":"Project","simpleString":"Project [content_charset#26 AS Charset#311, content_languages#27 AS Language#279]","children":[{"nodeName":"Filter","simpleString":"Filter (((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(content_languages#27)) AND isnotnull(content_charset#26))","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [content_charset#26, content_languages#27, subset#33], [isnotnull(subset#33), (subset#33 = warc), isnotnull(content_languages#27), isnotnull(content_charset#26)]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":109,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":108,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":105,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":106,"metricType":"timing"},{"name":"peak memory","accumulatorId":104,"metricType":"size"},{"name":"number of output rows","accumulatorId":103,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":107,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":37,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":38,"metricType":"nsTiming"},{"name":"records read","accumulatorId":35,"metricType":"sum"},{"name":"local bytes read","accumulatorId":33,"metricType":"size"},{"name":"fetch wait time","accumulatorId":34,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":31,"metricType":"size"},{"name":"local blocks read","accumulatorId":30,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":29,"metricType":"sum"},{"name":"data size","accumulatorId":28,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":32,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":36,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":100,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":101,"metricType":"timing"},{"name":"peak memory","accumulatorId":99,"metricType":"size"},{"name":"number of output rows","accumulatorId":98,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":102,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":48,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":49,"metricType":"nsTiming"},{"name":"records read","accumulatorId":46,"metricType":"sum"},{"name":"local bytes read","accumulatorId":44,"metricType":"size"},{"name":"fetch wait time","accumulatorId":45,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":42,"metricType":"size"},{"name":"local blocks read","accumulatorId":41,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":40,"metricType":"sum"},{"name":"data size","accumulatorId":39,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":43,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":47,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":95,"metricType":"timing"},{"name":"peak memory","accumulatorId":96,"metricType":"size"},{"name":"spill size","accumulatorId":97,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":94,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":91,"metricType":"timing"},{"name":"peak memory","accumulatorId":92,"metricType":"size"},{"name":"spill size","accumulatorId":93,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":90,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":87,"metricType":"timing"},{"name":"peak memory","accumulatorId":88,"metricType":"size"},{"name":"spill size","accumulatorId":89,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":83,"metricType":"sum"},{"name":"written output","accumulatorId":84,"metricType":"size"},{"name":"number of output rows","accumulatorId":85,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":86,"metricType":"sum"}]},"time":1678162978214} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerEffectiveSQLConf","executionId":5,"effectiveSQLConf":{"spark.sql.adaptive.coalescePartitions.initialPartitionNum":"","spark.sql.streaming.disabledV2Writers":"","spark.sql.streaming.noDataMicroBatches.enabled":"true","spark.sql.files.maxPartitionBytes":"128MB","spark.sql.thriftserver.ui.retainedStatements":"200","spark.sql.ui.retainedExecutions":"1000","spark.sql.execution.arrow.pyspark.fallback.enabled":"","spark.sql.sources.parallelPartitionDiscovery.threshold":"32","spark.sql.cbo.joinReorder.enabled":"false","spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.int96AsTimestamp":"true","spark.sql.thriftserver.ui.retainedSessions":"200","spark.sql.hive.verifyPartitionPath":"false","spark.sql.autoBroadcastJoinThreshold":"10MB","spark.sql.orc.mergeSchema":"false","spark.sql.adaptive.enabled":"true","spark.sql.adaptive.skewJoin.enabled":"true","spark.sql.optimizer.sizeBasedJoinReorder.enabled":"true","spark.sql.streaming.ui.retainedQueries":"100","spark.sql.orc.enableVectorizedReader":"true","spark.sql.streaming.continuous.epochBacklogQueueSize":"10000","spark.sql.parquet.binaryAsString":"false","spark.sql.pivotMaxValues":"10000","spark.sql.adaptive.localShuffleReader.enabled":"true","spark.sql.storeAssignmentPolicy":"ANSI","spark.sql.redaction.options.regex":"(?i)url","spark.sql.cbo.joinReorder.dp.star.filter":"false","spark.sql.parser.quotedRegexColumnNames":"false","spark.sql.files.ignoreMissingFiles":"false","spark.sql.streaming.ui.enabled":"true","spark.sql.execution.arrow.sparkr.enabled":"false","spark.sql.orc.filterPushdown":"true","spark.sql.thriftserver.scheduler.pool":"","spark.sql.catalog.spark_catalog":"","spark.sql.datetime.java8API.enabled":"false","spark.sql.execution.pandas.udf.buffer.size":"","spark.sql.function.eltOutputAsString":"false","spark.sql.cbo.joinReorder.dp.threshold":"12","spark.sql.statsImprovements.enabled":"true","spark.sql.parquet.columnarReaderBatchSize":"4096","spark.sql.parquet.outputTimestampType":"INT96","spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes":"256MB","spark.sql.statistics.size.autoUpdate.enabled":"false","spark.sql.hive.filesourcePartitionFileCacheSize":"262144000","spark.sql.streaming.ui.retainedProgressUpdates":"100","spark.sql.inMemoryColumnarStorage.compressed":"true","spark.sql.pyspark.jvmStacktrace.enabled":"false","spark.sql.cbo.starSchemaDetection":"false","spark.sql.columnNameOfCorruptRecord":"_corrupt_record","spark.sql.adaptive.advisoryPartitionSizeInBytes":"","spark.sql.avro.compression.codec":"snappy","spark.sql.streaming.metricsEnabled":"false","spark.sql.ui.pruneCachedInMemoryRelation":"true","spark.sql.event.truncate.length":"2147483647","spark.sql.groupByAliases":"true","spark.sql.hive.metastorePartitionPruning":"true","spark.sql.execution.arrow.enabled":"false","spark.sql.optimizer.dynamicPartitionPruning.enabled":"true","spark.sql.extendedEventInfo":"true","spark.sql.sources.bucketing.maxBuckets":"100000","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.jsonGenerator.ignoreNullFields":"true","spark.sql.optimizer.flattenScalarSubqueriesWithAggregates.enabled":"true","spark.sql.debug.maxToStringFields":"25","spark.sql.files.maxRecordsPerFile":"0","spark.sql.hive.thriftServer.singleSession":"false","spark.sql.parquet.mergeSchema":"false","spark.sql.adaptive.skewJoin.skewedPartitionFactor":"5","spark.sql.queryExecutionListeners":"","spark.sql.streaming.numRecentProgressUpdates":"100","spark.sql.variable.substitute":"true","spark.sql.adaptive.shuffle.mapOutputStats.useDataSize":"true","spark.sql.function.concatBinaryAsString":"false","spark.sql.avro.deflate.level":"-1","spark.sql.repl.eagerEval.maxNumRows":"20","spark.sql.ansi.enabled":"false","spark.sql.legacy.allowHashOnMapType":"false","spark.sql.bloomFilterJoin.maxFilterBytes":"2097152","spark.sql.orderByOrdinal":"true","spark.sql.streaming.stopTimeout":"0","spark.sql.adaptive.coalescePartitions.enabled":"true","spark.sql.session.timeZone":"UTC","spark.sql.parquet.respectSummaryFiles":"false","spark.sql.parquet.enableVectorizedReader":"true","spark.sql.cbo.enabled":"false","spark.sql.sources.bucketing.enabled":"true","spark.sql.csv.filterPushdown.enabled":"true","spark.sql.repl.eagerEval.truncate":"20","spark.sql.cbo.planStats.enabled":"false","spark.sql.optimizer.excludedRules":"","spark.sql.sources.partitionColumnTypeInference.enabled":"true","spark.sql.sortMergeJoinExec.extendedCodegen.enabled":"true","spark.sql.execution.arrow.fallback.enabled":"true","spark.sql.streaming.forceDeleteTempCheckpointLocation":"false","spark.sql.maxPlanStringLength":"2147483632","spark.sql.hive.manageFilesourcePartitions":"true","spark.sql.parquet.int96TimestampConversion":"false","spark.sql.sources.partitionOverwriteMode":"STATIC","spark.sql.streaming.checkpointLocation":"","spark.sql.broadcastTimeout":"300","spark.sql.execution.arrow.pyspark.enabled":"","spark.sql.repl.eagerEval.enabled":"false","spark.sql.mapKeyDedupPolicy":"EXCEPTION","spark.sql.parquet.filterPushdown":"true","spark.sql.maven.additionalRemoteRepositories":"https://maven-central.storage-download.googleapis.com/maven2/","spark.sql.orc.compression.codec":"snappy","spark.sql.statistics.histogram.enabled":"false","spark.sql.streaming.stopActiveRunOnRestart":"true","spark.sql.orc.columnarReaderBatchSize":"4096","spark.sql.redaction.string.regex":"","spark.sql.inMemoryColumnarStorage.batchSize":"10000","spark.sql.parquet.writeLegacyFormat":"false","spark.sql.statistics.fallBackToHdfs":"false","spark.sql.defaultCatalog":"spark_catalog","spark.sql.streaming.fileSource.cleaner.numThreads":"1","spark.sql.legacy.sessionInitWithConfigDefaults":"false","spark.sql.adaptive.coalescePartitions.minPartitionNum":"","spark.sql.execution.arrow.maxRecordsPerBatch":"10000","spark.sql.stringHashComputationWithoutCopyMemory.enabled":"true","spark.sql.streaming.streamingQueryListeners":"","spark.sql.files.ignoreCorruptFiles":"false","spark.sql.inMemoryColumnarStorage.enableVectorizedReader":"true","spark.sql.extensions":"","spark.sql.sources.default":"parquet","spark.sql.ui.extendedInfo":"true","spark.sql.bloomFilterJoin.enabled":"true","spark.sql.parquet.compression.codec":"snappy","spark.sql.optimizer.distinctBeforeIntersect.enabled":"true","spark.sql.parquet.recordLevelFilter.enabled":"false","spark.sql.shuffle.partitions":"200","spark.sql.groupByOrdinal":"true"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate","executionId":5,"physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, loglikelihood#565, prob_cs_given_lang#566, prob_lang_given_cs#567]\n +- Filter ((n_pages#545L >= cast(100 as bigint)) AND NOT Language#279 LIKE %,%)\n +- SubqueryAlias language_charset_loglikelihood\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (cast(2 as double) * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(cast(total_pages#546L as double) as double)))) + if ((n_pages_charset#547L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if ((n_pages_languages#548L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) cast(0 as double) else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / cast(((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)) as double))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(cast(n_pages_languages#548L as double) as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(cast(n_pages_charset#547L as double) as double)) AS prob_lang_given_cs#567]\n +- SubqueryAlias language_count_tmp\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, loglikelihood#565, prob_cs_given_lang#566, prob_lang_given_cs#567]\n +- Filter ((n_pages#545L >= cast(100 as bigint)) AND NOT Language#279 LIKE %,%)\n +- SubqueryAlias language_charset_loglikelihood\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (cast(2 as double) * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(cast(total_pages#546L as double) as double)))) + if ((n_pages_charset#547L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if ((n_pages_languages#548L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) cast(0 as double) else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / cast(((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)) as double))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(cast(n_pages_languages#548L as double) as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(cast(n_pages_charset#547L as double) as double)) AS prob_lang_given_cs#567]\n +- SubqueryAlias language_count_tmp\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (2.0 * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(total_pages#546L as double)))) + if ((n_pages_charset#547L = n_pages#545L)) 0.0 else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(total_pages#546L as double))))) + if ((n_pages_languages#548L = n_pages#545L)) 0.0 else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(total_pages#546L as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) 0.0 else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / ((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(n_pages_languages#548L as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(n_pages_charset#547L as double)) AS prob_lang_given_cs#567]\n +- Filter (n_pages#545L >= 100)\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Project [n_pages#545L, Language#279, Charset#311, _w2#558L, total_pages#546L, n_pages_charset#547L]\n +- Filter NOT Contains(Language#279, ,)\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Project [n_pages#545L, Language#279, Charset#311, _w1#557L, _w2#558L, total_pages#546L]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Project [content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- Filter (((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(content_languages#27)) AND isnotnull(content_charset#26))\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- AdaptiveSparkPlan isFinalPlan=false\n +- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true, 0\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (2.0 * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(total_pages#546L as double)))) + if ((n_pages_charset#547L = n_pages#545L)) 0.0 else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(total_pages#546L as double))))) + if ((n_pages_languages#548L = n_pages#545L)) 0.0 else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(total_pages#546L as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) 0.0 else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / ((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(n_pages_languages#548L as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(n_pages_charset#547L as double)) AS prob_lang_given_cs#567]\n +- Filter (n_pages#545L >= 100)\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Sort [Language#279 ASC NULLS FIRST], false, 0\n +- Project [n_pages#545L, Language#279, Charset#311, _w2#558L, total_pages#546L, n_pages_charset#547L]\n +- Filter NOT Contains(Language#279, ,)\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Sort [Charset#311 ASC NULLS FIRST], false, 0\n +- Project [n_pages#545L, Language#279, Charset#311, _w1#557L, _w2#558L, total_pages#546L]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Exchange SinglePartition, true, [id=#129]\n +- HashAggregate(keys=[Charset#311, Language#279], functions=[count(1)], output=[n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L])\n +- ShuffleQueryStage 0\n +- Exchange hashpartitioning(Charset#311, Language#279, 1000), true, [id=#126]\n +- *(1) HashAggregate(keys=[Charset#311, Language#279], functions=[partial_count(1)], output=[Charset#311, Language#279, count#1053L])\n +- *(1) Project [content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- *(1) Filter (((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(content_languages#27)) AND isnotnull(content_charset#26))\n +- InMemoryTableScan [content_charset#26, content_languages#27, subset#33], [isnotnull(subset#33), (subset#33 = warc), isnotnull(content_languages#27), isnotnull(content_charset#26)]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=false","children":[{"nodeName":"Sort","simpleString":"Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true, 0","children":[{"nodeName":"Project","simpleString":"Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (2.0 * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(total_pages#546L as double)))) + if ((n_pages_charset#547L = n_pages#545L)) 0.0 else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(total_pages#546L as double))))) + if ((n_pages_languages#548L = n_pages#545L)) 0.0 else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(total_pages#546L as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) 0.0 else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / ((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(n_pages_languages#548L as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(n_pages_charset#547L as double)) AS prob_lang_given_cs#567]","children":[{"nodeName":"Filter","simpleString":"Filter (n_pages#545L >= 100)","children":[{"nodeName":"Window","simpleString":"Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]","children":[{"nodeName":"Sort","simpleString":"Sort [Language#279 ASC NULLS FIRST], false, 0","children":[{"nodeName":"Project","simpleString":"Project [n_pages#545L, Language#279, Charset#311, _w2#558L, total_pages#546L, n_pages_charset#547L]","children":[{"nodeName":"Filter","simpleString":"Filter NOT Contains(Language#279, ,)","children":[{"nodeName":"Window","simpleString":"Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]","children":[{"nodeName":"Sort","simpleString":"Sort [Charset#311 ASC NULLS FIRST], false, 0","children":[{"nodeName":"Project","simpleString":"Project [n_pages#545L, Language#279, Charset#311, _w1#557L, _w2#558L, total_pages#546L]","children":[{"nodeName":"Window","simpleString":"Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]","children":[{"nodeName":"Exchange","simpleString":"Exchange SinglePartition, true, [id=#129]","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Charset#311, Language#279], functions=[count(1)])","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 0","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Charset#311, Language#279, 1000), true, [id=#126]","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Charset#311, Language#279], functions=[partial_count(1)])","children":[{"nodeName":"Project","simpleString":"Project [content_charset#26 AS Charset#311, content_languages#27 AS Language#279]","children":[{"nodeName":"Filter","simpleString":"Filter (((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(content_languages#27)) AND isnotnull(content_charset#26))","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [content_charset#26, content_languages#27, subset#33], [isnotnull(subset#33), (subset#33 = warc), isnotnull(content_languages#27), isnotnull(content_charset#26)]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":109,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":166,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":163,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":164,"metricType":"timing"},{"name":"peak memory","accumulatorId":162,"metricType":"size"},{"name":"number of output rows","accumulatorId":161,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":165,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":160,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":131,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":132,"metricType":"nsTiming"},{"name":"records read","accumulatorId":129,"metricType":"sum"},{"name":"local bytes read","accumulatorId":127,"metricType":"size"},{"name":"fetch wait time","accumulatorId":128,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":125,"metricType":"size"},{"name":"local blocks read","accumulatorId":124,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":123,"metricType":"sum"},{"name":"data size","accumulatorId":122,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":126,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":130,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":157,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":158,"metricType":"timing"},{"name":"peak memory","accumulatorId":156,"metricType":"size"},{"name":"number of output rows","accumulatorId":155,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":159,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":142,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":143,"metricType":"nsTiming"},{"name":"records read","accumulatorId":140,"metricType":"sum"},{"name":"local bytes read","accumulatorId":138,"metricType":"size"},{"name":"fetch wait time","accumulatorId":139,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":136,"metricType":"size"},{"name":"local blocks read","accumulatorId":135,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":134,"metricType":"sum"},{"name":"data size","accumulatorId":133,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":137,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":141,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":152,"metricType":"timing"},{"name":"peak memory","accumulatorId":153,"metricType":"size"},{"name":"spill size","accumulatorId":154,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":151,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":148,"metricType":"timing"},{"name":"peak memory","accumulatorId":149,"metricType":"size"},{"name":"spill size","accumulatorId":150,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":147,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":144,"metricType":"timing"},{"name":"peak memory","accumulatorId":145,"metricType":"size"},{"name":"spill size","accumulatorId":146,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":83,"metricType":"sum"},{"name":"written output","accumulatorId":84,"metricType":"size"},{"name":"number of output rows","accumulatorId":85,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":86,"metricType":"sum"}]}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerDriverAccumUpdates","executionId":5,"accumUpdates":[[115,5],[116,4],[117,1090354548]]} -{"Event":"SparkListenerJobStart","Job ID":1,"Submission Time":1678162979920,"Stage Infos":[{"Stage ID":1,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":10,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"7\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[9],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":9,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"8\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[8],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":8,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[7],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":7,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]}],"Stage IDs":[1],"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"7\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"5","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":1,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":10,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"7\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[9],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":9,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"8\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[8],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":8,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[7],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":7,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162979931,"Accumulables":[]},"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"7\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"5","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":1,"Index":0,"Attempt":0,"Launch Time":1678162979998,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":2,"Index":1,"Attempt":0,"Launch Time":1678162980008,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":3,"Index":2,"Attempt":0,"Launch Time":1678162980011,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":4,"Index":3,"Attempt":0,"Launch Time":1678162980012,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":5,"Index":4,"Attempt":0,"Launch Time":1678162980013,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":6,"Index":5,"Attempt":0,"Launch Time":1678162980014,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":7,"Index":6,"Attempt":0,"Launch Time":1678162980015,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":8,"Index":7,"Attempt":0,"Launch Time":1678162980017,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":9,"Index":8,"Attempt":0,"Launch Time":1678162980019,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":10,"Index":9,"Attempt":0,"Launch Time":1678162980020,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":11,"Index":10,"Attempt":0,"Launch Time":1678162980022,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":12,"Index":11,"Attempt":0,"Launch Time":1678162980023,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":13,"Index":12,"Attempt":0,"Launch Time":1678162980025,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":14,"Index":13,"Attempt":0,"Launch Time":1678162980026,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":15,"Index":14,"Attempt":0,"Launch Time":1678162980030,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":16,"Index":15,"Attempt":0,"Launch Time":1678162980031,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":17,"Index":16,"Attempt":0,"Launch Time":1678162980032,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":18,"Index":17,"Attempt":0,"Launch Time":1678162980035,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":19,"Index":18,"Attempt":0,"Launch Time":1678162980036,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":20,"Index":19,"Attempt":0,"Launch Time":1678162980037,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":21,"Index":20,"Attempt":0,"Launch Time":1678162980039,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":22,"Index":21,"Attempt":0,"Launch Time":1678162980041,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":23,"Index":22,"Attempt":0,"Launch Time":1678162980042,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":24,"Index":23,"Attempt":0,"Launch Time":1678162980043,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":25,"Index":24,"Attempt":0,"Launch Time":1678162980045,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":26,"Index":25,"Attempt":0,"Launch Time":1678162980046,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":27,"Index":26,"Attempt":0,"Launch Time":1678162980048,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":28,"Index":27,"Attempt":0,"Launch Time":1678162980049,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":29,"Index":28,"Attempt":0,"Launch Time":1678162980050,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":30,"Index":29,"Attempt":0,"Launch Time":1678162980052,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":31,"Index":30,"Attempt":0,"Launch Time":1678162980054,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":32,"Index":31,"Attempt":0,"Launch Time":1678162980055,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":33,"Index":32,"Attempt":0,"Launch Time":1678162982742,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":13,"Index":12,"Attempt":0,"Launch Time":1678162980025,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162982748,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"249","Value":"249","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"2","Value":"2","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"70","Value":"70","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"50","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":14145,"Value":14145,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":294912,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":128,"Value":128,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":3809,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":766099159,"Value":766099159,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2181,"Value":2181,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":98930502,"Value":98930502,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":480,"Value":480,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":480,"Executor Deserialize CPU Time":98930502,"Executor Run Time":2181,"Executor CPU Time":766099159,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":128,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":14145,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":21,"Index":20,"Attempt":0,"Launch Time":1678162980039,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162982750,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"252","Value":"501","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"11","Value":"13","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"2","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"57","Value":"127","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"54","Value":"104","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13311,"Value":27456,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":589824,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":128,"Value":256,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":7618,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":207903279,"Value":974002438,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2182,"Value":4363,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":226997291,"Value":325927793,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":476,"Value":956,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":476,"Executor Deserialize CPU Time":226997291,"Executor Run Time":2182,"Executor CPU Time":207903279,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":128,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13311,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":29,"Index":28,"Attempt":0,"Launch Time":1678162980050,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162982753,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"249","Value":"750","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"786432","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"7","Value":"20","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"70","Value":"197","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"48","Value":"152","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":14145,"Value":41601,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":884736,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":128,"Value":384,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":11427,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":606057407,"Value":1580059845,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2197,"Value":6560,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":96377652,"Value":422305445,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":465,"Value":1421,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":465,"Executor Deserialize CPU Time":96377652,"Executor Run Time":2197,"Executor CPU Time":606057407,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":128,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":14145,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":33,"Index":32,"Attempt":0,"Launch Time":1678162982742,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162982916,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"20","Value":"770","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"1048576","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"20","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"4","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"112","Value":"309","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"112","Value":"264","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":14746,"Value":56347,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1179648,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3766,"Value":15193,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":30744569,"Value":1610804414,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":158,"Value":6718,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6280822,"Value":428586267,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":7,"Value":1428,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":390886176,"JVMOffHeapMemory":113641848,"OnHeapExecutionMemory":100794368,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":1712433,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":102506801,"OffHeapUnifiedMemory":0,"DirectPoolMemory":181850,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7154511872,"ProcessTreeJVMRSSMemory":1206054912,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":8,"MinorGCTime":192,"MajorGCCount":3,"MajorGCTime":220},"Task Metrics":{"Executor Deserialize Time":7,"Executor Deserialize CPU Time":6280822,"Executor Run Time":158,"Executor CPU Time":30744569,"Peak Execution Memory":294912,"Result Size":3766,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":14746,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":26,"Index":25,"Attempt":0,"Launch Time":1678162980046,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984247,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"150","Value":"920","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"1310720","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"4","Value":"24","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"5","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1140","Value":"1449","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1137","Value":"1401","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13405,"Value":69752,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1474560,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":223,"Value":607,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":19002,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":292778735,"Value":1903583149,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2830,"Value":9548,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":215106889,"Value":643693156,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1320,"Value":2748,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1320,"Executor Deserialize CPU Time":215106889,"Executor Run Time":2830,"Executor CPU Time":292778735,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":223,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13405,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":2,"Index":1,"Attempt":0,"Launch Time":1678162980008,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984248,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"150","Value":"1070","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"1572864","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"5","Value":"29","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1140","Value":"2589","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1137","Value":"2538","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":14746,"Value":84498,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1769472,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":223,"Value":830,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":22811,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":785092723,"Value":2688675872,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2841,"Value":12389,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":309626858,"Value":953320014,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1321,"Value":4069,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1321,"Executor Deserialize CPU Time":309626858,"Executor Run Time":2841,"Executor CPU Time":785092723,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":223,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":14746,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":4,"Index":3,"Attempt":0,"Launch Time":1678162980012,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984349,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"162","Value":"1232","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"1835008","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"6","Value":"35","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"989","Value":"3578","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"986","Value":"3524","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":14746,"Value":99244,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2064384,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":260,"Value":1090,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":26620,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":415924911,"Value":3104600783,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2754,"Value":15143,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":368314331,"Value":1321634345,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1495,"Value":5564,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1495,"Executor Deserialize CPU Time":368314331,"Executor Run Time":2754,"Executor CPU Time":415924911,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":260,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":14746,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":28,"Index":27,"Attempt":0,"Launch Time":1678162980049,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984349,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"163","Value":"1395","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"2097152","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"7","Value":"42","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"8","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"989","Value":"4567","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"984","Value":"4508","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":14746,"Value":113990,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2359296,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":260,"Value":1350,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":30429,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":1001089760,"Value":4105690543,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2751,"Value":17894,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":212652740,"Value":1534287085,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1499,"Value":7063,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1499,"Executor Deserialize CPU Time":212652740,"Executor Run Time":2751,"Executor CPU Time":1001089760,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":260,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":14746,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":25,"Index":24,"Attempt":0,"Launch Time":1678162980045,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984364,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"143","Value":"1538","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"2359296","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"5","Value":"47","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"9","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"983","Value":"5550","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"980","Value":"5488","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13311,"Value":127301,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2654208,"Internal":true,"Count Failed Values":true},{"ID":173,"Name":"internal.metrics.resultSerializationTime","Update":2,"Value":2,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":291,"Value":1641,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":34281,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":342278729,"Value":4447969272,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2770,"Value":20664,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":321868716,"Value":1856155801,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1494,"Value":8557,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1494,"Executor Deserialize CPU Time":321868716,"Executor Run Time":2770,"Executor CPU Time":342278729,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":291,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13311,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":1,"Index":0,"Attempt":0,"Launch Time":1678162979998,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984365,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"143","Value":"1681","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"2621440","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"47","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"10","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"983","Value":"6533","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"980","Value":"6468","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":14145,"Value":141446,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2949120,"Internal":true,"Count Failed Values":true},{"ID":173,"Name":"internal.metrics.resultSerializationTime","Update":2,"Value":4,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":291,"Value":1932,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":38133,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":228058736,"Value":4676028008,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2771,"Value":23435,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":184976661,"Value":2041132462,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1494,"Value":10051,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1494,"Executor Deserialize CPU Time":184976661,"Executor Run Time":2771,"Executor CPU Time":228058736,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":291,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":14145,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":9,"Index":8,"Attempt":0,"Launch Time":1678162980019,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984376,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"143","Value":"1824","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"2883584","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"47","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"11","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"983","Value":"7516","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"980","Value":"7448","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13297,"Value":154743,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3244032,"Internal":true,"Count Failed Values":true},{"ID":173,"Name":"internal.metrics.resultSerializationTime","Update":1,"Value":5,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":291,"Value":2223,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":41985,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":1070307064,"Value":5746335072,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2777,"Value":26212,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":445887440,"Value":2487019902,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1488,"Value":11539,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1488,"Executor Deserialize CPU Time":445887440,"Executor Run Time":2777,"Executor CPU Time":1070307064,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":291,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13297,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":3,"Index":2,"Attempt":0,"Launch Time":1678162980011,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984466,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"197","Value":"2021","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"3145728","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"4","Value":"51","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"12","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1053","Value":"8569","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1050","Value":"8498","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13297,"Value":168040,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3538944,"Internal":true,"Count Failed Values":true},{"ID":173,"Name":"internal.metrics.resultSerializationTime","Update":1,"Value":6,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":264,"Value":2487,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":45837,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":1141720052,"Value":6888055124,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2904,"Value":29116,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":317476676,"Value":2804496578,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1461,"Value":13000,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1461,"Executor Deserialize CPU Time":317476676,"Executor Run Time":2904,"Executor CPU Time":1141720052,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":264,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13297,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":11,"Index":10,"Attempt":0,"Launch Time":1678162980022,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984469,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"197","Value":"2218","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"3407872","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"4","Value":"55","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"13","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1052","Value":"9621","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1050","Value":"9548","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13297,"Value":181337,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3833856,"Internal":true,"Count Failed Values":true},{"ID":173,"Name":"internal.metrics.resultSerializationTime","Update":1,"Value":7,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":264,"Value":2751,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":49689,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":242581006,"Value":7130636130,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2903,"Value":32019,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":207123400,"Value":3011619978,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1462,"Value":14462,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1462,"Executor Deserialize CPU Time":207123400,"Executor Run Time":2903,"Executor CPU Time":242581006,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":264,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13297,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":19,"Index":18,"Attempt":0,"Launch Time":1678162980036,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984472,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"197","Value":"2415","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"3670016","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"4","Value":"59","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"14","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1059","Value":"10680","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1056","Value":"10604","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13405,"Value":194742,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4128768,"Internal":true,"Count Failed Values":true},{"ID":173,"Name":"internal.metrics.resultSerializationTime","Update":1,"Value":8,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":264,"Value":3015,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":53541,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":308774930,"Value":7439411060,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2903,"Value":34922,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":259126857,"Value":3270746835,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1462,"Value":15924,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1462,"Executor Deserialize CPU Time":259126857,"Executor Run Time":2903,"Executor CPU Time":308774930,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":264,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13405,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":24,"Index":23,"Attempt":0,"Launch Time":1678162980043,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984495,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"165","Value":"2580","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"3932160","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"5","Value":"64","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"15","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1011","Value":"11691","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1009","Value":"11613","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13405,"Value":208147,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4423680,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":254,"Value":3269,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":57350,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":219182581,"Value":7658593641,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2880,"Value":37802,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":250025131,"Value":3520771966,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1497,"Value":17421,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":526178872,"JVMOffHeapMemory":102106288,"OnHeapExecutionMemory":131072,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":663903,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":794975,"OffHeapUnifiedMemory":0,"DirectPoolMemory":10540,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7072075776,"ProcessTreeJVMRSSMemory":828153856,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":6,"MinorGCTime":89,"MajorGCCount":3,"MajorGCTime":305},"Task Metrics":{"Executor Deserialize Time":1497,"Executor Deserialize CPU Time":250025131,"Executor Run Time":2880,"Executor CPU Time":219182581,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":254,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13405,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":8,"Index":7,"Attempt":0,"Launch Time":1678162980017,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984497,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"155","Value":"2735","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"4194304","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"5","Value":"69","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"16","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1011","Value":"12702","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1009","Value":"12622","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":14145,"Value":222292,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4718592,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":254,"Value":3523,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":61159,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":764049657,"Value":8422643298,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2877,"Value":40679,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":216945390,"Value":3737717356,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1503,"Value":18924,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":526178872,"JVMOffHeapMemory":102106288,"OnHeapExecutionMemory":131072,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":663903,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":794975,"OffHeapUnifiedMemory":0,"DirectPoolMemory":10540,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7072075776,"ProcessTreeJVMRSSMemory":828153856,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":6,"MinorGCTime":89,"MajorGCCount":3,"MajorGCTime":305},"Task Metrics":{"Executor Deserialize Time":1503,"Executor Deserialize CPU Time":216945390,"Executor Run Time":2877,"Executor CPU Time":764049657,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":254,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":14145,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":32,"Index":31,"Attempt":0,"Launch Time":1678162980055,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984499,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"168","Value":"2903","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"4456448","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"5","Value":"74","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"17","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1010","Value":"13712","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1008","Value":"13630","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13311,"Value":235603,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5013504,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":254,"Value":3777,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":64968,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":542525408,"Value":8965168706,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2875,"Value":43554,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":399029227,"Value":4136746583,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1501,"Value":20425,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":526178872,"JVMOffHeapMemory":102106288,"OnHeapExecutionMemory":131072,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":663903,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":794975,"OffHeapUnifiedMemory":0,"DirectPoolMemory":10540,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7072075776,"ProcessTreeJVMRSSMemory":828153856,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":6,"MinorGCTime":89,"MajorGCCount":3,"MajorGCTime":305},"Task Metrics":{"Executor Deserialize Time":1501,"Executor Deserialize CPU Time":399029227,"Executor Run Time":2875,"Executor CPU Time":542525408,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":254,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13311,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":22,"Index":21,"Attempt":0,"Launch Time":1678162980041,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984517,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"145","Value":"3048","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"4","Value":"78","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"18","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1136","Value":"14848","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1133","Value":"14763","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":14746,"Value":250349,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5308416,"Internal":true,"Count Failed Values":true},{"ID":173,"Name":"internal.metrics.resultSerializationTime","Update":1,"Value":9,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":264,"Value":4041,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":68820,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":419171962,"Value":9384340668,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2887,"Value":46441,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":392112202,"Value":4528858785,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1514,"Value":21939,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":141939104,"JVMOffHeapMemory":72113424,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":107290,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":107290,"OffHeapUnifiedMemory":0,"DirectPoolMemory":10540,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7042990080,"ProcessTreeJVMRSSMemory":626630656,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":5,"MinorGCTime":61,"MajorGCCount":2,"MajorGCTime":149},"Task Metrics":{"Executor Deserialize Time":1514,"Executor Deserialize CPU Time":392112202,"Executor Run Time":2887,"Executor CPU Time":419171962,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":264,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":14746,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":16,"Index":15,"Attempt":0,"Launch Time":1678162980031,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984518,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"165","Value":"3213","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"4980736","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"6","Value":"84","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"19","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1010","Value":"15858","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1008","Value":"15771","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13311,"Value":263660,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5603328,"Internal":true,"Count Failed Values":true},{"ID":173,"Name":"internal.metrics.resultSerializationTime","Update":1,"Value":10,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":254,"Value":4295,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":72672,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":293076137,"Value":9677416805,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2874,"Value":49315,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":263297650,"Value":4792156435,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1497,"Value":23436,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":526178872,"JVMOffHeapMemory":102106288,"OnHeapExecutionMemory":131072,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":663903,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":794975,"OffHeapUnifiedMemory":0,"DirectPoolMemory":10540,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7072075776,"ProcessTreeJVMRSSMemory":828153856,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":6,"MinorGCTime":89,"MajorGCCount":3,"MajorGCTime":305},"Task Metrics":{"Executor Deserialize Time":1497,"Executor Deserialize CPU Time":263297650,"Executor Run Time":2874,"Executor CPU Time":293076137,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":254,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13311,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":30,"Index":29,"Attempt":0,"Launch Time":1678162980052,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984519,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"144","Value":"3357","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"5242880","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"2","Value":"86","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"20","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1140","Value":"16998","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1138","Value":"16909","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":26702,"Value":290362,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5898240,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":264,"Value":4559,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":76481,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":687384885,"Value":10364801690,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2887,"Value":52202,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":342520347,"Value":5134676782,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1514,"Value":24950,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":141939104,"JVMOffHeapMemory":72113424,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":107290,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":107290,"OffHeapUnifiedMemory":0,"DirectPoolMemory":10540,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7042990080,"ProcessTreeJVMRSSMemory":626630656,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":5,"MinorGCTime":61,"MajorGCCount":2,"MajorGCTime":149},"Task Metrics":{"Executor Deserialize Time":1514,"Executor Deserialize CPU Time":342520347,"Executor Run Time":2887,"Executor CPU Time":687384885,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":264,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":26702,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":7,"Index":6,"Attempt":0,"Launch Time":1678162980015,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984579,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"157","Value":"3514","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"5505024","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"4","Value":"90","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"21","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1105","Value":"18103","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1101","Value":"18010","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13311,"Value":303673,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6193152,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":259,"Value":4818,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":80290,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":248267655,"Value":10613069345,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2920,"Value":55122,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":207027837,"Value":5341704619,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1537,"Value":26487,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":348482528,"JVMOffHeapMemory":107273224,"OnHeapExecutionMemory":131072,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":1712433,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":1843505,"OffHeapUnifiedMemory":0,"DirectPoolMemory":10540,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7081742336,"ProcessTreeJVMRSSMemory":852627456,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":7,"MinorGCTime":134,"MajorGCCount":3,"MajorGCTime":367},"Task Metrics":{"Executor Deserialize Time":1537,"Executor Deserialize CPU Time":207027837,"Executor Run Time":2920,"Executor CPU Time":248267655,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":259,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13311,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":15,"Index":14,"Attempt":0,"Launch Time":1678162980030,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984579,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"157","Value":"3671","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"5767168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"4","Value":"94","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"22","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1105","Value":"19208","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1101","Value":"19111","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13297,"Value":316970,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6488064,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":259,"Value":5077,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":84099,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":753349708,"Value":11366419053,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2923,"Value":58045,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":213345547,"Value":5555050166,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1527,"Value":28014,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":348482528,"JVMOffHeapMemory":107273224,"OnHeapExecutionMemory":131072,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":1712433,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":1843505,"OffHeapUnifiedMemory":0,"DirectPoolMemory":10540,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7081742336,"ProcessTreeJVMRSSMemory":852627456,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":7,"MinorGCTime":134,"MajorGCCount":3,"MajorGCTime":367},"Task Metrics":{"Executor Deserialize Time":1527,"Executor Deserialize CPU Time":213345547,"Executor Run Time":2923,"Executor CPU Time":753349708,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":259,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13297,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":31,"Index":30,"Attempt":0,"Launch Time":1678162980054,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984580,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"157","Value":"3828","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"6029312","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"4","Value":"98","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"23","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1105","Value":"20313","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1101","Value":"20212","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13405,"Value":330375,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6782976,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":259,"Value":5336,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":87908,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":692206155,"Value":12058625208,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2920,"Value":60965,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":511768345,"Value":6066818511,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1526,"Value":29540,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":348482528,"JVMOffHeapMemory":107273224,"OnHeapExecutionMemory":131072,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":1712433,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":1843505,"OffHeapUnifiedMemory":0,"DirectPoolMemory":10540,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7081742336,"ProcessTreeJVMRSSMemory":852627456,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":7,"MinorGCTime":134,"MajorGCCount":3,"MajorGCTime":367},"Task Metrics":{"Executor Deserialize Time":1526,"Executor Deserialize CPU Time":511768345,"Executor Run Time":2920,"Executor CPU Time":692206155,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":259,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13405,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":5,"Index":4,"Attempt":0,"Launch Time":1678162980013,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162986094,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"8","Value":"3836","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"6291456","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"98","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"24","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"4619","Value":"24932","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":113,"Name":"number of input batches","Update":"274","Value":"274","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":112,"Name":"number of output rows","Update":"1120882","Value":"1120882","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":114,"Name":"number of output rows","Update":"1120882","Value":"1120882","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"4446","Value":"24658","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":189,"Name":"internal.metrics.input.recordsRead","Update":1120882,"Value":1120882,"Internal":true,"Count Failed Values":true},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":129065639,"Value":129396014,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7077888,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":159,"Value":5495,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":91760,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":1247949979,"Value":13306575187,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":5535,"Value":66500,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":206215778,"Value":6273034289,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":482,"Value":30022,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":390886176,"JVMOffHeapMemory":113641848,"OnHeapExecutionMemory":100794368,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":1712433,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":102506801,"OffHeapUnifiedMemory":0,"DirectPoolMemory":181850,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7154511872,"ProcessTreeJVMRSSMemory":1206054912,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":8,"MinorGCTime":192,"MajorGCCount":3,"MajorGCTime":220},"Task Metrics":{"Executor Deserialize Time":482,"Executor Deserialize CPU Time":206215778,"Executor Run Time":5535,"Executor CPU Time":1247949979,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":159,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":129065639,"Records Read":1120882},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":17,"Index":16,"Attempt":0,"Launch Time":1678162980032,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162986160,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"9","Value":"3845","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"6553600","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"98","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"25","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"3744","Value":"28676","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":113,"Name":"number of input batches","Update":"17","Value":"291","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":112,"Name":"number of output rows","Update":"69516","Value":"1190398","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":114,"Name":"number of output rows","Update":"69516","Value":"1190398","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"3724","Value":"28382","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":189,"Name":"internal.metrics.input.recordsRead","Update":69516,"Value":1190398,"Internal":true,"Count Failed Values":true},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":72231192,"Value":201627206,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7372800,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":303,"Value":5798,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":95612,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":737889122,"Value":14044464309,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":4585,"Value":71085,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":202715130,"Value":6475749419,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1494,"Value":31516,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":395991528,"JVMOffHeapMemory":118706328,"OnHeapExecutionMemory":32768,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":1712433,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":1745201,"OffHeapUnifiedMemory":0,"DirectPoolMemory":13821364,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7106768896,"ProcessTreeJVMRSSMemory":995651584,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":8,"MinorGCTime":181,"MajorGCCount":3,"MajorGCTime":311},"Task Metrics":{"Executor Deserialize Time":1494,"Executor Deserialize CPU Time":202715130,"Executor Run Time":4585,"Executor CPU Time":737889122,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":303,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":72231192,"Records Read":69516},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":10,"Index":9,"Attempt":0,"Launch Time":1678162980020,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162986840,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"13","Value":"3858","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"6815744","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"98","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"26","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"4666","Value":"33342","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":113,"Name":"number of input batches","Update":"181","Value":"472","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":112,"Name":"number of output rows","Update":"740621","Value":"1931019","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":114,"Name":"number of output rows","Update":"740621","Value":"1931019","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"4563","Value":"32945","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":189,"Name":"internal.metrics.input.recordsRead","Update":740621,"Value":1931019,"Internal":true,"Count Failed Values":true},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":96064540,"Value":297691746,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7667712,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":268,"Value":6066,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":99464,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":920976379,"Value":14965440688,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":5440,"Value":76525,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":266952657,"Value":6742702076,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1318,"Value":32834,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1318,"Executor Deserialize CPU Time":266952657,"Executor Run Time":5440,"Executor CPU Time":920976379,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":268,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":96064540,"Records Read":740621},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":18,"Index":17,"Attempt":0,"Launch Time":1678162980035,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162986857,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"7","Value":"3865","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"7077888","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"98","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"27","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"4690","Value":"38032","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":113,"Name":"number of input batches","Update":"186","Value":"658","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":112,"Name":"number of output rows","Update":"760701","Value":"2691720","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":114,"Name":"number of output rows","Update":"760701","Value":"2691720","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"4584","Value":"37529","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":189,"Name":"internal.metrics.input.recordsRead","Update":760701,"Value":2691720,"Internal":true,"Count Failed Values":true},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":96562829,"Value":394254575,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7962624,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":268,"Value":6334,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":103316,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":1084466489,"Value":16049907177,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":5440,"Value":81965,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":353709505,"Value":7096411581,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1336,"Value":34170,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1336,"Executor Deserialize CPU Time":353709505,"Executor Run Time":5440,"Executor CPU Time":1084466489,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":268,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":96562829,"Records Read":760701},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":20,"Index":19,"Attempt":0,"Launch Time":1678162980037,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162987637,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"15","Value":"3880","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"7340032","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"98","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"28","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"5185","Value":"43217","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":113,"Name":"number of input batches","Update":"178","Value":"836","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":112,"Name":"number of output rows","Update":"728996","Value":"3420716","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":114,"Name":"number of output rows","Update":"728996","Value":"3420716","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"5066","Value":"42595","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":189,"Name":"internal.metrics.input.recordsRead","Update":728996,"Value":3420716,"Internal":true,"Count Failed Values":true},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":107980462,"Value":502235037,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8257536,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":303,"Value":6637,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":107168,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":1320529166,"Value":17370436343,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":6050,"Value":88015,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":207428063,"Value":7303839644,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1499,"Value":35669,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":946782296,"JVMOffHeapMemory":112107152,"OnHeapExecutionMemory":65536,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":2760963,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":2826499,"OffHeapUnifiedMemory":0,"DirectPoolMemory":13955,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7083728896,"ProcessTreeJVMRSSMemory":1319464960,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":8,"MinorGCTime":165,"MajorGCCount":3,"MajorGCTime":332},"Task Metrics":{"Executor Deserialize Time":1499,"Executor Deserialize CPU Time":207428063,"Executor Run Time":6050,"Executor CPU Time":1320529166,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":303,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":107980462,"Records Read":728996},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":12,"Index":11,"Attempt":0,"Launch Time":1678162980023,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162987801,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"8","Value":"3888","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"7602176","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"98","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"29","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"5356","Value":"48573","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":113,"Name":"number of input batches","Update":"137","Value":"973","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":112,"Name":"number of output rows","Update":"560100","Value":"3980816","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":114,"Name":"number of output rows","Update":"560100","Value":"3980816","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"5271","Value":"47866","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":189,"Name":"internal.metrics.input.recordsRead","Update":560100,"Value":3980816,"Internal":true,"Count Failed Values":true},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":131899546,"Value":634134583,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8552448,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":303,"Value":6940,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":111020,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":723894455,"Value":18094330798,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":6215,"Value":94230,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":299017127,"Value":7602856771,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1499,"Value":37168,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":946782296,"JVMOffHeapMemory":112107152,"OnHeapExecutionMemory":65536,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":2760963,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":2826499,"OffHeapUnifiedMemory":0,"DirectPoolMemory":13955,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7083728896,"ProcessTreeJVMRSSMemory":1319464960,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":8,"MinorGCTime":165,"MajorGCCount":3,"MajorGCTime":332},"Task Metrics":{"Executor Deserialize Time":1499,"Executor Deserialize CPU Time":299017127,"Executor Run Time":6215,"Executor CPU Time":723894455,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":303,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":131899546,"Records Read":560100},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":14,"Index":13,"Attempt":0,"Launch Time":1678162980026,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162987843,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"15","Value":"3903","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"7864320","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"98","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"30","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"5432","Value":"54005","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":113,"Name":"number of input batches","Update":"191","Value":"1164","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":112,"Name":"number of output rows","Update":"781104","Value":"4761920","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":114,"Name":"number of output rows","Update":"781104","Value":"4761920","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"5318","Value":"53184","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":189,"Name":"internal.metrics.input.recordsRead","Update":781104,"Value":4761920,"Internal":true,"Count Failed Values":true},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":108077075,"Value":742211658,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8847360,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":308,"Value":7248,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":114872,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":1178900448,"Value":19273231246,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":6227,"Value":100457,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":198881590,"Value":7801738361,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1514,"Value":38682,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":141939104,"JVMOffHeapMemory":72113424,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":107290,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":107290,"OffHeapUnifiedMemory":0,"DirectPoolMemory":10540,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7042990080,"ProcessTreeJVMRSSMemory":626630656,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":5,"MinorGCTime":61,"MajorGCCount":2,"MajorGCTime":149},"Task Metrics":{"Executor Deserialize Time":1514,"Executor Deserialize CPU Time":198881590,"Executor Run Time":6227,"Executor CPU Time":1178900448,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":308,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":108077075,"Records Read":781104},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":6,"Index":5,"Attempt":0,"Launch Time":1678162980014,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162987845,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"19","Value":"3922","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"8126464","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"98","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"31","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"5426","Value":"59431","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":113,"Name":"number of input batches","Update":"198","Value":"1362","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":112,"Name":"number of output rows","Update":"809218","Value":"5571138","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":114,"Name":"number of output rows","Update":"809218","Value":"5571138","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"5293","Value":"58477","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":189,"Name":"internal.metrics.input.recordsRead","Update":809218,"Value":5571138,"Internal":true,"Count Failed Values":true},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":96212273,"Value":838423931,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9142272,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":308,"Value":7556,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":118724,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":1133807133,"Value":20407038379,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":6229,"Value":106686,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":196597733,"Value":7998336094,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1514,"Value":40196,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":141939104,"JVMOffHeapMemory":72113424,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":107290,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":107290,"OffHeapUnifiedMemory":0,"DirectPoolMemory":10540,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7042990080,"ProcessTreeJVMRSSMemory":626630656,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":5,"MinorGCTime":61,"MajorGCCount":2,"MajorGCTime":149},"Task Metrics":{"Executor Deserialize Time":1514,"Executor Deserialize CPU Time":196597733,"Executor Run Time":6229,"Executor CPU Time":1133807133,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":308,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":96212273,"Records Read":809218},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":27,"Index":26,"Attempt":0,"Launch Time":1678162980048,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162988094,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"10","Value":"3932","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"8388608","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"98","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"32","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"5676","Value":"65107","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":113,"Name":"number of input batches","Update":"215","Value":"1577","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":112,"Name":"number of output rows","Update":"879479","Value":"6450617","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":114,"Name":"number of output rows","Update":"879479","Value":"6450617","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"5535","Value":"64012","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":189,"Name":"internal.metrics.input.recordsRead","Update":879479,"Value":6450617,"Internal":true,"Count Failed Values":true},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":125375392,"Value":963799323,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9437184,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":300,"Value":7856,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":122576,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":1311072168,"Value":21718110547,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":6545,"Value":113231,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":368371012,"Value":8366707106,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1462,"Value":41658,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1462,"Executor Deserialize CPU Time":368371012,"Executor Run Time":6545,"Executor CPU Time":1311072168,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":300,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":125375392,"Records Read":879479},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":23,"Index":22,"Attempt":0,"Launch Time":1678162980042,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162988553,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"7","Value":"3939","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"8650752","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"98","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"33","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"6090","Value":"71197","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":113,"Name":"number of input batches","Update":"237","Value":"1814","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":112,"Name":"number of output rows","Update":"969399","Value":"7420016","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":114,"Name":"number of output rows","Update":"969399","Value":"7420016","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"5951","Value":"69963","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":189,"Name":"internal.metrics.input.recordsRead","Update":969399,"Value":7420016,"Internal":true,"Count Failed Values":true},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":126954524,"Value":1090753847,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9732096,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":292,"Value":8148,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":126428,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":1276437053,"Value":22994547600,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":6903,"Value":120134,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":225138204,"Value":8591845310,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1527,"Value":43185,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":348482528,"JVMOffHeapMemory":107273224,"OnHeapExecutionMemory":131072,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":1712433,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":1843505,"OffHeapUnifiedMemory":0,"DirectPoolMemory":10540,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7081742336,"ProcessTreeJVMRSSMemory":852627456,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":7,"MinorGCTime":134,"MajorGCCount":3,"MajorGCTime":367},"Task Metrics":{"Executor Deserialize Time":1527,"Executor Deserialize CPU Time":225138204,"Executor Run Time":6903,"Executor CPU Time":1276437053,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":292,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":126954524,"Records Read":969399},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":1,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":10,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"7\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[9],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":9,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"8\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[8],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":8,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[7],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":7,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162979931,"Completion Time":1678162988554,"Accumulables":[{"ID":173,"Name":"internal.metrics.resultSerializationTime","Value":10,"Internal":true,"Count Failed Values":true},{"ID":164,"Name":"time in aggregation build","Value":"98","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Value":"69963","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Value":"71197","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":113,"Name":"number of input batches","Value":"1814","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Value":43185,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Value":9732096,"Internal":true,"Count Failed Values":true},{"ID":160,"Name":"duration","Value":"3939","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":169,"Name":"internal.metrics.executorRunTime","Value":120134,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Value":8148,"Internal":true,"Count Failed Values":true},{"ID":112,"Name":"number of output rows","Value":"7420016","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Value":"33","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":189,"Name":"internal.metrics.input.recordsRead","Value":7420016,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Value":8591845310,"Internal":true,"Count Failed Values":true},{"ID":114,"Name":"number of output rows","Value":"7420016","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":171,"Name":"internal.metrics.resultSize","Value":126428,"Internal":true,"Count Failed Values":true},{"ID":162,"Name":"peak memory","Value":"8650752","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":170,"Name":"internal.metrics.executorCpuTime","Value":22994547600,"Internal":true,"Count Failed Values":true},{"ID":188,"Name":"internal.metrics.input.bytesRead","Value":1090753847,"Internal":true,"Count Failed Values":true}]}} -{"Event":"SparkListenerJobEnd","Job ID":1,"Completion Time":1678162988571,"Job Result":{"Result":"JobSucceeded"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate","executionId":5,"physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, loglikelihood#565, prob_cs_given_lang#566, prob_lang_given_cs#567]\n +- Filter ((n_pages#545L >= cast(100 as bigint)) AND NOT Language#279 LIKE %,%)\n +- SubqueryAlias language_charset_loglikelihood\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (cast(2 as double) * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(cast(total_pages#546L as double) as double)))) + if ((n_pages_charset#547L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if ((n_pages_languages#548L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) cast(0 as double) else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / cast(((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)) as double))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(cast(n_pages_languages#548L as double) as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(cast(n_pages_charset#547L as double) as double)) AS prob_lang_given_cs#567]\n +- SubqueryAlias language_count_tmp\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, loglikelihood#565, prob_cs_given_lang#566, prob_lang_given_cs#567]\n +- Filter ((n_pages#545L >= cast(100 as bigint)) AND NOT Language#279 LIKE %,%)\n +- SubqueryAlias language_charset_loglikelihood\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (cast(2 as double) * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(cast(total_pages#546L as double) as double)))) + if ((n_pages_charset#547L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if ((n_pages_languages#548L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) cast(0 as double) else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / cast(((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)) as double))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(cast(n_pages_languages#548L as double) as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(cast(n_pages_charset#547L as double) as double)) AS prob_lang_given_cs#567]\n +- SubqueryAlias language_count_tmp\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (2.0 * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(total_pages#546L as double)))) + if ((n_pages_charset#547L = n_pages#545L)) 0.0 else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(total_pages#546L as double))))) + if ((n_pages_languages#548L = n_pages#545L)) 0.0 else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(total_pages#546L as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) 0.0 else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / ((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(n_pages_languages#548L as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(n_pages_charset#547L as double)) AS prob_lang_given_cs#567]\n +- Filter (n_pages#545L >= 100)\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Project [n_pages#545L, Language#279, Charset#311, _w2#558L, total_pages#546L, n_pages_charset#547L]\n +- Filter NOT Contains(Language#279, ,)\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Project [n_pages#545L, Language#279, Charset#311, _w1#557L, _w2#558L, total_pages#546L]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Project [content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- Filter (((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(content_languages#27)) AND isnotnull(content_charset#26))\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- AdaptiveSparkPlan isFinalPlan=false\n +- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true, 0\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (2.0 * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(total_pages#546L as double)))) + if ((n_pages_charset#547L = n_pages#545L)) 0.0 else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(total_pages#546L as double))))) + if ((n_pages_languages#548L = n_pages#545L)) 0.0 else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(total_pages#546L as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) 0.0 else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / ((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(n_pages_languages#548L as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(n_pages_charset#547L as double)) AS prob_lang_given_cs#567]\n +- Filter (n_pages#545L >= 100)\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Sort [Language#279 ASC NULLS FIRST], false, 0\n +- Project [n_pages#545L, Language#279, Charset#311, _w2#558L, total_pages#546L, n_pages_charset#547L]\n +- Filter NOT Contains(Language#279, ,)\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Sort [Charset#311 ASC NULLS FIRST], false, 0\n +- Project [n_pages#545L, Language#279, Charset#311, _w1#557L, _w2#558L, total_pages#546L]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- ShuffleQueryStage 1\n +- Exchange SinglePartition, true, [id=#204]\n +- *(2) HashAggregate(keys=[Charset#311, Language#279], functions=[count(1)], output=[n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L])\n +- CustomShuffleReader coalesced\n +- ShuffleQueryStage 0\n +- Exchange hashpartitioning(Charset#311, Language#279, 1000), true, [id=#126]\n +- *(1) HashAggregate(keys=[Charset#311, Language#279], functions=[partial_count(1)], output=[Charset#311, Language#279, count#1053L])\n +- *(1) Project [content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- *(1) Filter (((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(content_languages#27)) AND isnotnull(content_charset#26))\n +- InMemoryTableScan [content_charset#26, content_languages#27, subset#33], [isnotnull(subset#33), (subset#33 = warc), isnotnull(content_languages#27), isnotnull(content_charset#26)]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=false","children":[{"nodeName":"Sort","simpleString":"Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true, 0","children":[{"nodeName":"Project","simpleString":"Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (2.0 * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(total_pages#546L as double)))) + if ((n_pages_charset#547L = n_pages#545L)) 0.0 else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(total_pages#546L as double))))) + if ((n_pages_languages#548L = n_pages#545L)) 0.0 else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(total_pages#546L as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) 0.0 else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / ((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(n_pages_languages#548L as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(n_pages_charset#547L as double)) AS prob_lang_given_cs#567]","children":[{"nodeName":"Filter","simpleString":"Filter (n_pages#545L >= 100)","children":[{"nodeName":"Window","simpleString":"Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]","children":[{"nodeName":"Sort","simpleString":"Sort [Language#279 ASC NULLS FIRST], false, 0","children":[{"nodeName":"Project","simpleString":"Project [n_pages#545L, Language#279, Charset#311, _w2#558L, total_pages#546L, n_pages_charset#547L]","children":[{"nodeName":"Filter","simpleString":"Filter NOT Contains(Language#279, ,)","children":[{"nodeName":"Window","simpleString":"Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]","children":[{"nodeName":"Sort","simpleString":"Sort [Charset#311 ASC NULLS FIRST], false, 0","children":[{"nodeName":"Project","simpleString":"Project [n_pages#545L, Language#279, Charset#311, _w1#557L, _w2#558L, total_pages#546L]","children":[{"nodeName":"Window","simpleString":"Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 1","children":[{"nodeName":"Exchange","simpleString":"Exchange SinglePartition, true, [id=#204]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Charset#311, Language#279], functions=[count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"CustomShuffleReader","simpleString":"CustomShuffleReader coalesced","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 0","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Charset#311, Language#279, 1000), true, [id=#126]","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Charset#311, Language#279], functions=[partial_count(1)])","children":[{"nodeName":"Project","simpleString":"Project [content_charset#26 AS Charset#311, content_languages#27 AS Language#279]","children":[{"nodeName":"Filter","simpleString":"Filter (((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(content_languages#27)) AND isnotnull(content_charset#26))","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [content_charset#26, content_languages#27, subset#33], [isnotnull(subset#33), (subset#33 = warc), isnotnull(content_languages#27), isnotnull(content_charset#26)]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":109,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":166,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":163,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":164,"metricType":"timing"},{"name":"peak memory","accumulatorId":162,"metricType":"size"},{"name":"number of output rows","accumulatorId":161,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":165,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":160,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":131,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":132,"metricType":"nsTiming"},{"name":"records read","accumulatorId":129,"metricType":"sum"},{"name":"local bytes read","accumulatorId":127,"metricType":"size"},{"name":"fetch wait time","accumulatorId":128,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":125,"metricType":"size"},{"name":"local blocks read","accumulatorId":124,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":123,"metricType":"sum"},{"name":"data size","accumulatorId":122,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":126,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":130,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":283,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":284,"metricType":"timing"},{"name":"peak memory","accumulatorId":282,"metricType":"size"},{"name":"number of output rows","accumulatorId":281,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":285,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":280,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":267,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":268,"metricType":"nsTiming"},{"name":"records read","accumulatorId":265,"metricType":"sum"},{"name":"local bytes read","accumulatorId":263,"metricType":"size"},{"name":"fetch wait time","accumulatorId":264,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":261,"metricType":"size"},{"name":"local blocks read","accumulatorId":260,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":259,"metricType":"sum"},{"name":"data size","accumulatorId":258,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":262,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":266,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":277,"metricType":"timing"},{"name":"peak memory","accumulatorId":278,"metricType":"size"},{"name":"spill size","accumulatorId":279,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":276,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":273,"metricType":"timing"},{"name":"peak memory","accumulatorId":274,"metricType":"size"},{"name":"spill size","accumulatorId":275,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":272,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":269,"metricType":"timing"},{"name":"peak memory","accumulatorId":270,"metricType":"size"},{"name":"spill size","accumulatorId":271,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":83,"metricType":"sum"},{"name":"written output","accumulatorId":84,"metricType":"size"},{"name":"number of output rows","accumulatorId":85,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":86,"metricType":"sum"}]}} -{"Event":"SparkListenerJobStart","Job ID":2,"Submission Time":1678162988703,"Stage Infos":[{"Stage ID":2,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":10,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"7\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[9],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":9,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"8\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[8],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":8,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[7],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":7,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]},{"Stage ID":3,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":13,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"22\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[12],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":11,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"26\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[10],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":12,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"23\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[11],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[2],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]}],"Stage IDs":[2,3],"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"22\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"5","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":3,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":13,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"22\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[12],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":11,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"26\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[10],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":12,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"23\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[11],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[2],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162988710,"Accumulables":[]},"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"22\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"5","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerTaskStart","Stage ID":3,"Stage Attempt ID":0,"Task Info":{"Task ID":34,"Index":0,"Attempt":0,"Launch Time":1678162988725,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":3,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":34,"Index":0,"Attempt":0,"Launch Time":1678162988725,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162988856,"Failed":false,"Killed":false,"Accumulables":[{"ID":280,"Name":"duration","Update":"5","Value":"5","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":282,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":284,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":303,"Name":"internal.metrics.shuffle.read.recordsRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":302,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":301,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":300,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":299,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":298,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":297,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":295,"Name":"internal.metrics.peakExecutionMemory","Update":262144,"Value":262144,"Internal":true,"Count Failed Values":true},{"ID":290,"Name":"internal.metrics.resultSize","Update":4175,"Value":4175,"Internal":true,"Count Failed Values":true},{"ID":289,"Name":"internal.metrics.executorCpuTime","Update":54273023,"Value":54273023,"Internal":true,"Count Failed Values":true},{"ID":288,"Name":"internal.metrics.executorRunTime","Update":85,"Value":85,"Internal":true,"Count Failed Values":true},{"ID":287,"Name":"internal.metrics.executorDeserializeCpuTime","Update":26396779,"Value":26396779,"Internal":true,"Count Failed Values":true},{"ID":286,"Name":"internal.metrics.executorDeserializeTime","Update":35,"Value":35,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":35,"Executor Deserialize CPU Time":26396779,"Executor Run Time":85,"Executor CPU Time":54273023,"Peak Execution Memory":262144,"Result Size":4175,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":3,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":13,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"22\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[12],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":11,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"26\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[10],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":12,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"23\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[11],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[2],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162988710,"Completion Time":1678162988857,"Accumulables":[{"ID":286,"Name":"internal.metrics.executorDeserializeTime","Value":35,"Internal":true,"Count Failed Values":true},{"ID":295,"Name":"internal.metrics.peakExecutionMemory","Value":262144,"Internal":true,"Count Failed Values":true},{"ID":298,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":280,"Name":"duration","Value":"5","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":289,"Name":"internal.metrics.executorCpuTime","Value":54273023,"Internal":true,"Count Failed Values":true},{"ID":301,"Name":"internal.metrics.shuffle.read.localBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":300,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Value":0,"Internal":true,"Count Failed Values":true},{"ID":303,"Name":"internal.metrics.shuffle.read.recordsRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":288,"Name":"internal.metrics.executorRunTime","Value":85,"Internal":true,"Count Failed Values":true},{"ID":297,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":282,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":299,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":290,"Name":"internal.metrics.resultSize","Value":4175,"Internal":true,"Count Failed Values":true},{"ID":284,"Name":"time in aggregation build","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":302,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Value":0,"Internal":true,"Count Failed Values":true},{"ID":287,"Name":"internal.metrics.executorDeserializeCpuTime","Value":26396779,"Internal":true,"Count Failed Values":true}]}} -{"Event":"SparkListenerJobEnd","Job ID":2,"Completion Time":1678162988858,"Job Result":{"Result":"JobSucceeded"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate","executionId":5,"physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, loglikelihood#565, prob_cs_given_lang#566, prob_lang_given_cs#567]\n +- Filter ((n_pages#545L >= cast(100 as bigint)) AND NOT Language#279 LIKE %,%)\n +- SubqueryAlias language_charset_loglikelihood\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (cast(2 as double) * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(cast(total_pages#546L as double) as double)))) + if ((n_pages_charset#547L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if ((n_pages_languages#548L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) cast(0 as double) else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / cast(((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)) as double))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(cast(n_pages_languages#548L as double) as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(cast(n_pages_charset#547L as double) as double)) AS prob_lang_given_cs#567]\n +- SubqueryAlias language_count_tmp\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, loglikelihood#565, prob_cs_given_lang#566, prob_lang_given_cs#567]\n +- Filter ((n_pages#545L >= cast(100 as bigint)) AND NOT Language#279 LIKE %,%)\n +- SubqueryAlias language_charset_loglikelihood\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (cast(2 as double) * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(cast(total_pages#546L as double) as double)))) + if ((n_pages_charset#547L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if ((n_pages_languages#548L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) cast(0 as double) else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / cast(((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)) as double))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(cast(n_pages_languages#548L as double) as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(cast(n_pages_charset#547L as double) as double)) AS prob_lang_given_cs#567]\n +- SubqueryAlias language_count_tmp\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (2.0 * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(total_pages#546L as double)))) + if ((n_pages_charset#547L = n_pages#545L)) 0.0 else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(total_pages#546L as double))))) + if ((n_pages_languages#548L = n_pages#545L)) 0.0 else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(total_pages#546L as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) 0.0 else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / ((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(n_pages_languages#548L as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(n_pages_charset#547L as double)) AS prob_lang_given_cs#567]\n +- Filter (n_pages#545L >= 100)\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Project [n_pages#545L, Language#279, Charset#311, _w2#558L, total_pages#546L, n_pages_charset#547L]\n +- Filter NOT Contains(Language#279, ,)\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Project [n_pages#545L, Language#279, Charset#311, _w1#557L, _w2#558L, total_pages#546L]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Project [content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- Filter (((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(content_languages#27)) AND isnotnull(content_charset#26))\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- AdaptiveSparkPlan isFinalPlan=true\n +- *(5) Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true, 0\n +- *(5) Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (2.0 * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(total_pages#546L as double)))) + if ((n_pages_charset#547L = n_pages#545L)) 0.0 else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(total_pages#546L as double))))) + if ((n_pages_languages#548L = n_pages#545L)) 0.0 else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(total_pages#546L as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) 0.0 else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / ((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(n_pages_languages#548L as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(n_pages_charset#547L as double)) AS prob_lang_given_cs#567]\n +- *(5) Filter (n_pages#545L >= 100)\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- *(4) Sort [Language#279 ASC NULLS FIRST], false, 0\n +- *(4) Project [n_pages#545L, Language#279, Charset#311, _w2#558L, total_pages#546L, n_pages_charset#547L]\n +- *(4) Filter NOT Contains(Language#279, ,)\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- *(3) Sort [Charset#311 ASC NULLS FIRST], false, 0\n +- *(3) Project [n_pages#545L, Language#279, Charset#311, _w1#557L, _w2#558L, total_pages#546L]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- ShuffleQueryStage 1\n +- Exchange SinglePartition, true, [id=#204]\n +- *(2) HashAggregate(keys=[Charset#311, Language#279], functions=[count(1)], output=[n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L])\n +- CustomShuffleReader coalesced\n +- ShuffleQueryStage 0\n +- Exchange hashpartitioning(Charset#311, Language#279, 1000), true, [id=#126]\n +- *(1) HashAggregate(keys=[Charset#311, Language#279], functions=[partial_count(1)], output=[Charset#311, Language#279, count#1053L])\n +- *(1) Project [content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- *(1) Filter (((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(content_languages#27)) AND isnotnull(content_charset#26))\n +- InMemoryTableScan [content_charset#26, content_languages#27, subset#33], [isnotnull(subset#33), (subset#33 = warc), isnotnull(content_languages#27), isnotnull(content_charset#26)]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=true","children":[{"nodeName":"WholeStageCodegen (5)","simpleString":"WholeStageCodegen (5)","children":[{"nodeName":"Sort","simpleString":"Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true, 0","children":[{"nodeName":"Project","simpleString":"Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (2.0 * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(total_pages#546L as double)))) + if ((n_pages_charset#547L = n_pages#545L)) 0.0 else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(total_pages#546L as double))))) + if ((n_pages_languages#548L = n_pages#545L)) 0.0 else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(total_pages#546L as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) 0.0 else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / ((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(n_pages_languages#548L as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(n_pages_charset#547L as double)) AS prob_lang_given_cs#567]","children":[{"nodeName":"Filter","simpleString":"Filter (n_pages#545L >= 100)","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Window","simpleString":"Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]","children":[{"nodeName":"WholeStageCodegen (4)","simpleString":"WholeStageCodegen (4)","children":[{"nodeName":"Sort","simpleString":"Sort [Language#279 ASC NULLS FIRST], false, 0","children":[{"nodeName":"Project","simpleString":"Project [n_pages#545L, Language#279, Charset#311, _w2#558L, total_pages#546L, n_pages_charset#547L]","children":[{"nodeName":"Filter","simpleString":"Filter NOT Contains(Language#279, ,)","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Window","simpleString":"Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]","children":[{"nodeName":"WholeStageCodegen (3)","simpleString":"WholeStageCodegen (3)","children":[{"nodeName":"Sort","simpleString":"Sort [Charset#311 ASC NULLS FIRST], false, 0","children":[{"nodeName":"Project","simpleString":"Project [n_pages#545L, Language#279, Charset#311, _w1#557L, _w2#558L, total_pages#546L]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Window","simpleString":"Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 1","children":[{"nodeName":"Exchange","simpleString":"Exchange SinglePartition, true, [id=#204]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Charset#311, Language#279], functions=[count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"CustomShuffleReader","simpleString":"CustomShuffleReader coalesced","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 0","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Charset#311, Language#279, 1000), true, [id=#126]","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Charset#311, Language#279], functions=[partial_count(1)])","children":[{"nodeName":"Project","simpleString":"Project [content_charset#26 AS Charset#311, content_languages#27 AS Language#279]","children":[{"nodeName":"Filter","simpleString":"Filter (((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(content_languages#27)) AND isnotnull(content_charset#26))","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [content_charset#26, content_languages#27, subset#33], [isnotnull(subset#33), (subset#33 = warc), isnotnull(content_languages#27), isnotnull(content_charset#26)]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":109,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":166,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":163,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":164,"metricType":"timing"},{"name":"peak memory","accumulatorId":162,"metricType":"size"},{"name":"number of output rows","accumulatorId":161,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":165,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":160,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":131,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":132,"metricType":"nsTiming"},{"name":"records read","accumulatorId":129,"metricType":"sum"},{"name":"local bytes read","accumulatorId":127,"metricType":"size"},{"name":"fetch wait time","accumulatorId":128,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":125,"metricType":"size"},{"name":"local blocks read","accumulatorId":124,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":123,"metricType":"sum"},{"name":"data size","accumulatorId":122,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":126,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":130,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":283,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":284,"metricType":"timing"},{"name":"peak memory","accumulatorId":282,"metricType":"size"},{"name":"number of output rows","accumulatorId":281,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":285,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":280,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":267,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":268,"metricType":"nsTiming"},{"name":"records read","accumulatorId":265,"metricType":"sum"},{"name":"local bytes read","accumulatorId":263,"metricType":"size"},{"name":"fetch wait time","accumulatorId":264,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":261,"metricType":"size"},{"name":"local blocks read","accumulatorId":260,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":259,"metricType":"sum"},{"name":"data size","accumulatorId":258,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":262,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":266,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":322,"metricType":"timing"},{"name":"peak memory","accumulatorId":323,"metricType":"size"},{"name":"spill size","accumulatorId":324,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":321,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":320,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":317,"metricType":"timing"},{"name":"peak memory","accumulatorId":318,"metricType":"size"},{"name":"spill size","accumulatorId":319,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":316,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":315,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":312,"metricType":"timing"},{"name":"peak memory","accumulatorId":313,"metricType":"size"},{"name":"spill size","accumulatorId":314,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":311,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":83,"metricType":"sum"},{"name":"written output","accumulatorId":84,"metricType":"size"},{"name":"number of output rows","accumulatorId":85,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":86,"metricType":"sum"}]}} -{"Event":"SparkListenerJobStart","Job ID":3,"Submission Time":1678162989242,"Stage Infos":[{"Stage ID":5,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":13,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"22\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[12],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":11,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"26\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[10],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":12,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"23\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[11],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[4],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]},{"Stage ID":6,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":20,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"28\",\"name\":\"WholeStageCodegen (5)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[19],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":18,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"34\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[17],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":17,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"39\",\"name\":\"Window\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[16],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":14,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"46\",\"name\":\"Exchange\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[13],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":15,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"44\",\"name\":\"Window\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[14],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":16,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"40\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[15],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":19,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"33\",\"name\":\"Window\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[18],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[5],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]},{"Stage ID":4,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":10,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"7\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[9],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":9,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"8\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[8],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":8,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[7],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":7,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]}],"Stage IDs":[5,6,4],"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"5\",\"name\":\"Execute InsertIntoHadoopFsRelationCommand\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"5","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":6,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":20,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"28\",\"name\":\"WholeStageCodegen (5)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[19],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":18,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"34\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[17],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":17,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"39\",\"name\":\"Window\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[16],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":14,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"46\",\"name\":\"Exchange\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[13],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":15,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"44\",\"name\":\"Window\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[14],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":16,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"40\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[15],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":19,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"33\",\"name\":\"Window\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[18],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[5],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162989248,"Accumulables":[]},"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"5\",\"name\":\"Execute InsertIntoHadoopFsRelationCommand\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"5","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerTaskStart","Stage ID":6,"Stage Attempt ID":0,"Task Info":{"Task ID":35,"Index":0,"Attempt":0,"Launch Time":1678162989292,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":6,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":35,"Index":0,"Attempt":0,"Launch Time":1678162989292,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162989941,"Failed":false,"Killed":false,"Accumulables":[{"ID":311,"Name":"duration","Update":"233","Value":"233","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":314,"Name":"spill size","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":313,"Name":"peak memory","Update":"65536","Value":"65536","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":312,"Name":"sort time","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":316,"Name":"duration","Update":"31","Value":"31","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":319,"Name":"spill size","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":318,"Name":"peak memory","Update":"65536","Value":"65536","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":317,"Name":"sort time","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":321,"Name":"duration","Update":"54","Value":"54","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":324,"Name":"spill size","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":323,"Name":"peak memory","Update":"65536","Value":"65536","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":322,"Name":"sort time","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":348,"Name":"internal.metrics.output.bytesWritten","Update":111,"Value":111,"Internal":true,"Count Failed Values":true},{"ID":342,"Name":"internal.metrics.shuffle.read.recordsRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":341,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":340,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":339,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":338,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":337,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":336,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":334,"Name":"internal.metrics.peakExecutionMemory","Update":196608,"Value":196608,"Internal":true,"Count Failed Values":true},{"ID":331,"Name":"internal.metrics.resultSerializationTime","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":329,"Name":"internal.metrics.resultSize","Update":6419,"Value":6419,"Internal":true,"Count Failed Values":true},{"ID":328,"Name":"internal.metrics.executorCpuTime","Update":430425007,"Value":430425007,"Internal":true,"Count Failed Values":true},{"ID":327,"Name":"internal.metrics.executorRunTime","Update":539,"Value":539,"Internal":true,"Count Failed Values":true},{"ID":326,"Name":"internal.metrics.executorDeserializeCpuTime","Update":86902724,"Value":86902724,"Internal":true,"Count Failed Values":true},{"ID":325,"Name":"internal.metrics.executorDeserializeTime","Update":96,"Value":96,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":866619456,"JVMOffHeapMemory":128322616,"OnHeapExecutionMemory":65536,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":982692,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":1048228,"OffHeapUnifiedMemory":0,"DirectPoolMemory":9371220,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7132655616,"ProcessTreeJVMRSSMemory":1554882560,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":11,"MinorGCTime":163,"MajorGCCount":3,"MajorGCTime":281},"Task Metrics":{"Executor Deserialize Time":96,"Executor Deserialize CPU Time":86902724,"Executor Run Time":539,"Executor CPU Time":430425007,"Peak Execution Memory":196608,"Result Size":6419,"JVM GC Time":0,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":111,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":6,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":20,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"28\",\"name\":\"WholeStageCodegen (5)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[19],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":18,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"34\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[17],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":17,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"39\",\"name\":\"Window\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[16],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":14,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"46\",\"name\":\"Exchange\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[13],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":15,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"44\",\"name\":\"Window\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[14],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":16,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"40\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[15],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":19,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"33\",\"name\":\"Window\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[18],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[5],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162989248,"Completion Time":1678162989942,"Accumulables":[{"ID":340,"Name":"internal.metrics.shuffle.read.localBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":331,"Name":"internal.metrics.resultSerializationTime","Value":1,"Internal":true,"Count Failed Values":true},{"ID":322,"Name":"sort time","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":313,"Name":"peak memory","Value":"65536","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":334,"Name":"internal.metrics.peakExecutionMemory","Value":196608,"Internal":true,"Count Failed Values":true},{"ID":325,"Name":"internal.metrics.executorDeserializeTime","Value":96,"Internal":true,"Count Failed Values":true},{"ID":316,"Name":"duration","Value":"31","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":319,"Name":"spill size","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":337,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":328,"Name":"internal.metrics.executorCpuTime","Value":430425007,"Internal":true,"Count Failed Values":true},{"ID":336,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":327,"Name":"internal.metrics.executorRunTime","Value":539,"Internal":true,"Count Failed Values":true},{"ID":318,"Name":"peak memory","Value":"65536","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":348,"Name":"internal.metrics.output.bytesWritten","Value":111,"Internal":true,"Count Failed Values":true},{"ID":312,"Name":"sort time","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":339,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Value":0,"Internal":true,"Count Failed Values":true},{"ID":321,"Name":"duration","Value":"54","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":342,"Name":"internal.metrics.shuffle.read.recordsRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":324,"Name":"spill size","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":341,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Value":0,"Internal":true,"Count Failed Values":true},{"ID":323,"Name":"peak memory","Value":"65536","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":317,"Name":"sort time","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":326,"Name":"internal.metrics.executorDeserializeCpuTime","Value":86902724,"Internal":true,"Count Failed Values":true},{"ID":338,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":329,"Name":"internal.metrics.resultSize","Value":6419,"Internal":true,"Count Failed Values":true},{"ID":311,"Name":"duration","Value":"233","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":314,"Name":"spill size","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"}]}} -{"Event":"SparkListenerJobEnd","Job ID":3,"Completion Time":1678162989942,"Job Result":{"Result":"JobSucceeded"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerDriverAccumUpdates","executionId":5,"accumUpdates":[[83,1],[84,111],[85,0],[86,0]]} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerQueryExecutionMetrics","executionId":5,"timePerRule":{"PruneFileSourcePartitions":2144830,"ReassignLambdaVariableID":724707,"PushPredicateThroughNonJoin":608446,"Analyzer$HandleNullInputsForUDF":26349,"Analyzer$ResolveSubqueryColumnAliases":16113,"ResolveTimeZone":21614,"Analyzer$ResolveNamespace":20360,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":23141,"RewriteCorrelatedScalarSubquery":3514511,"RemoveLiteralFromGroupExpressions":1402327,"PushProjectionThroughUnion":1631262,"EliminateSubqueryAliases":4206194,"ResolveCatalogs":23527,"PushLeftSemiLeftAntiThroughJoin":1489244,"FlattenScalarSubqueriesWithAggregates":3863366,"LikeSimplification":5425317,"CollapseRepartition":2465750,"ResolveHints$ResolveCoalesceHints":20038,"Analyzer$ExtractGenerator":38890,"RewriteIntersectAll":627146,"ResolveHints$ResolveJoinStrategyHints":21154,"TypeCoercion$MapZipWithCoercion":26613,"NullPropagation":15733614,"PullupCorrelatedPredicates":1289379,"UpdateOuterReferences":23104,"ExtractPythonUDFs":1115243,"Analyzer$WindowsSubstitution":30716,"CombineUnions":1999965,"ExtractGroupingPythonUDFFromAggregate":1582878,"ReorderAssociativeOperator":7933245,"CleanupDynamicPruningFilters":2408477,"ResolveHints$RemoveAllHints":20406,"SimplifyBinaryComparison":3678265,"ResolveTableValuedFunctions":24373,"EliminateSerialization":1180344,"TypeCoercion$BooleanEquality":79174,"package$ExpressionCanonicalizer$CleanExpressions":825749,"ReplaceIntersectWithSemiJoin":1174735,"ConstantPropagation":2691863,"CostBasedJoinReorder":23176,"Analyzer$ResolveReferences":74041,"CTESubstitution":1150505,"RemoveRedundantAliases":12059838,"TypeCoercion$ImplicitTypeCasts":26112,"RewriteExceptAll":636663,"UpdateAttributeNullability":134291,"PropagateEmptyRelation":12980320,"SimplifyCasts":6493467,"EliminateMapObjects":1129024,"CombineLimits":1391153,"DetectAmbiguousSelfJoin":1716998,"ReplaceExpressions":1371904,"ResolveInlineTables":38964,"OptimizeIn":2596173,"CollapseWindow":1367762,"TypeCoercion$IfCoercion":20966,"ResolveSessionCatalog":26402,"PartitionPruning":405147,"BooleanSimplification":7034649,"TypeCoercion$PromoteStrings":26861,"Analyzer$ResolveAliases":16085,"DecimalAggregates":1101506,"PruneFilters":36343716,"Analyzer$ResolveMissingReferences":15777,"TransposeWindow":1432335,"Analyzer$ResolveRelations":39942,"EliminateUnions":27946,"RewritePredicateSubquery":1034141,"ObjectSerializerPruning":407524,"LimitPushDown":1897566,"SimplifyCaseConversionExpressions":3976856,"Analyzer$ResolveNaturalAndUsingJoin":31793,"EliminateView":771523,"CombineTypedFilters":375043,"OptimizeLimitZero":646858,"CheckCartesianProducts":55012,"ExtractPythonUDFFromAggregate":1555787,"Analyzer$ExtractWindowExpressions":31080,"ReplaceExceptWithAntiJoin":630195,"ResolveLambdaVariables":27528,"FallBackFileSourceV2":17384,"Analyzer$ResolveTables":1058791,"SubstituteUnresolvedOrdinals":24988,"TypeCoercion$CaseWhenCoercion":20801,"DecimalPrecision":33402,"EliminateSorts":5027734,"PushDownLeftSemiAntiJoin":1586581,"ExtractPythonUDFFromJoinCondition":388195,"TypeCoercion$StackCoercion":22547,"Analyzer$ResolveAggAliasInGroupBy":17989,"TypeCoercion$StringLiteralCoercion":21495,"FoldablePropagation":1998562,"V2ScanRelationPushDown":2386328,"EliminateDistinct":12922,"InferFiltersFromConstraints":2118772,"Analyzer$PullOutNondeterministic":20264,"Analyzer$ResolveFunctions":22188,"ReplaceNullWithFalseInPredicate":18150892,"ResolveHigherOrderFunctions":26349,"Analyzer$ResolvePivot":17494,"CollapseProject":35101256,"Analyzer$ResolveNewInstance":21753,"ColumnPruning":36814201,"Analyzer$ResolveWindowOrder":20704,"TypeCoercion$ConcatCoercion":26415,"PushDownPredicates":34192568,"TimeWindowing":377163,"Optimizer$OptimizeSubqueries":2518451,"RewriteNonCorrelatedExists":16023362,"DemoteBroadcastHashJoin":1583625,"TypeCoercion$Division":23352,"ComputeCurrentTime":2544030,"ResolveCreateNamedStruct":27423,"TypeCoercion$EltCoercion":24494,"ConvertToLocalRelation":1036580,"RemoveRepetitionFromGroupExpressions":901093,"ReplaceDistinctWithAggregate":613429,"PreprocessTableCreation":35768,"ResolveSQLOnFile":18463,"Analyzer$ResolveSubquery":16859,"CombineConcats":35171,"Analyzer$ResolveGroupingAnalytics":23341,"Analyzer$ResolveBinaryArithmetic":24084,"RemoveDispensableExpressions":3238421,"Analyzer$ResolveAlterTableChanges":23583,"ResolveEncodersInScalaAgg":25882,"TypeCoercion$IntegralDivision":21397,"Analyzer$ResolveWindowFrame":298934,"Analyzer$ResolveDeserializer":23038,"RewriteDistinctAggregates":2712033,"RemoveNoopOperators":21808704,"Analyzer$ResolveAggregateFunctions":20854,"NormalizeFloatingNumbers":983677,"ReorderJoin":1504201,"Analyzer$ResolveUpCast":21253,"Analyzer$ResolveGenerate":21750,"TypeCoercion$WidenSetOperationTypes":17883,"EliminateOuterJoin":1531845,"SimplifyExtractValueOps":3665075,"OptimizeMetadataOnlyQuery":19828,"EliminateResolvedHint":2404985,"Analyzer$ResolveInsertInto":19118,"ReplaceExceptWithFilter":701989,"CleanupAliases":30232,"GetCurrentDatabase":2846042,"SchemaPruning":4676256,"Analyzer$ResolveOutputRelation":18745,"BloomFilterJoinRule":414505,"Analyzer$ResolveRandomSeed":17909,"TypeCoercion$WindowFrameCoercion":21570,"ConstantFolding":8217907,"TypeCoercion$DateTimeOperations":22064,"TypeCoercion$InConversion":23842,"FindDataSourceTable":23797,"SimplifyConditionals":5962134,"DataSourceAnalysis":19857,"TypeCoercion$FunctionArgumentConversion":29252,"Analyzer$GlobalAggregates":19799,"Analyzer$LookupFunctions":50673,"CombineFilters":1703830,"ReplaceDeduplicateWithAggregate":664176,"PreprocessTableInsertion":21103},"numRunsPerRule":{"PruneFileSourcePartitions":1,"ReassignLambdaVariableID":1,"PushPredicateThroughNonJoin":1,"Analyzer$HandleNullInputsForUDF":1,"Analyzer$ResolveSubqueryColumnAliases":1,"ResolveTimeZone":1,"Analyzer$ResolveNamespace":1,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":1,"RewriteCorrelatedScalarSubquery":3,"RemoveLiteralFromGroupExpressions":1,"PushProjectionThroughUnion":3,"EliminateSubqueryAliases":1,"ResolveCatalogs":1,"PushLeftSemiLeftAntiThroughJoin":3,"FlattenScalarSubqueriesWithAggregates":1,"LikeSimplification":3,"CollapseRepartition":3,"ResolveHints$ResolveCoalesceHints":1,"Analyzer$ExtractGenerator":1,"RewriteIntersectAll":1,"ResolveHints$ResolveJoinStrategyHints":1,"TypeCoercion$MapZipWithCoercion":1,"NullPropagation":3,"PullupCorrelatedPredicates":1,"UpdateOuterReferences":1,"ExtractPythonUDFs":1,"Analyzer$WindowsSubstitution":1,"CombineUnions":4,"ExtractGroupingPythonUDFFromAggregate":1,"ReorderAssociativeOperator":3,"CleanupDynamicPruningFilters":1,"ResolveHints$RemoveAllHints":1,"SimplifyBinaryComparison":3,"ResolveTableValuedFunctions":1,"EliminateSerialization":3,"TypeCoercion$BooleanEquality":1,"package$ExpressionCanonicalizer$CleanExpressions":8,"ReplaceIntersectWithSemiJoin":1,"ConstantPropagation":3,"CostBasedJoinReorder":1,"Analyzer$ResolveReferences":1,"CTESubstitution":1,"RemoveRedundantAliases":3,"TypeCoercion$ImplicitTypeCasts":1,"RewriteExceptAll":1,"UpdateAttributeNullability":1,"PropagateEmptyRelation":2,"SimplifyCasts":3,"EliminateMapObjects":1,"CombineLimits":3,"DetectAmbiguousSelfJoin":1,"ReplaceExpressions":1,"ResolveInlineTables":1,"OptimizeIn":3,"CollapseWindow":3,"TypeCoercion$IfCoercion":1,"ResolveSessionCatalog":1,"PartitionPruning":1,"BooleanSimplification":3,"TypeCoercion$PromoteStrings":1,"Analyzer$ResolveAliases":1,"DecimalAggregates":1,"PruneFilters":4,"Analyzer$ResolveMissingReferences":1,"TransposeWindow":3,"Analyzer$ResolveRelations":1,"EliminateUnions":1,"RewritePredicateSubquery":1,"ObjectSerializerPruning":1,"LimitPushDown":3,"SimplifyCaseConversionExpressions":3,"Analyzer$ResolveNaturalAndUsingJoin":1,"EliminateView":1,"CombineTypedFilters":1,"OptimizeLimitZero":1,"CheckCartesianProducts":2,"ExtractPythonUDFFromAggregate":1,"Analyzer$ExtractWindowExpressions":1,"ReplaceExceptWithAntiJoin":1,"ResolveLambdaVariables":1,"FallBackFileSourceV2":1,"Analyzer$ResolveTables":1,"SubstituteUnresolvedOrdinals":1,"TypeCoercion$CaseWhenCoercion":1,"DecimalPrecision":1,"EliminateSorts":1,"PushDownLeftSemiAntiJoin":3,"ExtractPythonUDFFromJoinCondition":1,"TypeCoercion$StackCoercion":1,"Analyzer$ResolveAggAliasInGroupBy":1,"TypeCoercion$StringLiteralCoercion":1,"FoldablePropagation":3,"V2ScanRelationPushDown":1,"EliminateDistinct":1,"InferFiltersFromConstraints":1,"Analyzer$PullOutNondeterministic":1,"Analyzer$ResolveFunctions":1,"ReplaceNullWithFalseInPredicate":3,"ResolveHigherOrderFunctions":1,"Analyzer$ResolvePivot":1,"CollapseProject":4,"Analyzer$ResolveNewInstance":1,"ColumnPruning":5,"Analyzer$ResolveWindowOrder":1,"TypeCoercion$ConcatCoercion":1,"PushDownPredicates":5,"TimeWindowing":1,"Optimizer$OptimizeSubqueries":3,"RewriteNonCorrelatedExists":1,"DemoteBroadcastHashJoin":2,"TypeCoercion$Division":1,"ComputeCurrentTime":1,"ResolveCreateNamedStruct":1,"TypeCoercion$EltCoercion":1,"ConvertToLocalRelation":2,"RemoveRepetitionFromGroupExpressions":1,"ReplaceDistinctWithAggregate":1,"PreprocessTableCreation":1,"ResolveSQLOnFile":1,"Analyzer$ResolveSubquery":1,"CombineConcats":3,"Analyzer$ResolveGroupingAnalytics":1,"Analyzer$ResolveBinaryArithmetic":1,"RemoveDispensableExpressions":3,"Analyzer$ResolveAlterTableChanges":1,"ResolveEncodersInScalaAgg":1,"TypeCoercion$IntegralDivision":1,"Analyzer$ResolveWindowFrame":1,"Analyzer$ResolveDeserializer":1,"RewriteDistinctAggregates":1,"RemoveNoopOperators":5,"Analyzer$ResolveAggregateFunctions":1,"NormalizeFloatingNumbers":1,"ReorderJoin":3,"Analyzer$ResolveUpCast":1,"Analyzer$ResolveGenerate":1,"TypeCoercion$WidenSetOperationTypes":1,"EliminateOuterJoin":3,"SimplifyExtractValueOps":3,"OptimizeMetadataOnlyQuery":1,"EliminateResolvedHint":1,"Analyzer$ResolveInsertInto":1,"ReplaceExceptWithFilter":1,"CleanupAliases":1,"GetCurrentDatabase":1,"SchemaPruning":1,"Analyzer$ResolveOutputRelation":1,"BloomFilterJoinRule":1,"Analyzer$ResolveRandomSeed":1,"TypeCoercion$WindowFrameCoercion":1,"ConstantFolding":3,"TypeCoercion$DateTimeOperations":1,"TypeCoercion$InConversion":1,"FindDataSourceTable":1,"SimplifyConditionals":3,"DataSourceAnalysis":1,"TypeCoercion$FunctionArgumentConversion":1,"Analyzer$GlobalAggregates":1,"Analyzer$LookupFunctions":1,"CombineFilters":4,"ReplaceDeduplicateWithAggregate":1,"PreprocessTableInsertion":1},"numEffectiveRunsPerRule":{"EliminateSubqueryAliases":1,"LikeSimplification":1,"SimplifyCasts":1,"RewritePredicateSubquery":1,"InferFiltersFromConstraints":1,"CollapseProject":1,"ColumnPruning":1,"PushDownPredicates":1,"ConstantFolding":1},"timeEffectiveRunsPerRule":{"EliminateSubqueryAliases":4206194,"LikeSimplification":3132604,"SimplifyCasts":5097741,"RewritePredicateSubquery":1034141,"InferFiltersFromConstraints":2118772,"CollapseProject":33418665,"ColumnPruning":18210290,"PushDownPredicates":31603282,"ConstantFolding":5681845},"counters":{},"timers":{}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":5,"time":1678162990095} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":6,"description":"save at NativeMethodAccessorImpl.java:0","details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]\n+- Aggregate [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count(1) AS count#537L]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]\n+- Aggregate [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count(1) AS count#537L]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]\n+- Aggregate [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count(1) AS count#537L]\n +- Project [url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13 AS Site#247, content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]\n+- AdaptiveSparkPlan isFinalPlan=false\n +- HashAggregate(keys=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], functions=[count(1)], output=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count#537L])\n +- Exchange hashpartitioning(Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, 1000), true, [id=#287]\n +- HashAggregate(keys=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], functions=[partial_count(1)], output=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count#1662L])\n +- Project [url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13 AS Site#247, content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- InMemoryTableScan [content_charset#26, content_languages#27, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13, url_host_tld#7]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=false","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], functions=[count(1)])","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, 1000), true, [id=#287]","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], functions=[partial_count(1)])","children":[{"nodeName":"Project","simpleString":"Project [url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13 AS Site#247, content_charset#26 AS Charset#311, content_languages#27 AS Language#279]","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [content_charset#26, content_languages#27, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13, url_host_tld#7]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":397,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":394,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":395,"metricType":"timing"},{"name":"peak memory","accumulatorId":393,"metricType":"size"},{"name":"number of output rows","accumulatorId":392,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":396,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":359,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":360,"metricType":"nsTiming"},{"name":"records read","accumulatorId":357,"metricType":"sum"},{"name":"local bytes read","accumulatorId":355,"metricType":"size"},{"name":"fetch wait time","accumulatorId":356,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":353,"metricType":"size"},{"name":"local blocks read","accumulatorId":352,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":351,"metricType":"sum"},{"name":"data size","accumulatorId":350,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":354,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":358,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":389,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":390,"metricType":"timing"},{"name":"peak memory","accumulatorId":388,"metricType":"size"},{"name":"number of output rows","accumulatorId":387,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":391,"metricType":"average"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":383,"metricType":"sum"},{"name":"written output","accumulatorId":384,"metricType":"size"},{"name":"number of output rows","accumulatorId":385,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":386,"metricType":"sum"}]},"time":1678162990226} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerEffectiveSQLConf","executionId":6,"effectiveSQLConf":{"spark.sql.adaptive.coalescePartitions.initialPartitionNum":"","spark.sql.streaming.disabledV2Writers":"","spark.sql.streaming.noDataMicroBatches.enabled":"true","spark.sql.files.maxPartitionBytes":"128MB","spark.sql.thriftserver.ui.retainedStatements":"200","spark.sql.ui.retainedExecutions":"1000","spark.sql.execution.arrow.pyspark.fallback.enabled":"","spark.sql.sources.parallelPartitionDiscovery.threshold":"32","spark.sql.cbo.joinReorder.enabled":"false","spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.int96AsTimestamp":"true","spark.sql.thriftserver.ui.retainedSessions":"200","spark.sql.hive.verifyPartitionPath":"false","spark.sql.autoBroadcastJoinThreshold":"10MB","spark.sql.orc.mergeSchema":"false","spark.sql.adaptive.enabled":"true","spark.sql.adaptive.skewJoin.enabled":"true","spark.sql.optimizer.sizeBasedJoinReorder.enabled":"true","spark.sql.streaming.ui.retainedQueries":"100","spark.sql.orc.enableVectorizedReader":"true","spark.sql.streaming.continuous.epochBacklogQueueSize":"10000","spark.sql.parquet.binaryAsString":"false","spark.sql.pivotMaxValues":"10000","spark.sql.adaptive.localShuffleReader.enabled":"true","spark.sql.storeAssignmentPolicy":"ANSI","spark.sql.redaction.options.regex":"(?i)url","spark.sql.cbo.joinReorder.dp.star.filter":"false","spark.sql.parser.quotedRegexColumnNames":"false","spark.sql.files.ignoreMissingFiles":"false","spark.sql.streaming.ui.enabled":"true","spark.sql.execution.arrow.sparkr.enabled":"false","spark.sql.orc.filterPushdown":"true","spark.sql.thriftserver.scheduler.pool":"","spark.sql.catalog.spark_catalog":"","spark.sql.datetime.java8API.enabled":"false","spark.sql.execution.pandas.udf.buffer.size":"","spark.sql.function.eltOutputAsString":"false","spark.sql.cbo.joinReorder.dp.threshold":"12","spark.sql.statsImprovements.enabled":"true","spark.sql.parquet.columnarReaderBatchSize":"4096","spark.sql.parquet.outputTimestampType":"INT96","spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes":"256MB","spark.sql.statistics.size.autoUpdate.enabled":"false","spark.sql.hive.filesourcePartitionFileCacheSize":"262144000","spark.sql.streaming.ui.retainedProgressUpdates":"100","spark.sql.inMemoryColumnarStorage.compressed":"true","spark.sql.pyspark.jvmStacktrace.enabled":"false","spark.sql.cbo.starSchemaDetection":"false","spark.sql.columnNameOfCorruptRecord":"_corrupt_record","spark.sql.adaptive.advisoryPartitionSizeInBytes":"","spark.sql.avro.compression.codec":"snappy","spark.sql.streaming.metricsEnabled":"false","spark.sql.ui.pruneCachedInMemoryRelation":"true","spark.sql.event.truncate.length":"2147483647","spark.sql.groupByAliases":"true","spark.sql.hive.metastorePartitionPruning":"true","spark.sql.execution.arrow.enabled":"false","spark.sql.optimizer.dynamicPartitionPruning.enabled":"true","spark.sql.extendedEventInfo":"true","spark.sql.sources.bucketing.maxBuckets":"100000","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.jsonGenerator.ignoreNullFields":"true","spark.sql.optimizer.flattenScalarSubqueriesWithAggregates.enabled":"true","spark.sql.debug.maxToStringFields":"25","spark.sql.files.maxRecordsPerFile":"0","spark.sql.hive.thriftServer.singleSession":"false","spark.sql.parquet.mergeSchema":"false","spark.sql.adaptive.skewJoin.skewedPartitionFactor":"5","spark.sql.queryExecutionListeners":"","spark.sql.streaming.numRecentProgressUpdates":"100","spark.sql.variable.substitute":"true","spark.sql.adaptive.shuffle.mapOutputStats.useDataSize":"true","spark.sql.function.concatBinaryAsString":"false","spark.sql.avro.deflate.level":"-1","spark.sql.repl.eagerEval.maxNumRows":"20","spark.sql.ansi.enabled":"false","spark.sql.legacy.allowHashOnMapType":"false","spark.sql.bloomFilterJoin.maxFilterBytes":"2097152","spark.sql.orderByOrdinal":"true","spark.sql.streaming.stopTimeout":"0","spark.sql.adaptive.coalescePartitions.enabled":"true","spark.sql.session.timeZone":"UTC","spark.sql.parquet.respectSummaryFiles":"false","spark.sql.parquet.enableVectorizedReader":"true","spark.sql.cbo.enabled":"false","spark.sql.sources.bucketing.enabled":"true","spark.sql.csv.filterPushdown.enabled":"true","spark.sql.repl.eagerEval.truncate":"20","spark.sql.cbo.planStats.enabled":"false","spark.sql.optimizer.excludedRules":"","spark.sql.sources.partitionColumnTypeInference.enabled":"true","spark.sql.sortMergeJoinExec.extendedCodegen.enabled":"true","spark.sql.execution.arrow.fallback.enabled":"true","spark.sql.streaming.forceDeleteTempCheckpointLocation":"false","spark.sql.maxPlanStringLength":"2147483632","spark.sql.hive.manageFilesourcePartitions":"true","spark.sql.parquet.int96TimestampConversion":"false","spark.sql.sources.partitionOverwriteMode":"STATIC","spark.sql.streaming.checkpointLocation":"","spark.sql.broadcastTimeout":"300","spark.sql.execution.arrow.pyspark.enabled":"","spark.sql.repl.eagerEval.enabled":"false","spark.sql.mapKeyDedupPolicy":"EXCEPTION","spark.sql.parquet.filterPushdown":"true","spark.sql.maven.additionalRemoteRepositories":"https://maven-central.storage-download.googleapis.com/maven2/","spark.sql.orc.compression.codec":"snappy","spark.sql.statistics.histogram.enabled":"false","spark.sql.streaming.stopActiveRunOnRestart":"true","spark.sql.orc.columnarReaderBatchSize":"4096","spark.sql.redaction.string.regex":"","spark.sql.inMemoryColumnarStorage.batchSize":"10000","spark.sql.parquet.writeLegacyFormat":"false","spark.sql.statistics.fallBackToHdfs":"false","spark.sql.defaultCatalog":"spark_catalog","spark.sql.streaming.fileSource.cleaner.numThreads":"1","spark.sql.legacy.sessionInitWithConfigDefaults":"false","spark.sql.adaptive.coalescePartitions.minPartitionNum":"","spark.sql.execution.arrow.maxRecordsPerBatch":"10000","spark.sql.stringHashComputationWithoutCopyMemory.enabled":"true","spark.sql.streaming.streamingQueryListeners":"","spark.sql.files.ignoreCorruptFiles":"false","spark.sql.inMemoryColumnarStorage.enableVectorizedReader":"true","spark.sql.extensions":"","spark.sql.sources.default":"parquet","spark.sql.ui.extendedInfo":"true","spark.sql.bloomFilterJoin.enabled":"true","spark.sql.parquet.compression.codec":"snappy","spark.sql.optimizer.distinctBeforeIntersect.enabled":"true","spark.sql.parquet.recordLevelFilter.enabled":"false","spark.sql.shuffle.partitions":"200","spark.sql.groupByOrdinal":"true"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate","executionId":6,"physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]\n+- Aggregate [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count(1) AS count#537L]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]\n+- Aggregate [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count(1) AS count#537L]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]\n+- Aggregate [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count(1) AS count#537L]\n +- Project [url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13 AS Site#247, content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]\n+- AdaptiveSparkPlan isFinalPlan=false\n +- HashAggregate(keys=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], functions=[count(1)], output=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count#537L])\n +- ShuffleQueryStage 0\n +- Exchange hashpartitioning(Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, 1000), true, [id=#302]\n +- *(1) HashAggregate(keys=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], functions=[partial_count(1)], output=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count#1662L])\n +- *(1) Project [url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13 AS Site#247, content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- InMemoryTableScan [content_charset#26, content_languages#27, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13, url_host_tld#7]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=false","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], functions=[count(1)])","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 0","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, 1000), true, [id=#302]","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], functions=[partial_count(1)])","children":[{"nodeName":"Project","simpleString":"Project [url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13 AS Site#247, content_charset#26 AS Charset#311, content_languages#27 AS Language#279]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [content_charset#26, content_languages#27, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13, url_host_tld#7]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":397,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":417,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":418,"metricType":"timing"},{"name":"peak memory","accumulatorId":416,"metricType":"size"},{"name":"number of output rows","accumulatorId":415,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":419,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":414,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":407,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":408,"metricType":"nsTiming"},{"name":"records read","accumulatorId":405,"metricType":"sum"},{"name":"local bytes read","accumulatorId":403,"metricType":"size"},{"name":"fetch wait time","accumulatorId":404,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":401,"metricType":"size"},{"name":"local blocks read","accumulatorId":400,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":399,"metricType":"sum"},{"name":"data size","accumulatorId":398,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":402,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":406,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":411,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":412,"metricType":"timing"},{"name":"peak memory","accumulatorId":410,"metricType":"size"},{"name":"number of output rows","accumulatorId":409,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":413,"metricType":"average"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":383,"metricType":"sum"},{"name":"written output","accumulatorId":384,"metricType":"size"},{"name":"number of output rows","accumulatorId":385,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":386,"metricType":"sum"}]}} -{"Event":"SparkListenerJobStart","Job ID":4,"Submission Time":1678162990571,"Stage Infos":[{"Stage ID":7,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":26,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"68\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[25],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":24,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"73\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[23],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":25,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"69\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[24],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":23,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"73\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]}],"Stage IDs":[7],"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"68\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"6","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":7,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":26,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"68\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[25],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":24,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"73\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[23],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":25,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"69\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[24],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":23,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"73\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162990576,"Accumulables":[]},"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"68\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"6","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":36,"Index":7,"Attempt":0,"Launch Time":1678162990601,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":37,"Index":2,"Attempt":0,"Launch Time":1678162990601,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":38,"Index":1,"Attempt":0,"Launch Time":1678162990601,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":39,"Index":4,"Attempt":0,"Launch Time":1678162990602,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":40,"Index":3,"Attempt":0,"Launch Time":1678162990602,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":41,"Index":6,"Attempt":0,"Launch Time":1678162990602,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":42,"Index":5,"Attempt":0,"Launch Time":1678162990602,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":43,"Index":0,"Attempt":0,"Launch Time":1678162990603,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":44,"Index":15,"Attempt":0,"Launch Time":1678162990603,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":45,"Index":10,"Attempt":0,"Launch Time":1678162990603,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":46,"Index":9,"Attempt":0,"Launch Time":1678162990603,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":47,"Index":12,"Attempt":0,"Launch Time":1678162990604,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":48,"Index":11,"Attempt":0,"Launch Time":1678162990604,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":49,"Index":14,"Attempt":0,"Launch Time":1678162990604,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":50,"Index":13,"Attempt":0,"Launch Time":1678162990605,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":51,"Index":8,"Attempt":0,"Launch Time":1678162990605,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":52,"Index":23,"Attempt":0,"Launch Time":1678162990605,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":53,"Index":18,"Attempt":0,"Launch Time":1678162990605,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":54,"Index":17,"Attempt":0,"Launch Time":1678162990606,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":55,"Index":20,"Attempt":0,"Launch Time":1678162990606,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":56,"Index":19,"Attempt":0,"Launch Time":1678162990606,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":57,"Index":22,"Attempt":0,"Launch Time":1678162990606,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":58,"Index":21,"Attempt":0,"Launch Time":1678162990607,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":59,"Index":16,"Attempt":0,"Launch Time":1678162990607,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":60,"Index":31,"Attempt":0,"Launch Time":1678162990607,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":61,"Index":26,"Attempt":0,"Launch Time":1678162990607,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":62,"Index":25,"Attempt":0,"Launch Time":1678162990613,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":63,"Index":28,"Attempt":0,"Launch Time":1678162990614,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":64,"Index":27,"Attempt":0,"Launch Time":1678162990614,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":65,"Index":30,"Attempt":0,"Launch Time":1678162990614,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":66,"Index":29,"Attempt":0,"Launch Time":1678162990615,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":67,"Index":24,"Attempt":0,"Launch Time":1678162990615,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":38,"Index":1,"Attempt":0,"Launch Time":1678162990601,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990854,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"64","Value":"64","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":46,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":294912,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":3704,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":37083937,"Value":37083937,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":179,"Value":179,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11832221,"Value":11832221,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":46,"Value":46,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":46,"Executor Deserialize CPU Time":11832221,"Executor Run Time":179,"Executor CPU Time":37083937,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":54,"Index":17,"Attempt":0,"Launch Time":1678162990606,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990854,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"54","Value":"118","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":92,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":589824,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":7408,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":71193481,"Value":108277418,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":192,"Value":371,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7985801,"Value":19818022,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":26,"Value":72,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":26,"Executor Deserialize CPU Time":7985801,"Executor Run Time":192,"Executor CPU Time":71193481,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":62,"Index":25,"Attempt":0,"Launch Time":1678162990613,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990857,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"65","Value":"183","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"786432","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":138,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":884736,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":11112,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":34768477,"Value":143045895,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":197,"Value":568,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6754024,"Value":26572046,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":23,"Value":95,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":23,"Executor Deserialize CPU Time":6754024,"Executor Run Time":197,"Executor CPU Time":34768477,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":46,"Index":9,"Attempt":0,"Launch Time":1678162990603,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990861,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"61","Value":"244","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"1048576","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":184,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1179648,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":14816,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":31039667,"Value":174085562,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":185,"Value":753,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7114043,"Value":33686089,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":42,"Value":137,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":42,"Executor Deserialize CPU Time":7114043,"Executor Run Time":185,"Executor CPU Time":31039667,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":37,"Index":2,"Attempt":0,"Launch Time":1678162990601,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990865,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"78","Value":"322","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"1310720","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":230,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1474560,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":18520,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":84151587,"Value":258237149,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":197,"Value":950,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7484568,"Value":41170657,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":39,"Value":176,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":39,"Executor Deserialize CPU Time":7484568,"Executor Run Time":197,"Executor CPU Time":84151587,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":45,"Index":10,"Attempt":0,"Launch Time":1678162990603,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990868,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"82","Value":"404","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"1572864","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":276,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1769472,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":22224,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":27144554,"Value":285381703,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":195,"Value":1145,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6907346,"Value":48078003,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":38,"Value":214,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":38,"Executor Deserialize CPU Time":6907346,"Executor Run Time":195,"Executor CPU Time":27144554,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":61,"Index":26,"Attempt":0,"Launch Time":1678162990607,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990871,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"88","Value":"492","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"1835008","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"4","Value":"4","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":322,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2064384,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":25928,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":70054206,"Value":355435909,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":204,"Value":1349,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7346786,"Value":55424789,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":38,"Value":252,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":38,"Executor Deserialize CPU Time":7346786,"Executor Run Time":204,"Executor CPU Time":70054206,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":53,"Index":18,"Attempt":0,"Launch Time":1678162990605,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990871,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"87","Value":"579","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"2097152","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"2","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":368,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2359296,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":29632,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":26955003,"Value":382390912,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":191,"Value":1540,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":12881464,"Value":68306253,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":49,"Value":301,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":49,"Executor Deserialize CPU Time":12881464,"Executor Run Time":191,"Executor CPU Time":26955003,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":44,"Index":15,"Attempt":0,"Launch Time":1678162990603,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990883,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"79","Value":"658","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"2359296","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":414,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2654208,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":33336,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":40553416,"Value":422944328,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":216,"Value":1756,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":17868080,"Value":86174333,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":43,"Value":344,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":43,"Executor Deserialize CPU Time":17868080,"Executor Run Time":216,"Executor CPU Time":40553416,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":60,"Index":31,"Attempt":0,"Launch Time":1678162990607,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990883,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"80","Value":"738","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"2621440","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":460,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2949120,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":37040,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":50559943,"Value":473504271,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":213,"Value":1969,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":13057335,"Value":99231668,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":43,"Value":387,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":43,"Executor Deserialize CPU Time":13057335,"Executor Run Time":213,"Executor CPU Time":50559943,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":36,"Index":7,"Attempt":0,"Launch Time":1678162990601,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990888,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"84","Value":"822","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"2883584","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":506,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3244032,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":40744,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":40418664,"Value":513922935,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":220,"Value":2189,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11833801,"Value":111065469,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":44,"Value":431,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":44,"Executor Deserialize CPU Time":11833801,"Executor Run Time":220,"Executor CPU Time":40418664,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":52,"Index":23,"Attempt":0,"Launch Time":1678162990605,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990894,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"90","Value":"912","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"3145728","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":552,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3538944,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":44448,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":92996721,"Value":606919656,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":227,"Value":2416,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11539577,"Value":122605046,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":42,"Value":473,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":42,"Executor Deserialize CPU Time":11539577,"Executor Run Time":227,"Executor CPU Time":92996721,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":68,"Index":32,"Attempt":0,"Launch Time":1678162990914,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":63,"Index":28,"Attempt":0,"Launch Time":1678162990614,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990915,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"71","Value":"983","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"3407872","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":598,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3833856,"Internal":true,"Count Failed Values":true},{"ID":426,"Name":"internal.metrics.resultSerializationTime","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3747,"Value":48195,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":65780719,"Value":672700375,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":201,"Value":2617,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":12679202,"Value":135284248,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":71,"Value":544,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":71,"Executor Deserialize CPU Time":12679202,"Executor Run Time":201,"Executor CPU Time":65780719,"Peak Execution Memory":294912,"Result Size":3747,"JVM GC Time":0,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":47,"Index":12,"Attempt":0,"Launch Time":1678162990604,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990919,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"73","Value":"1056","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"3670016","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":644,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4128768,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":51899,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":33738577,"Value":706438952,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":213,"Value":2830,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":12023300,"Value":147307548,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":63,"Value":607,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":63,"Executor Deserialize CPU Time":12023300,"Executor Run Time":213,"Executor CPU Time":33738577,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":55,"Index":20,"Attempt":0,"Launch Time":1678162990606,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990920,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"76","Value":"1132","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"3932160","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":690,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4423680,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":55603,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":47410768,"Value":753849720,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":212,"Value":3042,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11045615,"Value":158353163,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":65,"Value":672,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":65,"Executor Deserialize CPU Time":11045615,"Executor Run Time":212,"Executor CPU Time":47410768,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":57,"Index":22,"Attempt":0,"Launch Time":1678162990606,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990920,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"53","Value":"1185","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"4194304","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":736,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4718592,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":59307,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":23775227,"Value":777624947,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":188,"Value":3230,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":8431652,"Value":166784815,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":80,"Value":752,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":80,"Executor Deserialize CPU Time":8431652,"Executor Run Time":188,"Executor CPU Time":23775227,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":49,"Index":14,"Attempt":0,"Launch Time":1678162990604,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990928,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"55","Value":"1240","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"4456448","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":782,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5013504,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":63011,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":103555165,"Value":881180112,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":211,"Value":3441,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":15266346,"Value":182051161,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":62,"Value":814,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":62,"Executor Deserialize CPU Time":15266346,"Executor Run Time":211,"Executor CPU Time":103555165,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":39,"Index":4,"Attempt":0,"Launch Time":1678162990602,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990928,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"71","Value":"1311","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":828,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5308416,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":66715,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":50668091,"Value":931848203,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":223,"Value":3664,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":15389316,"Value":197440477,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":57,"Value":871,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":57,"Executor Deserialize CPU Time":15389316,"Executor Run Time":223,"Executor CPU Time":50668091,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":43,"Index":0,"Attempt":0,"Launch Time":1678162990603,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990935,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"74","Value":"1385","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"4980736","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":874,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5603328,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":70419,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":71691936,"Value":1003540139,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":194,"Value":3858,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":9698926,"Value":207139403,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":91,"Value":962,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":91,"Executor Deserialize CPU Time":9698926,"Executor Run Time":194,"Executor CPU Time":71691936,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":51,"Index":8,"Attempt":0,"Launch Time":1678162990605,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990935,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"76","Value":"1461","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"5242880","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":920,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5898240,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":74123,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":44219301,"Value":1047759440,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":208,"Value":4066,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":16558084,"Value":223697487,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":78,"Value":1040,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":78,"Executor Deserialize CPU Time":16558084,"Executor Run Time":208,"Executor CPU Time":44219301,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":59,"Index":16,"Attempt":0,"Launch Time":1678162990607,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990939,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"76","Value":"1537","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"5505024","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":966,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6193152,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":77827,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":56415182,"Value":1104174622,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":224,"Value":4290,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":10563362,"Value":234260849,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":61,"Value":1101,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":61,"Executor Deserialize CPU Time":10563362,"Executor Run Time":224,"Executor CPU Time":56415182,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":67,"Index":24,"Attempt":0,"Launch Time":1678162990615,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990945,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"84","Value":"1621","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"5767168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"1","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1012,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6488064,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":81531,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":55389061,"Value":1159563683,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":215,"Value":4505,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":8273412,"Value":242534261,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":78,"Value":1179,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":78,"Executor Deserialize CPU Time":8273412,"Executor Run Time":215,"Executor CPU Time":55389061,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":41,"Index":6,"Attempt":0,"Launch Time":1678162990602,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990948,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"75","Value":"1696","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"6029312","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1058,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6782976,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":85235,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":19413949,"Value":1178977632,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":228,"Value":4733,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":10586062,"Value":253120323,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":69,"Value":1248,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":69,"Executor Deserialize CPU Time":10586062,"Executor Run Time":228,"Executor CPU Time":19413949,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":65,"Index":30,"Attempt":0,"Launch Time":1678162990614,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990951,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"76","Value":"1772","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"6291456","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1104,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7077888,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":88939,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":50978166,"Value":1229955798,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":250,"Value":4983,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":8664373,"Value":261784696,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":45,"Value":1293,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":45,"Executor Deserialize CPU Time":8664373,"Executor Run Time":250,"Executor CPU Time":50978166,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":68,"Index":32,"Attempt":0,"Launch Time":1678162990914,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990951,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"9","Value":"1781","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"6553600","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1150,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7372800,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":92643,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":14351311,"Value":1244307109,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":16,"Value":4999,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":3814989,"Value":265599685,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":4,"Value":1297,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":4,"Executor Deserialize CPU Time":3814989,"Executor Run Time":16,"Executor CPU Time":14351311,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":58,"Index":21,"Attempt":0,"Launch Time":1678162990607,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990961,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"71","Value":"1852","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"6815744","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1196,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7667712,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":96347,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":58835547,"Value":1303142656,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":261,"Value":5260,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11416406,"Value":277016091,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":50,"Value":1347,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":892630656,"JVMOffHeapMemory":123492160,"OnHeapExecutionMemory":65536,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":667115,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":732651,"OffHeapUnifiedMemory":0,"DirectPoolMemory":19676008,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7151345664,"ProcessTreeJVMRSSMemory":1655054336,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":11,"MinorGCTime":223,"MajorGCCount":3,"MajorGCTime":283},"Task Metrics":{"Executor Deserialize Time":50,"Executor Deserialize CPU Time":11416406,"Executor Run Time":261,"Executor CPU Time":58835547,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":42,"Index":5,"Attempt":0,"Launch Time":1678162990602,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990961,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"71","Value":"1923","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"7077888","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1242,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7962624,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":100051,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":66422831,"Value":1369565487,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":223,"Value":5483,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11488196,"Value":288504287,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":89,"Value":1436,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":892630656,"JVMOffHeapMemory":123492160,"OnHeapExecutionMemory":65536,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":667115,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":732651,"OffHeapUnifiedMemory":0,"DirectPoolMemory":19676008,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7151345664,"ProcessTreeJVMRSSMemory":1655054336,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":11,"MinorGCTime":223,"MajorGCCount":3,"MajorGCTime":283},"Task Metrics":{"Executor Deserialize Time":89,"Executor Deserialize CPU Time":11488196,"Executor Run Time":223,"Executor CPU Time":66422831,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":48,"Index":11,"Attempt":0,"Launch Time":1678162990604,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990962,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"71","Value":"1994","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"7340032","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1288,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8257536,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":103755,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":31804881,"Value":1401370368,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":239,"Value":5722,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":13505518,"Value":302009805,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":81,"Value":1517,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":81,"Executor Deserialize CPU Time":13505518,"Executor Run Time":239,"Executor CPU Time":31804881,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":40,"Index":3,"Attempt":0,"Launch Time":1678162990602,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990962,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"72","Value":"2066","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"7602176","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1334,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8552448,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":107459,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":82566623,"Value":1483936991,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":239,"Value":5961,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7618637,"Value":309628442,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":82,"Value":1599,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":82,"Executor Deserialize CPU Time":7618637,"Executor Run Time":239,"Executor CPU Time":82566623,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":64,"Index":27,"Attempt":0,"Launch Time":1678162990614,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990974,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"71","Value":"2137","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"7864320","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1380,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8847360,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":111163,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":23589394,"Value":1507526385,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":268,"Value":6229,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6338441,"Value":315966883,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":49,"Value":1648,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":49,"Executor Deserialize CPU Time":6338441,"Executor Run Time":268,"Executor CPU Time":23589394,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":66,"Index":29,"Attempt":0,"Launch Time":1678162990615,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990974,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"72","Value":"2209","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"8126464","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1426,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9142272,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":114867,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":46349815,"Value":1553876200,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":265,"Value":6494,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11085006,"Value":327051889,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":44,"Value":1692,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":892630656,"JVMOffHeapMemory":123492160,"OnHeapExecutionMemory":65536,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":667115,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":732651,"OffHeapUnifiedMemory":0,"DirectPoolMemory":19676008,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7151345664,"ProcessTreeJVMRSSMemory":1655054336,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":11,"MinorGCTime":223,"MajorGCCount":3,"MajorGCTime":283},"Task Metrics":{"Executor Deserialize Time":44,"Executor Deserialize CPU Time":11085006,"Executor Run Time":265,"Executor CPU Time":46349815,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":50,"Index":13,"Attempt":0,"Launch Time":1678162990605,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990975,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"72","Value":"2281","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"8388608","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1472,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9437184,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":118571,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":32787257,"Value":1586663457,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":267,"Value":6761,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":16976246,"Value":344028135,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":48,"Value":1740,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":892630656,"JVMOffHeapMemory":123492160,"OnHeapExecutionMemory":65536,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":667115,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":732651,"OffHeapUnifiedMemory":0,"DirectPoolMemory":19676008,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7151345664,"ProcessTreeJVMRSSMemory":1655054336,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":11,"MinorGCTime":223,"MajorGCCount":3,"MajorGCTime":283},"Task Metrics":{"Executor Deserialize Time":48,"Executor Deserialize CPU Time":16976246,"Executor Run Time":267,"Executor CPU Time":32787257,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":56,"Index":19,"Attempt":0,"Launch Time":1678162990606,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990977,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"73","Value":"2354","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"8650752","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1518,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9732096,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":122275,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":57206426,"Value":1643869883,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":268,"Value":7029,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6415841,"Value":350443976,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":49,"Value":1789,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":49,"Executor Deserialize CPU Time":6415841,"Executor Run Time":268,"Executor CPU Time":57206426,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":7,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":26,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"68\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[25],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":24,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"73\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[23],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":25,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"69\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[24],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":23,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"73\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162990576,"Completion Time":1678162990978,"Accumulables":[{"ID":424,"Name":"internal.metrics.resultSize","Value":122275,"Internal":true,"Count Failed Values":true},{"ID":418,"Name":"time in aggregation build","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Value":350443976,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Value":9732096,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Value":1789,"Internal":true,"Count Failed Values":true},{"ID":414,"Name":"duration","Value":"2354","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Value":1518,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Value":1643869883,"Internal":true,"Count Failed Values":true},{"ID":426,"Name":"internal.metrics.resultSerializationTime","Value":1,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Value":7029,"Internal":true,"Count Failed Values":true},{"ID":416,"Name":"peak memory","Value":"8650752","Internal":true,"Count Failed Values":true,"Metadata":"sql"}]}} -{"Event":"SparkListenerJobEnd","Job ID":4,"Completion Time":1678162990987,"Job Result":{"Result":"JobSucceeded"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate","executionId":6,"physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]\n+- Aggregate [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count(1) AS count#537L]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]\n+- Aggregate [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count(1) AS count#537L]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]\n+- Aggregate [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count(1) AS count#537L]\n +- Project [url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13 AS Site#247, content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]\n+- AdaptiveSparkPlan isFinalPlan=true\n +- *(2) HashAggregate(keys=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], functions=[count(1)], output=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count#537L])\n +- CustomShuffleReader coalesced\n +- ShuffleQueryStage 0\n +- Exchange hashpartitioning(Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, 1000), true, [id=#302]\n +- *(1) HashAggregate(keys=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], functions=[partial_count(1)], output=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count#1662L])\n +- *(1) Project [url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13 AS Site#247, content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- InMemoryTableScan [content_charset#26, content_languages#27, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13, url_host_tld#7]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=true","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], functions=[count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"CustomShuffleReader","simpleString":"CustomShuffleReader coalesced","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 0","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, 1000), true, [id=#302]","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], functions=[partial_count(1)])","children":[{"nodeName":"Project","simpleString":"Project [url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13 AS Site#247, content_charset#26 AS Charset#311, content_languages#27 AS Language#279]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [content_charset#26, content_languages#27, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13, url_host_tld#7]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":397,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":417,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":418,"metricType":"timing"},{"name":"peak memory","accumulatorId":416,"metricType":"size"},{"name":"number of output rows","accumulatorId":415,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":419,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":414,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":407,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":408,"metricType":"nsTiming"},{"name":"records read","accumulatorId":405,"metricType":"sum"},{"name":"local bytes read","accumulatorId":403,"metricType":"size"},{"name":"fetch wait time","accumulatorId":404,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":401,"metricType":"size"},{"name":"local blocks read","accumulatorId":400,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":399,"metricType":"sum"},{"name":"data size","accumulatorId":398,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":402,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":406,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":459,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":460,"metricType":"timing"},{"name":"peak memory","accumulatorId":458,"metricType":"size"},{"name":"number of output rows","accumulatorId":457,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":461,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":456,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":383,"metricType":"sum"},{"name":"written output","accumulatorId":384,"metricType":"size"},{"name":"number of output rows","accumulatorId":385,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":386,"metricType":"sum"}]}} -{"Event":"SparkListenerJobStart","Job ID":5,"Submission Time":1678162991085,"Stage Infos":[{"Stage ID":9,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":28,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"75\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[27],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":27,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"78\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[26],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[8],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]},{"Stage ID":8,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":26,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"68\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[25],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":24,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"73\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[23],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":25,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"69\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[24],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":23,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"73\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]}],"Stage IDs":[9,8],"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"66\",\"name\":\"Execute InsertIntoHadoopFsRelationCommand\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"6","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":9,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":28,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"75\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[27],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":27,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"78\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[26],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[8],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162991087,"Accumulables":[]},"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"66\",\"name\":\"Execute InsertIntoHadoopFsRelationCommand\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"6","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerTaskStart","Stage ID":9,"Stage Attempt ID":0,"Task Info":{"Task ID":69,"Index":0,"Attempt":0,"Launch Time":1678162991129,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":9,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":69,"Index":0,"Attempt":0,"Launch Time":1678162991129,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162991765,"Failed":false,"Killed":false,"Accumulables":[{"ID":456,"Name":"duration","Update":"396","Value":"396","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":458,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":460,"Name":"time in aggregation build","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":485,"Name":"internal.metrics.output.bytesWritten","Update":87,"Value":87,"Internal":true,"Count Failed Values":true},{"ID":479,"Name":"internal.metrics.shuffle.read.recordsRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":478,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":477,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":476,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":475,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":474,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":473,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":471,"Name":"internal.metrics.peakExecutionMemory","Update":262144,"Value":262144,"Internal":true,"Count Failed Values":true},{"ID":468,"Name":"internal.metrics.resultSerializationTime","Update":2,"Value":2,"Internal":true,"Count Failed Values":true},{"ID":466,"Name":"internal.metrics.resultSize","Update":4800,"Value":4800,"Internal":true,"Count Failed Values":true},{"ID":465,"Name":"internal.metrics.executorCpuTime","Update":269717740,"Value":269717740,"Internal":true,"Count Failed Values":true},{"ID":464,"Name":"internal.metrics.executorRunTime","Update":552,"Value":552,"Internal":true,"Count Failed Values":true},{"ID":463,"Name":"internal.metrics.executorDeserializeCpuTime","Update":68577712,"Value":68577712,"Internal":true,"Count Failed Values":true},{"ID":462,"Name":"internal.metrics.executorDeserializeTime","Update":75,"Value":75,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":75,"Executor Deserialize CPU Time":68577712,"Executor Run Time":552,"Executor CPU Time":269717740,"Peak Execution Memory":262144,"Result Size":4800,"JVM GC Time":0,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":87,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":9,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":28,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"75\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[27],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":27,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"78\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[26],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[8],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162991087,"Completion Time":1678162991766,"Accumulables":[{"ID":478,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Value":0,"Internal":true,"Count Failed Values":true},{"ID":460,"Name":"time in aggregation build","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":463,"Name":"internal.metrics.executorDeserializeCpuTime","Value":68577712,"Internal":true,"Count Failed Values":true},{"ID":474,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":456,"Name":"duration","Value":"396","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":465,"Name":"internal.metrics.executorCpuTime","Value":269717740,"Internal":true,"Count Failed Values":true},{"ID":468,"Name":"internal.metrics.resultSerializationTime","Value":2,"Internal":true,"Count Failed Values":true},{"ID":477,"Name":"internal.metrics.shuffle.read.localBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":471,"Name":"internal.metrics.peakExecutionMemory","Value":262144,"Internal":true,"Count Failed Values":true},{"ID":462,"Name":"internal.metrics.executorDeserializeTime","Value":75,"Internal":true,"Count Failed Values":true},{"ID":479,"Name":"internal.metrics.shuffle.read.recordsRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":464,"Name":"internal.metrics.executorRunTime","Value":552,"Internal":true,"Count Failed Values":true},{"ID":473,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":458,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":485,"Name":"internal.metrics.output.bytesWritten","Value":87,"Internal":true,"Count Failed Values":true},{"ID":476,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Value":0,"Internal":true,"Count Failed Values":true},{"ID":466,"Name":"internal.metrics.resultSize","Value":4800,"Internal":true,"Count Failed Values":true},{"ID":475,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Value":0,"Internal":true,"Count Failed Values":true}]}} -{"Event":"SparkListenerJobEnd","Job ID":5,"Completion Time":1678162991767,"Job Result":{"Result":"JobSucceeded"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerDriverAccumUpdates","executionId":6,"accumUpdates":[[383,1],[384,87],[385,0],[386,0]]} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerQueryExecutionMetrics","executionId":6,"timePerRule":{"PruneFileSourcePartitions":253272,"ReassignLambdaVariableID":221432,"PushPredicateThroughNonJoin":120341,"Analyzer$HandleNullInputsForUDF":24848,"Analyzer$ResolveSubqueryColumnAliases":12844,"ResolveTimeZone":17803,"Analyzer$ResolveNamespace":14411,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":17584,"RewriteCorrelatedScalarSubquery":489498,"RemoveLiteralFromGroupExpressions":294694,"PushProjectionThroughUnion":538890,"EliminateSubqueryAliases":261703,"ResolveCatalogs":43322,"PushLeftSemiLeftAntiThroughJoin":468382,"FlattenScalarSubqueriesWithAggregates":199405,"LikeSimplification":22746726,"CollapseRepartition":1445916,"ResolveHints$ResolveCoalesceHints":13643,"Analyzer$ExtractGenerator":41229,"RewriteIntersectAll":267650,"ResolveHints$ResolveJoinStrategyHints":16484,"TypeCoercion$MapZipWithCoercion":17114,"NullPropagation":690506,"PullupCorrelatedPredicates":409545,"UpdateOuterReferences":21611,"ExtractPythonUDFs":397685,"Analyzer$WindowsSubstitution":17866,"CombineUnions":553256,"ExtractGroupingPythonUDFFromAggregate":136413,"ReorderAssociativeOperator":595564,"CleanupDynamicPruningFilters":866309,"ResolveHints$RemoveAllHints":18823,"SimplifyBinaryComparison":582612,"ResolveTableValuedFunctions":19548,"EliminateSerialization":352989,"TypeCoercion$BooleanEquality":20662,"ReplaceIntersectWithSemiJoin":239161,"ConstantPropagation":530918,"CostBasedJoinReorder":17893,"Analyzer$ResolveReferences":71033,"CTESubstitution":560141,"RemoveRedundantAliases":731939,"TypeCoercion$ImplicitTypeCasts":15808,"RewriteExceptAll":274716,"UpdateAttributeNullability":110371,"PropagateEmptyRelation":483979,"SimplifyCasts":582812,"EliminateMapObjects":246971,"CombineLimits":311007,"DetectAmbiguousSelfJoin":36203,"ReplaceExpressions":449311,"ResolveInlineTables":17395,"OptimizeIn":449957,"CollapseWindow":345087,"TypeCoercion$IfCoercion":17770,"ResolveSessionCatalog":24629,"PartitionPruning":173960,"BooleanSimplification":1806378,"TypeCoercion$PromoteStrings":17499,"Analyzer$ResolveAliases":16555,"DecimalAggregates":187219,"PruneFilters":611458,"Analyzer$ResolveMissingReferences":12837,"TransposeWindow":291583,"Analyzer$ResolveRelations":26192,"EliminateUnions":22924,"RewritePredicateSubquery":126175,"ObjectSerializerPruning":117383,"LimitPushDown":458036,"SimplifyCaseConversionExpressions":523113,"Analyzer$ResolveNaturalAndUsingJoin":13408,"EliminateView":288220,"CombineTypedFilters":131581,"OptimizeLimitZero":252729,"CheckCartesianProducts":33088,"ExtractPythonUDFFromAggregate":149646,"Analyzer$ExtractWindowExpressions":21631,"ReplaceExceptWithAntiJoin":266708,"ResolveLambdaVariables":21554,"FallBackFileSourceV2":13639,"Analyzer$ResolveTables":22022,"SubstituteUnresolvedOrdinals":16149,"TypeCoercion$CaseWhenCoercion":17846,"DecimalPrecision":24537,"EliminateSorts":201597,"PushDownLeftSemiAntiJoin":468436,"ExtractPythonUDFFromJoinCondition":136820,"TypeCoercion$StackCoercion":17218,"Analyzer$ResolveAggAliasInGroupBy":14095,"TypeCoercion$StringLiteralCoercion":16320,"FoldablePropagation":162402,"V2ScanRelationPushDown":224664,"EliminateDistinct":15964,"InferFiltersFromConstraints":176342,"Analyzer$PullOutNondeterministic":24631,"Analyzer$ResolveFunctions":19918,"ReplaceNullWithFalseInPredicate":535716,"ResolveHigherOrderFunctions":18362,"Analyzer$ResolvePivot":14745,"CollapseProject":1478242,"Analyzer$ResolveNewInstance":18174,"ColumnPruning":3239875,"Analyzer$ResolveWindowOrder":17389,"TypeCoercion$ConcatCoercion":20971,"PushDownPredicates":799751,"TimeWindowing":51036,"Optimizer$OptimizeSubqueries":837955,"RewriteNonCorrelatedExists":412790,"DemoteBroadcastHashJoin":43539,"TypeCoercion$Division":16709,"ComputeCurrentTime":462486,"ResolveCreateNamedStruct":19807,"TypeCoercion$EltCoercion":19551,"ConvertToLocalRelation":408215,"RemoveRepetitionFromGroupExpressions":362933,"ReplaceDistinctWithAggregate":265417,"PreprocessTableCreation":18272,"ResolveSQLOnFile":13787,"Analyzer$ResolveSubquery":13205,"CombineConcats":71118,"Analyzer$ResolveGroupingAnalytics":22627,"Analyzer$ResolveBinaryArithmetic":19376,"RemoveDispensableExpressions":640676,"Analyzer$ResolveAlterTableChanges":20488,"ResolveEncodersInScalaAgg":19584,"TypeCoercion$IntegralDivision":15946,"Analyzer$ResolveWindowFrame":15964,"Analyzer$ResolveDeserializer":20192,"RewriteDistinctAggregates":346549,"RemoveNoopOperators":7229896,"Analyzer$ResolveAggregateFunctions":14910,"NormalizeFloatingNumbers":116046,"ReorderJoin":534511,"Analyzer$ResolveUpCast":18464,"Analyzer$ResolveGenerate":16075,"TypeCoercion$WidenSetOperationTypes":14173,"EliminateOuterJoin":468131,"SimplifyExtractValueOps":637528,"OptimizeMetadataOnlyQuery":16186,"EliminateResolvedHint":565273,"Analyzer$ResolveInsertInto":12989,"ReplaceExceptWithFilter":271491,"CleanupAliases":24903,"GetCurrentDatabase":488348,"SchemaPruning":296715,"Analyzer$ResolveOutputRelation":13593,"BloomFilterJoinRule":142981,"Analyzer$ResolveRandomSeed":13957,"TypeCoercion$WindowFrameCoercion":17717,"ConstantFolding":443957,"TypeCoercion$DateTimeOperations":15555,"TypeCoercion$InConversion":45548,"FindDataSourceTable":15931,"SimplifyConditionals":4565594,"DataSourceAnalysis":14324,"TypeCoercion$FunctionArgumentConversion":16287,"Analyzer$GlobalAggregates":13642,"Analyzer$LookupFunctions":23350,"CombineFilters":444600,"ReplaceDeduplicateWithAggregate":257073,"PreprocessTableInsertion":12397},"numRunsPerRule":{"PruneFileSourcePartitions":1,"ReassignLambdaVariableID":1,"PushPredicateThroughNonJoin":1,"Analyzer$HandleNullInputsForUDF":1,"Analyzer$ResolveSubqueryColumnAliases":1,"ResolveTimeZone":1,"Analyzer$ResolveNamespace":1,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":1,"RewriteCorrelatedScalarSubquery":3,"RemoveLiteralFromGroupExpressions":1,"PushProjectionThroughUnion":3,"EliminateSubqueryAliases":1,"ResolveCatalogs":1,"PushLeftSemiLeftAntiThroughJoin":3,"FlattenScalarSubqueriesWithAggregates":1,"LikeSimplification":3,"CollapseRepartition":3,"ResolveHints$ResolveCoalesceHints":1,"Analyzer$ExtractGenerator":1,"RewriteIntersectAll":1,"ResolveHints$ResolveJoinStrategyHints":1,"TypeCoercion$MapZipWithCoercion":1,"NullPropagation":3,"PullupCorrelatedPredicates":1,"UpdateOuterReferences":1,"ExtractPythonUDFs":1,"Analyzer$WindowsSubstitution":1,"CombineUnions":4,"ExtractGroupingPythonUDFFromAggregate":1,"ReorderAssociativeOperator":3,"CleanupDynamicPruningFilters":1,"ResolveHints$RemoveAllHints":1,"SimplifyBinaryComparison":3,"ResolveTableValuedFunctions":1,"EliminateSerialization":3,"TypeCoercion$BooleanEquality":1,"ReplaceIntersectWithSemiJoin":1,"ConstantPropagation":3,"CostBasedJoinReorder":1,"Analyzer$ResolveReferences":1,"CTESubstitution":1,"RemoveRedundantAliases":3,"TypeCoercion$ImplicitTypeCasts":1,"RewriteExceptAll":1,"UpdateAttributeNullability":1,"PropagateEmptyRelation":2,"SimplifyCasts":3,"EliminateMapObjects":1,"CombineLimits":3,"DetectAmbiguousSelfJoin":1,"ReplaceExpressions":1,"ResolveInlineTables":1,"OptimizeIn":3,"CollapseWindow":3,"TypeCoercion$IfCoercion":1,"ResolveSessionCatalog":1,"PartitionPruning":1,"BooleanSimplification":3,"TypeCoercion$PromoteStrings":1,"Analyzer$ResolveAliases":1,"DecimalAggregates":1,"PruneFilters":4,"Analyzer$ResolveMissingReferences":1,"TransposeWindow":3,"Analyzer$ResolveRelations":1,"EliminateUnions":1,"RewritePredicateSubquery":1,"ObjectSerializerPruning":1,"LimitPushDown":3,"SimplifyCaseConversionExpressions":3,"Analyzer$ResolveNaturalAndUsingJoin":1,"EliminateView":1,"CombineTypedFilters":1,"OptimizeLimitZero":1,"CheckCartesianProducts":2,"ExtractPythonUDFFromAggregate":1,"Analyzer$ExtractWindowExpressions":1,"ReplaceExceptWithAntiJoin":1,"ResolveLambdaVariables":1,"FallBackFileSourceV2":1,"Analyzer$ResolveTables":1,"SubstituteUnresolvedOrdinals":1,"TypeCoercion$CaseWhenCoercion":1,"DecimalPrecision":1,"EliminateSorts":1,"PushDownLeftSemiAntiJoin":3,"ExtractPythonUDFFromJoinCondition":1,"TypeCoercion$StackCoercion":1,"Analyzer$ResolveAggAliasInGroupBy":1,"TypeCoercion$StringLiteralCoercion":1,"FoldablePropagation":3,"V2ScanRelationPushDown":1,"EliminateDistinct":1,"InferFiltersFromConstraints":1,"Analyzer$PullOutNondeterministic":1,"Analyzer$ResolveFunctions":1,"ReplaceNullWithFalseInPredicate":3,"ResolveHigherOrderFunctions":1,"Analyzer$ResolvePivot":1,"CollapseProject":4,"Analyzer$ResolveNewInstance":1,"ColumnPruning":5,"Analyzer$ResolveWindowOrder":1,"TypeCoercion$ConcatCoercion":1,"PushDownPredicates":5,"TimeWindowing":1,"Optimizer$OptimizeSubqueries":3,"RewriteNonCorrelatedExists":1,"DemoteBroadcastHashJoin":1,"TypeCoercion$Division":1,"ComputeCurrentTime":1,"ResolveCreateNamedStruct":1,"TypeCoercion$EltCoercion":1,"ConvertToLocalRelation":2,"RemoveRepetitionFromGroupExpressions":1,"ReplaceDistinctWithAggregate":1,"PreprocessTableCreation":1,"ResolveSQLOnFile":1,"Analyzer$ResolveSubquery":1,"CombineConcats":3,"Analyzer$ResolveGroupingAnalytics":1,"Analyzer$ResolveBinaryArithmetic":1,"RemoveDispensableExpressions":3,"Analyzer$ResolveAlterTableChanges":1,"ResolveEncodersInScalaAgg":1,"TypeCoercion$IntegralDivision":1,"Analyzer$ResolveWindowFrame":1,"Analyzer$ResolveDeserializer":1,"RewriteDistinctAggregates":1,"RemoveNoopOperators":5,"Analyzer$ResolveAggregateFunctions":1,"NormalizeFloatingNumbers":1,"ReorderJoin":3,"Analyzer$ResolveUpCast":1,"Analyzer$ResolveGenerate":1,"TypeCoercion$WidenSetOperationTypes":1,"EliminateOuterJoin":3,"SimplifyExtractValueOps":3,"OptimizeMetadataOnlyQuery":1,"EliminateResolvedHint":1,"Analyzer$ResolveInsertInto":1,"ReplaceExceptWithFilter":1,"CleanupAliases":1,"GetCurrentDatabase":1,"SchemaPruning":1,"Analyzer$ResolveOutputRelation":1,"BloomFilterJoinRule":1,"Analyzer$ResolveRandomSeed":1,"TypeCoercion$WindowFrameCoercion":1,"ConstantFolding":3,"TypeCoercion$DateTimeOperations":1,"TypeCoercion$InConversion":1,"FindDataSourceTable":1,"SimplifyConditionals":3,"DataSourceAnalysis":1,"TypeCoercion$FunctionArgumentConversion":1,"Analyzer$GlobalAggregates":1,"Analyzer$LookupFunctions":1,"CombineFilters":4,"ReplaceDeduplicateWithAggregate":1,"PreprocessTableInsertion":1},"numEffectiveRunsPerRule":{"ColumnPruning":1,"CollapseProject":1},"timeEffectiveRunsPerRule":{"ColumnPruning":2159753,"CollapseProject":1117639},"counters":{},"timers":{}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":6,"time":1678162991856} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":7,"description":"save at NativeMethodAccessorImpl.java:0","details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Project [Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L]\n : +- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Repartition 50000, false\n : +- Filter (content_languages#27 = eng)\n : +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n +- Aggregate [Site#485], [Site#485, avg(cast(levenshtein_distance#215 as bigint)) AS Levenshtein_Distance#411]\n +- Filter (((subset#33 = warc) AND (levenshtein_distance#215 <= 2)) AND isnotnull(Site#485))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Project [Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L]\n : +- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Repartition 50000, false\n : +- Filter (content_languages#27 = eng)\n : +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n +- Aggregate [Site#485], [Site#485, avg(cast(levenshtein_distance#215 as bigint)) AS Levenshtein_Distance#411]\n +- Filter (((subset#33 = warc) AND (levenshtein_distance#215 <= 2)) AND isnotnull(Site#485))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247, Date#344], [Site#247, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L]\n : +- Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#2420 AS Date#344]\n : +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#2420]\n : +- Project [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n : +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n : +- Coalesce 50000\n : +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n : +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n : +- *(1) ColumnarToRow\n : +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- AdaptiveSparkPlan isFinalPlan=false\n +- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- SortMergeJoin [Site#247], [Site#485], FullOuter\n :- Sort [Site#247 ASC NULLS FIRST], false, 0\n : +- HashAggregate(keys=[Site#247], functions=[sum(Total_Record_Length#451L), sum(Total_Pages#449L), avg(Total_Pages#449L)], output=[Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L])\n : +- Exchange hashpartitioning(Site#247, 1000), true, [id=#387]\n : +- HashAggregate(keys=[Site#247], functions=[partial_sum(Total_Record_Length#451L), partial_sum(Total_Pages#449L), partial_avg(Total_Pages#449L)], output=[Site#247, sum#2727L, sum#2728L, sum#2729, count#2730L])\n : +- HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint))], output=[Site#247, Total_Pages#449L, Total_Record_Length#451L])\n : +- Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#383]\n : +- HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint))], output=[Site#247, Date#344, count#2733L, sum#2734L])\n : +- Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#2420 AS Date#344]\n : +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#2420]\n : +- InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n : +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n : +- Coalesce 50000\n : +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n : +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n : +- *(1) ColumnarToRow\n : +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=false","children":[{"nodeName":"Project","simpleString":"Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]","children":[{"nodeName":"SortMergeJoin","simpleString":"SortMergeJoin [Site#247], [Site#485], FullOuter","children":[{"nodeName":"Sort","simpleString":"Sort [Site#247 ASC NULLS FIRST], false, 0","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247], functions=[sum(Total_Record_Length#451L), sum(Total_Pages#449L), avg(Total_Pages#449L)])","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, 1000), true, [id=#387]","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247], functions=[partial_sum(Total_Record_Length#451L), partial_sum(Total_Pages#449L), partial_avg(Total_Pages#449L)])","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#383]","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"Project","simpleString":"Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#2420 AS Date#344]","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [(fetch_time#20)], [pythonUDF0#2420]","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":614,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":611,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":612,"metricType":"timing"},{"name":"peak memory","accumulatorId":610,"metricType":"size"},{"name":"number of output rows","accumulatorId":609,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":613,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":496,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":497,"metricType":"nsTiming"},{"name":"records read","accumulatorId":494,"metricType":"sum"},{"name":"local bytes read","accumulatorId":492,"metricType":"size"},{"name":"fetch wait time","accumulatorId":493,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":490,"metricType":"size"},{"name":"local blocks read","accumulatorId":489,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":488,"metricType":"sum"},{"name":"data size","accumulatorId":487,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":491,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":495,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":606,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":607,"metricType":"timing"},{"name":"peak memory","accumulatorId":605,"metricType":"size"},{"name":"number of output rows","accumulatorId":604,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":608,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":601,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":602,"metricType":"timing"},{"name":"peak memory","accumulatorId":600,"metricType":"size"},{"name":"number of output rows","accumulatorId":599,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":603,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":507,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":508,"metricType":"nsTiming"},{"name":"records read","accumulatorId":505,"metricType":"sum"},{"name":"local bytes read","accumulatorId":503,"metricType":"size"},{"name":"fetch wait time","accumulatorId":504,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":501,"metricType":"size"},{"name":"local blocks read","accumulatorId":500,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":499,"metricType":"sum"},{"name":"data size","accumulatorId":498,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":502,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":506,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":596,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":597,"metricType":"timing"},{"name":"peak memory","accumulatorId":595,"metricType":"size"},{"name":"number of output rows","accumulatorId":594,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":598,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":591,"metricType":"timing"},{"name":"peak memory","accumulatorId":592,"metricType":"size"},{"name":"spill size","accumulatorId":593,"metricType":"size"}]},{"nodeName":"Sort","simpleString":"Sort [Site#485 ASC NULLS FIRST], false, 0","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#485], functions=[avg(cast(levenshtein_distance#215 as bigint))])","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#485, 1000), true, [id=#389]","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#485], functions=[partial_avg(cast(levenshtein_distance#215 as bigint))])","children":[{"nodeName":"Project","simpleString":"Project [url_host_registered_domain#13 AS Site#485, pythonUDF0#2422 AS levenshtein_distance#215]","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [levenshtein_distance(wikipedia, url_host_2nd_last_part#8)], [pythonUDF0#2422]","children":[{"nodeName":"Project","simpleString":"Project [url_host_2nd_last_part#8, url_host_registered_domain#13]","children":[{"nodeName":"Filter","simpleString":"Filter (pythonUDF0#2421 <= 2)","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [levenshtein_distance(wikipedia, url_host_2nd_last_part#8)], [pythonUDF0#2421]","children":[{"nodeName":"Filter","simpleString":"Filter ((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(url_host_registered_domain#13))","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [url_host_2nd_last_part#8, url_host_registered_domain#13, subset#33], [isnotnull(subset#33), (subset#33 = warc), isnotnull(url_host_registered_domain#13)]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":630,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":629,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":628,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":625,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":626,"metricType":"timing"},{"name":"peak memory","accumulatorId":624,"metricType":"size"},{"name":"number of output rows","accumulatorId":623,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":627,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":518,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":519,"metricType":"nsTiming"},{"name":"records read","accumulatorId":516,"metricType":"sum"},{"name":"local bytes read","accumulatorId":514,"metricType":"size"},{"name":"fetch wait time","accumulatorId":515,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":512,"metricType":"size"},{"name":"local blocks read","accumulatorId":511,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":510,"metricType":"sum"},{"name":"data size","accumulatorId":509,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":513,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":517,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":620,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":621,"metricType":"timing"},{"name":"peak memory","accumulatorId":619,"metricType":"size"},{"name":"number of output rows","accumulatorId":618,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":622,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":615,"metricType":"timing"},{"name":"peak memory","accumulatorId":616,"metricType":"size"},{"name":"spill size","accumulatorId":617,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":590,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":586,"metricType":"sum"},{"name":"written output","accumulatorId":587,"metricType":"size"},{"name":"number of output rows","accumulatorId":588,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":589,"metricType":"sum"}]},"time":1678162992108} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerEffectiveSQLConf","executionId":7,"effectiveSQLConf":{"spark.sql.adaptive.coalescePartitions.initialPartitionNum":"","spark.sql.streaming.disabledV2Writers":"","spark.sql.streaming.noDataMicroBatches.enabled":"true","spark.sql.files.maxPartitionBytes":"128MB","spark.sql.thriftserver.ui.retainedStatements":"200","spark.sql.ui.retainedExecutions":"1000","spark.sql.execution.arrow.pyspark.fallback.enabled":"","spark.sql.sources.parallelPartitionDiscovery.threshold":"32","spark.sql.cbo.joinReorder.enabled":"false","spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.int96AsTimestamp":"true","spark.sql.thriftserver.ui.retainedSessions":"200","spark.sql.hive.verifyPartitionPath":"false","spark.sql.autoBroadcastJoinThreshold":"10MB","spark.sql.orc.mergeSchema":"false","spark.sql.adaptive.enabled":"true","spark.sql.adaptive.skewJoin.enabled":"true","spark.sql.optimizer.sizeBasedJoinReorder.enabled":"true","spark.sql.streaming.ui.retainedQueries":"100","spark.sql.orc.enableVectorizedReader":"true","spark.sql.streaming.continuous.epochBacklogQueueSize":"10000","spark.sql.parquet.binaryAsString":"false","spark.sql.pivotMaxValues":"10000","spark.sql.adaptive.localShuffleReader.enabled":"true","spark.sql.storeAssignmentPolicy":"ANSI","spark.sql.redaction.options.regex":"(?i)url","spark.sql.cbo.joinReorder.dp.star.filter":"false","spark.sql.parser.quotedRegexColumnNames":"false","spark.sql.files.ignoreMissingFiles":"false","spark.sql.streaming.ui.enabled":"true","spark.sql.execution.arrow.sparkr.enabled":"false","spark.sql.orc.filterPushdown":"true","spark.sql.thriftserver.scheduler.pool":"","spark.sql.catalog.spark_catalog":"","spark.sql.datetime.java8API.enabled":"false","spark.sql.execution.pandas.udf.buffer.size":"","spark.sql.function.eltOutputAsString":"false","spark.sql.cbo.joinReorder.dp.threshold":"12","spark.sql.statsImprovements.enabled":"true","spark.sql.parquet.columnarReaderBatchSize":"4096","spark.sql.parquet.outputTimestampType":"INT96","spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes":"256MB","spark.sql.statistics.size.autoUpdate.enabled":"false","spark.sql.hive.filesourcePartitionFileCacheSize":"262144000","spark.sql.streaming.ui.retainedProgressUpdates":"100","spark.sql.inMemoryColumnarStorage.compressed":"true","spark.sql.pyspark.jvmStacktrace.enabled":"false","spark.sql.cbo.starSchemaDetection":"false","spark.sql.columnNameOfCorruptRecord":"_corrupt_record","spark.sql.adaptive.advisoryPartitionSizeInBytes":"","spark.sql.avro.compression.codec":"snappy","spark.sql.streaming.metricsEnabled":"false","spark.sql.ui.pruneCachedInMemoryRelation":"true","spark.sql.event.truncate.length":"2147483647","spark.sql.groupByAliases":"true","spark.sql.hive.metastorePartitionPruning":"true","spark.sql.execution.arrow.enabled":"false","spark.sql.optimizer.dynamicPartitionPruning.enabled":"true","spark.sql.extendedEventInfo":"true","spark.sql.sources.bucketing.maxBuckets":"100000","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.jsonGenerator.ignoreNullFields":"true","spark.sql.optimizer.flattenScalarSubqueriesWithAggregates.enabled":"true","spark.sql.debug.maxToStringFields":"25","spark.sql.files.maxRecordsPerFile":"0","spark.sql.hive.thriftServer.singleSession":"false","spark.sql.parquet.mergeSchema":"false","spark.sql.adaptive.skewJoin.skewedPartitionFactor":"5","spark.sql.queryExecutionListeners":"","spark.sql.streaming.numRecentProgressUpdates":"100","spark.sql.variable.substitute":"true","spark.sql.adaptive.shuffle.mapOutputStats.useDataSize":"true","spark.sql.function.concatBinaryAsString":"false","spark.sql.avro.deflate.level":"-1","spark.sql.repl.eagerEval.maxNumRows":"20","spark.sql.ansi.enabled":"false","spark.sql.legacy.allowHashOnMapType":"false","spark.sql.bloomFilterJoin.maxFilterBytes":"2097152","spark.sql.orderByOrdinal":"true","spark.sql.streaming.stopTimeout":"0","spark.sql.adaptive.coalescePartitions.enabled":"true","spark.sql.session.timeZone":"UTC","spark.sql.parquet.respectSummaryFiles":"false","spark.sql.parquet.enableVectorizedReader":"true","spark.sql.cbo.enabled":"false","spark.sql.sources.bucketing.enabled":"true","spark.sql.csv.filterPushdown.enabled":"true","spark.sql.repl.eagerEval.truncate":"20","spark.sql.cbo.planStats.enabled":"false","spark.sql.optimizer.excludedRules":"","spark.sql.sources.partitionColumnTypeInference.enabled":"true","spark.sql.sortMergeJoinExec.extendedCodegen.enabled":"true","spark.sql.execution.arrow.fallback.enabled":"true","spark.sql.streaming.forceDeleteTempCheckpointLocation":"false","spark.sql.maxPlanStringLength":"2147483632","spark.sql.hive.manageFilesourcePartitions":"true","spark.sql.parquet.int96TimestampConversion":"false","spark.sql.sources.partitionOverwriteMode":"STATIC","spark.sql.streaming.checkpointLocation":"","spark.sql.broadcastTimeout":"300","spark.sql.execution.arrow.pyspark.enabled":"","spark.sql.repl.eagerEval.enabled":"false","spark.sql.mapKeyDedupPolicy":"EXCEPTION","spark.sql.parquet.filterPushdown":"true","spark.sql.maven.additionalRemoteRepositories":"https://maven-central.storage-download.googleapis.com/maven2/","spark.sql.orc.compression.codec":"snappy","spark.sql.statistics.histogram.enabled":"false","spark.sql.streaming.stopActiveRunOnRestart":"true","spark.sql.orc.columnarReaderBatchSize":"4096","spark.sql.redaction.string.regex":"","spark.sql.inMemoryColumnarStorage.batchSize":"10000","spark.sql.parquet.writeLegacyFormat":"false","spark.sql.statistics.fallBackToHdfs":"false","spark.sql.defaultCatalog":"spark_catalog","spark.sql.streaming.fileSource.cleaner.numThreads":"1","spark.sql.legacy.sessionInitWithConfigDefaults":"false","spark.sql.adaptive.coalescePartitions.minPartitionNum":"","spark.sql.execution.arrow.maxRecordsPerBatch":"10000","spark.sql.stringHashComputationWithoutCopyMemory.enabled":"true","spark.sql.streaming.streamingQueryListeners":"","spark.sql.files.ignoreCorruptFiles":"false","spark.sql.inMemoryColumnarStorage.enableVectorizedReader":"true","spark.sql.extensions":"","spark.sql.sources.default":"parquet","spark.sql.ui.extendedInfo":"true","spark.sql.bloomFilterJoin.enabled":"true","spark.sql.parquet.compression.codec":"snappy","spark.sql.optimizer.distinctBeforeIntersect.enabled":"true","spark.sql.parquet.recordLevelFilter.enabled":"false","spark.sql.shuffle.partitions":"200","spark.sql.groupByOrdinal":"true"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate","executionId":7,"physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Project [Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L]\n : +- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Repartition 50000, false\n : +- Filter (content_languages#27 = eng)\n : +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n +- Aggregate [Site#485], [Site#485, avg(cast(levenshtein_distance#215 as bigint)) AS Levenshtein_Distance#411]\n +- Filter (((subset#33 = warc) AND (levenshtein_distance#215 <= 2)) AND isnotnull(Site#485))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Project [Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L]\n : +- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Repartition 50000, false\n : +- Filter (content_languages#27 = eng)\n : +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n +- Aggregate [Site#485], [Site#485, avg(cast(levenshtein_distance#215 as bigint)) AS Levenshtein_Distance#411]\n +- Filter (((subset#33 = warc) AND (levenshtein_distance#215 <= 2)) AND isnotnull(Site#485))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247, Date#344], [Site#247, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L]\n : +- Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#2420 AS Date#344]\n : +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#2420]\n : +- Project [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n : +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n : +- Coalesce 50000\n : +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n : +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n : +- *(1) ColumnarToRow\n : +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- AdaptiveSparkPlan isFinalPlan=false\n +- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- SortMergeJoin [Site#247], [Site#485], FullOuter\n :- Sort [Site#247 ASC NULLS FIRST], false, 0\n : +- HashAggregate(keys=[Site#247], functions=[sum(Total_Record_Length#451L), sum(Total_Pages#449L), avg(Total_Pages#449L)], output=[Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L])\n : +- Exchange hashpartitioning(Site#247, 1000), true, [id=#439]\n : +- HashAggregate(keys=[Site#247], functions=[partial_sum(Total_Record_Length#451L), partial_sum(Total_Pages#449L), partial_avg(Total_Pages#449L)], output=[Site#247, sum#2727L, sum#2728L, sum#2729, count#2730L])\n : +- HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint))], output=[Site#247, Total_Pages#449L, Total_Record_Length#451L])\n : +- ShuffleQueryStage 0\n : +- Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#435]\n : +- *(1) HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint))], output=[Site#247, Date#344, count#2733L, sum#2734L])\n : +- *(1) Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#2420 AS Date#344]\n : +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#2420]\n : +- InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n : +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n : +- Coalesce 50000\n : +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n : +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n : +- *(1) ColumnarToRow\n : +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=false","children":[{"nodeName":"Project","simpleString":"Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]","children":[{"nodeName":"SortMergeJoin","simpleString":"SortMergeJoin [Site#247], [Site#485], FullOuter","children":[{"nodeName":"Sort","simpleString":"Sort [Site#247 ASC NULLS FIRST], false, 0","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247], functions=[sum(Total_Record_Length#451L), sum(Total_Pages#449L), avg(Total_Pages#449L)])","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, 1000), true, [id=#439]","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247], functions=[partial_sum(Total_Record_Length#451L), partial_sum(Total_Pages#449L), partial_avg(Total_Pages#449L)])","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 0","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#435]","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"Project","simpleString":"Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#2420 AS Date#344]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [(fetch_time#20)], [pythonUDF0#2420]","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":614,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":686,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":687,"metricType":"timing"},{"name":"peak memory","accumulatorId":685,"metricType":"size"},{"name":"number of output rows","accumulatorId":684,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":688,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":683,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":640,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":641,"metricType":"nsTiming"},{"name":"records read","accumulatorId":638,"metricType":"sum"},{"name":"local bytes read","accumulatorId":636,"metricType":"size"},{"name":"fetch wait time","accumulatorId":637,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":634,"metricType":"size"},{"name":"local blocks read","accumulatorId":633,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":632,"metricType":"sum"},{"name":"data size","accumulatorId":631,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":635,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":639,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":680,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":681,"metricType":"timing"},{"name":"peak memory","accumulatorId":679,"metricType":"size"},{"name":"number of output rows","accumulatorId":678,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":682,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":675,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":676,"metricType":"timing"},{"name":"peak memory","accumulatorId":674,"metricType":"size"},{"name":"number of output rows","accumulatorId":673,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":677,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":651,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":652,"metricType":"nsTiming"},{"name":"records read","accumulatorId":649,"metricType":"sum"},{"name":"local bytes read","accumulatorId":647,"metricType":"size"},{"name":"fetch wait time","accumulatorId":648,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":645,"metricType":"size"},{"name":"local blocks read","accumulatorId":644,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":643,"metricType":"sum"},{"name":"data size","accumulatorId":642,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":646,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":650,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":670,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":671,"metricType":"timing"},{"name":"peak memory","accumulatorId":669,"metricType":"size"},{"name":"number of output rows","accumulatorId":668,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":672,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":665,"metricType":"timing"},{"name":"peak memory","accumulatorId":666,"metricType":"size"},{"name":"spill size","accumulatorId":667,"metricType":"size"}]},{"nodeName":"Sort","simpleString":"Sort [Site#485 ASC NULLS FIRST], false, 0","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#485], functions=[avg(cast(levenshtein_distance#215 as bigint))])","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 1","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#485, 1000), true, [id=#455]","children":[{"nodeName":"WholeStageCodegen (4)","simpleString":"WholeStageCodegen (4)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#485], functions=[partial_avg(cast(levenshtein_distance#215 as bigint))])","children":[{"nodeName":"Project","simpleString":"Project [url_host_registered_domain#13 AS Site#485, pythonUDF0#2422 AS levenshtein_distance#215]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [levenshtein_distance(wikipedia, url_host_2nd_last_part#8)], [pythonUDF0#2422]","children":[{"nodeName":"WholeStageCodegen (3)","simpleString":"WholeStageCodegen (3)","children":[{"nodeName":"Project","simpleString":"Project [url_host_2nd_last_part#8, url_host_registered_domain#13]","children":[{"nodeName":"Filter","simpleString":"Filter (pythonUDF0#2421 <= 2)","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [levenshtein_distance(wikipedia, url_host_2nd_last_part#8)], [pythonUDF0#2421]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"Filter","simpleString":"Filter ((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(url_host_registered_domain#13))","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [url_host_2nd_last_part#8, url_host_registered_domain#13, subset#33], [isnotnull(subset#33), (subset#33 = warc), isnotnull(url_host_registered_domain#13)]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":630,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":706,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":705,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":704,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":703,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":700,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":701,"metricType":"timing"},{"name":"peak memory","accumulatorId":699,"metricType":"size"},{"name":"number of output rows","accumulatorId":698,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":702,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":697,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":662,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":663,"metricType":"nsTiming"},{"name":"records read","accumulatorId":660,"metricType":"sum"},{"name":"local bytes read","accumulatorId":658,"metricType":"size"},{"name":"fetch wait time","accumulatorId":659,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":656,"metricType":"size"},{"name":"local blocks read","accumulatorId":655,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":654,"metricType":"sum"},{"name":"data size","accumulatorId":653,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":657,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":661,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":694,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":695,"metricType":"timing"},{"name":"peak memory","accumulatorId":693,"metricType":"size"},{"name":"number of output rows","accumulatorId":692,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":696,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":689,"metricType":"timing"},{"name":"peak memory","accumulatorId":690,"metricType":"size"},{"name":"spill size","accumulatorId":691,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":664,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":586,"metricType":"sum"},{"name":"written output","accumulatorId":587,"metricType":"size"},{"name":"number of output rows","accumulatorId":588,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":589,"metricType":"sum"}]}} -{"Event":"SparkListenerJobStart","Job ID":6,"Submission Time":1678162992587,"Stage Infos":[{"Stage ID":10,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":36,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"88\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[35],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":34,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"93\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[33],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":35,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"89\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[34],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":32,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"94\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[31],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":31,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"94\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":33,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"93\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[32],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]}],"Stage IDs":[10],"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"88\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"7","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":10,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":36,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"88\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[35],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":34,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"93\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[33],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":35,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"89\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[34],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":32,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"94\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[31],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":31,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"94\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":33,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"93\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[32],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162992592,"Accumulables":[]},"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"88\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"7","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":70,"Index":0,"Attempt":0,"Launch Time":1678162992616,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":71,"Index":4,"Attempt":0,"Launch Time":1678162992616,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":72,"Index":7,"Attempt":0,"Launch Time":1678162992616,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":73,"Index":5,"Attempt":0,"Launch Time":1678162992617,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":74,"Index":1,"Attempt":0,"Launch Time":1678162992617,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":75,"Index":2,"Attempt":0,"Launch Time":1678162992617,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":76,"Index":6,"Attempt":0,"Launch Time":1678162992618,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":77,"Index":3,"Attempt":0,"Launch Time":1678162992618,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":78,"Index":8,"Attempt":0,"Launch Time":1678162992618,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":79,"Index":12,"Attempt":0,"Launch Time":1678162992618,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":80,"Index":15,"Attempt":0,"Launch Time":1678162992619,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":81,"Index":13,"Attempt":0,"Launch Time":1678162992619,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":82,"Index":9,"Attempt":0,"Launch Time":1678162992619,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":83,"Index":10,"Attempt":0,"Launch Time":1678162992619,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":84,"Index":14,"Attempt":0,"Launch Time":1678162992620,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":85,"Index":11,"Attempt":0,"Launch Time":1678162992620,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":86,"Index":16,"Attempt":0,"Launch Time":1678162992620,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":87,"Index":20,"Attempt":0,"Launch Time":1678162992620,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":88,"Index":23,"Attempt":0,"Launch Time":1678162992621,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":89,"Index":21,"Attempt":0,"Launch Time":1678162992621,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":90,"Index":17,"Attempt":0,"Launch Time":1678162992621,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":91,"Index":18,"Attempt":0,"Launch Time":1678162992621,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":92,"Index":22,"Attempt":0,"Launch Time":1678162992622,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":93,"Index":19,"Attempt":0,"Launch Time":1678162992622,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":94,"Index":24,"Attempt":0,"Launch Time":1678162992623,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":95,"Index":28,"Attempt":0,"Launch Time":1678162992623,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":96,"Index":31,"Attempt":0,"Launch Time":1678162992623,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":97,"Index":29,"Attempt":0,"Launch Time":1678162992624,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":98,"Index":25,"Attempt":0,"Launch Time":1678162992624,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":99,"Index":26,"Attempt":0,"Launch Time":1678162992624,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":100,"Index":30,"Attempt":0,"Launch Time":1678162992625,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":101,"Index":27,"Attempt":0,"Launch Time":1678162992625,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerJobStart","Job ID":7,"Submission Time":1678162992767,"Stage Infos":[{"Stage ID":11,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":46,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"95\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[45],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":45,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"96\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[44],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":41,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"105\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[40],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":39,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"106\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[38],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":37,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"109\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":38,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"109\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[37],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":43,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"100\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[42],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":40,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"105\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[39],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":42,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"101\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[41],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":44,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"100\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[43],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]}],"Stage IDs":[11],"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"95\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"7","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":11,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":46,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"95\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[45],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":45,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"96\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[44],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":41,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"105\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[40],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":39,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"106\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[38],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":37,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"109\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":38,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"109\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[37],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":43,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"100\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[42],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":40,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"105\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[39],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":42,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"101\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[41],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":44,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"100\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[43],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162992775,"Accumulables":[]},"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"95\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"7","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":102,"Index":32,"Attempt":0,"Launch Time":1678162993260,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":103,"Index":4,"Attempt":0,"Launch Time":1678162993261,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":104,"Index":12,"Attempt":0,"Launch Time":1678162993263,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":71,"Index":4,"Attempt":0,"Launch Time":1678162992616,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993263,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"59","Value":"59","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"3","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":46,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":294912,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":4317,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":107330436,"Value":107330436,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":548,"Value":548,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":13825269,"Value":13825269,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":70,"Value":70,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":70,"Executor Deserialize CPU Time":13825269,"Executor Run Time":548,"Executor CPU Time":107330436,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":79,"Index":12,"Attempt":0,"Launch Time":1678162992618,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993263,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"58","Value":"117","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"2","Value":"5","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":92,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":589824,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":8634,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":49446470,"Value":156776906,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":548,"Value":1096,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":14443524,"Value":28268793,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":70,"Value":140,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":70,"Executor Deserialize CPU Time":14443524,"Executor Run Time":548,"Executor CPU Time":49446470,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":105,"Index":20,"Attempt":0,"Launch Time":1678162993264,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":87,"Index":20,"Attempt":0,"Launch Time":1678162992620,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993265,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"58","Value":"175","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"786432","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"2","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":138,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":884736,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":12951,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":102007663,"Value":258784569,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":548,"Value":1644,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":18146400,"Value":46415193,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":70,"Value":210,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":70,"Executor Deserialize CPU Time":18146400,"Executor Run Time":548,"Executor CPU Time":102007663,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":95,"Index":28,"Attempt":0,"Launch Time":1678162992623,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993265,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"58","Value":"233","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"1048576","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"3","Value":"10","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":184,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1179648,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":17268,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":36105286,"Value":294889855,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":549,"Value":2193,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":16079089,"Value":62494282,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":70,"Value":280,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":70,"Executor Deserialize CPU Time":16079089,"Executor Run Time":549,"Executor CPU Time":36105286,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":106,"Index":5,"Attempt":0,"Launch Time":1678162993275,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":81,"Index":13,"Attempt":0,"Launch Time":1678162992619,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993276,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"65","Value":"298","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"1310720","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"10","Value":"20","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":230,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1474560,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":21585,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":63148314,"Value":358038169,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":555,"Value":2748,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":17852949,"Value":80347231,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":76,"Value":356,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":76,"Executor Deserialize CPU Time":17852949,"Executor Run Time":555,"Executor CPU Time":63148314,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":107,"Index":13,"Attempt":0,"Launch Time":1678162993276,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":97,"Index":29,"Attempt":0,"Launch Time":1678162992624,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993277,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"66","Value":"364","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"1572864","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"10","Value":"30","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":276,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1769472,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":25902,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":126023045,"Value":484061214,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":558,"Value":3306,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":20234952,"Value":100582183,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":72,"Value":428,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":72,"Executor Deserialize CPU Time":20234952,"Executor Run Time":558,"Executor CPU Time":126023045,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":108,"Index":21,"Attempt":0,"Launch Time":1678162993278,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":89,"Index":21,"Attempt":0,"Launch Time":1678162992621,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993279,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"66","Value":"430","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"1835008","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"10","Value":"40","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":322,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2064384,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":30219,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":45626052,"Value":529687266,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":559,"Value":3865,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":20882856,"Value":121465039,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":73,"Value":501,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":73,"Executor Deserialize CPU Time":20882856,"Executor Run Time":559,"Executor CPU Time":45626052,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":109,"Index":29,"Attempt":0,"Launch Time":1678162993288,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":73,"Index":5,"Attempt":0,"Launch Time":1678162992617,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993289,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"79","Value":"509","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"2097152","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"0","Value":"40","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":368,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2359296,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":34536,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":75941816,"Value":605629082,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":572,"Value":4437,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":9742046,"Value":131207085,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":73,"Value":574,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":73,"Executor Deserialize CPU Time":9742046,"Executor Run Time":572,"Executor CPU Time":75941816,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":110,"Index":3,"Attempt":0,"Launch Time":1678162993315,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":93,"Index":19,"Attempt":0,"Launch Time":1678162992622,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993316,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"85","Value":"594","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"2359296","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"4","Value":"44","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":414,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2654208,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":38853,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":94505733,"Value":700134815,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":569,"Value":5006,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":23029554,"Value":154236639,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":92,"Value":666,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":92,"Executor Deserialize CPU Time":23029554,"Executor Run Time":569,"Executor CPU Time":94505733,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":111,"Index":11,"Attempt":0,"Launch Time":1678162993317,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":77,"Index":3,"Attempt":0,"Launch Time":1678162992618,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993319,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"86","Value":"680","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"2621440","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"0","Value":"44","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":460,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2949120,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":43170,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":70021600,"Value":770156415,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":579,"Value":5585,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":10729030,"Value":164965669,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":83,"Value":749,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":83,"Executor Deserialize CPU Time":10729030,"Executor Run Time":579,"Executor CPU Time":70021600,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":112,"Index":19,"Attempt":0,"Launch Time":1678162993321,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":113,"Index":27,"Attempt":0,"Launch Time":1678162993322,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":85,"Index":11,"Attempt":0,"Launch Time":1678162992620,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993322,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"86","Value":"766","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"2883584","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"2","Value":"46","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":506,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3244032,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":47487,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":91606129,"Value":861762544,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":568,"Value":6153,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":22569591,"Value":187535260,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":94,"Value":843,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":94,"Executor Deserialize CPU Time":22569591,"Executor Run Time":568,"Executor CPU Time":91606129,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":101,"Index":27,"Attempt":0,"Launch Time":1678162992625,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993323,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"90","Value":"856","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"3145728","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"0","Value":"46","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":552,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3538944,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":51804,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":53094462,"Value":914857006,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":582,"Value":6735,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":13985479,"Value":201520739,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":82,"Value":925,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":82,"Executor Deserialize CPU Time":13985479,"Executor Run Time":582,"Executor CPU Time":53094462,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":114,"Index":6,"Attempt":0,"Launch Time":1678162993329,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":92,"Index":22,"Attempt":0,"Launch Time":1678162992622,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993329,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"80","Value":"936","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"3407872","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"5","Value":"51","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":598,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3833856,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":56121,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":62972629,"Value":977829635,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":591,"Value":7326,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11302985,"Value":212823724,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":75,"Value":1000,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":75,"Executor Deserialize CPU Time":11302985,"Executor Run Time":591,"Executor CPU Time":62972629,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":115,"Index":0,"Attempt":0,"Launch Time":1678162993330,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":70,"Index":0,"Attempt":0,"Launch Time":1678162992616,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993331,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"83","Value":"1019","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"3670016","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"12","Value":"63","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":644,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4128768,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":60438,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":43441534,"Value":1021271169,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":602,"Value":7928,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":13102094,"Value":225925818,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":82,"Value":1082,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":82,"Executor Deserialize CPU Time":13102094,"Executor Run Time":602,"Executor CPU Time":43441534,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":116,"Index":7,"Attempt":0,"Launch Time":1678162993333,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":88,"Index":23,"Attempt":0,"Launch Time":1678162992621,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993334,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"75","Value":"1094","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"3932160","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"2","Value":"65","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":690,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4423680,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":64755,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":73450020,"Value":1094721189,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":604,"Value":8532,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":19132932,"Value":245058750,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":76,"Value":1158,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":76,"Executor Deserialize CPU Time":19132932,"Executor Run Time":604,"Executor CPU Time":73450020,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":117,"Index":15,"Attempt":0,"Launch Time":1678162993337,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":118,"Index":14,"Attempt":0,"Launch Time":1678162993339,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":76,"Index":6,"Attempt":0,"Launch Time":1678162992618,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993339,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"85","Value":"1179","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"4194304","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"11","Value":"76","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":736,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4718592,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":69072,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":59693533,"Value":1154414722,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":589,"Value":9121,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":28968546,"Value":274027296,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":84,"Value":1242,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":84,"Executor Deserialize CPU Time":28968546,"Executor Run Time":589,"Executor CPU Time":59693533,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":119,"Index":8,"Attempt":0,"Launch Time":1678162993340,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":72,"Index":7,"Attempt":0,"Launch Time":1678162992616,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993344,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"75","Value":"1254","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"4456448","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"2","Value":"78","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":782,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5013504,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":73389,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":111674182,"Value":1266088904,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":605,"Value":9726,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":13576867,"Value":287604163,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":75,"Value":1317,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":75,"Executor Deserialize CPU Time":13576867,"Executor Run Time":605,"Executor CPU Time":111674182,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":94,"Index":24,"Attempt":0,"Launch Time":1678162992623,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993344,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"83","Value":"1337","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"14","Value":"92","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":828,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5308416,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":77706,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":87776574,"Value":1353865478,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":601,"Value":10327,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":26186251,"Value":313790414,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":81,"Value":1398,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":81,"Executor Deserialize CPU Time":26186251,"Executor Run Time":601,"Executor CPU Time":87776574,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":80,"Index":15,"Attempt":0,"Launch Time":1678162992619,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993344,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"75","Value":"1412","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"4980736","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"2","Value":"94","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":874,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5603328,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":82023,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":76238967,"Value":1430104445,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":605,"Value":10932,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":18595574,"Value":332385988,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":75,"Value":1473,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":75,"Executor Deserialize CPU Time":18595574,"Executor Run Time":605,"Executor CPU Time":76238967,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":120,"Index":23,"Attempt":0,"Launch Time":1678162993346,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":121,"Index":22,"Attempt":0,"Launch Time":1678162993349,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":84,"Index":14,"Attempt":0,"Launch Time":1678162992620,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993349,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"86","Value":"1498","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"5242880","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"11","Value":"105","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":920,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5898240,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":86340,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":93797328,"Value":1523901773,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":598,"Value":11530,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":15239818,"Value":347625806,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":75,"Value":1548,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":75,"Executor Deserialize CPU Time":15239818,"Executor Run Time":598,"Executor CPU Time":93797328,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":100,"Index":30,"Attempt":0,"Launch Time":1678162992625,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993352,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"86","Value":"1584","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"5505024","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"11","Value":"116","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":966,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6193152,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":90657,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":99239038,"Value":1623140811,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":589,"Value":12119,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":16210661,"Value":363836467,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":83,"Value":1631,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":83,"Executor Deserialize CPU Time":16210661,"Executor Run Time":589,"Executor CPU Time":99239038,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":122,"Index":30,"Attempt":0,"Launch Time":1678162993358,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":123,"Index":16,"Attempt":0,"Launch Time":1678162993361,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":78,"Index":8,"Attempt":0,"Launch Time":1678162992618,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993362,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"87","Value":"1671","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"5767168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"17","Value":"133","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1012,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6488064,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":94974,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":116673417,"Value":1739814228,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":611,"Value":12730,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":14429820,"Value":378266287,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":82,"Value":1713,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":82,"Executor Deserialize CPU Time":14429820,"Executor Run Time":611,"Executor CPU Time":116673417,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":102,"Index":32,"Attempt":0,"Launch Time":1678162993260,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993364,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"47","Value":"1718","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"6029312","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"36","Value":"169","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1058,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6782976,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":713,"Name":"internal.metrics.resultSerializationTime","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4360,"Value":99334,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":13777069,"Value":1753591297,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":60,"Value":12790,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":4583414,"Value":382849701,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":4,"Value":1717,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":4,"Executor Deserialize CPU Time":4583414,"Executor Run Time":60,"Executor CPU Time":13777069,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":0,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":124,"Index":28,"Attempt":0,"Launch Time":1678162993365,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":125,"Index":31,"Attempt":0,"Launch Time":1678162993366,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":96,"Index":31,"Attempt":0,"Launch Time":1678162992623,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993367,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"75","Value":"1793","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"6291456","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"2","Value":"171","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1104,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7077888,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":103651,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":57035925,"Value":1810627222,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":605,"Value":13395,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":15964162,"Value":398813863,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":73,"Value":1790,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":73,"Executor Deserialize CPU Time":15964162,"Executor Run Time":605,"Executor CPU Time":57035925,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":126,"Index":24,"Attempt":0,"Launch Time":1678162993369,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":86,"Index":16,"Attempt":0,"Launch Time":1678162992620,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993370,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"87","Value":"1880","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"6553600","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"16","Value":"187","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1150,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7372800,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":107968,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":77443734,"Value":1888070956,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":614,"Value":14009,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":15695708,"Value":414509571,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":82,"Value":1872,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":82,"Executor Deserialize CPU Time":15695708,"Executor Run Time":614,"Executor CPU Time":77443734,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":127,"Index":2,"Attempt":0,"Launch Time":1678162993426,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":99,"Index":26,"Attempt":0,"Launch Time":1678162992624,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993427,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"90","Value":"1970","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"6815744","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"1","Value":"188","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1196,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7667712,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":712,"Name":"internal.metrics.jvmGCTime","Update":85,"Value":85,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4360,"Value":112328,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":103675938,"Value":1991746894,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":702,"Value":14711,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":15305524,"Value":429815095,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":69,"Value":1941,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":69,"Executor Deserialize CPU Time":15305524,"Executor Run Time":702,"Executor CPU Time":103675938,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":85,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":128,"Index":10,"Attempt":0,"Launch Time":1678162993429,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":83,"Index":10,"Attempt":0,"Launch Time":1678162992619,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993431,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"90","Value":"2060","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"7077888","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"4","Value":"192","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1242,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7962624,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":712,"Name":"internal.metrics.jvmGCTime","Update":85,"Value":170,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4360,"Value":116688,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":72849669,"Value":2064596563,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":703,"Value":15414,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11572278,"Value":441387373,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":73,"Value":2014,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":73,"Executor Deserialize CPU Time":11572278,"Executor Run Time":703,"Executor CPU Time":72849669,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":85,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":129,"Index":18,"Attempt":0,"Launch Time":1678162993433,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":91,"Index":18,"Attempt":0,"Launch Time":1678162992621,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993434,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"90","Value":"2150","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"7340032","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"4","Value":"196","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1288,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8257536,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":712,"Name":"internal.metrics.jvmGCTime","Update":85,"Value":255,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4360,"Value":121048,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":53340942,"Value":2117937505,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":700,"Value":16114,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":26602063,"Value":467989436,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":73,"Value":2087,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":73,"Executor Deserialize CPU Time":26602063,"Executor Run Time":700,"Executor CPU Time":53340942,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":85,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":75,"Index":2,"Attempt":0,"Launch Time":1678162992617,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993435,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"88","Value":"2238","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"7602176","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"3","Value":"199","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1334,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8552448,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":712,"Name":"internal.metrics.jvmGCTime","Update":85,"Value":340,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4360,"Value":125408,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":92150391,"Value":2210087896,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":696,"Value":16810,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":15016377,"Value":483005813,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":82,"Value":2169,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":82,"Executor Deserialize CPU Time":15016377,"Executor Run Time":696,"Executor CPU Time":92150391,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":85,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":130,"Index":26,"Attempt":0,"Launch Time":1678162993435,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":131,"Index":1,"Attempt":0,"Launch Time":1678162993488,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":82,"Index":9,"Attempt":0,"Launch Time":1678162992619,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993489,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"84","Value":"2322","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"7864320","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"3","Value":"202","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1380,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8847360,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":712,"Name":"internal.metrics.jvmGCTime","Update":163,"Value":503,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4360,"Value":129768,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":59336502,"Value":2269424398,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":615,"Value":17425,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":16201272,"Value":499207085,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":228,"Value":2397,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":228,"Executor Deserialize CPU Time":16201272,"Executor Run Time":615,"Executor CPU Time":59336502,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":163,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":132,"Index":9,"Attempt":0,"Launch Time":1678162993491,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":74,"Index":1,"Attempt":0,"Launch Time":1678162992617,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993492,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"84","Value":"2406","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"8126464","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"3","Value":"205","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1426,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9142272,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":712,"Name":"internal.metrics.jvmGCTime","Update":163,"Value":666,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4360,"Value":134128,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":88637061,"Value":2358061459,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":614,"Value":18039,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":21840870,"Value":521047955,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":229,"Value":2626,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":229,"Executor Deserialize CPU Time":21840870,"Executor Run Time":614,"Executor CPU Time":88637061,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":163,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":98,"Index":25,"Attempt":0,"Launch Time":1678162992624,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993493,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"84","Value":"2490","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"8388608","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"3","Value":"208","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1472,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9437184,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":712,"Name":"internal.metrics.jvmGCTime","Update":163,"Value":829,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4360,"Value":138488,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":75279968,"Value":2433341427,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":615,"Value":18654,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":12366169,"Value":533414124,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":226,"Value":2852,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":226,"Executor Deserialize CPU Time":12366169,"Executor Run Time":615,"Executor CPU Time":75279968,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":163,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":133,"Index":17,"Attempt":0,"Launch Time":1678162993493,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":134,"Index":25,"Attempt":0,"Launch Time":1678162993497,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":90,"Index":17,"Attempt":0,"Launch Time":1678162992621,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993497,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"87","Value":"2577","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"8650752","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"3","Value":"211","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1518,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9732096,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":712,"Name":"internal.metrics.jvmGCTime","Update":163,"Value":992,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4360,"Value":142848,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":74041861,"Value":2507383288,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":621,"Value":19275,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":9357807,"Value":542771931,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":228,"Value":3080,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":228,"Executor Deserialize CPU Time":9357807,"Executor Run Time":621,"Executor CPU Time":74041861,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":163,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":10,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":36,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"88\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[35],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":34,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"93\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[33],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":35,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"89\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[34],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":32,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"94\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[31],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":31,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"94\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":33,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"93\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[32],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162992592,"Completion Time":1678162993498,"Accumulables":[{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Value":0,"Internal":true,"Count Failed Values":true},{"ID":687,"Name":"time in aggregation build","Value":"211","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Value":542771931,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Value":142848,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Value":2507383288,"Internal":true,"Count Failed Values":true},{"ID":728,"Name":"internal.metrics.input.bytesRead","Value":1518,"Internal":true,"Count Failed Values":true},{"ID":683,"Name":"duration","Value":"2577","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":713,"Name":"internal.metrics.resultSerializationTime","Value":1,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Value":3080,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Value":9732096,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Value":0,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Value":19275,"Internal":true,"Count Failed Values":true},{"ID":685,"Name":"peak memory","Value":"8650752","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":712,"Name":"internal.metrics.jvmGCTime","Value":992,"Internal":true,"Count Failed Values":true}]}} -{"Event":"SparkListenerJobEnd","Job ID":6,"Completion Time":1678162993504,"Job Result":{"Result":"JobSucceeded"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate","executionId":7,"physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Project [Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L]\n : +- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Repartition 50000, false\n : +- Filter (content_languages#27 = eng)\n : +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n +- Aggregate [Site#485], [Site#485, avg(cast(levenshtein_distance#215 as bigint)) AS Levenshtein_Distance#411]\n +- Filter (((subset#33 = warc) AND (levenshtein_distance#215 <= 2)) AND isnotnull(Site#485))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Project [Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L]\n : +- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Repartition 50000, false\n : +- Filter (content_languages#27 = eng)\n : +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n +- Aggregate [Site#485], [Site#485, avg(cast(levenshtein_distance#215 as bigint)) AS Levenshtein_Distance#411]\n +- Filter (((subset#33 = warc) AND (levenshtein_distance#215 <= 2)) AND isnotnull(Site#485))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247, Date#344], [Site#247, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L]\n : +- Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#2420 AS Date#344]\n : +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#2420]\n : +- Project [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n : +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n : +- Coalesce 50000\n : +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n : +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n : +- *(1) ColumnarToRow\n : +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- AdaptiveSparkPlan isFinalPlan=false\n +- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- SortMergeJoin [Site#247], [Site#485], FullOuter\n :- Sort [Site#247 ASC NULLS FIRST], false, 0\n : +- HashAggregate(keys=[Site#247], functions=[sum(Total_Record_Length#451L), sum(Total_Pages#449L), avg(Total_Pages#449L)], output=[Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L])\n : +- ShuffleQueryStage 2\n : +- Exchange hashpartitioning(Site#247, 1000), true, [id=#508]\n : +- *(5) HashAggregate(keys=[Site#247], functions=[partial_sum(Total_Record_Length#451L), partial_sum(Total_Pages#449L), partial_avg(Total_Pages#449L)], output=[Site#247, sum#2727L, sum#2728L, sum#2729, count#2730L])\n : +- *(5) HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint))], output=[Site#247, Total_Pages#449L, Total_Record_Length#451L])\n : +- CustomShuffleReader coalesced\n : +- ShuffleQueryStage 0\n : +- Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#435]\n : +- *(1) HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint))], output=[Site#247, Date#344, count#2733L, sum#2734L])\n : +- *(1) Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#2420 AS Date#344]\n : +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#2420]\n : +- InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n : +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n : +- Coalesce 50000\n : +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n : +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n : +- *(1) ColumnarToRow\n : +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=false","children":[{"nodeName":"Project","simpleString":"Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]","children":[{"nodeName":"SortMergeJoin","simpleString":"SortMergeJoin [Site#247], [Site#485], FullOuter","children":[{"nodeName":"Sort","simpleString":"Sort [Site#247 ASC NULLS FIRST], false, 0","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247], functions=[sum(Total_Record_Length#451L), sum(Total_Pages#449L), avg(Total_Pages#449L)])","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 2","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, 1000), true, [id=#508]","children":[{"nodeName":"WholeStageCodegen (5)","simpleString":"WholeStageCodegen (5)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247], functions=[partial_sum(Total_Record_Length#451L), partial_sum(Total_Pages#449L), partial_avg(Total_Pages#449L)])","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"CustomShuffleReader","simpleString":"CustomShuffleReader coalesced","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 0","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#435]","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"Project","simpleString":"Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#2420 AS Date#344]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [(fetch_time#20)], [pythonUDF0#2420]","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":614,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":686,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":687,"metricType":"timing"},{"name":"peak memory","accumulatorId":685,"metricType":"size"},{"name":"number of output rows","accumulatorId":684,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":688,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":683,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":640,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":641,"metricType":"nsTiming"},{"name":"records read","accumulatorId":638,"metricType":"sum"},{"name":"local bytes read","accumulatorId":636,"metricType":"size"},{"name":"fetch wait time","accumulatorId":637,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":634,"metricType":"size"},{"name":"local blocks read","accumulatorId":633,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":632,"metricType":"sum"},{"name":"data size","accumulatorId":631,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":635,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":639,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":862,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":863,"metricType":"timing"},{"name":"peak memory","accumulatorId":861,"metricType":"size"},{"name":"number of output rows","accumulatorId":860,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":864,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":857,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":858,"metricType":"timing"},{"name":"peak memory","accumulatorId":856,"metricType":"size"},{"name":"number of output rows","accumulatorId":855,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":859,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":854,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":843,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":844,"metricType":"nsTiming"},{"name":"records read","accumulatorId":841,"metricType":"sum"},{"name":"local bytes read","accumulatorId":839,"metricType":"size"},{"name":"fetch wait time","accumulatorId":840,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":837,"metricType":"size"},{"name":"local blocks read","accumulatorId":836,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":835,"metricType":"sum"},{"name":"data size","accumulatorId":834,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":838,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":842,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":851,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":852,"metricType":"timing"},{"name":"peak memory","accumulatorId":850,"metricType":"size"},{"name":"number of output rows","accumulatorId":849,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":853,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":846,"metricType":"timing"},{"name":"peak memory","accumulatorId":847,"metricType":"size"},{"name":"spill size","accumulatorId":848,"metricType":"size"}]},{"nodeName":"Sort","simpleString":"Sort [Site#485 ASC NULLS FIRST], false, 0","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#485], functions=[avg(cast(levenshtein_distance#215 as bigint))])","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 1","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#485, 1000), true, [id=#455]","children":[{"nodeName":"WholeStageCodegen (4)","simpleString":"WholeStageCodegen (4)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#485], functions=[partial_avg(cast(levenshtein_distance#215 as bigint))])","children":[{"nodeName":"Project","simpleString":"Project [url_host_registered_domain#13 AS Site#485, pythonUDF0#2422 AS levenshtein_distance#215]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [levenshtein_distance(wikipedia, url_host_2nd_last_part#8)], [pythonUDF0#2422]","children":[{"nodeName":"WholeStageCodegen (3)","simpleString":"WholeStageCodegen (3)","children":[{"nodeName":"Project","simpleString":"Project [url_host_2nd_last_part#8, url_host_registered_domain#13]","children":[{"nodeName":"Filter","simpleString":"Filter (pythonUDF0#2421 <= 2)","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [levenshtein_distance(wikipedia, url_host_2nd_last_part#8)], [pythonUDF0#2421]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"Filter","simpleString":"Filter ((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(url_host_registered_domain#13))","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [url_host_2nd_last_part#8, url_host_registered_domain#13, subset#33], [isnotnull(subset#33), (subset#33 = warc), isnotnull(url_host_registered_domain#13)]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":630,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":706,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":705,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":704,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":703,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":700,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":701,"metricType":"timing"},{"name":"peak memory","accumulatorId":699,"metricType":"size"},{"name":"number of output rows","accumulatorId":698,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":702,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":697,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":662,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":663,"metricType":"nsTiming"},{"name":"records read","accumulatorId":660,"metricType":"sum"},{"name":"local bytes read","accumulatorId":658,"metricType":"size"},{"name":"fetch wait time","accumulatorId":659,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":656,"metricType":"size"},{"name":"local blocks read","accumulatorId":655,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":654,"metricType":"sum"},{"name":"data size","accumulatorId":653,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":657,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":661,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":694,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":695,"metricType":"timing"},{"name":"peak memory","accumulatorId":693,"metricType":"size"},{"name":"number of output rows","accumulatorId":692,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":696,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":689,"metricType":"timing"},{"name":"peak memory","accumulatorId":690,"metricType":"size"},{"name":"spill size","accumulatorId":691,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":845,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":586,"metricType":"sum"},{"name":"written output","accumulatorId":587,"metricType":"size"},{"name":"number of output rows","accumulatorId":588,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":589,"metricType":"sum"}]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":135,"Index":32,"Attempt":0,"Launch Time":1678162993608,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":104,"Index":12,"Attempt":0,"Launch Time":1678162993263,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993608,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"51","Value":"51","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"44","Value":"44","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"32","Value":"32","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":46,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":294912,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":737,"Name":"internal.metrics.jvmGCTime","Update":29,"Value":29,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4608,"Value":4608,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":41719006,"Value":41719006,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":278,"Value":278,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":5961550,"Value":5961550,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":57,"Value":57,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":57,"Executor Deserialize CPU Time":5961550,"Executor Run Time":278,"Executor CPU Time":41719006,"Peak Execution Memory":294912,"Result Size":4608,"JVM GC Time":29,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":105,"Index":20,"Attempt":0,"Launch Time":1678162993264,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993611,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"51","Value":"102","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"1","Value":"2","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"35","Value":"79","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"27","Value":"59","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":92,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":589824,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":737,"Name":"internal.metrics.jvmGCTime","Update":29,"Value":58,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4608,"Value":9216,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":77317133,"Value":119036139,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":293,"Value":571,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":8602423,"Value":14563973,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":42,"Value":99,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":42,"Executor Deserialize CPU Time":8602423,"Executor Run Time":293,"Executor CPU Time":77317133,"Peak Execution Memory":294912,"Result Size":4608,"JVM GC Time":29,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":124,"Index":28,"Attempt":0,"Launch Time":1678162993365,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993611,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"51","Value":"153","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"786432","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"2","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"34","Value":"113","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"29","Value":"88","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":138,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":884736,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":737,"Name":"internal.metrics.jvmGCTime","Update":29,"Value":87,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4608,"Value":13824,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":39714216,"Value":158750355,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":229,"Value":800,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":5554408,"Value":20118381,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":7,"Value":106,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":7,"Executor Deserialize CPU Time":5554408,"Executor Run Time":229,"Executor CPU Time":39714216,"Peak Execution Memory":294912,"Result Size":4608,"JVM GC Time":29,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":103,"Index":4,"Attempt":0,"Launch Time":1678162993261,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993611,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"51","Value":"204","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"1048576","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"1","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"35","Value":"148","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"31","Value":"119","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":184,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1179648,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":737,"Name":"internal.metrics.jvmGCTime","Update":29,"Value":116,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4608,"Value":18432,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":80196797,"Value":238947152,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":292,"Value":1092,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11376267,"Value":31494648,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":47,"Value":153,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":47,"Executor Deserialize CPU Time":11376267,"Executor Run Time":292,"Executor CPU Time":80196797,"Peak Execution Memory":294912,"Result Size":4608,"JVM GC Time":29,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":108,"Index":21,"Attempt":0,"Launch Time":1678162993278,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993619,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"70","Value":"274","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"1310720","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"26","Value":"174","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"31","Value":"150","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":230,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1474560,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":22997,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":66593308,"Value":305540460,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":301,"Value":1393,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":5725998,"Value":37220646,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":33,"Value":186,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":33,"Executor Deserialize CPU Time":5725998,"Executor Run Time":301,"Executor CPU Time":66593308,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":106,"Index":5,"Attempt":0,"Launch Time":1678162993275,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993619,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"70","Value":"344","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"1572864","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"35","Value":"209","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"31","Value":"181","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":276,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1769472,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":27562,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":56729766,"Value":362270226,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":302,"Value":1695,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11618007,"Value":48838653,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":34,"Value":220,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":34,"Executor Deserialize CPU Time":11618007,"Executor Run Time":302,"Executor CPU Time":56729766,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":107,"Index":13,"Attempt":0,"Launch Time":1678162993276,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993621,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"70","Value":"414","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"1835008","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"34","Value":"243","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"31","Value":"212","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":322,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2064384,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":32127,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":31946269,"Value":394216495,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":290,"Value":1985,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":10347503,"Value":59186156,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":45,"Value":265,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":45,"Executor Deserialize CPU Time":10347503,"Executor Run Time":290,"Executor CPU Time":31946269,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":109,"Index":29,"Attempt":0,"Launch Time":1678162993288,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993622,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"74","Value":"488","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"2097152","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"24","Value":"267","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"55","Value":"267","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":368,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2359296,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":36692,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":93125827,"Value":487342322,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":303,"Value":2288,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7285418,"Value":66471574,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":22,"Value":287,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":22,"Executor Deserialize CPU Time":7285418,"Executor Run Time":303,"Executor CPU Time":93125827,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":122,"Index":30,"Attempt":0,"Launch Time":1678162993358,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993636,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"55","Value":"543","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"2359296","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"18","Value":"285","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"30","Value":"297","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":414,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2654208,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":41257,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":51850614,"Value":539192936,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":226,"Value":2514,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":8722904,"Value":75194478,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":29,"Value":316,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":29,"Executor Deserialize CPU Time":8722904,"Executor Run Time":226,"Executor CPU Time":51850614,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":114,"Index":6,"Attempt":0,"Launch Time":1678162993329,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993636,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"53","Value":"596","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"2621440","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"18","Value":"303","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"25","Value":"322","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":460,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2949120,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":45822,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":38320265,"Value":577513201,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":224,"Value":2738,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11682990,"Value":86877468,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":70,"Value":386,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":70,"Executor Deserialize CPU Time":11682990,"Executor Run Time":224,"Executor CPU Time":38320265,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":121,"Index":22,"Attempt":0,"Launch Time":1678162993349,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993637,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"52","Value":"648","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"2883584","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"27","Value":"330","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"30","Value":"352","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":506,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3244032,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":50387,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":62434507,"Value":639947708,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":223,"Value":2961,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7898850,"Value":94776318,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":52,"Value":438,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":52,"Executor Deserialize CPU Time":7898850,"Executor Run Time":223,"Executor CPU Time":62434507,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":111,"Index":11,"Attempt":0,"Launch Time":1678162993317,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993637,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"59","Value":"707","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"3145728","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"38","Value":"368","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"46","Value":"398","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":552,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3538944,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":54952,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":30769863,"Value":670717571,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":274,"Value":3235,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6161892,"Value":100938210,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":32,"Value":470,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":32,"Executor Deserialize CPU Time":6161892,"Executor Run Time":274,"Executor CPU Time":30769863,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":110,"Index":3,"Attempt":0,"Launch Time":1678162993315,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993637,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"60","Value":"767","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"3407872","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"45","Value":"413","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"43","Value":"441","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":598,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3833856,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":59517,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":42523999,"Value":713241570,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":272,"Value":3507,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":10038737,"Value":110976947,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":37,"Value":507,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":37,"Executor Deserialize CPU Time":10038737,"Executor Run Time":272,"Executor CPU Time":42523999,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":113,"Index":27,"Attempt":0,"Launch Time":1678162993322,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993638,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"60","Value":"827","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"3670016","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"26","Value":"439","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"27","Value":"468","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":644,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4128768,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":64082,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":81144766,"Value":794386336,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":274,"Value":3781,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6347940,"Value":117324887,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":20,"Value":527,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":20,"Executor Deserialize CPU Time":6347940,"Executor Run Time":274,"Executor CPU Time":81144766,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":112,"Index":19,"Attempt":0,"Launch Time":1678162993321,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993638,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"63","Value":"890","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"3932160","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"3","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"26","Value":"465","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"38","Value":"506","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":690,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4423680,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":68647,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":101172224,"Value":895558560,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":277,"Value":4058,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":9296436,"Value":126621323,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":29,"Value":556,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":29,"Executor Deserialize CPU Time":9296436,"Executor Run Time":277,"Executor CPU Time":101172224,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":118,"Index":14,"Attempt":0,"Launch Time":1678162993339,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993642,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"57","Value":"947","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"4194304","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"31","Value":"496","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"24","Value":"530","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":736,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4718592,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":73212,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":70690768,"Value":966249328,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":224,"Value":4282,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7911847,"Value":134533170,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":61,"Value":617,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":61,"Executor Deserialize CPU Time":7911847,"Executor Run Time":224,"Executor CPU Time":70690768,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":123,"Index":16,"Attempt":0,"Launch Time":1678162993361,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993674,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"73","Value":"1020","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"4456448","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"2","Value":"8","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"58","Value":"554","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"27","Value":"557","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":782,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5013504,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":737,"Name":"internal.metrics.jvmGCTime","Update":20,"Value":136,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4608,"Value":77820,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":66555759,"Value":1032805087,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":250,"Value":4532,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":5647079,"Value":140180249,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":50,"Value":667,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":50,"Executor Deserialize CPU Time":5647079,"Executor Run Time":250,"Executor CPU Time":66555759,"Peak Execution Memory":294912,"Result Size":4608,"JVM GC Time":20,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":119,"Index":8,"Attempt":0,"Launch Time":1678162993340,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993677,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"71","Value":"1091","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"8","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"50","Value":"604","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"22","Value":"579","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":828,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5308416,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":737,"Name":"internal.metrics.jvmGCTime","Update":20,"Value":156,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4608,"Value":82428,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":61645813,"Value":1094450900,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":249,"Value":4781,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6278891,"Value":146459140,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":65,"Value":732,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":65,"Executor Deserialize CPU Time":6278891,"Executor Run Time":249,"Executor CPU Time":61645813,"Peak Execution Memory":294912,"Result Size":4608,"JVM GC Time":20,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":115,"Index":0,"Attempt":0,"Launch Time":1678162993330,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993678,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"72","Value":"1163","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"4980736","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"8","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"21","Value":"625","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"17","Value":"596","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":874,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5603328,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":737,"Name":"internal.metrics.jvmGCTime","Update":20,"Value":176,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4608,"Value":87036,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":67068364,"Value":1161519264,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":251,"Value":5032,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11958819,"Value":158417959,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":83,"Value":815,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":83,"Executor Deserialize CPU Time":11958819,"Executor Run Time":251,"Executor CPU Time":67068364,"Peak Execution Memory":294912,"Result Size":4608,"JVM GC Time":20,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":125,"Index":31,"Attempt":0,"Launch Time":1678162993366,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993684,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"55","Value":"1218","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"5242880","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"1","Value":"9","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"18","Value":"643","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"26","Value":"622","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":920,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5898240,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":91601,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":112839470,"Value":1274358734,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":289,"Value":5321,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":5884460,"Value":164302419,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":6,"Value":821,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":6,"Executor Deserialize CPU Time":5884460,"Executor Run Time":289,"Executor CPU Time":112839470,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":117,"Index":15,"Attempt":0,"Launch Time":1678162993337,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993685,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"55","Value":"1273","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"5505024","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"1","Value":"10","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"19","Value":"662","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"20","Value":"642","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":966,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6193152,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":96166,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":40324442,"Value":1314683176,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":296,"Value":5617,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6918114,"Value":171220533,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":37,"Value":858,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":37,"Executor Deserialize CPU Time":6918114,"Executor Run Time":296,"Executor CPU Time":40324442,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":135,"Index":32,"Attempt":0,"Launch Time":1678162993608,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993685,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"39","Value":"1312","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"5767168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"34","Value":"44","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"40","Value":"702","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"2","Value":"644","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1012,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6488064,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":100731,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":17682869,"Value":1332366045,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":60,"Value":5677,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":4540572,"Value":175761105,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":6,"Value":864,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":6,"Executor Deserialize CPU Time":4540572,"Executor Run Time":60,"Executor CPU Time":17682869,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":116,"Index":7,"Attempt":0,"Launch Time":1678162993333,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993685,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"56","Value":"1368","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"6029312","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"44","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"29","Value":"731","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"67","Value":"711","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1058,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6782976,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":105296,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":42507205,"Value":1374873250,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":302,"Value":5979,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":15397465,"Value":191158570,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":32,"Value":896,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":32,"Executor Deserialize CPU Time":15397465,"Executor Run Time":302,"Executor CPU Time":42507205,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":126,"Index":24,"Attempt":0,"Launch Time":1678162993369,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993685,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"72","Value":"1440","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"6291456","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"1","Value":"45","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"21","Value":"752","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"28","Value":"739","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1104,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7077888,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":737,"Name":"internal.metrics.jvmGCTime","Update":20,"Value":196,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4608,"Value":109904,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":65727307,"Value":1440600557,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":248,"Value":6227,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":9777786,"Value":200936356,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":44,"Value":940,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":44,"Executor Deserialize CPU Time":9777786,"Executor Run Time":248,"Executor CPU Time":65727307,"Peak Execution Memory":294912,"Result Size":4608,"JVM GC Time":20,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":120,"Index":23,"Attempt":0,"Launch Time":1678162993346,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993688,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"73","Value":"1513","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"6553600","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"45","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"18","Value":"770","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"70","Value":"809","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1150,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7372800,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":114469,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":53263629,"Value":1493864186,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":314,"Value":6541,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":8063421,"Value":208999777,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":17,"Value":957,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":17,"Executor Deserialize CPU Time":8063421,"Executor Run Time":314,"Executor CPU Time":53263629,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerJobStart","Job ID":8,"Submission Time":1678162993691,"Stage Infos":[{"Stage ID":12,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":36,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"88\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[35],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":34,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"93\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[33],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":35,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"89\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[34],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":32,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"94\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[31],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":31,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"94\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":33,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"93\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[32],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]},{"Stage ID":13,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":49,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"111\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[48],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":48,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"112\",\"name\":\"WholeStageCodegen (5)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[47],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":47,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"116\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[36],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[12],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]}],"Stage IDs":[12,13],"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"111\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"7","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":13,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":49,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"111\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[48],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":48,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"112\",\"name\":\"WholeStageCodegen (5)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[47],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":47,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"116\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[36],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[12],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162993693,"Accumulables":[]},"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"111\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"7","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerTaskStart","Stage ID":13,"Stage Attempt ID":0,"Task Info":{"Task ID":136,"Index":0,"Attempt":0,"Launch Time":1678162993710,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":131,"Index":1,"Attempt":0,"Launch Time":1678162993488,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993749,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"45","Value":"1558","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"6815744","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"45","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"17","Value":"787","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"35","Value":"844","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1196,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7667712,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":119034,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":45003418,"Value":1538867604,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":199,"Value":6740,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":12149931,"Value":221149708,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":53,"Value":1010,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":53,"Executor Deserialize CPU Time":12149931,"Executor Run Time":199,"Executor CPU Time":45003418,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":134,"Index":25,"Attempt":0,"Launch Time":1678162993497,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993750,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"45","Value":"1603","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"7077888","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"45","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"26","Value":"813","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"35","Value":"879","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1242,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7962624,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":123599,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":36764629,"Value":1575632233,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":197,"Value":6937,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6923444,"Value":228073152,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":45,"Value":1055,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":45,"Executor Deserialize CPU Time":6923444,"Executor Run Time":197,"Executor CPU Time":36764629,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":132,"Index":9,"Attempt":0,"Launch Time":1678162993491,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993750,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"45","Value":"1648","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"7340032","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"1","Value":"46","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"17","Value":"830","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"35","Value":"914","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1288,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8257536,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":128164,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":77263998,"Value":1652896231,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":197,"Value":7134,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6761483,"Value":234834635,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":52,"Value":1107,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":52,"Executor Deserialize CPU Time":6761483,"Executor Run Time":197,"Executor CPU Time":77263998,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":133,"Index":17,"Attempt":0,"Launch Time":1678162993493,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993752,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"50","Value":"1698","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"7602176","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"6","Value":"52","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"17","Value":"847","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"24","Value":"938","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1334,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8552448,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":132729,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":63045017,"Value":1715941248,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":200,"Value":7334,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6218513,"Value":241053148,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":52,"Value":1159,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":52,"Executor Deserialize CPU Time":6218513,"Executor Run Time":200,"Executor CPU Time":63045017,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":129,"Index":18,"Attempt":0,"Launch Time":1678162993433,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993764,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"56","Value":"1754","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"7864320","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"52","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"50","Value":"897","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"26","Value":"964","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1380,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8847360,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":137294,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":34917946,"Value":1750859194,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":231,"Value":7565,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6281482,"Value":247334630,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":82,"Value":1241,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":82,"Executor Deserialize CPU Time":6281482,"Executor Run Time":231,"Executor CPU Time":34917946,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":130,"Index":26,"Attempt":0,"Launch Time":1678162993435,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993764,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"56","Value":"1810","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"8126464","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"52","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"41","Value":"938","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"21","Value":"985","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1426,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9142272,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":141859,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":42424074,"Value":1793283268,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":231,"Value":7796,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":5374857,"Value":252709487,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":85,"Value":1326,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":85,"Executor Deserialize CPU Time":5374857,"Executor Run Time":231,"Executor CPU Time":42424074,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":128,"Index":10,"Attempt":0,"Launch Time":1678162993429,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993765,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"56","Value":"1866","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"8388608","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"52","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"41","Value":"979","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"19","Value":"1004","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1472,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9437184,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":146424,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":109365502,"Value":1902648770,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":228,"Value":8024,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7709189,"Value":260418676,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":84,"Value":1410,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":84,"Executor Deserialize CPU Time":7709189,"Executor Run Time":228,"Executor CPU Time":109365502,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":127,"Index":2,"Attempt":0,"Launch Time":1678162993426,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993765,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"56","Value":"1922","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"8650752","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"52","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"41","Value":"1020","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"26","Value":"1030","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1518,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9732096,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":150989,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":51522399,"Value":1954171169,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":230,"Value":8254,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":16458192,"Value":276876868,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":101,"Value":1511,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":101,"Executor Deserialize CPU Time":16458192,"Executor Run Time":230,"Executor CPU Time":51522399,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":11,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":46,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"95\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[45],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":45,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"96\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[44],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":41,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"105\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[40],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":39,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"106\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[38],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":37,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"109\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":38,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"109\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[37],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":43,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"100\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[42],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":40,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"105\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[39],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":42,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"101\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[41],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":44,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"100\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[43],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162992775,"Completion Time":1678162993768,"Accumulables":[{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Value":1511,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Value":9732096,"Internal":true,"Count Failed Values":true},{"ID":705,"Name":"duration","Value":"1030","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Value":"8650752","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":735,"Name":"internal.metrics.executorCpuTime","Value":1954171169,"Internal":true,"Count Failed Values":true},{"ID":753,"Name":"internal.metrics.input.bytesRead","Value":1518,"Internal":true,"Count Failed Values":true},{"ID":737,"Name":"internal.metrics.jvmGCTime","Value":196,"Internal":true,"Count Failed Values":true},{"ID":701,"Name":"time in aggregation build","Value":"52","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Value":0,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Value":8254,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Value":276876868,"Internal":true,"Count Failed Values":true},{"ID":697,"Name":"duration","Value":"1922","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":736,"Name":"internal.metrics.resultSize","Value":150989,"Internal":true,"Count Failed Values":true},{"ID":703,"Name":"duration","Value":"1020","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Value":0,"Internal":true,"Count Failed Values":true}]}} -{"Event":"SparkListenerJobEnd","Job ID":7,"Completion Time":1678162993775,"Job Result":{"Result":"JobSucceeded"}} -{"Event":"SparkListenerTaskEnd","Stage ID":13,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":136,"Index":0,"Attempt":0,"Launch Time":1678162993710,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993900,"Failed":false,"Killed":false,"Accumulables":[{"ID":854,"Name":"duration","Update":"33","Value":"33","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":856,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":858,"Name":"time in aggregation build","Update":"4","Value":"4","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":861,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":863,"Name":"time in aggregation build","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":882,"Name":"internal.metrics.shuffle.read.recordsRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":881,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":880,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":879,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":878,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":877,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":876,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":874,"Name":"internal.metrics.peakExecutionMemory","Update":557056,"Value":557056,"Internal":true,"Count Failed Values":true},{"ID":869,"Name":"internal.metrics.resultSize","Update":5959,"Value":5959,"Internal":true,"Count Failed Values":true},{"ID":868,"Name":"internal.metrics.executorCpuTime","Update":123565595,"Value":123565595,"Internal":true,"Count Failed Values":true},{"ID":867,"Name":"internal.metrics.executorRunTime","Update":133,"Value":133,"Internal":true,"Count Failed Values":true},{"ID":866,"Name":"internal.metrics.executorDeserializeCpuTime","Update":41965992,"Value":41965992,"Internal":true,"Count Failed Values":true},{"ID":865,"Name":"internal.metrics.executorDeserializeTime","Update":50,"Value":50,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":50,"Executor Deserialize CPU Time":41965992,"Executor Run Time":133,"Executor CPU Time":123565595,"Peak Execution Memory":557056,"Result Size":5959,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":13,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":49,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"111\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[48],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":48,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"112\",\"name\":\"WholeStageCodegen (5)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[47],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":47,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"116\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[36],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[12],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162993693,"Completion Time":1678162993901,"Accumulables":[{"ID":882,"Name":"internal.metrics.shuffle.read.recordsRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":863,"Name":"time in aggregation build","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":881,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Value":0,"Internal":true,"Count Failed Values":true},{"ID":866,"Name":"internal.metrics.executorDeserializeCpuTime","Value":41965992,"Internal":true,"Count Failed Values":true},{"ID":869,"Name":"internal.metrics.resultSize","Value":5959,"Internal":true,"Count Failed Values":true},{"ID":878,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":854,"Name":"duration","Value":"33","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":877,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":880,"Name":"internal.metrics.shuffle.read.localBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":856,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":865,"Name":"internal.metrics.executorDeserializeTime","Value":50,"Internal":true,"Count Failed Values":true},{"ID":874,"Name":"internal.metrics.peakExecutionMemory","Value":557056,"Internal":true,"Count Failed Values":true},{"ID":868,"Name":"internal.metrics.executorCpuTime","Value":123565595,"Internal":true,"Count Failed Values":true},{"ID":858,"Name":"time in aggregation build","Value":"4","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":867,"Name":"internal.metrics.executorRunTime","Value":133,"Internal":true,"Count Failed Values":true},{"ID":876,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":861,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":879,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Value":0,"Internal":true,"Count Failed Values":true}]}} -{"Event":"SparkListenerJobEnd","Job ID":8,"Completion Time":1678162993903,"Job Result":{"Result":"JobSucceeded"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate","executionId":7,"physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Project [Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L]\n : +- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Repartition 50000, false\n : +- Filter (content_languages#27 = eng)\n : +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n +- Aggregate [Site#485], [Site#485, avg(cast(levenshtein_distance#215 as bigint)) AS Levenshtein_Distance#411]\n +- Filter (((subset#33 = warc) AND (levenshtein_distance#215 <= 2)) AND isnotnull(Site#485))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Project [Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L]\n : +- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Repartition 50000, false\n : +- Filter (content_languages#27 = eng)\n : +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n +- Aggregate [Site#485], [Site#485, avg(cast(levenshtein_distance#215 as bigint)) AS Levenshtein_Distance#411]\n +- Filter (((subset#33 = warc) AND (levenshtein_distance#215 <= 2)) AND isnotnull(Site#485))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247, Date#344], [Site#247, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L]\n : +- Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#2420 AS Date#344]\n : +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#2420]\n : +- Project [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n : +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n : +- Coalesce 50000\n : +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n : +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n : +- *(1) ColumnarToRow\n : +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- AdaptiveSparkPlan isFinalPlan=true\n +- *(8) Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- SortMergeJoin [Site#247], [Site#485], FullOuter\n :- *(6) Sort [Site#247 ASC NULLS FIRST], false, 0\n : +- *(6) HashAggregate(keys=[Site#247], functions=[sum(Total_Record_Length#451L), sum(Total_Pages#449L), avg(Total_Pages#449L)], output=[Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L])\n : +- CustomShuffleReader coalesced\n : +- ShuffleQueryStage 2\n : +- Exchange hashpartitioning(Site#247, 1000), true, [id=#508]\n : +- *(5) HashAggregate(keys=[Site#247], functions=[partial_sum(Total_Record_Length#451L), partial_sum(Total_Pages#449L), partial_avg(Total_Pages#449L)], output=[Site#247, sum#2727L, sum#2728L, sum#2729, count#2730L])\n : +- *(5) HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint))], output=[Site#247, Total_Pages#449L, Total_Record_Length#451L])\n : +- CustomShuffleReader coalesced\n : +- ShuffleQueryStage 0\n : +- Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#435]\n : +- *(1) HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint))], output=[Site#247, Date#344, count#2733L, sum#2734L])\n : +- *(1) Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#2420 AS Date#344]\n : +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#2420]\n : +- InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n : +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n : +- Coalesce 50000\n : +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n : +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n : +- *(1) ColumnarToRow\n : +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=true","children":[{"nodeName":"WholeStageCodegen (8)","simpleString":"WholeStageCodegen (8)","children":[{"nodeName":"Project","simpleString":"Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"SortMergeJoin","simpleString":"SortMergeJoin [Site#247], [Site#485], FullOuter","children":[{"nodeName":"WholeStageCodegen (6)","simpleString":"WholeStageCodegen (6)","children":[{"nodeName":"Sort","simpleString":"Sort [Site#247 ASC NULLS FIRST], false, 0","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247], functions=[sum(Total_Record_Length#451L), sum(Total_Pages#449L), avg(Total_Pages#449L)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"CustomShuffleReader","simpleString":"CustomShuffleReader coalesced","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 2","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, 1000), true, [id=#508]","children":[{"nodeName":"WholeStageCodegen (5)","simpleString":"WholeStageCodegen (5)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247], functions=[partial_sum(Total_Record_Length#451L), partial_sum(Total_Pages#449L), partial_avg(Total_Pages#449L)])","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"CustomShuffleReader","simpleString":"CustomShuffleReader coalesced","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 0","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#435]","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"Project","simpleString":"Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#2420 AS Date#344]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [(fetch_time#20)], [pythonUDF0#2420]","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":614,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":686,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":687,"metricType":"timing"},{"name":"peak memory","accumulatorId":685,"metricType":"size"},{"name":"number of output rows","accumulatorId":684,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":688,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":683,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":640,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":641,"metricType":"nsTiming"},{"name":"records read","accumulatorId":638,"metricType":"sum"},{"name":"local bytes read","accumulatorId":636,"metricType":"size"},{"name":"fetch wait time","accumulatorId":637,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":634,"metricType":"size"},{"name":"local blocks read","accumulatorId":633,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":632,"metricType":"sum"},{"name":"data size","accumulatorId":631,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":635,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":639,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":862,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":863,"metricType":"timing"},{"name":"peak memory","accumulatorId":861,"metricType":"size"},{"name":"number of output rows","accumulatorId":860,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":864,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":857,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":858,"metricType":"timing"},{"name":"peak memory","accumulatorId":856,"metricType":"size"},{"name":"number of output rows","accumulatorId":855,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":859,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":854,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":843,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":844,"metricType":"nsTiming"},{"name":"records read","accumulatorId":841,"metricType":"sum"},{"name":"local bytes read","accumulatorId":839,"metricType":"size"},{"name":"fetch wait time","accumulatorId":840,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":837,"metricType":"size"},{"name":"local blocks read","accumulatorId":836,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":835,"metricType":"sum"},{"name":"data size","accumulatorId":834,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":838,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":842,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":942,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":943,"metricType":"timing"},{"name":"peak memory","accumulatorId":941,"metricType":"size"},{"name":"number of output rows","accumulatorId":940,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":944,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":937,"metricType":"timing"},{"name":"peak memory","accumulatorId":938,"metricType":"size"},{"name":"spill size","accumulatorId":939,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":936,"metricType":"timing"}]},{"nodeName":"WholeStageCodegen (7)","simpleString":"WholeStageCodegen (7)","children":[{"nodeName":"Sort","simpleString":"Sort [Site#485 ASC NULLS FIRST], false, 0","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#485], functions=[avg(cast(levenshtein_distance#215 as bigint))])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"CustomShuffleReader","simpleString":"CustomShuffleReader coalesced","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 1","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#485, 1000), true, [id=#455]","children":[{"nodeName":"WholeStageCodegen (4)","simpleString":"WholeStageCodegen (4)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#485], functions=[partial_avg(cast(levenshtein_distance#215 as bigint))])","children":[{"nodeName":"Project","simpleString":"Project [url_host_registered_domain#13 AS Site#485, pythonUDF0#2422 AS levenshtein_distance#215]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [levenshtein_distance(wikipedia, url_host_2nd_last_part#8)], [pythonUDF0#2422]","children":[{"nodeName":"WholeStageCodegen (3)","simpleString":"WholeStageCodegen (3)","children":[{"nodeName":"Project","simpleString":"Project [url_host_2nd_last_part#8, url_host_registered_domain#13]","children":[{"nodeName":"Filter","simpleString":"Filter (pythonUDF0#2421 <= 2)","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [levenshtein_distance(wikipedia, url_host_2nd_last_part#8)], [pythonUDF0#2421]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"Filter","simpleString":"Filter ((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(url_host_registered_domain#13))","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [url_host_2nd_last_part#8, url_host_registered_domain#13, subset#33], [isnotnull(subset#33), (subset#33 = warc), isnotnull(url_host_registered_domain#13)]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":630,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":706,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":705,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":704,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":703,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":700,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":701,"metricType":"timing"},{"name":"peak memory","accumulatorId":699,"metricType":"size"},{"name":"number of output rows","accumulatorId":698,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":702,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":697,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":662,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":663,"metricType":"nsTiming"},{"name":"records read","accumulatorId":660,"metricType":"sum"},{"name":"local bytes read","accumulatorId":658,"metricType":"size"},{"name":"fetch wait time","accumulatorId":659,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":656,"metricType":"size"},{"name":"local blocks read","accumulatorId":655,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":654,"metricType":"sum"},{"name":"data size","accumulatorId":653,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":657,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":661,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":951,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":952,"metricType":"timing"},{"name":"peak memory","accumulatorId":950,"metricType":"size"},{"name":"number of output rows","accumulatorId":949,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":953,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":946,"metricType":"timing"},{"name":"peak memory","accumulatorId":947,"metricType":"size"},{"name":"spill size","accumulatorId":948,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":945,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":935,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":934,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":586,"metricType":"sum"},{"name":"written output","accumulatorId":587,"metricType":"size"},{"name":"number of output rows","accumulatorId":588,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":589,"metricType":"sum"}]}} -{"Event":"SparkListenerJobStart","Job ID":9,"Submission Time":1678162994136,"Stage Infos":[{"Stage ID":15,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":36,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"88\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[35],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":34,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"93\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[33],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":35,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"89\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[34],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":32,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"94\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[31],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":31,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"94\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":33,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"93\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[32],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]},{"Stage ID":16,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":49,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"111\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[48],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":48,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"112\",\"name\":\"WholeStageCodegen (5)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[47],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":47,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"116\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[36],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[15],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]},{"Stage ID":17,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":55,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"119\",\"name\":\"WholeStageCodegen (8)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[54],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":54,"Name":"ZippedPartitionsRDD2","Scope":"{\"id\":\"122\",\"name\":\"SortMergeJoin\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[51,53],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":53,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"128\",\"name\":\"WholeStageCodegen (7)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[52],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":51,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"123\",\"name\":\"WholeStageCodegen (6)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[50],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":50,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"127\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[49],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":52,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"132\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[46],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[16,14],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]},{"Stage ID":14,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":46,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"95\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[45],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":45,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"96\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[44],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":41,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"105\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[40],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":39,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"106\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[38],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":37,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"109\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":38,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"109\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[37],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":43,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"100\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[42],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":40,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"105\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[39],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":42,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"101\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[41],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":44,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"100\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[43],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]}],"Stage IDs":[15,16,17,14],"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"86\",\"name\":\"Execute InsertIntoHadoopFsRelationCommand\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"7","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":17,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":55,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"119\",\"name\":\"WholeStageCodegen (8)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[54],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":54,"Name":"ZippedPartitionsRDD2","Scope":"{\"id\":\"122\",\"name\":\"SortMergeJoin\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[51,53],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":53,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"128\",\"name\":\"WholeStageCodegen (7)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[52],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":51,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"123\",\"name\":\"WholeStageCodegen (6)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[50],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":50,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"127\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[49],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":52,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"132\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[46],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[16,14],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162994138,"Accumulables":[]},"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"86\",\"name\":\"Execute InsertIntoHadoopFsRelationCommand\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"7","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerTaskStart","Stage ID":17,"Stage Attempt ID":0,"Task Info":{"Task ID":137,"Index":0,"Attempt":0,"Launch Time":1678162994186,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":17,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":137,"Index":0,"Attempt":0,"Launch Time":1678162994186,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162994875,"Failed":false,"Killed":false,"Accumulables":[{"ID":934,"Name":"duration","Update":"310","Value":"310","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":945,"Name":"duration","Update":"43","Value":"43","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":948,"Name":"spill size","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":947,"Name":"peak memory","Update":"65536","Value":"65536","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":946,"Name":"sort time","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":950,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":952,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":936,"Name":"duration","Update":"101","Value":"101","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":939,"Name":"spill size","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":938,"Name":"peak memory","Update":"65536","Value":"65536","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":937,"Name":"sort time","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":941,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":943,"Name":"time in aggregation build","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":977,"Name":"internal.metrics.output.bytesWritten","Update":105,"Value":105,"Internal":true,"Count Failed Values":true},{"ID":971,"Name":"internal.metrics.shuffle.read.recordsRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":970,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":969,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":968,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":967,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":966,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":965,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":963,"Name":"internal.metrics.peakExecutionMemory","Update":655360,"Value":655360,"Internal":true,"Count Failed Values":true},{"ID":960,"Name":"internal.metrics.resultSerializationTime","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":959,"Name":"internal.metrics.jvmGCTime","Update":88,"Value":88,"Internal":true,"Count Failed Values":true},{"ID":958,"Name":"internal.metrics.resultSize","Update":8973,"Value":8973,"Internal":true,"Count Failed Values":true},{"ID":957,"Name":"internal.metrics.executorCpuTime","Update":376180312,"Value":376180312,"Internal":true,"Count Failed Values":true},{"ID":956,"Name":"internal.metrics.executorRunTime","Update":584,"Value":584,"Internal":true,"Count Failed Values":true},{"ID":955,"Name":"internal.metrics.executorDeserializeCpuTime","Update":89830500,"Value":89830500,"Internal":true,"Count Failed Values":true},{"ID":954,"Name":"internal.metrics.executorDeserializeTime","Update":95,"Value":95,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":95,"Executor Deserialize CPU Time":89830500,"Executor Run Time":584,"Executor CPU Time":376180312,"Peak Execution Memory":655360,"Result Size":8973,"JVM GC Time":88,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":105,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":17,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":55,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"119\",\"name\":\"WholeStageCodegen (8)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[54],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":54,"Name":"ZippedPartitionsRDD2","Scope":"{\"id\":\"122\",\"name\":\"SortMergeJoin\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[51,53],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":53,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"128\",\"name\":\"WholeStageCodegen (7)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[52],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":51,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"123\",\"name\":\"WholeStageCodegen (6)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[50],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":50,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"127\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[49],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":52,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"132\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[46],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[16,14],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162994138,"Completion Time":1678162994876,"Accumulables":[{"ID":941,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":950,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":977,"Name":"internal.metrics.output.bytesWritten","Value":105,"Internal":true,"Count Failed Values":true},{"ID":968,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Value":0,"Internal":true,"Count Failed Values":true},{"ID":959,"Name":"internal.metrics.jvmGCTime","Value":88,"Internal":true,"Count Failed Values":true},{"ID":971,"Name":"internal.metrics.shuffle.read.recordsRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":947,"Name":"peak memory","Value":"65536","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":938,"Name":"peak memory","Value":"65536","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":965,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":956,"Name":"internal.metrics.executorRunTime","Value":584,"Internal":true,"Count Failed Values":true},{"ID":955,"Name":"internal.metrics.executorDeserializeCpuTime","Value":89830500,"Internal":true,"Count Failed Values":true},{"ID":946,"Name":"sort time","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":937,"Name":"sort time","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":967,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":958,"Name":"internal.metrics.resultSize","Value":8973,"Internal":true,"Count Failed Values":true},{"ID":943,"Name":"time in aggregation build","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":970,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Value":0,"Internal":true,"Count Failed Values":true},{"ID":952,"Name":"time in aggregation build","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":934,"Name":"duration","Value":"310","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":960,"Name":"internal.metrics.resultSerializationTime","Value":1,"Internal":true,"Count Failed Values":true},{"ID":969,"Name":"internal.metrics.shuffle.read.localBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":954,"Name":"internal.metrics.executorDeserializeTime","Value":95,"Internal":true,"Count Failed Values":true},{"ID":945,"Name":"duration","Value":"43","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":963,"Name":"internal.metrics.peakExecutionMemory","Value":655360,"Internal":true,"Count Failed Values":true},{"ID":936,"Name":"duration","Value":"101","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":939,"Name":"spill size","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":948,"Name":"spill size","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":957,"Name":"internal.metrics.executorCpuTime","Value":376180312,"Internal":true,"Count Failed Values":true},{"ID":966,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true}]}} -{"Event":"SparkListenerJobEnd","Job ID":9,"Completion Time":1678162994876,"Job Result":{"Result":"JobSucceeded"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerDriverAccumUpdates","executionId":7,"accumUpdates":[[586,1],[587,105],[588,0],[589,0]]} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerQueryExecutionMetrics","executionId":7,"timePerRule":{"PruneFileSourcePartitions":968485,"ReassignLambdaVariableID":458071,"PushPredicateThroughNonJoin":3564620,"Analyzer$HandleNullInputsForUDF":23624,"Analyzer$ResolveSubqueryColumnAliases":13084,"ResolveTimeZone":9392,"Analyzer$ResolveNamespace":40585,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":10261,"RewriteCorrelatedScalarSubquery":1412978,"RemoveLiteralFromGroupExpressions":638734,"PushProjectionThroughUnion":1187865,"EliminateSubqueryAliases":810487,"ResolveCatalogs":15056,"PushLeftSemiLeftAntiThroughJoin":1300255,"FlattenScalarSubqueriesWithAggregates":421222,"LikeSimplification":1400818,"CollapseRepartition":1375546,"ResolveHints$ResolveCoalesceHints":8236,"Analyzer$ExtractGenerator":41577,"RewriteIntersectAll":599542,"ResolveHints$ResolveJoinStrategyHints":9362,"TypeCoercion$MapZipWithCoercion":9931,"NullPropagation":2423557,"PullupCorrelatedPredicates":919992,"UpdateOuterReferences":18996,"ExtractPythonUDFs":9683900,"Analyzer$WindowsSubstitution":12159,"CombineUnions":1320543,"ExtractGroupingPythonUDFFromAggregate":293194,"ReorderAssociativeOperator":1823992,"CleanupDynamicPruningFilters":814900,"ResolveHints$RemoveAllHints":17764,"SimplifyBinaryComparison":2143319,"ResolveTableValuedFunctions":11033,"EliminateSerialization":893367,"TypeCoercion$BooleanEquality":9601,"package$ExpressionCanonicalizer$CleanExpressions":51978,"ReplaceIntersectWithSemiJoin":597646,"ConstantPropagation":1119496,"CostBasedJoinReorder":17485,"Analyzer$ResolveReferences":49414,"CTESubstitution":607868,"RemoveRedundantAliases":4614815,"TypeCoercion$ImplicitTypeCasts":16952,"RewriteExceptAll":688308,"UpdateAttributeNullability":122673,"PropagateEmptyRelation":1256651,"SimplifyCasts":1487533,"EliminateMapObjects":456078,"CombineLimits":961370,"DetectAmbiguousSelfJoin":54610,"ReplaceExpressions":948894,"ResolveInlineTables":9518,"OptimizeIn":1428915,"CollapseWindow":992661,"TypeCoercion$IfCoercion":51283,"ResolveSessionCatalog":22328,"PartitionPruning":5432760,"BooleanSimplification":3166661,"TypeCoercion$PromoteStrings":10293,"Analyzer$ResolveAliases":14254,"DecimalAggregates":505562,"PruneFilters":1666010,"Analyzer$ResolveMissingReferences":29496,"TransposeWindow":1007413,"Analyzer$ResolveRelations":23314,"EliminateUnions":20640,"RewritePredicateSubquery":518475,"ObjectSerializerPruning":273902,"LimitPushDown":1203704,"SimplifyCaseConversionExpressions":1525130,"Analyzer$ResolveNaturalAndUsingJoin":13808,"EliminateView":608174,"CombineTypedFilters":278335,"OptimizeLimitZero":355385,"CheckCartesianProducts":31115,"ExtractPythonUDFFromAggregate":361175,"Analyzer$ExtractWindowExpressions":35358,"ReplaceExceptWithAntiJoin":710824,"ResolveLambdaVariables":13076,"FallBackFileSourceV2":12554,"Analyzer$ResolveTables":12838,"SubstituteUnresolvedOrdinals":8625,"TypeCoercion$CaseWhenCoercion":18357,"DecimalPrecision":15787,"EliminateSorts":1787170,"PushDownLeftSemiAntiJoin":2582198,"ExtractPythonUDFFromJoinCondition":889690,"TypeCoercion$StackCoercion":19042,"Analyzer$ResolveAggAliasInGroupBy":8825,"TypeCoercion$StringLiteralCoercion":15729,"FoldablePropagation":663614,"V2ScanRelationPushDown":791231,"EliminateDistinct":13649,"InferFiltersFromConstraints":1617887,"Analyzer$PullOutNondeterministic":18439,"Analyzer$ResolveFunctions":20690,"ReplaceNullWithFalseInPredicate":1514748,"ResolveHigherOrderFunctions":10852,"Analyzer$ResolvePivot":7788,"CollapseProject":3113855,"Analyzer$ResolveNewInstance":10493,"ColumnPruning":16478446,"Analyzer$ResolveWindowOrder":17480,"TypeCoercion$ConcatCoercion":11757,"PushDownPredicates":6424559,"TimeWindowing":34569,"Optimizer$OptimizeSubqueries":1951143,"RewriteNonCorrelatedExists":989595,"DemoteBroadcastHashJoin":1834545,"TypeCoercion$Division":16469,"ComputeCurrentTime":987701,"ResolveCreateNamedStruct":12365,"TypeCoercion$EltCoercion":35356,"ConvertToLocalRelation":634826,"RemoveRepetitionFromGroupExpressions":641634,"ReplaceDistinctWithAggregate":611322,"PreprocessTableCreation":17681,"ResolveSQLOnFile":12676,"Analyzer$ResolveSubquery":13677,"CombineConcats":30221,"Analyzer$ResolveGroupingAnalytics":12235,"Analyzer$ResolveBinaryArithmetic":11483,"RemoveDispensableExpressions":1434468,"Analyzer$ResolveAlterTableChanges":17707,"ResolveEncodersInScalaAgg":18494,"TypeCoercion$IntegralDivision":15699,"Analyzer$ResolveWindowFrame":15529,"Analyzer$ResolveDeserializer":11365,"RewriteDistinctAggregates":701057,"RemoveNoopOperators":3230208,"Analyzer$ResolveAggregateFunctions":9219,"NormalizeFloatingNumbers":7723553,"ReorderJoin":1309215,"Analyzer$ResolveUpCast":9756,"Analyzer$ResolveGenerate":15652,"TypeCoercion$WidenSetOperationTypes":7941,"EliminateOuterJoin":1184041,"SimplifyExtractValueOps":1273134,"OptimizeMetadataOnlyQuery":15500,"EliminateResolvedHint":2535553,"Analyzer$ResolveInsertInto":17181,"ReplaceExceptWithFilter":592917,"CleanupAliases":24138,"GetCurrentDatabase":1035811,"SchemaPruning":841302,"Analyzer$ResolveOutputRelation":16547,"BloomFilterJoinRule":1535456,"Analyzer$ResolveRandomSeed":9952,"TypeCoercion$WindowFrameCoercion":17154,"ConstantFolding":1470136,"TypeCoercion$DateTimeOperations":16043,"TypeCoercion$InConversion":10911,"FindDataSourceTable":14674,"SimplifyConditionals":1486631,"DataSourceAnalysis":13526,"TypeCoercion$FunctionArgumentConversion":9678,"Analyzer$GlobalAggregates":14321,"Analyzer$LookupFunctions":15287,"CombineFilters":1276526,"ReplaceDeduplicateWithAggregate":399595,"PreprocessTableInsertion":13035},"numRunsPerRule":{"PruneFileSourcePartitions":1,"ReassignLambdaVariableID":1,"PushPredicateThroughNonJoin":1,"Analyzer$HandleNullInputsForUDF":1,"Analyzer$ResolveSubqueryColumnAliases":1,"ResolveTimeZone":1,"Analyzer$ResolveNamespace":1,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":1,"RewriteCorrelatedScalarSubquery":3,"RemoveLiteralFromGroupExpressions":1,"PushProjectionThroughUnion":3,"EliminateSubqueryAliases":1,"ResolveCatalogs":1,"PushLeftSemiLeftAntiThroughJoin":3,"FlattenScalarSubqueriesWithAggregates":1,"LikeSimplification":3,"CollapseRepartition":3,"ResolveHints$ResolveCoalesceHints":1,"Analyzer$ExtractGenerator":1,"RewriteIntersectAll":1,"ResolveHints$ResolveJoinStrategyHints":1,"TypeCoercion$MapZipWithCoercion":1,"NullPropagation":3,"PullupCorrelatedPredicates":1,"UpdateOuterReferences":1,"ExtractPythonUDFs":1,"Analyzer$WindowsSubstitution":1,"CombineUnions":4,"ExtractGroupingPythonUDFFromAggregate":1,"ReorderAssociativeOperator":3,"CleanupDynamicPruningFilters":1,"ResolveHints$RemoveAllHints":1,"SimplifyBinaryComparison":3,"ResolveTableValuedFunctions":1,"EliminateSerialization":3,"TypeCoercion$BooleanEquality":1,"package$ExpressionCanonicalizer$CleanExpressions":4,"ReplaceIntersectWithSemiJoin":1,"ConstantPropagation":3,"CostBasedJoinReorder":1,"Analyzer$ResolveReferences":1,"CTESubstitution":1,"RemoveRedundantAliases":3,"TypeCoercion$ImplicitTypeCasts":1,"RewriteExceptAll":1,"UpdateAttributeNullability":1,"PropagateEmptyRelation":2,"SimplifyCasts":3,"EliminateMapObjects":1,"CombineLimits":3,"DetectAmbiguousSelfJoin":1,"ReplaceExpressions":1,"ResolveInlineTables":1,"OptimizeIn":3,"CollapseWindow":3,"TypeCoercion$IfCoercion":1,"ResolveSessionCatalog":1,"PartitionPruning":1,"BooleanSimplification":3,"TypeCoercion$PromoteStrings":1,"Analyzer$ResolveAliases":1,"DecimalAggregates":1,"PruneFilters":4,"Analyzer$ResolveMissingReferences":1,"TransposeWindow":3,"Analyzer$ResolveRelations":1,"EliminateUnions":1,"RewritePredicateSubquery":1,"ObjectSerializerPruning":1,"LimitPushDown":3,"SimplifyCaseConversionExpressions":3,"Analyzer$ResolveNaturalAndUsingJoin":1,"EliminateView":1,"CombineTypedFilters":1,"OptimizeLimitZero":1,"CheckCartesianProducts":2,"ExtractPythonUDFFromAggregate":1,"Analyzer$ExtractWindowExpressions":1,"ReplaceExceptWithAntiJoin":1,"ResolveLambdaVariables":1,"FallBackFileSourceV2":1,"Analyzer$ResolveTables":1,"SubstituteUnresolvedOrdinals":1,"TypeCoercion$CaseWhenCoercion":1,"DecimalPrecision":1,"EliminateSorts":1,"PushDownLeftSemiAntiJoin":3,"ExtractPythonUDFFromJoinCondition":1,"TypeCoercion$StackCoercion":1,"Analyzer$ResolveAggAliasInGroupBy":1,"TypeCoercion$StringLiteralCoercion":1,"FoldablePropagation":3,"V2ScanRelationPushDown":1,"EliminateDistinct":1,"InferFiltersFromConstraints":1,"Analyzer$PullOutNondeterministic":1,"Analyzer$ResolveFunctions":1,"ReplaceNullWithFalseInPredicate":3,"ResolveHigherOrderFunctions":1,"Analyzer$ResolvePivot":1,"CollapseProject":4,"Analyzer$ResolveNewInstance":1,"ColumnPruning":5,"Analyzer$ResolveWindowOrder":1,"TypeCoercion$ConcatCoercion":1,"PushDownPredicates":5,"TimeWindowing":1,"Optimizer$OptimizeSubqueries":3,"RewriteNonCorrelatedExists":1,"DemoteBroadcastHashJoin":3,"TypeCoercion$Division":1,"ComputeCurrentTime":1,"ResolveCreateNamedStruct":1,"TypeCoercion$EltCoercion":1,"ConvertToLocalRelation":2,"RemoveRepetitionFromGroupExpressions":1,"ReplaceDistinctWithAggregate":1,"PreprocessTableCreation":1,"ResolveSQLOnFile":1,"Analyzer$ResolveSubquery":1,"CombineConcats":3,"Analyzer$ResolveGroupingAnalytics":1,"Analyzer$ResolveBinaryArithmetic":1,"RemoveDispensableExpressions":3,"Analyzer$ResolveAlterTableChanges":1,"ResolveEncodersInScalaAgg":1,"TypeCoercion$IntegralDivision":1,"Analyzer$ResolveWindowFrame":1,"Analyzer$ResolveDeserializer":1,"RewriteDistinctAggregates":1,"RemoveNoopOperators":5,"Analyzer$ResolveAggregateFunctions":1,"NormalizeFloatingNumbers":1,"ReorderJoin":3,"Analyzer$ResolveUpCast":1,"Analyzer$ResolveGenerate":1,"TypeCoercion$WidenSetOperationTypes":1,"EliminateOuterJoin":3,"SimplifyExtractValueOps":3,"OptimizeMetadataOnlyQuery":1,"EliminateResolvedHint":1,"Analyzer$ResolveInsertInto":1,"ReplaceExceptWithFilter":1,"CleanupAliases":1,"GetCurrentDatabase":1,"SchemaPruning":1,"Analyzer$ResolveOutputRelation":1,"BloomFilterJoinRule":1,"Analyzer$ResolveRandomSeed":1,"TypeCoercion$WindowFrameCoercion":1,"ConstantFolding":3,"TypeCoercion$DateTimeOperations":1,"TypeCoercion$InConversion":1,"FindDataSourceTable":1,"SimplifyConditionals":3,"DataSourceAnalysis":1,"TypeCoercion$FunctionArgumentConversion":1,"Analyzer$GlobalAggregates":1,"Analyzer$LookupFunctions":1,"CombineFilters":4,"ReplaceDeduplicateWithAggregate":1,"PreprocessTableInsertion":1},"numEffectiveRunsPerRule":{"PushPredicateThroughNonJoin":1,"EliminateSubqueryAliases":1,"ExtractPythonUDFs":1,"RewritePredicateSubquery":1,"InferFiltersFromConstraints":1,"CollapseProject":1,"ColumnPruning":2,"PushDownPredicates":1,"RemoveNoopOperators":1},"timeEffectiveRunsPerRule":{"PushPredicateThroughNonJoin":3564620,"EliminateSubqueryAliases":810487,"ExtractPythonUDFs":9683900,"RewritePredicateSubquery":518475,"InferFiltersFromConstraints":1617887,"CollapseProject":2178101,"ColumnPruning":11608093,"PushDownPredicates":4785341,"RemoveNoopOperators":1660431},"counters":{},"timers":{}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":7,"time":1678162994964} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":8,"description":"save at NativeMethodAccessorImpl.java:0","details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]\n+- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]\n+- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]\n+- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n +- Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#3357 AS Date#344]\n +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#3357]\n +- Project [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]\n+- AdaptiveSparkPlan isFinalPlan=false\n +- HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint)), avg(cast(warc_record_length#31 as bigint))], output=[Site#247, Date#344, Total_Pages#449L, Total_Record_Length#451L, Avg_Record_Length#453])\n +- Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#602]\n +- HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint)), partial_avg(cast(warc_record_length#31 as bigint))], output=[Site#247, Date#344, count#2733L, sum#2734L, sum#3510, count#3511L])\n +- Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#3357 AS Date#344]\n +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#3357]\n +- InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=false","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint)), avg(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#602]","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint)), partial_avg(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"Project","simpleString":"Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#3357 AS Date#344]","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [(fetch_time#20)], [pythonUDF0#3357]","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":1026,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1023,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1024,"metricType":"timing"},{"name":"peak memory","accumulatorId":1022,"metricType":"size"},{"name":"number of output rows","accumulatorId":1021,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":1025,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":988,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":989,"metricType":"nsTiming"},{"name":"records read","accumulatorId":986,"metricType":"sum"},{"name":"local bytes read","accumulatorId":984,"metricType":"size"},{"name":"fetch wait time","accumulatorId":985,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":982,"metricType":"size"},{"name":"local blocks read","accumulatorId":981,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":980,"metricType":"sum"},{"name":"data size","accumulatorId":979,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":983,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":987,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1018,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1019,"metricType":"timing"},{"name":"peak memory","accumulatorId":1017,"metricType":"size"},{"name":"number of output rows","accumulatorId":1016,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":1020,"metricType":"average"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":1012,"metricType":"sum"},{"name":"written output","accumulatorId":1013,"metricType":"size"},{"name":"number of output rows","accumulatorId":1014,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":1015,"metricType":"sum"}]},"time":1678162995039} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerEffectiveSQLConf","executionId":8,"effectiveSQLConf":{"spark.sql.adaptive.coalescePartitions.initialPartitionNum":"","spark.sql.streaming.disabledV2Writers":"","spark.sql.streaming.noDataMicroBatches.enabled":"true","spark.sql.files.maxPartitionBytes":"128MB","spark.sql.thriftserver.ui.retainedStatements":"200","spark.sql.ui.retainedExecutions":"1000","spark.sql.execution.arrow.pyspark.fallback.enabled":"","spark.sql.sources.parallelPartitionDiscovery.threshold":"32","spark.sql.cbo.joinReorder.enabled":"false","spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.int96AsTimestamp":"true","spark.sql.thriftserver.ui.retainedSessions":"200","spark.sql.hive.verifyPartitionPath":"false","spark.sql.autoBroadcastJoinThreshold":"10MB","spark.sql.orc.mergeSchema":"false","spark.sql.adaptive.enabled":"true","spark.sql.adaptive.skewJoin.enabled":"true","spark.sql.optimizer.sizeBasedJoinReorder.enabled":"true","spark.sql.streaming.ui.retainedQueries":"100","spark.sql.orc.enableVectorizedReader":"true","spark.sql.streaming.continuous.epochBacklogQueueSize":"10000","spark.sql.parquet.binaryAsString":"false","spark.sql.pivotMaxValues":"10000","spark.sql.adaptive.localShuffleReader.enabled":"true","spark.sql.storeAssignmentPolicy":"ANSI","spark.sql.redaction.options.regex":"(?i)url","spark.sql.cbo.joinReorder.dp.star.filter":"false","spark.sql.parser.quotedRegexColumnNames":"false","spark.sql.files.ignoreMissingFiles":"false","spark.sql.streaming.ui.enabled":"true","spark.sql.execution.arrow.sparkr.enabled":"false","spark.sql.orc.filterPushdown":"true","spark.sql.thriftserver.scheduler.pool":"","spark.sql.catalog.spark_catalog":"","spark.sql.datetime.java8API.enabled":"false","spark.sql.execution.pandas.udf.buffer.size":"","spark.sql.function.eltOutputAsString":"false","spark.sql.cbo.joinReorder.dp.threshold":"12","spark.sql.statsImprovements.enabled":"true","spark.sql.parquet.columnarReaderBatchSize":"4096","spark.sql.parquet.outputTimestampType":"INT96","spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes":"256MB","spark.sql.statistics.size.autoUpdate.enabled":"false","spark.sql.hive.filesourcePartitionFileCacheSize":"262144000","spark.sql.streaming.ui.retainedProgressUpdates":"100","spark.sql.inMemoryColumnarStorage.compressed":"true","spark.sql.pyspark.jvmStacktrace.enabled":"false","spark.sql.cbo.starSchemaDetection":"false","spark.sql.columnNameOfCorruptRecord":"_corrupt_record","spark.sql.adaptive.advisoryPartitionSizeInBytes":"","spark.sql.avro.compression.codec":"snappy","spark.sql.streaming.metricsEnabled":"false","spark.sql.ui.pruneCachedInMemoryRelation":"true","spark.sql.event.truncate.length":"2147483647","spark.sql.groupByAliases":"true","spark.sql.hive.metastorePartitionPruning":"true","spark.sql.execution.arrow.enabled":"false","spark.sql.optimizer.dynamicPartitionPruning.enabled":"true","spark.sql.extendedEventInfo":"true","spark.sql.sources.bucketing.maxBuckets":"100000","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.jsonGenerator.ignoreNullFields":"true","spark.sql.optimizer.flattenScalarSubqueriesWithAggregates.enabled":"true","spark.sql.debug.maxToStringFields":"25","spark.sql.files.maxRecordsPerFile":"0","spark.sql.hive.thriftServer.singleSession":"false","spark.sql.parquet.mergeSchema":"false","spark.sql.adaptive.skewJoin.skewedPartitionFactor":"5","spark.sql.queryExecutionListeners":"","spark.sql.streaming.numRecentProgressUpdates":"100","spark.sql.variable.substitute":"true","spark.sql.adaptive.shuffle.mapOutputStats.useDataSize":"true","spark.sql.function.concatBinaryAsString":"false","spark.sql.avro.deflate.level":"-1","spark.sql.repl.eagerEval.maxNumRows":"20","spark.sql.ansi.enabled":"false","spark.sql.legacy.allowHashOnMapType":"false","spark.sql.bloomFilterJoin.maxFilterBytes":"2097152","spark.sql.orderByOrdinal":"true","spark.sql.streaming.stopTimeout":"0","spark.sql.adaptive.coalescePartitions.enabled":"true","spark.sql.session.timeZone":"UTC","spark.sql.parquet.respectSummaryFiles":"false","spark.sql.parquet.enableVectorizedReader":"true","spark.sql.cbo.enabled":"false","spark.sql.sources.bucketing.enabled":"true","spark.sql.csv.filterPushdown.enabled":"true","spark.sql.repl.eagerEval.truncate":"20","spark.sql.cbo.planStats.enabled":"false","spark.sql.optimizer.excludedRules":"","spark.sql.sources.partitionColumnTypeInference.enabled":"true","spark.sql.sortMergeJoinExec.extendedCodegen.enabled":"true","spark.sql.execution.arrow.fallback.enabled":"true","spark.sql.streaming.forceDeleteTempCheckpointLocation":"false","spark.sql.maxPlanStringLength":"2147483632","spark.sql.hive.manageFilesourcePartitions":"true","spark.sql.parquet.int96TimestampConversion":"false","spark.sql.sources.partitionOverwriteMode":"STATIC","spark.sql.streaming.checkpointLocation":"","spark.sql.broadcastTimeout":"300","spark.sql.execution.arrow.pyspark.enabled":"","spark.sql.repl.eagerEval.enabled":"false","spark.sql.mapKeyDedupPolicy":"EXCEPTION","spark.sql.parquet.filterPushdown":"true","spark.sql.maven.additionalRemoteRepositories":"https://maven-central.storage-download.googleapis.com/maven2/","spark.sql.orc.compression.codec":"snappy","spark.sql.statistics.histogram.enabled":"false","spark.sql.streaming.stopActiveRunOnRestart":"true","spark.sql.orc.columnarReaderBatchSize":"4096","spark.sql.redaction.string.regex":"","spark.sql.inMemoryColumnarStorage.batchSize":"10000","spark.sql.parquet.writeLegacyFormat":"false","spark.sql.statistics.fallBackToHdfs":"false","spark.sql.defaultCatalog":"spark_catalog","spark.sql.streaming.fileSource.cleaner.numThreads":"1","spark.sql.legacy.sessionInitWithConfigDefaults":"false","spark.sql.adaptive.coalescePartitions.minPartitionNum":"","spark.sql.execution.arrow.maxRecordsPerBatch":"10000","spark.sql.stringHashComputationWithoutCopyMemory.enabled":"true","spark.sql.streaming.streamingQueryListeners":"","spark.sql.files.ignoreCorruptFiles":"false","spark.sql.inMemoryColumnarStorage.enableVectorizedReader":"true","spark.sql.extensions":"","spark.sql.sources.default":"parquet","spark.sql.ui.extendedInfo":"true","spark.sql.bloomFilterJoin.enabled":"true","spark.sql.parquet.compression.codec":"snappy","spark.sql.optimizer.distinctBeforeIntersect.enabled":"true","spark.sql.parquet.recordLevelFilter.enabled":"false","spark.sql.shuffle.partitions":"200","spark.sql.groupByOrdinal":"true"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate","executionId":8,"physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]\n+- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]\n+- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]\n+- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n +- Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#3357 AS Date#344]\n +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#3357]\n +- Project [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]\n+- AdaptiveSparkPlan isFinalPlan=false\n +- HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint)), avg(cast(warc_record_length#31 as bigint))], output=[Site#247, Date#344, Total_Pages#449L, Total_Record_Length#451L, Avg_Record_Length#453])\n +- ShuffleQueryStage 0\n +- Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#619]\n +- *(1) HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint)), partial_avg(cast(warc_record_length#31 as bigint))], output=[Site#247, Date#344, count#2733L, sum#2734L, sum#3510, count#3511L])\n +- *(1) Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#3357 AS Date#344]\n +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#3357]\n +- InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=false","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint)), avg(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 0","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#619]","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint)), partial_avg(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"Project","simpleString":"Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#3357 AS Date#344]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [(fetch_time#20)], [pythonUDF0#3357]","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":1026,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1046,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1047,"metricType":"timing"},{"name":"peak memory","accumulatorId":1045,"metricType":"size"},{"name":"number of output rows","accumulatorId":1044,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":1048,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":1043,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":1036,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":1037,"metricType":"nsTiming"},{"name":"records read","accumulatorId":1034,"metricType":"sum"},{"name":"local bytes read","accumulatorId":1032,"metricType":"size"},{"name":"fetch wait time","accumulatorId":1033,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":1030,"metricType":"size"},{"name":"local blocks read","accumulatorId":1029,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":1028,"metricType":"sum"},{"name":"data size","accumulatorId":1027,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":1031,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":1035,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1040,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1041,"metricType":"timing"},{"name":"peak memory","accumulatorId":1039,"metricType":"size"},{"name":"number of output rows","accumulatorId":1038,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":1042,"metricType":"average"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":1012,"metricType":"sum"},{"name":"written output","accumulatorId":1013,"metricType":"size"},{"name":"number of output rows","accumulatorId":1014,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":1015,"metricType":"sum"}]}} -{"Event":"SparkListenerJobStart","Job ID":10,"Submission Time":1678162995370,"Stage Infos":[{"Stage ID":18,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":63,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"162\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[62],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":62,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"163\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[61],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":58,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"168\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":59,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"168\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[58],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":60,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"167\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[59],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":61,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"167\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[60],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]}],"Stage IDs":[18],"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"162\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"8","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":18,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":63,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"162\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[62],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":62,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"163\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[61],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":58,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"168\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":59,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"168\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[58],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":60,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"167\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[59],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":61,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"167\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[60],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162995377,"Accumulables":[]},"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"162\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"8","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":138,"Index":4,"Attempt":0,"Launch Time":1678162995389,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":139,"Index":5,"Attempt":0,"Launch Time":1678162995389,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":140,"Index":3,"Attempt":0,"Launch Time":1678162995389,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":141,"Index":2,"Attempt":0,"Launch Time":1678162995389,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":142,"Index":6,"Attempt":0,"Launch Time":1678162995389,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":143,"Index":1,"Attempt":0,"Launch Time":1678162995390,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":144,"Index":7,"Attempt":0,"Launch Time":1678162995390,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":145,"Index":0,"Attempt":0,"Launch Time":1678162995390,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":146,"Index":12,"Attempt":0,"Launch Time":1678162995390,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":147,"Index":13,"Attempt":0,"Launch Time":1678162995390,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":148,"Index":11,"Attempt":0,"Launch Time":1678162995391,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":149,"Index":10,"Attempt":0,"Launch Time":1678162995391,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":150,"Index":14,"Attempt":0,"Launch Time":1678162995391,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":151,"Index":9,"Attempt":0,"Launch Time":1678162995391,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":152,"Index":15,"Attempt":0,"Launch Time":1678162995391,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":153,"Index":8,"Attempt":0,"Launch Time":1678162995392,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":154,"Index":20,"Attempt":0,"Launch Time":1678162995392,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":155,"Index":21,"Attempt":0,"Launch Time":1678162995392,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":156,"Index":19,"Attempt":0,"Launch Time":1678162995392,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":157,"Index":18,"Attempt":0,"Launch Time":1678162995392,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":158,"Index":22,"Attempt":0,"Launch Time":1678162995393,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":159,"Index":17,"Attempt":0,"Launch Time":1678162995393,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":160,"Index":23,"Attempt":0,"Launch Time":1678162995393,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":161,"Index":16,"Attempt":0,"Launch Time":1678162995393,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":162,"Index":28,"Attempt":0,"Launch Time":1678162995393,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":163,"Index":29,"Attempt":0,"Launch Time":1678162995394,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":164,"Index":27,"Attempt":0,"Launch Time":1678162995394,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":165,"Index":26,"Attempt":0,"Launch Time":1678162995394,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":166,"Index":30,"Attempt":0,"Launch Time":1678162995394,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":167,"Index":25,"Attempt":0,"Launch Time":1678162995394,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":168,"Index":31,"Attempt":0,"Launch Time":1678162995395,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":169,"Index":24,"Attempt":0,"Launch Time":1678162995395,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":148,"Index":11,"Attempt":0,"Launch Time":1678162995391,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995543,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"48","Value":"48","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"25","Value":"25","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":46,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":294912,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":4317,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":21320078,"Value":21320078,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":122,"Value":122,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":4953046,"Value":4953046,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":15,"Value":15,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":15,"Executor Deserialize CPU Time":4953046,"Executor Run Time":122,"Executor CPU Time":21320078,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":156,"Index":19,"Attempt":0,"Launch Time":1678162995392,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995543,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"36","Value":"84","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"25","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":92,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":589824,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":8634,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":46373233,"Value":67693311,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":123,"Value":245,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":5155506,"Value":10108552,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":15,"Value":30,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":15,"Executor Deserialize CPU Time":5155506,"Executor Run Time":123,"Executor CPU Time":46373233,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":164,"Index":27,"Attempt":0,"Launch Time":1678162995394,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995544,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"27","Value":"111","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"786432","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"25","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":138,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":884736,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":12951,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":16928643,"Value":84621954,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":118,"Value":363,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":4904786,"Value":15013338,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":18,"Value":48,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":18,"Executor Deserialize CPU Time":4904786,"Executor Run Time":118,"Executor CPU Time":16928643,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":140,"Index":3,"Attempt":0,"Launch Time":1678162995389,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995544,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"48","Value":"159","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"1048576","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"25","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":184,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1179648,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":17268,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":14809047,"Value":99431001,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":121,"Value":484,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":9702007,"Value":24715345,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":17,"Value":65,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":17,"Executor Deserialize CPU Time":9702007,"Executor Run Time":121,"Executor CPU Time":14809047,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":159,"Index":17,"Attempt":0,"Launch Time":1678162995393,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995584,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"35","Value":"194","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"1310720","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":230,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1474560,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":21585,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":23775870,"Value":123206871,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":104,"Value":588,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":4955355,"Value":29670700,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":47,"Value":112,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":47,"Executor Deserialize CPU Time":4955355,"Executor Run Time":104,"Executor CPU Time":23775870,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":143,"Index":1,"Attempt":0,"Launch Time":1678162995390,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995587,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"40","Value":"234","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"1572864","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":276,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1769472,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":25902,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":18792895,"Value":141999766,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":112,"Value":700,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":9486320,"Value":39157020,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":48,"Value":160,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":48,"Executor Deserialize CPU Time":9486320,"Executor Run Time":112,"Executor CPU Time":18792895,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":149,"Index":10,"Attempt":0,"Launch Time":1678162995391,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995589,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"40","Value":"274","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"1835008","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":322,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2064384,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":30219,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":26620393,"Value":168620159,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":154,"Value":854,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":4434561,"Value":43591581,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":20,"Value":180,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":20,"Executor Deserialize CPU Time":4434561,"Executor Run Time":154,"Executor CPU Time":26620393,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":151,"Index":9,"Attempt":0,"Launch Time":1678162995391,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995589,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"36","Value":"310","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"2097152","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":368,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2359296,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":34536,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":40997063,"Value":209617222,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":114,"Value":968,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":3796499,"Value":47388080,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":43,"Value":223,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":43,"Executor Deserialize CPU Time":3796499,"Executor Run Time":114,"Executor CPU Time":40997063,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":141,"Index":2,"Attempt":0,"Launch Time":1678162995389,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995591,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"41","Value":"351","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"2359296","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":414,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2654208,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":38853,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":48829550,"Value":258446772,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":156,"Value":1124,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":9021856,"Value":56409936,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":24,"Value":247,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":24,"Executor Deserialize CPU Time":9021856,"Executor Run Time":156,"Executor CPU Time":48829550,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":167,"Index":25,"Attempt":0,"Launch Time":1678162995394,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995592,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"35","Value":"386","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"2621440","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":460,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2949120,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":43170,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":22386891,"Value":280833663,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":111,"Value":1235,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":3741591,"Value":60151527,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":41,"Value":288,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":41,"Executor Deserialize CPU Time":3741591,"Executor Run Time":111,"Executor CPU Time":22386891,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":165,"Index":26,"Attempt":0,"Launch Time":1678162995394,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995602,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"47","Value":"433","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"2883584","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":506,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3244032,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":47487,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":18851281,"Value":299684944,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":164,"Value":1399,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":5128876,"Value":65280403,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":15,"Value":303,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":15,"Executor Deserialize CPU Time":5128876,"Executor Run Time":164,"Executor CPU Time":18851281,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":157,"Index":18,"Attempt":0,"Launch Time":1678162995392,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995604,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"45","Value":"478","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"3145728","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":552,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3538944,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":51804,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":24384581,"Value":324069525,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":160,"Value":1559,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":4662042,"Value":69942445,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":23,"Value":326,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":23,"Executor Deserialize CPU Time":4662042,"Executor Run Time":160,"Executor CPU Time":24384581,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":144,"Index":7,"Attempt":0,"Launch Time":1678162995390,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995611,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"42","Value":"520","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"3407872","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":598,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3833856,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":56121,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":20518004,"Value":344587529,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":114,"Value":1673,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":9907909,"Value":79850354,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":56,"Value":382,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":56,"Executor Deserialize CPU Time":9907909,"Executor Run Time":114,"Executor CPU Time":20518004,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":168,"Index":31,"Attempt":0,"Launch Time":1678162995395,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995613,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"42","Value":"562","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"3670016","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":644,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4128768,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":60438,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":25982392,"Value":370569921,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":105,"Value":1778,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":5221445,"Value":85071799,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":55,"Value":437,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":55,"Executor Deserialize CPU Time":5221445,"Executor Run Time":105,"Executor CPU Time":25982392,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":152,"Index":15,"Attempt":0,"Launch Time":1678162995391,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995614,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"53","Value":"615","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"3932160","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"7","Value":"57","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":690,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4423680,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":64755,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":48263553,"Value":418833474,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":113,"Value":1891,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":4297031,"Value":89368830,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":66,"Value":503,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":66,"Executor Deserialize CPU Time":4297031,"Executor Run Time":113,"Executor CPU Time":48263553,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":160,"Index":23,"Attempt":0,"Launch Time":1678162995393,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995617,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"53","Value":"668","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"4194304","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"14","Value":"71","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":736,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4718592,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":69072,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":27864481,"Value":446697955,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":113,"Value":2004,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":5283606,"Value":94652436,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":67,"Value":570,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":67,"Executor Deserialize CPU Time":5283606,"Executor Run Time":113,"Executor CPU Time":27864481,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":139,"Index":5,"Attempt":0,"Launch Time":1678162995389,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995636,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"64","Value":"732","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"4456448","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"71","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":782,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5013504,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1054,"Name":"internal.metrics.jvmGCTime","Update":26,"Value":26,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4360,"Value":73432,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":43499716,"Value":490197671,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":210,"Value":2214,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":10877708,"Value":105530144,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":23,"Value":593,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":23,"Executor Deserialize CPU Time":10877708,"Executor Run Time":210,"Executor CPU Time":43499716,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":26,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":155,"Index":21,"Attempt":0,"Launch Time":1678162995392,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995637,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"65","Value":"797","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"71","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":828,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5308416,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1054,"Name":"internal.metrics.jvmGCTime","Update":26,"Value":52,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4360,"Value":77792,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":22685397,"Value":512883068,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":209,"Value":2423,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6545492,"Value":112075636,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":22,"Value":615,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":22,"Executor Deserialize CPU Time":6545492,"Executor Run Time":209,"Executor CPU Time":22685397,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":26,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":166,"Index":30,"Attempt":0,"Launch Time":1678162995394,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995637,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"41","Value":"838","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"4980736","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"71","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":874,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5603328,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":82109,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":64312810,"Value":577195878,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":160,"Value":2583,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7627848,"Value":119703484,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":52,"Value":667,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":52,"Executor Deserialize CPU Time":7627848,"Executor Run Time":160,"Executor CPU Time":64312810,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":142,"Index":6,"Attempt":0,"Launch Time":1678162995389,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995638,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"41","Value":"879","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"5242880","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"71","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":920,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5898240,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":86426,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":23994741,"Value":601190619,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":158,"Value":2741,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11955428,"Value":131658912,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":59,"Value":726,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":59,"Executor Deserialize CPU Time":11955428,"Executor Run Time":158,"Executor CPU Time":23994741,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":147,"Index":13,"Attempt":0,"Launch Time":1678162995390,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995645,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"82","Value":"961","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"5505024","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"71","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":966,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6193152,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1054,"Name":"internal.metrics.jvmGCTime","Update":26,"Value":78,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4360,"Value":90786,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":23600822,"Value":624791441,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":218,"Value":2959,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6490041,"Value":138148953,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":21,"Value":747,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":21,"Executor Deserialize CPU Time":6490041,"Executor Run Time":218,"Executor CPU Time":23600822,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":26,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":158,"Index":22,"Attempt":0,"Launch Time":1678162995393,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995645,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"49","Value":"1010","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"5767168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"71","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1012,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6488064,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":95103,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":24357024,"Value":649148465,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":164,"Value":3123,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7467748,"Value":145616701,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":54,"Value":801,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":54,"Executor Deserialize CPU Time":7467748,"Executor Run Time":164,"Executor CPU Time":24357024,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":150,"Index":14,"Attempt":0,"Launch Time":1678162995391,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995649,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"56","Value":"1066","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"6029312","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"71","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1058,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6782976,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":99420,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":22790601,"Value":671939066,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":172,"Value":3295,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6656514,"Value":152273215,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":53,"Value":854,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":53,"Executor Deserialize CPU Time":6656514,"Executor Run Time":172,"Executor CPU Time":22790601,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":163,"Index":29,"Attempt":0,"Launch Time":1678162995394,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995660,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"96","Value":"1162","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"6291456","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"71","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1104,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7077888,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1054,"Name":"internal.metrics.jvmGCTime","Update":26,"Value":104,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4360,"Value":103780,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":60981377,"Value":732920443,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":232,"Value":3527,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6500114,"Value":158773329,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":20,"Value":874,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":20,"Executor Deserialize CPU Time":6500114,"Executor Run Time":232,"Executor CPU Time":60981377,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":26,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":161,"Index":16,"Attempt":0,"Launch Time":1678162995393,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995697,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"38","Value":"1200","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"6553600","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"71","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1150,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7372800,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":108097,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":28289519,"Value":761209962,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":135,"Value":3662,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":10016289,"Value":168789618,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":117,"Value":991,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":117,"Executor Deserialize CPU Time":10016289,"Executor Run Time":135,"Executor CPU Time":28289519,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":169,"Index":24,"Attempt":0,"Launch Time":1678162995395,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995704,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"39","Value":"1239","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"6815744","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"1","Value":"72","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1196,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7667712,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":112414,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":20108618,"Value":781318580,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":134,"Value":3796,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":5907430,"Value":174697048,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":117,"Value":1108,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":117,"Executor Deserialize CPU Time":5907430,"Executor Run Time":134,"Executor CPU Time":20108618,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":145,"Index":0,"Attempt":0,"Launch Time":1678162995390,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995705,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"37","Value":"1276","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"7077888","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"72","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1242,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7962624,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":116731,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":24900507,"Value":806219087,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":136,"Value":3932,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6176949,"Value":180873997,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":115,"Value":1223,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":115,"Executor Deserialize CPU Time":6176949,"Executor Run Time":136,"Executor CPU Time":24900507,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":153,"Index":8,"Attempt":0,"Launch Time":1678162995392,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995706,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"37","Value":"1313","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"7340032","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"72","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1288,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8257536,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":121048,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":57267920,"Value":863487007,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":134,"Value":4066,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6218641,"Value":187092638,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":117,"Value":1340,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":117,"Executor Deserialize CPU Time":6218641,"Executor Run Time":134,"Executor CPU Time":57267920,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":170,"Index":32,"Attempt":0,"Launch Time":1678162995709,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":154,"Index":20,"Attempt":0,"Launch Time":1678162995392,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995711,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"35","Value":"1348","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"7602176","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"72","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1334,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8552448,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1054,"Name":"internal.metrics.jvmGCTime","Update":182,"Value":286,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4360,"Value":125408,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":37701729,"Value":901188736,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":288,"Value":4354,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":3750740,"Value":190843378,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":17,"Value":1357,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":17,"Executor Deserialize CPU Time":3750740,"Executor Run Time":288,"Executor CPU Time":37701729,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":182,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":138,"Index":4,"Attempt":0,"Launch Time":1678162995389,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995711,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"36","Value":"1384","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"7864320","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"1","Value":"73","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1380,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8847360,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1054,"Name":"internal.metrics.jvmGCTime","Update":182,"Value":468,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4360,"Value":129768,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":24454791,"Value":925643527,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":288,"Value":4642,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7684368,"Value":198527746,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":18,"Value":1375,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":18,"Executor Deserialize CPU Time":7684368,"Executor Run Time":288,"Executor CPU Time":24454791,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":182,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":162,"Index":28,"Attempt":0,"Launch Time":1678162995393,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995711,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"36","Value":"1420","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"8126464","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"1","Value":"74","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1426,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9142272,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1054,"Name":"internal.metrics.jvmGCTime","Update":182,"Value":650,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4360,"Value":134128,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":21696634,"Value":947340161,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":289,"Value":4931,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":3468248,"Value":201995994,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":14,"Value":1389,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":14,"Executor Deserialize CPU Time":3468248,"Executor Run Time":289,"Executor CPU Time":21696634,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":182,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":146,"Index":12,"Attempt":0,"Launch Time":1678162995390,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995716,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"40","Value":"1460","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"8388608","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"74","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1472,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9437184,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1054,"Name":"internal.metrics.jvmGCTime","Update":182,"Value":832,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4360,"Value":138488,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":55265151,"Value":1002605312,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":294,"Value":5225,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":4201860,"Value":206197854,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":16,"Value":1405,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":16,"Executor Deserialize CPU Time":4201860,"Executor Run Time":294,"Executor CPU Time":55265151,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":182,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":170,"Index":32,"Attempt":0,"Launch Time":1678162995709,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995770,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"42","Value":"1502","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"8650752","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"38","Value":"112","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1518,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9732096,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":142805,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":9339226,"Value":1011944538,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":48,"Value":5273,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":3117626,"Value":209315480,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":3,"Value":1408,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":3,"Executor Deserialize CPU Time":3117626,"Executor Run Time":48,"Executor CPU Time":9339226,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":18,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":63,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"162\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[62],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":62,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"163\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[61],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":58,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"168\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":59,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"168\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[58],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":60,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"167\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[59],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":61,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"167\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[60],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162995377,"Completion Time":1678162995771,"Accumulables":[{"ID":1070,"Name":"internal.metrics.input.bytesRead","Value":1518,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Value":1011944538,"Internal":true,"Count Failed Values":true},{"ID":1043,"Name":"duration","Value":"1502","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Value":9732096,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Value":1408,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Value":0,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Value":5273,"Internal":true,"Count Failed Values":true},{"ID":1054,"Name":"internal.metrics.jvmGCTime","Value":832,"Internal":true,"Count Failed Values":true},{"ID":1045,"Name":"peak memory","Value":"8650752","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Value":0,"Internal":true,"Count Failed Values":true},{"ID":1047,"Name":"time in aggregation build","Value":"112","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Value":209315480,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Value":142805,"Internal":true,"Count Failed Values":true}]}} -{"Event":"SparkListenerJobEnd","Job ID":10,"Completion Time":1678162995774,"Job Result":{"Result":"JobSucceeded"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate","executionId":8,"physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]\n+- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]\n+- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]\n+- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n +- Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#3357 AS Date#344]\n +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#3357]\n +- Project [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]\n+- AdaptiveSparkPlan isFinalPlan=true\n +- *(2) HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint)), avg(cast(warc_record_length#31 as bigint))], output=[Site#247, Date#344, Total_Pages#449L, Total_Record_Length#451L, Avg_Record_Length#453])\n +- CustomShuffleReader coalesced\n +- ShuffleQueryStage 0\n +- Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#619]\n +- *(1) HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint)), partial_avg(cast(warc_record_length#31 as bigint))], output=[Site#247, Date#344, count#2733L, sum#2734L, sum#3510, count#3511L])\n +- *(1) Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#3357 AS Date#344]\n +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#3357]\n +- InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=true","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint)), avg(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"CustomShuffleReader","simpleString":"CustomShuffleReader coalesced","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 0","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#619]","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint)), partial_avg(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"Project","simpleString":"Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#3357 AS Date#344]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [(fetch_time#20)], [pythonUDF0#3357]","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":1026,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1046,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1047,"metricType":"timing"},{"name":"peak memory","accumulatorId":1045,"metricType":"size"},{"name":"number of output rows","accumulatorId":1044,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":1048,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":1043,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":1036,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":1037,"metricType":"nsTiming"},{"name":"records read","accumulatorId":1034,"metricType":"sum"},{"name":"local bytes read","accumulatorId":1032,"metricType":"size"},{"name":"fetch wait time","accumulatorId":1033,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":1030,"metricType":"size"},{"name":"local blocks read","accumulatorId":1029,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":1028,"metricType":"sum"},{"name":"data size","accumulatorId":1027,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":1031,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":1035,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1088,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1089,"metricType":"timing"},{"name":"peak memory","accumulatorId":1087,"metricType":"size"},{"name":"number of output rows","accumulatorId":1086,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":1090,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":1085,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":1012,"metricType":"sum"},{"name":"written output","accumulatorId":1013,"metricType":"size"},{"name":"number of output rows","accumulatorId":1014,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":1015,"metricType":"sum"}]}} -{"Event":"SparkListenerJobStart","Job ID":11,"Submission Time":1678162995858,"Stage Infos":[{"Stage ID":19,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":63,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"162\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[62],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":62,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"163\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[61],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":58,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"168\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":59,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"168\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[58],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":60,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"167\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[59],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":61,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"167\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[60],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]},{"Stage ID":20,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":65,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"170\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[64],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":64,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"173\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[63],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[19],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]}],"Stage IDs":[19,20],"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"160\",\"name\":\"Execute InsertIntoHadoopFsRelationCommand\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"8","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":20,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":65,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"170\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[64],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":64,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"173\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[63],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[19],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162995861,"Accumulables":[]},"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"160\",\"name\":\"Execute InsertIntoHadoopFsRelationCommand\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"8","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerTaskStart","Stage ID":20,"Stage Attempt ID":0,"Task Info":{"Task ID":171,"Index":0,"Attempt":0,"Launch Time":1678162995896,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":20,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":171,"Index":0,"Attempt":0,"Launch Time":1678162995896,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162996067,"Failed":false,"Killed":false,"Accumulables":[{"ID":1085,"Name":"duration","Update":"67","Value":"67","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1087,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1089,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1114,"Name":"internal.metrics.output.bytesWritten","Update":60,"Value":60,"Internal":true,"Count Failed Values":true},{"ID":1108,"Name":"internal.metrics.shuffle.read.recordsRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1107,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1106,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1105,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1104,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1103,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1102,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1100,"Name":"internal.metrics.peakExecutionMemory","Update":262144,"Value":262144,"Internal":true,"Count Failed Values":true},{"ID":1095,"Name":"internal.metrics.resultSize","Update":5284,"Value":5284,"Internal":true,"Count Failed Values":true},{"ID":1094,"Name":"internal.metrics.executorCpuTime","Update":34832333,"Value":34832333,"Internal":true,"Count Failed Values":true},{"ID":1093,"Name":"internal.metrics.executorRunTime","Update":145,"Value":145,"Internal":true,"Count Failed Values":true},{"ID":1092,"Name":"internal.metrics.executorDeserializeCpuTime","Update":12183221,"Value":12183221,"Internal":true,"Count Failed Values":true},{"ID":1091,"Name":"internal.metrics.executorDeserializeTime","Update":18,"Value":18,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":18,"Executor Deserialize CPU Time":12183221,"Executor Run Time":145,"Executor CPU Time":34832333,"Peak Execution Memory":262144,"Result Size":5284,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":60,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":20,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":65,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"170\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[64],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":64,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"173\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[63],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[19],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162995861,"Completion Time":1678162996068,"Accumulables":[{"ID":1106,"Name":"internal.metrics.shuffle.read.localBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":1100,"Name":"internal.metrics.peakExecutionMemory","Value":262144,"Internal":true,"Count Failed Values":true},{"ID":1091,"Name":"internal.metrics.executorDeserializeTime","Value":18,"Internal":true,"Count Failed Values":true},{"ID":1094,"Name":"internal.metrics.executorCpuTime","Value":34832333,"Internal":true,"Count Failed Values":true},{"ID":1085,"Name":"duration","Value":"67","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1102,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":1093,"Name":"internal.metrics.executorRunTime","Value":145,"Internal":true,"Count Failed Values":true},{"ID":1105,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Value":0,"Internal":true,"Count Failed Values":true},{"ID":1087,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1114,"Name":"internal.metrics.output.bytesWritten","Value":60,"Internal":true,"Count Failed Values":true},{"ID":1108,"Name":"internal.metrics.shuffle.read.recordsRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":1107,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Value":0,"Internal":true,"Count Failed Values":true},{"ID":1089,"Name":"time in aggregation build","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1092,"Name":"internal.metrics.executorDeserializeCpuTime","Value":12183221,"Internal":true,"Count Failed Values":true},{"ID":1095,"Name":"internal.metrics.resultSize","Value":5284,"Internal":true,"Count Failed Values":true},{"ID":1104,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":1103,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true}]}} -{"Event":"SparkListenerJobEnd","Job ID":11,"Completion Time":1678162996068,"Job Result":{"Result":"JobSucceeded"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerDriverAccumUpdates","executionId":8,"accumUpdates":[[1012,1],[1013,60],[1014,0],[1015,0]]} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerQueryExecutionMetrics","executionId":8,"timePerRule":{"PruneFileSourcePartitions":188612,"ReassignLambdaVariableID":184134,"PushPredicateThroughNonJoin":166386,"Analyzer$HandleNullInputsForUDF":46617,"Analyzer$ResolveSubqueryColumnAliases":11600,"ResolveTimeZone":16037,"Analyzer$ResolveNamespace":12586,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":12616,"RewriteCorrelatedScalarSubquery":430123,"RemoveLiteralFromGroupExpressions":239289,"PushProjectionThroughUnion":437157,"EliminateSubqueryAliases":226003,"ResolveCatalogs":21850,"PushLeftSemiLeftAntiThroughJoin":421808,"FlattenScalarSubqueriesWithAggregates":149181,"LikeSimplification":529164,"CollapseRepartition":440098,"ResolveHints$ResolveCoalesceHints":12966,"Analyzer$ExtractGenerator":40486,"RewriteIntersectAll":220647,"ResolveHints$ResolveJoinStrategyHints":14925,"TypeCoercion$MapZipWithCoercion":17699,"NullPropagation":589498,"PullupCorrelatedPredicates":429342,"UpdateOuterReferences":20881,"ExtractPythonUDFs":1254885,"Analyzer$WindowsSubstitution":18722,"CombineUnions":524076,"ExtractGroupingPythonUDFFromAggregate":109718,"ReorderAssociativeOperator":546434,"CleanupDynamicPruningFilters":256584,"ResolveHints$RemoveAllHints":15542,"SimplifyBinaryComparison":561075,"ResolveTableValuedFunctions":16220,"EliminateSerialization":302829,"TypeCoercion$BooleanEquality":16031,"ReplaceIntersectWithSemiJoin":222723,"ConstantPropagation":312997,"CostBasedJoinReorder":13531,"Analyzer$ResolveReferences":63843,"CTESubstitution":402605,"RemoveRedundantAliases":574766,"TypeCoercion$ImplicitTypeCasts":16195,"RewriteExceptAll":230192,"UpdateAttributeNullability":110136,"PropagateEmptyRelation":306387,"SimplifyCasts":532142,"EliminateMapObjects":174567,"CombineLimits":300207,"DetectAmbiguousSelfJoin":23038,"ReplaceExpressions":427551,"ResolveInlineTables":40151,"OptimizeIn":533844,"CollapseWindow":344669,"TypeCoercion$IfCoercion":15649,"ResolveSessionCatalog":21789,"PartitionPruning":119818,"BooleanSimplification":793660,"TypeCoercion$PromoteStrings":18037,"Analyzer$ResolveAliases":12615,"DecimalAggregates":252082,"PruneFilters":418330,"Analyzer$ResolveMissingReferences":11100,"TransposeWindow":307818,"Analyzer$ResolveRelations":24967,"EliminateUnions":21080,"RewritePredicateSubquery":98644,"ObjectSerializerPruning":96806,"LimitPushDown":423460,"SimplifyCaseConversionExpressions":532735,"Analyzer$ResolveNaturalAndUsingJoin":12389,"EliminateView":236933,"CombineTypedFilters":93276,"OptimizeLimitZero":234686,"CheckCartesianProducts":23093,"ExtractPythonUDFFromAggregate":135765,"Analyzer$ExtractWindowExpressions":15087,"ReplaceExceptWithAntiJoin":248786,"ResolveLambdaVariables":21347,"FallBackFileSourceV2":11627,"Analyzer$ResolveTables":20462,"SubstituteUnresolvedOrdinals":16953,"TypeCoercion$CaseWhenCoercion":15661,"DecimalPrecision":27547,"EliminateSorts":209553,"PushDownLeftSemiAntiJoin":435232,"ExtractPythonUDFFromJoinCondition":105169,"TypeCoercion$StackCoercion":15903,"Analyzer$ResolveAggAliasInGroupBy":12255,"TypeCoercion$StringLiteralCoercion":15509,"FoldablePropagation":137222,"V2ScanRelationPushDown":192701,"EliminateDistinct":15268,"InferFiltersFromConstraints":116628,"Analyzer$PullOutNondeterministic":19588,"Analyzer$ResolveFunctions":18733,"ReplaceNullWithFalseInPredicate":467134,"ResolveHigherOrderFunctions":17679,"Analyzer$ResolvePivot":13829,"CollapseProject":1095786,"Analyzer$ResolveNewInstance":13128,"ColumnPruning":3262916,"Analyzer$ResolveWindowOrder":17028,"TypeCoercion$ConcatCoercion":19713,"PushDownPredicates":733750,"TimeWindowing":43519,"Optimizer$OptimizeSubqueries":903235,"RewriteNonCorrelatedExists":378903,"DemoteBroadcastHashJoin":34426,"TypeCoercion$Division":15097,"ComputeCurrentTime":394549,"ResolveCreateNamedStruct":18953,"TypeCoercion$EltCoercion":19534,"ConvertToLocalRelation":331210,"RemoveRepetitionFromGroupExpressions":284392,"ReplaceDistinctWithAggregate":226364,"PreprocessTableCreation":17613,"ResolveSQLOnFile":12442,"Analyzer$ResolveSubquery":12824,"CombineConcats":28537,"Analyzer$ResolveGroupingAnalytics":17163,"Analyzer$ResolveBinaryArithmetic":17555,"RemoveDispensableExpressions":538044,"Analyzer$ResolveAlterTableChanges":17469,"ResolveEncodersInScalaAgg":19456,"TypeCoercion$IntegralDivision":16314,"Analyzer$ResolveWindowFrame":19166,"Analyzer$ResolveDeserializer":13554,"RewriteDistinctAggregates":267193,"RemoveNoopOperators":628431,"Analyzer$ResolveAggregateFunctions":12185,"NormalizeFloatingNumbers":99694,"ReorderJoin":439224,"Analyzer$ResolveUpCast":13194,"Analyzer$ResolveGenerate":14650,"TypeCoercion$WidenSetOperationTypes":13236,"EliminateOuterJoin":427054,"SimplifyExtractValueOps":435938,"OptimizeMetadataOnlyQuery":11358,"EliminateResolvedHint":481062,"Analyzer$ResolveInsertInto":11335,"ReplaceExceptWithFilter":243004,"CleanupAliases":27501,"GetCurrentDatabase":488690,"SchemaPruning":218230,"Analyzer$ResolveOutputRelation":12851,"BloomFilterJoinRule":109343,"Analyzer$ResolveRandomSeed":12917,"TypeCoercion$WindowFrameCoercion":15565,"ConstantFolding":494746,"TypeCoercion$DateTimeOperations":14989,"TypeCoercion$InConversion":17906,"FindDataSourceTable":14050,"SimplifyConditionals":529169,"DataSourceAnalysis":12374,"TypeCoercion$FunctionArgumentConversion":18393,"Analyzer$GlobalAggregates":11045,"Analyzer$LookupFunctions":23366,"CombineFilters":394027,"ReplaceDeduplicateWithAggregate":240841,"PreprocessTableInsertion":12122},"numRunsPerRule":{"PruneFileSourcePartitions":1,"ReassignLambdaVariableID":1,"PushPredicateThroughNonJoin":1,"Analyzer$HandleNullInputsForUDF":1,"Analyzer$ResolveSubqueryColumnAliases":1,"ResolveTimeZone":1,"Analyzer$ResolveNamespace":1,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":1,"RewriteCorrelatedScalarSubquery":3,"RemoveLiteralFromGroupExpressions":1,"PushProjectionThroughUnion":3,"EliminateSubqueryAliases":1,"ResolveCatalogs":1,"PushLeftSemiLeftAntiThroughJoin":3,"FlattenScalarSubqueriesWithAggregates":1,"LikeSimplification":3,"CollapseRepartition":3,"ResolveHints$ResolveCoalesceHints":1,"Analyzer$ExtractGenerator":1,"RewriteIntersectAll":1,"ResolveHints$ResolveJoinStrategyHints":1,"TypeCoercion$MapZipWithCoercion":1,"NullPropagation":3,"PullupCorrelatedPredicates":1,"UpdateOuterReferences":1,"ExtractPythonUDFs":1,"Analyzer$WindowsSubstitution":1,"CombineUnions":4,"ExtractGroupingPythonUDFFromAggregate":1,"ReorderAssociativeOperator":3,"CleanupDynamicPruningFilters":1,"ResolveHints$RemoveAllHints":1,"SimplifyBinaryComparison":3,"ResolveTableValuedFunctions":1,"EliminateSerialization":3,"TypeCoercion$BooleanEquality":1,"ReplaceIntersectWithSemiJoin":1,"ConstantPropagation":3,"CostBasedJoinReorder":1,"Analyzer$ResolveReferences":1,"CTESubstitution":1,"RemoveRedundantAliases":3,"TypeCoercion$ImplicitTypeCasts":1,"RewriteExceptAll":1,"UpdateAttributeNullability":1,"PropagateEmptyRelation":2,"SimplifyCasts":3,"EliminateMapObjects":1,"CombineLimits":3,"DetectAmbiguousSelfJoin":1,"ReplaceExpressions":1,"ResolveInlineTables":1,"OptimizeIn":3,"CollapseWindow":3,"TypeCoercion$IfCoercion":1,"ResolveSessionCatalog":1,"PartitionPruning":1,"BooleanSimplification":3,"TypeCoercion$PromoteStrings":1,"Analyzer$ResolveAliases":1,"DecimalAggregates":1,"PruneFilters":4,"Analyzer$ResolveMissingReferences":1,"TransposeWindow":3,"Analyzer$ResolveRelations":1,"EliminateUnions":1,"RewritePredicateSubquery":1,"ObjectSerializerPruning":1,"LimitPushDown":3,"SimplifyCaseConversionExpressions":3,"Analyzer$ResolveNaturalAndUsingJoin":1,"EliminateView":1,"CombineTypedFilters":1,"OptimizeLimitZero":1,"CheckCartesianProducts":2,"ExtractPythonUDFFromAggregate":1,"Analyzer$ExtractWindowExpressions":1,"ReplaceExceptWithAntiJoin":1,"ResolveLambdaVariables":1,"FallBackFileSourceV2":1,"Analyzer$ResolveTables":1,"SubstituteUnresolvedOrdinals":1,"TypeCoercion$CaseWhenCoercion":1,"DecimalPrecision":1,"EliminateSorts":1,"PushDownLeftSemiAntiJoin":3,"ExtractPythonUDFFromJoinCondition":1,"TypeCoercion$StackCoercion":1,"Analyzer$ResolveAggAliasInGroupBy":1,"TypeCoercion$StringLiteralCoercion":1,"FoldablePropagation":3,"V2ScanRelationPushDown":1,"EliminateDistinct":1,"InferFiltersFromConstraints":1,"Analyzer$PullOutNondeterministic":1,"Analyzer$ResolveFunctions":1,"ReplaceNullWithFalseInPredicate":3,"ResolveHigherOrderFunctions":1,"Analyzer$ResolvePivot":1,"CollapseProject":4,"Analyzer$ResolveNewInstance":1,"ColumnPruning":5,"Analyzer$ResolveWindowOrder":1,"TypeCoercion$ConcatCoercion":1,"PushDownPredicates":5,"TimeWindowing":1,"Optimizer$OptimizeSubqueries":3,"RewriteNonCorrelatedExists":1,"DemoteBroadcastHashJoin":1,"TypeCoercion$Division":1,"ComputeCurrentTime":1,"ResolveCreateNamedStruct":1,"TypeCoercion$EltCoercion":1,"ConvertToLocalRelation":2,"RemoveRepetitionFromGroupExpressions":1,"ReplaceDistinctWithAggregate":1,"PreprocessTableCreation":1,"ResolveSQLOnFile":1,"Analyzer$ResolveSubquery":1,"CombineConcats":3,"Analyzer$ResolveGroupingAnalytics":1,"Analyzer$ResolveBinaryArithmetic":1,"RemoveDispensableExpressions":3,"Analyzer$ResolveAlterTableChanges":1,"ResolveEncodersInScalaAgg":1,"TypeCoercion$IntegralDivision":1,"Analyzer$ResolveWindowFrame":1,"Analyzer$ResolveDeserializer":1,"RewriteDistinctAggregates":1,"RemoveNoopOperators":5,"Analyzer$ResolveAggregateFunctions":1,"NormalizeFloatingNumbers":1,"ReorderJoin":3,"Analyzer$ResolveUpCast":1,"Analyzer$ResolveGenerate":1,"TypeCoercion$WidenSetOperationTypes":1,"EliminateOuterJoin":3,"SimplifyExtractValueOps":3,"OptimizeMetadataOnlyQuery":1,"EliminateResolvedHint":1,"Analyzer$ResolveInsertInto":1,"ReplaceExceptWithFilter":1,"CleanupAliases":1,"GetCurrentDatabase":1,"SchemaPruning":1,"Analyzer$ResolveOutputRelation":1,"BloomFilterJoinRule":1,"Analyzer$ResolveRandomSeed":1,"TypeCoercion$WindowFrameCoercion":1,"ConstantFolding":3,"TypeCoercion$DateTimeOperations":1,"TypeCoercion$InConversion":1,"FindDataSourceTable":1,"SimplifyConditionals":3,"DataSourceAnalysis":1,"TypeCoercion$FunctionArgumentConversion":1,"Analyzer$GlobalAggregates":1,"Analyzer$LookupFunctions":1,"CombineFilters":4,"ReplaceDeduplicateWithAggregate":1,"PreprocessTableInsertion":1},"numEffectiveRunsPerRule":{"ColumnPruning":2,"CollapseProject":1,"ExtractPythonUDFs":1},"timeEffectiveRunsPerRule":{"ColumnPruning":2542317,"CollapseProject":751354,"ExtractPythonUDFs":1254885},"counters":{},"timers":{}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":8,"time":1678162996163} -{"Event":"SparkListenerApplicationEnd","Timestamp":1678162996233} diff --git a/tests/data/emr-cluster-report.json b/tests/data/emr-cluster-report.json deleted file mode 100644 index a9f5072..0000000 --- a/tests/data/emr-cluster-report.json +++ /dev/null @@ -1,354 +0,0 @@ -{ - "Cluster": { - "Id": "j-14QV64S2PV1Y2", - "Name": "indexdataetl1gb", - "Status": { - "State": "TERMINATED", - "StateChangeReason": { - "Code": "ALL_STEPS_COMPLETED", - "Message": "Steps completed" - }, - "Timeline": { - "CreationDateTime": "2023-03-06T20:14:30.160000-08:00", - "ReadyDateTime": "2023-03-06T20:22:10.554000-08:00", - "EndDateTime": "2023-03-06T20:27:03.197000-08:00" - } - }, - "Ec2InstanceAttributes": { - "Ec2KeyName": "global-key", - "Ec2SubnetId": "subnet-08xlshei29a9202dc", - "RequestedEc2SubnetIds": [ - "subnet-0blasve89vw96b8c5", - "subnet-97svllkja9346a8c4", - "subnet-08dd4lkjser09872f", - "subnet-slv80valk3avnj797", - "subnet-098xxlkqhklwf3lkj", - "subnet-lakjf989h39kajdg7" - ], - "Ec2AvailabilityZone": "us-east-1c", - "RequestedEc2AvailabilityZones": [], - "IamInstanceProfile": "EMR_EC2_DefaultRole", - "EmrManagedMasterSecurityGroup": "sg-alsvewf29837437e5", - "EmrManagedSlaveSecurityGroup": "sg-alsdfj93870342bdf" - }, - "InstanceCollectionType": "INSTANCE_FLEET", - "LogUri": "s3n://my-emr-job-logs/indexdataetl1gb/", - "ReleaseLabel": "emr-6.2.0", - "AutoTerminate": true, - "TerminationProtected": false, - "VisibleToAllUsers": true, - "Applications": [ - { - "Name": "Spark", - "Version": "3.0.1" - } - ], - "Tags": [ - { - "Key": "sync:run-id", - "Value": "f84639ed-7a6a-4496-81e1-b5ba8fa8b6ce" - }, - { - "Key": "Owner", - "Value": "Scott" - }, - { - "Key": "sync:project-id", - "Value": "29f4dded-70be-4344-b9b5-396c8c0481cf" - } - ], - "ServiceRole": "EMR_DefaultRole", - "NormalizedInstanceHours": 76, - "MasterPublicDnsName": "ec2-52-21-192-83.compute-1.amazonaws.com", - "Configurations": [ - { - "Classification": "spark-defaults", - "Properties": { - "spark.dynamicAllocation.enabled": "false", - "spark.eventLog.dir": "s3a://my-emr-projects/29f4dded-70be-4344-b9b5-396c8c0481cf/2023-03-07T04:14:28Z/f84639ed-7a6a-4496-81e1-b5ba8fa8b6ce/eventlog/", - "spark.eventLog.enabled": "true", - "spark.executor.cores": "4", - "spark.executor.instances": "8", - "spark.executor.memory": "4656M", - "spark.executor.processTreeMetrics.enabled": "true" - } - } - ], - "ScaleDownBehavior": "TERMINATE_AT_TASK_COMPLETION", - "ClusterArn": "arn:aws:elasticmapreduce:us-east-1:111122223333:cluster/j-14QB7SA9801Y2", - "StepConcurrencyLevel": 1, - "PlacementGroups": [], - "BootstrapActions": [ - { - "Name": "Packages setup", - "ScriptPath": "s3://my-emr-job-scripts/dummy.sh", - "Args": [] - } - ], - "InstanceFleets": [ - { - "Id": "if-DR8F73EAI88V", - "Name": "Core - 2", - "Status": { - "State": "TERMINATED", - "StateChangeReason": { - "Code": "CLUSTER_TERMINATED", - "Message": "Job flow terminated" - }, - "Timeline": { - "CreationDateTime": "2023-03-06T20:14:30.234000-08:00", - "ReadyDateTime": "2023-03-06T20:22:08.788000-08:00", - "EndDateTime": "2023-03-06T20:27:03.126000-08:00" - } - }, - "InstanceFleetType": "CORE", - "TargetOnDemandCapacity": 1, - "TargetSpotCapacity": 0, - "ProvisionedOnDemandCapacity": 0, - "ProvisionedSpotCapacity": 0, - "InstanceTypeSpecifications": [ - { - "InstanceType": "c5a.8xlarge", - "WeightedCapacity": 1, - "BidPriceAsPercentageOfOnDemandPrice": 100.0, - "EbsBlockDevices": [ - { - "VolumeSpecification": { - "VolumeType": "gp2", - "SizeInGB": 64 - } - } - ] - } - ] - }, - { - "Id": "if-SB7S98AJEMP7", - "Name": "Task - 1", - "Status": { - "State": "TERMINATED", - "StateChangeReason": { - "Code": "CLUSTER_TERMINATED", - "Message": "Job flow terminated" - }, - "Timeline": { - "CreationDateTime": "2023-03-06T20:14:30.238000-08:00", - "ReadyDateTime": "2023-03-06T20:22:10.574000-08:00", - "EndDateTime": "2023-03-06T20:27:03.126000-08:00" - } - }, - "InstanceFleetType": "TASK", - "TargetOnDemandCapacity": 0, - "TargetSpotCapacity": 1, - "ProvisionedOnDemandCapacity": 0, - "ProvisionedSpotCapacity": 0, - "InstanceTypeSpecifications": [ - { - "InstanceType": "m4.large", - "WeightedCapacity": 1, - "BidPriceAsPercentageOfOnDemandPrice": 100.0, - "EbsBlockDevices": [ - { - "VolumeSpecification": { - "VolumeType": "gp2", - "SizeInGB": 32 - } - } - ] - } - ], - "LaunchSpecifications": { - "SpotSpecification": { - "TimeoutDurationMinutes": 120, - "TimeoutAction": "TERMINATE_CLUSTER" - } - } - }, - { - "Id": "if-1HD2lk5lfl23H", - "Name": "Master node", - "Status": { - "State": "TERMINATED", - "StateChangeReason": { - "Code": "CLUSTER_TERMINATED", - "Message": "Job flow terminated" - }, - "Timeline": { - "CreationDateTime": "2023-03-06T20:14:30.230000-08:00", - "ReadyDateTime": "2023-03-06T20:21:51.635000-08:00", - "EndDateTime": "2023-03-06T20:27:03.126000-08:00" - } - }, - "InstanceFleetType": "MASTER", - "TargetOnDemandCapacity": 1, - "TargetSpotCapacity": 0, - "ProvisionedOnDemandCapacity": 0, - "ProvisionedSpotCapacity": 0, - "InstanceTypeSpecifications": [ - { - "InstanceType": "m4.xlarge", - "WeightedCapacity": 1, - "BidPriceAsPercentageOfOnDemandPrice": 100.0, - "EbsBlockDevices": [ - { - "VolumeSpecification": { - "VolumeType": "gp2", - "SizeInGB": 32 - } - }, - { - "VolumeSpecification": { - "VolumeType": "gp2", - "SizeInGB": 32 - } - } - ] - } - ] - } - ] - }, - "Instances": [ - { - "Id": "ci-08367242A7KTF0W6Z17L", - "Ec2InstanceId": "i-09a090c99e87741fe", - "PublicDnsName": "ec2-54-166-68-104.compute-1.amazonaws.com", - "PublicIpAddress": "54.166.68.104", - "PrivateDnsName": "ip-172-31-102-249.ec2.internal", - "PrivateIpAddress": "172.31.102.249", - "Status": { - "State": "TERMINATED", - "StateChangeReason": { - "Code": "INSTANCE_FAILURE", - "Message": "Instance was terminated." - }, - "Timeline": { - "CreationDateTime": "2023-03-06T20:14:56.729000-08:00", - "ReadyDateTime": "2023-03-06T20:21:34.856000-08:00", - "EndDateTime": "2023-03-06T20:27:03.039000-08:00" - } - }, - "InstanceFleetId": "if-DR8F73EAI88V", - "Market": "ON_DEMAND", - "InstanceType": "c5a.8xlarge", - "EbsVolumes": [ - { - "Device": "/dev/sdb", - "VolumeId": "vol-0a02a3db57625ec28" - } - ] - }, - { - "Id": "ci-05794553JWMGGZCM3VPB", - "Ec2InstanceId": "i-0f806b0efc34e4850", - "PublicDnsName": "ec2-52-23-195-73.compute-1.amazonaws.com", - "PublicIpAddress": "52.23.195.73", - "PrivateDnsName": "ip-172-31-102-115.ec2.internal", - "PrivateIpAddress": "172.31.102.115", - "Status": { - "State": "TERMINATED", - "StateChangeReason": { - "Code": "INSTANCE_FAILURE", - "Message": "Instance was terminated." - }, - "Timeline": { - "CreationDateTime": "2023-03-06T20:15:33.349000-08:00", - "ReadyDateTime": "2023-03-06T20:21:51.635000-08:00", - "EndDateTime": "2023-03-06T20:27:03.039000-08:00" - } - }, - "InstanceFleetId": "if-1HD2lk5lfl23H", - "Market": "ON_DEMAND", - "InstanceType": "m4.xlarge", - "EbsVolumes": [ - { - "Device": "/dev/sdc", - "VolumeId": "vol-017761545cdfb7e7b" - }, - { - "Device": "/dev/sdb", - "VolumeId": "vol-0f489afef8b46dba1" - } - ] - }, - { - "Id": "ci-0317762Z528GJIFRW14", - "Ec2InstanceId": "i-01bcf3fa4aacd6711", - "PublicDnsName": "ec2-54-162-122-114.compute-1.amazonaws.com", - "PublicIpAddress": "54.162.122.114", - "PrivateDnsName": "ip-172-31-102-191.ec2.internal", - "PrivateIpAddress": "172.31.102.191", - "Status": { - "State": "TERMINATED", - "StateChangeReason": { - "Code": "INSTANCE_FAILURE", - "Message": "Instance was terminated." - }, - "Timeline": { - "CreationDateTime": "2023-03-06T20:15:33.349000-08:00", - "ReadyDateTime": "2023-03-06T20:21:34.856000-08:00", - "EndDateTime": "2023-03-06T20:27:03.039000-08:00" - } - }, - "InstanceFleetId": "if-SB7S98AJEMP7", - "Market": "SPOT", - "InstanceType": "m4.large", - "EbsVolumes": [ - { - "Device": "/dev/sdb", - "VolumeId": "vol-0a14ef44daa3bf876" - } - ] - } - ], - "Steps": [ - { - "Id": "s-1EF238MZKOWWR", - "Name": "Execute job script", - "Config": { - "Jar": "command-runner.jar", - "Properties": {}, - "Args": [ - "spark-submit", - "/home/hadoop/index_data_etl_1GB.py" - ] - }, - "ActionOnFailure": "CANCEL_AND_WAIT", - "Status": { - "State": "COMPLETED", - "StateChangeReason": {}, - "Timeline": { - "CreationDateTime": "2023-03-06T20:14:30.255000-08:00", - "StartDateTime": "2023-03-06T20:22:22.473000-08:00", - "EndDateTime": "2023-03-06T20:23:18.602000-08:00" - } - } - }, - { - "Id": "s-3CRVCEKJYF4ZG", - "Name": "Setup and copy files to cluster", - "Config": { - "Jar": "command-runner.jar", - "Properties": {}, - "Args": [ - "aws", - "s3", - "cp", - "s3://my-emr-data/etl-jobs/scripts/index_data_etl/index_data_etl_1GB.py", - "/home/hadoop/" - ] - }, - "ActionOnFailure": "CANCEL_AND_WAIT", - "Status": { - "State": "COMPLETED", - "StateChangeReason": {}, - "Timeline": { - "CreationDateTime": "2023-03-06T20:14:30.255000-08:00", - "StartDateTime": "2023-03-06T20:22:15.097000-08:00", - "EndDateTime": "2023-03-06T20:22:17.466000-08:00" - } - } - } - ], - "Region": "us-east-1" -} diff --git a/tests/data/predictions_response.json b/tests/data/predictions_response.json deleted file mode 100644 index 6b54a2b..0000000 --- a/tests/data/predictions_response.json +++ /dev/null @@ -1,465 +0,0 @@ -{ - "result": [ - { - "prediction_id": "e26c36fa-3b50-4d42-a412-19db210591a4", - "project_id": "7b155ae1-bbdf-444a-9654-f74757705178", - "application_name": "AirlineDelayLR_gz", - "created_at": "2022-09-21T16:58:15Z", - "product_code": "aws-emr", - "product_name": "Spark + EMR", - "basis": { - "configuration": { - "Name": "AirlineDelayLR_gz", - "JobFlowRole": "EMR_EC2_DefaultRole", - "ServiceRole": "EMR_DefaultRole", - "ReleaseLabel": "emr-6.6.0", - "Configurations": [], - "Instances": { - "InstanceFleets": [ - { - "InstanceFleetType": "MASTER", - "InstanceTypeConfigs": [ - { - "EbsConfiguration": { - "EbsBlockDeviceConfigs": [ - { - "VolumeSpecification": { - "SizeInGB": 32, - "VolumeType": "gp2" - }, - "VolumesPerInstance": 2 - } - ] - }, - "InstanceType": "m5.xlarge", - "WeightedCapacity": 1 - } - ], - "TargetOnDemandCapacity": 1 - }, - { - "InstanceFleetType": "CORE", - "InstanceTypeConfigs": [ - { - "EbsConfiguration": { - "EbsBlockDeviceConfigs": [ - { - "VolumeSpecification": { - "SizeInGB": 20, - "VolumeType": "gp2" - }, - "VolumesPerInstance": 1 - } - ] - }, - "InstanceType": "m5.xlarge", - "WeightedCapacity": 1 - } - ], - "TargetSpotCapacity": 8 - } - ], - "KeepJobFlowAliveWhenNoSteps": false - } - }, - "metrics": { - "runtime": 11.397699999809266, - "cost": 0.230439635393404 - } - }, - "event_log": "airline_delay_eventLogs-application_1640165177768_0002.zip", - "solutions": { - "balanced": { - "configuration": { - "Name": "AirlineDelayLR_gz", - "JobFlowRole": "EMR_EC2_DefaultRole", - "ServiceRole": "EMR_DefaultRole", - "ReleaseLabel": "emr-6.6.0", - "Configurations": [ - { - "Classification": "spark-defaults", - "Properties": { - "spark.executor.memoryOverhead": "1602m", - "spark.executor.cores": "4", - "spark.executor.instances": "8", - "spark.executor.memory": "10685m", - "spark.driver.memory": "3239m", - "spark.driver.memoryOverhead": "323m", - "spark.sql.shuffle.partitions": "200", - "spark.dynamicAllocation.enabled": "false", - "spark.yarn.heterogeneousExecutors.enabled": "false" - } - }, - { - "Classification": "yarn-site", - "Properties": { - "yarn.nodemanager.resource.memory-mb": "12288", - "yarn.scheduler.maximum-allocation-mb": "12288" - } - } - ], - "Instances": { - "InstanceFleets": [ - { - "InstanceFleetType": "MASTER", - "InstanceTypeConfigs": [ - { - "EbsConfiguration": { - "EbsBlockDeviceConfigs": [ - { - "VolumeSpecification": { - "SizeInGB": 128, - "VolumeType": "gp2" - }, - "VolumesPerInstance": 1 - } - ] - }, - "InstanceType": "m4.large", - "WeightedCapacity": 1 - } - ], - "TargetOnDemandCapacity": 1 - }, - { - "InstanceFleetType": "CORE", - "InstanceTypeConfigs": [ - { - "EbsConfiguration": { - "EbsBlockDeviceConfigs": [ - { - "VolumeSpecification": { - "SizeInGB": 10, - "VolumeType": "gp2" - }, - "VolumesPerInstance": 2 - } - ] - }, - "InstanceType": "m5.xlarge", - "WeightedCapacity": 1 - } - ], - "TargetSpotCapacity": 8 - } - ], - "KeepJobFlowAliveWhenNoSteps": false - } - }, - "metrics": { - "runtime": 12.439822384371494, - "cost": 0.2288620583377891 - } - } - } - }, - { - "prediction_id": "495ae632-aa6c-4591-9ada-7b2fc0094c4e", - "project_id": "23fc3761-5ffc-4166-9eea-cdf6065e4018", - "application_name": "repjoin", - "created_at": "2022-08-17T21:05:00Z", - "product_code": "aws-emr", - "product_name": "Spark + EMR", - "basis": { - "configuration": { - "Name": "repjoin", - "JobFlowRole": "EMR_EC2_DefaultRole", - "ServiceRole": "EMR_DefaultRole", - "ReleaseLabel": "emr-6.6.0", - "Configurations": [], - "Instances": { - "InstanceFleets": [ - { - "InstanceFleetType": "MASTER", - "InstanceTypeConfigs": [ - { - "EbsConfiguration": { - "EbsBlockDeviceConfigs": [ - { - "VolumeSpecification": { - "SizeInGB": 32, - "VolumeType": "gp2" - }, - "VolumesPerInstance": 4 - } - ] - }, - "InstanceType": "m5.2xlarge", - "WeightedCapacity": 1 - } - ], - "TargetOnDemandCapacity": 1 - }, - { - "InstanceFleetType": "CORE", - "InstanceTypeConfigs": [ - { - "EbsConfiguration": { - "EbsBlockDeviceConfigs": [ - { - "VolumeSpecification": { - "SizeInGB": 64, - "VolumeType": "gp2" - }, - "VolumesPerInstance": 4 - } - ] - }, - "InstanceType": "r5.4xlarge", - "WeightedCapacity": 1 - } - ], - "TargetSpotCapacity": 6 - } - ], - "KeepJobFlowAliveWhenNoSteps": false - } - }, - "metrics": { - "runtime": 45.992949998378755, - "cost": 4.131044766160318 - } - }, - "event_log": "repjoin_application_1641345235951_0001-1.zip", - "solutions": { - "balanced": { - "configuration": { - "Name": "repjoin", - "JobFlowRole": "EMR_EC2_DefaultRole", - "ServiceRole": "EMR_DefaultRole", - "ReleaseLabel": "emr-6.6.0", - "Configurations": [ - { - "Classification": "spark-defaults", - "Properties": { - "spark.executor.memoryOverhead": "7265m", - "spark.executor.cores": "8", - "spark.executor.instances": "2", - "spark.executor.memory": "48439m", - "spark.driver.memory": "4986m", - "spark.driver.memoryOverhead": "747m", - "spark.sql.shuffle.partitions": "100", - "spark.dynamicAllocation.enabled": "true", - "spark.dynamicAllocation.executorIdleTimeout": "20", - "spark.dynamicAllocation.maxExecutors": "2", - "spark.yarn.heterogeneousExecutors.enabled": "false" - } - }, - { - "Classification": "yarn-site", - "Properties": { - "yarn.nodemanager.resource.memory-mb": "122880", - "yarn.scheduler.maximum-allocation-mb": "122880" - } - } - ], - "Instances": { - "InstanceFleets": [ - { - "InstanceFleetType": "MASTER", - "InstanceTypeConfigs": [ - { - "EbsConfiguration": { - "EbsBlockDeviceConfigs": [ - { - "VolumeSpecification": { - "SizeInGB": 128, - "VolumeType": "gp2" - }, - "VolumesPerInstance": 1 - } - ] - }, - "InstanceType": "m4.large", - "WeightedCapacity": 1 - } - ], - "TargetOnDemandCapacity": 1 - }, - { - "InstanceFleetType": "CORE", - "InstanceTypeConfigs": [ - { - "EbsConfiguration": { - "EbsBlockDeviceConfigs": [ - { - "VolumeSpecification": { - "SizeInGB": 122, - "VolumeType": "gp2" - }, - "VolumesPerInstance": 4 - } - ] - }, - "InstanceType": "r5.4xlarge", - "WeightedCapacity": 1 - } - ], - "TargetSpotCapacity": 1 - } - ], - "KeepJobFlowAliveWhenNoSteps": false - } - }, - "metrics": { - "runtime": 60.7643960231413, - "cost": 1.0063835339542084 - } - } - } - }, - { - "prediction_id": "009cb5eb-79da-4eda-8828-b362ce2c5508", - "project_id": "07b4af41-f45e-4ff8-a133-f06915bf42d9", - "application_name": "Text Similarity", - "created_at": "2022-07-19T18:40:13Z", - "product_code": "aws-emr", - "product_name": "Spark + EMR", - "basis": { - "configuration": { - "Name": "Text Similarity", - "JobFlowRole": "EMR_EC2_DefaultRole", - "ServiceRole": "EMR_DefaultRole", - "ReleaseLabel": "emr-6.6.0", - "Configurations": [], - "Instances": { - "InstanceFleets": [ - { - "InstanceFleetType": "MASTER", - "InstanceTypeConfigs": [ - { - "EbsConfiguration": { - "EbsBlockDeviceConfigs": [ - { - "VolumeSpecification": { - "SizeInGB": 32, - "VolumeType": "gp2" - }, - "VolumesPerInstance": 2 - } - ] - }, - "InstanceType": "m5.xlarge", - "WeightedCapacity": 1 - } - ], - "TargetOnDemandCapacity": 1 - }, - { - "InstanceFleetType": "CORE", - "InstanceTypeConfigs": [ - { - "EbsConfiguration": { - "EbsBlockDeviceConfigs": [ - { - "VolumeSpecification": { - "SizeInGB": 20, - "VolumeType": "gp2" - }, - "VolumesPerInstance": 1 - } - ] - }, - "InstanceType": "m5.8xlarge", - "WeightedCapacity": 1 - } - ], - "TargetSpotCapacity": 1 - } - ], - "KeepJobFlowAliveWhenNoSteps": false - } - }, - "metrics": { - "runtime": 65.1628999988238, - "cost": 1.2324550873316353 - } - }, - "event_log": "text_similarity_application_1640227485896_0002.zip", - "solutions": { - "balanced": { - "configuration": { - "Name": "Text Similarity", - "JobFlowRole": "EMR_EC2_DefaultRole", - "ServiceRole": "EMR_DefaultRole", - "ReleaseLabel": "emr-6.6.0", - "Configurations": [ - { - "Classification": "spark-defaults", - "Properties": { - "spark.executor.memoryOverhead": "4177m", - "spark.executor.cores": "8", - "spark.executor.instances": "12", - "spark.executor.memory": "24219m", - "spark.driver.memory": "4986m", - "spark.driver.memoryOverhead": "747m", - "spark.sql.shuffle.partitions": "200", - "spark.dynamicAllocation.enabled": "false", - "spark.yarn.heterogeneousExecutors.enabled": "false" - } - }, - { - "Classification": "yarn-site", - "Properties": { - "yarn.nodemanager.resource.memory-mb": "122880", - "yarn.scheduler.maximum-allocation-mb": "122880" - } - } - ], - "Instances": { - "InstanceFleets": [ - { - "InstanceFleetType": "MASTER", - "InstanceTypeConfigs": [ - { - "EbsConfiguration": { - "EbsBlockDeviceConfigs": [ - { - "VolumeSpecification": { - "SizeInGB": 128, - "VolumeType": "gp2" - }, - "VolumesPerInstance": 1 - } - ] - }, - "InstanceType": "m4.large", - "WeightedCapacity": 1 - } - ], - "TargetOnDemandCapacity": 1 - }, - { - "InstanceFleetType": "CORE", - "InstanceTypeConfigs": [ - { - "EbsConfiguration": { - "EbsBlockDeviceConfigs": [ - { - "VolumeSpecification": { - "SizeInGB": 10, - "VolumeType": "gp2" - }, - "VolumesPerInstance": 4 - } - ] - }, - "InstanceType": "m5.8xlarge", - "WeightedCapacity": 1 - } - ], - "TargetSpotCapacity": 3 - } - ], - "KeepJobFlowAliveWhenNoSteps": false - } - }, - "metrics": { - "runtime": 31.74975690467787, - "cost": 1.620694176602595 - } - } - } - } - ] -} diff --git a/tests/test_awsdatabricks.py b/tests/test_awsdatabricks.py deleted file mode 100644 index c586d2b..0000000 --- a/tests/test_awsdatabricks.py +++ /dev/null @@ -1,1091 +0,0 @@ -import copy -import io -import json -from datetime import datetime -from unittest.mock import Mock, patch -from uuid import uuid4 - -import boto3 as boto -from botocore.response import StreamingBody -from botocore.stub import Stubber -from httpx import Response - -from sync.awsdatabricks import create_prediction_for_run -from sync.config import DatabricksConf -from sync.models import DatabricksAPIError, DatabricksError -from sync.models import Response as SyncResponse -from sync.utils.json import DateTimeEncoderNaiveUTCDropMicroseconds - -MOCK_RUN = { - "job_id": 12345678910, - "run_id": 75778, - "creator_user_name": "user_name@domain.com", - "number_in_job": 75778, - "original_attempt_run_id": 75778, - "state": { - "life_cycle_state": "TERMINATED", - "result_state": "SUCCESS", - "state_message": "", - "user_cancelled_or_timedout": False, - }, - "start_time": 1681249421062, - "setup_duration": 237000, - "execution_duration": 130000, - "cleanup_duration": 0, - "end_time": 1681249788433, - "trigger": "ONE_TIME", - "run_name": "test_job", - "run_page_url": "https://dbc-foo-bar.cloud.databricks.com/?o=12345678910#job/10987654321/run/12345", - "run_type": "JOB_RUN", - "tasks": [ - { - "run_id": 76722, - "task_key": "my_task", - "notebook_task": {"notebook_path": "/Users/user/notebook", "source": "WORKSPACE"}, - "job_cluster_key": "my_job_cluster", - "state": { - "life_cycle_state": "TERMINATED", - "result_state": "SUCCESS", - "state_message": "", - "user_cancelled_or_timedout": False, - }, - "run_page_url": "https://dbc-foo-bar.cloud.databricks.com/?o=12345678910#job/10987654321/run/12345", - "start_time": 1681249421074, - "setup_duration": 237000, - "execution_duration": 130000, - "cleanup_duration": 0, - "end_time": 1681249788312, - "cluster_instance": { - "cluster_id": "0101-214342-tpi6qdp2", - "spark_context_id": "1443449481634833945", - }, - "attempt_number": 0, - } - ], - "job_clusters": [ - { - "job_cluster_key": "my_job_cluster", - "new_cluster": { - "cluster_name": "", - "spark_version": "12.2.x-scala2.12", - "aws_attributes": { - "first_on_demand": 2, - "availability": "SPOT_WITH_FALLBACK", - "zone_id": "auto", - "instance_profile_arn": "arn:aws:iam::123456789:instance-profile/my-iam-profile", - "spot_bid_price_percent": 100, - "ebs_volume_count": 0, - }, - "node_type_id": "i3.4xlarge", - "driver_node_type_id": "i3.xlarge", - "cluster_log_conf": { - "s3": { - "destination": "s3://bucket/path/to/logs/", - "region": "us-east-1", - "enable_encryption": True, - "canned_acl": "bucket-owner-full-control", - } - }, - "enable_elastic_disk": True, - "policy_id": "9C6308F703005DF2", - "data_security_mode": "SINGLE_USER", - "runtime_engine": "PHOTON", - "num_workers": 1, - }, - } - ], - "format": "MULTI_TASK", -} - -MOCK_RUN_WITH_SYNC_TASK = { - "job_id": 12345678910, - "run_id": 75778, - "creator_user_name": "user_name@domain.com", - "number_in_job": 75778, - "original_attempt_run_id": 75778, - "state": { - "life_cycle_state": "TERMINATED", - "result_state": "SUCCESS", - "state_message": "", - "user_cancelled_or_timedout": False, - }, - "start_time": 1681249421062, - "setup_duration": 237000, - "execution_duration": 130000, - "cleanup_duration": 0, - "end_time": 1681249788433, - "trigger": "ONE_TIME", - "run_name": "test_job", - "run_page_url": "https://dbc-foo-bar.cloud.databricks.com/?o=12345678910#job/10987654321/run/12345", - "run_type": "JOB_RUN", - "tasks": [ - { - "run_id": 76722, - "task_key": "my_task", - "notebook_task": {"notebook_path": "/Users/user/notebook", "source": "WORKSPACE"}, - "job_cluster_key": "my_job_cluster", - "state": { - "life_cycle_state": "TERMINATED", - "result_state": "SUCCESS", - "state_message": "", - "user_cancelled_or_timedout": False, - }, - "run_page_url": "https://dbc-foo-bar.cloud.databricks.com/?o=12345678910#job/10987654321/run/12345", - "start_time": 1681249421074, - "setup_duration": 237000, - "execution_duration": 130000, - "cleanup_duration": 0, - "end_time": 1681249788312, - "cluster_instance": { - "cluster_id": "0101-214342-tpi6qdp2", - "spark_context_id": "1443449481634833945", - }, - "attempt_number": 0, - }, - { - "run_id": 5541644, - "task_key": "sync_task", - "depends_on": [{"task_key": "load_all_lending_tables_full"}], - "notebook_task": { - "notebook_path": "/Users/pete.tamisin@synccomputing.com/sync_task", - "base_parameters": { - "DATABRICKS_RUN_ID": "{{run_id}}", - "DATABRICKS_JOB_ID": "{{job_id}}", - "DATABRICKS_PARENT_RUN_ID": "{{parent_run_id}}", - "DATABRICKS_TASK_KEY": "{{task_key}}", - }, - "source": "WORKSPACE", - }, - "job_cluster_key": "Job_cluster", - "state": { - "life_cycle_state": "RUNNING", - "state_message": "In run", - "user_cancelled_or_timedout": False, - }, - "run_page_url": "https://dbc-d95d06ca-1d00.cloud.databricks.com/?o=656201176161048#job/1085772780706533/run/5541644", - "start_time": 1681249788312, - "setup_duration": 1000, - "execution_duration": 139000, - "cleanup_duration": 0, - "end_time": 1681249828312, - "cluster_instance": { - "cluster_id": "0518-173917-zp8ig48r", - "spark_context_id": "5062870523583018788", - }, - "attempt_number": 0, - }, - ], - "job_clusters": [ - { - "job_cluster_key": "my_job_cluster", - "new_cluster": { - "cluster_name": "", - "spark_version": "12.2.x-scala2.12", - "aws_attributes": { - "first_on_demand": 2, - "availability": "SPOT_WITH_FALLBACK", - "zone_id": "auto", - "instance_profile_arn": "arn:aws:iam::123456789:instance-profile/my-iam-profile", - "spot_bid_price_percent": 100, - "ebs_volume_count": 0, - }, - "node_type_id": "i3.4xlarge", - "driver_node_type_id": "i3.xlarge", - "cluster_log_conf": { - "s3": { - "destination": "s3://bucket/path/to/logs/", - "region": "us-east-1", - "enable_encryption": True, - "canned_acl": "bucket-owner-full-control", - } - }, - "enable_elastic_disk": True, - "policy_id": "9C6308F703005DF2", - "data_security_mode": "SINGLE_USER", - "runtime_engine": "PHOTON", - "num_workers": 1, - }, - }, - { - "job_cluster_key": "sync_cluster", - "new_cluster": { - "cluster_name": "", - "spark_version": "13.1.x-scala2.12", - "spark_conf": { - "spark.master": "local[*, 4]", - "spark.databricks.cluster.profile": "singleNode", - }, - "aws_attributes": { - "first_on_demand": 1, - "availability": "SPOT_WITH_FALLBACK", - "zone_id": "us-east-1a", - "instance_profile_arn": "arn:aws:iam::471881062455:instance-profile/databricks-workspace-stack-access-data-buckets", - "spot_bid_price_percent": 100, - "ebs_volume_count": 0, - }, - "node_type_id": "i3.xlarge", - "custom_tags": {"ResourceClass": "SingleNode"}, - "enable_elastic_disk": True, - "data_security_mode": "LEGACY_SINGLE_USER_STANDARD", - "runtime_engine": "STANDARD", - "num_workers": 0, - }, - }, - ], - "format": "MULTI_TASK", -} - -MOCK_CLUSTER = { - "cluster_id": "0101-214342-tpi6qdp2", - "creator_user_name": "user_name@domain.com", - "spark_context_id": 1443449481634833945, - "driver_healthy": True, - "cluster_name": "job-12345678910-run-75778-my_job_run", - "spark_version": "12.2.x-scala2.12", - "aws_attributes": { - "first_on_demand": 2, - "availability": "SPOT_WITH_FALLBACK", - "zone_id": "auto", - "instance_profile_arn": "arn:aws:iam::123456789:instance-profile/my-iam-profile", - "spot_bid_price_percent": 100, - "ebs_volume_count": 0, - }, - "node_type_id": "i3.4xlarge", - "driver_node_type_id": "i3.xlarge", - "cluster_log_conf": { - "s3": { - "destination": "s3://bucket/path/to/logs/", - "region": "us-east-1", - "enable_encryption": True, - "canned_acl": "bucket-owner-full-control", - } - }, - "autotermination_minutes": 0, - "enable_elastic_disk": True, - "disk_spec": {"disk_count": 0}, - "cluster_source": "JOB", - "single_user_name": "user_name@domain.com", - "policy_id": "9C6308F703005DF2", - "enable_local_disk_encryption": False, - "instance_source": {"node_type_id": "i3.4xlarge"}, - "driver_instance_source": {"node_type_id": "i3.xlarge"}, - "data_security_mode": "SINGLE_USER", - "runtime_engine": "PHOTON", - "effective_spark_version": "12.2.x-photon-scala2.12", - "state": "TERMINATED", - "state_message": "", - "start_time": 1681249423048, - "terminated_time": 1681249791560, - "last_state_loss_time": 0, - "last_activity_time": 1681249606510, - "last_restarted_time": 1681249654358, - "num_workers": 1, - "default_tags": { - "Vendor": "Databricks", - "Creator": "user_name@domain.com", - "ClusterName": "job-12345678910-run-75778-my_job_run", - "ClusterId": "0101-214342-tpi6qdp2", - "JobId": "943021334099449", - "RunName": "my_job_run", - }, - "cluster_log_status": {"last_attempted": 1681249693295}, - "termination_reason": {"code": "JOB_FINISHED", "type": "SUCCESS"}, - "init_scripts_safe_mode": False, -} - -MOCK_INSTANCES = { - "Reservations": [ - { - "Instances": [ - { - "AmiLaunchIndex": 0, - "ImageId": "ami-0ea2c19e79de11215", - "InstanceId": "i-0ae24904bc2811797", - "InstanceType": "i3.xlarge", - "LaunchTime": datetime.fromisoformat("2023-04-04T22:51:24+00:00"), - "Monitoring": { - "State": "disabled", - }, - "Placement": { - "AvailabilityZone": "us-east-1c", - "GroupName": "", - "Tenancy": "default", - }, - "PrivateDnsName": "", - "ProductCodes": [], - "PublicDnsName": "", - "State": {"Code": 48, "Name": "terminated"}, - "Architecture": "x86_64", - "BlockDeviceMappings": [], - "ClientToken": "f05ee9c8-a720-4e60-ace7-cd188da120ea", - "EbsOptimized": False, - "EnaSupport": True, - "Hypervisor": "xen", - "InstanceLifecycle": "spot", - "NetworkInterfaces": [], - "RootDeviceName": "/dev/sda1", - "RootDeviceType": "ebs", - "SecurityGroups": [], - "SpotInstanceRequestId": "sir-mzqyjsdg", - "StateReason": { - "Code": "Client.UserInitiatedShutdown", - "Message": "Client.UserInitiatedShutdown: User initiated shutdown", - }, - "Tags": [ - {"Key": "RunName", "Value": "my_test_job"}, - {"Key": "Vendor", "Value": "Databricks"}, - {"Key": "management_service", "Value": "instance_manager_service"}, - {"Key": "ClusterId", "Value": "0101-214342-tpi6qdp2"}, - {"Key": "Creator", "Value": "user@domain.com"}, - {"Key": "JobId", "Value": "943021334099449"}, - { - "Key": "Name", - "Value": "workerenv-1187965937856149-f0bf0016-45bf-496b-9f05-16cf93ffb24d-worker", - }, - { - "Key": "ClusterName", - "Value": "job-12345678910-run-75778-my_job_run", - }, - ], - "VirtualizationType": "hvm", - "CpuOptions": {"CoreCount": 2, "ThreadsPerCore": 2}, - "CapacityReservationSpecification": {"CapacityReservationPreference": "open"}, - "HibernationOptions": {"Configured": False}, - "MetadataOptions": { - "State": "pending", - "HttpTokens": "required", - "HttpPutResponseHopLimit": 2, - "HttpEndpoint": "enabled", - "HttpProtocolIpv6": "disabled", - "InstanceMetadataTags": "disabled", - }, - "EnclaveOptions": {"Enabled": False}, - "PlatformDetails": "Linux/UNIX", - "UsageOperation": "RunInstances", - "UsageOperationUpdateTime": datetime.fromisoformat("2023-04-04T22:51:24+00:00"), - "MaintenanceOptions": {"AutoRecovery": "default"}, - } - ], - }, - { - "Instances": [ - { - "AmiLaunchIndex": 0, - "ImageId": "ami-0ea2c19e79de11215", - "InstanceId": "i-0ae24904bc2811797", - "InstanceType": "i3.4xlarge", - "LaunchTime": datetime.fromisoformat("2023-04-04T22:51:24+00:00"), - "Monitoring": { - "State": "disabled", - }, - "Placement": { - "AvailabilityZone": "us-east-1c", - "GroupName": "", - "Tenancy": "default", - }, - "PrivateDnsName": "", - "ProductCodes": [], - "PublicDnsName": "", - "State": {"Code": 48, "Name": "terminated"}, - "Architecture": "x86_64", - "BlockDeviceMappings": [], - "ClientToken": "f05ee9c8-a720-4e60-ace7-cd188da120ea", - "EbsOptimized": False, - "EnaSupport": True, - "Hypervisor": "xen", - "InstanceLifecycle": "spot", - "NetworkInterfaces": [], - "RootDeviceName": "/dev/sda1", - "RootDeviceType": "ebs", - "SecurityGroups": [], - "SpotInstanceRequestId": "sir-mzqyjsdg", - "StateReason": { - "Code": "Client.UserInitiatedShutdown", - "Message": "Client.UserInitiatedShutdown: User initiated shutdown", - }, - "Tags": [ - {"Key": "RunName", "Value": "my_test_job"}, - {"Key": "Vendor", "Value": "Databricks"}, - {"Key": "management_service", "Value": "instance_manager_service"}, - {"Key": "ClusterId", "Value": "0101-214342-tpi6qdp2"}, - {"Key": "Creator", "Value": "user@domain.com"}, - {"Key": "JobId", "Value": "943021334099449"}, - { - "Key": "Name", - "Value": "workerenv-1187965937856149-f0bf0016-45bf-496b-9f05-16cf93ffb24d-worker", - }, - { - "Key": "ClusterName", - "Value": "job-12345678910-run-75778-my_job_run", - }, - ], - "VirtualizationType": "hvm", - "CpuOptions": {"CoreCount": 8, "ThreadsPerCore": 8}, - "CapacityReservationSpecification": {"CapacityReservationPreference": "open"}, - "HibernationOptions": {"Configured": False}, - "MetadataOptions": { - "State": "pending", - "HttpTokens": "required", - "HttpPutResponseHopLimit": 2, - "HttpEndpoint": "enabled", - "HttpProtocolIpv6": "disabled", - "InstanceMetadataTags": "disabled", - }, - "EnclaveOptions": {"Enabled": False}, - "PlatformDetails": "Linux/UNIX", - "UsageOperation": "RunInstances", - "UsageOperationUpdateTime": datetime.fromisoformat("2023-04-04T22:51:24+00:00"), - "MaintenanceOptions": {"AutoRecovery": "default"}, - } - ] - }, - ], -} - -MOCK_VOLUMES = { - "Volumes": [ - { - "Attachments": [ - { - "AttachTime": datetime.fromisoformat("2023-08-15T21:43:03+00:00"), - "Device": "/dev/sda1", - "InstanceId": "i-01ecbca3d064b99cb", - "State": "attached", - "VolumeId": "vol-01114146532c3a1c5", - "DeleteOnTermination": True, - } - ], - "AvailabilityZone": "us-east-1f", - "CreateTime": datetime.fromisoformat("2023-08-15T21:43:04.058000+00:00"), - "Encrypted": True, - "KmsKeyId": "arn:aws:kms:us-east-1:471881062455:key/75aa19ed-25bd-4fca-be79-c21a23256d69", - "Size": 30, - "SnapshotId": "snap-08aeac93ec15d59b0", - "State": "in-use", - "VolumeId": "vol-01114146532c3a1c5", - "Iops": 3000, - "Tags": [ - {"Key": "ClusterName", "Value": "script-test-nvme"}, - {"Key": "Vendor", "Value": "Databricks"}, - { - "Key": "Name", - "Value": "workerenv-656201176161048-20de29ca-1c16-42b0-a91e-4f6a30f57313-worker", - }, - {"Key": "management_service", "Value": "instance_manager_service"}, - {"Key": "ClusterId", "Value": "0815-124039-8cpvy56n"}, - {"Key": "Creator", "Value": "sean.gorsky@synccomputing.com"}, - ], - "VolumeType": "gp3", - "MultiAttachEnabled": False, - "Throughput": 125, - }, - { - "Attachments": [ - { - "AttachTime": datetime.fromisoformat("2023-08-15T21:43:03+00:00"), - "Device": "/dev/xvdb", - "InstanceId": "i-01ecbca3d064b99cb", - "State": "attached", - "VolumeId": "vol-0888642ead4f823ea", - "DeleteOnTermination": True, - } - ], - "AvailabilityZone": "us-east-1f", - "CreateTime": datetime.fromisoformat("2023-08-15T21:43:03.967000+00:00"), - "Encrypted": True, - "KmsKeyId": "arn:aws:kms:us-east-1:471881062455:key/75aa19ed-25bd-4fca-be79-c21a23256d69", - "Size": 150, - "SnapshotId": "", - "State": "in-use", - "VolumeId": "vol-0888642ead4f823ea", - "Iops": 3000, - "Tags": [ - {"Key": "ClusterId", "Value": "0815-124039-8cpvy56n"}, - {"Key": "ClusterName", "Value": "script-test-nvme"}, - {"Key": "management_service", "Value": "instance_manager_service"}, - {"Key": "Vendor", "Value": "Databricks"}, - { - "Key": "Name", - "Value": "workerenv-656201176161048-20de29ca-1c16-42b0-a91e-4f6a30f57313-worker", - }, - {"Key": "Creator", "Value": "sean.gorsky@synccomputing.com"}, - ], - "VolumeType": "gp3", - "MultiAttachEnabled": False, - "Throughput": 125, - }, - ] -} - -MOCK_DBX_CONF = DatabricksConf( - host="https://dbc-123.cloud.databricks.com", - token="my_secret_token", - aws_region_name="us-east-1", -) - - -@patch("sync.awsdatabricks.DB_CONFIG", new=MOCK_DBX_CONF) -@patch("sync.clients.databricks.DB_CONFIG", new=MOCK_DBX_CONF) -@patch("sync._databricks.get_project", Mock(return_value=SyncResponse(result={}))) -def test_create_prediction_for_failed_run(respx_mock): - failure_response = {"error_code": "FAILED", "message": "This run failed"} - - respx_mock.get("https://*.cloud.databricks.com/api/2.1/jobs/runs/get?run_id=75778").mock( - return_value=Response(200, json=failure_response) - ) - - result = create_prediction_for_run("75778", "Premium", "Jobs Compute", "my-project-id") - - assert result.error - assert isinstance(result.error, DatabricksAPIError) - - failure_response = { - "state": {"result_state": "FAILED"}, - "tasks": [ - { - "task_key": "tpcds_2000GB_group_q76_q80", - "cluster_instance": {"cluster_id": 12345}, - "state": {"result_state": "FAILED"}, - } - ], - } - respx_mock.get("https://*.cloud.databricks.com/api/2.1/jobs/runs/get?run_id=75778").mock( - return_value=Response(200, json=failure_response) - ) - - result = create_prediction_for_run("75778", "Premium", "Jobs Compute", "my-project-id") - - assert result.error - assert isinstance(result.error, DatabricksError) - - -@patch("sync.awsdatabricks.DB_CONFIG", new=MOCK_DBX_CONF) -@patch("sync.clients.databricks.DB_CONFIG", new=MOCK_DBX_CONF) -@patch("sync._databricks.get_project", Mock(return_value=SyncResponse(result={}))) -def test_create_prediction_for_run_bad_cluster_data(respx_mock): - # Test too many clusters found - run_with_multiple_clusters = copy.deepcopy(MOCK_RUN) - run_with_multiple_clusters["tasks"][0]["cluster_instance"][ - "cluster_id" - ] = "different_cluster_id" - - run_with_multiple_clusters["tasks"] = [ - MOCK_RUN["tasks"][0], - run_with_multiple_clusters["tasks"][0], - ] - - respx_mock.get("https://*.cloud.databricks.com/api/2.1/jobs/runs/get?run_id=75778").mock( - return_value=Response(200, json=run_with_multiple_clusters) - ) - - result = create_prediction_for_run("75778", "Premium", "Jobs Compute", "my-project-id") - - assert result.error - - # Test no tasks/clusters at all - run_with_no_tasks = copy.deepcopy(MOCK_RUN) - run_with_no_tasks["tasks"] = [] - respx_mock.get("https://*.cloud.databricks.com/api/2.1/jobs/runs/get?run_id=75778").mock( - return_value=Response(200, json=run_with_no_tasks) - ) - - result = create_prediction_for_run("75778", "Premium", "Jobs Compute", "my-project-id") - - assert result.error - - -@patch("sync.awsdatabricks.DB_CONFIG", new=MOCK_DBX_CONF) -@patch("sync.clients.databricks.DB_CONFIG", new=MOCK_DBX_CONF) -@patch("sync._databricks.get_project", Mock(return_value=SyncResponse(result={}))) -def test_create_prediction_for_run_no_instances_found(respx_mock): - respx_mock.get("https://*.cloud.databricks.com/api/2.1/jobs/runs/get?run_id=75778").mock( - return_value=Response(200, json=MOCK_RUN) - ) - - respx_mock.get( - "https://*.cloud.databricks.com/api/2.0/clusters/get?cluster_id=0101-214342-tpi6qdp2" - ).mock(return_value=Response(200, json=MOCK_CLUSTER)) - - respx_mock.post("https://*.cloud.databricks.com/api/2.0/clusters/events").mock( - return_value=Response(200, json={"events": [], "total_count": 0}) - ) - - ec2 = boto.client("ec2", region_name=MOCK_DBX_CONF.aws_region_name) - s3 = boto.client("s3") - - s3_stubber = Stubber(s3) - s3_stubber.add_client_error("get_object", "NoSuchKey") - - def client_patch(name, **kwargs): - if name == "ec2": - return ec2 - elif name == "s3": - return s3 - - # First test no instances - ec2_stubber = Stubber(ec2) - ec2_stubber.add_response("describe_instances", {"Reservations": []}) - ec2_stubber.add_response("describe_volumes", MOCK_VOLUMES) - - with s3_stubber, ec2_stubber, patch("boto3.client") as mock_aws_client: - mock_aws_client.side_effect = client_patch - result = create_prediction_for_run("75778", "Premium", "Jobs Compute", "my-project-id") - - assert result.error - - -@patch("sync.awsdatabricks.DB_CONFIG", new=MOCK_DBX_CONF) -@patch("sync.clients.databricks.DB_CONFIG", new=MOCK_DBX_CONF) -@patch("sync._databricks.get_project", Mock(return_value=SyncResponse(result={}))) -def test_create_prediction_for_run_unauthorized_ec2(respx_mock): - respx_mock.get("https://*.cloud.databricks.com/api/2.1/jobs/runs/get?run_id=75778").mock( - return_value=Response(200, json=MOCK_RUN) - ) - - respx_mock.get( - "https://*.cloud.databricks.com/api/2.0/clusters/get?cluster_id=0101-214342-tpi6qdp2" - ).mock(return_value=Response(200, json=MOCK_CLUSTER)) - - respx_mock.post("https://*.cloud.databricks.com/api/2.0/clusters/events").mock( - return_value=Response(200, json={"events": [], "total_count": 0}) - ) - - ec2 = boto.client("ec2", region_name=MOCK_DBX_CONF.aws_region_name) - ec2_stubber = Stubber(ec2) - ec2_stubber.add_client_error( - "describe_instances", - service_error_code="AccessDeniedException", - service_message="User: arn:aws:sts::123456789012:assumed-role/sync-test-no-access/botocore-session-1687389953 is not authorized to perform: elasticmapreduce:DescribeCluster on resource: arn:aws:elasticmapreduce:us-east-1:123456789012:cluster/j-3GJINYS04BO38 because no identity-based policy allows the elasticmapreduce:DescribeCluster action", - ) - - s3 = boto.client("s3") - s3_stubber = Stubber(s3) - s3_stubber.add_client_error("get_object", "NoSuchKey") - - def client_patch(name, **kwargs): - if name == "ec2": - return ec2 - elif name == "s3": - return s3 - - with s3_stubber, ec2_stubber, patch("boto3.client") as mock_aws_client: - mock_aws_client.side_effect = client_patch - result = create_prediction_for_run("75778", "Premium", "Jobs Compute", "my-project-id") - - assert result.error - - -MOCK_PREDICTION_CREATION_RESPONSE = { - "result": { - "prediction_id": str(uuid4()), - "upload_details": {"url": "https://presigned-url", "fields": {"key": "foobar"}}, - } -} - - -@patch("sync.awsdatabricks.DB_CONFIG", new=MOCK_DBX_CONF) -@patch("sync.clients.databricks.DB_CONFIG", new=MOCK_DBX_CONF) -@patch("sync._databricks.get_project", Mock(return_value=SyncResponse(result={}))) -def test_create_prediction_for_run_success(respx_mock): - respx_mock.get("https://*.cloud.databricks.com/api/2.1/jobs/runs/get?run_id=75778").mock( - return_value=Response(200, json=MOCK_RUN) - ) - - respx_mock.get( - "https://*.cloud.databricks.com/api/2.0/clusters/get?cluster_id=0101-214342-tpi6qdp2" - ).mock(return_value=Response(200, json=MOCK_CLUSTER)) - - respx_mock.post("https://*.cloud.databricks.com/api/2.0/clusters/events").mock( - return_value=Response(200, json={"events": [], "total_count": 0}) - ) - - respx_mock.post("/v1/auth/token").mock( - return_value=Response( - 200, - json={ - "result": { - "access_token": "notarealtoken", - "expires_at_utc": "2022-09-01T20:54:48Z", - } - }, - ) - ) - - respx_mock.post("/v1/autotuner/predictions").mock( - return_value=Response(200, json=MOCK_PREDICTION_CREATION_RESPONSE) - ) - - respx_mock.post(MOCK_PREDICTION_CREATION_RESPONSE["result"]["upload_details"]["url"]).mock( - return_value=Response(204) - ) - - ec2 = boto.client("ec2", region_name=MOCK_DBX_CONF.aws_region_name) - ec2_stubber = Stubber(ec2) - ec2_stubber.add_response("describe_instances", MOCK_INSTANCES) - ec2_stubber.add_response("describe_volumes", MOCK_VOLUMES) - - base_prefix = "path/to/logs/0101-214342-tpi6qdp2" - eventlog_file_prefix = f"{base_prefix}/eventlog/0101-214342-tpi6qdp2" - - s3 = boto.client("s3") - s3_stubber = Stubber(s3) - s3_stubber.add_client_error("get_object", "NoSuchKey") - s3_stubber.add_response( - "list_objects_v2", - { - "Contents": [ - { - "Key": f"{eventlog_file_prefix}/eventlog", - "LastModified": datetime.utcfromtimestamp(1681249791560 / 1000), - } - ] - }, - {"Bucket": "bucket", "Prefix": eventlog_file_prefix}, - ) - s3_stubber.add_response( - "get_object", - { - "ContentType": "application/octet-stream", - "ContentLength": 0, - "Body": StreamingBody(io.BytesIO(), 0), - }, - {"Bucket": "bucket", "Key": f"{eventlog_file_prefix}/eventlog"}, - ) - - def client_patch(name, **kwargs): - if name == "ec2": - return ec2 - elif name == "s3": - return s3 - - with s3_stubber, ec2_stubber, patch("boto3.client") as mock_aws_client: - mock_aws_client.side_effect = client_patch - result = create_prediction_for_run("75778", "Premium", "Jobs Compute", "my-project-id") - - assert result.result - - -@patch("sync.awsdatabricks.DB_CONFIG", new=MOCK_DBX_CONF) -@patch("sync.clients.databricks.DB_CONFIG", new=MOCK_DBX_CONF) -@patch("sync._databricks.get_project", Mock(return_value=SyncResponse(result={}))) -def test_create_prediction_for_run_success_with_cluster_instance_file(respx_mock): - respx_mock.get("https://*.cloud.databricks.com/api/2.1/jobs/runs/get?run_id=75778").mock( - return_value=Response(200, json=MOCK_RUN) - ) - - respx_mock.get( - "https://*.cloud.databricks.com/api/2.0/clusters/get?cluster_id=0101-214342-tpi6qdp2" - ).mock(return_value=Response(200, json=MOCK_CLUSTER)) - - respx_mock.post("https://*.cloud.databricks.com/api/2.0/clusters/events").mock( - return_value=Response(200, json={"events": [], "total_count": 0}) - ) - - respx_mock.post("/v1/auth/token").mock( - return_value=Response( - 200, - json={ - "result": { - "access_token": "notarealtoken", - "expires_at_utc": "2022-09-01T20:54:48Z", - } - }, - ) - ) - - respx_mock.post("/v1/autotuner/predictions").mock( - return_value=Response(200, json=MOCK_PREDICTION_CREATION_RESPONSE) - ) - - respx_mock.post(MOCK_PREDICTION_CREATION_RESPONSE["result"]["upload_details"]["url"]).mock( - return_value=Response(204) - ) - - base_prefix = "path/to/logs/0101-214342-tpi6qdp2" - eventlog_file_prefix = f"{base_prefix}/eventlog/0101-214342-tpi6qdp2" - cluster_info_file_key = f"{base_prefix}/sync_data/1443449481634833945/aws_cluster_info.json" - - # Don't add any responses for this one as we expect all the instance data we need to be available - # in the cluster_instances.json file - ec2 = boto.client("ec2", region_name=MOCK_DBX_CONF.aws_region_name) - ec2_stubber = Stubber(ec2) - - s3 = boto.client("s3") - s3_stubber = Stubber(s3) - - mock_cluster_info_bytes = bytes( - json.dumps( - { - "volumes": MOCK_VOLUMES["Volumes"], - "instances": [ - inst for res in MOCK_INSTANCES["Reservations"] for inst in res["Instances"] - ], - }, - cls=DateTimeEncoderNaiveUTCDropMicroseconds, - ), - "utf-8", - ) - s3_stubber.add_response( - "get_object", - { - "ContentType": "application/octet-stream", - "ContentLength": len(mock_cluster_info_bytes), - "Body": StreamingBody( - io.BytesIO(mock_cluster_info_bytes), - len(mock_cluster_info_bytes), - ), - }, - {"Bucket": "bucket", "Key": cluster_info_file_key}, - ) - s3_stubber.add_response( - "list_objects_v2", - { - "Contents": [ - { - "Key": f"{eventlog_file_prefix}/eventlog", - "LastModified": datetime.utcfromtimestamp(1681249791560 / 1000), - } - ] - }, - {"Bucket": "bucket", "Prefix": eventlog_file_prefix}, - ) - s3_stubber.add_response( - "get_object", - { - "ContentType": "application/octet-stream", - "ContentLength": 0, - "Body": StreamingBody(io.BytesIO(), 0), - }, - {"Bucket": "bucket", "Key": f"{eventlog_file_prefix}/eventlog"}, - ) - - def client_patch(name, **kwargs): - if name == "ec2": - return ec2 - elif name == "s3": - return s3 - - with s3_stubber, ec2_stubber, patch("boto3.client") as mock_aws_client: - mock_aws_client.side_effect = client_patch - result = create_prediction_for_run("75778", "Premium", "Jobs Compute", "my-project-id") - - assert result.result - - -@patch("sync.awsdatabricks.DB_CONFIG", new=MOCK_DBX_CONF) -@patch("sync.clients.databricks.DB_CONFIG", new=MOCK_DBX_CONF) -@patch("sync._databricks.get_project", Mock(return_value=SyncResponse(result={}))) -def test_create_prediction_for_run_with_pending_task(respx_mock): - respx_mock.get("https://*.cloud.databricks.com/api/2.1/jobs/runs/get?run_id=75778").mock( - return_value=Response(200, json=MOCK_RUN_WITH_SYNC_TASK) - ) - - respx_mock.get( - "https://*.cloud.databricks.com/api/2.0/clusters/get?cluster_id=0101-214342-tpi6qdp2" - ).mock(return_value=Response(200, json=MOCK_CLUSTER)) - - respx_mock.post("https://*.cloud.databricks.com/api/2.0/clusters/events").mock( - return_value=Response(200, json={"events": [], "total_count": 0}) - ) - - respx_mock.post("/v1/auth/token").mock( - return_value=Response( - 200, - json={ - "result": { - "access_token": "notarealtoken", - "expires_at_utc": "2022-09-01T20:54:48Z", - } - }, - ) - ) - - respx_mock.post("/v1/autotuner/predictions").mock( - return_value=Response(200, json=MOCK_PREDICTION_CREATION_RESPONSE) - ) - - respx_mock.post(MOCK_PREDICTION_CREATION_RESPONSE["result"]["upload_details"]["url"]).mock( - return_value=Response(204) - ) - - ec2 = boto.client("ec2", region_name=MOCK_DBX_CONF.aws_region_name) - ec2_stubber = Stubber(ec2) - ec2_stubber.add_response("describe_instances", MOCK_INSTANCES) - ec2_stubber.add_response("describe_volumes", MOCK_VOLUMES) - - s3_file_prefix = "path/to/logs/0101-214342-tpi6qdp2/eventlog/0101-214342-tpi6qdp2" - - s3 = boto.client("s3") - s3_stubber = Stubber(s3) - s3_stubber.add_client_error("get_object", "NoSuchKey") - s3_stubber.add_response( - "list_objects_v2", - { - "Contents": [ - { - "Key": f"{s3_file_prefix}/eventlog", - "LastModified": datetime.utcfromtimestamp(1681249791560 / 1000), - } - ] - }, - {"Bucket": "bucket", "Prefix": s3_file_prefix}, - ) - s3_stubber.add_response( - "get_object", - { - "ContentType": "application/octet-stream", - "ContentLength": 0, - "Body": StreamingBody(io.BytesIO(), 0), - }, - {"Bucket": "bucket", "Key": f"{s3_file_prefix}/eventlog"}, - ) - - def client_patch(name, **kwargs): - if name == "ec2": - return ec2 - elif name == "s3": - return s3 - - with s3_stubber, ec2_stubber, patch("boto3.client") as mock_aws_client: - mock_aws_client.side_effect = client_patch - result = create_prediction_for_run( - "75778", "Premium", "Jobs Compute", "my-project-id", exclude_tasks=["sync_task"] - ) - - assert result.result - - -@patch("sync.awsdatabricks.DB_CONFIG", new=MOCK_DBX_CONF) -@patch("sync.clients.databricks.DB_CONFIG", new=MOCK_DBX_CONF) -@patch("sync._databricks._event_log_poll_duration_seconds", Mock(return_value=0)) -@patch("sync._databricks.get_project", Mock(return_value=SyncResponse(result={}))) -def test_create_prediction_for_run_event_log_upload_delay(respx_mock): - respx_mock.get("https://*.cloud.databricks.com/api/2.1/jobs/runs/get?run_id=75778").mock( - return_value=Response(200, json=MOCK_RUN) - ) - - respx_mock.get( - "https://*.cloud.databricks.com/api/2.0/clusters/get?cluster_id=0101-214342-tpi6qdp2" - ).mock(return_value=Response(200, json=MOCK_CLUSTER)) - - respx_mock.post("https://*.cloud.databricks.com/api/2.0/clusters/events").mock( - return_value=Response(200, json={"events": [], "total_count": 0}) - ) - - respx_mock.post("/v1/auth/token").mock( - return_value=Response( - 200, - json={ - "result": { - "access_token": "notarealtoken", - "expires_at_utc": "2022-09-01T20:54:48Z", - } - }, - ) - ) - - respx_mock.post("/v1/autotuner/predictions").mock( - return_value=Response(200, json=MOCK_PREDICTION_CREATION_RESPONSE) - ) - - respx_mock.post(MOCK_PREDICTION_CREATION_RESPONSE["result"]["upload_details"]["url"]).mock( - return_value=Response(204) - ) - - ec2 = boto.client("ec2", region_name=MOCK_DBX_CONF.aws_region_name) - ec2_stubber = Stubber(ec2) - ec2_stubber.add_response("describe_instances", MOCK_INSTANCES) - ec2_stubber.add_response("describe_volumes", MOCK_VOLUMES) - - s3_file_prefix = "path/to/logs/0101-214342-tpi6qdp2/eventlog/0101-214342-tpi6qdp2" - - s3 = boto.client("s3") - s3_stubber = Stubber(s3) - s3_stubber.add_client_error("get_object", "NoSuchKey") - - # Test no event log files present yet - s3_stubber.add_response( - "list_objects_v2", {"Contents": []}, {"Bucket": "bucket", "Prefix": s3_file_prefix} - ) - - # Test incomplete event log data present - s3_stubber.add_response( - "list_objects_v2", - { - "Contents": [ - { - "Key": f"{s3_file_prefix}/eventlog-2023-04-11--23-30.gz", - "LastModified": datetime.utcfromtimestamp(1681249688400 / 1000), - } - ] - }, - {"Bucket": "bucket", "Prefix": s3_file_prefix}, - ) - - # Test still waiting for remaining data to make it to the final event log file - s3_stubber.add_response( - "list_objects_v2", - { - "Contents": [ - { - "Key": f"{s3_file_prefix}/eventlog", - "LastModified": datetime.fromtimestamp(1681249688433 / 1000), - }, - { - "Key": f"{s3_file_prefix}/eventlog-2023-04-11--23-30.gz", - "LastModified": datetime.fromtimestamp(1681249588400 / 1000), - }, - ] - }, - {"Bucket": "bucket", "Prefix": s3_file_prefix}, - ) - - # Finally, all the data is present - s3_stubber.add_response( - "list_objects_v2", - { - "Contents": [ - { - "Key": f"{s3_file_prefix}/eventlog", - "LastModified": datetime.fromtimestamp(1681249788435 / 1000), - }, - { - "Key": f"{s3_file_prefix}/eventlog-2023-04-11--23-30.gz", - "LastModified": datetime.fromtimestamp(1681249588400 / 1000), - }, - ] - }, - {"Bucket": "bucket", "Prefix": s3_file_prefix}, - ) - - s3_stubber.add_response( - "get_object", - { - "ContentType": "application/octet-stream", - "ContentLength": 0, - "Body": StreamingBody(io.BytesIO(), 0), - }, - {"Bucket": "bucket", "Key": f"{s3_file_prefix}/eventlog"}, - ) - - s3_stubber.add_response( - "get_object", - { - "ContentType": "application/octet-stream", - "ContentLength": 0, - "Body": StreamingBody(io.BytesIO(), 0), - }, - {"Bucket": "bucket", "Key": f"{s3_file_prefix}/eventlog-2023-04-11--23-30.gz"}, - ) - - def client_patch(name, **kwargs): - if name == "ec2": - return ec2 - elif name == "s3": - return s3 - - with s3_stubber, ec2_stubber, patch("boto3.client") as mock_aws_client: - mock_aws_client.side_effect = client_patch - result = create_prediction_for_run("75778", "Premium", "Jobs Compute", "my-project-id") - - assert result.result diff --git a/tests/test_awsemr.py b/tests/test_awsemr.py deleted file mode 100644 index 862a341..0000000 --- a/tests/test_awsemr.py +++ /dev/null @@ -1,197 +0,0 @@ -import json -from unittest.mock import Mock, patch - -import boto3 as boto -from botocore.stub import ANY, Stubber -from dateutil.parser import parse -from deepdiff import DeepDiff - -from sync import TIME_FORMAT -from sync.awsemr import ( - create_prediction_for_cluster, - get_cluster_report, - get_project_cluster_report, -) -from sync.models import Response - - -@patch("sync.awsemr.get_cluster_report") -@patch("sync.awsemr.create_prediction") -def test_create_prediction(create_prediction, get_cluster_report): - with open("tests/data/emr-cluster-report.json") as emr_cluster_report_fobj: - get_cluster_report.return_value = Response( - result=json.loads(emr_cluster_report_fobj.read()) - ) - - prediction_id = "320554b0-3972-4b7c-9e41-c8efdbdc042c" - create_prediction.return_value = Response(result=prediction_id) - - s3 = boto.client("s3") - stubber = Stubber(s3) - - stubber.add_response( - "list_objects_v2", - { - "Contents": [ - { - "Key": "29f4dded-70be-4344-b9b5-396c8c0481cf/2023-03-07T04:14:28Z/f84639ed-7a6a-4496-81e1-b5ba8fa8b6ce/eventlog/application_1678162862227_0001" - } - ] - }, - { - "Bucket": "my-emr-projects", - "Prefix": "29f4dded-70be-4344-b9b5-396c8c0481cf/2023-03-07T04:14:28Z/f84639ed-7a6a-4496-81e1-b5ba8fa8b6ce/eventlog/", - }, - ) - - s3_mock = Mock(wraps=s3) - - # This method cannot be stubbed like list_objects_v2 - s3_mock.generate_presigned_url.return_value = ( - "https://my-emr-projects.s3.amazonaws.com/something/something" - ) - - with stubber, patch("boto3.client") as mock_client: - mock_client.return_value = s3_mock - response = create_prediction_for_cluster( - get_cluster_report.return_value.result["Cluster"]["Id"] - ) - - assert prediction_id == response.result - - -def test_get_cluster_report(): - with open("tests/data/emr-cluster-report.json") as emr_cluster_report_fobj: - emr_cluster_report = json.loads(emr_cluster_report_fobj.read()) - - cluster_id = emr_cluster_report["Cluster"]["Id"] - region = emr_cluster_report["Region"] - - emr = boto.client("emr") - stubber = Stubber(emr) - - describe_response = {"Cluster": emr_cluster_report["Cluster"].copy()} - del describe_response["Cluster"]["BootstrapActions"] - del describe_response["Cluster"]["InstanceFleets"] - - stubber.add_response("describe_cluster", describe_response, {"ClusterId": cluster_id}) - stubber.add_response( - "list_bootstrap_actions", - {"BootstrapActions": emr_cluster_report["Cluster"]["BootstrapActions"]}, - {"ClusterId": cluster_id}, - ) - stubber.add_response( - "list_instance_fleets", - {"InstanceFleets": emr_cluster_report["Cluster"]["InstanceFleets"]}, - {"ClusterId": cluster_id}, - ) - stubber.add_response( - "list_instances", {"Instances": emr_cluster_report["Instances"]}, {"ClusterId": cluster_id} - ) - stubber.add_response( - "list_steps", {"Steps": emr_cluster_report["Steps"]}, {"ClusterId": cluster_id} - ) - - with stubber, patch("boto3.client") as mock_client: - mock_client.return_value = emr - result = get_cluster_report(cluster_id, region).result - - assert not DeepDiff(emr_cluster_report, result) - - -@patch("sync.awsemr.get_cluster_report") -@patch("sync.awsemr.get_project") -def test_get_project_report(get_project, get_cluster_report): - with open("tests/data/emr-cluster-report.json") as emr_cluster_report_fobj: - cluster_report = json.loads(emr_cluster_report_fobj.read()) - get_cluster_report.return_value = Response(result=cluster_report) - - get_project.return_value = Response( - result={ - "created_at": "2023-01-20T00:38:10Z", - "updated_at": "2023-03-10T17:18:50Z", - "id": "4f5fe783-df74-4d64-adad-a635d6319579", - "name": "Data Insights", - "description": "My first project", - "cluster_log_url": "s3://megacorp-bucket/projects/emr", - "prediction_preference": "balanced", - } - ) - - s3 = boto.client("s3") - s3_stubber = Stubber(s3) - - run_timestamp = parse("2023-03-07T04:14:28Z") - last_modified = parse("2023-03-07T05:14:28Z") - project_id = "4f5fe783-df74-4d64-adad-a635d6319579" - run_id = "f84639ed-7a6a-4496-81e1-b5ba8fa8b6ce" - run_prefix = f"projects/emr/{project_id}/{run_timestamp.strftime(TIME_FORMAT)}/{run_id}" - event_log_key = f"{run_prefix}/eventlog/application_1678162862227_0001" - bucket = "megacorp-bucket" - - s3_stubber.add_response( - "list_objects_v2", - {"Contents": [{"Key": event_log_key, "LastModified": last_modified}]}, - { - "Bucket": bucket, - "Prefix": f"projects/emr/{project_id}/", - }, - ) - s3_stubber.add_response( - "put_object", - { - "ETag": '"14fe4f49fffffffffff9afbaaaaaaaa9"', - "ResponseMetadata": { - "HTTPHeaders": { - "content-length": "0", - "date": "Wed, 09 Apr 2022 " "20:35:42 GMT", - "etag": '"14fe4f49fffffffffff9afbaaaaaaaa9"', - "server": "AmazonS3", - }, - "HTTPStatusCode": 200, - "HostId": "GEHrJmjk76Ug/clCVUwimbmIjTTb2S4kU0lLg3Ylj8GKrAIsv5+S7AFb2cRkCLd+mpptmxfubLM=", - "RequestId": "A8FFFFFFF84C3A77", - "RetryAttempts": 0, - }, - "VersionId": "Dbc0gbLVEN4N5F4oz7Hhek0Xd82Mdgyo", - }, - {"Body": ANY, "Bucket": bucket, "Key": f"{run_prefix}/emr-cluster-report.json"}, - ) - - emr = boto.client("emr") - emr_stubber = Stubber(emr) - - cluster_id = "j-14QV64S2PV1Y2" - - emr_stubber.add_response( - "list_clusters", - { - "Clusters": [ - {"Id": cluster_id, "Status": {"StateChangeReason": {"Code": "ALL_STEPS_COMPLETED"}}} - ] - }, - { - "CreatedBefore": last_modified, - "CreatedAfter": run_timestamp, - "ClusterStates": ["TERMINATED"], - }, - ) - - emr_stubber.add_response( - "describe_cluster", - {"Cluster": {"Id": cluster_id, "Tags": [{"Key": "sync:run-id", "Value": run_id}]}}, - {"ClusterId": cluster_id}, - ) - - def client_patch(name, **kwargs): - if name == "s3": - return s3 - elif name == "emr": - return emr - - with s3_stubber, emr_stubber, patch("boto3.client") as mock_client: - mock_client.side_effect = client_patch - result = get_project_cluster_report(project_id).result - - assert not DeepDiff(cluster_report, result[0]) - assert f"s3://{bucket}/{event_log_key}" == result[1] From 0c7329529a3463666a0d228ee14a7e8fe8380f54 Mon Sep 17 00:00:00 2001 From: Sean Gorsky Date: Fri, 9 Feb 2024 20:04:34 -0500 Subject: [PATCH 02/18] delete predictions api functions --- sync/api/__init__.py | 25 +++- sync/api/predictions.py | 277 ----------------------------------- sync/api/projects.py | 28 +--- sync/asyncapi/__init__.py | 0 sync/asyncapi/predictions.py | 177 ---------------------- 5 files changed, 25 insertions(+), 482 deletions(-) delete mode 100644 sync/api/predictions.py delete mode 100644 sync/asyncapi/__init__.py delete mode 100644 sync/asyncapi/predictions.py diff --git a/sync/api/__init__.py b/sync/api/__init__.py index 86fa6ca..469860a 100644 --- a/sync/api/__init__.py +++ b/sync/api/__init__.py @@ -1,5 +1,7 @@ from sync.clients.sync import get_default_client -from sync.models import AccessReport, AccessReportLine, AccessStatusCode +from sync.models import AccessReport, AccessReportLine, AccessStatusCode, Response +import boto3 as boto +from urllib.parse import urlparse def get_access_report() -> AccessReport: @@ -31,3 +33,24 @@ def get_access_report() -> AccessReport: ) ] ) + +def generate_presigned_url(s3_url: str, expires_in_secs: int = 3600) -> Response[str]: + """Generates presigned HTTP URL for S3 URL + + :param s3_url: URL of object in S3 + :type s3_url: str + :param expires_in_secs: number of seconds after which presigned URL expires, defaults to 3600 + :type expires_in_secs: int, optional + :return: presigned URL + :rtype: Response[str] + """ + parsed_s3_url = urlparse(s3_url) + + s3 = boto.client("s3") + return Response( + result=s3.generate_presigned_url( + "get_object", + Params={"Bucket": parsed_s3_url.netloc, "Key": parsed_s3_url.path.lstrip("/")}, + ExpiresIn=expires_in_secs, + ) + ) diff --git a/sync/api/predictions.py b/sync/api/predictions.py deleted file mode 100644 index dd9be1c..0000000 --- a/sync/api/predictions.py +++ /dev/null @@ -1,277 +0,0 @@ -"""Prediction functions -""" - -import io -import logging -from time import sleep -from urllib.parse import urlparse - -import boto3 as boto -import httpx -from typing import List - -from sync.clients.sync import get_default_client -from sync.models import Platform, PredictionError, Response - -logger = logging.getLogger(__name__) - - -def get_products() -> Response[List[str]]: - """Get supported platforms - - :return: list of platform names - :rtype: Response[list[str]] - """ - response = get_default_client().get_products() - return Response(**response) - - -def generate_prediction( - platform: Platform, cluster_report: dict, eventlog_url: str, preference: str = None -) -> Response[dict]: - """Create and return prediction - - :param platform: e.g. "aws-emr" - :type platform: Platform - :param cluster_report: cluster report - :type cluster_report: dict - :param eventlog_url: Apache Spark event log URL - :type eventlog_url: str - :param preference: prediction preference, defaults to None - :type preference: str, optional - :return: prediction object - :rtype: Response[dict] - """ - response = create_prediction(platform, cluster_report, eventlog_url) - - prediction_id = response.result - if prediction_id: - return wait_for_prediction(prediction_id, preference) - - return response - - -def wait_for_prediction(prediction_id: str, preference: str = None) -> Response[dict]: - """Get a prediction, wait if it's not ready - - :param prediction_id: prediction ID - :type prediction_id: str - :param preference: prediction preference, defaults to None - :type preference: str, optional - :return: prediction object - :rtype: Response[dict] - """ - response = wait_for_final_prediction_status(prediction_id) - result = response.result - if result: - if result == "SUCCESS": - return get_prediction(prediction_id, preference) - - return Response(error=PredictionError(message="Prediction failed")) - - return response - - -def get_prediction(prediction_id: str, preference: str = None) -> Response[dict]: - """Get a prediction, don't wait - - :param prediction_id: prediction ID - :type prediction_id: str - :param preference: prediction preference, defaults to None - :type preference: str, optional - :return: prediction object - :rtype: Response[dict] - """ - response = get_default_client().get_prediction( - prediction_id, {"preference": preference} if preference else None - ) - - result = response.get("result") - if result: - return Response(result=result) - - return Response(**response) - - -def get_status(prediction_id: str) -> Response[str]: - """Get prediction status - - :param prediction_id: prediction ID - :type prediction_id: str - :return: prediction status, e.g. "SUCCESS" - :rtype: Response[str] - """ - response = get_default_client().get_prediction_status(prediction_id) - - result = response.get("result") - if result: - return Response(result=result["status"]) - - return Response(**response) - - -def get_predictions( - product: str = None, project_id: str = None, preference: str = None -) -> Response[List[dict]]: - """Get predictions - - :param product: platform to filter by, e.g. "aws-emr", defaults to None - :type product: str, optional - :param project_id: project to filter by, defaults to None - :type project_id: str, optional - :param preference: prediction preference, defaults to None - :type preference: str, optional - :return: list of prediction objects - :rtype: Response[list[dict]] - """ - params = {} - if product: - params["products"] = [product] - if project_id: - params["project_id"] = project_id - if preference: - params["preference"] = preference - - response = get_default_client().get_predictions(params) - - if response.get("result") is not None: - return Response(result=response["result"]) - - return Response(**response) - - -def wait_for_final_prediction_status(prediction_id: str) -> Response[str]: - """Wait for and return terminal prediction status - - :param prediction_id: prediction ID - :type prediction_id: str - :return: prediction status, e.g. "SUCCESS" - :rtype: Response[str] - """ - response = get_default_client().get_prediction_status(prediction_id) - while response: - result = response.get("result") - if result: - if result["status"] in ("SUCCESS", "FAILURE"): - return Response(result=result["status"]) - else: - return Response(**response) - - logger.info("Waiting for prediction") - sleep(10) - - response = get_default_client().get_prediction_status(prediction_id) - - return Response(error=PredictionError(message="Failed to get prediction status")) - - -def create_prediction_with_eventlog_bytes( - platform: Platform, - cluster_report: dict, - eventlog_name: str, - eventlog_bytes: bytes, - project_id: str = None, -) -> Response[str]: - """Creates a prediction giving event log bytes instead of a URL - - :param platform: platform, e.g. "aws-emr" - :type platform: Platform - :param cluster_report: cluster report - :type cluster_report: dict - :param eventlog_name: name of event log (extension is important) - :type eventlog_name: str - :param eventlog_bytes: encoded event log - :type eventlog_bytes: bytes - :param project_id: ID of project to which the prediction belongs, defaults to None - :type project_id: str, optional - :return: prediction ID - :rtype: Response[str] - """ - response = get_default_client().create_prediction( - { - "project_id": project_id, - "product_code": platform, - "configs": cluster_report, - } - ) - - if response.get("error"): - return Response(**response) - - upload_details = response["result"]["upload_details"] - log_response = httpx.post( - upload_details["url"], - data={ - **upload_details["fields"], - "key": upload_details["fields"]["key"].replace("${filename}", eventlog_name), - }, - files={"file": io.BytesIO(eventlog_bytes)}, - ) - if not log_response.status_code == httpx.codes.NO_CONTENT: - return Response(error=PredictionError(message="Failed to upload event log")) - - return Response(result=response["result"]["prediction_id"]) - - -def create_prediction( - platform: Platform, cluster_report: dict, eventlog_url: str, project_id: str = None -) -> Response[str]: - """Create prediction - - :param platform: platform, e.g. "aws-emr" - :type platform: Platform - :param cluster_report: cluster report - :type cluster_report: dict - :param eventlog_url: event log URL - :type eventlog_url: str - :param project_id: ID of project to which the prediction belongs, defaults to None - :type project_id: str, optional - :return: prediction ID - :rtype: Response[str] - """ - scheme = urlparse(eventlog_url).scheme - if scheme == "s3": - response = generate_presigned_url(eventlog_url) - if response.error: - return response - eventlog_http_url = response.result - elif scheme in {"http", "https"}: - eventlog_http_url = eventlog_url - else: - return Response(error=PredictionError(message="Unsupported event log URL scheme")) - - response = get_default_client().create_prediction( - { - "project_id": project_id, - "product_code": platform, - "eventlog_url": eventlog_http_url, - "configs": cluster_report, - } - ) - - if response.get("error"): - return Response(**response) - - return Response(result=response["result"]["prediction_id"]) - - -def generate_presigned_url(s3_url: str, expires_in_secs: int = 3600) -> Response[str]: - """Generates presigned HTTP URL for S3 URL - - :param s3_url: URL of object in S3 - :type s3_url: str - :param expires_in_secs: number of seconds after which presigned URL expires, defaults to 3600 - :type expires_in_secs: int, optional - :return: presigned URL - :rtype: Response[str] - """ - parsed_s3_url = urlparse(s3_url) - - s3 = boto.client("s3") - return Response( - result=s3.generate_presigned_url( - "get_object", - Params={"Bucket": parsed_s3_url.netloc, "Key": parsed_s3_url.path.lstrip("/")}, - ExpiresIn=expires_in_secs, - ) - ) diff --git a/sync/api/projects.py b/sync/api/projects.py index 3c93a2b..5d0a1c5 100644 --- a/sync/api/projects.py +++ b/sync/api/projects.py @@ -8,7 +8,7 @@ import httpx -from sync.api.predictions import generate_presigned_url, get_predictions +from . import generate_presigned_url from sync.clients.sync import get_default_client from sync.models import ( Platform, @@ -22,32 +22,6 @@ logger = logging.getLogger(__name__) -def get_prediction(project_id: str, preference: Preference = None) -> Response[dict]: - """Get the latest prediction of a project - - :param project_id: project ID - :type project_id: str - :param preference: preferred prediction solution, defaults to project setting - :type preference: Preference, optional - :return: prediction object - :rtype: Response[dict] - """ - project_response = get_project(project_id) - project = project_response.result - if project: - predictions_response = get_predictions( - project_id=project_id, preference=preference or project.get("preference") - ) - if predictions_response.error: - return predictions_response - - predictions = predictions_response.result - if predictions: - return Response(result=predictions[0]) - return Response(error=ProjectError(message="No predictions in the project")) - return project_response - - def create_project( name: str, product_code: str, diff --git a/sync/asyncapi/__init__.py b/sync/asyncapi/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/sync/asyncapi/predictions.py b/sync/asyncapi/predictions.py deleted file mode 100644 index 213ea9e..0000000 --- a/sync/asyncapi/predictions.py +++ /dev/null @@ -1,177 +0,0 @@ -import logging -from asyncio import sleep -from urllib.parse import urlparse - -from typing import List - -from sync.api.predictions import generate_presigned_url -from sync.clients.sync import get_default_async_client -from sync.models import Platform, PredictionError, Response - -logger = logging.getLogger(__name__) - - -async def generate_prediction( - platform: Platform, cluster_report: dict, eventlog_url: str, preference: str = None -) -> Response[dict]: - """Create and return prediction - - :param platform: e.g. "aws-emr" - :type platform: Platform - :param cluster_report: cluster report - :type cluster_report: dict - :param eventlog_url: Apache Spark event log URL - :type eventlog_url: str - :param preference: prediction preference, defaults to None - :type preference: str, optional - :return: prediction object - :rtype: Response[dict] - """ - response = await create_prediction(platform, cluster_report, eventlog_url) - - prediction_id = response.result - if prediction_id: - return await wait_for_prediction(prediction_id, preference) - - return response - - -async def wait_for_prediction(prediction_id: str, preference: str) -> Response[dict]: - """Get a prediction, wait if it's not ready - - :param prediction_id: prediction ID - :type prediction_id: str - :param preference: prediction preference, defaults to None - :type preference: str, optional - :return: prediction object - :rtype: Response[dict] - """ - response = await wait_for_final_prediction_status(prediction_id) - - result = response.result - if result: - if result == "SUCCESS": - return await get_prediction(prediction_id, preference) - - return Response(error=PredictionError(message="Prediction failed")) - - return response - - -async def get_prediction(prediction_id: str, preference: str = None) -> Response[dict]: - """Get a prediction, don't wait - - :param prediction_id: prediction ID - :type prediction_id: str - :param preference: prediction preference, defaults to None - :type preference: str, optional - :return: prediction object - :rtype: Response[dict] - """ - response = await get_default_async_client().get_prediction( - prediction_id, {"preference": preference} if preference else None - ) - - result = response.get("result") - if result: - return Response(result=result) - - return Response(**response) - - -async def get_predictions( - product: str = None, project_id: str = None, preference: str = None -) -> Response[List[dict]]: - """Get predictions - - :param product: platform to filter by, e.g. "aws-emr", defaults to None - :type product: str, optional - :param project_id: project to filter by, defaults to None - :type project_id: str, optional - :param preference: prediction preference, defaults to None - :type preference: str, optional - :return: list of prediction objects - :rtype: Response[list[dict]] - """ - params = {} - if product: - params["products"] = [product] - if project_id: - params["project_id"] = project_id - if preference: - params["preference"] = preference - - response = await get_default_async_client().get_predictions(params) - - if response.get("result") is not None: - return Response(result=response["result"]) - - return Response(**response) - - -async def wait_for_final_prediction_status(prediction_id: str) -> Response[str]: - """Wait for and return terminal prediction status - - :param prediction_id: prediction ID - :type prediction_id: str - :return: prediction status, e.g. "SUCCESS" - :rtype: Response[str] - """ - response = await get_default_async_client().get_prediction_status(prediction_id) - while response: - result = response.get("result") - if result: - if result["status"] in ("SUCCESS", "FAILURE"): - return Response(result=result["status"]) - else: - return Response(**response) - - logger.info("Waiting for prediction") - await sleep(10) - - response = await get_default_async_client().get_prediction_status(prediction_id) - - return Response(error=PredictionError(message="Failed to get prediction status")) - - -async def create_prediction( - platform: Platform, cluster_report: dict, eventlog_url: str, project_id: str = None -) -> Response[str]: - """Create prediction - - :param platform: platform, e.g. "aws-emr" - :type platform: Platform - :param cluster_report: cluster report - :type cluster_report: dict - :param eventlog_url: event log URL - :type eventlog_url: str - :param project_id: ID of project to which the prediction belongs, defaults to None - :type project_id: str, optional - :return: prediction ID - :rtype: Response[str] - """ - scheme = urlparse(eventlog_url).scheme - - if scheme == "s3": - response = generate_presigned_url(eventlog_url) - if response.error: - return response - eventlog_http_url = response.result - elif scheme in {"http", "https"}: - eventlog_http_url = eventlog_url - else: - return Response(error=PredictionError(message="Unsupported event log URL scheme")) - - response = await get_default_async_client().create_prediction( - { - "project_id": project_id, - "product_code": platform, - "eventlog_url": eventlog_http_url, - "configs": cluster_report, - } - ) - - if response.get("error"): - return Response(**response) - - return Response(result=response["result"]["prediction_id"]) From 4428165dce03c194c13f914f58c515b39e32cf34 Mon Sep 17 00:00:00 2001 From: Sean Gorsky Date: Fri, 9 Feb 2024 20:21:45 -0500 Subject: [PATCH 03/18] delete predictions cli elements --- sync/cli/__init__.py | 28 +----- sync/cli/_databricks.py | 140 +--------------------------- sync/cli/awsdatabricks.py | 8 -- sync/cli/awsemr.py | 145 ----------------------------- sync/cli/azuredatabricks.py | 8 -- sync/cli/predictions.py | 177 ------------------------------------ sync/cli/projects.py | 18 ---- 7 files changed, 2 insertions(+), 522 deletions(-) delete mode 100644 sync/cli/awsemr.py delete mode 100644 sync/cli/predictions.py diff --git a/sync/cli/__init__.py b/sync/cli/__init__.py index aa1b503..b8a68b7 100644 --- a/sync/cli/__init__.py +++ b/sync/cli/__init__.py @@ -5,10 +5,8 @@ import click -from sync.api.predictions import get_products -from sync.cli import awsdatabricks, awsemr, azuredatabricks, predictions, projects, workspaces +from sync.cli import awsdatabricks, azuredatabricks, projects, workspaces from sync.cli.util import OPTIONAL_DEFAULT -from sync.clients.sync import get_default_client from sync.config import API_KEY, CONFIG, DB_CONFIG, APIKey, Configuration, DatabricksConf, init from sync.models import Preference @@ -25,9 +23,7 @@ def main(debug: bool): logging.disable() -main.add_command(predictions.predictions) main.add_command(projects.projects) -main.add_command(awsemr.aws_emr) main.add_command(awsdatabricks.aws_databricks) main.add_command(azuredatabricks.azure_databricks) main.add_command(workspaces.workspaces) @@ -97,25 +93,3 @@ def configure( and dbx_region != OPTIONAL_DEFAULT else None, ) - - -@main.command -def products(): - """List supported products""" - products_response = get_products() - products = products_response.result - if products: - click.echo(", ".join(products)) - else: - click.echo(str(products_response.error), err=True) - - -@main.command -def token(): - """Get an API access token""" - sync_client = get_default_client() - response = sync_client.get_products() - if "result" in response: - click.echo(sync_client._client.auth._access_token) - else: - click.echo(f"{response['error']['code']}: {response['error']['message']}", err=True) diff --git a/sync/cli/_databricks.py b/sync/cli/_databricks.py index c0857e6..02c4203 100644 --- a/sync/cli/_databricks.py +++ b/sync/cli/_databricks.py @@ -9,8 +9,7 @@ get_project_submission, ) from sync.cli.util import validate_project -from sync.config import CONFIG -from sync.models import DatabricksComputeType, DatabricksPlanType, Platform, Preference +from sync.models import DatabricksComputeType, DatabricksPlanType, Platform from sync.utils.json import DateTimeEncoderNaiveUTC pass_platform = click.make_pass_decorator(Platform) @@ -31,111 +30,6 @@ def access_report(platform: Platform, log_url: str = None): click.echo(databricks.get_access_report(log_url)) -@click.command -@click.argument("job-id") -@click.argument("prediction-id") -@click.option( - "-p", - "--preference", - type=click.Choice([p.value for p in Preference]), - default=CONFIG.default_prediction_preference, -) -@pass_platform -def run_prediction(platform: Platform, job_id: str, prediction_id: str, preference: str = None): - """Apply a prediction to a job and run it""" - if platform is Platform.AWS_DATABRICKS: - import sync.awsdatabricks as databricks - elif platform is Platform.AZURE_DATABRICKS: - import sync.azuredatabricks as databricks - - run = databricks.run_prediction(job_id, prediction_id, preference) - - run_id = run.result - if run_id: - click.echo(f"Run ID: {run_id}") - else: - click.echo(str(run.error), err=True) - - -@click.command -@click.argument("job-id") -@click.option("--plan", type=click.Choice(DatabricksPlanType), default=DatabricksPlanType.STANDARD) -@click.option( - "--compute", - type=click.Choice(DatabricksComputeType), - default=DatabricksComputeType.JOBS_COMPUTE, -) -@click.option("--project", callback=validate_project) -@pass_platform -def run_job( - platform: Platform, - job_id: str, - plan: DatabricksPlanType, - compute: DatabricksComputeType, - project: dict = None, -): - """Run a job, wait for it to complete then create a prediction""" - if platform is Platform.AWS_DATABRICKS: - import sync.awsdatabricks as databricks - elif platform is Platform.AZURE_DATABRICKS: - import sync.azuredatabricks as databricks - - run_response = databricks.run_and_record_job(job_id, plan, compute, project["id"]) - prediction_id = run_response.result - if prediction_id: - click.echo(f"Prediction ID: {prediction_id}") - else: - click.echo(str(run_response.error), err=True) - - -@click.command -@click.argument("run-id") -@click.option("--plan", type=click.Choice(DatabricksPlanType), default=DatabricksPlanType.STANDARD) -@click.option( - "--compute", - type=click.Choice(DatabricksComputeType), - default=DatabricksComputeType.JOBS_COMPUTE, -) -@click.option( - "--project", - callback=validate_project, - help="The project ID for which to generate a cluster report, if any. This is most relevant to runs that may utilize multiple clusters.", -) -@click.option( - "--allow-incomplete", - is_flag=True, - default=False, - help="Force creation of a prediction even with incomplete cluster data. Some features may not be available. To ensure a complete cluster report see https://docs.synccomputing.com/sync-gradient/integrating-with-gradient/databricks-workflows.", -) -@click.option( - "--exclude-task", help="Don't consider task when finding the cluster of a run", multiple=True -) -@pass_platform -def create_prediction( - platform: Platform, - run_id: str, - plan: DatabricksPlanType, - compute: DatabricksComputeType, - project: dict = None, - allow_incomplete: bool = False, - exclude_task: Tuple[str, ...] = None, -): - """Create a prediction for a job run""" - if platform is Platform.AWS_DATABRICKS: - import sync.awsdatabricks as databricks - elif platform is Platform.AZURE_DATABRICKS: - import sync.azuredatabricks as databricks - - prediction_response = databricks.create_prediction_for_run( - run_id, plan, compute, project["id"], allow_incomplete, exclude_task - ) - prediction = prediction_response.result - if prediction: - click.echo(f"Prediction ID: {prediction}") - else: - click.echo(f"Failed to create prediction. {prediction_response.error}", err=True) - - @click.command @click.argument("run-id") @click.argument("project", callback=validate_project) @@ -290,38 +184,6 @@ def get_cluster_report( click.echo(f"Failed to create cluster report. {config_response.error}", err=True) -@click.command -@click.argument("job-id") -@click.argument("project-id") -@click.option("--prediction-id") -@click.option( - "-p", - "--preference", - type=click.Choice([p.value for p in Preference]), - default=CONFIG.default_prediction_preference, -) -@pass_platform -def apply_prediction( - platform: Platform, - job_id: str, - project_id: str, - prediction_id: str = None, - preference: str = None, -): - """Apply a prediction to a job""" - if platform is Platform.AWS_DATABRICKS: - import sync.awsdatabricks as databricks - elif platform is Platform.AZURE_DATABRICKS: - import sync.azuredatabricks as databricks - - response = databricks.apply_prediction(job_id, project_id, prediction_id, preference) - prediction_id = response.result - if prediction_id: - click.echo(f"Applied prediction {prediction_id} to job {job_id}") - else: - click.echo(f"Failed to apply prediction. {response.error}", err=True) - - @click.command @click.argument("job-id") @click.argument("project-id") diff --git a/sync/cli/awsdatabricks.py b/sync/cli/awsdatabricks.py index e71ef92..274d75d 100644 --- a/sync/cli/awsdatabricks.py +++ b/sync/cli/awsdatabricks.py @@ -2,17 +2,13 @@ from sync.cli._databricks import ( access_report, - apply_prediction, apply_recommendation, - create_prediction, create_recommendation, create_submission, get_cluster_report, get_recommendation, get_submission, monitor_cluster, - run_job, - run_prediction, ) from sync.models import Platform @@ -25,14 +21,10 @@ def aws_databricks(ctx: click.Context): aws_databricks.add_command(access_report) -aws_databricks.add_command(run_prediction) -aws_databricks.add_command(run_job) -aws_databricks.add_command(create_prediction) aws_databricks.add_command(create_submission) aws_databricks.add_command(create_recommendation) aws_databricks.add_command(get_recommendation) aws_databricks.add_command(get_submission) aws_databricks.add_command(apply_recommendation) aws_databricks.add_command(get_cluster_report) -aws_databricks.add_command(apply_prediction) aws_databricks.add_command(monitor_cluster) diff --git a/sync/cli/awsemr.py b/sync/cli/awsemr.py deleted file mode 100644 index 26cbc1a..0000000 --- a/sync/cli/awsemr.py +++ /dev/null @@ -1,145 +0,0 @@ -import json -from io import TextIOWrapper -from typing import Dict - -import click - -from sync import awsemr -from sync.api.predictions import get_prediction -from sync.cli.util import validate_project -from sync.config import CONFIG -from sync.models import Platform, Preference -from sync.utils.json import DateTimeEncoderNaiveUTC - - -@click.group -def aws_emr(): - """EMR commands""" - pass - - -@aws_emr.command -@click.option("--cluster-id") -@click.option("--log-url") -@click.option("-r", "--region") -def access_report(cluster_id: str = None, log_url: str = None, region: str = None): - """Get access report""" - click.echo(awsemr.get_access_report(log_url=log_url, cluster_id=cluster_id, region_name=region)) - - -@aws_emr.command -@click.argument("job-flow", type=click.File("r")) -@click.option("-p", "--project", callback=validate_project) -@click.option("-r", "--region") -def run_job_flow(job_flow: TextIOWrapper, project: dict = None, region: str = None): - """Run a job flow - - JOB_FLOW is a file containing the RunJobFlow request object""" - job_flow_obj = json.loads(job_flow.read()) - - run_response = awsemr.run_and_record_job_flow( - job_flow_obj, project["id"] if project else None, region - ) - prediction_id = run_response.result - if prediction_id: - click.echo(f"Run complete. Prediction ID: {prediction_id}") - else: - click.echo(str(run_response.error), err=True) - - -@aws_emr.command -@click.argument("prediction-id") -@click.option( - "-p", - "--preference", - type=click.Choice(Preference), - default=CONFIG.default_prediction_preference, -) -@click.option("-r", "--region") -def run_prediction(prediction_id: str, preference: Preference, region: str = None): - """Execute a prediction""" - prediction_response = get_prediction(prediction_id, preference.value) - prediction = prediction_response.result - if prediction: - config = prediction["solutions"][preference.value]["configuration"] - - if prediction["product_code"] == Platform.AWS_EMR: - cluster_response = awsemr.run_job_flow(config, prediction.get("project_id"), region) - cluster_id = cluster_response.result - if cluster_id: - click.echo(f"EMR cluster ID: {cluster_id}") - else: - click.echo(str(cluster_response.error), err=True) - else: - click.echo("Prediction is not for EMR", err=True) - else: - click.echo(str(prediction_response.error), err=True) - - -@aws_emr.command -@click.argument("cluster-id") -@click.option("-r", "--region") -def create_prediction(cluster_id: str, region: str = None): - """Create prediction for a cluster""" - prediction_response = awsemr.create_prediction_for_cluster(cluster_id, region) - prediction = prediction_response.result - if prediction: - click.echo(f"Prediction ID: {prediction}") - else: - click.echo(f"Failed to create prediction. {prediction_response.error}", err=True) - - -@aws_emr.command -@click.argument("project", callback=validate_project) -@click.option("-r", "--run-id") -@click.option("-r", "--region") -def create_project_prediction(project: Dict[str, str], run_id: str = None, region: str = None): - """Create prediction for the latest project cluster or one specified by --run-id""" - prediction_response = awsemr.create_project_prediction(project["id"], run_id, region) - prediction = prediction_response.result - if prediction: - click.echo(f"Prediction ID: {prediction}") - else: - click.echo(f"Failed to create prediction. {prediction_response.error}", err=True) - - -@aws_emr.command -@click.argument("project", callback=validate_project) -@click.option("-r", "--run-id") -@click.option("-r", "--region") -def create_submission(run_id: str, project: dict, region: str = None): - """Create a submission for a job run""" - submission_response = awsemr.create_submission(project["id"], run_id, region) - submission = submission_response.result - if submission: - click.echo(f"Submission ID: {submission}") - else: - click.echo(f"Failed to submit data. {submission_response.error}", err=True) - return - - -@aws_emr.command -@click.argument("cluster-id") -@click.option("-r", "--region") -def get_cluster_report(cluster_id: str, region: str = None): - """Get a cluster report""" - config_response = awsemr.get_cluster_report(cluster_id, region) - config = config_response.result - if config: - click.echo(json.dumps(config, indent=2, cls=DateTimeEncoderNaiveUTC)) - else: - click.echo(f"Failed to create prediction. {config_response.error}", err=True) - - -@aws_emr.command -@click.argument("cluster-id") -@click.argument("project", callback=validate_project) -@click.option("-r", "--region") -def record_run(cluster_id: str, project: str, region: str = None): - """Record a project run""" - response = awsemr.record_run(cluster_id, project["id"], region) - prediction_id = response.result - if prediction_id: - click.echo(f"Prediction ID: {prediction_id}") - else: - click.echo(str(response.error), err=True) diff --git a/sync/cli/azuredatabricks.py b/sync/cli/azuredatabricks.py index f74d9d1..bf01376 100644 --- a/sync/cli/azuredatabricks.py +++ b/sync/cli/azuredatabricks.py @@ -2,17 +2,13 @@ from sync.cli._databricks import ( access_report, - apply_prediction, apply_recommendation, - create_prediction, create_recommendation, create_submission, get_cluster_report, get_recommendation, get_submission, monitor_cluster, - run_job, - run_prediction, ) from sync.models import Platform @@ -25,14 +21,10 @@ def azure_databricks(ctx: click.Context): azure_databricks.add_command(access_report) -azure_databricks.add_command(run_prediction) -azure_databricks.add_command(run_job) -azure_databricks.add_command(create_prediction) azure_databricks.add_command(create_submission) azure_databricks.add_command(create_recommendation) azure_databricks.add_command(get_recommendation) azure_databricks.add_command(get_submission) azure_databricks.add_command(apply_recommendation) azure_databricks.add_command(get_cluster_report) -azure_databricks.add_command(apply_prediction) azure_databricks.add_command(monitor_cluster) diff --git a/sync/cli/predictions.py b/sync/cli/predictions.py deleted file mode 100644 index 8575ffb..0000000 --- a/sync/cli/predictions.py +++ /dev/null @@ -1,177 +0,0 @@ -import io -import json -from pathlib import Path -from urllib.parse import urlparse - -import boto3 as boto -import click - -from sync.api.predictions import ( - create_prediction, - create_prediction_with_eventlog_bytes, - get_prediction, - get_predictions, - get_status, - wait_for_prediction, -) -from sync.cli.util import validate_project -from sync.config import CONFIG -from sync.models import Platform, Preference -from sync.utils.json import DateTimeEncoderNaiveUTCDropMicroseconds - - -@click.group -def predictions(): - """Sync prediction commands""" - pass - - -@predictions.command -@click.argument("platform", type=click.Choice(Platform)) -@click.option("-e", "--event-log", metavar="URL/PATH", required=True) -@click.option("-r", "--report", metavar="URL/PATH", required=True) -@click.option("--project", callback=validate_project, help="project/app ID") -@click.option( - "--preference", - type=click.Choice(Preference), - default=CONFIG.default_prediction_preference, -) -@click.pass_context -def generate( - ctx: click.Context, - platform: Platform, - event_log: str, - report: str, - project: str, - preference: Preference, -): - """Create and retrieve a prediction""" - parsed_report_arg = urlparse(report) - if parsed_report_arg.scheme == "": - with open(report) as report_fobj: - report = json.loads(report_fobj.read()) - elif parsed_report_arg.scheme == "s3": - s3 = boto.client("s3") - report_io = io.BytesIO() - s3.download_fileobj(parsed_report_arg.netloc, parsed_report_arg.path.lstrip("/"), report_io) - report = json.loads(report_io.getvalue()) - else: - ctx.fail("Unsupported report argument") - - parsed_event_log_loc = urlparse(event_log) - event_log_path = None - event_log_url = None - if parsed_event_log_loc.scheme == "": - event_log_path = Path(event_log) - elif parsed_event_log_loc.scheme in {"s3", "http", "https"}: - event_log_url = event_log - else: - ctx.fail("Unsupported event log argument") - - if event_log_url: - response = create_prediction(platform, report, event_log_url, project["id"]) - elif event_log_path: - with open(event_log_path, "rb") as event_log_fobj: - response = create_prediction_with_eventlog_bytes( - platform, report, event_log_path.name, event_log_fobj.read(), project["id"] - ) - - prediction_id = response.result - if prediction_id: - click.echo(f"Prediction ID: {prediction_id}") - click.echo("Waiting for result...") - prediction_response = wait_for_prediction(prediction_id, preference.value) - prediction = prediction_response.result - if prediction: - click.echo( - json.dumps(prediction, indent=2, cls=DateTimeEncoderNaiveUTCDropMicroseconds) - ) - else: - click.echo(str(response.error), err=True) - else: - click.echo(str(response.error), err=True) - - -@predictions.command -@click.argument("platform", type=click.Choice(Platform)) -@click.option("-e", "--event-log", metavar="URL/PATH", required=True) -@click.option("-r", "--report", metavar="URL/PATH", required=True) -@click.option("-p", "--project", callback=validate_project, help="project/app ID") -@click.pass_context -def create(ctx: click.Context, platform: Platform, event_log: str, report: str, project: str): - """Create a prediction""" - parsed_report_arg = urlparse(report) - if parsed_report_arg.scheme == "": - with open(report) as report_fobj: - report = json.loads(report_fobj.read()) - elif parsed_report_arg.scheme == "s3": - s3 = boto.client("s3") - report_io = io.BytesIO() - s3.download_fileobj(parsed_report_arg.netloc, parsed_report_arg.path.lstrip("/"), report_io) - report = json.loads(report_io.getvalue()) - else: - ctx.fail("Unsupported report argument") - - parsed_event_log_loc = urlparse(event_log) - event_log_path = None - event_log_url = None - - if parsed_event_log_loc.scheme == "": - event_log_path = Path(event_log) - elif parsed_event_log_loc.scheme in {"s3", "http", "https"}: - event_log_url = event_log - else: - ctx.fail("Unsupported event log argument") - - if event_log_url: - response = create_prediction(platform, report, event_log_url, project["id"]) - elif event_log_path: - with open(event_log_path, "rb") as event_log_fobj: - response = create_prediction_with_eventlog_bytes( - platform, report, event_log_path.name, event_log_fobj.read(), project["id"] - ) - - prediction_id = response.result - if prediction_id: - click.echo(f"Prediction ID: {prediction_id}") - else: - click.echo(str(response.error), err=True) - - -@predictions.command -@click.argument("prediction-id") -def status(prediction_id: str): - """Get the status of a prediction""" - click.echo(get_status(prediction_id).result) - - -@predictions.command -@click.argument("prediction-id") -@click.option( - "-p", - "--preference", - type=click.Choice(Preference), - default=CONFIG.default_prediction_preference, -) -def get(prediction_id: str, preference: Preference): - """Retrieve a prediction""" - response = get_prediction(prediction_id, preference.value) - click.echo(json.dumps(response.result, indent=2, cls=DateTimeEncoderNaiveUTCDropMicroseconds)) - - -@predictions.command -@click.option("--platform", type=click.Choice(Platform)) -@click.option("--project", callback=validate_project, help="project/app ID") -def list(platform: Platform, project: dict = None): - """List predictions""" - response = get_predictions( - product=platform.value if platform else None, project_id=project["id"] - ) - predictions = response.result - if predictions: - click.echo_via_pager( - f"{p['created_at']} {p['prediction_id']} ({p.get('project_id', 'not part of a project'):^36}): {p['application_name']}\n" - for p in predictions - ) - else: - click.echo(str(response.error), err=True) diff --git a/sync/cli/projects.py b/sync/cli/projects.py index 118b709..f0aadfb 100644 --- a/sync/cli/projects.py +++ b/sync/cli/projects.py @@ -5,7 +5,6 @@ from sync.api.projects import ( create_project, delete_project, - get_prediction, get_project, get_projects, reset_project, @@ -180,20 +179,3 @@ def delete(project: dict): click.echo(response.result) else: click.echo(str(response.error), err=True) - - -@projects.command("get-prediction") -@click.argument("project", callback=validate_project) -@click.option( - "-p", - "--preference", - type=click.Choice(Preference), -) -def get_latest_prediction(project: dict, preference: Preference): - """Get the latest prediction in a project""" - prediction_response = get_prediction(project["id"], preference) - prediction = prediction_response.result - if prediction: - click.echo(json.dumps(prediction, indent=2, cls=DateTimeEncoderNaiveUTCDropMicroseconds)) - else: - click.echo(str(prediction_response.error), err=True) From a84dee658f4a4f54290f025518e58aad0a24bbd4 Mon Sep 17 00:00:00 2001 From: Sean Gorsky Date: Fri, 9 Feb 2024 21:38:44 -0500 Subject: [PATCH 04/18] tidy the api folder --- sync/api/__init__.py | 7 +++++-- sync/api/projects.py | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/sync/api/__init__.py b/sync/api/__init__.py index 469860a..7da4d98 100644 --- a/sync/api/__init__.py +++ b/sync/api/__init__.py @@ -1,7 +1,9 @@ +from urllib.parse import urlparse + +import boto3 as boto + from sync.clients.sync import get_default_client from sync.models import AccessReport, AccessReportLine, AccessStatusCode, Response -import boto3 as boto -from urllib.parse import urlparse def get_access_report() -> AccessReport: @@ -34,6 +36,7 @@ def get_access_report() -> AccessReport: ] ) + def generate_presigned_url(s3_url: str, expires_in_secs: int = 3600) -> Response[str]: """Generates presigned HTTP URL for S3 URL diff --git a/sync/api/projects.py b/sync/api/projects.py index 5d0a1c5..7780197 100644 --- a/sync/api/projects.py +++ b/sync/api/projects.py @@ -8,7 +8,6 @@ import httpx -from . import generate_presigned_url from sync.clients.sync import get_default_client from sync.models import ( Platform, @@ -19,6 +18,8 @@ SubmissionError, ) +from . import generate_presigned_url + logger = logging.getLogger(__name__) From ee29973eb35a33f3ad72c9d21be5fc7f7506f3b0 Mon Sep 17 00:00:00 2001 From: Sean Gorsky Date: Fri, 9 Feb 2024 21:39:28 -0500 Subject: [PATCH 05/18] delete predictions from sync clients --- sync/clients/sync.py | 55 +------------------------------------------- 1 file changed, 1 insertion(+), 54 deletions(-) diff --git a/sync/clients/sync.py b/sync/clients/sync.py index da8b785..8063a40 100644 --- a/sync/clients/sync.py +++ b/sync/clients/sync.py @@ -1,5 +1,5 @@ import logging -from typing import Generator, List +from typing import Generator import httpx @@ -60,34 +60,6 @@ def __init__(self, api_url: str, api_key: APIKey): ) ) - def get_products(self) -> dict: - return self._send(self._client.build_request("GET", "/v1/autotuner/products")) - - def create_prediction(self, prediction: dict) -> dict: - headers, content = encode_json(prediction) - return self._send( - self._client.build_request( - "POST", "/v1/autotuner/predictions", headers=headers, content=content - ) - ) - - def get_prediction(self, prediction_id, params: dict = None) -> dict: - return self._send( - self._client.build_request( - "GET", f"/v1/autotuner/predictions/{prediction_id}", params=params - ) - ) - - def get_predictions(self, params: dict = None) -> List[dict]: - return self._send( - self._client.build_request("GET", "/v1/autotuner/predictions", params=params) - ) - - def get_prediction_status(self, prediction_id) -> dict: - return self._send( - self._client.build_request("GET", f"/v1/autotuner/predictions/{prediction_id}/status") - ) - def create_project(self, project: dict) -> dict: headers, content = encode_json(project) return self._send( @@ -253,31 +225,6 @@ def __init__(self, api_url: str, api_key: APIKey): ) ) - async def create_prediction(self, prediction: dict) -> dict: - headers, content = encode_json(prediction) - return await self._send( - self._client.build_request( - "POST", "/v1/autotuner/predictions", headers=headers, content=content - ) - ) - - async def get_prediction(self, prediction_id, params: dict = None) -> dict: - return await self._send( - self._client.build_request( - "GET", f"/v1/autotuner/predictions/{prediction_id}", params=params - ) - ) - - async def get_predictions(self, params: dict = None) -> dict: - return await self._send( - self._client.build_request("GET", "/v1/autotuner/predictions", params=params) - ) - - async def get_prediction_status(self, prediction_id) -> dict: - return await self._send( - self._client.build_request("GET", f"/v1/autotuner/predictions/{prediction_id}/status") - ) - async def create_project(self, project: dict) -> dict: headers, content = encode_json(project) return await self._send( From e6cf431adb75d0193bc13df41477215122a661ff Mon Sep 17 00:00:00 2001 From: Sean Gorsky Date: Fri, 9 Feb 2024 21:49:03 -0500 Subject: [PATCH 06/18] delete the bloat --- sync/_databricks.py | 1004 +++------------------------------------ sync/asyncawsemr.py | 51 -- sync/awsdatabricks.py | 40 -- sync/awsemr.py | 765 ----------------------------- sync/azuredatabricks.py | 42 -- 5 files changed, 59 insertions(+), 1843 deletions(-) delete mode 100644 sync/asyncawsemr.py delete mode 100644 sync/awsemr.py diff --git a/sync/_databricks.py b/sync/_databricks.py index 27d0131..26dc03d 100644 --- a/sync/_databricks.py +++ b/sync/_databricks.py @@ -15,96 +15,15 @@ import boto3 as boto -from sync.api.predictions import ( - create_prediction_with_eventlog_bytes, - get_prediction, - get_predictions, - wait_for_final_prediction_status, -) -from sync.api.projects import ( - create_project_recommendation, - create_project_submission_with_eventlog_bytes, - get_project, - get_project_recommendation, - wait_for_recommendation, -) +from sync.api import projects from sync.clients.databricks import get_default_client from sync.config import CONFIG -from sync.models import ( - DatabricksAPIError, - DatabricksClusterReport, - DatabricksError, - PredictionError, - Response, -) +from sync.models import DatabricksAPIError, DatabricksClusterReport, DatabricksError, Response from sync.utils.dbfs import format_dbfs_filepath, read_dbfs_file logger = logging.getLogger(__name__) -def create_prediction( - plan_type: str, - compute_type: str, - cluster: dict, - cluster_events: dict, - eventlog: bytes, - instances: dict = None, - instance_timelines: dict = None, - volumes: dict = None, - tasks: List[dict] = None, - project_id: str = None, -) -> Response[str]: - """Create a Databricks prediction - - :param plan_type: either "Standard", "Premium" or "Enterprise" - :type plan_type: str - :param compute_type: e.g. "Jobs Compute" - :type compute_type: str - :param cluster: The Databricks cluster definition as defined by - - https://docs.databricks.com/dev-tools/api/latest/clusters.html#get - :type cluster: dict - :param cluster_events: All events, including paginated events, for the cluster as defined by - - https://docs.databricks.com/dev-tools/api/latest/clusters.html#events - If the cluster is a long-running cluster, this should only include events relevant to the time window that a - run occurred in. - :type cluster_events: dict - :param eventlog: encoded event log zip - :type eventlog: bytes - :param instances: All EC2 Instances that were a part of the cluster. Expects a data format as is returned by - `boto3's EC2.describe_instances API `_ - Instances should be narrowed to just those instances relevant to the Databricks Run. This can be done by passing - a `tag:ClusterId` filter to the describe_instances call like - - ``Filters=[{"Name": "tag:ClusterId", "Values": ["my-dbx-clusterid"]}]`` - If there are multiple pages of instances, all pages should be accumulated into 1 dictionary and passed to this - function - :type instances: dict, optional - :param volumes: The EBS volumes that were attached to this cluster - :type volumes: dict, optional - :param tasks: The Databricks Tasks associated with the cluster - :type tasks: List[dict] - :param project_id: Sync project ID, defaults to None - :type project_id: str, optional - :return: prediction ID - :rtype: Response[str] - """ - return create_prediction_with_eventlog_bytes( - get_default_client().get_platform(), - { - "plan_type": plan_type, - "compute_type": compute_type, - "cluster": cluster, - "cluster_events": cluster_events, - "instances": instances, - "instance_timelines": instance_timelines, - "volumes": volumes, - "tasks": tasks, - }, - "eventlog.zip", - eventlog, - project_id, - ) - - def create_cluster(config: dict) -> Response[str]: """Create Databricks cluster from the provided configuration. @@ -122,107 +41,6 @@ def create_cluster(config: dict) -> Response[str]: return Response(result=response["cluster_id"]) -def get_cluster(cluster_id: str) -> Response[dict]: - """Get Databricks cluster. - - :param cluster_id: cluster ID - :type cluster_id: str - :return: cluster object - :rtype: Response[dict] - """ - cluster = get_default_client().get_cluster(cluster_id) - if "error_code" in cluster: - return Response(error=DatabricksAPIError(**cluster)) - - return Response(result=cluster) - - -def create_prediction_for_run( - run_id: str, - plan_type: str, - compute_type: str, - project_id: str = None, - allow_incomplete_cluster_report: bool = False, - exclude_tasks: Union[Collection[str], None] = None, -) -> Response[str]: - """Create a prediction for the specified Databricks run. - - :param run_id: Databricks run ID - :type run_id: str - :param plan_type: either "Standard", "Premium" or "Enterprise" - :type plan_type: str - :param compute_type: e.g. "Jobs Compute" - :type compute_type: str - :param project_id: Sync project ID, defaults to None - :type project_id: str, optional - :param allow_incomplete_cluster_report: Whether creating a prediction with incomplete cluster report data should be allowable - :type allow_incomplete_cluster_report: bool, optional, defaults to False - :param exclude_tasks: Keys of tasks (task names) to exclude from the prediction - :type exclude_tasks: Collection[str], optional, defaults to None - :return: prediction ID - :rtype: Response[str] - """ - run = get_default_client().get_run(run_id) - - if "error_code" in run: - return Response(error=DatabricksAPIError(**run)) - - cluster_path = None - if project_id: - project_response = get_project(project_id) - if project_response.error: - return project_response - cluster_path = project_response.result.get("cluster_path") - - project_cluster_tasks = _get_project_cluster_tasks(run, project_id, cluster_path, exclude_tasks) - - cluster_tasks = None - if project_id: - cluster_tasks = project_cluster_tasks.get(project_id) - elif len(project_cluster_tasks) == 1: - cluster_tasks = next(iter(project_cluster_tasks.values())) - - if not cluster_tasks: - return Response( - error=DatabricksError( - message=f"Failed to locate cluster in run {run_id} for project {project_id}" - ) - ) - - cluster_id, tasks = cluster_tasks - - return _create_prediction( - cluster_id, tasks, plan_type, compute_type, project_id, allow_incomplete_cluster_report - ) - - -def _create_prediction( - cluster_id: str, - tasks: List[dict], - plan_type: str, - compute_type: str, - project_id: str = None, - allow_incomplete_cluster_report: bool = False, -): - run_information_response = _get_run_information( - cluster_id, - tasks, - plan_type, - compute_type, - allow_incomplete_cluster_report=allow_incomplete_cluster_report, - ) - - if run_information_response.error: - return run_information_response - - cluster_report, eventlog = run_information_response.result - return create_prediction( - **cluster_report.dict(exclude_none=True), - eventlog=eventlog, - project_id=project_id, - ) - - def create_submission_for_run( run_id: str, plan_type: str, @@ -253,7 +71,7 @@ def create_submission_for_run( if "error_code" in run: return Response(error=DatabricksAPIError(**run)) - project_response = get_project(project_id) + project_response = projects.get_project(project_id) if project_response.error: return project_response cluster_path = project_response.result.get("cluster_path") @@ -296,7 +114,7 @@ def _create_submission( return run_information_response cluster_report, eventlog = run_information_response.result - return create_project_submission_with_eventlog_bytes( + return projects.create_project_submission_with_eventlog_bytes( get_default_client().get_platform(), cluster_report.dict(exclude_none=True), "eventlog.zip", @@ -372,7 +190,7 @@ def get_cluster_report( cluster_path = None if project_id: - project_response = get_project(project_id) + project_response = projects.get_project(project_id) if project_response.error: return project_response cluster_path = project_response.result.get("cluster_path") @@ -457,7 +275,7 @@ def handle_successful_job_run( return submission_response for project_id, submission_id in submission_response.result.items(): - project_response = get_project(project_id) + project_response = projects.get_project(project_id) if project_response.error: logger.error(f"Failed to retrieve project {project_id} - {project_response.error}") @@ -473,14 +291,6 @@ def handle_successful_job_run( logger.error( f"Failed to create and apply project {project_id} recommendation to job {job_id} - {recommendation_response.error}" ) - elif project["project_model_id"] == "AUTOTUNER": - prediction_response = wait_for_and_apply_prediction( - project_id, submission_id, job_id - ) - if prediction_response.error: - logger.error( - f"Failed to apply prediction {submission_id} to job {job_id} - {prediction_response.error}" - ) else: logger.error( f"Unexpected project_model_id for project {project_id}: {project['project_model_id']}" @@ -499,14 +309,14 @@ def create_and_apply_project_recommendation(project_id: str, job_id: str) -> Res :return: ID of applied recommendation :rtype: Response[str] """ - recommendation_response = create_project_recommendation(project_id) + recommendation_response = projects.create_project_recommendation(project_id) if recommendation_response.error: return recommendation_response recommendation_id = recommendation_response.result - recommendation_wait_response = wait_for_recommendation(project_id, recommendation_id) + recommendation_wait_response = projects.wait_for_recommendation(project_id, recommendation_id) if recommendation_wait_response.error: return recommendation_wait_response @@ -514,34 +324,6 @@ def create_and_apply_project_recommendation(project_id: str, job_id: str) -> Res return apply_project_recommendation(job_id, project_id, recommendation_id) -def wait_for_and_apply_prediction( - project_id: str, prediction_id: str, job_id: str -) -> Response[str]: - """Wait for prediction and apply it to the job - - :param project_id: ID of project for job - :type project_id: str - :param prediction_id: ID of project for job - :type prediction_id: str - :param job_id: ID of job to which the recommendation should be applied - :type job_id: str - :return: ID of applied recommendation - :rtype: Response[str] - """ - prediction_status_response = wait_for_final_prediction_status(prediction_id) - - if prediction_status_response.error: - return prediction_status_response - - prediction_status = prediction_status_response.result - if prediction_status == "SUCCESS": - return apply_prediction(job_id, project_id, prediction_id) - - return Response( - error=PredictionError(f"Prediction {prediction_id} failed. Status: {prediction_status}") - ) - - def record_run( run_id: str, plan_type: str, @@ -577,7 +359,7 @@ def record_run( cluster_path = None if project_id: - project_response = get_project(project_id) + project_response = projects.get_project(project_id) if project_response.error: return project_response cluster_path = project_response.result.get("cluster_path") @@ -610,7 +392,7 @@ def _record_project_clusters( """Creates project submissions/predictions and returns a map of project IDs to the new submissions/predictions IDs""" result_ids = {} for cluster_project_id, (cluster_id, tasks) in project_cluster_tasks.items(): - project_response = get_project(cluster_project_id) + project_response = projects.get_project(cluster_project_id) if project_response.error: logger.error( @@ -629,15 +411,6 @@ def _record_project_clusters( cluster_project_id, allow_incomplete_cluster_report, ) - elif project["project_model_id"] == "AUTOTUNER": - submission_response = _create_prediction( - cluster_id, - tasks, - plan_type, - compute_type, - cluster_project_id, - allow_incomplete_cluster_report, - ) else: logger.error( f"Unexpected project_model_id for project {cluster_project_id}: {project['project_model_id']}" @@ -655,170 +428,6 @@ def _record_project_clusters( return result_ids -def apply_prediction( - job_id: str, project_id: str, prediction_id: str = None, preference: str = None -): - """Updates jobs with prediction configuration - - :param job_id: ID of job to apply prediction to - :type job_id: str - :param project_id: Sync project ID - :type project_id: str - :param prediction_id: Sync prediction ID, defaults to latest in project - :type prediction_id: str, optional - :param preference: Prediction preference, defaults to "recommended" then "economy" - :type preference: str, optional - :return: ID of applied prediction - :rtype: Response[str] - """ - if prediction_id: - prediction_response = get_prediction(prediction_id, preference) - else: - predictions_response = get_predictions(project_id=project_id) - if predictions_response.error: - return predictions_response - prediction_id = predictions_response.result[0]["prediction_id"] - prediction_response = get_prediction(prediction_id, preference) - - if prediction_response.error: - return prediction_response - - prediction = prediction_response.result - - databricks_client = get_default_client() - - job = databricks_client.get_job(job_id) - job_clusters = _get_project_job_clusters(job) - - project_cluster = job_clusters.get(project_id) - if not project_cluster: - if len(job_clusters) == 1: - project_cluster = next(iter(job_clusters.values())) - else: - return Response( - error=DatabricksError( - message=f"Failed to locate cluster in job {job_id} for project {project_id}" - ) - ) - - project_cluster_path, _ = project_cluster - - if preference: - prediction_cluster = prediction["solutions"][preference]["configuration"] - else: - prediction_cluster = prediction["solutions"].get( - "recommended", prediction["solutions"]["economy"] - )["configuration"] - - if "cluster_name" in prediction_cluster: - del prediction_cluster["cluster_name"] - - if project_cluster_path[0] == "job_clusters": - new_settings = { - "job_clusters": [ - {"job_cluster_key": project_cluster_path[1], "new_cluster": prediction_cluster} - ] - } - else: - new_settings = { - "tasks": [{"task_key": project_cluster_path[1], "new_cluster": prediction_cluster}] - } - - response = databricks_client.update_job(job_id, new_settings) - - if "error_code" in response: - return Response(error=DatabricksAPIError(**response)) - - return Response(result=prediction_id) - - -def get_prediction_job( - job_id: str, prediction_id: str, preference: str = CONFIG.default_prediction_preference.value -) -> Response[dict]: - """Apply the prediction to the specified job. - - The basis job can only have tasks that run on the same cluster. That cluster is updated with the - configuration from the prediction and returned in the result job configuration. Use this function - to apply a prediction to an existing job or test a prediction with a one-off run. - - :param job_id: basis job ID - :type job_id: str - :param prediction_id: prediction ID - :type prediction_id: str - :param preference: preferred prediction solution, defaults to local configuration - :type preference: str, optional - :return: job object with prediction applied to it - :rtype: Response[dict] - """ - job = get_default_client().get_job(job_id) - if "error_code" in job: - return Response(error=DatabricksAPIError(**job)) - - job_settings = job["settings"] - tasks = job_settings.get("tasks", []) - if tasks: - cluster_response = _get_job_cluster(tasks, job_settings.get("job_clusters", [])) - cluster = cluster_response.result - if cluster: - prediction_cluster_response = get_prediction_cluster(cluster, prediction_id, preference) - prediction_cluster = prediction_cluster_response.result - if prediction_cluster: - cluster_key = tasks[0].get("job_cluster_key") - if cluster_key: - job_settings["job_clusters"] = [ - j - for j in job_settings["job_clusters"] - if j.get("job_cluster_key") != cluster_key - ] + [{"job_cluster_key": cluster_key, "new_cluster": prediction_cluster}] - else: - # For `new_cluster` definitions, Databricks will automatically assign the newly created cluster a name, - # and will reject any run submissions where the `cluster_name` is pre-populated - if "cluster_name" in prediction_cluster: - del prediction_cluster["cluster_name"] - tasks[0]["new_cluster"] = prediction_cluster - return Response(result=job) - return prediction_cluster_response - return cluster_response - return Response(error=DatabricksError(message="No task found in job")) - - -def get_prediction_cluster( - cluster: dict, prediction_id: str, preference: str = CONFIG.default_prediction_preference.value -) -> Response[dict]: - """Apply the prediction to the provided cluster. - - The cluster is updated with configuration from the prediction and returned in the result. - - :param cluster: Databricks cluster object - :type cluster: dict - :param prediction_id: prediction ID - :type prediction_id: str - :param preference: preferred prediction solution, defaults to local configuration - :type preference: str, optional - :return: job object with prediction applied to it - :rtype: Response[dict] - """ - prediction_response = get_prediction(prediction_id) - prediction = prediction_response.result - if prediction: - # num_workers/autoscale are mutually exclusive settings, and we are relying on our Prediction - # Recommendations to set these appropriately. Since we may recommend a Static cluster (i.e. a cluster - # with `num_workers`) for a cluster that was originally autoscaled, we want to make sure to remove this - # prior configuration - if "num_workers" in cluster: - del cluster["num_workers"] - - if "autoscale" in cluster: - del cluster["autoscale"] - - prediction_cluster = _deep_update( - cluster, prediction["solutions"][preference]["configuration"] - ) - - return Response(result=prediction_cluster) - return prediction_response - - def apply_project_recommendation( job_id: str, project_id: str, recommendation_id: str ) -> Response[str]: @@ -877,57 +486,6 @@ def apply_project_recommendation( return Response(result=recommendation_id) -def get_recommendation_job(job_id: str, project_id: str, recommendation_id: str) -> Response[dict]: - """Apply the recommendation to the specified job. - - The basis job can only have tasks that run on the same cluster. That cluster is updated with the - configuration from the prediction and returned in the result job configuration. Use this function - to apply a prediction to an existing job or test a prediction with a one-off run. - - :param job_id: basis job ID - :type job_id: str - :param project_id: Sync project ID - :type project_id: str - :param recommendation_id: recommendation ID - :type recommendation_id: str - :return: job object with recommendation applied to it - :rtype: Response[dict] - """ - job = get_default_client().get_job(job_id) - - if "error_code" in job: - return Response(error=DatabricksAPIError(**job)) - - job_settings = job["settings"] - tasks = job_settings.get("tasks", []) - if tasks: - cluster_response = _get_job_cluster(tasks, job_settings.get("job_clusters", [])) - cluster = cluster_response.result - if cluster: - recommendation_cluster_response = get_recommendation_cluster( - cluster, project_id, recommendation_id - ) - recommendation_cluster = recommendation_cluster_response.result - if recommendation_cluster: - cluster_key = tasks[0].get("job_cluster_key") - if cluster_key: - job_settings["job_clusters"] = [ - j - for j in job_settings["job_clusters"] - if j.get("job_cluster_key") != cluster_key - ] + [{"job_cluster_key": cluster_key, "new_cluster": recommendation_cluster}] - else: - # For `new_cluster` definitions, Databricks will automatically assign the newly created cluster a name, - # and will reject any run submissions where the `cluster_name` is pre-populated - if "cluster_name" in recommendation_cluster: - del recommendation_cluster["cluster_name"] - tasks[0]["new_cluster"] = recommendation_cluster - return Response(result=job) - return recommendation_cluster_response - return cluster_response - return Response(error=DatabricksError(message="No task found in job")) - - def get_recommendation_cluster( cluster: dict, project_id: str, recommendation_id: str ) -> Response[dict]: @@ -944,7 +502,7 @@ def get_recommendation_cluster( :return: cluster object with prediction applied to it :rtype: Response[dict] """ - recommendation_response = get_project_recommendation(project_id, recommendation_id) + recommendation_response = projects.get_project_recommendation(project_id, recommendation_id) recommendation = recommendation_response.result.get("recommendation") if recommendation: # num_workers/autoscale are mutually exclusive settings, and we are relying on our Prediction @@ -963,51 +521,6 @@ def get_recommendation_cluster( return recommendation_response -def get_project_job(job_id: str, project_id: str, region_name: str = None) -> Response[dict]: - """Apply project configuration to a job. - - The job can only have tasks that run on the same job cluster. That cluster is updated with tags - and a log configuration to facilitate project continuity. The result can be tested in a - one-off run or applied to an existing job to surface run-time (see :py:func:`~run_job_object`) or cost optimizations. - - :param job_id: ID of basis job - :type job_id: str - :param project_id: Sync project ID - :type project_id: str - :param region_name: region name, defaults to AWS configuration - :type region_name: str, optional - :return: project job object - :rtype: Response[dict] - """ - job = get_default_client().get_job(job_id) - if "error_code" in job: - return Response(error=DatabricksAPIError(**job)) - - job_settings = job["settings"] - tasks = job_settings.get("tasks", []) - if tasks: - cluster_response = _get_job_cluster(tasks, job_settings.get("job_clusters", [])) - cluster = cluster_response.result - if cluster: - project_cluster_response = get_project_cluster(cluster, project_id, region_name) - project_cluster = project_cluster_response.result - if project_cluster: - cluster_key = tasks[0].get("job_cluster_key") - if cluster_key: - job_settings["job_clusters"] = [ - j - for j in job_settings["job_clusters"] - if j.get("job_cluster_key") != cluster_key - ] + [{"job_cluster_key": cluster_key, "new_cluster": project_cluster}] - else: - tasks[0]["new_cluster"] = project_cluster - - return Response(result=job) - return project_cluster_response - return cluster_response - return Response(error=DatabricksError(message="No task found in job")) - - def get_project_cluster(cluster: dict, project_id: str, region_name: str = None) -> Response[dict]: """Apply project configuration to a cluster. @@ -1044,7 +557,7 @@ def get_project_cluster_settings(project_id: str, region_name: str = None) -> Re :return: project cluster settings - a subset of a Databricks cluster object :rtype: Response[dict] """ - project_response = get_project(project_id) + project_response = projects.get_project(project_id) project = project_response.result if project: result = { @@ -1083,399 +596,6 @@ def get_project_cluster_settings(project_id: str, region_name: str = None) -> Re return project_response -def run_job_object(job: dict) -> Response[Tuple[str, str]]: - """Create a Databricks one-off run based on the job configuration. - - :param job: Databricks job object - :type job: dict - :return: run ID, and optionally ID of newly created cluster - :rtype: Response[Tuple[str, str]] - """ - tasks = job["settings"]["tasks"] - cluster_response = _get_job_cluster(tasks, job["settings"].get("job_clusters", [])) - - cluster = cluster_response.result - if cluster: - new_cluster_id = None - if len(tasks) == 1: - # For `new_cluster` definitions, Databricks will automatically assign the newly created cluster a name, - # and will reject any run submissions where the `cluster_name` is pre-populated - if "cluster_name" in cluster: - del cluster["cluster_name"] - - tasks[0]["new_cluster"] = cluster - del tasks[0]["job_cluster_key"] - else: - # If the original Job has a pre-existing Policy, we want to remove this from the `create_cluster` payload, - # since we are not allowed to create clusters with certain policies via that endpoint, e.g. we cannot - # create a `Job Compute` cluster via this endpoint. - if "policy_id" in cluster: - del cluster["policy_id"] - - # Create an "All-Purpose Compute" cluster - cluster["cluster_name"] = cluster["cluster_name"] or job["settings"]["name"] - cluster["autotermination_minutes"] = 10 # 10 minutes is the minimum - - cluster_result = get_default_client().create_cluster(cluster) - if "error_code" in cluster_result: - return Response(error=DatabricksAPIError(**cluster_result)) - - new_cluster_id = cluster_result["cluster_id"] - - for task in tasks: - task["existing_cluster_id"] = cluster_result["cluster_id"] - if "new_cluster" in task: - del task["new_cluster"] - if "job_cluster_key" in task: - del task["job_cluster_key"] - - run_result = get_default_client().create_run( - {"run_name": job["settings"]["name"], "tasks": tasks} - ) - if "error_code" in run_result: - return Response(error=DatabricksAPIError(**run_result)) - - return Response(result=(run_result["run_id"], new_cluster_id)) - return cluster_response - - -def run_prediction(job_id: str, prediction_id: str, preference: str) -> Response[str]: - """Create a one-off Databricks run based on the prediction applied to the job. - - :param job_id: job ID - :type job_id: str - :param prediction_id: prediction ID - :type prediction_id: str - :param preference: preferred prediction solution - :type preference: str - :return: run ID - :rtype: Response[str] - """ - prediction_job_response = get_prediction_job(job_id, prediction_id, preference) - prediction_job = prediction_job_response.result - if prediction_job: - run_response = run_job_object(prediction_job) - if run_response.result: - return Response(result=run_response.result[0]) - return run_response - return prediction_job_response - - -def create_run(run: dict) -> Response[str]: - """Creates a run based off the incoming Databricks run configuration - - :param run: run object - :type run: dict - :return: run ID - :rtype: Response[str] - """ - run_result = get_default_client().create_run(run) - if "error_code" in run_result: - return Response(error=DatabricksAPIError(**run_result)) - - return Response(result=run_result["run_id"]) - - -def run_and_record_prediction_job( - job_id: str, - prediction_id: str, - plan_type: str, - compute_type: str, - project_id: str = None, - preference: str = CONFIG.default_prediction_preference.value, -) -> Response[str]: - """Run a prediction applied to the specified job and record the result. - - This function waits for the run to complete before creating a new prediction based on that run. - If a project is specified the new prediction is added to it. - - :param job_id: basis job ID - :type job_id: str - :param prediction_id: project ID - :type prediction_id: str - :param plan_type: either "Standard", "Premium" or "Enterprise" - :type plan_type: str - :param compute_type: e.g. "Jobs Compute" - :type compute_type: str - :param project_id: Sync project ID, defaults to None - :type project_id: str, optional - :param preference: preferred prediction solution, defaults to local configuration - :type preference: str, optional - :return: prediction ID - :rtype: Response[str] - """ - prediction_job_response = get_prediction_job(job_id, prediction_id, preference) - prediction_job = prediction_job_response.result - if prediction_job: - return run_and_record_job_object(prediction_job, plan_type, compute_type, project_id) - return prediction_job_response - - -def run_and_record_project_job( - job_id: str, project_id: str, plan_type: str, compute_type: str, region_name: str = None -) -> Response[str]: - """Runs the specified job and adds the result to the project. - - This function waits for the run to complete. - - :param job_id: Databricks job ID - :type job_id: str - :param project_id: Sync project ID - :type project_id: str - :param plan_type: either "Standard", "Premium" or "Enterprise" - :type plan_type: str - :param compute_type: e.g. "Jobs Compute" - :type compute_type: str - :param region_name: region name, defaults to AWS configuration - :type region_name: str, optional - :return: prediction ID - :rtype: Response[str] - """ - project_job_response = get_project_job(job_id, project_id, region_name) - project_job = project_job_response.result - if project_job: - return run_and_record_job_object(project_job, plan_type, compute_type, project_id) - return project_job_response - - -def run_and_record_job( - job_id: str, plan_type: str, compute_type: str, project_id: str = None -) -> Response[str]: - """Runs the specified job and creates a prediction based on the result. - - If a project is specified the prediction is added to it. - - :param job_id: Databricks job ID - :type job_id: str - :param plan_type: either "Standard", "Premium" or "Enterprise" - :type plan_type: str - :param compute_type: e.g. "Jobs Compute" - :type compute_type: str - :param project_id: Sync project ID, defaults to None - :type project_id: str, optional - :return: prediction ID - :rtype: Response[str] - """ - # creates a "Jobs Compute" cluster - run_result = get_default_client().create_job_run({"job_id": job_id}) - if "error_code" in run_result: - return Response(error=DatabricksAPIError(**run_result)) - - run_id = run_result["run_id"] - return wait_for_and_record_run(run_id, plan_type, compute_type, project_id) - - -def run_and_record_job_object( - job: dict, plan_type: str, compute_type: str, project_id: str = None -) -> Response[str]: - """Creates a one-off Databricks run based on the provided job object. - - Job tasks must use the same job cluster, and that cluster must be configured to store the - event logs in S3. - - :param job: Databricks job object - :type job: dict - :param plan_type: either "Standard", "Premium" or "Enterprise" - :type plan_type: str - :param compute_type: e.g. "Jobs Compute" - :type compute_type: str - :param project_id: Sync project ID, defaults to None - :type project_id: str, optional - :return: prediction ID - :rtype: Response[str] - """ - run_response = run_job_object(job) - run_and_cluster_ids = run_response.result - if run_and_cluster_ids: - response = wait_for_run_and_cluster(run_and_cluster_ids[0]) - result_state = response.result - if result_state: - if result_state == "SUCCESS": - response = record_run(run_and_cluster_ids[0], plan_type, compute_type, project_id) - else: - response = Response( - error=DatabricksError(message=f"Unsuccessful run result state: {result_state}") - ) - - for cluster_id in run_and_cluster_ids[1:]: - delete_cluster_response = get_default_client().delete_cluster(cluster_id) - if "error_code" in delete_cluster_response: - logger.warning( - f"Failed to delete cluster {cluster_id}: {delete_cluster_response['error_code']}: {delete_cluster_response['message']}" - ) - - return response - return run_response - - -def create_and_record_run( - run: dict, plan_type: str, compute_type: str, project_id: str = None -) -> Response[str]: - """Applies the Databricks run configuration and creates a prediction based on the result. - - If a project is specified the resulting prediction is added to it. This function waits for - run to complete. - - :param run: Databricks run configuration - :type run: dict - :param plan_type: either "Standard", "Premium" or "Enterprise" - :type plan_type: str - :param compute_type: e.g. "Jobs Compute" - :type compute_type: str - :param project_id: Sync project ID, defaults to None - :type project_id: str, optional - :return: prediction ID - :rtype: Response[str] - """ - run_response = create_run(run) - run_id = run_response.result - if run_id: - return wait_for_and_record_run(run_id, plan_type, compute_type, project_id) - return run_response - - -def wait_for_and_record_run( - run_id: str, plan_type: str, compute_type: str, project_id: str = None -) -> Response[str]: - """Waits for a run to complete before creating a prediction. - - The run must save 1 event log to S3. If a project is specified the prediction is added - to that project. - - :param run_id: Databricks run ID - :type run_id: str - :param plan_type: either "Standard", "Premium" or "Enterprise" - :type plan_type: str - :param compute_type: e.g. "Jobs Compute" - :type compute_type: str - :param project_id: Sync project ID, defaults to None - :type project_id: str, optional - :return: prediction ID - :rtype: Response[str] - """ - wait_response = wait_for_final_run_status(run_id) - result_state = wait_response.result - if result_state: - if result_state == "SUCCESS": - return record_run(run_id, plan_type, compute_type, project_id) - return Response( - error=DatabricksError(message=f"Unsuccessful run result state: {result_state}") - ) - return wait_response - - -def create_and_wait_for_run(run: dict) -> Response[str]: - """Creates a Databricks run from the incoming configuration and returns the final status. - - This function waits for the run to complete. - - :param run: Databricks run configuration - :type run: dict - :return: result state, e.g. "SUCCESS" - :rtype: Response[str] - """ - run_response = create_run(run) - if run_response.error: - return run_response - - return wait_for_final_run_status(run_response.result) - - -def wait_for_final_run_status(run_id: str) -> Response[str]: - """Waits for run returning final status. - - :param run_id: Databricks run ID - :type run_id: str - :return: result state, e.g. "SUCCESS" - :rtype: Response[str] - """ - run = get_default_client().get_run(run_id) - while "error_code" not in run: - result_state = run["state"].get("result_state") # result_state isn't present while running - if result_state in {"SUCCESS", "FAILED", "TIMEDOUT", "CANCELED"}: - return Response(result=result_state) - - sleep(30) - run = get_default_client().get_run(run_id) - - return Response(error=DatabricksAPIError(**run)) - - -def wait_for_run_and_cluster(run_id: str) -> Response[str]: - """Waits for final run status and returns it after terminating the cluster. - - :param run_id: Databricks run ID - :type run_id: str - :return: result state, e.g. "SUCCESS" - :rtype: Response[str] - """ - run = get_default_client().get_run(run_id) - while "error_code" not in run: - result_state = run["state"].get("result_state") # result_state isn't present while running - if result_state in {"SUCCESS", "FAILED", "TIMEDOUT", "CANCELED"}: - for cluster_id in {task.get("existing_cluster_id") for task in run["tasks"]}: - cluster_response = terminate_cluster(cluster_id) - if cluster_response.error: - return cluster_response - return Response(result=result_state) - - sleep(30) - run = get_default_client().get_run(run_id) - - return Response(error=DatabricksAPIError(**run)) - - -def terminate_cluster(cluster_id: str) -> Response[dict]: - """Terminate Databricks cluster and wait to return final state. - - :param cluster_id: Databricks cluster ID - :type cluster_id: str - :return: Databricks cluster object with state: "TERMINATED" - :rtype: Response[str] - """ - cluster = get_default_client().get_cluster(cluster_id) - if "error_code" not in cluster: - state = cluster.get("state") - if state == "TERMINATED": - return Response(result=cluster) - elif state == "TERMINATING": - return _wait_for_cluster_termination(cluster_id) - elif state in {"PENDING", "RUNNING", "RESTARTING", "RESIZING"}: - get_default_client().terminate_cluster(cluster_id) - return _wait_for_cluster_termination(cluster_id) - else: - return Response(error=DatabricksError(message=f"Unexpected cluster state: {state}")) - - return Response(error=DatabricksAPIError(**cluster)) - - -def _wait_for_cluster_termination( - cluster_id: str, timeout_seconds=600, poll_seconds=10 -) -> Response[dict]: - logging.info(f"Waiting for cluster {cluster_id} to terminate") - start_seconds = time.time() - cluster = get_default_client().get_cluster(cluster_id) - while "error_code" not in cluster: - state = cluster.get("state") - if state == "TERMINATED": - return Response(result=cluster) - elif state == "TERMINATING": - sleep(poll_seconds) - else: - return Response(error=DatabricksError(message=f"Unexpected cluster state: {state}")) - - if time.time() - start_seconds > timeout_seconds: - return Response( - error=DatabricksError( - message=f"Cluster failed to terminate after waiting {timeout_seconds} seconds" - ) - ) - - cluster = get_default_client().get_cluster(cluster_id) - - return Response(error=DatabricksAPIError(**cluster)) - - def _cluster_log_destination( cluster: dict, ) -> Union[Tuple[str, str, str, str], Tuple[None, None, None, None]]: @@ -1500,20 +620,6 @@ def _cluster_log_destination( return None, None, None, None -def _get_job_cluster(tasks: List[dict], job_clusters: list) -> Response[dict]: - if len(tasks) == 1: - return _get_task_cluster(tasks[0], job_clusters) - - if [t.get("job_cluster_key") for t in tasks].count(tasks[0].get("job_cluster_key")) == len( - tasks - ): - for cluster in job_clusters: - if cluster["job_cluster_key"] == tasks[0].get("job_cluster_key"): - return Response(result=cluster["new_cluster"]) - return Response(error=DatabricksError(message="No cluster found for task")) - return Response(error=DatabricksError(message="Not all tasks use the same cluster")) - - def _get_project_job_clusters( job: dict, exclude_tasks: Union[Collection[str], None] = None, @@ -1669,22 +775,6 @@ def _get_run_spark_context_id(tasks: List[dict]) -> Response[str]: return Response(error=DatabricksError(message="More than 1 cluster found for tasks")) -def _get_task_cluster(task: dict, clusters: list) -> Response[dict]: - cluster = task.get("new_cluster") - - if not cluster: - cluster_matches = [ - candidate - for candidate in clusters - if candidate["job_cluster_key"] == task.get("job_cluster_key") - ] - if cluster_matches: - cluster = cluster_matches[0]["new_cluster"] - else: - return Response(error=DatabricksError(message="No cluster found for task")) - return Response(result=cluster) - - def _s3_contents_have_all_rollover_logs(contents: List[dict], run_end_time_seconds: float): final_rollover_log = contents and next( ( @@ -1744,11 +834,6 @@ def _check_total_file_size_changed( return True, new_total_file_size -def _event_log_poll_duration_seconds(): - """Convenience function to aid testing""" - return 15 - - def _get_eventlog_from_s3( cluster_id: str, bucket: str, @@ -1904,7 +989,7 @@ def _get_eventlog( # https://docs.databricks.com/clusters/configure.html#cluster-log-delivery-1 # So we will poll this location for *up to* 5 minutes until we see all the eventlog files we are expecting # in the S3 bucket - poll_duration_seconds = _event_log_poll_duration_seconds() + poll_duration_seconds = 15 if filesystem == "s3": return _get_eventlog_from_s3( @@ -1926,6 +1011,29 @@ def _get_eventlog( return Response(error=DatabricksError(message=f"Unknown log destination: {filesystem}")) +KeyType = TypeVar("KeyType") + + +def _deep_update( + mapping: Dict[KeyType, Any], *updating_mappings: Dict[KeyType, Any] +) -> Dict[KeyType, Any]: + updated_mapping = mapping.copy() + for updating_mapping in updating_mappings: + for k, v in updating_mapping.items(): + if k in updated_mapping: + if isinstance(updated_mapping[k], dict) and isinstance(v, dict): + updated_mapping[k] = _deep_update(updated_mapping[k], v) + elif isinstance(updated_mapping[k], list) and isinstance(v, list): + updated_mapping[k] += v + else: + updated_mapping[k] = v + else: + updated_mapping[k] = v + return updated_mapping + + +# The methods below here are all called within the "subclass scripts" +# awsdatabricks.py and azuredatabricks.py def _get_all_cluster_events(cluster_id: str): """Fetches all ClusterEvents for a given Databricks cluster, optionally within a time window. Pages will be followed and returned as 1 object @@ -1995,22 +1103,28 @@ def _update_monitored_timelines( return active_timelines_by_id, retired_inst_timeline_list -KeyType = TypeVar("KeyType") +def _wait_for_cluster_termination( + cluster_id: str, timeout_seconds=600, poll_seconds=10 +) -> Response[dict]: + logging.info(f"Waiting for cluster {cluster_id} to terminate") + start_seconds = time.time() + cluster = get_default_client().get_cluster(cluster_id) + while "error_code" not in cluster: + state = cluster.get("state") + if state == "TERMINATED": + return Response(result=cluster) + elif state == "TERMINATING": + sleep(poll_seconds) + else: + return Response(error=DatabricksError(message=f"Unexpected cluster state: {state}")) + if time.time() - start_seconds > timeout_seconds: + return Response( + error=DatabricksError( + message=f"Cluster failed to terminate after waiting {timeout_seconds} seconds" + ) + ) -def _deep_update( - mapping: Dict[KeyType, Any], *updating_mappings: Dict[KeyType, Any] -) -> Dict[KeyType, Any]: - updated_mapping = mapping.copy() - for updating_mapping in updating_mappings: - for k, v in updating_mapping.items(): - if k in updated_mapping: - if isinstance(updated_mapping[k], dict) and isinstance(v, dict): - updated_mapping[k] = _deep_update(updated_mapping[k], v) - elif isinstance(updated_mapping[k], list) and isinstance(v, list): - updated_mapping[k] += v - else: - updated_mapping[k] = v - else: - updated_mapping[k] = v - return updated_mapping + cluster = get_default_client().get_cluster(cluster_id) + + return Response(error=DatabricksAPIError(**cluster)) diff --git a/sync/asyncawsemr.py b/sync/asyncawsemr.py deleted file mode 100644 index ad9d5df..0000000 --- a/sync/asyncawsemr.py +++ /dev/null @@ -1,51 +0,0 @@ -from sync.asyncapi.predictions import create_prediction, wait_for_prediction -from sync.awsemr import _get_eventlog_url_from_cluster_report, get_cluster_report -from sync.models import Platform, Response - - -async def get_prediction_for_cluster( - cluster_id: str, preference: str = None, region_name: str = None -) -> Response[dict]: - """Creates a prediction (see :py:func:`~create_prediction_for_cluster`) and returns it when it's ready. - - :param cluster_id: EMR cluster ID - :type cluster_id: str - :param preference: preferred solution defaults to None - :type preference: str, optional - :param region_name: AWS region name, defaults to None - :type region_name: str, optional - :return: prediction ID - :rtype: Response[str] - """ - prediction_response = await create_prediction_for_cluster(cluster_id, region_name) - if prediction_response.error: - return prediction_response - - return await wait_for_prediction(prediction_response.result, preference) - - -async def create_prediction_for_cluster(cluster_id: str, region_name: str = None) -> Response[str]: - """If the cluster terminated successfully with an event log available in S3 a prediction based - on such is created and its ID returned. - - :param cluster_id: EMR cluster ID - :type cluster_id: str - :param region_name: AWS region name, defaults to None - :type region_name: str, optional - :return: prediction ID - :rtype: Response[str] - """ - report_response = get_cluster_report(cluster_id, region_name) - cluster_report = report_response.result - if cluster_report: - eventlog_response = _get_eventlog_url_from_cluster_report(cluster_report) - if eventlog_response.error: - return eventlog_response - - eventlog_http_url = eventlog_response.result - if eventlog_http_url: - return await create_prediction(Platform.AWS_EMR, cluster_report, eventlog_http_url) - - return eventlog_response - - return report_response diff --git a/sync/awsdatabricks.py b/sync/awsdatabricks.py index 2ad9f5c..f937216 100644 --- a/sync/awsdatabricks.py +++ b/sync/awsdatabricks.py @@ -15,34 +15,14 @@ _get_cluster_instances_from_dbfs, _update_monitored_timelines, _wait_for_cluster_termination, - apply_prediction, apply_project_recommendation, - create_and_record_run, - create_and_wait_for_run, create_cluster, - create_prediction_for_run, - create_run, create_submission_for_run, - get_cluster, get_cluster_report, - get_prediction_cluster, - get_prediction_job, get_project_cluster, get_project_cluster_settings, - get_project_job, - get_recommendation_job, handle_successful_job_run, record_run, - run_and_record_job, - run_and_record_job_object, - run_and_record_prediction_job, - run_and_record_project_job, - run_job_object, - run_prediction, - terminate_cluster, - wait_for_and_record_run, - wait_for_final_run_status, - wait_for_run_and_cluster, ) from sync.api import get_access_report as get_api_access_report from sync.clients.databricks import get_default_client @@ -60,34 +40,14 @@ __all__ = [ "get_access_report", - "run_prediction", - "run_and_record_job", - "create_prediction_for_run", "create_submission_for_run", "get_cluster_report", "monitor_cluster", "create_cluster", - "get_cluster", "handle_successful_job_run", "record_run", - "get_prediction_job", - "get_prediction_cluster", - "get_project_job", "get_project_cluster", "get_project_cluster_settings", - "get_recommendation_job", - "run_job_object", - "create_run", - "run_and_record_prediction_job", - "run_and_record_project_job", - "run_and_record_job_object", - "create_and_record_run", - "wait_for_and_record_run", - "create_and_wait_for_run", - "wait_for_final_run_status", - "wait_for_run_and_cluster", - "terminate_cluster", - "apply_prediction", "apply_project_recommendation", ] diff --git a/sync/awsemr.py b/sync/awsemr.py deleted file mode 100644 index a8f0035..0000000 --- a/sync/awsemr.py +++ /dev/null @@ -1,765 +0,0 @@ -""" -Utilities for interacting with EMR -""" - -import datetime -import io -import json -import logging -import re -from copy import deepcopy -from typing import Tuple -from urllib.parse import urlparse -from uuid import uuid4 - -import boto3 as boto -from dateutil.parser import parse as dateparse - -from sync import TIME_FORMAT -from sync.api import get_access_report as get_api_access_report -from sync.api.predictions import create_prediction, wait_for_prediction -from sync.api.projects import create_project_submission, get_project -from sync.models import ( - AccessReport, - AccessReportLine, - AccessStatusCode, - EMRError, - Platform, - ProjectError, - Response, -) -from sync.utils.json import DateTimeEncoderNaiveUTCDropMicroseconds - -logger = logging.getLogger(__name__) - -RUN_DIR_PATTERN_TEMPLATE = r"{project_prefix}/{project_id}/(?P\d{{4}}-[^/]+)/{run_id}" - - -def get_access_report( - log_url: str = None, cluster_id: str = None, region_name: str = None -) -> AccessReport: - """Reports access to systems required for integration of EMR jobs with Sync. - Access is partially determined by the configuration of this library and boto3. - - :param log_url: location of event logs, defaults to None - :type log_url: str, optional - :param cluster_id: cluster ID with which to test EMR access, defaults to None - :type cluster_id: str, optional - :param region_name: region override, defaults to None - :type region_name: str, optional - :return: access report - :rtype: AccessReport - """ - report = get_api_access_report() - sts = boto.client("sts") - response = sts.get_caller_identity() - - arn = response.get("Arn") - if not arn: - report.append( - AccessReportLine( - name="AWS Authentication", - status=AccessStatusCode.RED, - message="Failed to authenticate AWS credentials", - ) - ) - else: - report.append( - AccessReportLine( - name="AWS Authentication", - status=AccessStatusCode.GREEN, - message=f"Authenticated as '{arn}'", - ) - ) - - if log_url: - parsed_log_url = urlparse(log_url) - - if parsed_log_url.scheme == "s3" and arn: - s3 = boto.client("s3") - report.add_boto_method_call( - s3.list_objects_v2, - Bucket=parsed_log_url.netloc, - Prefix=parsed_log_url.params.rstrip("/"), - MaxKeys=1, - ) - else: - report.append( - AccessReportLine( - name="Logging", - status=AccessStatusCode.RED, - message=f"scheme in {parsed_log_url.geturl()} is not supported", - ) - ) - - if arn and cluster_id: - emr = boto.client("emr", region_name=region_name) - - try: - response = emr.describe_cluster(ClusterId=cluster_id) - report.append( - AccessReportLine( - "EMR DescribeCluster", - AccessStatusCode.GREEN, - "describe_cluster call succeeded", - ) - ) - - if response["Cluster"]["InstanceCollectionType"] == "INSTANCE_FLEET": - report.add_boto_method_call(emr.list_instance_fleets, ClusterId=cluster_id) - elif response["Cluster"]["InstanceCollectionType"] == "INSTANCE_GROUP": - report.add_boto_method_call(emr.list_instance_groups, ClusterId=cluster_id) - - report.add_boto_method_call(emr.list_bootstrap_actions, ClusterId=cluster_id) - report.add_boto_method_call(emr.list_instances, ClusterId=cluster_id) - report.add_boto_method_call(emr.list_steps, ClusterId=cluster_id) - - except Exception as exc: - report.append(AccessReportLine("EMR DescribeCluster", AccessStatusCode.RED, str(exc))) - - return report - - -def get_project_job_flow(job_flow: dict, project_id: str) -> Response[dict]: - """Returns a copy of the incoming job flow with project configuration. - - These tags are added: - - 1. sync:run-id - 2. sync:project-id - - Additionally, if a location in S3 is configured for the project the job flow will be configured to store S3 logs there. - The event log URL follows this pattern - - {``s3_project_url``}/{project ID}/{timestamp}/{run ID}/eventlog/ - - :param job_flow: RunJobFlow request object - :type job_flow: dict - :param project_id: project ID - :type project_id: str - :return: RunJobFlow with project configuration - :rtype: Response[dict] - """ - result_job_flow = deepcopy(job_flow) - project_response = get_project(project_id) - _project = project_response.result - if _project: - # Add project ID tag - run_id = str(uuid4()) - tags = {tag["Key"]: tag["Value"] for tag in result_job_flow.get("Tags", [])} - tags["sync:project-id"] = project_id - tags["sync:run-id"] = run_id - result_job_flow["Tags"] = [{"Key": tag[0], "Value": tag[1]} for tag in tags.items()] - - s3_url = _project.get("cluster_log_url") - if s3_url: - parsed_project_url = urlparse(f"{s3_url.strip('/')}/{project_id}") - eventlog_props = { - "spark.eventLog.dir": f"s3a://{parsed_project_url.netloc}/{parsed_project_url.path.strip('/')}/{datetime.datetime.utcnow().strftime(TIME_FORMAT)}/{run_id}/eventlog/", - "spark.eventLog.enabled": "true", - } - for config in result_job_flow.get("Configurations", []): - if config.get("Classification") == "spark-defaults": - config["Properties"] = {**config.get("Properties", {}), **eventlog_props} - break - else: - result_job_flow["Configurations"] = result_job_flow.get("Configurations", []) + [ - {"Classification": "spark-defaults", "Properties": eventlog_props} - ] - - return Response(result=result_job_flow) - - return project_response - - -def get_project_prediction( - project_id: str, run_id: str = None, preference: str = None, region_name: str = None -) -> Response[dict]: - """Finds the latest run in a project or one with the ID if provided (see :py:func:`~get_project_cluster_report`) and returns a prediction based on it. - - The project must be configured with an S3 URL. - - :param project_id: project ID - :type project_id: str - :param run_id: run ID, defaults to None - :type run_id: str, optional - :param preference: preferred solution defaults to None - :type preference: str, optional - :param region_name: AWS region name, defaults to AWS configuration - :type region_name: str, optional - :return: Sync prediction - :rtype: Response[dict] - """ - prediction_response = create_project_prediction(project_id, run_id, preference, region_name) - if prediction_response.error: - return prediction_response - - return wait_for_prediction(prediction_response.result, preference) - - -def run_project_prediction( - project_id: str, run_id: str = None, preference: str = None, region_name: str = None -) -> Response[str]: - """Applies the latest prediction for a project, or one based on the run ID if provided (see :py:func:`~get_project_prediction`). - Returns the ID of the newly created cluster. - - :param project_id: project ID - :type project_id: str - :param run_id: project run ID, defaults to None - :type run_id: str, optional - :param preference: preferred prediction solution, defaults to None - :type preference: str, optional - :param region_name: AWS region name, defaults to AWS configuration - :type region_name: str, optional - :return: cluster ID - :rtype: Response[str] - """ - project_response = get_project(project_id) - project = project_response.result - if project: - response = get_project_prediction( - project_id, - run_id, - preference or project.get("prediction_preference", "balanced"), - region_name, - ) - if response.result: - return run_job_flow( - response.result["solutions"][ - preference or project.get("prediction_preference", "balanced") - ]["configuration"] - ) - return project_response - - -def record_run(cluster_id: str, project_id: str, region_name: str = None) -> Response[str]: - """Adds a report of the cluster to the project's S3 location if it has one, and - adds to the project a prediction based on such returning the ID. - - :param cluster_id: EMR cluster ID - :type cluster_id: str - :param project_id: project ID - :type project_id: str - :param region_name: region name, defaults to None - :type region_name: str, optional - :return: prediction ID - :rtype: Response[str] - """ - report_response = get_cluster_report(cluster_id, region_name) - if report_response.error: - return report_response - - cluster_report = report_response.result - - eventlog_url = _get_eventlog_url_from_cluster_report(cluster_report).result - if eventlog_url: - run_dir = _get_existing_run_dir_from_cluster_config(cluster_report, project_id).result - if run_dir: - if _upload_object( - cluster_report, - f"{run_dir}/emr-cluster-report.json", - ).error: - logger.warning("Failed to save configuration") - - # Start prediction - return create_prediction(Platform.AWS_EMR, cluster_report, eventlog_url, project_id) - - return Response(error=EMRError(message="Failed to find event log")) - - -def create_project_prediction( - project_id: str, run_id: str = None, region_name: str = None -) -> Response[str]: - """Finds the latest run in a project or one with the ID if provided (see :py:func:`~get_project_cluster_report`) - and creates a prediction based on it returning the ID. - - The project must be configured with an S3 URL. - - :param project_id: project ID - :type project_id: str - :param run_id: run ID, defaults to None - :type run_id: str, optional - :param region_name: AWS region name, defaults to AWS configuration - :type region_name: str, optional - :return: Sync prediction ID - :rtype: Response[str] - """ - response = get_project_cluster_report(project_id, run_id, region_name) - if response.error: - return response - - config, eventlog_url = response.result - return create_prediction(Platform.AWS_EMR, config, eventlog_url, project_id) - - -def create_submission( - project_id: str, run_id: str = None, region_name: str = None -) -> Response[str]: - """Gets the report and event log URL for the latest cluster in the project or the one identified by the `run_id` if provided. - - The project must be configured with an S3 URL. - - :param project_id: project ID - :type project_id: str - :param run_id: run ID, defaults to None - :type run_id: str, optional - :param region_name: AWS region name, defaults to AWS configuration - :type region_name: str, optional - :return: a Submission ID - :rtype: Response[Tuple[dict, str]] - """ - response = get_project_cluster_report(project_id, run_id, region_name) - if response.error: - return response - - config, eventlog_url = response.result - return create_project_submission(Platform.AWS_EMR, config, eventlog_url, project_id) - - -def get_project_cluster_report( # noqa: C901 - project_id: str, run_id: str = None, region_name: str = None -) -> Response[Tuple[dict, str]]: - """Gets the report and event log URL for the latest cluster in the project or the one identified by the `run_id` if provided. - - The project must be configured with an S3 URL. - - :param project_id: project ID - :type project_id: str - :param run_id: run ID, defaults to None - :type run_id: str, optional - :param region_name: AWS region name, defaults to AWS configuration - :type region_name: str, optional - :return: a tuple containing the cluster configuration and an event log URL - :rtype: Response[Tuple[dict, str]] - """ - project_response = get_project(project_id) - - project = project_response.result - if project: - project_url = project.get("cluster_log_url") - if project_url: - parsed_project_url = urlparse(f"{project_url}/{project['id']}") - project_prefix = parsed_project_url.path.strip("/") - - s3 = boto.client("s3") - contents = s3.list_objects_v2( - Bucket=parsed_project_url.netloc, Prefix=project_prefix + "/" - ).get("Contents") - if contents: - eventlog_pattern = re.compile( - rf"{project_prefix}/(?P\d{{4}}-[^/]+)/(?P{run_id or '[a-zA-Z0-9-]+'})/eventlog/application_[\d_]+$" - ) - - event_logs = [] - for content in contents: - match = eventlog_pattern.match(content["Key"]) - if match: - event_logs.append((content, match)) - - event_logs.sort(key=lambda x: x[0]["LastModified"], reverse=True) - for log_content, log_match in event_logs: - log_key = log_content["Key"] - config_key = f"{log_key[:log_key.rindex('/eventlog/')]}/emr-cluster-report.json" - if config_key in [content["Key"] for content in contents]: - config = io.BytesIO() - s3.download_fileobj(parsed_project_url.netloc, config_key, config) - return Response( - result=( - json.loads(config.getvalue().decode()), - f"s3://{parsed_project_url.netloc}/{log_key}", - ) - ) - - response = _find_cluster( - log_match.group("run_id"), - created_before=log_content["LastModified"], - created_after=dateparse(log_match.group("timestamp")), - region_name=region_name, - ) - cluster = response.result - if cluster: - response = get_cluster_report(cluster["Id"], region_name) - config = response.result - if config: - error = _upload_object( - config, f"s3://{parsed_project_url.netloc}/{config_key}" - ).error - if error: - logger.warning(f"Failed to save prediction config: {error.message}") - return Response( - result=(config, f"s3://{parsed_project_url.netloc}/{log_key}") - ) - else: - return Response( - error=EMRError(message="No event logs with corresponding configuration found") - ) - else: - return Response(error=ProjectError(message="S3 URL not configured for project")) - else: - return project_response - - -def run_job_flow(job_flow: dict, project_id: str = None, region_name: str = None) -> Response[str]: - """Creates an EMR cluster from the provided RunJobFlow request object. If a project ID - is supplied that project's configuration is first applied. See :py:func:`~get_project_job_flow` - - If the job flow is configured to save the event log in S3 that S3 directory is created as required by Apache Spark. - - If a project with an S3 location is specified the job flow is saved at a location with the following format before the cluster is created. - - {``s3_project_url``}/{project ID}/{timestamp}/{run ID}/job-flow.json - - :param job_flow: RunJobFlow request object - :type job_flow: dict - :param project_id: project ID, defaults to None - :type project_id: str, optional - :param region_name: region name, defaults to AWS configuration - :type region_name: str, optional - :return: cluster ID - :rtype: Response[str] - """ - if project_id: - job_flow_response = get_project_job_flow(job_flow, project_id) - if job_flow_response.error: - return job_flow_response - job_flow = job_flow_response.result - run_id = [tag["Value"] for tag in job_flow["Tags"] if tag["Key"] == "sync:run-id"][0] - - event_log_response = create_s3_event_log_dir(job_flow) - - if project_id: - project_response = get_project(project_id) - if project_response.error: - return project_response - project = project_response.result - if project.get("cluster_log_url"): - match = re.match( - RUN_DIR_PATTERN_TEMPLATE.format( - project_prefix=project["cluster_log_url"], project_id=project_id, run_id=run_id - ), - event_log_response.result or "", - ) - if match: - run_dir = match.group() - else: - run_dir = f"{project['cluster_log_url']}/{project['id']}/{datetime.datetime.utcnow().strftime(TIME_FORMAT)}/{run_id}" - - error = _upload_object( - job_flow, - f"{run_dir}/job-flow.json", - ).error - if error: - logger.warning(f"Failed to save job flow: {error.message}") - - emr = boto.client("emr", region_name=region_name) - return Response(result=emr.run_job_flow(**job_flow)["JobFlowId"]) - - -def run_and_record_job_flow( - job_flow: dict, project_id: str = None, region_name: str = None -) -> Response[str]: - """Creates an EMR cluster with the incoming RunJobFlow request object and project configuration (see :py:func:`~run_job_flow`), - waits for the cluster to complete and records the run. - - :param job_flow: RunJobFlow request object - :type job_flow: dict - :param project_id: project ID - :type project_id: str - :param region_name: region name, defaults to AWS configuration - :type region_name: str, optional - :return: prediction ID - :rtype: Response[str] - """ - job_flow_response = run_job_flow(job_flow, project_id) - if job_flow_response.error: - return job_flow_response - - cluster_id = job_flow_response.result - - emr = boto.client("emr", region_name=region_name) - # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/emr.html#EMR.Waiter.ClusterTerminated - # 30 seconds between polls, no more than 120 polls - waiter = emr.get_waiter("cluster_terminated") - waiter.wait(ClusterId=cluster_id, WaiterConfig={"Delay": 30, "MaxAttempts": 120}) - - return record_run(cluster_id, project_id, region_name) - - -def create_prediction_for_cluster(cluster_id: str, region_name: str = None) -> Response[str]: - """If the cluster terminated successfully with an event log available in S3 a prediction based - on such is created and its ID returned. - - :param cluster_id: EMR cluster ID - :type cluster_id: str - :param region_name: AWS region name, defaults to None - :type region_name: str, optional - :return: prediction ID - :rtype: Response[str] - """ - report_response = get_cluster_report(cluster_id, region_name) - cluster_report = report_response.result - if cluster_report: - eventlog_response = _get_eventlog_url_from_cluster_report(cluster_report) - if eventlog_response.error: - return eventlog_response - - eventlog_http_url = eventlog_response.result - if eventlog_http_url: - return create_prediction(Platform.AWS_EMR, cluster_report, eventlog_http_url) - - return eventlog_response - - return cluster_report - - -def get_cluster_report(cluster_id: str, region_name: str = None) -> Response[dict]: - """Get the cluster configuration required for Sync prediction - - :param cluster_id: cluster ID - :type cluster_id: str - :param region_name: AWS region name, defaults to AWS configuration - :type region_name: str, optional - :return: cluster configuration - :rtype: Response[dict] - """ - emr = boto.client("emr", region_name=region_name) - - cluster = emr.describe_cluster(ClusterId=cluster_id)["Cluster"] - - # Check status - status = cluster["Status"] - if ( - status["State"] != "TERMINATED" - or status["StateChangeReason"].get("Code") != "ALL_STEPS_COMPLETED" - ): - return Response( - error=EMRError( - message=f"Unexpected cluster termination state - {status['State']}: {status['StateChangeReason'].get('Code')}" - ) - ) - - cluster["BootstrapActions"] = emr.list_bootstrap_actions(ClusterId=cluster_id)[ - "BootstrapActions" - ] - if cluster["InstanceCollectionType"] == "INSTANCE_FLEET": - cluster["InstanceFleets"] = emr.list_instance_fleets(ClusterId=cluster_id)["InstanceFleets"] - else: - cluster["InstanceGroups"] = emr.list_instance_groups(ClusterId=cluster_id)["InstanceGroups"] - - cluster_instances = emr.list_instances(ClusterId=cluster_id) - instances = cluster_instances.get("Instances") - instances_next_marker = cluster_instances.get("Marker") - while instances_next_marker is not None: - cluster_instances = emr.list_instances(ClusterId=cluster_id, Marker=instances_next_marker) - instances.extend(cluster_instances.get("Instances")) - instances_next_marker = cluster_instances.get("Marker") - - cluster_steps = emr.list_steps(ClusterId=cluster_id) - steps = cluster_steps.get("Steps") - steps_next_marker = cluster_steps.get("Marker") - while steps_next_marker is not None: - cluster_steps = emr.list_steps(ClusterId=cluster_id, Marker=steps_next_marker) - steps.extend(cluster_steps.get("Steps")) - steps_next_marker = cluster_steps.get("Marker") - - return Response( - result={ - "Cluster": cluster, - "Instances": instances, - "Steps": steps, - "Region": region_name or emr.meta.region_name, - } - ) - - -def create_s3_event_log_dir(job_flow: dict) -> Response[str]: - """Creates the event log "directory" in S3 if the incoming RunJobFlow request object is configured with one. - - :param job_flow: RunJobFlow request object - :type job_flow: dict - :return: S3 event log directory URL - :rtype: Response[str] - """ - for config in job_flow["Configurations"]: - if config["Classification"] == "spark-defaults": - eventlog_dir = config["Properties"].get("spark.eventLog.dir") - if ( - eventlog_dir - and config["Properties"].get("spark.eventLog.enabled", "false").lower() == "true" - ): - parsed_eventlog_dir = urlparse(eventlog_dir) - if parsed_eventlog_dir.scheme == "s3a": - try: - s3 = boto.client("s3") - s3.put_object( - Bucket=parsed_eventlog_dir.netloc, - Key=parsed_eventlog_dir.path.lstrip("/"), - ) - return Response( - result=f"s3://{parsed_eventlog_dir.netloc}/{parsed_eventlog_dir.path.lstrip('/')}" - ) - except Exception as exc: - return Response(error=EMRError(message=str(exc))) - - return Response(error=EMRError(message="No S3 event log dir configured")) - - -def _get_eventlog_url_from_cluster_report(cluster_config: dict) -> Response[str]: - """Returns an S3 URL to the event log for the cluster if one - and only one - exists.""" - eventlog_dir = None - for config in cluster_config["Cluster"]["Configurations"]: - if config["Classification"] == "spark-defaults": - eventlog_dir = config["Properties"].get("spark.eventLog.dir") - break - - if not eventlog_dir: - return Response( - error=EMRError(message="Failed to find event log directory in cluster configuration") - ) - - parsed_eventlog_dir = urlparse(eventlog_dir) - eventlog_pattern = re.compile(rf"{parsed_eventlog_dir.path.lstrip('/')}application_[\d_]+$") - s3 = boto.client("s3") - s3_objects = s3.list_objects_v2( - Bucket=parsed_eventlog_dir.netloc, Prefix=parsed_eventlog_dir.path.lstrip("/") - ) - eventlog_keys = [c["Key"] for c in s3_objects["Contents"] if eventlog_pattern.match(c["Key"])] - - if not eventlog_keys: - return Response(error=EMRError(message="No event log found")) - if len(eventlog_keys) > 1: - return Response(error=EMRError(message="More than 1 event log found")) - - return Response(result=f"s3://{parsed_eventlog_dir.netloc}/{eventlog_keys[0]}") - - -def _get_existing_run_dir_from_cluster_config( - cluster_config: dict, project_id: str -) -> Response[str]: - for tag in cluster_config["Cluster"].get("Tags", []): - if tag["Key"] == "sync:run-id": - run_id = tag["Value"] - break - else: - return Response(error=EMRError(message="Failed to find run ID in cluster configuration")) - cluster_start_time = cluster_config["Cluster"]["Status"]["Timeline"]["CreationDateTime"] - - return _get_existing_run_dir(project_id, run_id, cluster_start_time) - - -def _get_existing_run_dir( - project_id: str, run_id: str, cluster_start_time: datetime.datetime = None -) -> Response[str]: - project_response = get_project(project_id) - project = project_response.result - if project: - s3_url = project.get("cluster_log_url") - if s3_url: - parsed_s3_url = urlparse(s3_url) - s3 = boto.client("s3") - run_dir_pattern = re.compile( - RUN_DIR_PATTERN_TEMPLATE.format( - project_prefix=parsed_s3_url.path.strip("/"), - project_id=project_id, - run_id=run_id, - ) - ) - start_after = (cluster_start_time - datetime.timedelta(hours=1)).isoformat() - list_response = s3.list_objects_v2( - Bucket=parsed_s3_url.netloc, - Prefix=f"{parsed_s3_url.path.strip('/')}/{project_id}/", - StartAfter=start_after, - ) - contents = list_response.get("Contents") - continuation_token = list_response.get("NextContinuationtoken") - pages_left = 5 - while pages_left and contents: - for content in contents: - match = run_dir_pattern.match(content["Key"]) - if match: - return Response( - result=f"s3://{parsed_s3_url.netloc}/{match.group().rstrip('/')}" - ) - if continuation_token: - list_response = s3.list_objects_v2( - Bucket=parsed_s3_url.netloc, - Prefix=f"{parsed_s3_url.path.strip('/')}/{project_id}/", - StartAfter=start_after, - ContinuationToken=continuation_token, - ) - contents = list_response.get("Contents") - continuation_token = list_response.get("NextContinuationtoken") - else: - contents = [] - pages_left -= 1 - return Response(error=EMRError(message="Existing run directory not found")) - return Response(error=EMRError(message="No S3 URL configured for project")) - return project_response - - -def _find_cluster( - run_id: str, - created_before: datetime.datetime, - created_after: datetime.datetime = None, - region_name: str = None, -) -> Response[dict]: - """Returns an AWS DescribeCluster response object if an EMR cluster tagged with the run ID can be found within the date range.""" - created_after = created_after or created_before - datetime.timedelta(days=3) - emr = boto.client("emr", region_name=region_name) - response = emr.list_clusters( - CreatedBefore=created_before, - CreatedAfter=created_after, - ClusterStates=["TERMINATED"], - ) - clusters = response.get("Clusters") - marker = response.get("Marker") - - pages_left = 5 - while clusters: - for cluster in clusters: - if cluster["Status"]["StateChangeReason"].get("Code") == "ALL_STEPS_COMPLETED": - cluster_detail = emr.describe_cluster(ClusterId=cluster["Id"])["Cluster"] - if [ - tag - for tag in cluster_detail["Tags"] - if tag["Key"] == "sync:run-id" and tag["Value"] == run_id - ]: - return Response(result=cluster_detail) - - pages_left -= 1 - if pages_left and marker: - response = emr.list_clusters( - CreatedBefore=created_before, - CreatedAfter=created_after, - ClusterStates=["TERMINATED"], - Marker=marker, - ) - clusters = response["Clusters"] - marker = response.get("Marker") - elif pages_left: - return Response( - error=EMRError(message="No matching cluster in the specified time period") - ) - else: - return Response(error=EMRError(message="Matching EMR cluster not found")) - - return Response(error=EMRError(message="Failed to find EMR cluster")) - - -def _upload_object(obj: dict, s3_url: str) -> Response[str]: - parsed_url = urlparse(s3_url) - obj_key = parsed_url.path.lstrip("/") - - try: - s3 = boto.client("s3") - s3.upload_fileobj( - io.BytesIO( - bytes(json.dumps(obj, cls=DateTimeEncoderNaiveUTCDropMicroseconds), "utf-8") - ), - parsed_url.netloc, - obj_key, - ) - - return Response(result=f"s3://{parsed_url.netloc}/{obj_key}") - except Exception as exc: - return Response(error=EMRError(message=f"Failed to save object: {exc}")) diff --git a/sync/azuredatabricks.py b/sync/azuredatabricks.py index d9218d5..340cf1e 100644 --- a/sync/azuredatabricks.py +++ b/sync/azuredatabricks.py @@ -19,34 +19,13 @@ _get_cluster_instances_from_dbfs, _update_monitored_timelines, _wait_for_cluster_termination, - apply_prediction, apply_project_recommendation, - create_and_record_run, - create_and_wait_for_run, create_cluster, - create_prediction_for_run, - create_run, create_submission_for_run, - get_cluster, get_cluster_report, - get_prediction_cluster, - get_prediction_job, get_project_cluster, get_project_cluster_settings, - get_project_job, - get_recommendation_job, - handle_successful_job_run, record_run, - run_and_record_job, - run_and_record_job_object, - run_and_record_prediction_job, - run_and_record_project_job, - run_job_object, - run_prediction, - terminate_cluster, - wait_for_and_record_run, - wait_for_final_run_status, - wait_for_run_and_cluster, ) from sync.api import get_access_report as get_api_access_report from sync.clients.databricks import get_default_client @@ -63,35 +42,14 @@ __all__ = [ "get_access_report", - "run_prediction", - "run_and_record_job", "monitor_cluster", "create_cluster", - "get_cluster", - "create_prediction_for_run", "create_submission_for_run", "get_cluster_report", - "handle_successful_job_run", "record_run", - "get_prediction_job", - "get_prediction_cluster", "get_project_cluster", - "get_project_job", - "get_recommendation_job", "get_project_cluster", "get_project_cluster_settings", - "run_job_object", - "create_run", - "run_and_record_prediction_job", - "run_and_record_project_job", - "run_and_record_job_object", - "create_and_record_run", - "wait_for_and_record_run", - "create_and_wait_for_run", - "wait_for_final_run_status", - "wait_for_run_and_cluster", - "terminate_cluster", - "apply_prediction", "apply_project_recommendation", ] From 9a02614193c1704dff2211228776c3aaa4b8b5bf Mon Sep 17 00:00:00 2001 From: Sean Gorsky Date: Fri, 9 Feb 2024 21:57:19 -0500 Subject: [PATCH 07/18] delete prediction preference setting --- sync/api/projects.py | 21 ++------------------- sync/cli/__init__.py | 14 +------------- sync/cli/projects.py | 18 ------------------ sync/config.py | 7 ++----- sync/models.py | 7 ------- 5 files changed, 5 insertions(+), 62 deletions(-) diff --git a/sync/api/projects.py b/sync/api/projects.py index 7780197..4893ba7 100644 --- a/sync/api/projects.py +++ b/sync/api/projects.py @@ -9,14 +9,7 @@ import httpx from sync.clients.sync import get_default_client -from sync.models import ( - Platform, - Preference, - ProjectError, - RecommendationError, - Response, - SubmissionError, -) +from sync.models import Platform, ProjectError, RecommendationError, Response, SubmissionError from . import generate_presigned_url @@ -31,7 +24,6 @@ def create_project( cluster_path: str = None, workspace_id: str = None, cluster_log_url: str = None, - prediction_preference: Preference = Preference.ECONOMY, auto_apply_recs: bool = False, prediction_params: dict = None, app_id: str = None, @@ -53,8 +45,6 @@ def create_project( :type workspace_id: str, optional :param cluster_log_url: S3 or DBFS URL under which to store project configurations and logs, defaults to None :type cluster_log_url: str, optional - :param prediction_preference: preferred prediction solution, defaults to `Preference.ECONOMY` - :type prediction_preference: Preference, optional :param auto_apply_recs: automatically apply project recommendations, defaults to False :type auto_apply_recs: bool, optional :param prediction_params: dictionary of prediction parameters, defaults to None. Valid options are documented `here `__ @@ -74,7 +64,6 @@ def create_project( "cluster_path": cluster_path, "workspace_id": workspace_id, "cluster_log_url": cluster_log_url, - "prediction_preference": prediction_preference, "auto_apply_recs": auto_apply_recs, "prediction_params": prediction_params, "app_id": app_id, @@ -102,7 +91,6 @@ def update_project( workspace_id: str = None, cluster_log_url: str = None, app_id: str = None, - prediction_preference: Preference = None, auto_apply_recs: bool = None, prediction_params: dict = None, optimize_instance_size=None, @@ -121,12 +109,9 @@ def update_project( :type cluster_log_url: str, optional :param app_id: external identifier, defaults to None :type app_id: str, optional - :param prediction_preference: default preference for predictions, defaults to None - :type prediction_preference: Preference, optional :param auto_apply_recs: automatically apply project recommendations, defaults to None :type auto_apply_recs: bool, optional :param prediction_params: dictionary of prediction parameters, defaults to None. Valid options are documented `here `__ - :type prediction_preference: dict, optional :return: updated project :rtype: Response[dict] """ @@ -137,8 +122,6 @@ def update_project( project_update["cluster_log_url"] = cluster_log_url if app_id: project_update["app_id"] = app_id - if prediction_preference: - project_update["prediction_preference"] = prediction_preference if auto_apply_recs is not None: project_update["auto_apply_recs"] = auto_apply_recs if prediction_params: @@ -215,7 +198,7 @@ def create_project_submission( ) -> Response[str]: """Create a submission - :param platform: platform, e.g. "aws-emr" + :param platform: platform, e.g. "aws-databricks" :type platform: Platform :param cluster_report: cluster report :type cluster_report: dict diff --git a/sync/cli/__init__.py b/sync/cli/__init__.py index b8a68b7..ca14e41 100644 --- a/sync/cli/__init__.py +++ b/sync/cli/__init__.py @@ -7,8 +7,7 @@ from sync.cli import awsdatabricks, azuredatabricks, projects, workspaces from sync.cli.util import OPTIONAL_DEFAULT -from sync.config import API_KEY, CONFIG, DB_CONFIG, APIKey, Configuration, DatabricksConf, init -from sync.models import Preference +from sync.config import API_KEY, DB_CONFIG, APIKey, DatabricksConf, init LOG_FORMAT = "%(asctime)s %(levelname)s [%(name)s] %(message)s" @@ -32,14 +31,12 @@ def main(debug: bool): @main.command @click.option("--api-key-id") @click.option("--api-key-secret") -@click.option("--prediction-preference") @click.option("--databricks-host") @click.option("--databricks-token") @click.option("--databricks-region") def configure( api_key_id: str = None, api_key_secret: str = None, - prediction_preference: str = None, databricks_host: str = None, databricks_token: str = None, databricks_region: str = None, @@ -55,12 +52,6 @@ def configure( show_default=False, ) - prediction_preference = prediction_preference or click.prompt( - "Default prediction preference", - type=click.Choice([p.value for p in Preference]), - default=(CONFIG.default_prediction_preference or Preference.ECONOMY).value, - ) - dbx_host = databricks_host or OPTIONAL_DEFAULT dbx_token = databricks_token or OPTIONAL_DEFAULT dbx_region = databricks_region or OPTIONAL_DEFAULT @@ -84,9 +75,6 @@ def configure( init( APIKey(api_key_id=api_key_id, api_key_secret=api_key_secret), - Configuration( - default_prediction_preference=prediction_preference, - ), DatabricksConf(host=dbx_host, token=dbx_token, aws_region_name=dbx_region) if dbx_host != OPTIONAL_DEFAULT and dbx_token != OPTIONAL_DEFAULT diff --git a/sync/cli/projects.py b/sync/cli/projects.py index f0aadfb..d62b062 100644 --- a/sync/cli/projects.py +++ b/sync/cli/projects.py @@ -11,8 +11,6 @@ update_project, ) from sync.cli.util import validate_project -from sync.config import CONFIG -from sync.models import Preference from sync.utils.json import DateTimeEncoderNaiveUTCDropMicroseconds @@ -59,12 +57,6 @@ def get(project: dict): ) @click.option("-w", "--workspace-id", help="Databricks workspace ID") @click.option("-l", "--location", help="S3 URL under which to store event logs and configuration") -@click.option( - "-p", - "--preference", - type=click.Choice(Preference), - default=CONFIG.default_prediction_preference, -) @click.option( "--auto-apply-recs", is_flag=True, @@ -83,7 +75,6 @@ def create( cluster_path: str = None, workspace_id: str = None, location: str = None, - preference: Preference = None, app_id: str = None, ): """Create a project for a Spark application that runs on the platform identified by PRODUCT_CODE. @@ -97,7 +88,6 @@ def create( cluster_path=cluster_path, workspace_id=workspace_id, cluster_log_url=location, - prediction_preference=preference, auto_apply_recs=auto_apply_recs, app_id=app_id, ) @@ -121,12 +111,6 @@ def create( help="Path to cluster definition in job object, e.g. 'job_clusters/Job_cluster'", ) @click.option("-w", "--workspace-id", help="Databricks workspace ID") -@click.option( - "-p", - "--preference", - type=click.Choice(Preference), - default=CONFIG.default_prediction_preference, -) @click.option("--auto-apply-recs", type=bool, help="Automatically apply project recommendations") def update( project_id: str, @@ -135,7 +119,6 @@ def update( app_id: str = None, cluster_path: str = None, workspace_id: str = None, - preference: Preference = None, auto_apply_recs: bool = None, ): """Update a project""" @@ -146,7 +129,6 @@ def update( app_id=app_id, cluster_path=cluster_path, workspace_id=workspace_id, - prediction_preference=preference, auto_apply_recs=auto_apply_recs, ) if response.result: diff --git a/sync/config.py b/sync/config.py index 62969b8..697c2ad 100644 --- a/sync/config.py +++ b/sync/config.py @@ -4,13 +4,11 @@ import json from pathlib import Path -from typing import Any, Callable, Dict, Union +from typing import Any, Callable, Dict from urllib.parse import urlparse import boto3 as boto -from pydantic import BaseSettings, Field, validator, Extra - -from .models import Preference +from pydantic import BaseSettings, Extra, Field, validator CREDENTIALS_FILE = "credentials" CONFIG_FILE = "config" @@ -39,7 +37,6 @@ def customise_sources(cls, init_settings, env_settings, file_secret_settings): class Configuration(BaseSettings): - default_prediction_preference: Union[Preference, None] = Preference.ECONOMY api_url: str = Field("https://api.synccomputing.com", env="SYNC_API_URL") class Config: diff --git a/sync/models.py b/sync/models.py index 197a2f9..9677557 100644 --- a/sync/models.py +++ b/sync/models.py @@ -11,14 +11,7 @@ from pydantic.generics import GenericModel -class Preference(str, Enum): - PERFORMANCE = "performance" - BALANCED = "balanced" - ECONOMY = "economy" - - class Platform(str, Enum): - AWS_EMR = "aws-emr" AWS_DATABRICKS = "aws-databricks" AZURE_DATABRICKS = "azure-databricks" From dcf046ad1d3e136356a36421b57f70383eb9939c Mon Sep 17 00:00:00 2001 From: Sean Gorsky Date: Fri, 9 Feb 2024 22:24:00 -0500 Subject: [PATCH 08/18] remove predictions instructions from docs --- README.md | 27 +- demo/emr/application_1678162862227_0001 | 464 ------------------------ demo/emr/emr-cluster-report.json | 354 ------------------ docs/guide/project.rst | 80 ---- docs/guide/start.rst | 5 +- docs/index.rst | 1 - docs/reference/api.rst | 3 +- docs/reference/api/predictions.rst | 5 - docs/reference/awsemr.rst | 6 - sync/api/projects.py | 2 +- 10 files changed, 7 insertions(+), 940 deletions(-) delete mode 100644 demo/emr/application_1678162862227_0001 delete mode 100644 demo/emr/emr-cluster-report.json delete mode 100644 docs/guide/project.rst delete mode 100644 docs/reference/api/predictions.rst delete mode 100644 docs/reference/awsemr.rst diff --git a/README.md b/README.md index f055519..15e941c 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,6 @@ The Sync Library provides drop-in functions facilitating integration between Syn *Note*: This library is under active development and may contain features not yet fully supported. -* [Goals](#goals) - * [Out of Scope](#out-of-scope) * [Contribution Guidelines](#contribution-guidelines) * [Documentation](#documentation) * [CLI](#cli) @@ -12,20 +10,6 @@ The Sync Library provides drop-in functions facilitating integration between Syn * [Configuration](#configuration) * [Future Considerations](#future-considerations) -## Goals -This Library enables recording EMR and Databricks job results to track high-level metrics and offer run-time and cost optimizing configuration updates. - -Specifically it supports, -1. Starting an EMR job from a [RunJobFlow](https://docs.aws.amazon.com/emr/latest/APIReference/API_RunJobFlow.html) spec or a Databricks job from a [Clusters 2.0 Create](https://docs.databricks.com/dev-tools/api/latest/clusters.html#create) spec with the configuration to make the resources necessary for a prediction available, e.g. event log. -2. Tracking run data to provide analysis and predictions. -3. Starting an EMR or Databricks job with configuration from a Sync prediction. - -### Out of Scope -Orchestration that supports either, -1. Running more than 1 job at a time -2. Continuous or scheduled iterations of a project - -Though this repo is not meant to provide full-blown continuous tuning solutions, functions provided by this library are intended to be incorporated in such orchestration tools. For convenience however, functions that return only after a cluster completes to record the result are included. ## Contribution Guidelines Only add what provides clear benefit - no speculative development. Contributions should be organized in well-defined orthogonal functions and classes - "building blocks" - with documentation and tests. Public functions are subject to the constraints of [semantic versioning](https://semver.org). And be nice! @@ -48,16 +32,13 @@ For troubleshooting documentation issues it may help to remove the "\_build" dir Releases are semi-automated with the "Release new library version" Github workflow. To cut a new release update the version in [`sync/__init__.py`](sync/__init__.py) in a PR. Once it's merged run the "Release new library version" workflow on `main` from the "Actions" tab of the Github page for this repo. This will tag `main` with the new version, update the `latest` tag to match and create a Github release. ## CLI -The CLI is provided mainly for demonstration of what's possible when you integrate with Sync's API using this library. Explore available commands for EMR clusters with `sync-cli aws-emr --help`. You can also use it to interact directly with API resources: `sync-cli predictions --help`. +The CLI is provided mainly for demonstration of what's possible when you integrate with Sync's API using this library. Explore available commands with `sync-cli --help` ## Developer Interface -The developer interface consists of the public attributes of the `sync.api` package, and the `sync.awsemr` and `sync.awsdatabricks` modules. With each change the impact to the version of the next release must be considered in regard to semantic versioning [semantic versioning](https://semver.org). The developer interface is built using clients including those in `sync.clients`. Clients in that package provide a raw interface to their corresponding services and are intended to support the developer interface only. +The developer interface consists of the public attributes of the `sync.api` package, and the `sync.awsazure` and `sync.awsdatabricks` modules. With each change the impact to the version of the next release must be considered in regard to semantic versioning [semantic versioning](https://semver.org). The developer interface is built using clients including those in `sync.clients`. Clients in that package provide a raw interface to their corresponding services and are intended to support the developer interface only. -This library is organized by functional domain as hinted by the names of the modules under the `sync` package. Utilities for interacting with EMR and Databricks are in `sync.awsemr` and `sync.awsdatabricks`, respectively. These modules will provide functionality for starting jobs and consolidating information required for Sync predictions. When starting jobs tags are applied and the event log location specified. +This library is organized by functional domain as hinted by the names of the modules under the `sync` package. Utilities for interacting with Databricks are in `sync.awsazure` and `sync.awsdatabricks`, respectively. These modules will provide functionality for starting jobs and consolidating information required for Sync predictions. When starting jobs tags are applied and the event log location specified. Successful responses and errors from the developer interface will be returned in an instance of the generic [Response](sync/models.py). Use of this model means that exceptions raised must be handled by this library to provide helpful information in the error response. ## Configuration -Configuration at the installation site is required before the library can be used. See the user guide for details. - -## Future Considerations -A Spark application may be refactored to the point of invalidating the basis of a prediction. This could lead to a prediction being wildly off or altogether broken in that its configuration cannot yield a successful run. It may therefore be worthwhile to evaluate the state of the application code a key points. Concretely, a hash of the code could be persisted to facilitate tracking changes and warning users when a prediction may no longer be reliably applied. +Configuration at the installation site is required before the library can be used. See the user guide for details. \ No newline at end of file diff --git a/demo/emr/application_1678162862227_0001 b/demo/emr/application_1678162862227_0001 deleted file mode 100644 index 461e65b..0000000 --- a/demo/emr/application_1678162862227_0001 +++ /dev/null @@ -1,464 +0,0 @@ -{"Event":"SparkListenerLogStart","Spark Version":"3.0.1-amzn-0"} -{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"driver","Host":"ip-172-31-102-115.ec2.internal","Port":36183},"Maximum Memory":1078827417,"Timestamp":1678162962526,"Maximum Onheap Memory":1078827417,"Maximum Offheap Memory":0} -{"Event":"SparkListenerEnvironmentUpdate","JVM Information":{"Java Home":"/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre","Java Version":"1.8.0_362 (Amazon.com Inc.)","Scala Version":"version 2.12.10"},"Spark Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.scheduler.mode":"FIFO","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"},"Hadoop Properties":{"hadoop.service.shutdown.timeout":"30s","yarn.resourcemanager.amlauncher.thread-count":"50","yarn.sharedcache.enabled":"false","fs.s3a.connection.maximum":"15","yarn.nodemanager.numa-awareness.numactl.cmd":"/usr/bin/numactl","fs.s3a.impl":"org.apache.hadoop.fs.s3a.S3AFileSystem","yarn.app.mapreduce.am.scheduler.heartbeat.interval-ms":"1000","yarn.timeline-service.timeline-client.number-of-async-entities-to-merge":"10","hadoop.security.kms.client.timeout":"60","hadoop.http.authentication.kerberos.principal":"HTTP/_HOST@LOCALHOST","mapreduce.jobhistory.loadedjob.tasks.max":"-1","mapreduce.framework.name":"yarn","yarn.sharedcache.uploader.server.thread-count":"50","yarn.nodemanager.log-aggregation.roll-monitoring-interval-seconds.min":"3600","yarn.nodemanager.linux-container-executor.nonsecure-mode.user-pattern":"^[_.A-Za-z0-9][-@_.A-Za-z0-9]{0,255}?[$]?$","tfile.fs.output.buffer.size":"262144","yarn.app.mapreduce.am.job.task.listener.thread-count":"60","yarn.nodemanager.node-attributes.resync-interval-ms":"120000","hadoop.security.groups.cache.background.reload.threads":"3","yarn.resourcemanager.webapp.cross-origin.enabled":"true","fs.AbstractFileSystem.ftp.impl":"org.apache.hadoop.fs.ftp.FtpFs","hadoop.registry.secure":"false","hadoop.shell.safely.delete.limit.num.files":"100","dfs.bytes-per-checksum":"512","fs.s3.buffer.dir":"/mnt/s3,/mnt1/s3","mapreduce.job.acl-view-job":" ","fs.s3a.s3guard.ddb.background.sleep":"25ms","fs.s3a.retry.limit":"${fs.s3a.attempts.maximum}","mapreduce.jobhistory.loadedjobs.cache.size":"5","fs.s3a.s3guard.ddb.table.create":"false","yarn.log-aggregation.enable-local-cleanup":"false","dfs.namenode.handler.count":"64","yarn.nodemanager.amrmproxy.enabled":"false","yarn.timeline-service.entity-group-fs-store.with-user-dir":"false","mapreduce.input.fileinputformat.split.minsize":"0","yarn.resourcemanager.container.liveness-monitor.interval-ms":"600000","dfs.namenode.replication.max-streams":"20","yarn.resourcemanager.client.thread-count":"64","io.seqfile.compress.blocksize":"1000000","mapreduce.tasktracker.http.threads":"60","fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","yarn.sharedcache.checksum.algo.impl":"org.apache.hadoop.yarn.sharedcache.ChecksumSHA256Impl","yarn.nodemanager.amrmproxy.interceptor-class.pipeline":"org.apache.hadoop.yarn.server.nodemanager.amrmproxy.DefaultRequestInterceptor","dfs.datanode.data.dir":"/mnt/hdfs,/mnt1/hdfs","dfs.replication":"1","yarn.timeline-service.entity-group-fs-store.leveldb-cache-read-cache-size":"10485760","mapreduce.reduce.shuffle.fetch.retry.interval-ms":"1000","mapreduce.task.profile.maps":"0-2","yarn.scheduler.include-port-in-node-name":"false","yarn.nodemanager.admin-env":"MALLOC_ARENA_MAX=$MALLOC_ARENA_MAX","yarn.resourcemanager.node-removal-untracked.timeout-ms":"60000","mapreduce.am.max-attempts":"2","hadoop.security.kms.client.failover.sleep.base.millis":"100","mapreduce.jobhistory.webapp.https.address":"0.0.0.0:19890","yarn.node-labels.fs-store.impl.class":"org.apache.hadoop.yarn.nodelabels.FileSystemNodeLabelsStore","yarn.nodemanager.collector-service.address":"${yarn.nodemanager.hostname}:8048","fs.trash.checkpoint.interval":"0","mapreduce.job.map.output.collector.class":"org.apache.hadoop.mapred.MapTask$MapOutputBuffer","yarn.resourcemanager.node-ip-cache.expiry-interval-secs":"-1","hadoop.http.authentication.signature.secret.file":"*********(redacted)","hadoop.jetty.logs.serve.aliases":"true","yarn.resourcemanager.placement-constraints.handler":"disabled","yarn.timeline-service.handler-thread-count":"10","yarn.resourcemanager.max-completed-applications":"1000","dfs.hosts.exclude":"/emr/instance-controller/lib/dfs.hosts.exclude","yarn.resourcemanager.placement-constraints.algorithm.class":"org.apache.hadoop.yarn.server.resourcemanager.scheduler.constraint.algorithm.DefaultPlacementAlgorithm","yarn.sharedcache.webapp.address":"0.0.0.0:8788","yarn.resourcemanager.delegation.token.renew-interval":"*********(redacted)","yarn.sharedcache.nm.uploader.replication.factor":"10","hadoop.security.groups.negative-cache.secs":"30","yarn.app.mapreduce.task.container.log.backups":"0","mapreduce.reduce.skip.proc-count.auto-incr":"true","hadoop.security.group.mapping.ldap.posix.attr.gid.name":"gidNumber","ipc.client.fallback-to-simple-auth-allowed":"false","yarn.nodemanager.resource.memory.enforced":"true","yarn.client.failover-proxy-provider":"org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider","yarn.timeline-service.http-authentication.simple.anonymous.allowed":"true","ha.health-monitor.check-interval.ms":"1000","yarn.acl.reservation-enable":"false","yarn.resourcemanager.store.class":"org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore","yarn.app.mapreduce.am.hard-kill-timeout-ms":"10000","fs.s3a.etag.checksum.enabled":"false","yarn.nodemanager.container-metrics.enable":"false","yarn.timeline-service.client.fd-clean-interval-secs":"60","yarn.resourcemanager.nodemanagers.heartbeat-interval-ms":"250","hadoop.common.configuration.version":"3.0.0","fs.s3a.s3guard.ddb.table.capacity.read":"500","yarn.nodemanager.remote-app-log-dir-suffix":"logs","yarn.nodemanager.windows-container.cpu-limit.enabled":"false","yarn.nodemanager.runtime.linux.docker.privileged-containers.allowed":"false","file.blocksize":"67108864","hadoop.registry.zk.retry.ceiling.ms":"60000","mapreduce.reduce.env":"HADOOP_MAPRED_HOME=/usr/lib/hadoop-mapreduce","yarn.scheduler.configuration.leveldb-store.path":"${hadoop.tmp.dir}/yarn/system/confstore","yarn.sharedcache.store.in-memory.initial-delay-mins":"10","mapreduce.jobhistory.principal":"jhs/_HOST@REALM.TLD","mapreduce.map.skip.proc-count.auto-incr":"true","fs.s3a.committer.name":"file","mapreduce.task.profile.reduces":"0-2","hadoop.zk.num-retries":"1000","yarn.webapp.xfs-filter.enabled":"true","seq.io.sort.mb":"100","yarn.scheduler.configuration.max.version":"100","yarn.timeline-service.webapp.https.address":"${yarn.timeline-service.hostname}:8190","yarn.resourcemanager.scheduler.address":"ip-172-31-102-115.ec2.internal:8030","yarn.node-labels.enabled":"false","yarn.resourcemanager.webapp.ui-actions.enabled":"true","mapreduce.task.timeout":"600000","yarn.sharedcache.client-server.thread-count":"50","hadoop.security.groups.shell.command.timeout":"0s","hadoop.security.crypto.cipher.suite":"AES/CTR/NoPadding","yarn.nodemanager.elastic-memory-control.oom-handler":"org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.DefaultOOMHandler","yarn.resourcemanager.connect.max-wait.ms":"900000","fs.defaultFS":"hdfs://ip-172-31-102-115.ec2.internal:8020","yarn.minicluster.use-rpc":"false","yarn.app.mapreduce.am.env":"HADOOP_MAPRED_HOME=/usr/lib/hadoop-mapreduce","fs.s3.impl":"com.amazon.ws.emr.hadoop.fs.EmrFileSystem","fs.har.impl.disable.cache":"true","yarn.webapp.ui2.enable":"false","io.compression.codec.bzip2.library":"system-native","fs.s3a.change.detection.source":"etag","yarn.nodemanager.distributed-scheduling.enabled":"false","mapreduce.shuffle.connection-keep-alive.timeout":"5","yarn.resourcemanager.webapp.https.address":"${yarn.resourcemanager.hostname}:8090","mapreduce.jobhistory.address":"ip-172-31-102-115.ec2.internal:10020","yarn.resourcemanager.nm-tokens.master-key-rolling-interval-secs":"*********(redacted)","yarn.is.minicluster":"false","yarn.nodemanager.address":"${yarn.nodemanager.hostname}:8041","hadoop.proxyuser.livy.groups":"*","fs.abfss.impl":"org.apache.hadoop.fs.azurebfs.SecureAzureBlobFileSystem","fs.AbstractFileSystem.s3a.impl":"org.apache.hadoop.fs.s3a.S3A","mapreduce.task.combine.progress.records":"10000","yarn.resourcemanager.epoch.range":"0","yarn.resourcemanager.am.max-attempts":"2","yarn.nodemanager.linux-container-executor.cgroups.hierarchy":"/hadoop-yarn","fs.AbstractFileSystem.wasbs.impl":"org.apache.hadoop.fs.azure.Wasbs","yarn.timeline-service.entity-group-fs-store.cache-store-class":"org.apache.hadoop.yarn.server.timeline.MemoryTimelineStore","yarn.nodemanager.runtime.linux.docker.default-rw-mounts":"/mnt/yarn:/mnt/yarn,/mnt1/yarn:/mnt1/yarn,/mnt/s3:/mnt/s3,/mnt1/s3:/mnt1/s3","fs.ftp.transfer.mode":"BLOCK_TRANSFER_MODE","ipc.server.log.slow.rpc":"false","yarn.resourcemanager.node-labels.provider.fetch-interval-ms":"1800000","yarn.router.webapp.https.address":"0.0.0.0:8091","yarn.nodemanager.webapp.cross-origin.enabled":"false","fs.wasb.impl":"org.apache.hadoop.fs.azure.NativeAzureFileSystem","yarn.resourcemanager.auto-update.containers":"false","yarn.app.mapreduce.am.job.committer.cancel-timeout":"60000","yarn.scheduler.configuration.zk-store.parent-path":"/confstore","yarn.nodemanager.default-container-executor.log-dirs.permissions":"750","yarn.app.attempt.diagnostics.limit.kc":"64","fs.s3a.change.detection.mode":"server","hadoop.proxyuser.presto.hosts":"*","ftp.bytes-per-checksum":"512","yarn.nodemanager.resource.memory-mb":"12288","io.compression.codecs":"org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.SnappyCodec,com.hadoop.compression.lzo.LzoCodec,com.hadoop.compression.lzo.LzopCodec","fs.AbstractFileSystem.abfs.impl":"org.apache.hadoop.fs.azurebfs.Abfs","yarn.timeline-service.writer.flush-interval-seconds":"60","fs.s3a.fast.upload.active.blocks":"4","yarn.resourcemanager.submission-preprocessor.enabled":"false","hadoop.security.credential.clear-text-fallback":"true","yarn.nodemanager.collector-service.thread-count":"5","fs.azure.secure.mode":"false","mapreduce.jobhistory.joblist.cache.size":"20000","fs.ftp.host":"0.0.0.0","yarn.timeline-service.writer.async.queue.capacity":"100","yarn.resourcemanager.fs.state-store.num-retries":"0","yarn.resourcemanager.nodemanager-connect-retries":"10","yarn.nodemanager.log-aggregation.num-log-files-per-app":"30","hadoop.security.kms.client.encrypted.key.cache.low-watermark":"0.3f","fs.s3a.committer.magic.enabled":"false","yarn.timeline-service.client.max-retries":"30","dfs.ha.fencing.ssh.connect-timeout":"30000","yarn.log-aggregation-enable":"true","yarn.system-metrics-publisher.enabled":"true","mapreduce.reduce.markreset.buffer.percent":"0.0","fs.AbstractFileSystem.viewfs.impl":"org.apache.hadoop.fs.viewfs.ViewFs","mapreduce.task.io.sort.factor":"48","yarn.nodemanager.amrmproxy.client.thread-count":"25","ha.failover-controller.new-active.rpc-timeout.ms":"60000","yarn.nodemanager.container-localizer.java.opts":"-Xmx256m","mapreduce.jobhistory.datestring.cache.size":"200000","mapreduce.job.acl-modify-job":" ","dfs.namenode.https-address":"ip-172-31-102-115.ec2.internal:9871","yarn.nodemanager.windows-container.memory-limit.enabled":"false","yarn.timeline-service.webapp.address":"${yarn.timeline-service.hostname}:8188","yarn.app.mapreduce.am.job.committer.commit-window":"10000","yarn.nodemanager.container-manager.thread-count":"64","yarn.minicluster.fixed.ports":"false","hadoop.tags.system":"YARN,HDFS,NAMENODE,DATANODE,REQUIRED,SECURITY,KERBEROS,PERFORMANCE,CLIENT\n ,SERVER,DEBUG,DEPRECATED,COMMON,OPTIONAL","yarn.cluster.max-application-priority":"0","yarn.timeline-service.ttl-enable":"true","mapreduce.jobhistory.recovery.store.fs.uri":"${hadoop.tmp.dir}/mapred/history/recoverystore","hadoop.caller.context.signature.max.size":"40","hadoop.proxyuser.hive.groups":"*","yarn.client.load.resource-types.from-server":"false","ha.zookeeper.session-timeout.ms":"10000","mapreduce.map.java.opts":"-Xmx1229m","tfile.io.chunk.size":"1048576","fs.s3a.s3guard.ddb.table.capacity.write":"100","yarn.dispatcher.print-events-info.threshold":"5000","mapreduce.job.speculative.slowtaskthreshold":"1.0","io.serializations":"org.apache.hadoop.io.serializer.WritableSerialization, org.apache.hadoop.io.serializer.avro.AvroSpecificSerialization, org.apache.hadoop.io.serializer.avro.AvroReflectSerialization","hadoop.security.kms.client.failover.sleep.max.millis":"2000","hadoop.security.group.mapping.ldap.directory.search.timeout":"10000","yarn.scheduler.configuration.store.max-logs":"1000","yarn.nodemanager.node-attributes.provider.fetch-interval-ms":"600000","fs.swift.impl":"org.apache.hadoop.fs.swift.snative.SwiftNativeFileSystem","yarn.nodemanager.local-cache.max-files-per-directory":"8192","hadoop.http.cross-origin.enabled":"false","dfs.namenode.rpc-address":"ip-172-31-102-115.ec2.internal:8020","hadoop.zk.acl":"world:anyone:rwcda","mapreduce.map.sort.spill.percent":"0.80","yarn.timeline-service.entity-group-fs-store.scan-interval-seconds":"60","dfs.datanode.fsdataset.volume.choosing.policy":"org.apache.hadoop.hdfs.server.datanode.fsdataset.AvailableSpaceVolumeChoosingPolicy","yarn.node-attribute.fs-store.impl.class":"org.apache.hadoop.yarn.server.resourcemanager.nodelabels.FileSystemNodeAttributeStore","fs.s3a.retry.interval":"500ms","yarn.timeline-service.client.best-effort":"false","yarn.resourcemanager.webapp.delegation-token-auth-filter.enabled":"*********(redacted)","hadoop.security.group.mapping.ldap.posix.attr.uid.name":"uidNumber","fs.AbstractFileSystem.swebhdfs.impl":"org.apache.hadoop.fs.SWebHdfs","yarn.nodemanager.elastic-memory-control.timeout-sec":"5","mapreduce.ifile.readahead":"true","yarn.timeline-service.leveldb-timeline-store.ttl-interval-ms":"300000","yarn.timeline-service.reader.webapp.address":"${yarn.timeline-service.webapp.address}","yarn.resourcemanager.placement-constraints.algorithm.pool-size":"1","yarn.timeline-service.hbase.coprocessor.jar.hdfs.location":"/hbase/coprocessor/hadoop-yarn-server-timelineservice.jar","hadoop.security.kms.client.encrypted.key.cache.num.refill.threads":"2","yarn.resourcemanager.scheduler.class":"org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler","yarn.app.mapreduce.am.command-opts":"-Xmx2458m","mapreduce.cluster.local.dir":"/mnt/mapred,/mnt1/mapred","hadoop.proxyuser.hue.hosts":"*","io.mapfile.bloom.error.rate":"0.005","fs.client.resolve.topology.enabled":"false","hadoop.proxyuser.hue.groups":"*","yarn.nodemanager.runtime.linux.allowed-runtimes":"default,docker","yarn.sharedcache.store.class":"org.apache.hadoop.yarn.server.sharedcachemanager.store.InMemorySCMStore","ha.failover-controller.graceful-fence.rpc-timeout.ms":"5000","ftp.replication":"3","hadoop.security.uid.cache.secs":"14400","mapreduce.job.maxtaskfailures.per.tracker":"3","fs.s3a.metadatastore.impl":"org.apache.hadoop.fs.s3a.s3guard.NullMetadataStore","io.skip.checksum.errors":"false","yarn.app.mapreduce.client-am.ipc.max-retries-on-timeouts":"3","yarn.timeline-service.webapp.xfs-filter.xframe-options":"SAMEORIGIN","fs.s3a.connection.timeout":"200000","yarn.nodemanager.linux-container-executor.group":"yarn","mapreduce.job.max.split.locations":"15","yarn.resourcemanager.nm-container-queuing.max-queue-length":"15","hadoop.registry.zk.session.timeout.ms":"60000","yarn.federation.cache-ttl.secs":"300","mapreduce.jvm.system-properties-to-log":"os.name,os.version,java.home,java.runtime.version,java.vendor,java.version,java.vm.name,java.class.path,java.io.tmpdir,user.dir,user.name","yarn.resourcemanager.opportunistic-container-allocation.nodes-used":"10","yarn.timeline-service.entity-group-fs-store.active-dir":"/tmp/entity-file-history/active","mapreduce.shuffle.transfer.buffer.size":"131072","yarn.timeline-service.client.retry-interval-ms":"1000","yarn.timeline-service.flowname.max-size":"0","yarn.http.policy":"HTTP_ONLY","fs.s3a.socket.send.buffer":"8192","fs.AbstractFileSystem.abfss.impl":"org.apache.hadoop.fs.azurebfs.Abfss","yarn.sharedcache.uploader.server.address":"0.0.0.0:8046","yarn.resourcemanager.delegation-token.max-conf-size-bytes":"*********(redacted)","hadoop.http.authentication.token.validity":"*********(redacted)","mapreduce.shuffle.max.connections":"0","yarn.minicluster.yarn.nodemanager.resource.memory-mb":"4096","mapreduce.job.emit-timeline-data":"false","yarn.nodemanager.resource.system-reserved-memory-mb":"-1","hadoop.kerberos.min.seconds.before.relogin":"60","mapreduce.jobhistory.move.thread-count":"3","yarn.resourcemanager.admin.client.thread-count":"1","yarn.dispatcher.drain-events.timeout":"300000","fs.s3a.buffer.dir":"${hadoop.tmp.dir}/s3a","hadoop.ssl.enabled.protocols":"TLSv1,SSLv2Hello,TLSv1.1,TLSv1.2","mapreduce.jobhistory.admin.address":"0.0.0.0:10033","yarn.log-aggregation-status.time-out.ms":"600000","fs.s3a.assumed.role.sts.endpoint.region":"us-west-1","mapreduce.shuffle.port":"13562","yarn.resourcemanager.max-log-aggregation-diagnostics-in-memory":"10","yarn.nodemanager.health-checker.interval-ms":"600000","yarn.router.clientrm.interceptor-class.pipeline":"org.apache.hadoop.yarn.server.router.clientrm.DefaultClientRequestInterceptor","yarn.resourcemanager.zk-appid-node.split-index":"0","ftp.blocksize":"67108864","yarn.nodemanager.runtime.linux.sandbox-mode.local-dirs.permissions":"read","yarn.router.rmadmin.interceptor-class.pipeline":"org.apache.hadoop.yarn.server.router.rmadmin.DefaultRMAdminRequestInterceptor","yarn.nodemanager.log-container-debug-info.enabled":"true","yarn.client.max-cached-nodemanagers-proxies":"0","yarn.nodemanager.linux-container-executor.cgroups.delete-delay-ms":"20","yarn.nodemanager.delete.debug-delay-sec":"0","yarn.nodemanager.pmem-check-enabled":"true","yarn.nodemanager.disk-health-checker.max-disk-utilization-per-disk-percentage":"90.0","mapreduce.app-submission.cross-platform":"false","yarn.resourcemanager.work-preserving-recovery.scheduling-wait-ms":"10000","yarn.nodemanager.container-retry-minimum-interval-ms":"1000","hadoop.security.groups.cache.secs":"300","yarn.scheduler.increment-allocation-mb":"32","yarn.federation.enabled":"false","fs.azure.local.sas.key.mode":"false","ipc.maximum.data.length":"67108864","mapreduce.shuffle.max.threads":"0","yarn.router.pipeline.cache-max-size":"25","yarn.resourcemanager.nm-container-queuing.load-comparator":"QUEUE_LENGTH","hadoop.security.authorization":"false","yarn.app.mapreduce.am.jhs.backup.enabled":"true","mapreduce.job.complete.cancel.delegation.tokens":"*********(redacted)","fs.s3a.paging.maximum":"5000","nfs.exports.allowed.hosts":"* rw","yarn.nodemanager.amrmproxy.ha.enable":"false","mapreduce.jobhistory.http.policy":"HTTP_ONLY","yarn.sharedcache.store.in-memory.check-period-mins":"720","mapreduce.reduce.java.opts":"-Xmx2458m","hadoop.security.group.mapping.ldap.ssl":"false","yarn.client.application-client-protocol.poll-interval-ms":"200","yarn.scheduler.configuration.leveldb-store.compaction-interval-secs":"86400","yarn.timeline-service.writer.class":"org.apache.hadoop.yarn.server.timelineservice.storage.HBaseTimelineWriterImpl","ha.zookeeper.parent-znode":"/hadoop-ha","yarn.resourcemanager.submission-preprocessor.file-refresh-interval-ms":"60000","dfs.namenode.safemode.extension":"5000","yarn.nodemanager.log-aggregation.policy.class":"org.apache.hadoop.yarn.server.nodemanager.containermanager.logaggregation.AllContainerLogAggregationPolicy","mapreduce.reduce.shuffle.merge.percent":"0.66","hadoop.security.group.mapping.ldap.search.filter.group":"(objectClass=group)","yarn.resourcemanager.placement-constraints.scheduler.pool-size":"1","yarn.nodemanager.resourcemanager.minimum.version":"NONE","mapreduce.job.speculative.speculative-cap-running-tasks":"0.1","yarn.admin.acl":"*","dfs.namenode.replication.max-streams-hard-limit":"40","yarn.nodemanager.recovery.supervised":"true","yarn.sharedcache.admin.thread-count":"1","yarn.resourcemanager.ha.automatic-failover.enabled":"true","mapreduce.reduce.skip.maxgroups":"0","mapreduce.reduce.shuffle.connect.timeout":"180000","yarn.resourcemanager.address":"ip-172-31-102-115.ec2.internal:8032","ipc.client.ping":"true","mapreduce.task.local-fs.write-limit.bytes":"-1","mapred.output.committer.class":"org.apache.hadoop.mapred.DirectFileOutputCommitter","fs.adl.oauth2.access.token.provider.type":"*********(redacted)","mapreduce.shuffle.ssl.file.buffer.size":"65536","yarn.resourcemanager.ha.automatic-failover.embedded":"true","yarn.nodemanager.resource-plugins.gpu.docker-plugin":"nvidia-docker-v1","hadoop.ssl.enabled":"false","fs.s3a.multipart.purge":"false","yarn.scheduler.configuration.store.class":"file","yarn.resourcemanager.nm-container-queuing.queue-limit-stdev":"1.0f","mapreduce.job.end-notification.max.attempts":"5","mapreduce.output.fileoutputformat.compress.codec":"org.apache.hadoop.io.compress.DefaultCodec","yarn.nodemanager.container-monitor.procfs-tree.smaps-based-rss.enabled":"false","ipc.client.bind.wildcard.addr":"false","yarn.resourcemanager.webapp.rest-csrf.enabled":"false","ha.health-monitor.connect-retry-interval.ms":"1000","yarn.nodemanager.keytab":"/etc/krb5.keytab","hadoop.security.key.provider.path":"kms://http@ip-172-31-102-115.ec2.internal:9600/kms","mapreduce.jobhistory.keytab":"/etc/security/keytab/jhs.service.keytab","fs.s3a.threads.max":"10","mapreduce.reduce.shuffle.input.buffer.percent":"0.70","hadoop.security.token.service.use_ip":"*********(redacted)","yarn.nodemanager.runtime.linux.docker.allowed-container-networks":"emr-docker-bridge,host,bridge","yarn.nodemanager.node-labels.resync-interval-ms":"120000","hadoop.tmp.dir":"/mnt/var/lib/hadoop/tmp","mapreduce.job.maps":"36","mapreduce.jobhistory.webapp.rest-csrf.custom-header":"X-XSRF-Header","mapreduce.job.end-notification.max.retry.interval":"5000","yarn.log-aggregation.retain-check-interval-seconds":"-1","yarn.resourcemanager.resource-tracker.client.thread-count":"64","yarn.nodemanager.containers-launcher.class":"org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainersLauncher","yarn.rm.system-metrics-publisher.emit-container-events":"false","yarn.timeline-service.leveldb-timeline-store.start-time-read-cache-size":"10000","yarn.resourcemanager.ha.automatic-failover.zk-base-path":"/yarn-leader-election","io.seqfile.local.dir":"${hadoop.tmp.dir}/io/local","fs.s3a.s3guard.ddb.throttle.retry.interval":"100ms","fs.AbstractFileSystem.wasb.impl":"org.apache.hadoop.fs.azure.Wasb","mapreduce.client.submit.file.replication":"10","mapreduce.jobhistory.minicluster.fixed.ports":"false","fs.s3a.multipart.threshold":"2147483647","yarn.resourcemanager.webapp.xfs-filter.xframe-options":"SAMEORIGIN","mapreduce.jobhistory.done-dir":"${yarn.app.mapreduce.am.staging-dir}/history/done","dfs.namenode.name.dir":"/mnt/namenode,/mnt1/namenode","ipc.client.idlethreshold":"4000","yarn.nodemanager.linux-container-executor.cgroups.strict-resource-usage":"false","yarn.nodemanager.runtime.linux.docker.default-ro-mounts":"/etc/passwd:/etc/passwd,/usr/lib:/docker/usr/lib,/usr/share:/docker/usr/share","mapreduce.reduce.input.buffer.percent":"0.0","yarn.nodemanager.runtime.linux.docker.userremapping-gid-threshold":"1","yarn.nodemanager.webapp.rest-csrf.enabled":"false","fs.ftp.host.port":"21","ipc.ping.interval":"60000","yarn.resourcemanager.history-writer.multi-threaded-dispatcher.pool-size":"10","yarn.resourcemanager.admin.address":"${yarn.resourcemanager.hostname}:8033","file.client-write-packet-size":"65536","ipc.client.kill.max":"10","mapreduce.reduce.speculative":"true","hadoop.security.key.default.bitlength":"256","mapreduce.job.reducer.unconditional-preempt.delay.sec":"300","yarn.nodemanager.disk-health-checker.interval-ms":"120000","yarn.nodemanager.log.deletion-threads-count":"4","yarn.webapp.filter-entity-list-by-user":"false","yarn.web-proxy.address":"ip-172-31-102-115.ec2.internal:20888","ipc.client.connection.maxidletime":"10000","mapreduce.task.io.sort.mb":"200","yarn.nodemanager.localizer.client.thread-count":"20","io.erasurecode.codec.rs.rawcoders":"rs_native,rs_java","io.erasurecode.codec.rs-legacy.rawcoders":"rs-legacy_java","yarn.sharedcache.admin.address":"0.0.0.0:8047","yarn.resourcemanager.placement-constraints.algorithm.iterator":"SERIAL","yarn.nodemanager.localizer.cache.cleanup.interval-ms":"600000","hadoop.security.crypto.codec.classes.aes.ctr.nopadding":"org.apache.hadoop.crypto.OpensslAesCtrCryptoCodec, org.apache.hadoop.crypto.JceAesCtrCryptoCodec","mapreduce.job.cache.limit.max-resources-mb":"0","fs.s3a.connection.ssl.enabled":"true","yarn.nodemanager.process-kill-wait.ms":"5000","mapreduce.job.hdfs-servers":"${fs.defaultFS}","hadoop.workaround.non.threadsafe.getpwuid":"true","fs.df.interval":"60000","yarn.dispatcher.exit-on-error":"true","fs.s3a.multiobjectdelete.enable":"true","yarn.sharedcache.cleaner.resource-sleep-ms":"0","yarn.nodemanager.disk-health-checker.min-healthy-disks":"0.25","hadoop.shell.missing.defaultFs.warning":"false","io.file.buffer.size":"65536","dfs.permissions.superusergroup":"hadoop","hadoop.security.group.mapping.ldap.search.attr.member":"member","hadoop.security.random.device.file.path":"/dev/urandom","hadoop.security.sensitive-config-keys":"*********(redacted)","fs.s3a.s3guard.ddb.max.retries":"9","hadoop.rpc.socket.factory.class.default":"org.apache.hadoop.net.StandardSocketFactory","yarn.intermediate-data-encryption.enable":"false","yarn.resourcemanager.connect.retry-interval.ms":"30000","yarn.nodemanager.container.stderr.pattern":"{*stderr*,*STDERR*}","fs.s3bfs.impl":"org.apache.hadoop.fs.s3.S3FileSystem","yarn.scheduler.minimum-allocation-mb":"32","yarn.app.mapreduce.am.staging-dir":"/tmp/hadoop-yarn/staging","mapreduce.reduce.shuffle.read.timeout":"180000","hadoop.http.cross-origin.max-age":"1800","io.erasurecode.codec.xor.rawcoders":"xor_native,xor_java","fs.s3a.connection.establish.timeout":"5000","mapreduce.job.running.map.limit":"0","yarn.minicluster.control-resource-monitoring":"false","hadoop.ssl.require.client.cert":"false","hadoop.kerberos.kinit.command":"kinit","yarn.federation.state-store.class":"org.apache.hadoop.yarn.server.federation.store.impl.MemoryFederationStateStore","mapreduce.reduce.log.level":"INFO","hadoop.security.dns.log-slow-lookups.threshold.ms":"1000","mapreduce.job.ubertask.enable":"false","adl.http.timeout":"-1","yarn.resourcemanager.placement-constraints.retry-attempts":"3","hadoop.caller.context.enabled":"false","hadoop.security.group.mapping.ldap.num.attempts":"3","yarn.nodemanager.vmem-pmem-ratio":"5","hadoop.rpc.protection":"authentication","ha.health-monitor.rpc-timeout.ms":"45000","yarn.nodemanager.remote-app-log-dir":"/var/log/hadoop-yarn/apps","hadoop.zk.timeout-ms":"10000","fs.s3a.s3guard.cli.prune.age":"86400000","yarn.nodemanager.resource.pcores-vcores-multiplier":"1.0","yarn.nodemanager.runtime.linux.sandbox-mode":"disabled","yarn.app.mapreduce.am.containerlauncher.threadpool-initial-size":"10","fs.s3a.committer.threads":"8","hadoop.zk.retry-interval-ms":"1000","hadoop.security.crypto.buffer.size":"8192","yarn.nodemanager.node-labels.provider.fetch-interval-ms":"600000","mapreduce.jobhistory.recovery.store.leveldb.path":"${hadoop.tmp.dir}/mapred/history/recoverystore","yarn.client.failover-retries-on-socket-timeouts":"0","yarn.nodemanager.resource.memory.enabled":"false","fs.azure.authorization.caching.enable":"true","hadoop.security.instrumentation.requires.admin":"false","yarn.nodemanager.delete.thread-count":"4","mapreduce.job.finish-when-all-reducers-done":"true","hadoop.registry.jaas.context":"Client","yarn.timeline-service.leveldb-timeline-store.path":"${hadoop.tmp.dir}/yarn/timeline","io.map.index.interval":"128","yarn.resourcemanager.nm-container-queuing.max-queue-wait-time-ms":"100","fs.abfs.impl":"org.apache.hadoop.fs.azurebfs.AzureBlobFileSystem","mapreduce.job.counters.max":"120","mapreduce.jobhistory.webapp.rest-csrf.enabled":"false","yarn.timeline-service.store-class":"org.apache.hadoop.yarn.server.timeline.EntityGroupFSTimelineStore","mapreduce.jobhistory.move.interval-ms":"180000","fs.s3a.change.detection.version.required":"true","yarn.nodemanager.localizer.fetch.thread-count":"20","yarn.resourcemanager.scheduler.client.thread-count":"64","hadoop.ssl.hostname.verifier":"DEFAULT","yarn.timeline-service.leveldb-state-store.path":"${hadoop.tmp.dir}/yarn/timeline","mapreduce.job.classloader":"false","mapreduce.task.profile.map.params":"${mapreduce.task.profile.params}","ipc.client.connect.timeout":"20000","hadoop.security.auth_to_local.mechanism":"hadoop","yarn.timeline-service.app-collector.linger-period.ms":"60000","yarn.nm.liveness-monitor.expiry-interval-ms":"600000","yarn.resourcemanager.reservation-system.planfollower.time-step":"1000","yarn.nodemanager.runtime.linux.docker.enable-userremapping.allowed":"true","hadoop.proxyuser.hadoop.groups":"*","yarn.webapp.api-service.enable":"true","yarn.nodemanager.recovery.enabled":"true","mapreduce.job.end-notification.retry.interval":"1000","fs.du.interval":"600000","fs.ftp.impl":"org.apache.hadoop.fs.ftp.FTPFileSystem","yarn.nodemanager.container.stderr.tail.bytes":"4096","hadoop.security.group.mapping.ldap.read.timeout.ms":"60000","mapreduce.map.env":"HADOOP_MAPRED_HOME=/usr/lib/hadoop-mapreduce","hadoop.security.groups.cache.warn.after.ms":"5000","file.bytes-per-checksum":"512","mapreduce.outputcommitter.factory.scheme.s3a":"org.apache.hadoop.fs.s3a.commit.S3ACommitterFactory","hadoop.security.groups.cache.background.reload":"false","yarn.nodemanager.container-monitor.enabled":"true","yarn.nodemanager.elastic-memory-control.enabled":"false","net.topology.script.number.args":"100","mapreduce.task.merge.progress.records":"10000","yarn.nodemanager.localizer.address":"${yarn.nodemanager.hostname}:8040","yarn.timeline-service.keytab":"/etc/krb5.keytab","mapreduce.reduce.shuffle.fetch.retry.timeout-ms":"30000","yarn.resourcemanager.rm.container-allocation.expiry-interval-ms":"600000","mapreduce.fileoutputcommitter.algorithm.version":"2","yarn.resourcemanager.work-preserving-recovery.enabled":"true","mapreduce.map.skip.maxrecords":"0","yarn.sharedcache.root-dir":"/sharedcache","fs.s3a.retry.throttle.limit":"${fs.s3a.attempts.maximum}","hadoop.http.authentication.type":"simple","mapreduce.job.jvm.numtasks":"20","mapreduce.job.cache.limit.max-resources":"0","mapreduce.task.userlog.limit.kb":"0","yarn.resourcemanager.scheduler.monitor.enable":"false","ipc.client.connect.max.retries":"10","hadoop.registry.zk.retry.times":"5","dfs.namenode.http-address":"ip-172-31-102-115.ec2.internal:9870","yarn.nodemanager.resource-monitor.interval-ms":"3000","yarn.nodemanager.resource-plugins.gpu.allowed-gpu-devices":"auto","mapreduce.job.sharedcache.mode":"disabled","yarn.app.mapreduce.am.jhs.backup-dir":"file:///var/log/hadoop-mapreduce/history","yarn.nodemanager.webapp.rest-csrf.custom-header":"X-XSRF-Header","mapreduce.shuffle.listen.queue.size":"128","yarn.scheduler.configuration.mutation.acl-policy.class":"org.apache.hadoop.yarn.server.resourcemanager.scheduler.DefaultConfigurationMutationACLPolicy","mapreduce.map.cpu.vcores":"1","yarn.log-aggregation.file-formats":"TFile","yarn.timeline-service.client.fd-retain-secs":"300","hadoop.user.group.static.mapping.overrides":"dr.who=;","fs.azure.sas.expiry.period":"90d","mapreduce.jobhistory.recovery.store.class":"org.apache.hadoop.mapreduce.v2.hs.HistoryServerFileSystemStateStoreService","yarn.resourcemanager.fail-fast":"${yarn.fail-fast}","yarn.resourcemanager.proxy-user-privileges.enabled":"false","yarn.router.webapp.interceptor-class.pipeline":"org.apache.hadoop.yarn.server.router.webapp.DefaultRequestInterceptorREST","yarn.nodemanager.resource.memory.cgroups.soft-limit-percentage":"90.0","mapreduce.job.reducer.preempt.delay.sec":"0","hadoop.util.hash.type":"murmur","yarn.nodemanager.disk-validator":"basic","yarn.app.mapreduce.client.job.max-retries":"3","mapreduce.reduce.shuffle.retry-delay.max.ms":"60000","hadoop.security.group.mapping.ldap.connection.timeout.ms":"60000","mapreduce.task.profile.params":"-agentlib:hprof=cpu=samples,heap=sites,force=n,thread=y,verbose=n,file=%s","yarn.app.mapreduce.shuffle.log.backups":"0","yarn.nodemanager.container-diagnostics-maximum-size":"10000","hadoop.registry.zk.retry.interval.ms":"1000","yarn.nodemanager.linux-container-executor.cgroups.delete-timeout-ms":"1000","fs.AbstractFileSystem.file.impl":"org.apache.hadoop.fs.local.LocalFs","yarn.nodemanager.log-aggregation.roll-monitoring-interval-seconds":"-1","mapreduce.jobhistory.cleaner.interval-ms":"86400000","hadoop.registry.zk.quorum":"ip-172-31-102-115.ec2.internal:2181","mapreduce.output.fileoutputformat.compress":"false","yarn.resourcemanager.am-rm-tokens.master-key-rolling-interval-secs":"*********(redacted)","fs.s3a.assumed.role.session.duration":"30m","hadoop.security.group.mapping.ldap.conversion.rule":"none","hadoop.proxyuser.livy.hosts":"*","hadoop.ssl.server.conf":"ssl-server.xml","fs.s3a.retry.throttle.interval":"1000ms","seq.io.sort.factor":"100","yarn.sharedcache.cleaner.initial-delay-mins":"10","mapreduce.client.completion.pollinterval":"5000","hadoop.ssl.keystores.factory.class":"org.apache.hadoop.security.ssl.FileBasedKeyStoresFactory","yarn.app.mapreduce.am.resource.cpu-vcores":"1","yarn.timeline-service.enabled":"false","yarn.nodemanager.runtime.linux.docker.capabilities":"CHOWN,DAC_OVERRIDE,FSETID,FOWNER,MKNOD,NET_RAW,SETGID,SETUID, SETFCAP,SETPCAP,NET_BIND_SERVICE,SYS_CHROOT,KILL,AUDIT_WRITE","yarn.acl.enable":"false","yarn.timeline-service.entity-group-fs-store.done-dir":"/tmp/entity-file-history/done/","mapreduce.tasktracker.map.tasks.maximum":"1","hadoop.security.group.mapping.ldap.num.attempts.before.failover":"3","mapreduce.task.profile":"false","dfs.webhdfs.enabled":"true","yarn.resourcemanager.fs.state-store.uri":"${hadoop.tmp.dir}/yarn/system/rmstore","mapreduce.jobhistory.always-scan-user-dir":"false","yarn.nodemanager.opportunistic-containers-use-pause-for-preemption":"false","yarn.nodemanager.linux-container-executor.nonsecure-mode.local-user":"hadoop","mapred.output.direct.EmrFileSystem":"true","yarn.timeline-service.reader.class":"org.apache.hadoop.yarn.server.timelineservice.storage.HBaseTimelineReaderImpl","yarn.resourcemanager.configuration.provider-class":"org.apache.hadoop.yarn.LocalConfigurationProvider","yarn.nodemanager.runtime.linux.docker.userremapping-uid-threshold":"1","yarn.resourcemanager.configuration.file-system-based-store":"/yarn/conf","mapreduce.job.cache.limit.max-single-resource-mb":"0","yarn.nodemanager.runtime.linux.docker.stop.grace-period":"10","yarn.resourcemanager.resource-profiles.source-file":"resource-profiles.json","yarn.nodemanager.resource.percentage-physical-cpu-limit":"100","mapreduce.jobhistory.client.thread-count":"10","tfile.fs.input.buffer.size":"262144","mapreduce.client.progressmonitor.pollinterval":"1000","yarn.nodemanager.log-dirs":"/var/log/hadoop-yarn/containers","hadoop.security.auth_to_local":"\n RULE:[1:$1@$0](.*@)s/@.*///L\n RULE:[2:$1@$0](.*@)s/@.*///L\n DEFAULT\n ","fs.automatic.close":"true","yarn.nodemanager.hostname":"0.0.0.0","yarn.nodemanager.resource.memory.cgroups.swappiness":"0","fs.s3n.impl":"com.amazon.ws.emr.hadoop.fs.EmrFileSystem","ftp.stream-buffer-size":"4096","yarn.fail-fast":"false","yarn.timeline-service.app-aggregation-interval-secs":"15","hadoop.security.group.mapping.ldap.search.filter.user":"(&(objectClass=user)(sAMAccountName={0}))","yarn.nodemanager.container-localizer.log.level":"INFO","yarn.timeline-service.address":"${yarn.timeline-service.hostname}:10200","dfs.namenode.replication.work.multiplier.per.iteration":"10","mapreduce.job.ubertask.maxmaps":"9","fs.s3a.threads.keepalivetime":"60","dfs.datanode.available-space-volume-choosing-policy.balanced-space-preference-fraction":"1.0","mapreduce.jobhistory.webapp.rest-csrf.methods-to-ignore":"GET,OPTIONS,HEAD","mapreduce.task.files.preserve.failedtasks":"false","yarn.app.mapreduce.client.job.retry-interval":"2000","ha.failover-controller.graceful-fence.connection.retries":"1","yarn.resourcemanager.delegation.token.max-lifetime":"*********(redacted)","yarn.timeline-service.client.drain-entities.timeout.ms":"2000","yarn.nodemanager.resource-plugins.fpga.vendor-plugin.class":"org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.fpga.IntelFpgaOpenclPlugin","yarn.timeline-service.entity-group-fs-store.summary-store":"org.apache.hadoop.yarn.server.timeline.RollingLevelDBTimelineStore","mapreduce.reduce.cpu.vcores":"1","hadoop.proxyuser.oozie.groups":"*","mapreduce.job.encrypted-intermediate-data.buffer.kb":"128","fs.client.resolve.remote.symlinks":"true","yarn.nodemanager.webapp.https.address":"0.0.0.0:8044","hadoop.http.cross-origin.allowed-origins":"*","mapreduce.job.encrypted-intermediate-data":"false","yarn.timeline-service.entity-group-fs-store.retain-seconds":"604800","yarn.resourcemanager.metrics.runtime.buckets":"60,300,1440","yarn.timeline-service.generic-application-history.max-applications":"10000","mapreduce.tasktracker.reduce.tasks.maximum":"1","yarn.nodemanager.local-dirs":"/mnt/yarn,/mnt1/yarn","mapreduce.shuffle.connection-keep-alive.enable":"false","yarn.node-labels.configuration-type":"distributed","fs.s3a.path.style.access":"false","yarn.nodemanager.aux-services.mapreduce_shuffle.class":"org.apache.hadoop.mapred.ShuffleHandler","yarn.sharedcache.store.in-memory.staleness-period-mins":"10080","fs.adl.impl":"org.apache.hadoop.fs.adl.AdlFileSystem","yarn.resourcemanager.nodemanager.minimum.version":"NONE","mapreduce.jobhistory.webapp.xfs-filter.xframe-options":"SAMEORIGIN","yarn.app.mapreduce.am.staging-dir.erasurecoding.enabled":"false","net.topology.impl":"org.apache.hadoop.net.NetworkTopology","io.map.index.skip":"0","fs.AbstractFileSystem.s3.impl":"org.apache.hadoop.fs.s3.EMRFSDelegate","yarn.timeline-service.reader.webapp.https.address":"${yarn.timeline-service.webapp.https.address}","fs.ftp.data.connection.mode":"ACTIVE_LOCAL_DATA_CONNECTION_MODE","mapreduce.job.userlog.retain.hours":"48","mapreduce.job.local-fs.single-disk-limit.check.kill-limit-exceed":"true","yarn.scheduler.maximum-allocation-vcores":"128","hadoop.http.cross-origin.allowed-headers":"X-Requested-With,Content-Type,Accept,Origin","yarn.nodemanager.log-aggregation.compression-type":"none","yarn.timeline-service.version":"1.5","yarn.ipc.rpc.class":"org.apache.hadoop.yarn.ipc.HadoopYarnProtoRPC","mapreduce.reduce.maxattempts":"4","hadoop.security.dns.log-slow-lookups.enabled":"false","mapreduce.job.committer.setup.cleanup.needed":"true","mapreduce.job.running.reduce.limit":"0","ipc.maximum.response.length":"134217728","yarn.resourcemanager.webapp.rest-csrf.methods-to-ignore":"GET,OPTIONS,HEAD","mapreduce.job.token.tracking.ids.enabled":"*********(redacted)","hadoop.caller.context.max.size":"128","yarn.nodemanager.runtime.linux.docker.host-pid-namespace.allowed":"false","yarn.nodemanager.runtime.linux.docker.delayed-removal.allowed":"false","hadoop.registry.system.acls":"sasl:yarn@, sasl:mapred@, sasl:hdfs@","yarn.nodemanager.recovery.dir":"${hadoop.tmp.dir}/yarn-nm-recovery","fs.s3a.fast.upload.buffer":"disk","mapreduce.jobhistory.intermediate-done-dir":"${yarn.app.mapreduce.am.staging-dir}/history/done_intermediate","yarn.app.mapreduce.shuffle.log.separate":"true","yarn.log-aggregation.debug.filesize":"104857600","fs.s3a.max.total.tasks":"5","fs.s3a.readahead.range":"64K","hadoop.http.authentication.simple.anonymous.allowed":"true","fs.s3a.attempts.maximum":"20","hadoop.registry.zk.connection.timeout.ms":"15000","yarn.resourcemanager.delegation-token-renewer.thread-count":"*********(redacted)","yarn.nodemanager.health-checker.script.timeout-ms":"1200000","yarn.timeline-service.leveldb-timeline-store.start-time-write-cache-size":"10000","yarn.nodemanager.emit-container-events":"true","yarn.log.server.url":"http://ip-172-31-102-115.ec2.internal:19888/jobhistory/logs","yarn.resourcemanager.resource-profiles.enabled":"false","yarn.timeline-service.hbase-schema.prefix":"prod.","fs.azure.authorization":"false","mapreduce.map.log.level":"INFO","yarn.resourcemanager.decommissioning-nodes-watcher.poll-interval-secs":"20","hadoop.job.history.user.location":"none","mapreduce.output.fileoutputformat.compress.type":"BLOCK","yarn.resourcemanager.leveldb-state-store.path":"${hadoop.tmp.dir}/yarn/system/rmstore","yarn.timeline-service.webapp.rest-csrf.custom-header":"X-XSRF-Header","mapreduce.ifile.readahead.bytes":"4194304","yarn.sharedcache.app-checker.class":"org.apache.hadoop.yarn.server.sharedcachemanager.RemoteAppChecker","yarn.nodemanager.linux-container-executor.nonsecure-mode.limit-users":"true","yarn.nodemanager.resource.detect-hardware-capabilities":"false","mapreduce.cluster.acls.enabled":"false","mapreduce.job.speculative.retry-after-no-speculate":"1000","hadoop.security.group.mapping.ldap.search.group.hierarchy.levels":"0","yarn.resourcemanager.fs.state-store.retry-interval-ms":"1000","hadoop.proxyuser.hadoop.hosts":"*","yarn.resourcemanager.nodes.exclude-path":"/emr/instance-controller/lib/yarn.nodes.exclude.xml","file.stream-buffer-size":"4096","yarn.resourcemanager.application-timeouts.monitor.interval-ms":"3000","mapreduce.map.output.compress.codec":"org.apache.hadoop.io.compress.SnappyCodec","mapreduce.map.speculative":"true","mapreduce.job.speculative.retry-after-speculate":"15000","yarn.nodemanager.linux-container-executor.cgroups.mount":"false","yarn.app.mapreduce.am.container.log.backups":"0","yarn.app.mapreduce.am.log.level":"INFO","mapreduce.job.reduce.slowstart.completedmaps":"0.05","yarn.timeline-service.http-authentication.type":"simple","hadoop.security.group.mapping.ldap.search.attr.group.name":"cn","yarn.nodemanager.resource-plugins.fpga.allowed-fpga-devices":"auto","yarn.timeline-service.client.internal-timers-ttl-secs":"420","hadoop.http.logs.enabled":"true","fs.s3a.block.size":"32M","yarn.sharedcache.client-server.address":"0.0.0.0:8045","yarn.nodemanager.logaggregation.threadpool-size-max":"100","yarn.resourcemanager.hostname":"172.31.102.115","yarn.resourcemanager.delegation.key.update-interval":"86400000","mapreduce.reduce.shuffle.fetch.retry.enabled":"${yarn.nodemanager.recovery.enabled}","mapreduce.map.memory.mb":"1536","mapreduce.task.skip.start.attempts":"2","fs.AbstractFileSystem.hdfs.impl":"org.apache.hadoop.fs.Hdfs","yarn.nodemanager.disk-health-checker.enable":"true","ipc.client.tcpnodelay":"true","ipc.client.rpc-timeout.ms":"0","yarn.nodemanager.webapp.rest-csrf.methods-to-ignore":"GET,OPTIONS,HEAD","ipc.client.low-latency":"false","mapreduce.input.lineinputformat.linespermap":"1","yarn.router.interceptor.user.threadpool-size":"5","ipc.client.connect.max.retries.on.timeouts":"5","yarn.timeline-service.leveldb-timeline-store.read-cache-size":"104857600","fs.AbstractFileSystem.har.impl":"org.apache.hadoop.fs.HarFs","mapreduce.job.split.metainfo.maxsize":"10000000","yarn.am.liveness-monitor.expiry-interval-ms":"600000","yarn.resourcemanager.container-tokens.master-key-rolling-interval-secs":"*********(redacted)","yarn.timeline-service.entity-group-fs-store.app-cache-size":"10","fs.s3a.socket.recv.buffer":"8192","yarn.application.classpath":"\n $HADOOP_CONF_DIR,\n $HADOOP_COMMON_HOME/*,$HADOOP_COMMON_HOME/lib/*,\n $HADOOP_HDFS_HOME/*,$HADOOP_HDFS_HOME/lib/*,\n $HADOOP_MAPRED_HOME/*,$HADOOP_MAPRED_HOME/lib/*,\n $HADOOP_YARN_HOME/*,$HADOOP_YARN_HOME/lib/*,\n /usr/lib/hadoop-lzo/lib/*,\n /usr/share/aws/emr/emrfs/conf,\n /usr/share/aws/emr/emrfs/lib/*,\n /usr/share/aws/emr/emrfs/auxlib/*,\n /usr/share/aws/emr/lib/*,\n /usr/share/aws/emr/ddb/lib/emr-ddb-hadoop.jar,\n /usr/share/aws/emr/goodies/lib/emr-hadoop-goodies.jar,\n /usr/lib/spark/yarn/lib/datanucleus-api-jdo.jar,\n /usr/lib/spark/yarn/lib/datanucleus-core.jar,\n /usr/lib/spark/yarn/lib/datanucleus-rdbms.jar,\n /usr/share/aws/emr/cloudwatch-sink/lib/*,\n /usr/share/aws/aws-java-sdk/*\n ","yarn.resourcemanager.resource-tracker.address":"ip-172-31-102-115.ec2.internal:8025","yarn.nodemanager.node-labels.provider.fetch-timeout-ms":"1200000","mapreduce.job.heap.memory-mb.ratio":"0.8","yarn.resourcemanager.leveldb-state-store.compaction-interval-secs":"3600","yarn.resourcemanager.webapp.rest-csrf.custom-header":"X-XSRF-Header","yarn.scheduler.configuration.fs.path":"file://${hadoop.tmp.dir}/yarn/system/schedconf","dfs.datanode.max.transfer.threads":"4096","mapreduce.client.output.filter":"FAILED","hadoop.http.filter.initializers":"org.apache.hadoop.security.HttpCrossOriginFilterInitializer,org.apache.hadoop.yarn.server.security.http.RMAuthenticationFilterInitializer,org.apache.hadoop.http.lib.StaticUserWebFilter","mapreduce.reduce.memory.mb":"3072","mapreduce.admin.user.env":"LD_LIBRARY_PATH=$HADOOP_COMMON_HOME/lib/native:/usr/lib/hadoop-lzo/lib/native","yarn.timeline-service.hostname":"ip-172-31-102-115.ec2.internal","file.replication":"1","yarn.nodemanager.container-metrics.unregister-delay-ms":"10000","yarn.nodemanager.container-metrics.period-ms":"-1","mapreduce.fileoutputcommitter.task.cleanup.enabled":"false","hadoop.proxyuser.oozie.hosts":"*","yarn.nodemanager.log.retain-seconds":"10800","yarn.timeline-service.entity-group-fs-store.cleaner-interval-seconds":"3600","yarn.resourcemanager.keytab":"/etc/krb5.keytab","hadoop.security.group.mapping.providers.combined":"true","mapreduce.reduce.merge.inmem.threshold":"1000","yarn.timeline-service.recovery.enabled":"false","fs.azure.saskey.usecontainersaskeyforallaccess":"true","yarn.sharedcache.nm.uploader.thread-count":"20","mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","yarn.resourcemanager.nodemanager-graceful-decommission-timeout-secs":"3600","mapreduce.shuffle.ssl.enabled":"false","yarn.timeline-service.hbase.coprocessor.app-final-value-retention-milliseconds":"259200000","fs.s3a.committer.staging.abort.pending.uploads":"true","yarn.nodemanager.opportunistic-containers-max-queue-length":"0","yarn.resourcemanager.state-store.max-completed-applications":"${yarn.resourcemanager.max-completed-applications}","mapreduce.job.speculative.minimum-allowed-tasks":"10","yarn.nodemanager.node-labels.provider":"config","yarn.log-aggregation.retain-seconds":"172800","yarn.nodemanager.disk-health-checker.min-free-space-per-disk-mb":"0","mapreduce.jobhistory.max-age-ms":"604800000","hadoop.http.cross-origin.allowed-methods":"GET,POST,HEAD","yarn.resourcemanager.opportunistic-container-allocation.enabled":"false","mapreduce.jobhistory.webapp.address":"ip-172-31-102-115.ec2.internal:19888","hadoop.system.tags":"YARN,HDFS,NAMENODE,DATANODE,REQUIRED,SECURITY,KERBEROS,PERFORMANCE,CLIENT\n ,SERVER,DEBUG,DEPRECATED,COMMON,OPTIONAL","yarn.log-aggregation.file-controller.TFile.class":"org.apache.hadoop.yarn.logaggregation.filecontroller.tfile.LogAggregationTFileController","yarn.client.nodemanager-connect.max-wait-ms":"180000","yarn.resourcemanager.webapp.address":"${yarn.resourcemanager.hostname}:8088","mapreduce.jobhistory.recovery.enable":"false","mapreduce.reduce.shuffle.parallelcopies":"20","fs.AbstractFileSystem.webhdfs.impl":"org.apache.hadoop.fs.WebHdfs","fs.trash.interval":"0","yarn.nodemanager.node-labels.provider.configured-node-partition":"CORE","yarn.app.mapreduce.client.max-retries":"3","hadoop.security.authentication":"simple","mapreduce.task.profile.reduce.params":"${mapreduce.task.profile.params}","dfs.datanode.du.reserved":"536870912","yarn.app.mapreduce.am.resource.mb":"3072","mapreduce.input.fileinputformat.list-status.num-threads":"1","io.compression.codec.lzo.class":"com.hadoop.compression.lzo.LzoCodec","yarn.nodemanager.container-executor.class":"org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor","io.mapfile.bloom.size":"1048576","yarn.timeline-service.ttl-ms":"604800000","yarn.resourcemanager.nm-container-queuing.min-queue-length":"5","yarn.nodemanager.resource.cpu-vcores":"8","mapreduce.job.reduces":"17","fs.s3a.multipart.size":"100M","mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","yarn.scheduler.minimum-allocation-vcores":"1","mapreduce.job.speculative.speculative-cap-total-tasks":"0.01","hadoop.ssl.client.conf":"ssl-client.xml","mapreduce.job.queuename":"default","mapreduce.job.encrypted-intermediate-data-key-size-bits":"128","fs.s3a.metadatastore.authoritative":"false","yarn.nodemanager.webapp.xfs-filter.xframe-options":"SAMEORIGIN","ha.health-monitor.sleep-after-disconnect.ms":"1000","yarn.app.mapreduce.shuffle.log.limit.kb":"0","hadoop.security.group.mapping":"org.apache.hadoop.security.JniBasedUnixGroupsMappingWithFallback","yarn.client.application-client-protocol.poll-timeout-ms":"-1","mapreduce.jobhistory.jhist.format":"binary","yarn.resourcemanager.ha.enabled":"false","hadoop.http.staticuser.user":"dr.who","mapreduce.task.exit.timeout.check-interval-ms":"20000","mapreduce.jobhistory.intermediate-user-done-dir.permissions":"770","mapreduce.task.exit.timeout":"60000","yarn.nodemanager.linux-container-executor.resources-handler.class":"org.apache.hadoop.yarn.server.nodemanager.util.DefaultLCEResourcesHandler","mapreduce.reduce.shuffle.memory.limit.percent":"0.25","yarn.resourcemanager.reservation-system.enable":"false","mapreduce.map.output.compress":"true","ha.zookeeper.acl":"world:anyone:rwcda","hadoop.proxyuser.presto.groups":"*","ipc.server.max.connections":"0","yarn.nodemanager.runtime.linux.docker.default-container-network":"host","yarn.router.webapp.address":"0.0.0.0:8089","yarn.scheduler.maximum-allocation-mb":"54272","yarn.resourcemanager.scheduler.monitor.policies":"org.apache.hadoop.yarn.server.resourcemanager.monitor.capacity.ProportionalCapacityPreemptionPolicy","yarn.sharedcache.cleaner.period-mins":"1440","yarn.nodemanager.resource-plugins.gpu.docker-plugin.nvidia-docker-v1.endpoint":"http://localhost:3476/v1.0/docker/cli","yarn.app.mapreduce.am.container.log.limit.kb":"0","ipc.client.connect.retry.interval":"1000","yarn.timeline-service.http-cross-origin.enabled":"true","fs.wasbs.impl":"org.apache.hadoop.fs.azure.NativeAzureFileSystem$Secure","hadoop.proxyuser.httpfs.groups":"*","yarn.federation.subcluster-resolver.class":"org.apache.hadoop.yarn.server.federation.resolver.DefaultSubClusterResolverImpl","yarn.resourcemanager.zk-state-store.parent-path":"/rmstore","mapreduce.jobhistory.cleaner.enable":"true","yarn.timeline-service.client.fd-flush-interval-secs":"10","hadoop.security.kms.client.encrypted.key.cache.expiry":"43200000","hadoop.proxyuser.httpfs.hosts":"*","yarn.client.nodemanager-client-async.thread-pool-max-size":"500","mapreduce.map.maxattempts":"4","yarn.resourcemanager.nm-container-queuing.sorting-nodes-interval-ms":"1000","fs.s3a.committer.staging.tmp.path":"tmp/staging","yarn.nodemanager.sleep-delay-before-sigkill.ms":"250","yarn.resourcemanager.nm-container-queuing.min-queue-wait-time-ms":"10","mapreduce.job.end-notification.retry.attempts":"0","yarn.nodemanager.resource.count-logical-processors-as-cores":"false","mapred.output.direct.NativeS3FileSystem":"true","hadoop.registry.zk.root":"/registry","adl.feature.ownerandgroup.enableupn":"false","yarn.resourcemanager.zk-max-znode-size.bytes":"1048576","mapreduce.job.reduce.shuffle.consumer.plugin.class":"org.apache.hadoop.mapreduce.task.reduce.Shuffle","yarn.resourcemanager.delayed.delegation-token.removal-interval-ms":"*********(redacted)","yarn.nodemanager.localizer.cache.target-size-mb":"10240","fs.s3a.committer.staging.conflict-mode":"fail","mapreduce.client.libjars.wildcard":"true","fs.s3a.committer.staging.unique-filenames":"true","yarn.nodemanager.node-attributes.provider.fetch-timeout-ms":"1200000","fs.s3a.list.version":"2","ftp.client-write-packet-size":"65536","fs.AbstractFileSystem.adl.impl":"org.apache.hadoop.fs.adl.Adl","hadoop.proxyuser.hive.hosts":"*","yarn.node-labels.fs-store.root-dir":"file:///mnt/var/lib/hadoop-yarn/nodelabels","hadoop.security.key.default.cipher":"AES/CTR/NoPadding","yarn.client.failover-retries":"0","fs.s3a.multipart.purge.age":"86400","mapreduce.job.local-fs.single-disk-limit.check.interval-ms":"5000","net.topology.node.switch.mapping.impl":"org.apache.hadoop.net.ScriptBasedMapping","yarn.nodemanager.amrmproxy.address":"0.0.0.0:8049","ipc.server.listen.queue.size":"128","map.sort.class":"org.apache.hadoop.util.QuickSort","fs.viewfs.rename.strategy":"SAME_MOUNTPOINT","hadoop.security.kms.client.authentication.retry-count":"1","fs.permissions.umask-mode":"022","fs.s3a.assumed.role.credentials.provider":"org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider","yarn.nodemanager.vmem-check-enabled":"true","yarn.nodemanager.numa-awareness.enabled":"false","yarn.nodemanager.recovery.compaction-interval-secs":"3600","yarn.app.mapreduce.client-am.ipc.max-retries":"3","yarn.federation.registry.base-dir":"yarnfederation/","mapreduce.job.max.map":"-1","mapreduce.job.local-fs.single-disk-limit.bytes":"-1","mapreduce.job.ubertask.maxreduces":"1","hadoop.security.kms.client.encrypted.key.cache.size":"500","hadoop.security.java.secure.random.algorithm":"SHA1PRNG","ha.failover-controller.cli-check.rpc-timeout.ms":"20000","mapreduce.jobhistory.jobname.limit":"50","mapreduce.application.classpath":"\n $HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*,\n $HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*,\n /usr/lib/hadoop-lzo/lib/*,\n /usr/share/aws/emr/emrfs/conf,\n /usr/share/aws/emr/emrfs/lib/*,\n /usr/share/aws/emr/emrfs/auxlib/*,\n /usr/share/aws/emr/lib/*,\n /usr/share/aws/emr/ddb/lib/emr-ddb-hadoop.jar,\n /usr/share/aws/emr/goodies/lib/emr-hadoop-goodies.jar,\n /usr/share/aws/emr/cloudwatch-sink/lib/*,\n /usr/share/aws/aws-java-sdk/*\n ","yarn.client.nodemanager-connect.retry-interval-ms":"10000","yarn.timeline-service.state-store-class":"org.apache.hadoop.yarn.server.timeline.recovery.LeveldbTimelineStateStore","yarn.nodemanager.env-whitelist":"JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME,PATH","yarn.sharedcache.nested-level":"3","yarn.timeline-service.webapp.rest-csrf.methods-to-ignore":"GET,OPTIONS,HEAD","fs.azure.user.agent.prefix":"unknown","yarn.resourcemanager.zk-delegation-token-node.split-index":"*********(redacted)","yarn.nodemanager.numa-awareness.read-topology":"false","yarn.nodemanager.webapp.address":"${yarn.nodemanager.hostname}:8042","rpc.metrics.quantile.enable":"false","yarn.registry.class":"org.apache.hadoop.registry.client.impl.FSRegistryOperationsService","mapreduce.jobhistory.admin.acl":"*","yarn.resourcemanager.system-metrics-publisher.dispatcher.pool-size":"10","yarn.scheduler.queue-placement-rules":"user-group","hadoop.http.authentication.kerberos.keytab":"${user.home}/hadoop.keytab","yarn.resourcemanager.recovery.enabled":"false","yarn.timeline-service.webapp.rest-csrf.enabled":"false","dfs.datanode.available-space-volume-choosing-policy.balanced-space-threshold":"10737418240"},"System Properties":{"java.io.tmpdir":"/tmp","line.separator":"\n","path.separator":":","sun.management.compiler":"HotSpot 64-Bit Tiered Compilers","SPARK_SUBMIT":"true","sun.cpu.endian":"little","java.specification.maintenance.version":"4","java.specification.version":"1.8","java.vm.specification.name":"Java Virtual Machine Specification","java.vendor":"Amazon.com Inc.","java.vm.specification.version":"1.8","user.home":"/home/hadoop","file.encoding.pkg":"sun.io","sun.nio.ch.bugLevel":"","sun.arch.data.model":"64","sun.boot.library.path":"/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre/lib/amd64","user.dir":"/mnt/var/lib/hadoop/steps/s-1EF238MZKOWWR","java.library.path":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native:/usr/java/packages/lib/amd64:/usr/lib64:/lib64:/lib:/usr/lib","sun.cpu.isalist":"","os.arch":"amd64","java.vm.version":"25.362-b08","jetty.git.hash":"84700530e645e812b336747464d6fbbf370c9a20","java.endorsed.dirs":"/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre/lib/endorsed","java.runtime.version":"1.8.0_362-b08","java.vm.info":"mixed mode","java.ext.dirs":"/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre/lib/ext:/usr/java/packages/lib/ext","java.runtime.name":"OpenJDK Runtime Environment","EMR_RELEASE_LABEL":"emr-6.2.0","file.separator":"/","java.class.version":"52.0","EMR_CLUSTER_ID":"j-14QV64S2PV1Y2","java.specification.name":"Java Platform API Specification","sun.boot.class.path":"/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre/lib/resources.jar:/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre/lib/rt.jar:/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre/lib/sunrsasign.jar:/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre/lib/jsse.jar:/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre/lib/jce.jar:/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre/lib/charsets.jar:/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre/lib/jfr.jar:/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre/classes","file.encoding":"UTF-8","user.timezone":"UTC","java.specification.vendor":"Oracle Corporation","sun.java.launcher":"SUN_STANDARD","os.version":"4.14.200-155.322.amzn2.x86_64","sun.os.patch.level":"unknown","java.vm.specification.vendor":"Oracle Corporation","user.country":"US","sun.jnu.encoding":"UTF-8","user.language":"en","java.vendor.url":"https://aws.amazon.com/corretto/","java.awt.printerjob":"sun.print.PSPrinterJob","java.awt.graphicsenv":"sun.awt.X11GraphicsEnvironment","awt.toolkit":"sun.awt.X11.XToolkit","os.name":"Linux","java.vm.vendor":"Amazon.com Inc.","java.vendor.url.bug":"https://github.com/corretto/corretto-8/issues/","user.name":"hadoop","java.vm.name":"OpenJDK 64-Bit Server VM","sun.java.command":"org.apache.spark.deploy.SparkSubmit /home/hadoop/index_data_etl_1GB.py","java.home":"/usr/lib/jvm/java-1.8.0-amazon-corretto.x86_64/jre","java.version":"1.8.0_362","EMR_STEP_ID":"s-1EF238MZKOWWR","sun.io.unicode.encoding":"UnicodeLittle"},"Classpath Entries":{"/usr/share/aws/aws-java-sdk/aws-java-sdk-macie-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/cats-kernel_2.12-2.0.0-M4.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-mediaconvert-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/jersey-container-servlet-core-2.30.jar":"System Classpath","/usr/lib/spark/jars/jackson-core-asl-1.9.13.jar":"System Classpath","/usr/lib/spark/jars/jackson-databind-2.10.0.jar":"System Classpath","/usr/lib/spark/jars/spark-kvstore_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-elastictranscoder-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-appsync-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/json4s-ast_2.12-3.6.6.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-groundstation-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/scala-xml_2.12-1.2.0.jar":"System Classpath","/usr/lib/spark/conf/":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-efs-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/dnsjava-2.1.7.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/emrfs-hadoop-assembly-2.44.0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-mediaconnect-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-macie2-1.11.880.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/jcl-over-slf4j-1.7.21.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-codestar-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-organizations-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-directory-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/aopalliance-repackaged-2.6.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-health-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-ssooidc-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spark-tags_2.12-3.0.1-amzn-0-tests.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-forecastquery-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-frauddetector-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hadoop-common-3.2.1-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-machinelearning-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/osgi-resource-locator-1.0.3.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-backup-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/libfb303-0.9.3.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-servicequotas-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-cloudsearch-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/guice-4.0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-workspaces-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-honeycode-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-comprehend-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/datanucleus-rdbms-4.1.19.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-budgets-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/jakarta.inject-2.6.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-route53-1.11.880.jar":"System Classpath","/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-iotsitewise-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/machinist_2.12-0.6.8.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-iot-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-ebs-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/commons-daemon-1.0.13.jar":"System Classpath","/usr/lib/spark/jars/jsr305-3.0.0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-dlm-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-ses-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hive-serde-2.3.7-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/HikariCP-2.5.1.jar":"System Classpath","/usr/lib/spark/jars/jdo-api-3.0.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-signer-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spark-network-common_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/lib/spark/jars/netlib-native_system-osx-x86_64-1.1-natives.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-ioteventsdata-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-applicationinsights-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-imagebuilder-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-iam-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/JTransforms-3.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-elasticache-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-ecs-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/lz4-java-1.7.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-quicksight-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/kerb-server-1.0.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-cloudformation-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/commons-math3-3.4.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-snowball-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-docdb-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-resourcegroupstaggingapi-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/jackson-datatype-jsr310-2.10.3.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-dms-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/okhttp-3.12.6.jar":"System Classpath","/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar":"System Classpath","/usr/lib/spark/jars/parquet-encoding-1.10.1-spark-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-sagemaker-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/stax2-api-3.1.4.jar":"System Classpath","/usr/lib/spark/jars/libthrift-0.12.0.jar":"System Classpath","/usr/lib/spark/jars/kerby-config-1.0.1.jar":"System Classpath","/usr/lib/spark/jars/httpclient-4.5.9.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-codeguruprofiler-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/jpam-1.1.jar":"System Classpath","/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-ssoadmin-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/kryo-shaded-4.0.2.jar":"System Classpath","/usr/lib/spark/jars/json-smart-2.3.jar":"System Classpath","/usr/lib/spark/jars/commons-cli-1.2.jar":"System Classpath","/usr/lib/spark/jars/scala-compiler-2.12.10.jar":"System Classpath","/usr/lib/spark/jars/commons-beanutils-1.9.4.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-migrationhubconfig-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-codebuild-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/curator-client-2.13.0.jar":"System Classpath","/usr/lib/spark/jars/slf4j-api-1.7.30.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-stepfunctions-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-autoscaling-1.11.880.jar":"System Classpath","/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-cognitoidp-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/curator-framework-2.13.0.jar":"System Classpath","/usr/share/aws/emr/security/conf":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-guardduty-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/commons-configuration2-2.1.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-autoscalingplans-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/istack-commons-runtime-3.0.8.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-route53resolver-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/parquet-common-1.10.1-spark-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/hk2-locator-2.6.1.jar":"System Classpath","/usr/lib/spark/jars/commons-lang-2.6.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-codecommit-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-cognitosync-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/log4j-1.2.17.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-transfer-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-comprehendmedical-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-connectparticipant-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/curator-recipes-2.13.0.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/javax.inject-1.jar":"System Classpath","/usr/lib/spark/jars/jersey-client-2.30.jar":"System Classpath","/usr/lib/spark/jars/spark-mllib_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/lib/spark/jars/breeze-macros_2.12-1.0.jar":"System Classpath","/usr/lib/spark/jars/arrow-vector-0.15.1.jar":"System Classpath","/usr/lib/spark/jars/json4s-scalap_2.12-3.6.6.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-braket-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-events-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/kerb-core-1.0.1.jar":"System Classpath","/usr/lib/spark/jars/kerby-asn1-1.0.1.jar":"System Classpath","/usr/lib/spark/jars/py4j-0.10.9.jar":"System Classpath","/usr/lib/spark/jars/paranamer-2.8.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-appconfig-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hive-llap-common-2.3.7.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-networkmanager-1.11.880.jar":"System Classpath","/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar":"System Classpath","/usr/lib/spark/jars/jakarta.annotation-api-1.3.5.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/ion-java-1.0.2.jar":"System Classpath","/usr/lib/spark/jars/janino-3.0.16.jar":"System Classpath","/usr/lib/spark/jars/hadoop-auth-3.2.1-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/threeten-extra-1.5.0.jar":"System Classpath","/usr/lib/spark/jars/commons-collections-3.2.2.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/fluent-hc-4.5.6.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-secretsmanager-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/commons-logging-1.1.3.jar":"System Classpath","/usr/lib/spark/jars/jackson-jaxrs-json-provider-2.10.0.jar":"System Classpath","/usr/lib/spark/jars/objenesis-2.5.1.jar":"System Classpath","/usr/lib/spark/jars/chill-java-0.9.5.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-importexport-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-greengrass-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/arrow-format-0.15.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-chime-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-sqs-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/nimbus-jose-jwt-4.41.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-neptune-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/metrics-jvm-4.1.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-schemas-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/orc-core-1.5.10.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-marketplacecommerceanalytics-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hive-exec-2.3.7-amzn-2-core.jar":"System Classpath","/usr/lib/spark/jars/re2j-1.1.jar":"System Classpath","/usr/lib/spark/jars/hive-common-2.3.7-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-pinpointsmsvoice-1.11.880.jar":"System Classpath","/usr/lib/hadoop-lzo/lib/hadoop-lzo-0.4.19.jar":"System Classpath","/usr/lib/spark/jars/commons-codec-1.10.jar":"System Classpath","/usr/lib/spark/jars/compress-lzf-1.0.3.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-pi-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/bcprov-jdk15on-1.60.jar":"System Classpath","/usr/lib/spark/jars/jersey-common-2.30.jar":"System Classpath","/usr/lib/spark/jars/javax.jdo-3.2.0-m3.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-opsworkscm-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-config-1.11.880.jar":"System Classpath","/usr/share/aws/emr/security/lib/*":"System Classpath","/usr/lib/spark/jars/kerb-util-1.0.1.jar":"System Classpath","/usr/lib/spark/jars/netlib-native_ref-linux-i686-1.1-natives.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-costandusagereport-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/netlib-native_system-linux-armhf-1.1-natives.jar":"System Classpath","/usr/lib/spark/jars/jta-1.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-cloudtrail-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/commons-net-3.1.jar":"System Classpath","/usr/lib/spark/jars/netlib-native_system-linux-i686-1.1-natives.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-opsworks-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/slf4j-log4j12-1.7.30.jar":"System Classpath","/usr/lib/spark/jars/jackson-module-scala_2.12-2.10.0.jar":"System Classpath","/usr/lib/spark/jars/json-1.8.jar":"System Classpath","/usr/lib/spark/jars/macro-compat_2.12-1.1.1.jar":"System Classpath","/usr/lib/spark/jars/hadoop-mapreduce-client-common-3.2.1-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/stream-2.9.6.jar":"System Classpath","/usr/lib/spark/jars/jackson-annotations-2.10.0.jar":"System Classpath","/usr/lib/spark/jars/aircompressor-0.10.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-glue-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-kinesisvideosignalingchannels-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spark-launcher_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/lib/spark/jars/metrics-json-4.1.1.jar":"System Classpath","/usr/lib/spark/jars/httpcore-4.4.11.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-forecast-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-resourcegroups-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-appstream-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/parquet-format-2.4.0.jar":"System Classpath","/usr/lib/spark/jars/ehcache-3.3.1.jar":"System Classpath","/usr/lib/spark/jars/native_ref-java-1.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-synthetics-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spark-catalyst_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-cloudwatchmetrics-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-pricing-1.11.880.jar":"System Classpath","/etc/hadoop/conf/":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-sns-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-savingsplans-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-redshiftdataapi-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spark-mllib-local_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-codegurureviewer-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/native_system-java-1.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-logs-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hadoop-mapreduce-client-jobclient-3.2.1-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-alexaforbusiness-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/remotetea-oncrpc-1.1.2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-inspector-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-pinpoint-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-iotevents-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/woodstox-core-5.0.3.jar":"System Classpath","/usr/lib/spark/jars/commons-io-2.4.jar":"System Classpath","/usr/lib/spark/jars/htrace-core4-4.1.0-incubating.jar":"System Classpath","/usr/lib/spark/jars/jline-2.14.6.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-rekognition-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-qldbsession-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-dynamodb-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-wafv2-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-lex-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/accessors-smart-1.2.jar":"System Classpath","/usr/lib/spark/jars/jakarta.activation-api-1.2.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-elasticinference-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hive-shims-2.3.7-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-augmentedairuntime-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spark-network-shuffle_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/lib/spark/jars/commons-lang3-3.9.jar":"System Classpath","/usr/lib/spark/jars/activation-1.1.1.jar":"System Classpath","/usr/lib/spark/jars/hadoop-yarn-server-web-proxy-3.2.1-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/jackson-core-2.10.0.jar":"System Classpath","/usr/lib/spark/jars/netlib-native_ref-win-x86_64-1.1-natives.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-iotanalytics-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/netlib-native_ref-linux-armhf-1.1-natives.jar":"System Classpath","/usr/lib/spark/jars/flatbuffers-java-1.9.0.jar":"System Classpath","/usr/lib/spark/jars/shapeless_2.12-2.3.3.jar":"System Classpath","/usr/lib/spark/jars/jetty-rewrite-9.3.27.v20190418.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-timestreamquery-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-simpleworkflow-1.11.880.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/slf4j-api-1.7.21.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-codegen-maven-plugin-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-sso-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hive-shims-0.23-2.3.7-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-apigatewaymanagementapi-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/metrics-core-4.1.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-iot1clickprojects-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-datasync-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-athena-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/leveldbjni-all-1.8.jar":"System Classpath","/usr/lib/spark/jars/spire-platform_2.12-0.17.0-M1.jar":"System Classpath","/usr/lib/spark/jars/gmetric4j-1.0.10.jar":"System Classpath","/usr/lib/spark/jars/javax.inject-1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-batch-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-iotjobsdataplane-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/jersey-server-2.30.jar":"System Classpath","/usr/lib/spark/jars/kerby-xdr-1.0.1.jar":"System Classpath","/usr/lib/spark/jars/javax.servlet-api-3.1.0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-s3outposts-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-elasticloadbalancingv2-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spark-sql_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/lib/spark/jars/antlr4-runtime-4.7.1.jar":"System Classpath","/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-mediapackagevod-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/arrow-memory-0.15.1.jar":"System Classpath","/usr/lib/spark/jars/gson-2.2.4.jar":"System Classpath","/usr/lib/spark/jars/hadoop-yarn-registry-3.2.1-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-lexmodelbuilding-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-simpledb-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-core-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spire-macros_2.12-0.17.0-M1.jar":"System Classpath","/usr/lib/spark/jars/okio-1.15.0.jar":"System Classpath","/usr/lib/spark/jars/hadoop-annotations-3.2.1-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-devicefarm-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/netlib-native_system-win-x86_64-1.1-natives.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-kinesisanalyticsv2-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-mobile-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-textract-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/transaction-api-1.1.jar":"System Classpath","/usr/lib/spark/jars/kerby-util-1.0.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-elasticbeanstalk-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-servermigration-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/avro-mapred-1.8.2-hadoop2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-mechanicalturkrequester-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/protobuf-java-2.5.0.jar":"System Classpath","/usr/lib/spark/jars/emr-spark-goodies.jar":"System Classpath","/usr/lib/spark/jars/spark-ganglia-lgpl_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/lib/spark/jars/kerb-client-1.0.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-clouddirectory-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/jaxb-api-2.2.11.jar":"System Classpath","/usr/share/aws/aws-java-sdk/jmespath-java-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-kafka-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/netlib-native_ref-linux-x86_64-1.1-natives.jar":"System Classpath","/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-appflow-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/jersey-media-jaxb-2.30.jar":"System Classpath","/usr/lib/spark/jars/spark-graphx_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-personalize-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-cloud9-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spark-unsafe_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/lib/spark/jars/commons-compress-1.8.1.jar":"System Classpath","/usr/lib/spark/jars/kerby-pkix-1.0.1.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/lombok-1.18.4.jar":"System Classpath","/usr/lib/spark/jars/datanucleus-core-4.1.17.jar":"System Classpath","/usr/lib/spark/jars/hive-metastore-2.3.7-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/netty-all-4.1.47.Final.jar":"System Classpath","/usr/lib/spark/jars/spark-tags_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-personalizeevents-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/minlog-1.3.0.jar":"System Classpath","/usr/lib/spark/jars/hadoop-yarn-api-3.2.1-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/jackson-mapper-asl-1.9.13.jar":"System Classpath","/usr/lib/spark/jars/zjsonpatch-0.3.0.jar":"System Classpath","/usr/lib/spark/jars/jackson-dataformat-yaml-2.10.0.jar":"System Classpath","/usr/lib/spark/jars/netlib-native_system-linux-x86_64-1.1-natives.jar":"System Classpath","/usr/lib/spark/jars/orc-shims-1.5.10.jar":"System Classpath","/usr/lib/spark/jars/okhttp-2.7.5.jar":"System Classpath","/usr/lib/spark/jars/jniloader-1.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-acmpca-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-servicecatalog-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hadoop-yarn-common-3.2.1-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/spark-hive_2.12-3.0.1-amzn-0.jar":"System Classpath","/docker/usr/share/aws/emr/emrfs/conf":"System Classpath","/usr/lib/spark/jars/spark-repl_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/lib/spark/jars/orc-mapreduce-1.5.10.jar":"System Classpath","/usr/lib/spark/jars/avro-1.8.2.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/secret-agent-interface-1.3.0.jar":"System Classpath","/usr/lib/spark/jars/oro-2.0.8.jar":"System Classpath","/usr/lib/spark/jars/automaton-1.11-8.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-mq-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spark-sketch_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/lib/spark/jars/token-provider-1.0.1.jar":"System Classpath","/docker/usr/share/aws/emr/security/conf":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-appmesh-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-ec2-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/logging-interceptor-3.12.6.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-fsx-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-iot1clickdevices-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/xz-1.5.jar":"System Classpath","/usr/lib/spark/jars/json4s-core_2.12-3.6.6.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-securityhub-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-iotsecuretunneling-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-codedeploy-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hive-shims-scheduler-2.3.7-amzn-2.jar":"System Classpath","/docker/usr/share/aws/emr/emrfs/auxlib/*":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-glacier-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/commons-pool-1.5.4.jar":"System Classpath","/usr/lib/spark/jars/kerb-common-1.0.1.jar":"System Classpath","/usr/lib/spark/jars/bonecp-0.8.0.RELEASE.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-codestarnotifications-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hk2-utils-2.6.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-eks-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-cloudhsmv2-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hadoop-yarn-server-common-3.2.1-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-ram-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-marketplaceentitlement-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/netlib-native_ref-osx-x86_64-1.1-natives.jar":"System Classpath","/usr/lib/spark/jars/spark-hive-thriftserver_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-marketplacemeteringservice-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-workmail-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/scala-collection-compat_2.12-2.1.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-test-utils-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-s3-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/commons-dbcp-1.4.jar":"System Classpath","/usr/lib/spark/jars/jersey-hk2-2.30.jar":"System Classpath","/usr/lib/spark/jars/metrics-jmx-4.1.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-marketplacecatalog-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-servicediscovery-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hadoop-yarn-client-3.2.1-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-mediatailor-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/derby-10.12.1.1.jar":"System Classpath","/usr/lib/spark/jars/json4s-jackson_2.12-3.6.6.jar":"System Classpath","/usr/lib/spark/jars/scala-library-2.12.10.jar":"System Classpath","/usr/lib/spark/jars/hive-jdbc-2.3.7-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-redshift-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/commons-crypto-1.1.0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-lakeformation-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/metrics-graphite-4.1.1.jar":"System Classpath","/usr/lib/spark/jars/JLargeArrays-1.5.jar":"System Classpath","/usr/lib/spark/jars/zookeeper-3.4.14.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-codestarconnections-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-cloudfront-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/antlr-runtime-3.5.2.jar":"System Classpath","/usr/lib/spark/jars/generex-1.0.2.jar":"System Classpath","/usr/lib/spark/jars/parquet-jackson-1.10.1-spark-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/jackson-module-paranamer-2.10.0.jar":"System Classpath","/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/jmespath-java-1.11.852.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-xray-1.11.880.jar":"System Classpath","/usr/share/aws/emr/emrfs/auxlib/*":"System Classpath","/usr/lib/spark/jars/commons-text-1.6.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/mockito-core-1.10.19.jar":"System Classpath","/docker/usr/lib/hadoop/hadoop-aws.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-elasticsearch-1.11.880.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/objenesis-2.1.jar":"System Classpath","/docker/usr/share/aws/emr/emrfs/lib/*":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-api-gateway-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/parquet-column-1.10.1-spark-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/RoaringBitmap-0.7.45.jar":"System Classpath","/usr/lib/spark/jars/jackson-module-jaxb-annotations-2.10.0.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/bcprov-ext-jdk15on-1.66.jar":"System Classpath","/docker/usr/share/aws/aws-java-sdk/*":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-storagegateway-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-sts-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/avro-ipc-1.8.2.jar":"System Classpath","/usr/lib/spark/jars/jackson-jaxrs-base-2.10.0.jar":"System Classpath","/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-apigatewayv2-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/breeze_2.12-1.0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-directconnect-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spire-util_2.12-0.17.0-M1.jar":"System Classpath","/usr/lib/spark/jars/jcl-over-slf4j-1.7.30.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-globalaccelerator-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/datanucleus-api-jdo-4.2.4.jar":"System Classpath","/usr/lib/spark/jars/snappy-java-1.1.7.5.jar":"System Classpath","/usr/lib/spark/jars/dropwizard-metrics-hadoop-metrics2-reporter-0.1.2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-outposts-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/guice-servlet-4.0.jar":"System Classpath","/usr/lib/spark/jars/spark-yarn_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-support-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/jaxb-runtime-2.3.2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-eventbridge-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/bcpkix-jdk15on-1.60.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-qldb-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-cognitoidentity-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-cloudwatch-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-translate-1.11.880.jar":"System Classpath","/usr/lib/hadoop-lzo/lib/hadoop-lzo.jar":"System Classpath","/usr/lib/spark/jars/joda-time-2.10.5.jar":"System Classpath","/usr/lib/spark/jars/arpack_combined_all-0.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-detective-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/aopalliance-1.0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-kendra-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/jsp-api-2.1.jar":"System Classpath","/usr/lib/spark/jars/javassist-3.25.0-GA.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-identitystore-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-workdocs-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-kinesis-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-timestreamwrite-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-kinesisvideo-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-dataexchange-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-dax-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spire_2.12-0.17.0-M1.jar":"System Classpath","/usr/lib/spark/jars/core-1.1.2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-worklink-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/netlib-native_system-win-i686-1.1-natives.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-robomaker-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/kerb-admin-1.0.1.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/aws-glue-sdk-1.12.0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-lambda-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-iotthingsgraph-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-opensdk-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-models-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/jakarta.validation-api-2.0.2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-sesv2-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hive-cli-2.3.7-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-acm-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-lightsail-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-accessanalyzer-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-datapipeline-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hadoop-mapreduce-client-core-3.2.1-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-mediastoredata-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-ecr-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-codepipeline-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-computeoptimizer-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/scala-reflect-2.12.10.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-personalizeruntime-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/geronimo-jcache_1.0_spec-1.0-alpha-1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-shield-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hive-beeline-2.3.7-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/velocity-1.5.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-fms-1.11.880.jar":"System Classpath","/docker/usr/share/aws/emr/security/lib/*":"System Classpath","/usr/lib/spark/jars/chill_2.12-0.9.5.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-transcribe-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hive-shims-common-2.3.7-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/super-csv-2.2.0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-applicationautoscaling-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/jodd-core-3.5.2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-licensemanager-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-waf-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/univocity-parsers-2.9.0.jar":"System Classpath","/usr/lib/spark/jars/zstd-jni-1.4.4-3.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-pinpointemail-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-mediapackage-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hadoop-hdfs-client-3.2.1-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-gamelift-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/ivy-2.4.0.jar":"System Classpath","/usr/lib/spark/jars/hive-vector-code-gen-2.3.7-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-codeartifact-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/kerb-simplekdc-1.0.1.jar":"System Classpath","/usr/lib/spark/jars/scala-parser-combinators_2.12-1.1.2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-kms-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hk2-api-2.6.1.jar":"System Classpath","/usr/lib/hadoop/hadoop-aws.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-costexplorer-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hive-storage-api-2.7.1.jar":"System Classpath","/usr/lib/spark/jars/snakeyaml-1.24.jar":"System Classpath","/usr/lib/spark/jars/jakarta.ws.rs-api-2.1.6.jar":"System Classpath","/usr/lib/spark/jars/jul-to-slf4j-1.7.30.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-polly-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-connect-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-rds-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/ST4-4.0.4.jar":"System Classpath","/usr/lib/spark/jars/opencsv-2.3.jar":"System Classpath","/usr/lib/spark/jars/stax-api-1.0.1.jar":"System Classpath","/usr/lib/spark/jars/parquet-hadoop-1.10.1-spark-amzn-2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-amplify-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/javolution-5.5.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-managedblockchain-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-emr-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-migrationhub-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-s3control-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/xbean-asm7-shaded-4.15.jar":"System Classpath","/usr/lib/spark/jars/algebra_2.12-2.0.0-M2.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/annotations-16.0.2.jar":"System Classpath","/usr/lib/spark/jars/spark-streaming_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/lib/spark/jars/kerb-crypto-1.0.1.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-ssm-1.11.880.jar":"System Classpath","/usr/share/aws/emr/emrfs/conf/":"System Classpath","/usr/lib/spark/jars/jersey-container-servlet-2.30.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-serverlessapplicationrepository-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/kerb-identity-1.0.1.jar":"System Classpath","/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-sagemakerruntime-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/guava-14.0.1.jar":"System Classpath","/usr/share/aws/emr/emrfs/lib/aopalliance-1.0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-cloudhsm-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/spark-core_2.12-3.0.1-amzn-0.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-ec2instanceconnect-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-code-generator-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/hadoop-client-3.2.1-amzn-2.jar":"System Classpath","/usr/lib/spark/jars/shims-0.7.45.jar":"System Classpath","/usr/lib/spark/jars/commons-compiler-3.0.16.jar":"System Classpath","/usr/lib/spark/jars/jcip-annotations-1.0-1.jar":"System Classpath","/usr/lib/spark/jars/pyrolite-4.30.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-medialive-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-rdsdata-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-elasticloadbalancing-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-ivs-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-discovery-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/jakarta.xml.bind-api-2.3.2.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-mediastore-1.11.880.jar":"System Classpath","/usr/share/aws/aws-java-sdk/aws-java-sdk-workmailmessageflow-1.11.880.jar":"System Classpath","/usr/lib/spark/jars/netlib-native_ref-win-i686-1.1-natives.jar":"System Classpath","/docker/usr/lib/hadoop-lzo/lib/*":"System Classpath"}} -{"Event":"SparkListenerApplicationStart","App Name":"index_data_etl_1GB","App ID":"application_1678162862227_0001","Timestamp":1678162946352,"User":"hadoop"} -{"Event":"SparkListenerExecutorAdded","Timestamp":1678162968878,"Executor ID":"7","Executor Info":{"Host":"ip-172-31-102-249.ec2.internal","Total Cores":4,"Log Urls":{"stdout":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000009/hadoop/stdout?start=-4096","stderr":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000009/hadoop/stderr?start=-4096"},"Attributes":{"NM_HTTP_ADDRESS":"ip-172-31-102-249.ec2.internal:8042","USER":"hadoop","LOG_FILES":"stderr,stdout","NM_HTTP_PORT":"8042","CLUSTER_ID":"","NM_PORT":"8041","HTTP_SCHEME":"http://","NM_HOST":"ip-172-31-102-249.ec2.internal","CONTAINER_ID":"container_1678162862227_0001_01_000009"},"Resources":{}}} -{"Event":"SparkListenerExecutorAdded","Timestamp":1678162968886,"Executor ID":"1","Executor Info":{"Host":"ip-172-31-102-249.ec2.internal","Total Cores":4,"Log Urls":{"stdout":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000002/hadoop/stdout?start=-4096","stderr":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000002/hadoop/stderr?start=-4096"},"Attributes":{"NM_HTTP_ADDRESS":"ip-172-31-102-249.ec2.internal:8042","USER":"hadoop","LOG_FILES":"stderr,stdout","NM_HTTP_PORT":"8042","CLUSTER_ID":"","NM_PORT":"8041","HTTP_SCHEME":"http://","NM_HOST":"ip-172-31-102-249.ec2.internal","CONTAINER_ID":"container_1678162862227_0001_01_000002"},"Resources":{}}} -{"Event":"SparkListenerExecutorAdded","Timestamp":1678162968889,"Executor ID":"8","Executor Info":{"Host":"ip-172-31-102-249.ec2.internal","Total Cores":4,"Log Urls":{"stdout":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000010/hadoop/stdout?start=-4096","stderr":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000010/hadoop/stderr?start=-4096"},"Attributes":{"NM_HTTP_ADDRESS":"ip-172-31-102-249.ec2.internal:8042","USER":"hadoop","LOG_FILES":"stderr,stdout","NM_HTTP_PORT":"8042","CLUSTER_ID":"","NM_PORT":"8041","HTTP_SCHEME":"http://","NM_HOST":"ip-172-31-102-249.ec2.internal","CONTAINER_ID":"container_1678162862227_0001_01_000010"},"Resources":{}}} -{"Event":"SparkListenerExecutorAdded","Timestamp":1678162968899,"Executor ID":"3","Executor Info":{"Host":"ip-172-31-102-249.ec2.internal","Total Cores":4,"Log Urls":{"stdout":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000005/hadoop/stdout?start=-4096","stderr":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000005/hadoop/stderr?start=-4096"},"Attributes":{"NM_HTTP_ADDRESS":"ip-172-31-102-249.ec2.internal:8042","USER":"hadoop","LOG_FILES":"stderr,stdout","NM_HTTP_PORT":"8042","CLUSTER_ID":"","NM_PORT":"8041","HTTP_SCHEME":"http://","NM_HOST":"ip-172-31-102-249.ec2.internal","CONTAINER_ID":"container_1678162862227_0001_01_000005"},"Resources":{}}} -{"Event":"SparkListenerExecutorAdded","Timestamp":1678162968934,"Executor ID":"2","Executor Info":{"Host":"ip-172-31-102-249.ec2.internal","Total Cores":4,"Log Urls":{"stdout":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000003/hadoop/stdout?start=-4096","stderr":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000003/hadoop/stderr?start=-4096"},"Attributes":{"NM_HTTP_ADDRESS":"ip-172-31-102-249.ec2.internal:8042","USER":"hadoop","LOG_FILES":"stderr,stdout","NM_HTTP_PORT":"8042","CLUSTER_ID":"","NM_PORT":"8041","HTTP_SCHEME":"http://","NM_HOST":"ip-172-31-102-249.ec2.internal","CONTAINER_ID":"container_1678162862227_0001_01_000003"},"Resources":{}}} -{"Event":"SparkListenerExecutorAdded","Timestamp":1678162968941,"Executor ID":"5","Executor Info":{"Host":"ip-172-31-102-249.ec2.internal","Total Cores":4,"Log Urls":{"stdout":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000007/hadoop/stdout?start=-4096","stderr":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000007/hadoop/stderr?start=-4096"},"Attributes":{"NM_HTTP_ADDRESS":"ip-172-31-102-249.ec2.internal:8042","USER":"hadoop","LOG_FILES":"stderr,stdout","NM_HTTP_PORT":"8042","CLUSTER_ID":"","NM_PORT":"8041","HTTP_SCHEME":"http://","NM_HOST":"ip-172-31-102-249.ec2.internal","CONTAINER_ID":"container_1678162862227_0001_01_000007"},"Resources":{}}} -{"Event":"SparkListenerExecutorAdded","Timestamp":1678162968952,"Executor ID":"4","Executor Info":{"Host":"ip-172-31-102-249.ec2.internal","Total Cores":4,"Log Urls":{"stdout":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000006/hadoop/stdout?start=-4096","stderr":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000006/hadoop/stderr?start=-4096"},"Attributes":{"NM_HTTP_ADDRESS":"ip-172-31-102-249.ec2.internal:8042","USER":"hadoop","LOG_FILES":"stderr,stdout","NM_HTTP_PORT":"8042","CLUSTER_ID":"","NM_PORT":"8041","HTTP_SCHEME":"http://","NM_HOST":"ip-172-31-102-249.ec2.internal","CONTAINER_ID":"container_1678162862227_0001_01_000006"},"Resources":{}}} -{"Event":"SparkListenerExecutorAdded","Timestamp":1678162969012,"Executor ID":"6","Executor Info":{"Host":"ip-172-31-102-249.ec2.internal","Total Cores":4,"Log Urls":{"stdout":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000008/hadoop/stdout?start=-4096","stderr":"http://ip-172-31-102-249.ec2.internal:8042/node/containerlogs/container_1678162862227_0001_01_000008/hadoop/stderr?start=-4096"},"Attributes":{"NM_HTTP_ADDRESS":"ip-172-31-102-249.ec2.internal:8042","USER":"hadoop","LOG_FILES":"stderr,stdout","NM_HTTP_PORT":"8042","CLUSTER_ID":"","NM_PORT":"8041","HTTP_SCHEME":"http://","NM_HOST":"ip-172-31-102-249.ec2.internal","CONTAINER_ID":"container_1678162862227_0001_01_000008"},"Resources":{}}} -{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Port":42615},"Maximum Memory":2415289958,"Timestamp":1678162969091,"Maximum Onheap Memory":2415289958,"Maximum Offheap Memory":0} -{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Port":41251},"Maximum Memory":2415289958,"Timestamp":1678162969097,"Maximum Onheap Memory":2415289958,"Maximum Offheap Memory":0} -{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Port":46355},"Maximum Memory":2415289958,"Timestamp":1678162969125,"Maximum Onheap Memory":2415289958,"Maximum Offheap Memory":0} -{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Port":36581},"Maximum Memory":2415289958,"Timestamp":1678162969143,"Maximum Onheap Memory":2415289958,"Maximum Offheap Memory":0} -{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Port":41805},"Maximum Memory":2415289958,"Timestamp":1678162969185,"Maximum Onheap Memory":2415289958,"Maximum Offheap Memory":0} -{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Port":40771},"Maximum Memory":2415289958,"Timestamp":1678162969222,"Maximum Onheap Memory":2415289958,"Maximum Offheap Memory":0} -{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Port":37423},"Maximum Memory":2415289958,"Timestamp":1678162969237,"Maximum Onheap Memory":2415289958,"Maximum Offheap Memory":0} -{"Event":"SparkListenerBlockManagerAdded","Block Manager ID":{"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Port":34425},"Maximum Memory":2415289958,"Timestamp":1678162969295,"Maximum Onheap Memory":2415289958,"Maximum Offheap Memory":0} -{"Event":"SparkListenerJobStart","Job ID":0,"Submission Time":1678162971172,"Stage Infos":[{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"parquet at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":1,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"1\",\"name\":\"mapPartitions\"}","Callsite":"parquet at NativeMethodAccessorImpl.java:0","Parent IDs":[0],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":0,"Name":"ParallelCollectionRDD","Scope":"{\"id\":\"0\",\"name\":\"parallelize\"}","Callsite":"parquet at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:755)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]}],"Stage IDs":[0],"Properties":{"spark.rdd.scope":"{\"id\":\"2\",\"name\":\"collect\"}","spark.rdd.scope.noOverride":"true"}} -{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"parquet at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":1,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"1\",\"name\":\"mapPartitions\"}","Callsite":"parquet at NativeMethodAccessorImpl.java:0","Parent IDs":[0],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":0,"Name":"ParallelCollectionRDD","Scope":"{\"id\":\"0\",\"name\":\"parallelize\"}","Callsite":"parquet at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:755)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162971221,"Accumulables":[]},"Properties":{"spark.rdd.scope":"{\"id\":\"2\",\"name\":\"collect\"}","spark.rdd.scope.noOverride":"true"}} -{"Event":"SparkListenerTaskStart","Stage ID":0,"Stage Attempt ID":0,"Task Info":{"Task ID":0,"Index":0,"Attempt":0,"Launch Time":1678162971381,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":0,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":0,"Index":0,"Attempt":0,"Launch Time":1678162971381,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162973008,"Failed":false,"Killed":false,"Accumulables":[{"ID":7,"Name":"internal.metrics.resultSerializationTime","Update":6,"Value":6,"Internal":true,"Count Failed Values":true},{"ID":6,"Name":"internal.metrics.jvmGCTime","Update":105,"Value":105,"Internal":true,"Count Failed Values":true},{"ID":5,"Name":"internal.metrics.resultSize","Update":6448,"Value":6448,"Internal":true,"Count Failed Values":true},{"ID":4,"Name":"internal.metrics.executorCpuTime","Update":123884829,"Value":123884829,"Internal":true,"Count Failed Values":true},{"ID":3,"Name":"internal.metrics.executorRunTime","Update":1010,"Value":1010,"Internal":true,"Count Failed Values":true},{"ID":2,"Name":"internal.metrics.executorDeserializeCpuTime","Update":397259197,"Value":397259197,"Internal":true,"Count Failed Values":true},{"ID":1,"Name":"internal.metrics.executorDeserializeTime","Update":523,"Value":523,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":523,"Executor Deserialize CPU Time":397259197,"Executor Run Time":1010,"Executor CPU Time":123884829,"Peak Execution Memory":0,"Result Size":6448,"JVM GC Time":105,"Result Serialization Time":6,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":0,"Stage Attempt ID":0,"Stage Name":"parquet at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":1,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"1\",\"name\":\"mapPartitions\"}","Callsite":"parquet at NativeMethodAccessorImpl.java:0","Parent IDs":[0],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":0,"Name":"ParallelCollectionRDD","Scope":"{\"id\":\"0\",\"name\":\"parallelize\"}","Callsite":"parquet at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameReader.parquet(DataFrameReader.scala:755)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162971221,"Completion Time":1678162973025,"Accumulables":[{"ID":2,"Name":"internal.metrics.executorDeserializeCpuTime","Value":397259197,"Internal":true,"Count Failed Values":true},{"ID":5,"Name":"internal.metrics.resultSize","Value":6448,"Internal":true,"Count Failed Values":true},{"ID":4,"Name":"internal.metrics.executorCpuTime","Value":123884829,"Internal":true,"Count Failed Values":true},{"ID":7,"Name":"internal.metrics.resultSerializationTime","Value":6,"Internal":true,"Count Failed Values":true},{"ID":1,"Name":"internal.metrics.executorDeserializeTime","Value":523,"Internal":true,"Count Failed Values":true},{"ID":3,"Name":"internal.metrics.executorRunTime","Value":1010,"Internal":true,"Count Failed Values":true},{"ID":6,"Name":"internal.metrics.jvmGCTime","Value":105,"Internal":true,"Count Failed Values":true}]}} -{"Event":"SparkListenerJobEnd","Job ID":0,"Completion Time":1678162973033,"Job Result":{"Result":"JobSucceeded"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":0,"description":"createOrReplaceTempView at NativeMethodAccessorImpl.java:0","details":"org.apache.spark.sql.Dataset.createOrReplaceTempView(Dataset.scala:3312)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","physicalPlanDescription":"== Parsed Logical Plan ==\nCreateViewCommand `alexa_top_1m`, false, true, LocalTempView\n +- Relation[rank#0,site#1] csv\n\n== Analyzed Logical Plan ==\n\nCreateViewCommand `alexa_top_1m`, false, true, LocalTempView\n +- Relation[rank#0,site#1] csv\n\n== Optimized Logical Plan ==\nCreateViewCommand `alexa_top_1m`, false, true, LocalTempView\n +- Relation[rank#0,site#1] csv\n\n== Physical Plan ==\nExecute CreateViewCommand\n +- CreateViewCommand `alexa_top_1m`, false, true, LocalTempView\n +- Relation[rank#0,site#1] csv\n","sparkPlanInfo":{"nodeName":"Execute CreateViewCommand","simpleString":"Execute CreateViewCommand","children":[],"metadata":{},"metrics":[]},"time":1678162975023} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerEffectiveSQLConf","executionId":0,"effectiveSQLConf":{"spark.sql.adaptive.coalescePartitions.initialPartitionNum":"","spark.sql.streaming.disabledV2Writers":"","spark.sql.streaming.noDataMicroBatches.enabled":"true","spark.sql.files.maxPartitionBytes":"128MB","spark.sql.thriftserver.ui.retainedStatements":"200","spark.sql.ui.retainedExecutions":"1000","spark.sql.execution.arrow.pyspark.fallback.enabled":"","spark.sql.sources.parallelPartitionDiscovery.threshold":"32","spark.sql.cbo.joinReorder.enabled":"false","spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.int96AsTimestamp":"true","spark.sql.thriftserver.ui.retainedSessions":"200","spark.sql.hive.verifyPartitionPath":"false","spark.sql.autoBroadcastJoinThreshold":"10MB","spark.sql.orc.mergeSchema":"false","spark.sql.adaptive.enabled":"true","spark.sql.adaptive.skewJoin.enabled":"true","spark.sql.optimizer.sizeBasedJoinReorder.enabled":"true","spark.sql.streaming.ui.retainedQueries":"100","spark.sql.orc.enableVectorizedReader":"true","spark.sql.streaming.continuous.epochBacklogQueueSize":"10000","spark.sql.parquet.binaryAsString":"false","spark.sql.pivotMaxValues":"10000","spark.sql.adaptive.localShuffleReader.enabled":"true","spark.sql.storeAssignmentPolicy":"ANSI","spark.sql.redaction.options.regex":"(?i)url","spark.sql.cbo.joinReorder.dp.star.filter":"false","spark.sql.parser.quotedRegexColumnNames":"false","spark.sql.files.ignoreMissingFiles":"false","spark.sql.streaming.ui.enabled":"true","spark.sql.execution.arrow.sparkr.enabled":"false","spark.sql.orc.filterPushdown":"true","spark.sql.thriftserver.scheduler.pool":"","spark.sql.catalog.spark_catalog":"","spark.sql.datetime.java8API.enabled":"false","spark.sql.execution.pandas.udf.buffer.size":"","spark.sql.function.eltOutputAsString":"false","spark.sql.cbo.joinReorder.dp.threshold":"12","spark.sql.statsImprovements.enabled":"true","spark.sql.parquet.columnarReaderBatchSize":"4096","spark.sql.parquet.outputTimestampType":"INT96","spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes":"256MB","spark.sql.statistics.size.autoUpdate.enabled":"false","spark.sql.hive.filesourcePartitionFileCacheSize":"262144000","spark.sql.streaming.ui.retainedProgressUpdates":"100","spark.sql.inMemoryColumnarStorage.compressed":"true","spark.sql.pyspark.jvmStacktrace.enabled":"false","spark.sql.cbo.starSchemaDetection":"false","spark.sql.columnNameOfCorruptRecord":"_corrupt_record","spark.sql.adaptive.advisoryPartitionSizeInBytes":"","spark.sql.avro.compression.codec":"snappy","spark.sql.streaming.metricsEnabled":"false","spark.sql.ui.pruneCachedInMemoryRelation":"true","spark.sql.event.truncate.length":"2147483647","spark.sql.groupByAliases":"true","spark.sql.hive.metastorePartitionPruning":"true","spark.sql.execution.arrow.enabled":"false","spark.sql.optimizer.dynamicPartitionPruning.enabled":"true","spark.sql.extendedEventInfo":"true","spark.sql.sources.bucketing.maxBuckets":"100000","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.jsonGenerator.ignoreNullFields":"true","spark.sql.optimizer.flattenScalarSubqueriesWithAggregates.enabled":"true","spark.sql.debug.maxToStringFields":"25","spark.sql.files.maxRecordsPerFile":"0","spark.sql.hive.thriftServer.singleSession":"false","spark.sql.parquet.mergeSchema":"false","spark.sql.adaptive.skewJoin.skewedPartitionFactor":"5","spark.sql.queryExecutionListeners":"","spark.sql.streaming.numRecentProgressUpdates":"100","spark.sql.variable.substitute":"true","spark.sql.adaptive.shuffle.mapOutputStats.useDataSize":"true","spark.sql.function.concatBinaryAsString":"false","spark.sql.avro.deflate.level":"-1","spark.sql.repl.eagerEval.maxNumRows":"20","spark.sql.ansi.enabled":"false","spark.sql.legacy.allowHashOnMapType":"false","spark.sql.bloomFilterJoin.maxFilterBytes":"2097152","spark.sql.orderByOrdinal":"true","spark.sql.streaming.stopTimeout":"0","spark.sql.adaptive.coalescePartitions.enabled":"true","spark.sql.session.timeZone":"UTC","spark.sql.parquet.respectSummaryFiles":"false","spark.sql.parquet.enableVectorizedReader":"true","spark.sql.cbo.enabled":"false","spark.sql.sources.bucketing.enabled":"true","spark.sql.csv.filterPushdown.enabled":"true","spark.sql.repl.eagerEval.truncate":"20","spark.sql.cbo.planStats.enabled":"false","spark.sql.optimizer.excludedRules":"","spark.sql.sources.partitionColumnTypeInference.enabled":"true","spark.sql.sortMergeJoinExec.extendedCodegen.enabled":"true","spark.sql.execution.arrow.fallback.enabled":"true","spark.sql.streaming.forceDeleteTempCheckpointLocation":"false","spark.sql.maxPlanStringLength":"2147483632","spark.sql.hive.manageFilesourcePartitions":"true","spark.sql.parquet.int96TimestampConversion":"false","spark.sql.sources.partitionOverwriteMode":"STATIC","spark.sql.streaming.checkpointLocation":"","spark.sql.broadcastTimeout":"300","spark.sql.execution.arrow.pyspark.enabled":"","spark.sql.repl.eagerEval.enabled":"false","spark.sql.mapKeyDedupPolicy":"EXCEPTION","spark.sql.parquet.filterPushdown":"true","spark.sql.maven.additionalRemoteRepositories":"https://maven-central.storage-download.googleapis.com/maven2/","spark.sql.orc.compression.codec":"snappy","spark.sql.statistics.histogram.enabled":"false","spark.sql.streaming.stopActiveRunOnRestart":"true","spark.sql.orc.columnarReaderBatchSize":"4096","spark.sql.redaction.string.regex":"","spark.sql.inMemoryColumnarStorage.batchSize":"10000","spark.sql.parquet.writeLegacyFormat":"false","spark.sql.statistics.fallBackToHdfs":"false","spark.sql.defaultCatalog":"spark_catalog","spark.sql.streaming.fileSource.cleaner.numThreads":"1","spark.sql.legacy.sessionInitWithConfigDefaults":"false","spark.sql.adaptive.coalescePartitions.minPartitionNum":"","spark.sql.execution.arrow.maxRecordsPerBatch":"10000","spark.sql.stringHashComputationWithoutCopyMemory.enabled":"true","spark.sql.streaming.streamingQueryListeners":"","spark.sql.files.ignoreCorruptFiles":"false","spark.sql.inMemoryColumnarStorage.enableVectorizedReader":"true","spark.sql.extensions":"","spark.sql.sources.default":"parquet","spark.sql.ui.extendedInfo":"true","spark.sql.bloomFilterJoin.enabled":"true","spark.sql.parquet.compression.codec":"snappy","spark.sql.optimizer.distinctBeforeIntersect.enabled":"true","spark.sql.parquet.recordLevelFilter.enabled":"false","spark.sql.shuffle.partitions":"200","spark.sql.groupByOrdinal":"true"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerQueryExecutionMetrics","executionId":0,"timePerRule":{"PruneFileSourcePartitions":65031,"ReassignLambdaVariableID":74591,"PushPredicateThroughNonJoin":31582,"Analyzer$HandleNullInputsForUDF":19631,"Analyzer$ResolveSubqueryColumnAliases":5119,"ResolveTimeZone":13477,"Analyzer$ResolveNamespace":5947,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":9689,"RewriteCorrelatedScalarSubquery":59190,"RemoveLiteralFromGroupExpressions":34657,"PushProjectionThroughUnion":59799,"EliminateSubqueryAliases":43229,"ResolveCatalogs":9023,"PushLeftSemiLeftAntiThroughJoin":59095,"FlattenScalarSubqueriesWithAggregates":73889,"LikeSimplification":128631,"CollapseRepartition":71716,"ResolveHints$ResolveCoalesceHints":5149,"Analyzer$ExtractGenerator":56142,"RewriteIntersectAll":33920,"ResolveHints$ResolveJoinStrategyHints":6880,"TypeCoercion$MapZipWithCoercion":15994,"NullPropagation":75242,"PullupCorrelatedPredicates":39261,"UpdateOuterReferences":9590,"ExtractPythonUDFs":90498,"Analyzer$WindowsSubstitution":8868,"CombineUnions":89241,"ExtractGroupingPythonUDFFromAggregate":40453,"ReorderAssociativeOperator":158229,"CleanupDynamicPruningFilters":75059,"ResolveHints$RemoveAllHints":7474,"SimplifyBinaryComparison":67857,"ResolveTableValuedFunctions":8978,"EliminateSerialization":57335,"TypeCoercion$BooleanEquality":13069,"ReplaceIntersectWithSemiJoin":29699,"ConstantPropagation":122845,"CostBasedJoinReorder":21772,"Analyzer$ResolveReferences":79250,"CTESubstitution":389476,"RemoveRedundantAliases":62454,"TypeCoercion$ImplicitTypeCasts":18742,"RewriteExceptAll":36159,"UpdateAttributeNullability":68886,"PropagateEmptyRelation":79072,"SimplifyCasts":126285,"EliminateMapObjects":67859,"CombineLimits":59089,"DetectAmbiguousSelfJoin":34215,"ReplaceExpressions":71910,"ResolveInlineTables":5552,"OptimizeIn":76727,"CollapseWindow":68907,"TypeCoercion$IfCoercion":17121,"ResolveSessionCatalog":14402,"PartitionPruning":58093,"BooleanSimplification":77570,"TypeCoercion$PromoteStrings":16082,"Analyzer$ResolveAliases":6025,"DecimalAggregates":41811,"PruneFilters":85111,"Analyzer$ResolveMissingReferences":5237,"TransposeWindow":75740,"Analyzer$ResolveRelations":12484,"EliminateUnions":26302,"RewritePredicateSubquery":34612,"ObjectSerializerPruning":30512,"LimitPushDown":58347,"SimplifyCaseConversionExpressions":69891,"Analyzer$ResolveNaturalAndUsingJoin":5548,"EliminateView":54668,"CombineTypedFilters":29742,"OptimizeLimitZero":41606,"CheckCartesianProducts":33966,"ExtractPythonUDFFromAggregate":39520,"Analyzer$ExtractWindowExpressions":11498,"ReplaceExceptWithAntiJoin":31764,"ResolveLambdaVariables":11667,"FallBackFileSourceV2":5248,"Analyzer$ResolveTables":8621,"SubstituteUnresolvedOrdinals":6411,"TypeCoercion$CaseWhenCoercion":17399,"DecimalPrecision":25499,"EliminateSorts":36162,"PushDownLeftSemiAntiJoin":59408,"ExtractPythonUDFFromJoinCondition":43077,"TypeCoercion$StackCoercion":16290,"Analyzer$ResolveAggAliasInGroupBy":5493,"TypeCoercion$StringLiteralCoercion":16011,"FoldablePropagation":114280,"V2ScanRelationPushDown":63452,"EliminateDistinct":9832,"InferFiltersFromConstraints":58309,"Analyzer$PullOutNondeterministic":11905,"Analyzer$ResolveFunctions":13399,"ReplaceNullWithFalseInPredicate":65222,"ResolveHigherOrderFunctions":14448,"Analyzer$ResolvePivot":6080,"CollapseProject":108089,"Analyzer$ResolveNewInstance":11369,"ColumnPruning":287750,"Analyzer$ResolveWindowOrder":15958,"TypeCoercion$ConcatCoercion":14857,"PushDownPredicates":176247,"TimeWindowing":11171,"Optimizer$OptimizeSubqueries":199735,"RewriteNonCorrelatedExists":86837,"TypeCoercion$Division":16636,"ComputeCurrentTime":111116,"ResolveCreateNamedStruct":16593,"TypeCoercion$EltCoercion":15296,"ConvertToLocalRelation":68286,"RemoveRepetitionFromGroupExpressions":31985,"ReplaceDistinctWithAggregate":30058,"PreprocessTableCreation":11763,"ResolveSQLOnFile":5162,"Analyzer$ResolveSubquery":5673,"CombineConcats":13144,"Analyzer$ResolveGroupingAnalytics":12499,"Analyzer$ResolveBinaryArithmetic":15679,"RemoveDispensableExpressions":127535,"Analyzer$ResolveAlterTableChanges":8775,"ResolveEncodersInScalaAgg":14326,"TypeCoercion$IntegralDivision":15617,"Analyzer$ResolveWindowFrame":11699,"Analyzer$ResolveDeserializer":50044,"RewriteDistinctAggregates":44577,"RemoveNoopOperators":125186,"Analyzer$ResolveAggregateFunctions":5410,"NormalizeFloatingNumbers":33502,"ReorderJoin":64222,"Analyzer$ResolveUpCast":8679,"Analyzer$ResolveGenerate":6741,"TypeCoercion$WidenSetOperationTypes":6000,"EliminateOuterJoin":67393,"SimplifyExtractValueOps":70804,"OptimizeMetadataOnlyQuery":14153,"EliminateResolvedHint":89547,"Analyzer$ResolveInsertInto":5349,"ReplaceExceptWithFilter":54416,"CleanupAliases":18110,"GetCurrentDatabase":177455,"SchemaPruning":351703,"Analyzer$ResolveOutputRelation":5540,"BloomFilterJoinRule":48922,"Analyzer$ResolveRandomSeed":5887,"TypeCoercion$WindowFrameCoercion":16730,"ConstantFolding":67504,"TypeCoercion$DateTimeOperations":14465,"TypeCoercion$InConversion":19141,"FindDataSourceTable":7874,"SimplifyConditionals":70378,"DataSourceAnalysis":6744,"TypeCoercion$FunctionArgumentConversion":15823,"Analyzer$GlobalAggregates":5394,"Analyzer$LookupFunctions":90831,"CombineFilters":88454,"ReplaceDeduplicateWithAggregate":33529,"PreprocessTableInsertion":5546},"numRunsPerRule":{"PruneFileSourcePartitions":1,"ReassignLambdaVariableID":1,"PushPredicateThroughNonJoin":1,"Analyzer$HandleNullInputsForUDF":1,"Analyzer$ResolveSubqueryColumnAliases":1,"ResolveTimeZone":1,"Analyzer$ResolveNamespace":1,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":1,"RewriteCorrelatedScalarSubquery":2,"RemoveLiteralFromGroupExpressions":1,"PushProjectionThroughUnion":2,"EliminateSubqueryAliases":1,"ResolveCatalogs":1,"PushLeftSemiLeftAntiThroughJoin":2,"FlattenScalarSubqueriesWithAggregates":1,"LikeSimplification":2,"CollapseRepartition":2,"ResolveHints$ResolveCoalesceHints":1,"Analyzer$ExtractGenerator":1,"RewriteIntersectAll":1,"ResolveHints$ResolveJoinStrategyHints":1,"TypeCoercion$MapZipWithCoercion":1,"NullPropagation":2,"PullupCorrelatedPredicates":1,"UpdateOuterReferences":1,"ExtractPythonUDFs":1,"Analyzer$WindowsSubstitution":1,"CombineUnions":3,"ExtractGroupingPythonUDFFromAggregate":1,"ReorderAssociativeOperator":2,"CleanupDynamicPruningFilters":1,"ResolveHints$RemoveAllHints":1,"SimplifyBinaryComparison":2,"ResolveTableValuedFunctions":1,"EliminateSerialization":2,"TypeCoercion$BooleanEquality":1,"ReplaceIntersectWithSemiJoin":1,"ConstantPropagation":2,"CostBasedJoinReorder":1,"Analyzer$ResolveReferences":1,"CTESubstitution":1,"RemoveRedundantAliases":2,"TypeCoercion$ImplicitTypeCasts":1,"RewriteExceptAll":1,"UpdateAttributeNullability":1,"PropagateEmptyRelation":2,"SimplifyCasts":2,"EliminateMapObjects":1,"CombineLimits":2,"DetectAmbiguousSelfJoin":1,"ReplaceExpressions":1,"ResolveInlineTables":1,"OptimizeIn":2,"CollapseWindow":2,"TypeCoercion$IfCoercion":1,"ResolveSessionCatalog":1,"PartitionPruning":1,"BooleanSimplification":2,"TypeCoercion$PromoteStrings":1,"Analyzer$ResolveAliases":1,"DecimalAggregates":1,"PruneFilters":3,"Analyzer$ResolveMissingReferences":1,"TransposeWindow":2,"Analyzer$ResolveRelations":1,"EliminateUnions":1,"RewritePredicateSubquery":1,"ObjectSerializerPruning":1,"LimitPushDown":2,"SimplifyCaseConversionExpressions":2,"Analyzer$ResolveNaturalAndUsingJoin":1,"EliminateView":1,"CombineTypedFilters":1,"OptimizeLimitZero":1,"CheckCartesianProducts":2,"ExtractPythonUDFFromAggregate":1,"Analyzer$ExtractWindowExpressions":1,"ReplaceExceptWithAntiJoin":1,"ResolveLambdaVariables":1,"FallBackFileSourceV2":1,"Analyzer$ResolveTables":1,"SubstituteUnresolvedOrdinals":1,"TypeCoercion$CaseWhenCoercion":1,"DecimalPrecision":1,"EliminateSorts":1,"PushDownLeftSemiAntiJoin":2,"ExtractPythonUDFFromJoinCondition":1,"TypeCoercion$StackCoercion":1,"Analyzer$ResolveAggAliasInGroupBy":1,"TypeCoercion$StringLiteralCoercion":1,"FoldablePropagation":2,"V2ScanRelationPushDown":1,"EliminateDistinct":1,"InferFiltersFromConstraints":1,"Analyzer$PullOutNondeterministic":1,"Analyzer$ResolveFunctions":1,"ReplaceNullWithFalseInPredicate":2,"ResolveHigherOrderFunctions":1,"Analyzer$ResolvePivot":1,"CollapseProject":3,"Analyzer$ResolveNewInstance":1,"ColumnPruning":4,"Analyzer$ResolveWindowOrder":1,"TypeCoercion$ConcatCoercion":1,"PushDownPredicates":4,"TimeWindowing":1,"Optimizer$OptimizeSubqueries":3,"RewriteNonCorrelatedExists":1,"TypeCoercion$Division":1,"ComputeCurrentTime":1,"ResolveCreateNamedStruct":1,"TypeCoercion$EltCoercion":1,"ConvertToLocalRelation":2,"RemoveRepetitionFromGroupExpressions":1,"ReplaceDistinctWithAggregate":1,"PreprocessTableCreation":1,"ResolveSQLOnFile":1,"Analyzer$ResolveSubquery":1,"CombineConcats":2,"Analyzer$ResolveGroupingAnalytics":1,"Analyzer$ResolveBinaryArithmetic":1,"RemoveDispensableExpressions":2,"Analyzer$ResolveAlterTableChanges":1,"ResolveEncodersInScalaAgg":1,"TypeCoercion$IntegralDivision":1,"Analyzer$ResolveWindowFrame":1,"Analyzer$ResolveDeserializer":1,"RewriteDistinctAggregates":1,"RemoveNoopOperators":4,"Analyzer$ResolveAggregateFunctions":1,"NormalizeFloatingNumbers":1,"ReorderJoin":2,"Analyzer$ResolveUpCast":1,"Analyzer$ResolveGenerate":1,"TypeCoercion$WidenSetOperationTypes":1,"EliminateOuterJoin":2,"SimplifyExtractValueOps":2,"OptimizeMetadataOnlyQuery":1,"EliminateResolvedHint":1,"Analyzer$ResolveInsertInto":1,"ReplaceExceptWithFilter":1,"CleanupAliases":1,"GetCurrentDatabase":1,"SchemaPruning":1,"Analyzer$ResolveOutputRelation":1,"BloomFilterJoinRule":1,"Analyzer$ResolveRandomSeed":1,"TypeCoercion$WindowFrameCoercion":1,"ConstantFolding":2,"TypeCoercion$DateTimeOperations":1,"TypeCoercion$InConversion":1,"FindDataSourceTable":1,"SimplifyConditionals":2,"DataSourceAnalysis":1,"TypeCoercion$FunctionArgumentConversion":1,"Analyzer$GlobalAggregates":1,"Analyzer$LookupFunctions":1,"CombineFilters":3,"ReplaceDeduplicateWithAggregate":1,"PreprocessTableInsertion":1},"numEffectiveRunsPerRule":{},"timeEffectiveRunsPerRule":{},"counters":{},"timers":{}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":0,"time":1678162975117} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":1,"description":"createOrReplaceTempView at NativeMethodAccessorImpl.java:0","details":"org.apache.spark.sql.Dataset.createOrReplaceTempView(Dataset.scala:3312)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","physicalPlanDescription":"== Parsed Logical Plan ==\nCreateViewCommand `ccindex`, false, true, LocalTempView\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nCreateViewCommand `ccindex`, false, true, LocalTempView\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nCreateViewCommand `ccindex`, false, true, LocalTempView\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Physical Plan ==\nExecute CreateViewCommand\n +- CreateViewCommand `ccindex`, false, true, LocalTempView\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n","sparkPlanInfo":{"nodeName":"Execute CreateViewCommand","simpleString":"Execute CreateViewCommand","children":[],"metadata":{},"metrics":[]},"time":1678162975192} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerEffectiveSQLConf","executionId":1,"effectiveSQLConf":{"spark.sql.adaptive.coalescePartitions.initialPartitionNum":"","spark.sql.streaming.disabledV2Writers":"","spark.sql.streaming.noDataMicroBatches.enabled":"true","spark.sql.files.maxPartitionBytes":"128MB","spark.sql.thriftserver.ui.retainedStatements":"200","spark.sql.ui.retainedExecutions":"1000","spark.sql.execution.arrow.pyspark.fallback.enabled":"","spark.sql.sources.parallelPartitionDiscovery.threshold":"32","spark.sql.cbo.joinReorder.enabled":"false","spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.int96AsTimestamp":"true","spark.sql.thriftserver.ui.retainedSessions":"200","spark.sql.hive.verifyPartitionPath":"false","spark.sql.autoBroadcastJoinThreshold":"10MB","spark.sql.orc.mergeSchema":"false","spark.sql.adaptive.enabled":"true","spark.sql.adaptive.skewJoin.enabled":"true","spark.sql.optimizer.sizeBasedJoinReorder.enabled":"true","spark.sql.streaming.ui.retainedQueries":"100","spark.sql.orc.enableVectorizedReader":"true","spark.sql.streaming.continuous.epochBacklogQueueSize":"10000","spark.sql.parquet.binaryAsString":"false","spark.sql.pivotMaxValues":"10000","spark.sql.adaptive.localShuffleReader.enabled":"true","spark.sql.storeAssignmentPolicy":"ANSI","spark.sql.redaction.options.regex":"(?i)url","spark.sql.cbo.joinReorder.dp.star.filter":"false","spark.sql.parser.quotedRegexColumnNames":"false","spark.sql.files.ignoreMissingFiles":"false","spark.sql.streaming.ui.enabled":"true","spark.sql.execution.arrow.sparkr.enabled":"false","spark.sql.orc.filterPushdown":"true","spark.sql.thriftserver.scheduler.pool":"","spark.sql.catalog.spark_catalog":"","spark.sql.datetime.java8API.enabled":"false","spark.sql.execution.pandas.udf.buffer.size":"","spark.sql.function.eltOutputAsString":"false","spark.sql.cbo.joinReorder.dp.threshold":"12","spark.sql.statsImprovements.enabled":"true","spark.sql.parquet.columnarReaderBatchSize":"4096","spark.sql.parquet.outputTimestampType":"INT96","spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes":"256MB","spark.sql.statistics.size.autoUpdate.enabled":"false","spark.sql.hive.filesourcePartitionFileCacheSize":"262144000","spark.sql.streaming.ui.retainedProgressUpdates":"100","spark.sql.inMemoryColumnarStorage.compressed":"true","spark.sql.pyspark.jvmStacktrace.enabled":"false","spark.sql.cbo.starSchemaDetection":"false","spark.sql.columnNameOfCorruptRecord":"_corrupt_record","spark.sql.adaptive.advisoryPartitionSizeInBytes":"","spark.sql.avro.compression.codec":"snappy","spark.sql.streaming.metricsEnabled":"false","spark.sql.ui.pruneCachedInMemoryRelation":"true","spark.sql.event.truncate.length":"2147483647","spark.sql.groupByAliases":"true","spark.sql.hive.metastorePartitionPruning":"true","spark.sql.execution.arrow.enabled":"false","spark.sql.optimizer.dynamicPartitionPruning.enabled":"true","spark.sql.extendedEventInfo":"true","spark.sql.sources.bucketing.maxBuckets":"100000","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.jsonGenerator.ignoreNullFields":"true","spark.sql.optimizer.flattenScalarSubqueriesWithAggregates.enabled":"true","spark.sql.debug.maxToStringFields":"25","spark.sql.files.maxRecordsPerFile":"0","spark.sql.hive.thriftServer.singleSession":"false","spark.sql.parquet.mergeSchema":"false","spark.sql.adaptive.skewJoin.skewedPartitionFactor":"5","spark.sql.queryExecutionListeners":"","spark.sql.streaming.numRecentProgressUpdates":"100","spark.sql.variable.substitute":"true","spark.sql.adaptive.shuffle.mapOutputStats.useDataSize":"true","spark.sql.function.concatBinaryAsString":"false","spark.sql.avro.deflate.level":"-1","spark.sql.repl.eagerEval.maxNumRows":"20","spark.sql.ansi.enabled":"false","spark.sql.legacy.allowHashOnMapType":"false","spark.sql.bloomFilterJoin.maxFilterBytes":"2097152","spark.sql.orderByOrdinal":"true","spark.sql.streaming.stopTimeout":"0","spark.sql.adaptive.coalescePartitions.enabled":"true","spark.sql.session.timeZone":"UTC","spark.sql.parquet.respectSummaryFiles":"false","spark.sql.parquet.enableVectorizedReader":"true","spark.sql.cbo.enabled":"false","spark.sql.sources.bucketing.enabled":"true","spark.sql.csv.filterPushdown.enabled":"true","spark.sql.repl.eagerEval.truncate":"20","spark.sql.cbo.planStats.enabled":"false","spark.sql.optimizer.excludedRules":"","spark.sql.sources.partitionColumnTypeInference.enabled":"true","spark.sql.sortMergeJoinExec.extendedCodegen.enabled":"true","spark.sql.execution.arrow.fallback.enabled":"true","spark.sql.streaming.forceDeleteTempCheckpointLocation":"false","spark.sql.maxPlanStringLength":"2147483632","spark.sql.hive.manageFilesourcePartitions":"true","spark.sql.parquet.int96TimestampConversion":"false","spark.sql.sources.partitionOverwriteMode":"STATIC","spark.sql.streaming.checkpointLocation":"","spark.sql.broadcastTimeout":"300","spark.sql.execution.arrow.pyspark.enabled":"","spark.sql.repl.eagerEval.enabled":"false","spark.sql.mapKeyDedupPolicy":"EXCEPTION","spark.sql.parquet.filterPushdown":"true","spark.sql.maven.additionalRemoteRepositories":"https://maven-central.storage-download.googleapis.com/maven2/","spark.sql.orc.compression.codec":"snappy","spark.sql.statistics.histogram.enabled":"false","spark.sql.streaming.stopActiveRunOnRestart":"true","spark.sql.orc.columnarReaderBatchSize":"4096","spark.sql.redaction.string.regex":"","spark.sql.inMemoryColumnarStorage.batchSize":"10000","spark.sql.parquet.writeLegacyFormat":"false","spark.sql.statistics.fallBackToHdfs":"false","spark.sql.defaultCatalog":"spark_catalog","spark.sql.streaming.fileSource.cleaner.numThreads":"1","spark.sql.legacy.sessionInitWithConfigDefaults":"false","spark.sql.adaptive.coalescePartitions.minPartitionNum":"","spark.sql.execution.arrow.maxRecordsPerBatch":"10000","spark.sql.stringHashComputationWithoutCopyMemory.enabled":"true","spark.sql.streaming.streamingQueryListeners":"","spark.sql.files.ignoreCorruptFiles":"false","spark.sql.inMemoryColumnarStorage.enableVectorizedReader":"true","spark.sql.extensions":"","spark.sql.sources.default":"parquet","spark.sql.ui.extendedInfo":"true","spark.sql.bloomFilterJoin.enabled":"true","spark.sql.parquet.compression.codec":"snappy","spark.sql.optimizer.distinctBeforeIntersect.enabled":"true","spark.sql.parquet.recordLevelFilter.enabled":"false","spark.sql.shuffle.partitions":"200","spark.sql.groupByOrdinal":"true"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerQueryExecutionMetrics","executionId":1,"timePerRule":{"PruneFileSourcePartitions":71400,"ReassignLambdaVariableID":74086,"PushPredicateThroughNonJoin":31467,"Analyzer$HandleNullInputsForUDF":22196,"Analyzer$ResolveSubqueryColumnAliases":5965,"ResolveTimeZone":12842,"Analyzer$ResolveNamespace":8155,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":11584,"RewriteCorrelatedScalarSubquery":56173,"RemoveLiteralFromGroupExpressions":29963,"PushProjectionThroughUnion":57958,"EliminateSubqueryAliases":45359,"ResolveCatalogs":9207,"PushLeftSemiLeftAntiThroughJoin":61969,"FlattenScalarSubqueriesWithAggregates":48938,"LikeSimplification":124594,"CollapseRepartition":71007,"ResolveHints$ResolveCoalesceHints":5501,"Analyzer$ExtractGenerator":40418,"RewriteIntersectAll":26828,"ResolveHints$ResolveJoinStrategyHints":7353,"TypeCoercion$MapZipWithCoercion":14020,"NullPropagation":73646,"PullupCorrelatedPredicates":39717,"UpdateOuterReferences":11232,"ExtractPythonUDFs":75995,"Analyzer$WindowsSubstitution":9154,"CombineUnions":88239,"ExtractGroupingPythonUDFFromAggregate":37191,"ReorderAssociativeOperator":139635,"CleanupDynamicPruningFilters":72665,"ResolveHints$RemoveAllHints":14804,"SimplifyBinaryComparison":71547,"ResolveTableValuedFunctions":7542,"EliminateSerialization":54822,"TypeCoercion$BooleanEquality":10925,"ReplaceIntersectWithSemiJoin":29423,"ConstantPropagation":58549,"CostBasedJoinReorder":18171,"Analyzer$ResolveReferences":66766,"CTESubstitution":232204,"RemoveRedundantAliases":58406,"TypeCoercion$ImplicitTypeCasts":14614,"RewriteExceptAll":32468,"UpdateAttributeNullability":88326,"PropagateEmptyRelation":78263,"SimplifyCasts":141663,"EliminateMapObjects":71028,"CombineLimits":60820,"DetectAmbiguousSelfJoin":22750,"ReplaceExpressions":70683,"ResolveInlineTables":5349,"OptimizeIn":81946,"CollapseWindow":70449,"TypeCoercion$IfCoercion":12885,"ResolveSessionCatalog":10621,"PartitionPruning":69955,"BooleanSimplification":76390,"TypeCoercion$PromoteStrings":13763,"Analyzer$ResolveAliases":5773,"DecimalAggregates":37589,"PruneFilters":87158,"Analyzer$ResolveMissingReferences":5411,"TransposeWindow":76661,"Analyzer$ResolveRelations":12350,"EliminateUnions":12390,"RewritePredicateSubquery":32092,"ObjectSerializerPruning":29148,"LimitPushDown":58792,"SimplifyCaseConversionExpressions":66246,"Analyzer$ResolveNaturalAndUsingJoin":5896,"EliminateView":38153,"CombineTypedFilters":31469,"OptimizeLimitZero":42393,"CheckCartesianProducts":32436,"ExtractPythonUDFFromAggregate":36890,"Analyzer$ExtractWindowExpressions":12035,"ReplaceExceptWithAntiJoin":30105,"ResolveLambdaVariables":9909,"FallBackFileSourceV2":3658,"Analyzer$ResolveTables":8810,"SubstituteUnresolvedOrdinals":6224,"TypeCoercion$CaseWhenCoercion":13331,"DecimalPrecision":23639,"EliminateSorts":38901,"PushDownLeftSemiAntiJoin":65334,"ExtractPythonUDFFromJoinCondition":37561,"TypeCoercion$StackCoercion":12301,"Analyzer$ResolveAggAliasInGroupBy":5399,"TypeCoercion$StringLiteralCoercion":7782,"FoldablePropagation":109453,"V2ScanRelationPushDown":52250,"EliminateDistinct":10525,"InferFiltersFromConstraints":50509,"Analyzer$PullOutNondeterministic":10982,"Analyzer$ResolveFunctions":11245,"ReplaceNullWithFalseInPredicate":69271,"ResolveHigherOrderFunctions":13090,"Analyzer$ResolvePivot":5998,"CollapseProject":104742,"Analyzer$ResolveNewInstance":10224,"ColumnPruning":272433,"Analyzer$ResolveWindowOrder":15719,"TypeCoercion$ConcatCoercion":13842,"PushDownPredicates":210176,"TimeWindowing":12054,"Optimizer$OptimizeSubqueries":205284,"RewriteNonCorrelatedExists":67806,"TypeCoercion$Division":88255,"ComputeCurrentTime":116027,"ResolveCreateNamedStruct":12962,"TypeCoercion$EltCoercion":12812,"ConvertToLocalRelation":60269,"RemoveRepetitionFromGroupExpressions":28848,"ReplaceDistinctWithAggregate":28582,"PreprocessTableCreation":6172,"ResolveSQLOnFile":3849,"Analyzer$ResolveSubquery":5838,"CombineConcats":13589,"Analyzer$ResolveGroupingAnalytics":14805,"Analyzer$ResolveBinaryArithmetic":10992,"RemoveDispensableExpressions":144866,"Analyzer$ResolveAlterTableChanges":7892,"ResolveEncodersInScalaAgg":8512,"TypeCoercion$IntegralDivision":17610,"Analyzer$ResolveWindowFrame":10534,"Analyzer$ResolveDeserializer":46628,"RewriteDistinctAggregates":45360,"RemoveNoopOperators":114961,"Analyzer$ResolveAggregateFunctions":5722,"NormalizeFloatingNumbers":33213,"ReorderJoin":63610,"Analyzer$ResolveUpCast":8480,"Analyzer$ResolveGenerate":6100,"TypeCoercion$WidenSetOperationTypes":6144,"EliminateOuterJoin":56424,"SimplifyExtractValueOps":63752,"OptimizeMetadataOnlyQuery":17254,"EliminateResolvedHint":3505095,"Analyzer$ResolveInsertInto":5618,"ReplaceExceptWithFilter":53754,"CleanupAliases":16136,"GetCurrentDatabase":173342,"SchemaPruning":89692,"Analyzer$ResolveOutputRelation":5294,"BloomFilterJoinRule":50934,"Analyzer$ResolveRandomSeed":5902,"TypeCoercion$WindowFrameCoercion":11960,"ConstantFolding":67181,"TypeCoercion$DateTimeOperations":10934,"TypeCoercion$InConversion":18371,"FindDataSourceTable":4928,"SimplifyConditionals":72778,"DataSourceAnalysis":4525,"TypeCoercion$FunctionArgumentConversion":15045,"Analyzer$GlobalAggregates":5439,"Analyzer$LookupFunctions":48926,"CombineFilters":86396,"ReplaceDeduplicateWithAggregate":31154,"PreprocessTableInsertion":3962},"numRunsPerRule":{"PruneFileSourcePartitions":1,"ReassignLambdaVariableID":1,"PushPredicateThroughNonJoin":1,"Analyzer$HandleNullInputsForUDF":1,"Analyzer$ResolveSubqueryColumnAliases":1,"ResolveTimeZone":1,"Analyzer$ResolveNamespace":1,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":1,"RewriteCorrelatedScalarSubquery":2,"RemoveLiteralFromGroupExpressions":1,"PushProjectionThroughUnion":2,"EliminateSubqueryAliases":1,"ResolveCatalogs":1,"PushLeftSemiLeftAntiThroughJoin":2,"FlattenScalarSubqueriesWithAggregates":1,"LikeSimplification":2,"CollapseRepartition":2,"ResolveHints$ResolveCoalesceHints":1,"Analyzer$ExtractGenerator":1,"RewriteIntersectAll":1,"ResolveHints$ResolveJoinStrategyHints":1,"TypeCoercion$MapZipWithCoercion":1,"NullPropagation":2,"PullupCorrelatedPredicates":1,"UpdateOuterReferences":1,"ExtractPythonUDFs":1,"Analyzer$WindowsSubstitution":1,"CombineUnions":3,"ExtractGroupingPythonUDFFromAggregate":1,"ReorderAssociativeOperator":2,"CleanupDynamicPruningFilters":1,"ResolveHints$RemoveAllHints":1,"SimplifyBinaryComparison":2,"ResolveTableValuedFunctions":1,"EliminateSerialization":2,"TypeCoercion$BooleanEquality":1,"ReplaceIntersectWithSemiJoin":1,"ConstantPropagation":2,"CostBasedJoinReorder":1,"Analyzer$ResolveReferences":1,"CTESubstitution":1,"RemoveRedundantAliases":2,"TypeCoercion$ImplicitTypeCasts":1,"RewriteExceptAll":1,"UpdateAttributeNullability":1,"PropagateEmptyRelation":2,"SimplifyCasts":2,"EliminateMapObjects":1,"CombineLimits":2,"DetectAmbiguousSelfJoin":1,"ReplaceExpressions":1,"ResolveInlineTables":1,"OptimizeIn":2,"CollapseWindow":2,"TypeCoercion$IfCoercion":1,"ResolveSessionCatalog":1,"PartitionPruning":1,"BooleanSimplification":2,"TypeCoercion$PromoteStrings":1,"Analyzer$ResolveAliases":1,"DecimalAggregates":1,"PruneFilters":3,"Analyzer$ResolveMissingReferences":1,"TransposeWindow":2,"Analyzer$ResolveRelations":1,"EliminateUnions":1,"RewritePredicateSubquery":1,"ObjectSerializerPruning":1,"LimitPushDown":2,"SimplifyCaseConversionExpressions":2,"Analyzer$ResolveNaturalAndUsingJoin":1,"EliminateView":1,"CombineTypedFilters":1,"OptimizeLimitZero":1,"CheckCartesianProducts":2,"ExtractPythonUDFFromAggregate":1,"Analyzer$ExtractWindowExpressions":1,"ReplaceExceptWithAntiJoin":1,"ResolveLambdaVariables":1,"FallBackFileSourceV2":1,"Analyzer$ResolveTables":1,"SubstituteUnresolvedOrdinals":1,"TypeCoercion$CaseWhenCoercion":1,"DecimalPrecision":1,"EliminateSorts":1,"PushDownLeftSemiAntiJoin":2,"ExtractPythonUDFFromJoinCondition":1,"TypeCoercion$StackCoercion":1,"Analyzer$ResolveAggAliasInGroupBy":1,"TypeCoercion$StringLiteralCoercion":1,"FoldablePropagation":2,"V2ScanRelationPushDown":1,"EliminateDistinct":1,"InferFiltersFromConstraints":1,"Analyzer$PullOutNondeterministic":1,"Analyzer$ResolveFunctions":1,"ReplaceNullWithFalseInPredicate":2,"ResolveHigherOrderFunctions":1,"Analyzer$ResolvePivot":1,"CollapseProject":3,"Analyzer$ResolveNewInstance":1,"ColumnPruning":4,"Analyzer$ResolveWindowOrder":1,"TypeCoercion$ConcatCoercion":1,"PushDownPredicates":4,"TimeWindowing":1,"Optimizer$OptimizeSubqueries":3,"RewriteNonCorrelatedExists":1,"TypeCoercion$Division":1,"ComputeCurrentTime":1,"ResolveCreateNamedStruct":1,"TypeCoercion$EltCoercion":1,"ConvertToLocalRelation":2,"RemoveRepetitionFromGroupExpressions":1,"ReplaceDistinctWithAggregate":1,"PreprocessTableCreation":1,"ResolveSQLOnFile":1,"Analyzer$ResolveSubquery":1,"CombineConcats":2,"Analyzer$ResolveGroupingAnalytics":1,"Analyzer$ResolveBinaryArithmetic":1,"RemoveDispensableExpressions":2,"Analyzer$ResolveAlterTableChanges":1,"ResolveEncodersInScalaAgg":1,"TypeCoercion$IntegralDivision":1,"Analyzer$ResolveWindowFrame":1,"Analyzer$ResolveDeserializer":1,"RewriteDistinctAggregates":1,"RemoveNoopOperators":4,"Analyzer$ResolveAggregateFunctions":1,"NormalizeFloatingNumbers":1,"ReorderJoin":2,"Analyzer$ResolveUpCast":1,"Analyzer$ResolveGenerate":1,"TypeCoercion$WidenSetOperationTypes":1,"EliminateOuterJoin":2,"SimplifyExtractValueOps":2,"OptimizeMetadataOnlyQuery":1,"EliminateResolvedHint":1,"Analyzer$ResolveInsertInto":1,"ReplaceExceptWithFilter":1,"CleanupAliases":1,"GetCurrentDatabase":1,"SchemaPruning":1,"Analyzer$ResolveOutputRelation":1,"BloomFilterJoinRule":1,"Analyzer$ResolveRandomSeed":1,"TypeCoercion$WindowFrameCoercion":1,"ConstantFolding":2,"TypeCoercion$DateTimeOperations":1,"TypeCoercion$InConversion":1,"FindDataSourceTable":1,"SimplifyConditionals":2,"DataSourceAnalysis":1,"TypeCoercion$FunctionArgumentConversion":1,"Analyzer$GlobalAggregates":1,"Analyzer$LookupFunctions":1,"CombineFilters":3,"ReplaceDeduplicateWithAggregate":1,"PreprocessTableInsertion":1},"numEffectiveRunsPerRule":{},"timeEffectiveRunsPerRule":{},"counters":{},"timers":{}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":1,"time":1678162975198} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":2,"description":"createOrReplaceTempView at NativeMethodAccessorImpl.java:0","details":"org.apache.spark.sql.Dataset.createOrReplaceTempView(Dataset.scala:3312)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","physicalPlanDescription":"== Parsed Logical Plan ==\nCreateViewCommand `site_summary`, false, true, LocalTempView\n +- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Project [Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L]\n : +- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Repartition 50000, false\n : +- Filter (content_languages#27 = eng)\n : +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n +- Aggregate [Site#485], [Site#485, avg(cast(levenshtein_distance#215 as bigint)) AS Levenshtein_Distance#411]\n +- Filter (((subset#33 = warc) AND (levenshtein_distance#215 <= 2)) AND isnotnull(Site#485))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nCreateViewCommand `site_summary`, false, true, LocalTempView\n +- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Project [Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L]\n : +- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Repartition 50000, false\n : +- Filter (content_languages#27 = eng)\n : +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n +- Aggregate [Site#485], [Site#485, avg(cast(levenshtein_distance#215 as bigint)) AS Levenshtein_Distance#411]\n +- Filter (((subset#33 = warc) AND (levenshtein_distance#215 <= 2)) AND isnotnull(Site#485))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nCreateViewCommand `site_summary`, false, true, LocalTempView\n +- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Project [Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L]\n : +- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Repartition 50000, false\n : +- Filter (content_languages#27 = eng)\n : +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n +- Aggregate [Site#485], [Site#485, avg(cast(levenshtein_distance#215 as bigint)) AS Levenshtein_Distance#411]\n +- Filter (((subset#33 = warc) AND (levenshtein_distance#215 <= 2)) AND isnotnull(Site#485))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Physical Plan ==\nExecute CreateViewCommand\n +- CreateViewCommand `site_summary`, false, true, LocalTempView\n +- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Project [Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L]\n : +- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Repartition 50000, false\n : +- Filter (content_languages#27 = eng)\n : +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n +- Aggregate [Site#485], [Site#485, avg(cast(levenshtein_distance#215 as bigint)) AS Levenshtein_Distance#411]\n +- Filter (((subset#33 = warc) AND (levenshtein_distance#215 <= 2)) AND isnotnull(Site#485))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n","sparkPlanInfo":{"nodeName":"Execute CreateViewCommand","simpleString":"Execute CreateViewCommand","children":[],"metadata":{},"metrics":[]},"time":1678162976364} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerEffectiveSQLConf","executionId":2,"effectiveSQLConf":{"spark.sql.adaptive.coalescePartitions.initialPartitionNum":"","spark.sql.streaming.disabledV2Writers":"","spark.sql.streaming.noDataMicroBatches.enabled":"true","spark.sql.files.maxPartitionBytes":"128MB","spark.sql.thriftserver.ui.retainedStatements":"200","spark.sql.ui.retainedExecutions":"1000","spark.sql.execution.arrow.pyspark.fallback.enabled":"","spark.sql.sources.parallelPartitionDiscovery.threshold":"32","spark.sql.cbo.joinReorder.enabled":"false","spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.int96AsTimestamp":"true","spark.sql.thriftserver.ui.retainedSessions":"200","spark.sql.hive.verifyPartitionPath":"false","spark.sql.autoBroadcastJoinThreshold":"10MB","spark.sql.orc.mergeSchema":"false","spark.sql.adaptive.enabled":"true","spark.sql.adaptive.skewJoin.enabled":"true","spark.sql.optimizer.sizeBasedJoinReorder.enabled":"true","spark.sql.streaming.ui.retainedQueries":"100","spark.sql.orc.enableVectorizedReader":"true","spark.sql.streaming.continuous.epochBacklogQueueSize":"10000","spark.sql.parquet.binaryAsString":"false","spark.sql.pivotMaxValues":"10000","spark.sql.adaptive.localShuffleReader.enabled":"true","spark.sql.storeAssignmentPolicy":"ANSI","spark.sql.redaction.options.regex":"(?i)url","spark.sql.cbo.joinReorder.dp.star.filter":"false","spark.sql.parser.quotedRegexColumnNames":"false","spark.sql.files.ignoreMissingFiles":"false","spark.sql.streaming.ui.enabled":"true","spark.sql.execution.arrow.sparkr.enabled":"false","spark.sql.orc.filterPushdown":"true","spark.sql.thriftserver.scheduler.pool":"","spark.sql.catalog.spark_catalog":"","spark.sql.datetime.java8API.enabled":"false","spark.sql.execution.pandas.udf.buffer.size":"","spark.sql.function.eltOutputAsString":"false","spark.sql.cbo.joinReorder.dp.threshold":"12","spark.sql.statsImprovements.enabled":"true","spark.sql.parquet.columnarReaderBatchSize":"4096","spark.sql.parquet.outputTimestampType":"INT96","spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes":"256MB","spark.sql.statistics.size.autoUpdate.enabled":"false","spark.sql.hive.filesourcePartitionFileCacheSize":"262144000","spark.sql.streaming.ui.retainedProgressUpdates":"100","spark.sql.inMemoryColumnarStorage.compressed":"true","spark.sql.pyspark.jvmStacktrace.enabled":"false","spark.sql.cbo.starSchemaDetection":"false","spark.sql.columnNameOfCorruptRecord":"_corrupt_record","spark.sql.adaptive.advisoryPartitionSizeInBytes":"","spark.sql.avro.compression.codec":"snappy","spark.sql.streaming.metricsEnabled":"false","spark.sql.ui.pruneCachedInMemoryRelation":"true","spark.sql.event.truncate.length":"2147483647","spark.sql.groupByAliases":"true","spark.sql.hive.metastorePartitionPruning":"true","spark.sql.execution.arrow.enabled":"false","spark.sql.optimizer.dynamicPartitionPruning.enabled":"true","spark.sql.extendedEventInfo":"true","spark.sql.sources.bucketing.maxBuckets":"100000","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.jsonGenerator.ignoreNullFields":"true","spark.sql.optimizer.flattenScalarSubqueriesWithAggregates.enabled":"true","spark.sql.debug.maxToStringFields":"25","spark.sql.files.maxRecordsPerFile":"0","spark.sql.hive.thriftServer.singleSession":"false","spark.sql.parquet.mergeSchema":"false","spark.sql.adaptive.skewJoin.skewedPartitionFactor":"5","spark.sql.queryExecutionListeners":"","spark.sql.streaming.numRecentProgressUpdates":"100","spark.sql.variable.substitute":"true","spark.sql.adaptive.shuffle.mapOutputStats.useDataSize":"true","spark.sql.function.concatBinaryAsString":"false","spark.sql.avro.deflate.level":"-1","spark.sql.repl.eagerEval.maxNumRows":"20","spark.sql.ansi.enabled":"false","spark.sql.legacy.allowHashOnMapType":"false","spark.sql.bloomFilterJoin.maxFilterBytes":"2097152","spark.sql.orderByOrdinal":"true","spark.sql.streaming.stopTimeout":"0","spark.sql.adaptive.coalescePartitions.enabled":"true","spark.sql.session.timeZone":"UTC","spark.sql.parquet.respectSummaryFiles":"false","spark.sql.parquet.enableVectorizedReader":"true","spark.sql.cbo.enabled":"false","spark.sql.sources.bucketing.enabled":"true","spark.sql.csv.filterPushdown.enabled":"true","spark.sql.repl.eagerEval.truncate":"20","spark.sql.cbo.planStats.enabled":"false","spark.sql.optimizer.excludedRules":"","spark.sql.sources.partitionColumnTypeInference.enabled":"true","spark.sql.sortMergeJoinExec.extendedCodegen.enabled":"true","spark.sql.execution.arrow.fallback.enabled":"true","spark.sql.streaming.forceDeleteTempCheckpointLocation":"false","spark.sql.maxPlanStringLength":"2147483632","spark.sql.hive.manageFilesourcePartitions":"true","spark.sql.parquet.int96TimestampConversion":"false","spark.sql.sources.partitionOverwriteMode":"STATIC","spark.sql.streaming.checkpointLocation":"","spark.sql.broadcastTimeout":"300","spark.sql.execution.arrow.pyspark.enabled":"","spark.sql.repl.eagerEval.enabled":"false","spark.sql.mapKeyDedupPolicy":"EXCEPTION","spark.sql.parquet.filterPushdown":"true","spark.sql.maven.additionalRemoteRepositories":"https://maven-central.storage-download.googleapis.com/maven2/","spark.sql.orc.compression.codec":"snappy","spark.sql.statistics.histogram.enabled":"false","spark.sql.streaming.stopActiveRunOnRestart":"true","spark.sql.orc.columnarReaderBatchSize":"4096","spark.sql.redaction.string.regex":"","spark.sql.inMemoryColumnarStorage.batchSize":"10000","spark.sql.parquet.writeLegacyFormat":"false","spark.sql.statistics.fallBackToHdfs":"false","spark.sql.defaultCatalog":"spark_catalog","spark.sql.streaming.fileSource.cleaner.numThreads":"1","spark.sql.legacy.sessionInitWithConfigDefaults":"false","spark.sql.adaptive.coalescePartitions.minPartitionNum":"","spark.sql.execution.arrow.maxRecordsPerBatch":"10000","spark.sql.stringHashComputationWithoutCopyMemory.enabled":"true","spark.sql.streaming.streamingQueryListeners":"","spark.sql.files.ignoreCorruptFiles":"false","spark.sql.inMemoryColumnarStorage.enableVectorizedReader":"true","spark.sql.extensions":"","spark.sql.sources.default":"parquet","spark.sql.ui.extendedInfo":"true","spark.sql.bloomFilterJoin.enabled":"true","spark.sql.parquet.compression.codec":"snappy","spark.sql.optimizer.distinctBeforeIntersect.enabled":"true","spark.sql.parquet.recordLevelFilter.enabled":"false","spark.sql.shuffle.partitions":"200","spark.sql.groupByOrdinal":"true"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerQueryExecutionMetrics","executionId":2,"timePerRule":{"PruneFileSourcePartitions":66381,"ReassignLambdaVariableID":70148,"PushPredicateThroughNonJoin":27184,"Analyzer$HandleNullInputsForUDF":10733,"Analyzer$ResolveSubqueryColumnAliases":4403,"ResolveTimeZone":5218,"Analyzer$ResolveNamespace":5187,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":8587,"RewriteCorrelatedScalarSubquery":60794,"RemoveLiteralFromGroupExpressions":31869,"PushProjectionThroughUnion":67645,"EliminateSubqueryAliases":42747,"ResolveCatalogs":7759,"PushLeftSemiLeftAntiThroughJoin":58897,"FlattenScalarSubqueriesWithAggregates":46105,"LikeSimplification":132448,"CollapseRepartition":74103,"ResolveHints$ResolveCoalesceHints":5328,"Analyzer$ExtractGenerator":20339,"RewriteIntersectAll":30932,"ResolveHints$ResolveJoinStrategyHints":6268,"TypeCoercion$MapZipWithCoercion":5879,"NullPropagation":72248,"PullupCorrelatedPredicates":39287,"UpdateOuterReferences":6580,"ExtractPythonUDFs":66273,"Analyzer$WindowsSubstitution":6094,"CombineUnions":91820,"ExtractGroupingPythonUDFFromAggregate":37912,"ReorderAssociativeOperator":117356,"CleanupDynamicPruningFilters":69722,"ResolveHints$RemoveAllHints":6153,"SimplifyBinaryComparison":70524,"ResolveTableValuedFunctions":6603,"EliminateSerialization":60535,"TypeCoercion$BooleanEquality":5757,"ReplaceIntersectWithSemiJoin":30875,"ConstantPropagation":61768,"CostBasedJoinReorder":16825,"Analyzer$ResolveReferences":37755,"CTESubstitution":207391,"RemoveRedundantAliases":54150,"TypeCoercion$ImplicitTypeCasts":6265,"RewriteExceptAll":32275,"UpdateAttributeNullability":54708,"PropagateEmptyRelation":79096,"SimplifyCasts":133314,"EliminateMapObjects":66297,"CombineLimits":60819,"DetectAmbiguousSelfJoin":29601,"ReplaceExpressions":88966,"ResolveInlineTables":4838,"OptimizeIn":73721,"CollapseWindow":73641,"TypeCoercion$IfCoercion":6135,"ResolveSessionCatalog":10877,"PartitionPruning":60103,"BooleanSimplification":73623,"TypeCoercion$PromoteStrings":5960,"Analyzer$ResolveAliases":5026,"DecimalAggregates":40021,"PruneFilters":88564,"Analyzer$ResolveMissingReferences":4572,"TransposeWindow":73821,"Analyzer$ResolveRelations":9740,"EliminateUnions":9352,"RewritePredicateSubquery":30648,"ObjectSerializerPruning":28182,"LimitPushDown":61656,"SimplifyCaseConversionExpressions":68506,"Analyzer$ResolveNaturalAndUsingJoin":4721,"EliminateView":36605,"CombineTypedFilters":34011,"OptimizeLimitZero":40821,"CheckCartesianProducts":30621,"ExtractPythonUDFFromAggregate":35692,"Analyzer$ExtractWindowExpressions":10664,"ReplaceExceptWithAntiJoin":29443,"ResolveLambdaVariables":8194,"FallBackFileSourceV2":3581,"Analyzer$ResolveTables":7088,"SubstituteUnresolvedOrdinals":5454,"TypeCoercion$CaseWhenCoercion":6200,"DecimalPrecision":19637,"EliminateSorts":36370,"PushDownLeftSemiAntiJoin":62483,"ExtractPythonUDFFromJoinCondition":37659,"TypeCoercion$StackCoercion":5918,"Analyzer$ResolveAggAliasInGroupBy":4955,"TypeCoercion$StringLiteralCoercion":5693,"FoldablePropagation":98340,"V2ScanRelationPushDown":54051,"EliminateDistinct":9954,"InferFiltersFromConstraints":93261,"Analyzer$PullOutNondeterministic":6558,"Analyzer$ResolveFunctions":9935,"ReplaceNullWithFalseInPredicate":68230,"ResolveHigherOrderFunctions":7034,"Analyzer$ResolvePivot":5392,"CollapseProject":107268,"Analyzer$ResolveNewInstance":10805,"ColumnPruning":264366,"Analyzer$ResolveWindowOrder":7117,"TypeCoercion$ConcatCoercion":8094,"PushDownPredicates":177057,"TimeWindowing":9074,"Optimizer$OptimizeSubqueries":195440,"RewriteNonCorrelatedExists":70426,"TypeCoercion$Division":5419,"ComputeCurrentTime":108663,"ResolveCreateNamedStruct":7419,"TypeCoercion$EltCoercion":8233,"ConvertToLocalRelation":61653,"RemoveRepetitionFromGroupExpressions":29686,"ReplaceDistinctWithAggregate":29344,"PreprocessTableCreation":7448,"ResolveSQLOnFile":3999,"Analyzer$ResolveSubquery":4681,"CombineConcats":13294,"Analyzer$ResolveGroupingAnalytics":11159,"Analyzer$ResolveBinaryArithmetic":7987,"RemoveDispensableExpressions":128186,"Analyzer$ResolveAlterTableChanges":6687,"ResolveEncodersInScalaAgg":8044,"TypeCoercion$IntegralDivision":5987,"Analyzer$ResolveWindowFrame":5246,"Analyzer$ResolveDeserializer":275008,"RewriteDistinctAggregates":40617,"RemoveNoopOperators":115093,"Analyzer$ResolveAggregateFunctions":5186,"NormalizeFloatingNumbers":28962,"ReorderJoin":64090,"Analyzer$ResolveUpCast":7051,"Analyzer$ResolveGenerate":5203,"TypeCoercion$WidenSetOperationTypes":4882,"EliminateOuterJoin":61775,"SimplifyExtractValueOps":133858,"OptimizeMetadataOnlyQuery":15136,"EliminateResolvedHint":89577,"Analyzer$ResolveInsertInto":4860,"ReplaceExceptWithFilter":50973,"CleanupAliases":42713,"GetCurrentDatabase":199404,"SchemaPruning":87363,"Analyzer$ResolveOutputRelation":5027,"BloomFilterJoinRule":45192,"Analyzer$ResolveRandomSeed":4711,"TypeCoercion$WindowFrameCoercion":6365,"ConstantFolding":70169,"TypeCoercion$DateTimeOperations":5835,"TypeCoercion$InConversion":6996,"FindDataSourceTable":5154,"SimplifyConditionals":70263,"DataSourceAnalysis":4883,"TypeCoercion$FunctionArgumentConversion":5819,"Analyzer$GlobalAggregates":4443,"Analyzer$LookupFunctions":21525,"CombineFilters":84970,"ReplaceDeduplicateWithAggregate":32340,"PreprocessTableInsertion":4597},"numRunsPerRule":{"PruneFileSourcePartitions":1,"ReassignLambdaVariableID":1,"PushPredicateThroughNonJoin":1,"Analyzer$HandleNullInputsForUDF":1,"Analyzer$ResolveSubqueryColumnAliases":1,"ResolveTimeZone":1,"Analyzer$ResolveNamespace":1,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":1,"RewriteCorrelatedScalarSubquery":2,"RemoveLiteralFromGroupExpressions":1,"PushProjectionThroughUnion":2,"EliminateSubqueryAliases":1,"ResolveCatalogs":1,"PushLeftSemiLeftAntiThroughJoin":2,"FlattenScalarSubqueriesWithAggregates":1,"LikeSimplification":2,"CollapseRepartition":2,"ResolveHints$ResolveCoalesceHints":1,"Analyzer$ExtractGenerator":1,"RewriteIntersectAll":1,"ResolveHints$ResolveJoinStrategyHints":1,"TypeCoercion$MapZipWithCoercion":1,"NullPropagation":2,"PullupCorrelatedPredicates":1,"UpdateOuterReferences":1,"ExtractPythonUDFs":1,"Analyzer$WindowsSubstitution":1,"CombineUnions":3,"ExtractGroupingPythonUDFFromAggregate":1,"ReorderAssociativeOperator":2,"CleanupDynamicPruningFilters":1,"ResolveHints$RemoveAllHints":1,"SimplifyBinaryComparison":2,"ResolveTableValuedFunctions":1,"EliminateSerialization":2,"TypeCoercion$BooleanEquality":1,"ReplaceIntersectWithSemiJoin":1,"ConstantPropagation":2,"CostBasedJoinReorder":1,"Analyzer$ResolveReferences":1,"CTESubstitution":1,"RemoveRedundantAliases":2,"TypeCoercion$ImplicitTypeCasts":1,"RewriteExceptAll":1,"UpdateAttributeNullability":1,"PropagateEmptyRelation":2,"SimplifyCasts":2,"EliminateMapObjects":1,"CombineLimits":2,"DetectAmbiguousSelfJoin":1,"ReplaceExpressions":1,"ResolveInlineTables":1,"OptimizeIn":2,"CollapseWindow":2,"TypeCoercion$IfCoercion":1,"ResolveSessionCatalog":1,"PartitionPruning":1,"BooleanSimplification":2,"TypeCoercion$PromoteStrings":1,"Analyzer$ResolveAliases":1,"DecimalAggregates":1,"PruneFilters":3,"Analyzer$ResolveMissingReferences":1,"TransposeWindow":2,"Analyzer$ResolveRelations":1,"EliminateUnions":1,"RewritePredicateSubquery":1,"ObjectSerializerPruning":1,"LimitPushDown":2,"SimplifyCaseConversionExpressions":2,"Analyzer$ResolveNaturalAndUsingJoin":1,"EliminateView":1,"CombineTypedFilters":1,"OptimizeLimitZero":1,"CheckCartesianProducts":2,"ExtractPythonUDFFromAggregate":1,"Analyzer$ExtractWindowExpressions":1,"ReplaceExceptWithAntiJoin":1,"ResolveLambdaVariables":1,"FallBackFileSourceV2":1,"Analyzer$ResolveTables":1,"SubstituteUnresolvedOrdinals":1,"TypeCoercion$CaseWhenCoercion":1,"DecimalPrecision":1,"EliminateSorts":1,"PushDownLeftSemiAntiJoin":2,"ExtractPythonUDFFromJoinCondition":1,"TypeCoercion$StackCoercion":1,"Analyzer$ResolveAggAliasInGroupBy":1,"TypeCoercion$StringLiteralCoercion":1,"FoldablePropagation":2,"V2ScanRelationPushDown":1,"EliminateDistinct":1,"InferFiltersFromConstraints":1,"Analyzer$PullOutNondeterministic":1,"Analyzer$ResolveFunctions":1,"ReplaceNullWithFalseInPredicate":2,"ResolveHigherOrderFunctions":1,"Analyzer$ResolvePivot":1,"CollapseProject":3,"Analyzer$ResolveNewInstance":1,"ColumnPruning":4,"Analyzer$ResolveWindowOrder":1,"TypeCoercion$ConcatCoercion":1,"PushDownPredicates":4,"TimeWindowing":1,"Optimizer$OptimizeSubqueries":3,"RewriteNonCorrelatedExists":1,"TypeCoercion$Division":1,"ComputeCurrentTime":1,"ResolveCreateNamedStruct":1,"TypeCoercion$EltCoercion":1,"ConvertToLocalRelation":2,"RemoveRepetitionFromGroupExpressions":1,"ReplaceDistinctWithAggregate":1,"PreprocessTableCreation":1,"ResolveSQLOnFile":1,"Analyzer$ResolveSubquery":1,"CombineConcats":2,"Analyzer$ResolveGroupingAnalytics":1,"Analyzer$ResolveBinaryArithmetic":1,"RemoveDispensableExpressions":2,"Analyzer$ResolveAlterTableChanges":1,"ResolveEncodersInScalaAgg":1,"TypeCoercion$IntegralDivision":1,"Analyzer$ResolveWindowFrame":1,"Analyzer$ResolveDeserializer":1,"RewriteDistinctAggregates":1,"RemoveNoopOperators":4,"Analyzer$ResolveAggregateFunctions":1,"NormalizeFloatingNumbers":1,"ReorderJoin":2,"Analyzer$ResolveUpCast":1,"Analyzer$ResolveGenerate":1,"TypeCoercion$WidenSetOperationTypes":1,"EliminateOuterJoin":2,"SimplifyExtractValueOps":2,"OptimizeMetadataOnlyQuery":1,"EliminateResolvedHint":1,"Analyzer$ResolveInsertInto":1,"ReplaceExceptWithFilter":1,"CleanupAliases":1,"GetCurrentDatabase":1,"SchemaPruning":1,"Analyzer$ResolveOutputRelation":1,"BloomFilterJoinRule":1,"Analyzer$ResolveRandomSeed":1,"TypeCoercion$WindowFrameCoercion":1,"ConstantFolding":2,"TypeCoercion$DateTimeOperations":1,"TypeCoercion$InConversion":1,"FindDataSourceTable":1,"SimplifyConditionals":2,"DataSourceAnalysis":1,"TypeCoercion$FunctionArgumentConversion":1,"Analyzer$GlobalAggregates":1,"Analyzer$LookupFunctions":1,"CombineFilters":3,"ReplaceDeduplicateWithAggregate":1,"PreprocessTableInsertion":1},"numEffectiveRunsPerRule":{},"timeEffectiveRunsPerRule":{},"counters":{},"timers":{}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":2,"time":1678162976405} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":3,"description":"createOrReplaceTempView at NativeMethodAccessorImpl.java:0","details":"org.apache.spark.sql.Dataset.createOrReplaceTempView(Dataset.scala:3312)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","physicalPlanDescription":"== Parsed Logical Plan ==\nCreateViewCommand `language_count_tmp`, false, true, LocalTempView\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nCreateViewCommand `language_count_tmp`, false, true, LocalTempView\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nCreateViewCommand `language_count_tmp`, false, true, LocalTempView\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Physical Plan ==\nExecute CreateViewCommand\n +- CreateViewCommand `language_count_tmp`, false, true, LocalTempView\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n","sparkPlanInfo":{"nodeName":"Execute CreateViewCommand","simpleString":"Execute CreateViewCommand","children":[],"metadata":{},"metrics":[]},"time":1678162976740} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerEffectiveSQLConf","executionId":3,"effectiveSQLConf":{"spark.sql.adaptive.coalescePartitions.initialPartitionNum":"","spark.sql.streaming.disabledV2Writers":"","spark.sql.streaming.noDataMicroBatches.enabled":"true","spark.sql.files.maxPartitionBytes":"128MB","spark.sql.thriftserver.ui.retainedStatements":"200","spark.sql.ui.retainedExecutions":"1000","spark.sql.execution.arrow.pyspark.fallback.enabled":"","spark.sql.sources.parallelPartitionDiscovery.threshold":"32","spark.sql.cbo.joinReorder.enabled":"false","spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.int96AsTimestamp":"true","spark.sql.thriftserver.ui.retainedSessions":"200","spark.sql.hive.verifyPartitionPath":"false","spark.sql.autoBroadcastJoinThreshold":"10MB","spark.sql.orc.mergeSchema":"false","spark.sql.adaptive.enabled":"true","spark.sql.adaptive.skewJoin.enabled":"true","spark.sql.optimizer.sizeBasedJoinReorder.enabled":"true","spark.sql.streaming.ui.retainedQueries":"100","spark.sql.orc.enableVectorizedReader":"true","spark.sql.streaming.continuous.epochBacklogQueueSize":"10000","spark.sql.parquet.binaryAsString":"false","spark.sql.pivotMaxValues":"10000","spark.sql.adaptive.localShuffleReader.enabled":"true","spark.sql.storeAssignmentPolicy":"ANSI","spark.sql.redaction.options.regex":"(?i)url","spark.sql.cbo.joinReorder.dp.star.filter":"false","spark.sql.parser.quotedRegexColumnNames":"false","spark.sql.files.ignoreMissingFiles":"false","spark.sql.streaming.ui.enabled":"true","spark.sql.execution.arrow.sparkr.enabled":"false","spark.sql.orc.filterPushdown":"true","spark.sql.thriftserver.scheduler.pool":"","spark.sql.catalog.spark_catalog":"","spark.sql.datetime.java8API.enabled":"false","spark.sql.execution.pandas.udf.buffer.size":"","spark.sql.function.eltOutputAsString":"false","spark.sql.cbo.joinReorder.dp.threshold":"12","spark.sql.statsImprovements.enabled":"true","spark.sql.parquet.columnarReaderBatchSize":"4096","spark.sql.parquet.outputTimestampType":"INT96","spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes":"256MB","spark.sql.statistics.size.autoUpdate.enabled":"false","spark.sql.hive.filesourcePartitionFileCacheSize":"262144000","spark.sql.streaming.ui.retainedProgressUpdates":"100","spark.sql.inMemoryColumnarStorage.compressed":"true","spark.sql.pyspark.jvmStacktrace.enabled":"false","spark.sql.cbo.starSchemaDetection":"false","spark.sql.columnNameOfCorruptRecord":"_corrupt_record","spark.sql.adaptive.advisoryPartitionSizeInBytes":"","spark.sql.avro.compression.codec":"snappy","spark.sql.streaming.metricsEnabled":"false","spark.sql.ui.pruneCachedInMemoryRelation":"true","spark.sql.event.truncate.length":"2147483647","spark.sql.groupByAliases":"true","spark.sql.hive.metastorePartitionPruning":"true","spark.sql.execution.arrow.enabled":"false","spark.sql.optimizer.dynamicPartitionPruning.enabled":"true","spark.sql.extendedEventInfo":"true","spark.sql.sources.bucketing.maxBuckets":"100000","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.jsonGenerator.ignoreNullFields":"true","spark.sql.optimizer.flattenScalarSubqueriesWithAggregates.enabled":"true","spark.sql.debug.maxToStringFields":"25","spark.sql.files.maxRecordsPerFile":"0","spark.sql.hive.thriftServer.singleSession":"false","spark.sql.parquet.mergeSchema":"false","spark.sql.adaptive.skewJoin.skewedPartitionFactor":"5","spark.sql.queryExecutionListeners":"","spark.sql.streaming.numRecentProgressUpdates":"100","spark.sql.variable.substitute":"true","spark.sql.adaptive.shuffle.mapOutputStats.useDataSize":"true","spark.sql.function.concatBinaryAsString":"false","spark.sql.avro.deflate.level":"-1","spark.sql.repl.eagerEval.maxNumRows":"20","spark.sql.ansi.enabled":"false","spark.sql.legacy.allowHashOnMapType":"false","spark.sql.bloomFilterJoin.maxFilterBytes":"2097152","spark.sql.orderByOrdinal":"true","spark.sql.streaming.stopTimeout":"0","spark.sql.adaptive.coalescePartitions.enabled":"true","spark.sql.session.timeZone":"UTC","spark.sql.parquet.respectSummaryFiles":"false","spark.sql.parquet.enableVectorizedReader":"true","spark.sql.cbo.enabled":"false","spark.sql.sources.bucketing.enabled":"true","spark.sql.csv.filterPushdown.enabled":"true","spark.sql.repl.eagerEval.truncate":"20","spark.sql.cbo.planStats.enabled":"false","spark.sql.optimizer.excludedRules":"","spark.sql.sources.partitionColumnTypeInference.enabled":"true","spark.sql.sortMergeJoinExec.extendedCodegen.enabled":"true","spark.sql.execution.arrow.fallback.enabled":"true","spark.sql.streaming.forceDeleteTempCheckpointLocation":"false","spark.sql.maxPlanStringLength":"2147483632","spark.sql.hive.manageFilesourcePartitions":"true","spark.sql.parquet.int96TimestampConversion":"false","spark.sql.sources.partitionOverwriteMode":"STATIC","spark.sql.streaming.checkpointLocation":"","spark.sql.broadcastTimeout":"300","spark.sql.execution.arrow.pyspark.enabled":"","spark.sql.repl.eagerEval.enabled":"false","spark.sql.mapKeyDedupPolicy":"EXCEPTION","spark.sql.parquet.filterPushdown":"true","spark.sql.maven.additionalRemoteRepositories":"https://maven-central.storage-download.googleapis.com/maven2/","spark.sql.orc.compression.codec":"snappy","spark.sql.statistics.histogram.enabled":"false","spark.sql.streaming.stopActiveRunOnRestart":"true","spark.sql.orc.columnarReaderBatchSize":"4096","spark.sql.redaction.string.regex":"","spark.sql.inMemoryColumnarStorage.batchSize":"10000","spark.sql.parquet.writeLegacyFormat":"false","spark.sql.statistics.fallBackToHdfs":"false","spark.sql.defaultCatalog":"spark_catalog","spark.sql.streaming.fileSource.cleaner.numThreads":"1","spark.sql.legacy.sessionInitWithConfigDefaults":"false","spark.sql.adaptive.coalescePartitions.minPartitionNum":"","spark.sql.execution.arrow.maxRecordsPerBatch":"10000","spark.sql.stringHashComputationWithoutCopyMemory.enabled":"true","spark.sql.streaming.streamingQueryListeners":"","spark.sql.files.ignoreCorruptFiles":"false","spark.sql.inMemoryColumnarStorage.enableVectorizedReader":"true","spark.sql.extensions":"","spark.sql.sources.default":"parquet","spark.sql.ui.extendedInfo":"true","spark.sql.bloomFilterJoin.enabled":"true","spark.sql.parquet.compression.codec":"snappy","spark.sql.optimizer.distinctBeforeIntersect.enabled":"true","spark.sql.parquet.recordLevelFilter.enabled":"false","spark.sql.shuffle.partitions":"200","spark.sql.groupByOrdinal":"true"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerQueryExecutionMetrics","executionId":3,"timePerRule":{"PruneFileSourcePartitions":65698,"ReassignLambdaVariableID":72929,"PushPredicateThroughNonJoin":27568,"Analyzer$HandleNullInputsForUDF":11536,"Analyzer$ResolveSubqueryColumnAliases":4719,"ResolveTimeZone":6693,"Analyzer$ResolveNamespace":6222,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":9443,"RewriteCorrelatedScalarSubquery":64538,"RemoveLiteralFromGroupExpressions":30597,"PushProjectionThroughUnion":58963,"EliminateSubqueryAliases":45538,"ResolveCatalogs":8264,"PushLeftSemiLeftAntiThroughJoin":52788,"FlattenScalarSubqueriesWithAggregates":50271,"LikeSimplification":139306,"CollapseRepartition":72379,"ResolveHints$ResolveCoalesceHints":6157,"Analyzer$ExtractGenerator":21895,"RewriteIntersectAll":31148,"ResolveHints$ResolveJoinStrategyHints":7363,"TypeCoercion$MapZipWithCoercion":6160,"NullPropagation":78835,"PullupCorrelatedPredicates":41093,"UpdateOuterReferences":6617,"ExtractPythonUDFs":58387,"Analyzer$WindowsSubstitution":7236,"CombineUnions":93165,"ExtractGroupingPythonUDFFromAggregate":32353,"ReorderAssociativeOperator":145006,"CleanupDynamicPruningFilters":79817,"ResolveHints$RemoveAllHints":4600,"SimplifyBinaryComparison":72121,"ResolveTableValuedFunctions":7525,"EliminateSerialization":54484,"TypeCoercion$BooleanEquality":9530,"ReplaceIntersectWithSemiJoin":28664,"ConstantPropagation":60357,"CostBasedJoinReorder":20227,"Analyzer$ResolveReferences":61919,"CTESubstitution":193253,"RemoveRedundantAliases":64210,"TypeCoercion$ImplicitTypeCasts":5236,"RewriteExceptAll":33860,"UpdateAttributeNullability":48302,"PropagateEmptyRelation":82086,"SimplifyCasts":136203,"EliminateMapObjects":70884,"CombineLimits":58072,"DetectAmbiguousSelfJoin":27514,"ReplaceExpressions":75795,"ResolveInlineTables":5329,"OptimizeIn":73976,"CollapseWindow":65179,"TypeCoercion$IfCoercion":5652,"ResolveSessionCatalog":6313,"PartitionPruning":67272,"BooleanSimplification":73811,"TypeCoercion$PromoteStrings":8307,"Analyzer$ResolveAliases":6290,"DecimalAggregates":38128,"PruneFilters":85436,"Analyzer$ResolveMissingReferences":5596,"TransposeWindow":78446,"Analyzer$ResolveRelations":13022,"EliminateUnions":6843,"RewritePredicateSubquery":30493,"ObjectSerializerPruning":31619,"LimitPushDown":55578,"SimplifyCaseConversionExpressions":66025,"Analyzer$ResolveNaturalAndUsingJoin":5701,"EliminateView":42753,"CombineTypedFilters":28066,"OptimizeLimitZero":41950,"CheckCartesianProducts":35178,"ExtractPythonUDFFromAggregate":35801,"Analyzer$ExtractWindowExpressions":11652,"ReplaceExceptWithAntiJoin":30669,"ResolveLambdaVariables":10018,"FallBackFileSourceV2":2887,"Analyzer$ResolveTables":8417,"SubstituteUnresolvedOrdinals":6298,"TypeCoercion$CaseWhenCoercion":5879,"DecimalPrecision":20119,"EliminateSorts":32814,"PushDownLeftSemiAntiJoin":54294,"ExtractPythonUDFFromJoinCondition":46473,"TypeCoercion$StackCoercion":5589,"Analyzer$ResolveAggAliasInGroupBy":5343,"TypeCoercion$StringLiteralCoercion":5125,"FoldablePropagation":109287,"V2ScanRelationPushDown":51124,"EliminateDistinct":10658,"InferFiltersFromConstraints":54387,"Analyzer$PullOutNondeterministic":5500,"Analyzer$ResolveFunctions":10377,"ReplaceNullWithFalseInPredicate":75563,"ResolveHigherOrderFunctions":8499,"Analyzer$ResolvePivot":5738,"CollapseProject":104480,"Analyzer$ResolveNewInstance":9065,"ColumnPruning":274474,"Analyzer$ResolveWindowOrder":8374,"TypeCoercion$ConcatCoercion":7072,"PushDownPredicates":177907,"TimeWindowing":10415,"Optimizer$OptimizeSubqueries":217826,"RewriteNonCorrelatedExists":75551,"TypeCoercion$Division":5955,"ComputeCurrentTime":110775,"ResolveCreateNamedStruct":7962,"TypeCoercion$EltCoercion":6648,"ConvertToLocalRelation":60200,"RemoveRepetitionFromGroupExpressions":30127,"ReplaceDistinctWithAggregate":29815,"PreprocessTableCreation":5793,"ResolveSQLOnFile":3199,"Analyzer$ResolveSubquery":5693,"CombineConcats":14125,"Analyzer$ResolveGroupingAnalytics":11038,"Analyzer$ResolveBinaryArithmetic":7237,"RemoveDispensableExpressions":132171,"Analyzer$ResolveAlterTableChanges":5382,"ResolveEncodersInScalaAgg":8176,"TypeCoercion$IntegralDivision":5093,"Analyzer$ResolveWindowFrame":5950,"Analyzer$ResolveDeserializer":29706,"RewriteDistinctAggregates":40688,"RemoveNoopOperators":112524,"Analyzer$ResolveAggregateFunctions":6194,"NormalizeFloatingNumbers":30746,"ReorderJoin":62428,"Analyzer$ResolveUpCast":7072,"Analyzer$ResolveGenerate":6937,"TypeCoercion$WidenSetOperationTypes":6212,"EliminateOuterJoin":58649,"SimplifyExtractValueOps":75463,"OptimizeMetadataOnlyQuery":18795,"EliminateResolvedHint":94161,"Analyzer$ResolveInsertInto":6027,"ReplaceExceptWithFilter":53028,"CleanupAliases":10402,"GetCurrentDatabase":179739,"SchemaPruning":87947,"Analyzer$ResolveOutputRelation":5170,"BloomFilterJoinRule":45692,"Analyzer$ResolveRandomSeed":5312,"TypeCoercion$WindowFrameCoercion":5673,"ConstantFolding":70574,"TypeCoercion$DateTimeOperations":4960,"TypeCoercion$InConversion":10076,"FindDataSourceTable":4298,"SimplifyConditionals":68775,"DataSourceAnalysis":3529,"TypeCoercion$FunctionArgumentConversion":5731,"Analyzer$GlobalAggregates":4168,"Analyzer$LookupFunctions":25019,"CombineFilters":84098,"ReplaceDeduplicateWithAggregate":30851,"PreprocessTableInsertion":3551},"numRunsPerRule":{"PruneFileSourcePartitions":1,"ReassignLambdaVariableID":1,"PushPredicateThroughNonJoin":1,"Analyzer$HandleNullInputsForUDF":1,"Analyzer$ResolveSubqueryColumnAliases":1,"ResolveTimeZone":1,"Analyzer$ResolveNamespace":1,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":1,"RewriteCorrelatedScalarSubquery":2,"RemoveLiteralFromGroupExpressions":1,"PushProjectionThroughUnion":2,"EliminateSubqueryAliases":1,"ResolveCatalogs":1,"PushLeftSemiLeftAntiThroughJoin":2,"FlattenScalarSubqueriesWithAggregates":1,"LikeSimplification":2,"CollapseRepartition":2,"ResolveHints$ResolveCoalesceHints":1,"Analyzer$ExtractGenerator":1,"RewriteIntersectAll":1,"ResolveHints$ResolveJoinStrategyHints":1,"TypeCoercion$MapZipWithCoercion":1,"NullPropagation":2,"PullupCorrelatedPredicates":1,"UpdateOuterReferences":1,"ExtractPythonUDFs":1,"Analyzer$WindowsSubstitution":1,"CombineUnions":3,"ExtractGroupingPythonUDFFromAggregate":1,"ReorderAssociativeOperator":2,"CleanupDynamicPruningFilters":1,"ResolveHints$RemoveAllHints":1,"SimplifyBinaryComparison":2,"ResolveTableValuedFunctions":1,"EliminateSerialization":2,"TypeCoercion$BooleanEquality":1,"ReplaceIntersectWithSemiJoin":1,"ConstantPropagation":2,"CostBasedJoinReorder":1,"Analyzer$ResolveReferences":1,"CTESubstitution":1,"RemoveRedundantAliases":2,"TypeCoercion$ImplicitTypeCasts":1,"RewriteExceptAll":1,"UpdateAttributeNullability":1,"PropagateEmptyRelation":2,"SimplifyCasts":2,"EliminateMapObjects":1,"CombineLimits":2,"DetectAmbiguousSelfJoin":1,"ReplaceExpressions":1,"ResolveInlineTables":1,"OptimizeIn":2,"CollapseWindow":2,"TypeCoercion$IfCoercion":1,"ResolveSessionCatalog":1,"PartitionPruning":1,"BooleanSimplification":2,"TypeCoercion$PromoteStrings":1,"Analyzer$ResolveAliases":1,"DecimalAggregates":1,"PruneFilters":3,"Analyzer$ResolveMissingReferences":1,"TransposeWindow":2,"Analyzer$ResolveRelations":1,"EliminateUnions":1,"RewritePredicateSubquery":1,"ObjectSerializerPruning":1,"LimitPushDown":2,"SimplifyCaseConversionExpressions":2,"Analyzer$ResolveNaturalAndUsingJoin":1,"EliminateView":1,"CombineTypedFilters":1,"OptimizeLimitZero":1,"CheckCartesianProducts":2,"ExtractPythonUDFFromAggregate":1,"Analyzer$ExtractWindowExpressions":1,"ReplaceExceptWithAntiJoin":1,"ResolveLambdaVariables":1,"FallBackFileSourceV2":1,"Analyzer$ResolveTables":1,"SubstituteUnresolvedOrdinals":1,"TypeCoercion$CaseWhenCoercion":1,"DecimalPrecision":1,"EliminateSorts":1,"PushDownLeftSemiAntiJoin":2,"ExtractPythonUDFFromJoinCondition":1,"TypeCoercion$StackCoercion":1,"Analyzer$ResolveAggAliasInGroupBy":1,"TypeCoercion$StringLiteralCoercion":1,"FoldablePropagation":2,"V2ScanRelationPushDown":1,"EliminateDistinct":1,"InferFiltersFromConstraints":1,"Analyzer$PullOutNondeterministic":1,"Analyzer$ResolveFunctions":1,"ReplaceNullWithFalseInPredicate":2,"ResolveHigherOrderFunctions":1,"Analyzer$ResolvePivot":1,"CollapseProject":3,"Analyzer$ResolveNewInstance":1,"ColumnPruning":4,"Analyzer$ResolveWindowOrder":1,"TypeCoercion$ConcatCoercion":1,"PushDownPredicates":4,"TimeWindowing":1,"Optimizer$OptimizeSubqueries":3,"RewriteNonCorrelatedExists":1,"TypeCoercion$Division":1,"ComputeCurrentTime":1,"ResolveCreateNamedStruct":1,"TypeCoercion$EltCoercion":1,"ConvertToLocalRelation":2,"RemoveRepetitionFromGroupExpressions":1,"ReplaceDistinctWithAggregate":1,"PreprocessTableCreation":1,"ResolveSQLOnFile":1,"Analyzer$ResolveSubquery":1,"CombineConcats":2,"Analyzer$ResolveGroupingAnalytics":1,"Analyzer$ResolveBinaryArithmetic":1,"RemoveDispensableExpressions":2,"Analyzer$ResolveAlterTableChanges":1,"ResolveEncodersInScalaAgg":1,"TypeCoercion$IntegralDivision":1,"Analyzer$ResolveWindowFrame":1,"Analyzer$ResolveDeserializer":1,"RewriteDistinctAggregates":1,"RemoveNoopOperators":4,"Analyzer$ResolveAggregateFunctions":1,"NormalizeFloatingNumbers":1,"ReorderJoin":2,"Analyzer$ResolveUpCast":1,"Analyzer$ResolveGenerate":1,"TypeCoercion$WidenSetOperationTypes":1,"EliminateOuterJoin":2,"SimplifyExtractValueOps":2,"OptimizeMetadataOnlyQuery":1,"EliminateResolvedHint":1,"Analyzer$ResolveInsertInto":1,"ReplaceExceptWithFilter":1,"CleanupAliases":1,"GetCurrentDatabase":1,"SchemaPruning":1,"Analyzer$ResolveOutputRelation":1,"BloomFilterJoinRule":1,"Analyzer$ResolveRandomSeed":1,"TypeCoercion$WindowFrameCoercion":1,"ConstantFolding":2,"TypeCoercion$DateTimeOperations":1,"TypeCoercion$InConversion":1,"FindDataSourceTable":1,"SimplifyConditionals":2,"DataSourceAnalysis":1,"TypeCoercion$FunctionArgumentConversion":1,"Analyzer$GlobalAggregates":1,"Analyzer$LookupFunctions":1,"CombineFilters":3,"ReplaceDeduplicateWithAggregate":1,"PreprocessTableInsertion":1},"numEffectiveRunsPerRule":{},"timeEffectiveRunsPerRule":{},"counters":{},"timers":{}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":3,"time":1678162976746} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":4,"description":"createOrReplaceTempView at NativeMethodAccessorImpl.java:0","details":"org.apache.spark.sql.Dataset.createOrReplaceTempView(Dataset.scala:3312)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","physicalPlanDescription":"== Parsed Logical Plan ==\nCreateViewCommand `language_charset_loglikelihood`, false, true, LocalTempView\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (cast(2 as double) * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(cast(total_pages#546L as double) as double)))) + if ((n_pages_charset#547L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if ((n_pages_languages#548L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) cast(0 as double) else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / cast(((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)) as double))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(cast(n_pages_languages#548L as double) as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(cast(n_pages_charset#547L as double) as double)) AS prob_lang_given_cs#567]\n +- SubqueryAlias language_count_tmp\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nCreateViewCommand `language_charset_loglikelihood`, false, true, LocalTempView\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (cast(2 as double) * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(cast(total_pages#546L as double) as double)))) + if ((n_pages_charset#547L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if ((n_pages_languages#548L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) cast(0 as double) else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / cast(((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)) as double))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(cast(n_pages_languages#548L as double) as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(cast(n_pages_charset#547L as double) as double)) AS prob_lang_given_cs#567]\n +- SubqueryAlias language_count_tmp\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nCreateViewCommand `language_charset_loglikelihood`, false, true, LocalTempView\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (cast(2 as double) * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(cast(total_pages#546L as double) as double)))) + if ((n_pages_charset#547L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if ((n_pages_languages#548L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) cast(0 as double) else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / cast(((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)) as double))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(cast(n_pages_languages#548L as double) as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(cast(n_pages_charset#547L as double) as double)) AS prob_lang_given_cs#567]\n +- SubqueryAlias language_count_tmp\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Physical Plan ==\nExecute CreateViewCommand\n +- CreateViewCommand `language_charset_loglikelihood`, false, true, LocalTempView\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (cast(2 as double) * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(cast(total_pages#546L as double) as double)))) + if ((n_pages_charset#547L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if ((n_pages_languages#548L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) cast(0 as double) else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / cast(((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)) as double))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(cast(n_pages_languages#548L as double) as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(cast(n_pages_charset#547L as double) as double)) AS prob_lang_given_cs#567]\n +- SubqueryAlias language_count_tmp\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n","sparkPlanInfo":{"nodeName":"Execute CreateViewCommand","simpleString":"Execute CreateViewCommand","children":[],"metadata":{},"metrics":[]},"time":1678162977269} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerEffectiveSQLConf","executionId":4,"effectiveSQLConf":{"spark.sql.adaptive.coalescePartitions.initialPartitionNum":"","spark.sql.streaming.disabledV2Writers":"","spark.sql.streaming.noDataMicroBatches.enabled":"true","spark.sql.files.maxPartitionBytes":"128MB","spark.sql.thriftserver.ui.retainedStatements":"200","spark.sql.ui.retainedExecutions":"1000","spark.sql.execution.arrow.pyspark.fallback.enabled":"","spark.sql.sources.parallelPartitionDiscovery.threshold":"32","spark.sql.cbo.joinReorder.enabled":"false","spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.int96AsTimestamp":"true","spark.sql.thriftserver.ui.retainedSessions":"200","spark.sql.hive.verifyPartitionPath":"false","spark.sql.autoBroadcastJoinThreshold":"10MB","spark.sql.orc.mergeSchema":"false","spark.sql.adaptive.enabled":"true","spark.sql.adaptive.skewJoin.enabled":"true","spark.sql.optimizer.sizeBasedJoinReorder.enabled":"true","spark.sql.streaming.ui.retainedQueries":"100","spark.sql.orc.enableVectorizedReader":"true","spark.sql.streaming.continuous.epochBacklogQueueSize":"10000","spark.sql.parquet.binaryAsString":"false","spark.sql.pivotMaxValues":"10000","spark.sql.adaptive.localShuffleReader.enabled":"true","spark.sql.storeAssignmentPolicy":"ANSI","spark.sql.redaction.options.regex":"(?i)url","spark.sql.cbo.joinReorder.dp.star.filter":"false","spark.sql.parser.quotedRegexColumnNames":"false","spark.sql.files.ignoreMissingFiles":"false","spark.sql.streaming.ui.enabled":"true","spark.sql.execution.arrow.sparkr.enabled":"false","spark.sql.orc.filterPushdown":"true","spark.sql.thriftserver.scheduler.pool":"","spark.sql.catalog.spark_catalog":"","spark.sql.datetime.java8API.enabled":"false","spark.sql.execution.pandas.udf.buffer.size":"","spark.sql.function.eltOutputAsString":"false","spark.sql.cbo.joinReorder.dp.threshold":"12","spark.sql.statsImprovements.enabled":"true","spark.sql.parquet.columnarReaderBatchSize":"4096","spark.sql.parquet.outputTimestampType":"INT96","spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes":"256MB","spark.sql.statistics.size.autoUpdate.enabled":"false","spark.sql.hive.filesourcePartitionFileCacheSize":"262144000","spark.sql.streaming.ui.retainedProgressUpdates":"100","spark.sql.inMemoryColumnarStorage.compressed":"true","spark.sql.pyspark.jvmStacktrace.enabled":"false","spark.sql.cbo.starSchemaDetection":"false","spark.sql.columnNameOfCorruptRecord":"_corrupt_record","spark.sql.adaptive.advisoryPartitionSizeInBytes":"","spark.sql.avro.compression.codec":"snappy","spark.sql.streaming.metricsEnabled":"false","spark.sql.ui.pruneCachedInMemoryRelation":"true","spark.sql.event.truncate.length":"2147483647","spark.sql.groupByAliases":"true","spark.sql.hive.metastorePartitionPruning":"true","spark.sql.execution.arrow.enabled":"false","spark.sql.optimizer.dynamicPartitionPruning.enabled":"true","spark.sql.extendedEventInfo":"true","spark.sql.sources.bucketing.maxBuckets":"100000","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.jsonGenerator.ignoreNullFields":"true","spark.sql.optimizer.flattenScalarSubqueriesWithAggregates.enabled":"true","spark.sql.debug.maxToStringFields":"25","spark.sql.files.maxRecordsPerFile":"0","spark.sql.hive.thriftServer.singleSession":"false","spark.sql.parquet.mergeSchema":"false","spark.sql.adaptive.skewJoin.skewedPartitionFactor":"5","spark.sql.queryExecutionListeners":"","spark.sql.streaming.numRecentProgressUpdates":"100","spark.sql.variable.substitute":"true","spark.sql.adaptive.shuffle.mapOutputStats.useDataSize":"true","spark.sql.function.concatBinaryAsString":"false","spark.sql.avro.deflate.level":"-1","spark.sql.repl.eagerEval.maxNumRows":"20","spark.sql.ansi.enabled":"false","spark.sql.legacy.allowHashOnMapType":"false","spark.sql.bloomFilterJoin.maxFilterBytes":"2097152","spark.sql.orderByOrdinal":"true","spark.sql.streaming.stopTimeout":"0","spark.sql.adaptive.coalescePartitions.enabled":"true","spark.sql.session.timeZone":"UTC","spark.sql.parquet.respectSummaryFiles":"false","spark.sql.parquet.enableVectorizedReader":"true","spark.sql.cbo.enabled":"false","spark.sql.sources.bucketing.enabled":"true","spark.sql.csv.filterPushdown.enabled":"true","spark.sql.repl.eagerEval.truncate":"20","spark.sql.cbo.planStats.enabled":"false","spark.sql.optimizer.excludedRules":"","spark.sql.sources.partitionColumnTypeInference.enabled":"true","spark.sql.sortMergeJoinExec.extendedCodegen.enabled":"true","spark.sql.execution.arrow.fallback.enabled":"true","spark.sql.streaming.forceDeleteTempCheckpointLocation":"false","spark.sql.maxPlanStringLength":"2147483632","spark.sql.hive.manageFilesourcePartitions":"true","spark.sql.parquet.int96TimestampConversion":"false","spark.sql.sources.partitionOverwriteMode":"STATIC","spark.sql.streaming.checkpointLocation":"","spark.sql.broadcastTimeout":"300","spark.sql.execution.arrow.pyspark.enabled":"","spark.sql.repl.eagerEval.enabled":"false","spark.sql.mapKeyDedupPolicy":"EXCEPTION","spark.sql.parquet.filterPushdown":"true","spark.sql.maven.additionalRemoteRepositories":"https://maven-central.storage-download.googleapis.com/maven2/","spark.sql.orc.compression.codec":"snappy","spark.sql.statistics.histogram.enabled":"false","spark.sql.streaming.stopActiveRunOnRestart":"true","spark.sql.orc.columnarReaderBatchSize":"4096","spark.sql.redaction.string.regex":"","spark.sql.inMemoryColumnarStorage.batchSize":"10000","spark.sql.parquet.writeLegacyFormat":"false","spark.sql.statistics.fallBackToHdfs":"false","spark.sql.defaultCatalog":"spark_catalog","spark.sql.streaming.fileSource.cleaner.numThreads":"1","spark.sql.legacy.sessionInitWithConfigDefaults":"false","spark.sql.adaptive.coalescePartitions.minPartitionNum":"","spark.sql.execution.arrow.maxRecordsPerBatch":"10000","spark.sql.stringHashComputationWithoutCopyMemory.enabled":"true","spark.sql.streaming.streamingQueryListeners":"","spark.sql.files.ignoreCorruptFiles":"false","spark.sql.inMemoryColumnarStorage.enableVectorizedReader":"true","spark.sql.extensions":"","spark.sql.sources.default":"parquet","spark.sql.ui.extendedInfo":"true","spark.sql.bloomFilterJoin.enabled":"true","spark.sql.parquet.compression.codec":"snappy","spark.sql.optimizer.distinctBeforeIntersect.enabled":"true","spark.sql.parquet.recordLevelFilter.enabled":"false","spark.sql.shuffle.partitions":"200","spark.sql.groupByOrdinal":"true"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerQueryExecutionMetrics","executionId":4,"timePerRule":{"PruneFileSourcePartitions":52779,"ReassignLambdaVariableID":77158,"PushPredicateThroughNonJoin":27490,"Analyzer$HandleNullInputsForUDF":11453,"Analyzer$ResolveSubqueryColumnAliases":3717,"ResolveTimeZone":5751,"Analyzer$ResolveNamespace":4009,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":7515,"RewriteCorrelatedScalarSubquery":57042,"RemoveLiteralFromGroupExpressions":27107,"PushProjectionThroughUnion":57237,"EliminateSubqueryAliases":40188,"ResolveCatalogs":5636,"PushLeftSemiLeftAntiThroughJoin":53699,"FlattenScalarSubqueriesWithAggregates":49177,"LikeSimplification":128831,"CollapseRepartition":68445,"ResolveHints$ResolveCoalesceHints":5088,"Analyzer$ExtractGenerator":18674,"RewriteIntersectAll":26895,"ResolveHints$ResolveJoinStrategyHints":5206,"TypeCoercion$MapZipWithCoercion":10749,"NullPropagation":68432,"PullupCorrelatedPredicates":32968,"UpdateOuterReferences":9446,"ExtractPythonUDFs":90523,"Analyzer$WindowsSubstitution":7244,"CombineUnions":80883,"ExtractGroupingPythonUDFFromAggregate":34582,"ReorderAssociativeOperator":123666,"CleanupDynamicPruningFilters":71701,"ResolveHints$RemoveAllHints":5703,"SimplifyBinaryComparison":71739,"ResolveTableValuedFunctions":5955,"EliminateSerialization":54844,"TypeCoercion$BooleanEquality":5911,"ReplaceIntersectWithSemiJoin":35390,"ConstantPropagation":55503,"CostBasedJoinReorder":20713,"Analyzer$ResolveReferences":34660,"CTESubstitution":198488,"RemoveRedundantAliases":68191,"TypeCoercion$ImplicitTypeCasts":6121,"RewriteExceptAll":28552,"UpdateAttributeNullability":310797,"PropagateEmptyRelation":74681,"SimplifyCasts":151208,"EliminateMapObjects":64973,"CombineLimits":57570,"DetectAmbiguousSelfJoin":30309,"ReplaceExpressions":65610,"ResolveInlineTables":4574,"OptimizeIn":77708,"CollapseWindow":65602,"TypeCoercion$IfCoercion":5046,"ResolveSessionCatalog":7602,"PartitionPruning":69556,"BooleanSimplification":72893,"TypeCoercion$PromoteStrings":6246,"Analyzer$ResolveAliases":4086,"DecimalAggregates":37860,"PruneFilters":82018,"Analyzer$ResolveMissingReferences":3812,"TransposeWindow":71711,"Analyzer$ResolveRelations":6804,"EliminateUnions":7164,"RewritePredicateSubquery":32105,"ObjectSerializerPruning":27349,"LimitPushDown":54587,"SimplifyCaseConversionExpressions":62967,"Analyzer$ResolveNaturalAndUsingJoin":3940,"EliminateView":35223,"CombineTypedFilters":27166,"OptimizeLimitZero":35928,"CheckCartesianProducts":34683,"ExtractPythonUDFFromAggregate":38954,"Analyzer$ExtractWindowExpressions":10337,"ReplaceExceptWithAntiJoin":26779,"ResolveLambdaVariables":7231,"FallBackFileSourceV2":3034,"Analyzer$ResolveTables":5367,"SubstituteUnresolvedOrdinals":4926,"TypeCoercion$CaseWhenCoercion":5366,"DecimalPrecision":8998,"EliminateSorts":33679,"PushDownLeftSemiAntiJoin":56788,"ExtractPythonUDFFromJoinCondition":40538,"TypeCoercion$StackCoercion":5406,"Analyzer$ResolveAggAliasInGroupBy":3571,"TypeCoercion$StringLiteralCoercion":4899,"FoldablePropagation":931302,"V2ScanRelationPushDown":48737,"EliminateDistinct":10167,"InferFiltersFromConstraints":5579378,"Analyzer$PullOutNondeterministic":5700,"Analyzer$ResolveFunctions":8826,"ReplaceNullWithFalseInPredicate":64887,"ResolveHigherOrderFunctions":7325,"Analyzer$ResolvePivot":4892,"CollapseProject":101596,"Analyzer$ResolveNewInstance":7800,"ColumnPruning":296080,"Analyzer$ResolveWindowOrder":7558,"TypeCoercion$ConcatCoercion":7330,"PushDownPredicates":141892,"TimeWindowing":9305,"Optimizer$OptimizeSubqueries":196707,"RewriteNonCorrelatedExists":61209,"TypeCoercion$Division":5465,"ComputeCurrentTime":105508,"ResolveCreateNamedStruct":7342,"TypeCoercion$EltCoercion":7100,"ConvertToLocalRelation":60075,"RemoveRepetitionFromGroupExpressions":26826,"ReplaceDistinctWithAggregate":26252,"PreprocessTableCreation":6298,"ResolveSQLOnFile":3825,"Analyzer$ResolveSubquery":4146,"CombineConcats":15370,"Analyzer$ResolveGroupingAnalytics":8799,"Analyzer$ResolveBinaryArithmetic":6498,"RemoveDispensableExpressions":128736,"Analyzer$ResolveAlterTableChanges":6112,"ResolveEncodersInScalaAgg":8292,"TypeCoercion$IntegralDivision":5388,"Analyzer$ResolveWindowFrame":5516,"Analyzer$ResolveDeserializer":26368,"RewriteDistinctAggregates":40294,"RemoveNoopOperators":111773,"Analyzer$ResolveAggregateFunctions":4547,"NormalizeFloatingNumbers":29874,"ReorderJoin":58615,"Analyzer$ResolveUpCast":6122,"Analyzer$ResolveGenerate":4358,"TypeCoercion$WidenSetOperationTypes":4077,"EliminateOuterJoin":54802,"SimplifyExtractValueOps":64890,"OptimizeMetadataOnlyQuery":17618,"EliminateResolvedHint":91363,"Analyzer$ResolveInsertInto":3172,"ReplaceExceptWithFilter":47303,"CleanupAliases":13154,"GetCurrentDatabase":155827,"SchemaPruning":98186,"Analyzer$ResolveOutputRelation":3776,"BloomFilterJoinRule":48361,"Analyzer$ResolveRandomSeed":4369,"TypeCoercion$WindowFrameCoercion":5685,"ConstantFolding":63447,"TypeCoercion$DateTimeOperations":5330,"TypeCoercion$InConversion":6676,"FindDataSourceTable":4768,"SimplifyConditionals":66769,"DataSourceAnalysis":4680,"TypeCoercion$FunctionArgumentConversion":5557,"Analyzer$GlobalAggregates":3684,"Analyzer$LookupFunctions":21801,"CombineFilters":76190,"ReplaceDeduplicateWithAggregate":29437,"PreprocessTableInsertion":3461},"numRunsPerRule":{"PruneFileSourcePartitions":1,"ReassignLambdaVariableID":1,"PushPredicateThroughNonJoin":1,"Analyzer$HandleNullInputsForUDF":1,"Analyzer$ResolveSubqueryColumnAliases":1,"ResolveTimeZone":1,"Analyzer$ResolveNamespace":1,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":1,"RewriteCorrelatedScalarSubquery":2,"RemoveLiteralFromGroupExpressions":1,"PushProjectionThroughUnion":2,"EliminateSubqueryAliases":1,"ResolveCatalogs":1,"PushLeftSemiLeftAntiThroughJoin":2,"FlattenScalarSubqueriesWithAggregates":1,"LikeSimplification":2,"CollapseRepartition":2,"ResolveHints$ResolveCoalesceHints":1,"Analyzer$ExtractGenerator":1,"RewriteIntersectAll":1,"ResolveHints$ResolveJoinStrategyHints":1,"TypeCoercion$MapZipWithCoercion":1,"NullPropagation":2,"PullupCorrelatedPredicates":1,"UpdateOuterReferences":1,"ExtractPythonUDFs":1,"Analyzer$WindowsSubstitution":1,"CombineUnions":3,"ExtractGroupingPythonUDFFromAggregate":1,"ReorderAssociativeOperator":2,"CleanupDynamicPruningFilters":1,"ResolveHints$RemoveAllHints":1,"SimplifyBinaryComparison":2,"ResolveTableValuedFunctions":1,"EliminateSerialization":2,"TypeCoercion$BooleanEquality":1,"ReplaceIntersectWithSemiJoin":1,"ConstantPropagation":2,"CostBasedJoinReorder":1,"Analyzer$ResolveReferences":1,"CTESubstitution":1,"RemoveRedundantAliases":2,"TypeCoercion$ImplicitTypeCasts":1,"RewriteExceptAll":1,"UpdateAttributeNullability":1,"PropagateEmptyRelation":2,"SimplifyCasts":2,"EliminateMapObjects":1,"CombineLimits":2,"DetectAmbiguousSelfJoin":1,"ReplaceExpressions":1,"ResolveInlineTables":1,"OptimizeIn":2,"CollapseWindow":2,"TypeCoercion$IfCoercion":1,"ResolveSessionCatalog":1,"PartitionPruning":1,"BooleanSimplification":2,"TypeCoercion$PromoteStrings":1,"Analyzer$ResolveAliases":1,"DecimalAggregates":1,"PruneFilters":3,"Analyzer$ResolveMissingReferences":1,"TransposeWindow":2,"Analyzer$ResolveRelations":1,"EliminateUnions":1,"RewritePredicateSubquery":1,"ObjectSerializerPruning":1,"LimitPushDown":2,"SimplifyCaseConversionExpressions":2,"Analyzer$ResolveNaturalAndUsingJoin":1,"EliminateView":1,"CombineTypedFilters":1,"OptimizeLimitZero":1,"CheckCartesianProducts":2,"ExtractPythonUDFFromAggregate":1,"Analyzer$ExtractWindowExpressions":1,"ReplaceExceptWithAntiJoin":1,"ResolveLambdaVariables":1,"FallBackFileSourceV2":1,"Analyzer$ResolveTables":1,"SubstituteUnresolvedOrdinals":1,"TypeCoercion$CaseWhenCoercion":1,"DecimalPrecision":1,"EliminateSorts":1,"PushDownLeftSemiAntiJoin":2,"ExtractPythonUDFFromJoinCondition":1,"TypeCoercion$StackCoercion":1,"Analyzer$ResolveAggAliasInGroupBy":1,"TypeCoercion$StringLiteralCoercion":1,"FoldablePropagation":2,"V2ScanRelationPushDown":1,"EliminateDistinct":1,"InferFiltersFromConstraints":1,"Analyzer$PullOutNondeterministic":1,"Analyzer$ResolveFunctions":1,"ReplaceNullWithFalseInPredicate":2,"ResolveHigherOrderFunctions":1,"Analyzer$ResolvePivot":1,"CollapseProject":3,"Analyzer$ResolveNewInstance":1,"ColumnPruning":4,"Analyzer$ResolveWindowOrder":1,"TypeCoercion$ConcatCoercion":1,"PushDownPredicates":4,"TimeWindowing":1,"Optimizer$OptimizeSubqueries":3,"RewriteNonCorrelatedExists":1,"TypeCoercion$Division":1,"ComputeCurrentTime":1,"ResolveCreateNamedStruct":1,"TypeCoercion$EltCoercion":1,"ConvertToLocalRelation":2,"RemoveRepetitionFromGroupExpressions":1,"ReplaceDistinctWithAggregate":1,"PreprocessTableCreation":1,"ResolveSQLOnFile":1,"Analyzer$ResolveSubquery":1,"CombineConcats":2,"Analyzer$ResolveGroupingAnalytics":1,"Analyzer$ResolveBinaryArithmetic":1,"RemoveDispensableExpressions":2,"Analyzer$ResolveAlterTableChanges":1,"ResolveEncodersInScalaAgg":1,"TypeCoercion$IntegralDivision":1,"Analyzer$ResolveWindowFrame":1,"Analyzer$ResolveDeserializer":1,"RewriteDistinctAggregates":1,"RemoveNoopOperators":4,"Analyzer$ResolveAggregateFunctions":1,"NormalizeFloatingNumbers":1,"ReorderJoin":2,"Analyzer$ResolveUpCast":1,"Analyzer$ResolveGenerate":1,"TypeCoercion$WidenSetOperationTypes":1,"EliminateOuterJoin":2,"SimplifyExtractValueOps":2,"OptimizeMetadataOnlyQuery":1,"EliminateResolvedHint":1,"Analyzer$ResolveInsertInto":1,"ReplaceExceptWithFilter":1,"CleanupAliases":1,"GetCurrentDatabase":1,"SchemaPruning":1,"Analyzer$ResolveOutputRelation":1,"BloomFilterJoinRule":1,"Analyzer$ResolveRandomSeed":1,"TypeCoercion$WindowFrameCoercion":1,"ConstantFolding":2,"TypeCoercion$DateTimeOperations":1,"TypeCoercion$InConversion":1,"FindDataSourceTable":1,"SimplifyConditionals":2,"DataSourceAnalysis":1,"TypeCoercion$FunctionArgumentConversion":1,"Analyzer$GlobalAggregates":1,"Analyzer$LookupFunctions":1,"CombineFilters":3,"ReplaceDeduplicateWithAggregate":1,"PreprocessTableInsertion":1},"numEffectiveRunsPerRule":{},"timeEffectiveRunsPerRule":{},"counters":{},"timers":{}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":4,"time":1678162977276} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":5,"description":"save at NativeMethodAccessorImpl.java:0","details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, loglikelihood#565, prob_cs_given_lang#566, prob_lang_given_cs#567]\n +- Filter ((n_pages#545L >= cast(100 as bigint)) AND NOT Language#279 LIKE %,%)\n +- SubqueryAlias language_charset_loglikelihood\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (cast(2 as double) * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(cast(total_pages#546L as double) as double)))) + if ((n_pages_charset#547L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if ((n_pages_languages#548L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) cast(0 as double) else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / cast(((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)) as double))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(cast(n_pages_languages#548L as double) as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(cast(n_pages_charset#547L as double) as double)) AS prob_lang_given_cs#567]\n +- SubqueryAlias language_count_tmp\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, loglikelihood#565, prob_cs_given_lang#566, prob_lang_given_cs#567]\n +- Filter ((n_pages#545L >= cast(100 as bigint)) AND NOT Language#279 LIKE %,%)\n +- SubqueryAlias language_charset_loglikelihood\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (cast(2 as double) * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(cast(total_pages#546L as double) as double)))) + if ((n_pages_charset#547L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if ((n_pages_languages#548L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) cast(0 as double) else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / cast(((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)) as double))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(cast(n_pages_languages#548L as double) as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(cast(n_pages_charset#547L as double) as double)) AS prob_lang_given_cs#567]\n +- SubqueryAlias language_count_tmp\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (2.0 * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(total_pages#546L as double)))) + if ((n_pages_charset#547L = n_pages#545L)) 0.0 else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(total_pages#546L as double))))) + if ((n_pages_languages#548L = n_pages#545L)) 0.0 else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(total_pages#546L as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) 0.0 else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / ((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(n_pages_languages#548L as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(n_pages_charset#547L as double)) AS prob_lang_given_cs#567]\n +- Filter (n_pages#545L >= 100)\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Project [n_pages#545L, Language#279, Charset#311, _w2#558L, total_pages#546L, n_pages_charset#547L]\n +- Filter NOT Contains(Language#279, ,)\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Project [n_pages#545L, Language#279, Charset#311, _w1#557L, _w2#558L, total_pages#546L]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Project [content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- Filter (((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(content_languages#27)) AND isnotnull(content_charset#26))\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- AdaptiveSparkPlan isFinalPlan=false\n +- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true, 0\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (2.0 * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(total_pages#546L as double)))) + if ((n_pages_charset#547L = n_pages#545L)) 0.0 else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(total_pages#546L as double))))) + if ((n_pages_languages#548L = n_pages#545L)) 0.0 else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(total_pages#546L as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) 0.0 else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / ((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(n_pages_languages#548L as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(n_pages_charset#547L as double)) AS prob_lang_given_cs#567]\n +- Filter (n_pages#545L >= 100)\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Sort [Language#279 ASC NULLS FIRST], false, 0\n +- Project [n_pages#545L, Language#279, Charset#311, _w2#558L, total_pages#546L, n_pages_charset#547L]\n +- Filter NOT Contains(Language#279, ,)\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Sort [Charset#311 ASC NULLS FIRST], false, 0\n +- Project [n_pages#545L, Language#279, Charset#311, _w1#557L, _w2#558L, total_pages#546L]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Exchange SinglePartition, true, [id=#93]\n +- HashAggregate(keys=[Charset#311, Language#279], functions=[count(1)], output=[n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L])\n +- Exchange hashpartitioning(Charset#311, Language#279, 1000), true, [id=#90]\n +- HashAggregate(keys=[Charset#311, Language#279], functions=[partial_count(1)], output=[Charset#311, Language#279, count#1053L])\n +- Project [content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- Filter (((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(content_languages#27)) AND isnotnull(content_charset#26))\n +- InMemoryTableScan [content_charset#26, content_languages#27, subset#33], [isnotnull(subset#33), (subset#33 = warc), isnotnull(content_languages#27), isnotnull(content_charset#26)]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=false","children":[{"nodeName":"Sort","simpleString":"Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true, 0","children":[{"nodeName":"Project","simpleString":"Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (2.0 * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(total_pages#546L as double)))) + if ((n_pages_charset#547L = n_pages#545L)) 0.0 else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(total_pages#546L as double))))) + if ((n_pages_languages#548L = n_pages#545L)) 0.0 else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(total_pages#546L as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) 0.0 else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / ((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(n_pages_languages#548L as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(n_pages_charset#547L as double)) AS prob_lang_given_cs#567]","children":[{"nodeName":"Filter","simpleString":"Filter (n_pages#545L >= 100)","children":[{"nodeName":"Window","simpleString":"Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]","children":[{"nodeName":"Sort","simpleString":"Sort [Language#279 ASC NULLS FIRST], false, 0","children":[{"nodeName":"Project","simpleString":"Project [n_pages#545L, Language#279, Charset#311, _w2#558L, total_pages#546L, n_pages_charset#547L]","children":[{"nodeName":"Filter","simpleString":"Filter NOT Contains(Language#279, ,)","children":[{"nodeName":"Window","simpleString":"Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]","children":[{"nodeName":"Sort","simpleString":"Sort [Charset#311 ASC NULLS FIRST], false, 0","children":[{"nodeName":"Project","simpleString":"Project [n_pages#545L, Language#279, Charset#311, _w1#557L, _w2#558L, total_pages#546L]","children":[{"nodeName":"Window","simpleString":"Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]","children":[{"nodeName":"Exchange","simpleString":"Exchange SinglePartition, true, [id=#93]","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Charset#311, Language#279], functions=[count(1)])","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Charset#311, Language#279, 1000), true, [id=#90]","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Charset#311, Language#279], functions=[partial_count(1)])","children":[{"nodeName":"Project","simpleString":"Project [content_charset#26 AS Charset#311, content_languages#27 AS Language#279]","children":[{"nodeName":"Filter","simpleString":"Filter (((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(content_languages#27)) AND isnotnull(content_charset#26))","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [content_charset#26, content_languages#27, subset#33], [isnotnull(subset#33), (subset#33 = warc), isnotnull(content_languages#27), isnotnull(content_charset#26)]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":109,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":108,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":105,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":106,"metricType":"timing"},{"name":"peak memory","accumulatorId":104,"metricType":"size"},{"name":"number of output rows","accumulatorId":103,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":107,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":37,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":38,"metricType":"nsTiming"},{"name":"records read","accumulatorId":35,"metricType":"sum"},{"name":"local bytes read","accumulatorId":33,"metricType":"size"},{"name":"fetch wait time","accumulatorId":34,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":31,"metricType":"size"},{"name":"local blocks read","accumulatorId":30,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":29,"metricType":"sum"},{"name":"data size","accumulatorId":28,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":32,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":36,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":100,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":101,"metricType":"timing"},{"name":"peak memory","accumulatorId":99,"metricType":"size"},{"name":"number of output rows","accumulatorId":98,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":102,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":48,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":49,"metricType":"nsTiming"},{"name":"records read","accumulatorId":46,"metricType":"sum"},{"name":"local bytes read","accumulatorId":44,"metricType":"size"},{"name":"fetch wait time","accumulatorId":45,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":42,"metricType":"size"},{"name":"local blocks read","accumulatorId":41,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":40,"metricType":"sum"},{"name":"data size","accumulatorId":39,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":43,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":47,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":95,"metricType":"timing"},{"name":"peak memory","accumulatorId":96,"metricType":"size"},{"name":"spill size","accumulatorId":97,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":94,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":91,"metricType":"timing"},{"name":"peak memory","accumulatorId":92,"metricType":"size"},{"name":"spill size","accumulatorId":93,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":90,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":87,"metricType":"timing"},{"name":"peak memory","accumulatorId":88,"metricType":"size"},{"name":"spill size","accumulatorId":89,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":83,"metricType":"sum"},{"name":"written output","accumulatorId":84,"metricType":"size"},{"name":"number of output rows","accumulatorId":85,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":86,"metricType":"sum"}]},"time":1678162978214} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerEffectiveSQLConf","executionId":5,"effectiveSQLConf":{"spark.sql.adaptive.coalescePartitions.initialPartitionNum":"","spark.sql.streaming.disabledV2Writers":"","spark.sql.streaming.noDataMicroBatches.enabled":"true","spark.sql.files.maxPartitionBytes":"128MB","spark.sql.thriftserver.ui.retainedStatements":"200","spark.sql.ui.retainedExecutions":"1000","spark.sql.execution.arrow.pyspark.fallback.enabled":"","spark.sql.sources.parallelPartitionDiscovery.threshold":"32","spark.sql.cbo.joinReorder.enabled":"false","spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.int96AsTimestamp":"true","spark.sql.thriftserver.ui.retainedSessions":"200","spark.sql.hive.verifyPartitionPath":"false","spark.sql.autoBroadcastJoinThreshold":"10MB","spark.sql.orc.mergeSchema":"false","spark.sql.adaptive.enabled":"true","spark.sql.adaptive.skewJoin.enabled":"true","spark.sql.optimizer.sizeBasedJoinReorder.enabled":"true","spark.sql.streaming.ui.retainedQueries":"100","spark.sql.orc.enableVectorizedReader":"true","spark.sql.streaming.continuous.epochBacklogQueueSize":"10000","spark.sql.parquet.binaryAsString":"false","spark.sql.pivotMaxValues":"10000","spark.sql.adaptive.localShuffleReader.enabled":"true","spark.sql.storeAssignmentPolicy":"ANSI","spark.sql.redaction.options.regex":"(?i)url","spark.sql.cbo.joinReorder.dp.star.filter":"false","spark.sql.parser.quotedRegexColumnNames":"false","spark.sql.files.ignoreMissingFiles":"false","spark.sql.streaming.ui.enabled":"true","spark.sql.execution.arrow.sparkr.enabled":"false","spark.sql.orc.filterPushdown":"true","spark.sql.thriftserver.scheduler.pool":"","spark.sql.catalog.spark_catalog":"","spark.sql.datetime.java8API.enabled":"false","spark.sql.execution.pandas.udf.buffer.size":"","spark.sql.function.eltOutputAsString":"false","spark.sql.cbo.joinReorder.dp.threshold":"12","spark.sql.statsImprovements.enabled":"true","spark.sql.parquet.columnarReaderBatchSize":"4096","spark.sql.parquet.outputTimestampType":"INT96","spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes":"256MB","spark.sql.statistics.size.autoUpdate.enabled":"false","spark.sql.hive.filesourcePartitionFileCacheSize":"262144000","spark.sql.streaming.ui.retainedProgressUpdates":"100","spark.sql.inMemoryColumnarStorage.compressed":"true","spark.sql.pyspark.jvmStacktrace.enabled":"false","spark.sql.cbo.starSchemaDetection":"false","spark.sql.columnNameOfCorruptRecord":"_corrupt_record","spark.sql.adaptive.advisoryPartitionSizeInBytes":"","spark.sql.avro.compression.codec":"snappy","spark.sql.streaming.metricsEnabled":"false","spark.sql.ui.pruneCachedInMemoryRelation":"true","spark.sql.event.truncate.length":"2147483647","spark.sql.groupByAliases":"true","spark.sql.hive.metastorePartitionPruning":"true","spark.sql.execution.arrow.enabled":"false","spark.sql.optimizer.dynamicPartitionPruning.enabled":"true","spark.sql.extendedEventInfo":"true","spark.sql.sources.bucketing.maxBuckets":"100000","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.jsonGenerator.ignoreNullFields":"true","spark.sql.optimizer.flattenScalarSubqueriesWithAggregates.enabled":"true","spark.sql.debug.maxToStringFields":"25","spark.sql.files.maxRecordsPerFile":"0","spark.sql.hive.thriftServer.singleSession":"false","spark.sql.parquet.mergeSchema":"false","spark.sql.adaptive.skewJoin.skewedPartitionFactor":"5","spark.sql.queryExecutionListeners":"","spark.sql.streaming.numRecentProgressUpdates":"100","spark.sql.variable.substitute":"true","spark.sql.adaptive.shuffle.mapOutputStats.useDataSize":"true","spark.sql.function.concatBinaryAsString":"false","spark.sql.avro.deflate.level":"-1","spark.sql.repl.eagerEval.maxNumRows":"20","spark.sql.ansi.enabled":"false","spark.sql.legacy.allowHashOnMapType":"false","spark.sql.bloomFilterJoin.maxFilterBytes":"2097152","spark.sql.orderByOrdinal":"true","spark.sql.streaming.stopTimeout":"0","spark.sql.adaptive.coalescePartitions.enabled":"true","spark.sql.session.timeZone":"UTC","spark.sql.parquet.respectSummaryFiles":"false","spark.sql.parquet.enableVectorizedReader":"true","spark.sql.cbo.enabled":"false","spark.sql.sources.bucketing.enabled":"true","spark.sql.csv.filterPushdown.enabled":"true","spark.sql.repl.eagerEval.truncate":"20","spark.sql.cbo.planStats.enabled":"false","spark.sql.optimizer.excludedRules":"","spark.sql.sources.partitionColumnTypeInference.enabled":"true","spark.sql.sortMergeJoinExec.extendedCodegen.enabled":"true","spark.sql.execution.arrow.fallback.enabled":"true","spark.sql.streaming.forceDeleteTempCheckpointLocation":"false","spark.sql.maxPlanStringLength":"2147483632","spark.sql.hive.manageFilesourcePartitions":"true","spark.sql.parquet.int96TimestampConversion":"false","spark.sql.sources.partitionOverwriteMode":"STATIC","spark.sql.streaming.checkpointLocation":"","spark.sql.broadcastTimeout":"300","spark.sql.execution.arrow.pyspark.enabled":"","spark.sql.repl.eagerEval.enabled":"false","spark.sql.mapKeyDedupPolicy":"EXCEPTION","spark.sql.parquet.filterPushdown":"true","spark.sql.maven.additionalRemoteRepositories":"https://maven-central.storage-download.googleapis.com/maven2/","spark.sql.orc.compression.codec":"snappy","spark.sql.statistics.histogram.enabled":"false","spark.sql.streaming.stopActiveRunOnRestart":"true","spark.sql.orc.columnarReaderBatchSize":"4096","spark.sql.redaction.string.regex":"","spark.sql.inMemoryColumnarStorage.batchSize":"10000","spark.sql.parquet.writeLegacyFormat":"false","spark.sql.statistics.fallBackToHdfs":"false","spark.sql.defaultCatalog":"spark_catalog","spark.sql.streaming.fileSource.cleaner.numThreads":"1","spark.sql.legacy.sessionInitWithConfigDefaults":"false","spark.sql.adaptive.coalescePartitions.minPartitionNum":"","spark.sql.execution.arrow.maxRecordsPerBatch":"10000","spark.sql.stringHashComputationWithoutCopyMemory.enabled":"true","spark.sql.streaming.streamingQueryListeners":"","spark.sql.files.ignoreCorruptFiles":"false","spark.sql.inMemoryColumnarStorage.enableVectorizedReader":"true","spark.sql.extensions":"","spark.sql.sources.default":"parquet","spark.sql.ui.extendedInfo":"true","spark.sql.bloomFilterJoin.enabled":"true","spark.sql.parquet.compression.codec":"snappy","spark.sql.optimizer.distinctBeforeIntersect.enabled":"true","spark.sql.parquet.recordLevelFilter.enabled":"false","spark.sql.shuffle.partitions":"200","spark.sql.groupByOrdinal":"true"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate","executionId":5,"physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, loglikelihood#565, prob_cs_given_lang#566, prob_lang_given_cs#567]\n +- Filter ((n_pages#545L >= cast(100 as bigint)) AND NOT Language#279 LIKE %,%)\n +- SubqueryAlias language_charset_loglikelihood\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (cast(2 as double) * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(cast(total_pages#546L as double) as double)))) + if ((n_pages_charset#547L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if ((n_pages_languages#548L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) cast(0 as double) else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / cast(((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)) as double))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(cast(n_pages_languages#548L as double) as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(cast(n_pages_charset#547L as double) as double)) AS prob_lang_given_cs#567]\n +- SubqueryAlias language_count_tmp\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, loglikelihood#565, prob_cs_given_lang#566, prob_lang_given_cs#567]\n +- Filter ((n_pages#545L >= cast(100 as bigint)) AND NOT Language#279 LIKE %,%)\n +- SubqueryAlias language_charset_loglikelihood\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (cast(2 as double) * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(cast(total_pages#546L as double) as double)))) + if ((n_pages_charset#547L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if ((n_pages_languages#548L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) cast(0 as double) else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / cast(((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)) as double))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(cast(n_pages_languages#548L as double) as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(cast(n_pages_charset#547L as double) as double)) AS prob_lang_given_cs#567]\n +- SubqueryAlias language_count_tmp\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (2.0 * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(total_pages#546L as double)))) + if ((n_pages_charset#547L = n_pages#545L)) 0.0 else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(total_pages#546L as double))))) + if ((n_pages_languages#548L = n_pages#545L)) 0.0 else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(total_pages#546L as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) 0.0 else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / ((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(n_pages_languages#548L as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(n_pages_charset#547L as double)) AS prob_lang_given_cs#567]\n +- Filter (n_pages#545L >= 100)\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Project [n_pages#545L, Language#279, Charset#311, _w2#558L, total_pages#546L, n_pages_charset#547L]\n +- Filter NOT Contains(Language#279, ,)\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Project [n_pages#545L, Language#279, Charset#311, _w1#557L, _w2#558L, total_pages#546L]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Project [content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- Filter (((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(content_languages#27)) AND isnotnull(content_charset#26))\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- AdaptiveSparkPlan isFinalPlan=false\n +- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true, 0\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (2.0 * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(total_pages#546L as double)))) + if ((n_pages_charset#547L = n_pages#545L)) 0.0 else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(total_pages#546L as double))))) + if ((n_pages_languages#548L = n_pages#545L)) 0.0 else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(total_pages#546L as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) 0.0 else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / ((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(n_pages_languages#548L as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(n_pages_charset#547L as double)) AS prob_lang_given_cs#567]\n +- Filter (n_pages#545L >= 100)\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Sort [Language#279 ASC NULLS FIRST], false, 0\n +- Project [n_pages#545L, Language#279, Charset#311, _w2#558L, total_pages#546L, n_pages_charset#547L]\n +- Filter NOT Contains(Language#279, ,)\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Sort [Charset#311 ASC NULLS FIRST], false, 0\n +- Project [n_pages#545L, Language#279, Charset#311, _w1#557L, _w2#558L, total_pages#546L]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Exchange SinglePartition, true, [id=#129]\n +- HashAggregate(keys=[Charset#311, Language#279], functions=[count(1)], output=[n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L])\n +- ShuffleQueryStage 0\n +- Exchange hashpartitioning(Charset#311, Language#279, 1000), true, [id=#126]\n +- *(1) HashAggregate(keys=[Charset#311, Language#279], functions=[partial_count(1)], output=[Charset#311, Language#279, count#1053L])\n +- *(1) Project [content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- *(1) Filter (((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(content_languages#27)) AND isnotnull(content_charset#26))\n +- InMemoryTableScan [content_charset#26, content_languages#27, subset#33], [isnotnull(subset#33), (subset#33 = warc), isnotnull(content_languages#27), isnotnull(content_charset#26)]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=false","children":[{"nodeName":"Sort","simpleString":"Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true, 0","children":[{"nodeName":"Project","simpleString":"Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (2.0 * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(total_pages#546L as double)))) + if ((n_pages_charset#547L = n_pages#545L)) 0.0 else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(total_pages#546L as double))))) + if ((n_pages_languages#548L = n_pages#545L)) 0.0 else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(total_pages#546L as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) 0.0 else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / ((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(n_pages_languages#548L as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(n_pages_charset#547L as double)) AS prob_lang_given_cs#567]","children":[{"nodeName":"Filter","simpleString":"Filter (n_pages#545L >= 100)","children":[{"nodeName":"Window","simpleString":"Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]","children":[{"nodeName":"Sort","simpleString":"Sort [Language#279 ASC NULLS FIRST], false, 0","children":[{"nodeName":"Project","simpleString":"Project [n_pages#545L, Language#279, Charset#311, _w2#558L, total_pages#546L, n_pages_charset#547L]","children":[{"nodeName":"Filter","simpleString":"Filter NOT Contains(Language#279, ,)","children":[{"nodeName":"Window","simpleString":"Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]","children":[{"nodeName":"Sort","simpleString":"Sort [Charset#311 ASC NULLS FIRST], false, 0","children":[{"nodeName":"Project","simpleString":"Project [n_pages#545L, Language#279, Charset#311, _w1#557L, _w2#558L, total_pages#546L]","children":[{"nodeName":"Window","simpleString":"Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]","children":[{"nodeName":"Exchange","simpleString":"Exchange SinglePartition, true, [id=#129]","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Charset#311, Language#279], functions=[count(1)])","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 0","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Charset#311, Language#279, 1000), true, [id=#126]","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Charset#311, Language#279], functions=[partial_count(1)])","children":[{"nodeName":"Project","simpleString":"Project [content_charset#26 AS Charset#311, content_languages#27 AS Language#279]","children":[{"nodeName":"Filter","simpleString":"Filter (((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(content_languages#27)) AND isnotnull(content_charset#26))","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [content_charset#26, content_languages#27, subset#33], [isnotnull(subset#33), (subset#33 = warc), isnotnull(content_languages#27), isnotnull(content_charset#26)]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":109,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":166,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":163,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":164,"metricType":"timing"},{"name":"peak memory","accumulatorId":162,"metricType":"size"},{"name":"number of output rows","accumulatorId":161,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":165,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":160,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":131,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":132,"metricType":"nsTiming"},{"name":"records read","accumulatorId":129,"metricType":"sum"},{"name":"local bytes read","accumulatorId":127,"metricType":"size"},{"name":"fetch wait time","accumulatorId":128,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":125,"metricType":"size"},{"name":"local blocks read","accumulatorId":124,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":123,"metricType":"sum"},{"name":"data size","accumulatorId":122,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":126,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":130,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":157,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":158,"metricType":"timing"},{"name":"peak memory","accumulatorId":156,"metricType":"size"},{"name":"number of output rows","accumulatorId":155,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":159,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":142,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":143,"metricType":"nsTiming"},{"name":"records read","accumulatorId":140,"metricType":"sum"},{"name":"local bytes read","accumulatorId":138,"metricType":"size"},{"name":"fetch wait time","accumulatorId":139,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":136,"metricType":"size"},{"name":"local blocks read","accumulatorId":135,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":134,"metricType":"sum"},{"name":"data size","accumulatorId":133,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":137,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":141,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":152,"metricType":"timing"},{"name":"peak memory","accumulatorId":153,"metricType":"size"},{"name":"spill size","accumulatorId":154,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":151,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":148,"metricType":"timing"},{"name":"peak memory","accumulatorId":149,"metricType":"size"},{"name":"spill size","accumulatorId":150,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":147,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":144,"metricType":"timing"},{"name":"peak memory","accumulatorId":145,"metricType":"size"},{"name":"spill size","accumulatorId":146,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":83,"metricType":"sum"},{"name":"written output","accumulatorId":84,"metricType":"size"},{"name":"number of output rows","accumulatorId":85,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":86,"metricType":"sum"}]}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerDriverAccumUpdates","executionId":5,"accumUpdates":[[115,5],[116,4],[117,1090354548]]} -{"Event":"SparkListenerJobStart","Job ID":1,"Submission Time":1678162979920,"Stage Infos":[{"Stage ID":1,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":10,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"7\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[9],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":9,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"8\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[8],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":8,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[7],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":7,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]}],"Stage IDs":[1],"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"7\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"5","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":1,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":10,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"7\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[9],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":9,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"8\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[8],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":8,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[7],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":7,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162979931,"Accumulables":[]},"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"7\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"5","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":1,"Index":0,"Attempt":0,"Launch Time":1678162979998,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":2,"Index":1,"Attempt":0,"Launch Time":1678162980008,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":3,"Index":2,"Attempt":0,"Launch Time":1678162980011,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":4,"Index":3,"Attempt":0,"Launch Time":1678162980012,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":5,"Index":4,"Attempt":0,"Launch Time":1678162980013,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":6,"Index":5,"Attempt":0,"Launch Time":1678162980014,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":7,"Index":6,"Attempt":0,"Launch Time":1678162980015,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":8,"Index":7,"Attempt":0,"Launch Time":1678162980017,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":9,"Index":8,"Attempt":0,"Launch Time":1678162980019,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":10,"Index":9,"Attempt":0,"Launch Time":1678162980020,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":11,"Index":10,"Attempt":0,"Launch Time":1678162980022,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":12,"Index":11,"Attempt":0,"Launch Time":1678162980023,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":13,"Index":12,"Attempt":0,"Launch Time":1678162980025,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":14,"Index":13,"Attempt":0,"Launch Time":1678162980026,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":15,"Index":14,"Attempt":0,"Launch Time":1678162980030,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":16,"Index":15,"Attempt":0,"Launch Time":1678162980031,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":17,"Index":16,"Attempt":0,"Launch Time":1678162980032,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":18,"Index":17,"Attempt":0,"Launch Time":1678162980035,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":19,"Index":18,"Attempt":0,"Launch Time":1678162980036,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":20,"Index":19,"Attempt":0,"Launch Time":1678162980037,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":21,"Index":20,"Attempt":0,"Launch Time":1678162980039,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":22,"Index":21,"Attempt":0,"Launch Time":1678162980041,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":23,"Index":22,"Attempt":0,"Launch Time":1678162980042,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":24,"Index":23,"Attempt":0,"Launch Time":1678162980043,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":25,"Index":24,"Attempt":0,"Launch Time":1678162980045,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":26,"Index":25,"Attempt":0,"Launch Time":1678162980046,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":27,"Index":26,"Attempt":0,"Launch Time":1678162980048,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":28,"Index":27,"Attempt":0,"Launch Time":1678162980049,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":29,"Index":28,"Attempt":0,"Launch Time":1678162980050,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":30,"Index":29,"Attempt":0,"Launch Time":1678162980052,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":31,"Index":30,"Attempt":0,"Launch Time":1678162980054,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":32,"Index":31,"Attempt":0,"Launch Time":1678162980055,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":1,"Stage Attempt ID":0,"Task Info":{"Task ID":33,"Index":32,"Attempt":0,"Launch Time":1678162982742,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":13,"Index":12,"Attempt":0,"Launch Time":1678162980025,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162982748,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"249","Value":"249","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"2","Value":"2","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"70","Value":"70","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"50","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":14145,"Value":14145,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":294912,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":128,"Value":128,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":3809,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":766099159,"Value":766099159,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2181,"Value":2181,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":98930502,"Value":98930502,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":480,"Value":480,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":480,"Executor Deserialize CPU Time":98930502,"Executor Run Time":2181,"Executor CPU Time":766099159,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":128,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":14145,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":21,"Index":20,"Attempt":0,"Launch Time":1678162980039,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162982750,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"252","Value":"501","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"11","Value":"13","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"2","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"57","Value":"127","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"54","Value":"104","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13311,"Value":27456,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":589824,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":128,"Value":256,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":7618,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":207903279,"Value":974002438,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2182,"Value":4363,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":226997291,"Value":325927793,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":476,"Value":956,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":476,"Executor Deserialize CPU Time":226997291,"Executor Run Time":2182,"Executor CPU Time":207903279,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":128,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13311,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":29,"Index":28,"Attempt":0,"Launch Time":1678162980050,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162982753,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"249","Value":"750","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"786432","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"7","Value":"20","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"70","Value":"197","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"48","Value":"152","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":14145,"Value":41601,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":884736,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":128,"Value":384,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":11427,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":606057407,"Value":1580059845,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2197,"Value":6560,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":96377652,"Value":422305445,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":465,"Value":1421,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":465,"Executor Deserialize CPU Time":96377652,"Executor Run Time":2197,"Executor CPU Time":606057407,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":128,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":14145,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":33,"Index":32,"Attempt":0,"Launch Time":1678162982742,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162982916,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"20","Value":"770","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"1048576","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"20","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"4","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"112","Value":"309","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"112","Value":"264","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":14746,"Value":56347,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1179648,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3766,"Value":15193,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":30744569,"Value":1610804414,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":158,"Value":6718,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6280822,"Value":428586267,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":7,"Value":1428,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":390886176,"JVMOffHeapMemory":113641848,"OnHeapExecutionMemory":100794368,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":1712433,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":102506801,"OffHeapUnifiedMemory":0,"DirectPoolMemory":181850,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7154511872,"ProcessTreeJVMRSSMemory":1206054912,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":8,"MinorGCTime":192,"MajorGCCount":3,"MajorGCTime":220},"Task Metrics":{"Executor Deserialize Time":7,"Executor Deserialize CPU Time":6280822,"Executor Run Time":158,"Executor CPU Time":30744569,"Peak Execution Memory":294912,"Result Size":3766,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":14746,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":26,"Index":25,"Attempt":0,"Launch Time":1678162980046,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984247,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"150","Value":"920","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"1310720","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"4","Value":"24","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"5","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1140","Value":"1449","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1137","Value":"1401","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13405,"Value":69752,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1474560,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":223,"Value":607,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":19002,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":292778735,"Value":1903583149,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2830,"Value":9548,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":215106889,"Value":643693156,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1320,"Value":2748,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1320,"Executor Deserialize CPU Time":215106889,"Executor Run Time":2830,"Executor CPU Time":292778735,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":223,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13405,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":2,"Index":1,"Attempt":0,"Launch Time":1678162980008,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984248,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"150","Value":"1070","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"1572864","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"5","Value":"29","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1140","Value":"2589","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1137","Value":"2538","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":14746,"Value":84498,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1769472,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":223,"Value":830,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":22811,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":785092723,"Value":2688675872,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2841,"Value":12389,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":309626858,"Value":953320014,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1321,"Value":4069,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1321,"Executor Deserialize CPU Time":309626858,"Executor Run Time":2841,"Executor CPU Time":785092723,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":223,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":14746,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":4,"Index":3,"Attempt":0,"Launch Time":1678162980012,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984349,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"162","Value":"1232","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"1835008","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"6","Value":"35","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"989","Value":"3578","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"986","Value":"3524","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":14746,"Value":99244,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2064384,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":260,"Value":1090,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":26620,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":415924911,"Value":3104600783,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2754,"Value":15143,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":368314331,"Value":1321634345,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1495,"Value":5564,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1495,"Executor Deserialize CPU Time":368314331,"Executor Run Time":2754,"Executor CPU Time":415924911,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":260,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":14746,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":28,"Index":27,"Attempt":0,"Launch Time":1678162980049,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984349,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"163","Value":"1395","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"2097152","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"7","Value":"42","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"8","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"989","Value":"4567","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"984","Value":"4508","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":14746,"Value":113990,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2359296,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":260,"Value":1350,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":30429,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":1001089760,"Value":4105690543,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2751,"Value":17894,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":212652740,"Value":1534287085,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1499,"Value":7063,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1499,"Executor Deserialize CPU Time":212652740,"Executor Run Time":2751,"Executor CPU Time":1001089760,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":260,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":14746,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":25,"Index":24,"Attempt":0,"Launch Time":1678162980045,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984364,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"143","Value":"1538","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"2359296","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"5","Value":"47","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"9","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"983","Value":"5550","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"980","Value":"5488","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13311,"Value":127301,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2654208,"Internal":true,"Count Failed Values":true},{"ID":173,"Name":"internal.metrics.resultSerializationTime","Update":2,"Value":2,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":291,"Value":1641,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":34281,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":342278729,"Value":4447969272,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2770,"Value":20664,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":321868716,"Value":1856155801,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1494,"Value":8557,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1494,"Executor Deserialize CPU Time":321868716,"Executor Run Time":2770,"Executor CPU Time":342278729,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":291,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13311,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":1,"Index":0,"Attempt":0,"Launch Time":1678162979998,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984365,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"143","Value":"1681","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"2621440","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"47","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"10","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"983","Value":"6533","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"980","Value":"6468","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":14145,"Value":141446,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2949120,"Internal":true,"Count Failed Values":true},{"ID":173,"Name":"internal.metrics.resultSerializationTime","Update":2,"Value":4,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":291,"Value":1932,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":38133,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":228058736,"Value":4676028008,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2771,"Value":23435,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":184976661,"Value":2041132462,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1494,"Value":10051,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1494,"Executor Deserialize CPU Time":184976661,"Executor Run Time":2771,"Executor CPU Time":228058736,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":291,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":14145,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":9,"Index":8,"Attempt":0,"Launch Time":1678162980019,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984376,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"143","Value":"1824","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"2883584","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"47","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"11","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"983","Value":"7516","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"980","Value":"7448","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13297,"Value":154743,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3244032,"Internal":true,"Count Failed Values":true},{"ID":173,"Name":"internal.metrics.resultSerializationTime","Update":1,"Value":5,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":291,"Value":2223,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":41985,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":1070307064,"Value":5746335072,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2777,"Value":26212,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":445887440,"Value":2487019902,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1488,"Value":11539,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1488,"Executor Deserialize CPU Time":445887440,"Executor Run Time":2777,"Executor CPU Time":1070307064,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":291,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13297,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":3,"Index":2,"Attempt":0,"Launch Time":1678162980011,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984466,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"197","Value":"2021","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"3145728","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"4","Value":"51","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"12","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1053","Value":"8569","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1050","Value":"8498","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13297,"Value":168040,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3538944,"Internal":true,"Count Failed Values":true},{"ID":173,"Name":"internal.metrics.resultSerializationTime","Update":1,"Value":6,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":264,"Value":2487,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":45837,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":1141720052,"Value":6888055124,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2904,"Value":29116,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":317476676,"Value":2804496578,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1461,"Value":13000,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1461,"Executor Deserialize CPU Time":317476676,"Executor Run Time":2904,"Executor CPU Time":1141720052,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":264,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13297,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":11,"Index":10,"Attempt":0,"Launch Time":1678162980022,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984469,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"197","Value":"2218","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"3407872","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"4","Value":"55","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"13","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1052","Value":"9621","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1050","Value":"9548","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13297,"Value":181337,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3833856,"Internal":true,"Count Failed Values":true},{"ID":173,"Name":"internal.metrics.resultSerializationTime","Update":1,"Value":7,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":264,"Value":2751,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":49689,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":242581006,"Value":7130636130,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2903,"Value":32019,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":207123400,"Value":3011619978,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1462,"Value":14462,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1462,"Executor Deserialize CPU Time":207123400,"Executor Run Time":2903,"Executor CPU Time":242581006,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":264,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13297,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":19,"Index":18,"Attempt":0,"Launch Time":1678162980036,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984472,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"197","Value":"2415","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"3670016","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"4","Value":"59","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"14","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1059","Value":"10680","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1056","Value":"10604","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13405,"Value":194742,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4128768,"Internal":true,"Count Failed Values":true},{"ID":173,"Name":"internal.metrics.resultSerializationTime","Update":1,"Value":8,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":264,"Value":3015,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":53541,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":308774930,"Value":7439411060,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2903,"Value":34922,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":259126857,"Value":3270746835,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1462,"Value":15924,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1462,"Executor Deserialize CPU Time":259126857,"Executor Run Time":2903,"Executor CPU Time":308774930,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":264,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13405,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":24,"Index":23,"Attempt":0,"Launch Time":1678162980043,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984495,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"165","Value":"2580","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"3932160","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"5","Value":"64","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"15","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1011","Value":"11691","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1009","Value":"11613","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13405,"Value":208147,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4423680,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":254,"Value":3269,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":57350,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":219182581,"Value":7658593641,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2880,"Value":37802,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":250025131,"Value":3520771966,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1497,"Value":17421,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":526178872,"JVMOffHeapMemory":102106288,"OnHeapExecutionMemory":131072,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":663903,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":794975,"OffHeapUnifiedMemory":0,"DirectPoolMemory":10540,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7072075776,"ProcessTreeJVMRSSMemory":828153856,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":6,"MinorGCTime":89,"MajorGCCount":3,"MajorGCTime":305},"Task Metrics":{"Executor Deserialize Time":1497,"Executor Deserialize CPU Time":250025131,"Executor Run Time":2880,"Executor CPU Time":219182581,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":254,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13405,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":8,"Index":7,"Attempt":0,"Launch Time":1678162980017,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984497,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"155","Value":"2735","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"4194304","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"5","Value":"69","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"16","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1011","Value":"12702","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1009","Value":"12622","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":14145,"Value":222292,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4718592,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":254,"Value":3523,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":61159,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":764049657,"Value":8422643298,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2877,"Value":40679,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":216945390,"Value":3737717356,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1503,"Value":18924,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":526178872,"JVMOffHeapMemory":102106288,"OnHeapExecutionMemory":131072,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":663903,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":794975,"OffHeapUnifiedMemory":0,"DirectPoolMemory":10540,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7072075776,"ProcessTreeJVMRSSMemory":828153856,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":6,"MinorGCTime":89,"MajorGCCount":3,"MajorGCTime":305},"Task Metrics":{"Executor Deserialize Time":1503,"Executor Deserialize CPU Time":216945390,"Executor Run Time":2877,"Executor CPU Time":764049657,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":254,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":14145,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":32,"Index":31,"Attempt":0,"Launch Time":1678162980055,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984499,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"168","Value":"2903","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"4456448","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"5","Value":"74","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"17","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1010","Value":"13712","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1008","Value":"13630","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13311,"Value":235603,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5013504,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":254,"Value":3777,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":64968,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":542525408,"Value":8965168706,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2875,"Value":43554,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":399029227,"Value":4136746583,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1501,"Value":20425,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":526178872,"JVMOffHeapMemory":102106288,"OnHeapExecutionMemory":131072,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":663903,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":794975,"OffHeapUnifiedMemory":0,"DirectPoolMemory":10540,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7072075776,"ProcessTreeJVMRSSMemory":828153856,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":6,"MinorGCTime":89,"MajorGCCount":3,"MajorGCTime":305},"Task Metrics":{"Executor Deserialize Time":1501,"Executor Deserialize CPU Time":399029227,"Executor Run Time":2875,"Executor CPU Time":542525408,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":254,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13311,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":22,"Index":21,"Attempt":0,"Launch Time":1678162980041,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984517,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"145","Value":"3048","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"4","Value":"78","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"18","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1136","Value":"14848","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1133","Value":"14763","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":14746,"Value":250349,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5308416,"Internal":true,"Count Failed Values":true},{"ID":173,"Name":"internal.metrics.resultSerializationTime","Update":1,"Value":9,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":264,"Value":4041,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":68820,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":419171962,"Value":9384340668,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2887,"Value":46441,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":392112202,"Value":4528858785,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1514,"Value":21939,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":141939104,"JVMOffHeapMemory":72113424,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":107290,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":107290,"OffHeapUnifiedMemory":0,"DirectPoolMemory":10540,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7042990080,"ProcessTreeJVMRSSMemory":626630656,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":5,"MinorGCTime":61,"MajorGCCount":2,"MajorGCTime":149},"Task Metrics":{"Executor Deserialize Time":1514,"Executor Deserialize CPU Time":392112202,"Executor Run Time":2887,"Executor CPU Time":419171962,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":264,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":14746,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":16,"Index":15,"Attempt":0,"Launch Time":1678162980031,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984518,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"165","Value":"3213","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"4980736","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"6","Value":"84","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"19","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1010","Value":"15858","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1008","Value":"15771","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13311,"Value":263660,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5603328,"Internal":true,"Count Failed Values":true},{"ID":173,"Name":"internal.metrics.resultSerializationTime","Update":1,"Value":10,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":254,"Value":4295,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":72672,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":293076137,"Value":9677416805,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2874,"Value":49315,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":263297650,"Value":4792156435,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1497,"Value":23436,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":526178872,"JVMOffHeapMemory":102106288,"OnHeapExecutionMemory":131072,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":663903,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":794975,"OffHeapUnifiedMemory":0,"DirectPoolMemory":10540,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7072075776,"ProcessTreeJVMRSSMemory":828153856,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":6,"MinorGCTime":89,"MajorGCCount":3,"MajorGCTime":305},"Task Metrics":{"Executor Deserialize Time":1497,"Executor Deserialize CPU Time":263297650,"Executor Run Time":2874,"Executor CPU Time":293076137,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":254,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13311,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":30,"Index":29,"Attempt":0,"Launch Time":1678162980052,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984519,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"144","Value":"3357","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"5242880","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"2","Value":"86","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"20","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1140","Value":"16998","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1138","Value":"16909","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":26702,"Value":290362,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5898240,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":264,"Value":4559,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":76481,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":687384885,"Value":10364801690,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2887,"Value":52202,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":342520347,"Value":5134676782,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1514,"Value":24950,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":141939104,"JVMOffHeapMemory":72113424,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":107290,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":107290,"OffHeapUnifiedMemory":0,"DirectPoolMemory":10540,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7042990080,"ProcessTreeJVMRSSMemory":626630656,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":5,"MinorGCTime":61,"MajorGCCount":2,"MajorGCTime":149},"Task Metrics":{"Executor Deserialize Time":1514,"Executor Deserialize CPU Time":342520347,"Executor Run Time":2887,"Executor CPU Time":687384885,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":264,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":26702,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":7,"Index":6,"Attempt":0,"Launch Time":1678162980015,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984579,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"157","Value":"3514","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"5505024","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"4","Value":"90","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"21","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1105","Value":"18103","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1101","Value":"18010","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13311,"Value":303673,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6193152,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":259,"Value":4818,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":80290,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":248267655,"Value":10613069345,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2920,"Value":55122,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":207027837,"Value":5341704619,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1537,"Value":26487,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":348482528,"JVMOffHeapMemory":107273224,"OnHeapExecutionMemory":131072,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":1712433,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":1843505,"OffHeapUnifiedMemory":0,"DirectPoolMemory":10540,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7081742336,"ProcessTreeJVMRSSMemory":852627456,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":7,"MinorGCTime":134,"MajorGCCount":3,"MajorGCTime":367},"Task Metrics":{"Executor Deserialize Time":1537,"Executor Deserialize CPU Time":207027837,"Executor Run Time":2920,"Executor CPU Time":248267655,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":259,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13311,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":15,"Index":14,"Attempt":0,"Launch Time":1678162980030,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984579,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"157","Value":"3671","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"5767168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"4","Value":"94","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"22","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1105","Value":"19208","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1101","Value":"19111","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13297,"Value":316970,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6488064,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":259,"Value":5077,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":84099,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":753349708,"Value":11366419053,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2923,"Value":58045,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":213345547,"Value":5555050166,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1527,"Value":28014,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":348482528,"JVMOffHeapMemory":107273224,"OnHeapExecutionMemory":131072,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":1712433,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":1843505,"OffHeapUnifiedMemory":0,"DirectPoolMemory":10540,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7081742336,"ProcessTreeJVMRSSMemory":852627456,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":7,"MinorGCTime":134,"MajorGCCount":3,"MajorGCTime":367},"Task Metrics":{"Executor Deserialize Time":1527,"Executor Deserialize CPU Time":213345547,"Executor Run Time":2923,"Executor CPU Time":753349708,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":259,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13297,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":31,"Index":30,"Attempt":0,"Launch Time":1678162980054,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162984580,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"157","Value":"3828","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"6029312","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"4","Value":"98","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"23","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"1105","Value":"20313","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"1101","Value":"20212","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":13405,"Value":330375,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6782976,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":259,"Value":5336,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3809,"Value":87908,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":692206155,"Value":12058625208,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":2920,"Value":60965,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":511768345,"Value":6066818511,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1526,"Value":29540,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":348482528,"JVMOffHeapMemory":107273224,"OnHeapExecutionMemory":131072,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":1712433,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":1843505,"OffHeapUnifiedMemory":0,"DirectPoolMemory":10540,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7081742336,"ProcessTreeJVMRSSMemory":852627456,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":7,"MinorGCTime":134,"MajorGCCount":3,"MajorGCTime":367},"Task Metrics":{"Executor Deserialize Time":1526,"Executor Deserialize CPU Time":511768345,"Executor Run Time":2920,"Executor CPU Time":692206155,"Peak Execution Memory":294912,"Result Size":3809,"JVM GC Time":259,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":13405,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":5,"Index":4,"Attempt":0,"Launch Time":1678162980013,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162986094,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"8","Value":"3836","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"6291456","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"98","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"24","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"4619","Value":"24932","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":113,"Name":"number of input batches","Update":"274","Value":"274","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":112,"Name":"number of output rows","Update":"1120882","Value":"1120882","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":114,"Name":"number of output rows","Update":"1120882","Value":"1120882","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"4446","Value":"24658","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":189,"Name":"internal.metrics.input.recordsRead","Update":1120882,"Value":1120882,"Internal":true,"Count Failed Values":true},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":129065639,"Value":129396014,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7077888,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":159,"Value":5495,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":91760,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":1247949979,"Value":13306575187,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":5535,"Value":66500,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":206215778,"Value":6273034289,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":482,"Value":30022,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":390886176,"JVMOffHeapMemory":113641848,"OnHeapExecutionMemory":100794368,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":1712433,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":102506801,"OffHeapUnifiedMemory":0,"DirectPoolMemory":181850,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7154511872,"ProcessTreeJVMRSSMemory":1206054912,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":8,"MinorGCTime":192,"MajorGCCount":3,"MajorGCTime":220},"Task Metrics":{"Executor Deserialize Time":482,"Executor Deserialize CPU Time":206215778,"Executor Run Time":5535,"Executor CPU Time":1247949979,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":159,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":129065639,"Records Read":1120882},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":17,"Index":16,"Attempt":0,"Launch Time":1678162980032,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162986160,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"9","Value":"3845","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"6553600","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"98","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"25","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"3744","Value":"28676","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":113,"Name":"number of input batches","Update":"17","Value":"291","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":112,"Name":"number of output rows","Update":"69516","Value":"1190398","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":114,"Name":"number of output rows","Update":"69516","Value":"1190398","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"3724","Value":"28382","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":189,"Name":"internal.metrics.input.recordsRead","Update":69516,"Value":1190398,"Internal":true,"Count Failed Values":true},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":72231192,"Value":201627206,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7372800,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":303,"Value":5798,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":95612,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":737889122,"Value":14044464309,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":4585,"Value":71085,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":202715130,"Value":6475749419,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1494,"Value":31516,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":395991528,"JVMOffHeapMemory":118706328,"OnHeapExecutionMemory":32768,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":1712433,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":1745201,"OffHeapUnifiedMemory":0,"DirectPoolMemory":13821364,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7106768896,"ProcessTreeJVMRSSMemory":995651584,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":8,"MinorGCTime":181,"MajorGCCount":3,"MajorGCTime":311},"Task Metrics":{"Executor Deserialize Time":1494,"Executor Deserialize CPU Time":202715130,"Executor Run Time":4585,"Executor CPU Time":737889122,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":303,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":72231192,"Records Read":69516},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":10,"Index":9,"Attempt":0,"Launch Time":1678162980020,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162986840,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"13","Value":"3858","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"6815744","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"98","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"26","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"4666","Value":"33342","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":113,"Name":"number of input batches","Update":"181","Value":"472","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":112,"Name":"number of output rows","Update":"740621","Value":"1931019","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":114,"Name":"number of output rows","Update":"740621","Value":"1931019","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"4563","Value":"32945","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":189,"Name":"internal.metrics.input.recordsRead","Update":740621,"Value":1931019,"Internal":true,"Count Failed Values":true},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":96064540,"Value":297691746,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7667712,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":268,"Value":6066,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":99464,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":920976379,"Value":14965440688,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":5440,"Value":76525,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":266952657,"Value":6742702076,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1318,"Value":32834,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1318,"Executor Deserialize CPU Time":266952657,"Executor Run Time":5440,"Executor CPU Time":920976379,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":268,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":96064540,"Records Read":740621},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":18,"Index":17,"Attempt":0,"Launch Time":1678162980035,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162986857,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"7","Value":"3865","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"7077888","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"98","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"27","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"4690","Value":"38032","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":113,"Name":"number of input batches","Update":"186","Value":"658","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":112,"Name":"number of output rows","Update":"760701","Value":"2691720","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":114,"Name":"number of output rows","Update":"760701","Value":"2691720","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"4584","Value":"37529","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":189,"Name":"internal.metrics.input.recordsRead","Update":760701,"Value":2691720,"Internal":true,"Count Failed Values":true},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":96562829,"Value":394254575,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7962624,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":268,"Value":6334,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":103316,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":1084466489,"Value":16049907177,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":5440,"Value":81965,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":353709505,"Value":7096411581,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1336,"Value":34170,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1336,"Executor Deserialize CPU Time":353709505,"Executor Run Time":5440,"Executor CPU Time":1084466489,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":268,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":96562829,"Records Read":760701},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":20,"Index":19,"Attempt":0,"Launch Time":1678162980037,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162987637,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"15","Value":"3880","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"7340032","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"98","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"28","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"5185","Value":"43217","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":113,"Name":"number of input batches","Update":"178","Value":"836","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":112,"Name":"number of output rows","Update":"728996","Value":"3420716","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":114,"Name":"number of output rows","Update":"728996","Value":"3420716","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"5066","Value":"42595","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":189,"Name":"internal.metrics.input.recordsRead","Update":728996,"Value":3420716,"Internal":true,"Count Failed Values":true},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":107980462,"Value":502235037,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8257536,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":303,"Value":6637,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":107168,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":1320529166,"Value":17370436343,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":6050,"Value":88015,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":207428063,"Value":7303839644,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1499,"Value":35669,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":946782296,"JVMOffHeapMemory":112107152,"OnHeapExecutionMemory":65536,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":2760963,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":2826499,"OffHeapUnifiedMemory":0,"DirectPoolMemory":13955,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7083728896,"ProcessTreeJVMRSSMemory":1319464960,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":8,"MinorGCTime":165,"MajorGCCount":3,"MajorGCTime":332},"Task Metrics":{"Executor Deserialize Time":1499,"Executor Deserialize CPU Time":207428063,"Executor Run Time":6050,"Executor CPU Time":1320529166,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":303,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":107980462,"Records Read":728996},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":12,"Index":11,"Attempt":0,"Launch Time":1678162980023,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162987801,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"8","Value":"3888","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"7602176","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"98","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"29","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"5356","Value":"48573","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":113,"Name":"number of input batches","Update":"137","Value":"973","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":112,"Name":"number of output rows","Update":"560100","Value":"3980816","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":114,"Name":"number of output rows","Update":"560100","Value":"3980816","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"5271","Value":"47866","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":189,"Name":"internal.metrics.input.recordsRead","Update":560100,"Value":3980816,"Internal":true,"Count Failed Values":true},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":131899546,"Value":634134583,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8552448,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":303,"Value":6940,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":111020,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":723894455,"Value":18094330798,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":6215,"Value":94230,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":299017127,"Value":7602856771,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1499,"Value":37168,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":946782296,"JVMOffHeapMemory":112107152,"OnHeapExecutionMemory":65536,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":2760963,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":2826499,"OffHeapUnifiedMemory":0,"DirectPoolMemory":13955,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7083728896,"ProcessTreeJVMRSSMemory":1319464960,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":8,"MinorGCTime":165,"MajorGCCount":3,"MajorGCTime":332},"Task Metrics":{"Executor Deserialize Time":1499,"Executor Deserialize CPU Time":299017127,"Executor Run Time":6215,"Executor CPU Time":723894455,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":303,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":131899546,"Records Read":560100},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":14,"Index":13,"Attempt":0,"Launch Time":1678162980026,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162987843,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"15","Value":"3903","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"7864320","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"98","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"30","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"5432","Value":"54005","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":113,"Name":"number of input batches","Update":"191","Value":"1164","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":112,"Name":"number of output rows","Update":"781104","Value":"4761920","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":114,"Name":"number of output rows","Update":"781104","Value":"4761920","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"5318","Value":"53184","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":189,"Name":"internal.metrics.input.recordsRead","Update":781104,"Value":4761920,"Internal":true,"Count Failed Values":true},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":108077075,"Value":742211658,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8847360,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":308,"Value":7248,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":114872,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":1178900448,"Value":19273231246,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":6227,"Value":100457,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":198881590,"Value":7801738361,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1514,"Value":38682,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":141939104,"JVMOffHeapMemory":72113424,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":107290,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":107290,"OffHeapUnifiedMemory":0,"DirectPoolMemory":10540,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7042990080,"ProcessTreeJVMRSSMemory":626630656,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":5,"MinorGCTime":61,"MajorGCCount":2,"MajorGCTime":149},"Task Metrics":{"Executor Deserialize Time":1514,"Executor Deserialize CPU Time":198881590,"Executor Run Time":6227,"Executor CPU Time":1178900448,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":308,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":108077075,"Records Read":781104},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":6,"Index":5,"Attempt":0,"Launch Time":1678162980014,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162987845,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"19","Value":"3922","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"8126464","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"98","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"31","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"5426","Value":"59431","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":113,"Name":"number of input batches","Update":"198","Value":"1362","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":112,"Name":"number of output rows","Update":"809218","Value":"5571138","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":114,"Name":"number of output rows","Update":"809218","Value":"5571138","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"5293","Value":"58477","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":189,"Name":"internal.metrics.input.recordsRead","Update":809218,"Value":5571138,"Internal":true,"Count Failed Values":true},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":96212273,"Value":838423931,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9142272,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":308,"Value":7556,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":118724,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":1133807133,"Value":20407038379,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":6229,"Value":106686,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":196597733,"Value":7998336094,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1514,"Value":40196,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":141939104,"JVMOffHeapMemory":72113424,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":107290,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":107290,"OffHeapUnifiedMemory":0,"DirectPoolMemory":10540,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7042990080,"ProcessTreeJVMRSSMemory":626630656,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":5,"MinorGCTime":61,"MajorGCCount":2,"MajorGCTime":149},"Task Metrics":{"Executor Deserialize Time":1514,"Executor Deserialize CPU Time":196597733,"Executor Run Time":6229,"Executor CPU Time":1133807133,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":308,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":96212273,"Records Read":809218},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":27,"Index":26,"Attempt":0,"Launch Time":1678162980048,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162988094,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"10","Value":"3932","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"8388608","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"98","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"32","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"5676","Value":"65107","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":113,"Name":"number of input batches","Update":"215","Value":"1577","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":112,"Name":"number of output rows","Update":"879479","Value":"6450617","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":114,"Name":"number of output rows","Update":"879479","Value":"6450617","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"5535","Value":"64012","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":189,"Name":"internal.metrics.input.recordsRead","Update":879479,"Value":6450617,"Internal":true,"Count Failed Values":true},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":125375392,"Value":963799323,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9437184,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":300,"Value":7856,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":122576,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":1311072168,"Value":21718110547,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":6545,"Value":113231,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":368371012,"Value":8366707106,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1462,"Value":41658,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":1462,"Executor Deserialize CPU Time":368371012,"Executor Run Time":6545,"Executor CPU Time":1311072168,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":300,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":125375392,"Records Read":879479},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":1,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":23,"Index":22,"Attempt":0,"Launch Time":1678162980042,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"RACK_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162988553,"Failed":false,"Killed":false,"Accumulables":[{"ID":160,"Name":"duration","Update":"7","Value":"3939","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":162,"Name":"peak memory","Update":"262144","Value":"8650752","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":164,"Name":"time in aggregation build","Update":"0","Value":"98","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Update":"1","Value":"33","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Update":"6090","Value":"71197","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":113,"Name":"number of input batches","Update":"237","Value":"1814","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":112,"Name":"number of output rows","Update":"969399","Value":"7420016","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":114,"Name":"number of output rows","Update":"969399","Value":"7420016","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Update":"5951","Value":"69963","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":189,"Name":"internal.metrics.input.recordsRead","Update":969399,"Value":7420016,"Internal":true,"Count Failed Values":true},{"ID":188,"Name":"internal.metrics.input.bytesRead","Update":126954524,"Value":1090753847,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9732096,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Update":292,"Value":8148,"Internal":true,"Count Failed Values":true},{"ID":171,"Name":"internal.metrics.resultSize","Update":3852,"Value":126428,"Internal":true,"Count Failed Values":true},{"ID":170,"Name":"internal.metrics.executorCpuTime","Update":1276437053,"Value":22994547600,"Internal":true,"Count Failed Values":true},{"ID":169,"Name":"internal.metrics.executorRunTime","Update":6903,"Value":120134,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Update":225138204,"Value":8591845310,"Internal":true,"Count Failed Values":true},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Update":1527,"Value":43185,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":348482528,"JVMOffHeapMemory":107273224,"OnHeapExecutionMemory":131072,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":1712433,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":1843505,"OffHeapUnifiedMemory":0,"DirectPoolMemory":10540,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7081742336,"ProcessTreeJVMRSSMemory":852627456,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":7,"MinorGCTime":134,"MajorGCCount":3,"MajorGCTime":367},"Task Metrics":{"Executor Deserialize Time":1527,"Executor Deserialize CPU Time":225138204,"Executor Run Time":6903,"Executor CPU Time":1276437053,"Peak Execution Memory":294912,"Result Size":3852,"JVM GC Time":292,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":126954524,"Records Read":969399},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":1,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":10,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"7\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[9],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":9,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"8\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[8],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":8,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[7],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":7,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162979931,"Completion Time":1678162988554,"Accumulables":[{"ID":173,"Name":"internal.metrics.resultSerializationTime","Value":10,"Internal":true,"Count Failed Values":true},{"ID":164,"Name":"time in aggregation build","Value":"98","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":119,"Name":"scan time","Value":"69963","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":110,"Name":"duration","Value":"71197","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":113,"Name":"number of input batches","Value":"1814","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":167,"Name":"internal.metrics.executorDeserializeTime","Value":43185,"Internal":true,"Count Failed Values":true},{"ID":176,"Name":"internal.metrics.peakExecutionMemory","Value":9732096,"Internal":true,"Count Failed Values":true},{"ID":160,"Name":"duration","Value":"3939","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":169,"Name":"internal.metrics.executorRunTime","Value":120134,"Internal":true,"Count Failed Values":true},{"ID":172,"Name":"internal.metrics.jvmGCTime","Value":8148,"Internal":true,"Count Failed Values":true},{"ID":112,"Name":"number of output rows","Value":"7420016","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":121,"Name":"numComputedPartitions","Value":"33","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":189,"Name":"internal.metrics.input.recordsRead","Value":7420016,"Internal":true,"Count Failed Values":true},{"ID":168,"Name":"internal.metrics.executorDeserializeCpuTime","Value":8591845310,"Internal":true,"Count Failed Values":true},{"ID":114,"Name":"number of output rows","Value":"7420016","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":171,"Name":"internal.metrics.resultSize","Value":126428,"Internal":true,"Count Failed Values":true},{"ID":162,"Name":"peak memory","Value":"8650752","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":170,"Name":"internal.metrics.executorCpuTime","Value":22994547600,"Internal":true,"Count Failed Values":true},{"ID":188,"Name":"internal.metrics.input.bytesRead","Value":1090753847,"Internal":true,"Count Failed Values":true}]}} -{"Event":"SparkListenerJobEnd","Job ID":1,"Completion Time":1678162988571,"Job Result":{"Result":"JobSucceeded"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate","executionId":5,"physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, loglikelihood#565, prob_cs_given_lang#566, prob_lang_given_cs#567]\n +- Filter ((n_pages#545L >= cast(100 as bigint)) AND NOT Language#279 LIKE %,%)\n +- SubqueryAlias language_charset_loglikelihood\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (cast(2 as double) * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(cast(total_pages#546L as double) as double)))) + if ((n_pages_charset#547L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if ((n_pages_languages#548L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) cast(0 as double) else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / cast(((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)) as double))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(cast(n_pages_languages#548L as double) as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(cast(n_pages_charset#547L as double) as double)) AS prob_lang_given_cs#567]\n +- SubqueryAlias language_count_tmp\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, loglikelihood#565, prob_cs_given_lang#566, prob_lang_given_cs#567]\n +- Filter ((n_pages#545L >= cast(100 as bigint)) AND NOT Language#279 LIKE %,%)\n +- SubqueryAlias language_charset_loglikelihood\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (cast(2 as double) * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(cast(total_pages#546L as double) as double)))) + if ((n_pages_charset#547L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if ((n_pages_languages#548L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) cast(0 as double) else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / cast(((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)) as double))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(cast(n_pages_languages#548L as double) as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(cast(n_pages_charset#547L as double) as double)) AS prob_lang_given_cs#567]\n +- SubqueryAlias language_count_tmp\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (2.0 * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(total_pages#546L as double)))) + if ((n_pages_charset#547L = n_pages#545L)) 0.0 else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(total_pages#546L as double))))) + if ((n_pages_languages#548L = n_pages#545L)) 0.0 else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(total_pages#546L as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) 0.0 else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / ((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(n_pages_languages#548L as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(n_pages_charset#547L as double)) AS prob_lang_given_cs#567]\n +- Filter (n_pages#545L >= 100)\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Project [n_pages#545L, Language#279, Charset#311, _w2#558L, total_pages#546L, n_pages_charset#547L]\n +- Filter NOT Contains(Language#279, ,)\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Project [n_pages#545L, Language#279, Charset#311, _w1#557L, _w2#558L, total_pages#546L]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Project [content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- Filter (((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(content_languages#27)) AND isnotnull(content_charset#26))\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- AdaptiveSparkPlan isFinalPlan=false\n +- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true, 0\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (2.0 * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(total_pages#546L as double)))) + if ((n_pages_charset#547L = n_pages#545L)) 0.0 else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(total_pages#546L as double))))) + if ((n_pages_languages#548L = n_pages#545L)) 0.0 else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(total_pages#546L as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) 0.0 else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / ((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(n_pages_languages#548L as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(n_pages_charset#547L as double)) AS prob_lang_given_cs#567]\n +- Filter (n_pages#545L >= 100)\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Sort [Language#279 ASC NULLS FIRST], false, 0\n +- Project [n_pages#545L, Language#279, Charset#311, _w2#558L, total_pages#546L, n_pages_charset#547L]\n +- Filter NOT Contains(Language#279, ,)\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Sort [Charset#311 ASC NULLS FIRST], false, 0\n +- Project [n_pages#545L, Language#279, Charset#311, _w1#557L, _w2#558L, total_pages#546L]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- ShuffleQueryStage 1\n +- Exchange SinglePartition, true, [id=#204]\n +- *(2) HashAggregate(keys=[Charset#311, Language#279], functions=[count(1)], output=[n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L])\n +- CustomShuffleReader coalesced\n +- ShuffleQueryStage 0\n +- Exchange hashpartitioning(Charset#311, Language#279, 1000), true, [id=#126]\n +- *(1) HashAggregate(keys=[Charset#311, Language#279], functions=[partial_count(1)], output=[Charset#311, Language#279, count#1053L])\n +- *(1) Project [content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- *(1) Filter (((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(content_languages#27)) AND isnotnull(content_charset#26))\n +- InMemoryTableScan [content_charset#26, content_languages#27, subset#33], [isnotnull(subset#33), (subset#33 = warc), isnotnull(content_languages#27), isnotnull(content_charset#26)]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=false","children":[{"nodeName":"Sort","simpleString":"Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true, 0","children":[{"nodeName":"Project","simpleString":"Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (2.0 * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(total_pages#546L as double)))) + if ((n_pages_charset#547L = n_pages#545L)) 0.0 else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(total_pages#546L as double))))) + if ((n_pages_languages#548L = n_pages#545L)) 0.0 else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(total_pages#546L as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) 0.0 else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / ((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(n_pages_languages#548L as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(n_pages_charset#547L as double)) AS prob_lang_given_cs#567]","children":[{"nodeName":"Filter","simpleString":"Filter (n_pages#545L >= 100)","children":[{"nodeName":"Window","simpleString":"Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]","children":[{"nodeName":"Sort","simpleString":"Sort [Language#279 ASC NULLS FIRST], false, 0","children":[{"nodeName":"Project","simpleString":"Project [n_pages#545L, Language#279, Charset#311, _w2#558L, total_pages#546L, n_pages_charset#547L]","children":[{"nodeName":"Filter","simpleString":"Filter NOT Contains(Language#279, ,)","children":[{"nodeName":"Window","simpleString":"Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]","children":[{"nodeName":"Sort","simpleString":"Sort [Charset#311 ASC NULLS FIRST], false, 0","children":[{"nodeName":"Project","simpleString":"Project [n_pages#545L, Language#279, Charset#311, _w1#557L, _w2#558L, total_pages#546L]","children":[{"nodeName":"Window","simpleString":"Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 1","children":[{"nodeName":"Exchange","simpleString":"Exchange SinglePartition, true, [id=#204]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Charset#311, Language#279], functions=[count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"CustomShuffleReader","simpleString":"CustomShuffleReader coalesced","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 0","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Charset#311, Language#279, 1000), true, [id=#126]","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Charset#311, Language#279], functions=[partial_count(1)])","children":[{"nodeName":"Project","simpleString":"Project [content_charset#26 AS Charset#311, content_languages#27 AS Language#279]","children":[{"nodeName":"Filter","simpleString":"Filter (((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(content_languages#27)) AND isnotnull(content_charset#26))","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [content_charset#26, content_languages#27, subset#33], [isnotnull(subset#33), (subset#33 = warc), isnotnull(content_languages#27), isnotnull(content_charset#26)]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":109,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":166,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":163,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":164,"metricType":"timing"},{"name":"peak memory","accumulatorId":162,"metricType":"size"},{"name":"number of output rows","accumulatorId":161,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":165,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":160,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":131,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":132,"metricType":"nsTiming"},{"name":"records read","accumulatorId":129,"metricType":"sum"},{"name":"local bytes read","accumulatorId":127,"metricType":"size"},{"name":"fetch wait time","accumulatorId":128,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":125,"metricType":"size"},{"name":"local blocks read","accumulatorId":124,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":123,"metricType":"sum"},{"name":"data size","accumulatorId":122,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":126,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":130,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":283,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":284,"metricType":"timing"},{"name":"peak memory","accumulatorId":282,"metricType":"size"},{"name":"number of output rows","accumulatorId":281,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":285,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":280,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":267,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":268,"metricType":"nsTiming"},{"name":"records read","accumulatorId":265,"metricType":"sum"},{"name":"local bytes read","accumulatorId":263,"metricType":"size"},{"name":"fetch wait time","accumulatorId":264,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":261,"metricType":"size"},{"name":"local blocks read","accumulatorId":260,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":259,"metricType":"sum"},{"name":"data size","accumulatorId":258,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":262,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":266,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":277,"metricType":"timing"},{"name":"peak memory","accumulatorId":278,"metricType":"size"},{"name":"spill size","accumulatorId":279,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":276,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":273,"metricType":"timing"},{"name":"peak memory","accumulatorId":274,"metricType":"size"},{"name":"spill size","accumulatorId":275,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":272,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":269,"metricType":"timing"},{"name":"peak memory","accumulatorId":270,"metricType":"size"},{"name":"spill size","accumulatorId":271,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":83,"metricType":"sum"},{"name":"written output","accumulatorId":84,"metricType":"size"},{"name":"number of output rows","accumulatorId":85,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":86,"metricType":"sum"}]}} -{"Event":"SparkListenerJobStart","Job ID":2,"Submission Time":1678162988703,"Stage Infos":[{"Stage ID":2,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":10,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"7\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[9],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":9,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"8\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[8],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":8,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[7],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":7,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]},{"Stage ID":3,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":13,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"22\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[12],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":11,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"26\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[10],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":12,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"23\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[11],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[2],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]}],"Stage IDs":[2,3],"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"22\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"5","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":3,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":13,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"22\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[12],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":11,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"26\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[10],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":12,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"23\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[11],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[2],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162988710,"Accumulables":[]},"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"22\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"5","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerTaskStart","Stage ID":3,"Stage Attempt ID":0,"Task Info":{"Task ID":34,"Index":0,"Attempt":0,"Launch Time":1678162988725,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":3,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":34,"Index":0,"Attempt":0,"Launch Time":1678162988725,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162988856,"Failed":false,"Killed":false,"Accumulables":[{"ID":280,"Name":"duration","Update":"5","Value":"5","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":282,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":284,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":303,"Name":"internal.metrics.shuffle.read.recordsRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":302,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":301,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":300,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":299,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":298,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":297,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":295,"Name":"internal.metrics.peakExecutionMemory","Update":262144,"Value":262144,"Internal":true,"Count Failed Values":true},{"ID":290,"Name":"internal.metrics.resultSize","Update":4175,"Value":4175,"Internal":true,"Count Failed Values":true},{"ID":289,"Name":"internal.metrics.executorCpuTime","Update":54273023,"Value":54273023,"Internal":true,"Count Failed Values":true},{"ID":288,"Name":"internal.metrics.executorRunTime","Update":85,"Value":85,"Internal":true,"Count Failed Values":true},{"ID":287,"Name":"internal.metrics.executorDeserializeCpuTime","Update":26396779,"Value":26396779,"Internal":true,"Count Failed Values":true},{"ID":286,"Name":"internal.metrics.executorDeserializeTime","Update":35,"Value":35,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":35,"Executor Deserialize CPU Time":26396779,"Executor Run Time":85,"Executor CPU Time":54273023,"Peak Execution Memory":262144,"Result Size":4175,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":3,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":13,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"22\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[12],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":11,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"26\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[10],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":12,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"23\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[11],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[2],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162988710,"Completion Time":1678162988857,"Accumulables":[{"ID":286,"Name":"internal.metrics.executorDeserializeTime","Value":35,"Internal":true,"Count Failed Values":true},{"ID":295,"Name":"internal.metrics.peakExecutionMemory","Value":262144,"Internal":true,"Count Failed Values":true},{"ID":298,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":280,"Name":"duration","Value":"5","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":289,"Name":"internal.metrics.executorCpuTime","Value":54273023,"Internal":true,"Count Failed Values":true},{"ID":301,"Name":"internal.metrics.shuffle.read.localBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":300,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Value":0,"Internal":true,"Count Failed Values":true},{"ID":303,"Name":"internal.metrics.shuffle.read.recordsRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":288,"Name":"internal.metrics.executorRunTime","Value":85,"Internal":true,"Count Failed Values":true},{"ID":297,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":282,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":299,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":290,"Name":"internal.metrics.resultSize","Value":4175,"Internal":true,"Count Failed Values":true},{"ID":284,"Name":"time in aggregation build","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":302,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Value":0,"Internal":true,"Count Failed Values":true},{"ID":287,"Name":"internal.metrics.executorDeserializeCpuTime","Value":26396779,"Internal":true,"Count Failed Values":true}]}} -{"Event":"SparkListenerJobEnd","Job ID":2,"Completion Time":1678162988858,"Job Result":{"Result":"JobSucceeded"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate","executionId":5,"physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, loglikelihood#565, prob_cs_given_lang#566, prob_lang_given_cs#567]\n +- Filter ((n_pages#545L >= cast(100 as bigint)) AND NOT Language#279 LIKE %,%)\n +- SubqueryAlias language_charset_loglikelihood\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (cast(2 as double) * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(cast(total_pages#546L as double) as double)))) + if ((n_pages_charset#547L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if ((n_pages_languages#548L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) cast(0 as double) else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / cast(((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)) as double))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(cast(n_pages_languages#548L as double) as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(cast(n_pages_charset#547L as double) as double)) AS prob_lang_given_cs#567]\n +- SubqueryAlias language_count_tmp\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, loglikelihood#565, prob_cs_given_lang#566, prob_lang_given_cs#567]\n +- Filter ((n_pages#545L >= cast(100 as bigint)) AND NOT Language#279 LIKE %,%)\n +- SubqueryAlias language_charset_loglikelihood\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (cast(2 as double) * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(cast(total_pages#546L as double) as double)))) + if ((n_pages_charset#547L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if ((n_pages_languages#548L = n_pages#545L)) cast(0 as double) else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(cast(total_pages#546L as double) as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) cast(0 as double) else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / cast(((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)) as double))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(cast(n_pages_languages#548L as double) as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(cast(n_pages_charset#547L as double) as double)) AS prob_lang_given_cs#567]\n +- SubqueryAlias language_count_tmp\n +- Project [n_pages#545L, Language#279, Charset#311, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Project [n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L, total_pages#546L, n_pages_charset#547L, n_pages_languages#548L]\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Filter (((subset#33 = warc) AND isnotnull(Language#279)) AND isnotnull(Charset#311))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true\n +- Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (2.0 * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(total_pages#546L as double)))) + if ((n_pages_charset#547L = n_pages#545L)) 0.0 else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(total_pages#546L as double))))) + if ((n_pages_languages#548L = n_pages#545L)) 0.0 else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(total_pages#546L as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) 0.0 else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / ((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(n_pages_languages#548L as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(n_pages_charset#547L as double)) AS prob_lang_given_cs#567]\n +- Filter (n_pages#545L >= 100)\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- Project [n_pages#545L, Language#279, Charset#311, _w2#558L, total_pages#546L, n_pages_charset#547L]\n +- Filter NOT Contains(Language#279, ,)\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- Project [n_pages#545L, Language#279, Charset#311, _w1#557L, _w2#558L, total_pages#546L]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- Aggregate [Charset#311, Language#279], [count(1) AS n_pages#545L, Language#279, Charset#311, count(1) AS _w0#556L, count(1) AS _w1#557L, count(1) AS _w2#558L]\n +- Project [content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- Filter (((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(content_languages#27)) AND isnotnull(content_charset#26))\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]\n+- AdaptiveSparkPlan isFinalPlan=true\n +- *(5) Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true, 0\n +- *(5) Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (2.0 * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(total_pages#546L as double)))) + if ((n_pages_charset#547L = n_pages#545L)) 0.0 else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(total_pages#546L as double))))) + if ((n_pages_languages#548L = n_pages#545L)) 0.0 else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(total_pages#546L as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) 0.0 else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / ((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(n_pages_languages#548L as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(n_pages_charset#547L as double)) AS prob_lang_given_cs#567]\n +- *(5) Filter (n_pages#545L >= 100)\n +- Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]\n +- *(4) Sort [Language#279 ASC NULLS FIRST], false, 0\n +- *(4) Project [n_pages#545L, Language#279, Charset#311, _w2#558L, total_pages#546L, n_pages_charset#547L]\n +- *(4) Filter NOT Contains(Language#279, ,)\n +- Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]\n +- *(3) Sort [Charset#311 ASC NULLS FIRST], false, 0\n +- *(3) Project [n_pages#545L, Language#279, Charset#311, _w1#557L, _w2#558L, total_pages#546L]\n +- Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]\n +- ShuffleQueryStage 1\n +- Exchange SinglePartition, true, [id=#204]\n +- *(2) HashAggregate(keys=[Charset#311, Language#279], functions=[count(1)], output=[n_pages#545L, Language#279, Charset#311, _w0#556L, _w1#557L, _w2#558L])\n +- CustomShuffleReader coalesced\n +- ShuffleQueryStage 0\n +- Exchange hashpartitioning(Charset#311, Language#279, 1000), true, [id=#126]\n +- *(1) HashAggregate(keys=[Charset#311, Language#279], functions=[partial_count(1)], output=[Charset#311, Language#279, count#1053L])\n +- *(1) Project [content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- *(1) Filter (((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(content_languages#27)) AND isnotnull(content_charset#26))\n +- InMemoryTableScan [content_charset#26, content_languages#27, subset#33], [isnotnull(subset#33), (subset#33 = warc), isnotnull(content_languages#27), isnotnull(content_charset#26)]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/log_likelihood_ratio_1GB), Overwrite, [Language, Charset, n_pages_languages, n_pages_charset, n_pages, loglikelihood, prob_cs_given_lang, prob_lang_given_cs]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=true","children":[{"nodeName":"WholeStageCodegen (5)","simpleString":"WholeStageCodegen (5)","children":[{"nodeName":"Sort","simpleString":"Sort [Language#279 ASC NULLS FIRST, n_pages#545L DESC NULLS LAST], true, 0","children":[{"nodeName":"Project","simpleString":"Project [Language#279, Charset#311, n_pages_languages#548L, n_pages_charset#547L, n_pages#545L, (2.0 * (((ln((cast(n_pages#545L as double) / (cast((n_pages_charset#547L * n_pages_languages#548L) as double) / cast(total_pages#546L as double)))) + if ((n_pages_charset#547L = n_pages#545L)) 0.0 else ln((cast((n_pages_charset#547L - n_pages#545L) as double) / (cast((n_pages_charset#547L * (total_pages#546L - n_pages_languages#548L)) as double) / cast(total_pages#546L as double))))) + if ((n_pages_languages#548L = n_pages#545L)) 0.0 else ln((cast((n_pages_languages#548L - n_pages#545L) as double) / (cast((n_pages_languages#548L * (total_pages#546L - n_pages_charset#547L)) as double) / cast(total_pages#546L as double))))) + if (((total_pages#546L + n_pages#545L) = (n_pages_languages#548L + n_pages_charset#547L))) 0.0 else ln((cast((((total_pages#546L - n_pages_languages#548L) - n_pages_charset#547L) + n_pages#545L) as double) / ((cast((total_pages#546L - n_pages_charset#547L) as double) * cast((total_pages#546L - n_pages_languages#548L) as double)) / cast(total_pages#546L as double)))))) AS loglikelihood#565, (cast(n_pages#545L as double) / cast(n_pages_languages#548L as double)) AS prob_cs_given_lang#566, (cast(n_pages#545L as double) / cast(n_pages_charset#547L as double)) AS prob_lang_given_cs#567]","children":[{"nodeName":"Filter","simpleString":"Filter (n_pages#545L >= 100)","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Window","simpleString":"Window [sum(_w2#558L) windowspecdefinition(Language#279, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_languages#548L], [Language#279]","children":[{"nodeName":"WholeStageCodegen (4)","simpleString":"WholeStageCodegen (4)","children":[{"nodeName":"Sort","simpleString":"Sort [Language#279 ASC NULLS FIRST], false, 0","children":[{"nodeName":"Project","simpleString":"Project [n_pages#545L, Language#279, Charset#311, _w2#558L, total_pages#546L, n_pages_charset#547L]","children":[{"nodeName":"Filter","simpleString":"Filter NOT Contains(Language#279, ,)","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Window","simpleString":"Window [sum(_w1#557L) windowspecdefinition(Charset#311, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS n_pages_charset#547L], [Charset#311]","children":[{"nodeName":"WholeStageCodegen (3)","simpleString":"WholeStageCodegen (3)","children":[{"nodeName":"Sort","simpleString":"Sort [Charset#311 ASC NULLS FIRST], false, 0","children":[{"nodeName":"Project","simpleString":"Project [n_pages#545L, Language#279, Charset#311, _w1#557L, _w2#558L, total_pages#546L]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Window","simpleString":"Window [sum(_w0#556L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS total_pages#546L]","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 1","children":[{"nodeName":"Exchange","simpleString":"Exchange SinglePartition, true, [id=#204]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Charset#311, Language#279], functions=[count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"CustomShuffleReader","simpleString":"CustomShuffleReader coalesced","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 0","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Charset#311, Language#279, 1000), true, [id=#126]","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Charset#311, Language#279], functions=[partial_count(1)])","children":[{"nodeName":"Project","simpleString":"Project [content_charset#26 AS Charset#311, content_languages#27 AS Language#279]","children":[{"nodeName":"Filter","simpleString":"Filter (((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(content_languages#27)) AND isnotnull(content_charset#26))","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [content_charset#26, content_languages#27, subset#33], [isnotnull(subset#33), (subset#33 = warc), isnotnull(content_languages#27), isnotnull(content_charset#26)]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":109,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":166,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":163,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":164,"metricType":"timing"},{"name":"peak memory","accumulatorId":162,"metricType":"size"},{"name":"number of output rows","accumulatorId":161,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":165,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":160,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":131,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":132,"metricType":"nsTiming"},{"name":"records read","accumulatorId":129,"metricType":"sum"},{"name":"local bytes read","accumulatorId":127,"metricType":"size"},{"name":"fetch wait time","accumulatorId":128,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":125,"metricType":"size"},{"name":"local blocks read","accumulatorId":124,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":123,"metricType":"sum"},{"name":"data size","accumulatorId":122,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":126,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":130,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":283,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":284,"metricType":"timing"},{"name":"peak memory","accumulatorId":282,"metricType":"size"},{"name":"number of output rows","accumulatorId":281,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":285,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":280,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":267,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":268,"metricType":"nsTiming"},{"name":"records read","accumulatorId":265,"metricType":"sum"},{"name":"local bytes read","accumulatorId":263,"metricType":"size"},{"name":"fetch wait time","accumulatorId":264,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":261,"metricType":"size"},{"name":"local blocks read","accumulatorId":260,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":259,"metricType":"sum"},{"name":"data size","accumulatorId":258,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":262,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":266,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":322,"metricType":"timing"},{"name":"peak memory","accumulatorId":323,"metricType":"size"},{"name":"spill size","accumulatorId":324,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":321,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":320,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":317,"metricType":"timing"},{"name":"peak memory","accumulatorId":318,"metricType":"size"},{"name":"spill size","accumulatorId":319,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":316,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":315,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":312,"metricType":"timing"},{"name":"peak memory","accumulatorId":313,"metricType":"size"},{"name":"spill size","accumulatorId":314,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":311,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":83,"metricType":"sum"},{"name":"written output","accumulatorId":84,"metricType":"size"},{"name":"number of output rows","accumulatorId":85,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":86,"metricType":"sum"}]}} -{"Event":"SparkListenerJobStart","Job ID":3,"Submission Time":1678162989242,"Stage Infos":[{"Stage ID":5,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":13,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"22\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[12],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":11,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"26\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[10],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":12,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"23\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[11],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[4],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]},{"Stage ID":6,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":20,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"28\",\"name\":\"WholeStageCodegen (5)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[19],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":18,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"34\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[17],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":17,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"39\",\"name\":\"Window\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[16],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":14,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"46\",\"name\":\"Exchange\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[13],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":15,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"44\",\"name\":\"Window\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[14],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":16,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"40\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[15],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":19,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"33\",\"name\":\"Window\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[18],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[5],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]},{"Stage ID":4,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":10,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"7\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[9],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":9,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"8\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[8],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":8,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[7],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":7,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]}],"Stage IDs":[5,6,4],"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"5\",\"name\":\"Execute InsertIntoHadoopFsRelationCommand\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"5","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":6,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":20,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"28\",\"name\":\"WholeStageCodegen (5)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[19],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":18,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"34\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[17],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":17,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"39\",\"name\":\"Window\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[16],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":14,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"46\",\"name\":\"Exchange\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[13],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":15,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"44\",\"name\":\"Window\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[14],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":16,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"40\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[15],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":19,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"33\",\"name\":\"Window\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[18],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[5],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162989248,"Accumulables":[]},"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"5\",\"name\":\"Execute InsertIntoHadoopFsRelationCommand\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"5","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerTaskStart","Stage ID":6,"Stage Attempt ID":0,"Task Info":{"Task ID":35,"Index":0,"Attempt":0,"Launch Time":1678162989292,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":6,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":35,"Index":0,"Attempt":0,"Launch Time":1678162989292,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162989941,"Failed":false,"Killed":false,"Accumulables":[{"ID":311,"Name":"duration","Update":"233","Value":"233","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":314,"Name":"spill size","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":313,"Name":"peak memory","Update":"65536","Value":"65536","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":312,"Name":"sort time","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":316,"Name":"duration","Update":"31","Value":"31","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":319,"Name":"spill size","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":318,"Name":"peak memory","Update":"65536","Value":"65536","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":317,"Name":"sort time","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":321,"Name":"duration","Update":"54","Value":"54","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":324,"Name":"spill size","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":323,"Name":"peak memory","Update":"65536","Value":"65536","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":322,"Name":"sort time","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":348,"Name":"internal.metrics.output.bytesWritten","Update":111,"Value":111,"Internal":true,"Count Failed Values":true},{"ID":342,"Name":"internal.metrics.shuffle.read.recordsRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":341,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":340,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":339,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":338,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":337,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":336,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":334,"Name":"internal.metrics.peakExecutionMemory","Update":196608,"Value":196608,"Internal":true,"Count Failed Values":true},{"ID":331,"Name":"internal.metrics.resultSerializationTime","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":329,"Name":"internal.metrics.resultSize","Update":6419,"Value":6419,"Internal":true,"Count Failed Values":true},{"ID":328,"Name":"internal.metrics.executorCpuTime","Update":430425007,"Value":430425007,"Internal":true,"Count Failed Values":true},{"ID":327,"Name":"internal.metrics.executorRunTime","Update":539,"Value":539,"Internal":true,"Count Failed Values":true},{"ID":326,"Name":"internal.metrics.executorDeserializeCpuTime","Update":86902724,"Value":86902724,"Internal":true,"Count Failed Values":true},{"ID":325,"Name":"internal.metrics.executorDeserializeTime","Update":96,"Value":96,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":866619456,"JVMOffHeapMemory":128322616,"OnHeapExecutionMemory":65536,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":982692,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":1048228,"OffHeapUnifiedMemory":0,"DirectPoolMemory":9371220,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7132655616,"ProcessTreeJVMRSSMemory":1554882560,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":11,"MinorGCTime":163,"MajorGCCount":3,"MajorGCTime":281},"Task Metrics":{"Executor Deserialize Time":96,"Executor Deserialize CPU Time":86902724,"Executor Run Time":539,"Executor CPU Time":430425007,"Peak Execution Memory":196608,"Result Size":6419,"JVM GC Time":0,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":111,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":6,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":20,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"28\",\"name\":\"WholeStageCodegen (5)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[19],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":18,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"34\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[17],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":17,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"39\",\"name\":\"Window\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[16],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":14,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"46\",\"name\":\"Exchange\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[13],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":15,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"44\",\"name\":\"Window\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[14],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":16,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"40\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[15],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":19,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"33\",\"name\":\"Window\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[18],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[5],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162989248,"Completion Time":1678162989942,"Accumulables":[{"ID":340,"Name":"internal.metrics.shuffle.read.localBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":331,"Name":"internal.metrics.resultSerializationTime","Value":1,"Internal":true,"Count Failed Values":true},{"ID":322,"Name":"sort time","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":313,"Name":"peak memory","Value":"65536","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":334,"Name":"internal.metrics.peakExecutionMemory","Value":196608,"Internal":true,"Count Failed Values":true},{"ID":325,"Name":"internal.metrics.executorDeserializeTime","Value":96,"Internal":true,"Count Failed Values":true},{"ID":316,"Name":"duration","Value":"31","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":319,"Name":"spill size","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":337,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":328,"Name":"internal.metrics.executorCpuTime","Value":430425007,"Internal":true,"Count Failed Values":true},{"ID":336,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":327,"Name":"internal.metrics.executorRunTime","Value":539,"Internal":true,"Count Failed Values":true},{"ID":318,"Name":"peak memory","Value":"65536","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":348,"Name":"internal.metrics.output.bytesWritten","Value":111,"Internal":true,"Count Failed Values":true},{"ID":312,"Name":"sort time","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":339,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Value":0,"Internal":true,"Count Failed Values":true},{"ID":321,"Name":"duration","Value":"54","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":342,"Name":"internal.metrics.shuffle.read.recordsRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":324,"Name":"spill size","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":341,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Value":0,"Internal":true,"Count Failed Values":true},{"ID":323,"Name":"peak memory","Value":"65536","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":317,"Name":"sort time","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":326,"Name":"internal.metrics.executorDeserializeCpuTime","Value":86902724,"Internal":true,"Count Failed Values":true},{"ID":338,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":329,"Name":"internal.metrics.resultSize","Value":6419,"Internal":true,"Count Failed Values":true},{"ID":311,"Name":"duration","Value":"233","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":314,"Name":"spill size","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"}]}} -{"Event":"SparkListenerJobEnd","Job ID":3,"Completion Time":1678162989942,"Job Result":{"Result":"JobSucceeded"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerDriverAccumUpdates","executionId":5,"accumUpdates":[[83,1],[84,111],[85,0],[86,0]]} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerQueryExecutionMetrics","executionId":5,"timePerRule":{"PruneFileSourcePartitions":2144830,"ReassignLambdaVariableID":724707,"PushPredicateThroughNonJoin":608446,"Analyzer$HandleNullInputsForUDF":26349,"Analyzer$ResolveSubqueryColumnAliases":16113,"ResolveTimeZone":21614,"Analyzer$ResolveNamespace":20360,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":23141,"RewriteCorrelatedScalarSubquery":3514511,"RemoveLiteralFromGroupExpressions":1402327,"PushProjectionThroughUnion":1631262,"EliminateSubqueryAliases":4206194,"ResolveCatalogs":23527,"PushLeftSemiLeftAntiThroughJoin":1489244,"FlattenScalarSubqueriesWithAggregates":3863366,"LikeSimplification":5425317,"CollapseRepartition":2465750,"ResolveHints$ResolveCoalesceHints":20038,"Analyzer$ExtractGenerator":38890,"RewriteIntersectAll":627146,"ResolveHints$ResolveJoinStrategyHints":21154,"TypeCoercion$MapZipWithCoercion":26613,"NullPropagation":15733614,"PullupCorrelatedPredicates":1289379,"UpdateOuterReferences":23104,"ExtractPythonUDFs":1115243,"Analyzer$WindowsSubstitution":30716,"CombineUnions":1999965,"ExtractGroupingPythonUDFFromAggregate":1582878,"ReorderAssociativeOperator":7933245,"CleanupDynamicPruningFilters":2408477,"ResolveHints$RemoveAllHints":20406,"SimplifyBinaryComparison":3678265,"ResolveTableValuedFunctions":24373,"EliminateSerialization":1180344,"TypeCoercion$BooleanEquality":79174,"package$ExpressionCanonicalizer$CleanExpressions":825749,"ReplaceIntersectWithSemiJoin":1174735,"ConstantPropagation":2691863,"CostBasedJoinReorder":23176,"Analyzer$ResolveReferences":74041,"CTESubstitution":1150505,"RemoveRedundantAliases":12059838,"TypeCoercion$ImplicitTypeCasts":26112,"RewriteExceptAll":636663,"UpdateAttributeNullability":134291,"PropagateEmptyRelation":12980320,"SimplifyCasts":6493467,"EliminateMapObjects":1129024,"CombineLimits":1391153,"DetectAmbiguousSelfJoin":1716998,"ReplaceExpressions":1371904,"ResolveInlineTables":38964,"OptimizeIn":2596173,"CollapseWindow":1367762,"TypeCoercion$IfCoercion":20966,"ResolveSessionCatalog":26402,"PartitionPruning":405147,"BooleanSimplification":7034649,"TypeCoercion$PromoteStrings":26861,"Analyzer$ResolveAliases":16085,"DecimalAggregates":1101506,"PruneFilters":36343716,"Analyzer$ResolveMissingReferences":15777,"TransposeWindow":1432335,"Analyzer$ResolveRelations":39942,"EliminateUnions":27946,"RewritePredicateSubquery":1034141,"ObjectSerializerPruning":407524,"LimitPushDown":1897566,"SimplifyCaseConversionExpressions":3976856,"Analyzer$ResolveNaturalAndUsingJoin":31793,"EliminateView":771523,"CombineTypedFilters":375043,"OptimizeLimitZero":646858,"CheckCartesianProducts":55012,"ExtractPythonUDFFromAggregate":1555787,"Analyzer$ExtractWindowExpressions":31080,"ReplaceExceptWithAntiJoin":630195,"ResolveLambdaVariables":27528,"FallBackFileSourceV2":17384,"Analyzer$ResolveTables":1058791,"SubstituteUnresolvedOrdinals":24988,"TypeCoercion$CaseWhenCoercion":20801,"DecimalPrecision":33402,"EliminateSorts":5027734,"PushDownLeftSemiAntiJoin":1586581,"ExtractPythonUDFFromJoinCondition":388195,"TypeCoercion$StackCoercion":22547,"Analyzer$ResolveAggAliasInGroupBy":17989,"TypeCoercion$StringLiteralCoercion":21495,"FoldablePropagation":1998562,"V2ScanRelationPushDown":2386328,"EliminateDistinct":12922,"InferFiltersFromConstraints":2118772,"Analyzer$PullOutNondeterministic":20264,"Analyzer$ResolveFunctions":22188,"ReplaceNullWithFalseInPredicate":18150892,"ResolveHigherOrderFunctions":26349,"Analyzer$ResolvePivot":17494,"CollapseProject":35101256,"Analyzer$ResolveNewInstance":21753,"ColumnPruning":36814201,"Analyzer$ResolveWindowOrder":20704,"TypeCoercion$ConcatCoercion":26415,"PushDownPredicates":34192568,"TimeWindowing":377163,"Optimizer$OptimizeSubqueries":2518451,"RewriteNonCorrelatedExists":16023362,"DemoteBroadcastHashJoin":1583625,"TypeCoercion$Division":23352,"ComputeCurrentTime":2544030,"ResolveCreateNamedStruct":27423,"TypeCoercion$EltCoercion":24494,"ConvertToLocalRelation":1036580,"RemoveRepetitionFromGroupExpressions":901093,"ReplaceDistinctWithAggregate":613429,"PreprocessTableCreation":35768,"ResolveSQLOnFile":18463,"Analyzer$ResolveSubquery":16859,"CombineConcats":35171,"Analyzer$ResolveGroupingAnalytics":23341,"Analyzer$ResolveBinaryArithmetic":24084,"RemoveDispensableExpressions":3238421,"Analyzer$ResolveAlterTableChanges":23583,"ResolveEncodersInScalaAgg":25882,"TypeCoercion$IntegralDivision":21397,"Analyzer$ResolveWindowFrame":298934,"Analyzer$ResolveDeserializer":23038,"RewriteDistinctAggregates":2712033,"RemoveNoopOperators":21808704,"Analyzer$ResolveAggregateFunctions":20854,"NormalizeFloatingNumbers":983677,"ReorderJoin":1504201,"Analyzer$ResolveUpCast":21253,"Analyzer$ResolveGenerate":21750,"TypeCoercion$WidenSetOperationTypes":17883,"EliminateOuterJoin":1531845,"SimplifyExtractValueOps":3665075,"OptimizeMetadataOnlyQuery":19828,"EliminateResolvedHint":2404985,"Analyzer$ResolveInsertInto":19118,"ReplaceExceptWithFilter":701989,"CleanupAliases":30232,"GetCurrentDatabase":2846042,"SchemaPruning":4676256,"Analyzer$ResolveOutputRelation":18745,"BloomFilterJoinRule":414505,"Analyzer$ResolveRandomSeed":17909,"TypeCoercion$WindowFrameCoercion":21570,"ConstantFolding":8217907,"TypeCoercion$DateTimeOperations":22064,"TypeCoercion$InConversion":23842,"FindDataSourceTable":23797,"SimplifyConditionals":5962134,"DataSourceAnalysis":19857,"TypeCoercion$FunctionArgumentConversion":29252,"Analyzer$GlobalAggregates":19799,"Analyzer$LookupFunctions":50673,"CombineFilters":1703830,"ReplaceDeduplicateWithAggregate":664176,"PreprocessTableInsertion":21103},"numRunsPerRule":{"PruneFileSourcePartitions":1,"ReassignLambdaVariableID":1,"PushPredicateThroughNonJoin":1,"Analyzer$HandleNullInputsForUDF":1,"Analyzer$ResolveSubqueryColumnAliases":1,"ResolveTimeZone":1,"Analyzer$ResolveNamespace":1,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":1,"RewriteCorrelatedScalarSubquery":3,"RemoveLiteralFromGroupExpressions":1,"PushProjectionThroughUnion":3,"EliminateSubqueryAliases":1,"ResolveCatalogs":1,"PushLeftSemiLeftAntiThroughJoin":3,"FlattenScalarSubqueriesWithAggregates":1,"LikeSimplification":3,"CollapseRepartition":3,"ResolveHints$ResolveCoalesceHints":1,"Analyzer$ExtractGenerator":1,"RewriteIntersectAll":1,"ResolveHints$ResolveJoinStrategyHints":1,"TypeCoercion$MapZipWithCoercion":1,"NullPropagation":3,"PullupCorrelatedPredicates":1,"UpdateOuterReferences":1,"ExtractPythonUDFs":1,"Analyzer$WindowsSubstitution":1,"CombineUnions":4,"ExtractGroupingPythonUDFFromAggregate":1,"ReorderAssociativeOperator":3,"CleanupDynamicPruningFilters":1,"ResolveHints$RemoveAllHints":1,"SimplifyBinaryComparison":3,"ResolveTableValuedFunctions":1,"EliminateSerialization":3,"TypeCoercion$BooleanEquality":1,"package$ExpressionCanonicalizer$CleanExpressions":8,"ReplaceIntersectWithSemiJoin":1,"ConstantPropagation":3,"CostBasedJoinReorder":1,"Analyzer$ResolveReferences":1,"CTESubstitution":1,"RemoveRedundantAliases":3,"TypeCoercion$ImplicitTypeCasts":1,"RewriteExceptAll":1,"UpdateAttributeNullability":1,"PropagateEmptyRelation":2,"SimplifyCasts":3,"EliminateMapObjects":1,"CombineLimits":3,"DetectAmbiguousSelfJoin":1,"ReplaceExpressions":1,"ResolveInlineTables":1,"OptimizeIn":3,"CollapseWindow":3,"TypeCoercion$IfCoercion":1,"ResolveSessionCatalog":1,"PartitionPruning":1,"BooleanSimplification":3,"TypeCoercion$PromoteStrings":1,"Analyzer$ResolveAliases":1,"DecimalAggregates":1,"PruneFilters":4,"Analyzer$ResolveMissingReferences":1,"TransposeWindow":3,"Analyzer$ResolveRelations":1,"EliminateUnions":1,"RewritePredicateSubquery":1,"ObjectSerializerPruning":1,"LimitPushDown":3,"SimplifyCaseConversionExpressions":3,"Analyzer$ResolveNaturalAndUsingJoin":1,"EliminateView":1,"CombineTypedFilters":1,"OptimizeLimitZero":1,"CheckCartesianProducts":2,"ExtractPythonUDFFromAggregate":1,"Analyzer$ExtractWindowExpressions":1,"ReplaceExceptWithAntiJoin":1,"ResolveLambdaVariables":1,"FallBackFileSourceV2":1,"Analyzer$ResolveTables":1,"SubstituteUnresolvedOrdinals":1,"TypeCoercion$CaseWhenCoercion":1,"DecimalPrecision":1,"EliminateSorts":1,"PushDownLeftSemiAntiJoin":3,"ExtractPythonUDFFromJoinCondition":1,"TypeCoercion$StackCoercion":1,"Analyzer$ResolveAggAliasInGroupBy":1,"TypeCoercion$StringLiteralCoercion":1,"FoldablePropagation":3,"V2ScanRelationPushDown":1,"EliminateDistinct":1,"InferFiltersFromConstraints":1,"Analyzer$PullOutNondeterministic":1,"Analyzer$ResolveFunctions":1,"ReplaceNullWithFalseInPredicate":3,"ResolveHigherOrderFunctions":1,"Analyzer$ResolvePivot":1,"CollapseProject":4,"Analyzer$ResolveNewInstance":1,"ColumnPruning":5,"Analyzer$ResolveWindowOrder":1,"TypeCoercion$ConcatCoercion":1,"PushDownPredicates":5,"TimeWindowing":1,"Optimizer$OptimizeSubqueries":3,"RewriteNonCorrelatedExists":1,"DemoteBroadcastHashJoin":2,"TypeCoercion$Division":1,"ComputeCurrentTime":1,"ResolveCreateNamedStruct":1,"TypeCoercion$EltCoercion":1,"ConvertToLocalRelation":2,"RemoveRepetitionFromGroupExpressions":1,"ReplaceDistinctWithAggregate":1,"PreprocessTableCreation":1,"ResolveSQLOnFile":1,"Analyzer$ResolveSubquery":1,"CombineConcats":3,"Analyzer$ResolveGroupingAnalytics":1,"Analyzer$ResolveBinaryArithmetic":1,"RemoveDispensableExpressions":3,"Analyzer$ResolveAlterTableChanges":1,"ResolveEncodersInScalaAgg":1,"TypeCoercion$IntegralDivision":1,"Analyzer$ResolveWindowFrame":1,"Analyzer$ResolveDeserializer":1,"RewriteDistinctAggregates":1,"RemoveNoopOperators":5,"Analyzer$ResolveAggregateFunctions":1,"NormalizeFloatingNumbers":1,"ReorderJoin":3,"Analyzer$ResolveUpCast":1,"Analyzer$ResolveGenerate":1,"TypeCoercion$WidenSetOperationTypes":1,"EliminateOuterJoin":3,"SimplifyExtractValueOps":3,"OptimizeMetadataOnlyQuery":1,"EliminateResolvedHint":1,"Analyzer$ResolveInsertInto":1,"ReplaceExceptWithFilter":1,"CleanupAliases":1,"GetCurrentDatabase":1,"SchemaPruning":1,"Analyzer$ResolveOutputRelation":1,"BloomFilterJoinRule":1,"Analyzer$ResolveRandomSeed":1,"TypeCoercion$WindowFrameCoercion":1,"ConstantFolding":3,"TypeCoercion$DateTimeOperations":1,"TypeCoercion$InConversion":1,"FindDataSourceTable":1,"SimplifyConditionals":3,"DataSourceAnalysis":1,"TypeCoercion$FunctionArgumentConversion":1,"Analyzer$GlobalAggregates":1,"Analyzer$LookupFunctions":1,"CombineFilters":4,"ReplaceDeduplicateWithAggregate":1,"PreprocessTableInsertion":1},"numEffectiveRunsPerRule":{"EliminateSubqueryAliases":1,"LikeSimplification":1,"SimplifyCasts":1,"RewritePredicateSubquery":1,"InferFiltersFromConstraints":1,"CollapseProject":1,"ColumnPruning":1,"PushDownPredicates":1,"ConstantFolding":1},"timeEffectiveRunsPerRule":{"EliminateSubqueryAliases":4206194,"LikeSimplification":3132604,"SimplifyCasts":5097741,"RewritePredicateSubquery":1034141,"InferFiltersFromConstraints":2118772,"CollapseProject":33418665,"ColumnPruning":18210290,"PushDownPredicates":31603282,"ConstantFolding":5681845},"counters":{},"timers":{}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":5,"time":1678162990095} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":6,"description":"save at NativeMethodAccessorImpl.java:0","details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]\n+- Aggregate [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count(1) AS count#537L]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]\n+- Aggregate [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count(1) AS count#537L]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]\n+- Aggregate [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count(1) AS count#537L]\n +- Project [url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13 AS Site#247, content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]\n+- AdaptiveSparkPlan isFinalPlan=false\n +- HashAggregate(keys=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], functions=[count(1)], output=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count#537L])\n +- Exchange hashpartitioning(Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, 1000), true, [id=#287]\n +- HashAggregate(keys=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], functions=[partial_count(1)], output=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count#1662L])\n +- Project [url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13 AS Site#247, content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- InMemoryTableScan [content_charset#26, content_languages#27, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13, url_host_tld#7]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=false","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], functions=[count(1)])","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, 1000), true, [id=#287]","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], functions=[partial_count(1)])","children":[{"nodeName":"Project","simpleString":"Project [url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13 AS Site#247, content_charset#26 AS Charset#311, content_languages#27 AS Language#279]","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [content_charset#26, content_languages#27, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13, url_host_tld#7]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":397,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":394,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":395,"metricType":"timing"},{"name":"peak memory","accumulatorId":393,"metricType":"size"},{"name":"number of output rows","accumulatorId":392,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":396,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":359,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":360,"metricType":"nsTiming"},{"name":"records read","accumulatorId":357,"metricType":"sum"},{"name":"local bytes read","accumulatorId":355,"metricType":"size"},{"name":"fetch wait time","accumulatorId":356,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":353,"metricType":"size"},{"name":"local blocks read","accumulatorId":352,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":351,"metricType":"sum"},{"name":"data size","accumulatorId":350,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":354,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":358,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":389,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":390,"metricType":"timing"},{"name":"peak memory","accumulatorId":388,"metricType":"size"},{"name":"number of output rows","accumulatorId":387,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":391,"metricType":"average"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":383,"metricType":"sum"},{"name":"written output","accumulatorId":384,"metricType":"size"},{"name":"number of output rows","accumulatorId":385,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":386,"metricType":"sum"}]},"time":1678162990226} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerEffectiveSQLConf","executionId":6,"effectiveSQLConf":{"spark.sql.adaptive.coalescePartitions.initialPartitionNum":"","spark.sql.streaming.disabledV2Writers":"","spark.sql.streaming.noDataMicroBatches.enabled":"true","spark.sql.files.maxPartitionBytes":"128MB","spark.sql.thriftserver.ui.retainedStatements":"200","spark.sql.ui.retainedExecutions":"1000","spark.sql.execution.arrow.pyspark.fallback.enabled":"","spark.sql.sources.parallelPartitionDiscovery.threshold":"32","spark.sql.cbo.joinReorder.enabled":"false","spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.int96AsTimestamp":"true","spark.sql.thriftserver.ui.retainedSessions":"200","spark.sql.hive.verifyPartitionPath":"false","spark.sql.autoBroadcastJoinThreshold":"10MB","spark.sql.orc.mergeSchema":"false","spark.sql.adaptive.enabled":"true","spark.sql.adaptive.skewJoin.enabled":"true","spark.sql.optimizer.sizeBasedJoinReorder.enabled":"true","spark.sql.streaming.ui.retainedQueries":"100","spark.sql.orc.enableVectorizedReader":"true","spark.sql.streaming.continuous.epochBacklogQueueSize":"10000","spark.sql.parquet.binaryAsString":"false","spark.sql.pivotMaxValues":"10000","spark.sql.adaptive.localShuffleReader.enabled":"true","spark.sql.storeAssignmentPolicy":"ANSI","spark.sql.redaction.options.regex":"(?i)url","spark.sql.cbo.joinReorder.dp.star.filter":"false","spark.sql.parser.quotedRegexColumnNames":"false","spark.sql.files.ignoreMissingFiles":"false","spark.sql.streaming.ui.enabled":"true","spark.sql.execution.arrow.sparkr.enabled":"false","spark.sql.orc.filterPushdown":"true","spark.sql.thriftserver.scheduler.pool":"","spark.sql.catalog.spark_catalog":"","spark.sql.datetime.java8API.enabled":"false","spark.sql.execution.pandas.udf.buffer.size":"","spark.sql.function.eltOutputAsString":"false","spark.sql.cbo.joinReorder.dp.threshold":"12","spark.sql.statsImprovements.enabled":"true","spark.sql.parquet.columnarReaderBatchSize":"4096","spark.sql.parquet.outputTimestampType":"INT96","spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes":"256MB","spark.sql.statistics.size.autoUpdate.enabled":"false","spark.sql.hive.filesourcePartitionFileCacheSize":"262144000","spark.sql.streaming.ui.retainedProgressUpdates":"100","spark.sql.inMemoryColumnarStorage.compressed":"true","spark.sql.pyspark.jvmStacktrace.enabled":"false","spark.sql.cbo.starSchemaDetection":"false","spark.sql.columnNameOfCorruptRecord":"_corrupt_record","spark.sql.adaptive.advisoryPartitionSizeInBytes":"","spark.sql.avro.compression.codec":"snappy","spark.sql.streaming.metricsEnabled":"false","spark.sql.ui.pruneCachedInMemoryRelation":"true","spark.sql.event.truncate.length":"2147483647","spark.sql.groupByAliases":"true","spark.sql.hive.metastorePartitionPruning":"true","spark.sql.execution.arrow.enabled":"false","spark.sql.optimizer.dynamicPartitionPruning.enabled":"true","spark.sql.extendedEventInfo":"true","spark.sql.sources.bucketing.maxBuckets":"100000","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.jsonGenerator.ignoreNullFields":"true","spark.sql.optimizer.flattenScalarSubqueriesWithAggregates.enabled":"true","spark.sql.debug.maxToStringFields":"25","spark.sql.files.maxRecordsPerFile":"0","spark.sql.hive.thriftServer.singleSession":"false","spark.sql.parquet.mergeSchema":"false","spark.sql.adaptive.skewJoin.skewedPartitionFactor":"5","spark.sql.queryExecutionListeners":"","spark.sql.streaming.numRecentProgressUpdates":"100","spark.sql.variable.substitute":"true","spark.sql.adaptive.shuffle.mapOutputStats.useDataSize":"true","spark.sql.function.concatBinaryAsString":"false","spark.sql.avro.deflate.level":"-1","spark.sql.repl.eagerEval.maxNumRows":"20","spark.sql.ansi.enabled":"false","spark.sql.legacy.allowHashOnMapType":"false","spark.sql.bloomFilterJoin.maxFilterBytes":"2097152","spark.sql.orderByOrdinal":"true","spark.sql.streaming.stopTimeout":"0","spark.sql.adaptive.coalescePartitions.enabled":"true","spark.sql.session.timeZone":"UTC","spark.sql.parquet.respectSummaryFiles":"false","spark.sql.parquet.enableVectorizedReader":"true","spark.sql.cbo.enabled":"false","spark.sql.sources.bucketing.enabled":"true","spark.sql.csv.filterPushdown.enabled":"true","spark.sql.repl.eagerEval.truncate":"20","spark.sql.cbo.planStats.enabled":"false","spark.sql.optimizer.excludedRules":"","spark.sql.sources.partitionColumnTypeInference.enabled":"true","spark.sql.sortMergeJoinExec.extendedCodegen.enabled":"true","spark.sql.execution.arrow.fallback.enabled":"true","spark.sql.streaming.forceDeleteTempCheckpointLocation":"false","spark.sql.maxPlanStringLength":"2147483632","spark.sql.hive.manageFilesourcePartitions":"true","spark.sql.parquet.int96TimestampConversion":"false","spark.sql.sources.partitionOverwriteMode":"STATIC","spark.sql.streaming.checkpointLocation":"","spark.sql.broadcastTimeout":"300","spark.sql.execution.arrow.pyspark.enabled":"","spark.sql.repl.eagerEval.enabled":"false","spark.sql.mapKeyDedupPolicy":"EXCEPTION","spark.sql.parquet.filterPushdown":"true","spark.sql.maven.additionalRemoteRepositories":"https://maven-central.storage-download.googleapis.com/maven2/","spark.sql.orc.compression.codec":"snappy","spark.sql.statistics.histogram.enabled":"false","spark.sql.streaming.stopActiveRunOnRestart":"true","spark.sql.orc.columnarReaderBatchSize":"4096","spark.sql.redaction.string.regex":"","spark.sql.inMemoryColumnarStorage.batchSize":"10000","spark.sql.parquet.writeLegacyFormat":"false","spark.sql.statistics.fallBackToHdfs":"false","spark.sql.defaultCatalog":"spark_catalog","spark.sql.streaming.fileSource.cleaner.numThreads":"1","spark.sql.legacy.sessionInitWithConfigDefaults":"false","spark.sql.adaptive.coalescePartitions.minPartitionNum":"","spark.sql.execution.arrow.maxRecordsPerBatch":"10000","spark.sql.stringHashComputationWithoutCopyMemory.enabled":"true","spark.sql.streaming.streamingQueryListeners":"","spark.sql.files.ignoreCorruptFiles":"false","spark.sql.inMemoryColumnarStorage.enableVectorizedReader":"true","spark.sql.extensions":"","spark.sql.sources.default":"parquet","spark.sql.ui.extendedInfo":"true","spark.sql.bloomFilterJoin.enabled":"true","spark.sql.parquet.compression.codec":"snappy","spark.sql.optimizer.distinctBeforeIntersect.enabled":"true","spark.sql.parquet.recordLevelFilter.enabled":"false","spark.sql.shuffle.partitions":"200","spark.sql.groupByOrdinal":"true"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate","executionId":6,"physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]\n+- Aggregate [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count(1) AS count#537L]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]\n+- Aggregate [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count(1) AS count#537L]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]\n+- Aggregate [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count(1) AS count#537L]\n +- Project [url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13 AS Site#247, content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]\n+- AdaptiveSparkPlan isFinalPlan=false\n +- HashAggregate(keys=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], functions=[count(1)], output=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count#537L])\n +- ShuffleQueryStage 0\n +- Exchange hashpartitioning(Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, 1000), true, [id=#302]\n +- *(1) HashAggregate(keys=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], functions=[partial_count(1)], output=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count#1662L])\n +- *(1) Project [url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13 AS Site#247, content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- InMemoryTableScan [content_charset#26, content_languages#27, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13, url_host_tld#7]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=false","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], functions=[count(1)])","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 0","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, 1000), true, [id=#302]","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], functions=[partial_count(1)])","children":[{"nodeName":"Project","simpleString":"Project [url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13 AS Site#247, content_charset#26 AS Charset#311, content_languages#27 AS Language#279]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [content_charset#26, content_languages#27, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13, url_host_tld#7]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":397,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":417,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":418,"metricType":"timing"},{"name":"peak memory","accumulatorId":416,"metricType":"size"},{"name":"number of output rows","accumulatorId":415,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":419,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":414,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":407,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":408,"metricType":"nsTiming"},{"name":"records read","accumulatorId":405,"metricType":"sum"},{"name":"local bytes read","accumulatorId":403,"metricType":"size"},{"name":"fetch wait time","accumulatorId":404,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":401,"metricType":"size"},{"name":"local blocks read","accumulatorId":400,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":399,"metricType":"sum"},{"name":"data size","accumulatorId":398,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":402,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":406,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":411,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":412,"metricType":"timing"},{"name":"peak memory","accumulatorId":410,"metricType":"size"},{"name":"number of output rows","accumulatorId":409,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":413,"metricType":"average"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":383,"metricType":"sum"},{"name":"written output","accumulatorId":384,"metricType":"size"},{"name":"number of output rows","accumulatorId":385,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":386,"metricType":"sum"}]}} -{"Event":"SparkListenerJobStart","Job ID":4,"Submission Time":1678162990571,"Stage Infos":[{"Stage ID":7,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":26,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"68\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[25],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":24,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"73\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[23],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":25,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"69\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[24],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":23,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"73\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]}],"Stage IDs":[7],"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"68\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"6","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":7,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":26,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"68\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[25],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":24,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"73\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[23],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":25,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"69\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[24],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":23,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"73\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162990576,"Accumulables":[]},"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"68\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"6","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":36,"Index":7,"Attempt":0,"Launch Time":1678162990601,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":37,"Index":2,"Attempt":0,"Launch Time":1678162990601,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":38,"Index":1,"Attempt":0,"Launch Time":1678162990601,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":39,"Index":4,"Attempt":0,"Launch Time":1678162990602,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":40,"Index":3,"Attempt":0,"Launch Time":1678162990602,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":41,"Index":6,"Attempt":0,"Launch Time":1678162990602,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":42,"Index":5,"Attempt":0,"Launch Time":1678162990602,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":43,"Index":0,"Attempt":0,"Launch Time":1678162990603,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":44,"Index":15,"Attempt":0,"Launch Time":1678162990603,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":45,"Index":10,"Attempt":0,"Launch Time":1678162990603,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":46,"Index":9,"Attempt":0,"Launch Time":1678162990603,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":47,"Index":12,"Attempt":0,"Launch Time":1678162990604,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":48,"Index":11,"Attempt":0,"Launch Time":1678162990604,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":49,"Index":14,"Attempt":0,"Launch Time":1678162990604,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":50,"Index":13,"Attempt":0,"Launch Time":1678162990605,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":51,"Index":8,"Attempt":0,"Launch Time":1678162990605,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":52,"Index":23,"Attempt":0,"Launch Time":1678162990605,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":53,"Index":18,"Attempt":0,"Launch Time":1678162990605,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":54,"Index":17,"Attempt":0,"Launch Time":1678162990606,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":55,"Index":20,"Attempt":0,"Launch Time":1678162990606,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":56,"Index":19,"Attempt":0,"Launch Time":1678162990606,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":57,"Index":22,"Attempt":0,"Launch Time":1678162990606,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":58,"Index":21,"Attempt":0,"Launch Time":1678162990607,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":59,"Index":16,"Attempt":0,"Launch Time":1678162990607,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":60,"Index":31,"Attempt":0,"Launch Time":1678162990607,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":61,"Index":26,"Attempt":0,"Launch Time":1678162990607,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":62,"Index":25,"Attempt":0,"Launch Time":1678162990613,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":63,"Index":28,"Attempt":0,"Launch Time":1678162990614,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":64,"Index":27,"Attempt":0,"Launch Time":1678162990614,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":65,"Index":30,"Attempt":0,"Launch Time":1678162990614,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":66,"Index":29,"Attempt":0,"Launch Time":1678162990615,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":67,"Index":24,"Attempt":0,"Launch Time":1678162990615,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":38,"Index":1,"Attempt":0,"Launch Time":1678162990601,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990854,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"64","Value":"64","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":46,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":294912,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":3704,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":37083937,"Value":37083937,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":179,"Value":179,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11832221,"Value":11832221,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":46,"Value":46,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":46,"Executor Deserialize CPU Time":11832221,"Executor Run Time":179,"Executor CPU Time":37083937,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":54,"Index":17,"Attempt":0,"Launch Time":1678162990606,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990854,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"54","Value":"118","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":92,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":589824,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":7408,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":71193481,"Value":108277418,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":192,"Value":371,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7985801,"Value":19818022,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":26,"Value":72,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":26,"Executor Deserialize CPU Time":7985801,"Executor Run Time":192,"Executor CPU Time":71193481,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":62,"Index":25,"Attempt":0,"Launch Time":1678162990613,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990857,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"65","Value":"183","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"786432","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":138,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":884736,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":11112,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":34768477,"Value":143045895,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":197,"Value":568,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6754024,"Value":26572046,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":23,"Value":95,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":23,"Executor Deserialize CPU Time":6754024,"Executor Run Time":197,"Executor CPU Time":34768477,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":46,"Index":9,"Attempt":0,"Launch Time":1678162990603,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990861,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"61","Value":"244","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"1048576","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":184,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1179648,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":14816,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":31039667,"Value":174085562,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":185,"Value":753,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7114043,"Value":33686089,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":42,"Value":137,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":42,"Executor Deserialize CPU Time":7114043,"Executor Run Time":185,"Executor CPU Time":31039667,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":37,"Index":2,"Attempt":0,"Launch Time":1678162990601,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990865,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"78","Value":"322","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"1310720","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":230,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1474560,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":18520,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":84151587,"Value":258237149,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":197,"Value":950,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7484568,"Value":41170657,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":39,"Value":176,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":39,"Executor Deserialize CPU Time":7484568,"Executor Run Time":197,"Executor CPU Time":84151587,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":45,"Index":10,"Attempt":0,"Launch Time":1678162990603,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990868,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"82","Value":"404","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"1572864","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":276,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1769472,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":22224,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":27144554,"Value":285381703,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":195,"Value":1145,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6907346,"Value":48078003,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":38,"Value":214,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":38,"Executor Deserialize CPU Time":6907346,"Executor Run Time":195,"Executor CPU Time":27144554,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":61,"Index":26,"Attempt":0,"Launch Time":1678162990607,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990871,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"88","Value":"492","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"1835008","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"4","Value":"4","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":322,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2064384,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":25928,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":70054206,"Value":355435909,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":204,"Value":1349,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7346786,"Value":55424789,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":38,"Value":252,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":38,"Executor Deserialize CPU Time":7346786,"Executor Run Time":204,"Executor CPU Time":70054206,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":53,"Index":18,"Attempt":0,"Launch Time":1678162990605,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990871,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"87","Value":"579","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"2097152","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"2","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":368,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2359296,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":29632,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":26955003,"Value":382390912,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":191,"Value":1540,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":12881464,"Value":68306253,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":49,"Value":301,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":49,"Executor Deserialize CPU Time":12881464,"Executor Run Time":191,"Executor CPU Time":26955003,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":44,"Index":15,"Attempt":0,"Launch Time":1678162990603,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990883,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"79","Value":"658","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"2359296","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":414,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2654208,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":33336,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":40553416,"Value":422944328,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":216,"Value":1756,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":17868080,"Value":86174333,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":43,"Value":344,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":43,"Executor Deserialize CPU Time":17868080,"Executor Run Time":216,"Executor CPU Time":40553416,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":60,"Index":31,"Attempt":0,"Launch Time":1678162990607,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990883,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"80","Value":"738","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"2621440","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":460,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2949120,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":37040,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":50559943,"Value":473504271,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":213,"Value":1969,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":13057335,"Value":99231668,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":43,"Value":387,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":43,"Executor Deserialize CPU Time":13057335,"Executor Run Time":213,"Executor CPU Time":50559943,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":36,"Index":7,"Attempt":0,"Launch Time":1678162990601,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990888,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"84","Value":"822","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"2883584","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":506,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3244032,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":40744,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":40418664,"Value":513922935,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":220,"Value":2189,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11833801,"Value":111065469,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":44,"Value":431,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":44,"Executor Deserialize CPU Time":11833801,"Executor Run Time":220,"Executor CPU Time":40418664,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":52,"Index":23,"Attempt":0,"Launch Time":1678162990605,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990894,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"90","Value":"912","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"3145728","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":552,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3538944,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":44448,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":92996721,"Value":606919656,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":227,"Value":2416,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11539577,"Value":122605046,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":42,"Value":473,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":42,"Executor Deserialize CPU Time":11539577,"Executor Run Time":227,"Executor CPU Time":92996721,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":7,"Stage Attempt ID":0,"Task Info":{"Task ID":68,"Index":32,"Attempt":0,"Launch Time":1678162990914,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":63,"Index":28,"Attempt":0,"Launch Time":1678162990614,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990915,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"71","Value":"983","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"3407872","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":598,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3833856,"Internal":true,"Count Failed Values":true},{"ID":426,"Name":"internal.metrics.resultSerializationTime","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3747,"Value":48195,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":65780719,"Value":672700375,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":201,"Value":2617,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":12679202,"Value":135284248,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":71,"Value":544,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":71,"Executor Deserialize CPU Time":12679202,"Executor Run Time":201,"Executor CPU Time":65780719,"Peak Execution Memory":294912,"Result Size":3747,"JVM GC Time":0,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":47,"Index":12,"Attempt":0,"Launch Time":1678162990604,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990919,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"73","Value":"1056","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"3670016","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":644,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4128768,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":51899,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":33738577,"Value":706438952,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":213,"Value":2830,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":12023300,"Value":147307548,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":63,"Value":607,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":63,"Executor Deserialize CPU Time":12023300,"Executor Run Time":213,"Executor CPU Time":33738577,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":55,"Index":20,"Attempt":0,"Launch Time":1678162990606,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990920,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"76","Value":"1132","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"3932160","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":690,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4423680,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":55603,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":47410768,"Value":753849720,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":212,"Value":3042,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11045615,"Value":158353163,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":65,"Value":672,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":65,"Executor Deserialize CPU Time":11045615,"Executor Run Time":212,"Executor CPU Time":47410768,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":57,"Index":22,"Attempt":0,"Launch Time":1678162990606,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990920,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"53","Value":"1185","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"4194304","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":736,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4718592,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":59307,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":23775227,"Value":777624947,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":188,"Value":3230,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":8431652,"Value":166784815,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":80,"Value":752,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":80,"Executor Deserialize CPU Time":8431652,"Executor Run Time":188,"Executor CPU Time":23775227,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":49,"Index":14,"Attempt":0,"Launch Time":1678162990604,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990928,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"55","Value":"1240","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"4456448","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":782,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5013504,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":63011,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":103555165,"Value":881180112,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":211,"Value":3441,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":15266346,"Value":182051161,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":62,"Value":814,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":62,"Executor Deserialize CPU Time":15266346,"Executor Run Time":211,"Executor CPU Time":103555165,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":39,"Index":4,"Attempt":0,"Launch Time":1678162990602,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990928,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"71","Value":"1311","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":828,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5308416,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":66715,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":50668091,"Value":931848203,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":223,"Value":3664,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":15389316,"Value":197440477,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":57,"Value":871,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":57,"Executor Deserialize CPU Time":15389316,"Executor Run Time":223,"Executor CPU Time":50668091,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":43,"Index":0,"Attempt":0,"Launch Time":1678162990603,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990935,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"74","Value":"1385","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"4980736","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":874,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5603328,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":70419,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":71691936,"Value":1003540139,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":194,"Value":3858,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":9698926,"Value":207139403,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":91,"Value":962,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":91,"Executor Deserialize CPU Time":9698926,"Executor Run Time":194,"Executor CPU Time":71691936,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":51,"Index":8,"Attempt":0,"Launch Time":1678162990605,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990935,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"76","Value":"1461","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"5242880","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":920,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5898240,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":74123,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":44219301,"Value":1047759440,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":208,"Value":4066,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":16558084,"Value":223697487,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":78,"Value":1040,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":78,"Executor Deserialize CPU Time":16558084,"Executor Run Time":208,"Executor CPU Time":44219301,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":59,"Index":16,"Attempt":0,"Launch Time":1678162990607,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990939,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"76","Value":"1537","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"5505024","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":966,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6193152,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":77827,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":56415182,"Value":1104174622,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":224,"Value":4290,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":10563362,"Value":234260849,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":61,"Value":1101,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":61,"Executor Deserialize CPU Time":10563362,"Executor Run Time":224,"Executor CPU Time":56415182,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":67,"Index":24,"Attempt":0,"Launch Time":1678162990615,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990945,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"84","Value":"1621","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"5767168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"1","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1012,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6488064,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":81531,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":55389061,"Value":1159563683,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":215,"Value":4505,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":8273412,"Value":242534261,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":78,"Value":1179,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":78,"Executor Deserialize CPU Time":8273412,"Executor Run Time":215,"Executor CPU Time":55389061,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":41,"Index":6,"Attempt":0,"Launch Time":1678162990602,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990948,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"75","Value":"1696","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"6029312","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1058,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6782976,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":85235,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":19413949,"Value":1178977632,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":228,"Value":4733,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":10586062,"Value":253120323,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":69,"Value":1248,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":69,"Executor Deserialize CPU Time":10586062,"Executor Run Time":228,"Executor CPU Time":19413949,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":65,"Index":30,"Attempt":0,"Launch Time":1678162990614,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990951,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"76","Value":"1772","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"6291456","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1104,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7077888,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":88939,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":50978166,"Value":1229955798,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":250,"Value":4983,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":8664373,"Value":261784696,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":45,"Value":1293,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":45,"Executor Deserialize CPU Time":8664373,"Executor Run Time":250,"Executor CPU Time":50978166,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":68,"Index":32,"Attempt":0,"Launch Time":1678162990914,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990951,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"9","Value":"1781","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"6553600","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1150,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7372800,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":92643,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":14351311,"Value":1244307109,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":16,"Value":4999,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":3814989,"Value":265599685,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":4,"Value":1297,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":4,"Executor Deserialize CPU Time":3814989,"Executor Run Time":16,"Executor CPU Time":14351311,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":58,"Index":21,"Attempt":0,"Launch Time":1678162990607,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990961,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"71","Value":"1852","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"6815744","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1196,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7667712,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":96347,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":58835547,"Value":1303142656,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":261,"Value":5260,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11416406,"Value":277016091,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":50,"Value":1347,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":892630656,"JVMOffHeapMemory":123492160,"OnHeapExecutionMemory":65536,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":667115,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":732651,"OffHeapUnifiedMemory":0,"DirectPoolMemory":19676008,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7151345664,"ProcessTreeJVMRSSMemory":1655054336,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":11,"MinorGCTime":223,"MajorGCCount":3,"MajorGCTime":283},"Task Metrics":{"Executor Deserialize Time":50,"Executor Deserialize CPU Time":11416406,"Executor Run Time":261,"Executor CPU Time":58835547,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":42,"Index":5,"Attempt":0,"Launch Time":1678162990602,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990961,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"71","Value":"1923","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"7077888","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1242,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7962624,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":100051,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":66422831,"Value":1369565487,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":223,"Value":5483,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11488196,"Value":288504287,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":89,"Value":1436,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":892630656,"JVMOffHeapMemory":123492160,"OnHeapExecutionMemory":65536,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":667115,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":732651,"OffHeapUnifiedMemory":0,"DirectPoolMemory":19676008,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7151345664,"ProcessTreeJVMRSSMemory":1655054336,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":11,"MinorGCTime":223,"MajorGCCount":3,"MajorGCTime":283},"Task Metrics":{"Executor Deserialize Time":89,"Executor Deserialize CPU Time":11488196,"Executor Run Time":223,"Executor CPU Time":66422831,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":48,"Index":11,"Attempt":0,"Launch Time":1678162990604,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990962,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"71","Value":"1994","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"7340032","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1288,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8257536,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":103755,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":31804881,"Value":1401370368,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":239,"Value":5722,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":13505518,"Value":302009805,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":81,"Value":1517,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":81,"Executor Deserialize CPU Time":13505518,"Executor Run Time":239,"Executor CPU Time":31804881,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":40,"Index":3,"Attempt":0,"Launch Time":1678162990602,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990962,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"72","Value":"2066","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"7602176","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1334,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8552448,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":107459,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":82566623,"Value":1483936991,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":239,"Value":5961,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7618637,"Value":309628442,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":82,"Value":1599,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":82,"Executor Deserialize CPU Time":7618637,"Executor Run Time":239,"Executor CPU Time":82566623,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":64,"Index":27,"Attempt":0,"Launch Time":1678162990614,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990974,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"71","Value":"2137","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"7864320","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1380,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8847360,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":111163,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":23589394,"Value":1507526385,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":268,"Value":6229,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6338441,"Value":315966883,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":49,"Value":1648,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":49,"Executor Deserialize CPU Time":6338441,"Executor Run Time":268,"Executor CPU Time":23589394,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":66,"Index":29,"Attempt":0,"Launch Time":1678162990615,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990974,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"72","Value":"2209","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"8126464","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1426,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9142272,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":114867,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":46349815,"Value":1553876200,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":265,"Value":6494,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11085006,"Value":327051889,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":44,"Value":1692,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":892630656,"JVMOffHeapMemory":123492160,"OnHeapExecutionMemory":65536,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":667115,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":732651,"OffHeapUnifiedMemory":0,"DirectPoolMemory":19676008,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7151345664,"ProcessTreeJVMRSSMemory":1655054336,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":11,"MinorGCTime":223,"MajorGCCount":3,"MajorGCTime":283},"Task Metrics":{"Executor Deserialize Time":44,"Executor Deserialize CPU Time":11085006,"Executor Run Time":265,"Executor CPU Time":46349815,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":50,"Index":13,"Attempt":0,"Launch Time":1678162990605,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990975,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"72","Value":"2281","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"8388608","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1472,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9437184,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":118571,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":32787257,"Value":1586663457,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":267,"Value":6761,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":16976246,"Value":344028135,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":48,"Value":1740,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":892630656,"JVMOffHeapMemory":123492160,"OnHeapExecutionMemory":65536,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":667115,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":732651,"OffHeapUnifiedMemory":0,"DirectPoolMemory":19676008,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":7151345664,"ProcessTreeJVMRSSMemory":1655054336,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":11,"MinorGCTime":223,"MajorGCCount":3,"MajorGCTime":283},"Task Metrics":{"Executor Deserialize Time":48,"Executor Deserialize CPU Time":16976246,"Executor Run Time":267,"Executor CPU Time":32787257,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":7,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":56,"Index":19,"Attempt":0,"Launch Time":1678162990606,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162990977,"Failed":false,"Killed":false,"Accumulables":[{"ID":414,"Name":"duration","Update":"73","Value":"2354","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":416,"Name":"peak memory","Update":"262144","Value":"8650752","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":418,"Name":"time in aggregation build","Update":"0","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1518,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9732096,"Internal":true,"Count Failed Values":true},{"ID":424,"Name":"internal.metrics.resultSize","Update":3704,"Value":122275,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Update":57206426,"Value":1643869883,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Update":268,"Value":7029,"Internal":true,"Count Failed Values":true},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6415841,"Value":350443976,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Update":49,"Value":1789,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":49,"Executor Deserialize CPU Time":6415841,"Executor Run Time":268,"Executor CPU Time":57206426,"Peak Execution Memory":294912,"Result Size":3704,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":7,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":26,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"68\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[25],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":24,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"73\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[23],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":25,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"69\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[24],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":23,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"73\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162990576,"Completion Time":1678162990978,"Accumulables":[{"ID":424,"Name":"internal.metrics.resultSize","Value":122275,"Internal":true,"Count Failed Values":true},{"ID":418,"Name":"time in aggregation build","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":421,"Name":"internal.metrics.executorDeserializeCpuTime","Value":350443976,"Internal":true,"Count Failed Values":true},{"ID":429,"Name":"internal.metrics.peakExecutionMemory","Value":9732096,"Internal":true,"Count Failed Values":true},{"ID":420,"Name":"internal.metrics.executorDeserializeTime","Value":1789,"Internal":true,"Count Failed Values":true},{"ID":414,"Name":"duration","Value":"2354","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":441,"Name":"internal.metrics.input.bytesRead","Value":1518,"Internal":true,"Count Failed Values":true},{"ID":423,"Name":"internal.metrics.executorCpuTime","Value":1643869883,"Internal":true,"Count Failed Values":true},{"ID":426,"Name":"internal.metrics.resultSerializationTime","Value":1,"Internal":true,"Count Failed Values":true},{"ID":422,"Name":"internal.metrics.executorRunTime","Value":7029,"Internal":true,"Count Failed Values":true},{"ID":416,"Name":"peak memory","Value":"8650752","Internal":true,"Count Failed Values":true,"Metadata":"sql"}]}} -{"Event":"SparkListenerJobEnd","Job ID":4,"Completion Time":1678162990987,"Job Result":{"Result":"JobSucceeded"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate","executionId":6,"physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]\n+- Aggregate [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count(1) AS count#537L]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]\n+- Aggregate [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count(1) AS count#537L]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]\n+- Aggregate [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], [Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count(1) AS count#537L]\n +- Project [url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13 AS Site#247, content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]\n+- AdaptiveSparkPlan isFinalPlan=true\n +- *(2) HashAggregate(keys=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], functions=[count(1)], output=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count#537L])\n +- CustomShuffleReader coalesced\n +- ShuffleQueryStage 0\n +- Exchange hashpartitioning(Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, 1000), true, [id=#302]\n +- *(1) HashAggregate(keys=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], functions=[partial_count(1)], output=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, count#1662L])\n +- *(1) Project [url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13 AS Site#247, content_charset#26 AS Charset#311, content_languages#27 AS Language#279]\n +- InMemoryTableScan [content_charset#26, content_languages#27, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13, url_host_tld#7]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_dimension_1GB), Overwrite, [Site, Language, Charset, url_host_tld, url_host_2nd_last_part, url_host_3rd_last_part, count]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=true","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], functions=[count(1)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"CustomShuffleReader","simpleString":"CustomShuffleReader coalesced","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 0","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, 1000), true, [id=#302]","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Language#279, Charset#311, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9], functions=[partial_count(1)])","children":[{"nodeName":"Project","simpleString":"Project [url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13 AS Site#247, content_charset#26 AS Charset#311, content_languages#27 AS Language#279]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [content_charset#26, content_languages#27, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_registered_domain#13, url_host_tld#7]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":397,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":417,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":418,"metricType":"timing"},{"name":"peak memory","accumulatorId":416,"metricType":"size"},{"name":"number of output rows","accumulatorId":415,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":419,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":414,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":407,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":408,"metricType":"nsTiming"},{"name":"records read","accumulatorId":405,"metricType":"sum"},{"name":"local bytes read","accumulatorId":403,"metricType":"size"},{"name":"fetch wait time","accumulatorId":404,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":401,"metricType":"size"},{"name":"local blocks read","accumulatorId":400,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":399,"metricType":"sum"},{"name":"data size","accumulatorId":398,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":402,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":406,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":459,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":460,"metricType":"timing"},{"name":"peak memory","accumulatorId":458,"metricType":"size"},{"name":"number of output rows","accumulatorId":457,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":461,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":456,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":383,"metricType":"sum"},{"name":"written output","accumulatorId":384,"metricType":"size"},{"name":"number of output rows","accumulatorId":385,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":386,"metricType":"sum"}]}} -{"Event":"SparkListenerJobStart","Job ID":5,"Submission Time":1678162991085,"Stage Infos":[{"Stage ID":9,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":28,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"75\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[27],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":27,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"78\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[26],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[8],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]},{"Stage ID":8,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":26,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"68\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[25],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":24,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"73\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[23],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":25,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"69\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[24],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":23,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"73\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]}],"Stage IDs":[9,8],"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"66\",\"name\":\"Execute InsertIntoHadoopFsRelationCommand\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"6","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":9,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":28,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"75\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[27],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":27,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"78\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[26],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[8],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162991087,"Accumulables":[]},"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"66\",\"name\":\"Execute InsertIntoHadoopFsRelationCommand\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"6","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerTaskStart","Stage ID":9,"Stage Attempt ID":0,"Task Info":{"Task ID":69,"Index":0,"Attempt":0,"Launch Time":1678162991129,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":9,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":69,"Index":0,"Attempt":0,"Launch Time":1678162991129,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162991765,"Failed":false,"Killed":false,"Accumulables":[{"ID":456,"Name":"duration","Update":"396","Value":"396","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":458,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":460,"Name":"time in aggregation build","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":485,"Name":"internal.metrics.output.bytesWritten","Update":87,"Value":87,"Internal":true,"Count Failed Values":true},{"ID":479,"Name":"internal.metrics.shuffle.read.recordsRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":478,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":477,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":476,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":475,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":474,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":473,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":471,"Name":"internal.metrics.peakExecutionMemory","Update":262144,"Value":262144,"Internal":true,"Count Failed Values":true},{"ID":468,"Name":"internal.metrics.resultSerializationTime","Update":2,"Value":2,"Internal":true,"Count Failed Values":true},{"ID":466,"Name":"internal.metrics.resultSize","Update":4800,"Value":4800,"Internal":true,"Count Failed Values":true},{"ID":465,"Name":"internal.metrics.executorCpuTime","Update":269717740,"Value":269717740,"Internal":true,"Count Failed Values":true},{"ID":464,"Name":"internal.metrics.executorRunTime","Update":552,"Value":552,"Internal":true,"Count Failed Values":true},{"ID":463,"Name":"internal.metrics.executorDeserializeCpuTime","Update":68577712,"Value":68577712,"Internal":true,"Count Failed Values":true},{"ID":462,"Name":"internal.metrics.executorDeserializeTime","Update":75,"Value":75,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":75,"Executor Deserialize CPU Time":68577712,"Executor Run Time":552,"Executor CPU Time":269717740,"Peak Execution Memory":262144,"Result Size":4800,"JVM GC Time":0,"Result Serialization Time":2,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":87,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":9,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":28,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"75\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[27],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":27,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"78\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[26],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[8],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162991087,"Completion Time":1678162991766,"Accumulables":[{"ID":478,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Value":0,"Internal":true,"Count Failed Values":true},{"ID":460,"Name":"time in aggregation build","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":463,"Name":"internal.metrics.executorDeserializeCpuTime","Value":68577712,"Internal":true,"Count Failed Values":true},{"ID":474,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":456,"Name":"duration","Value":"396","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":465,"Name":"internal.metrics.executorCpuTime","Value":269717740,"Internal":true,"Count Failed Values":true},{"ID":468,"Name":"internal.metrics.resultSerializationTime","Value":2,"Internal":true,"Count Failed Values":true},{"ID":477,"Name":"internal.metrics.shuffle.read.localBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":471,"Name":"internal.metrics.peakExecutionMemory","Value":262144,"Internal":true,"Count Failed Values":true},{"ID":462,"Name":"internal.metrics.executorDeserializeTime","Value":75,"Internal":true,"Count Failed Values":true},{"ID":479,"Name":"internal.metrics.shuffle.read.recordsRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":464,"Name":"internal.metrics.executorRunTime","Value":552,"Internal":true,"Count Failed Values":true},{"ID":473,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":458,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":485,"Name":"internal.metrics.output.bytesWritten","Value":87,"Internal":true,"Count Failed Values":true},{"ID":476,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Value":0,"Internal":true,"Count Failed Values":true},{"ID":466,"Name":"internal.metrics.resultSize","Value":4800,"Internal":true,"Count Failed Values":true},{"ID":475,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Value":0,"Internal":true,"Count Failed Values":true}]}} -{"Event":"SparkListenerJobEnd","Job ID":5,"Completion Time":1678162991767,"Job Result":{"Result":"JobSucceeded"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerDriverAccumUpdates","executionId":6,"accumUpdates":[[383,1],[384,87],[385,0],[386,0]]} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerQueryExecutionMetrics","executionId":6,"timePerRule":{"PruneFileSourcePartitions":253272,"ReassignLambdaVariableID":221432,"PushPredicateThroughNonJoin":120341,"Analyzer$HandleNullInputsForUDF":24848,"Analyzer$ResolveSubqueryColumnAliases":12844,"ResolveTimeZone":17803,"Analyzer$ResolveNamespace":14411,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":17584,"RewriteCorrelatedScalarSubquery":489498,"RemoveLiteralFromGroupExpressions":294694,"PushProjectionThroughUnion":538890,"EliminateSubqueryAliases":261703,"ResolveCatalogs":43322,"PushLeftSemiLeftAntiThroughJoin":468382,"FlattenScalarSubqueriesWithAggregates":199405,"LikeSimplification":22746726,"CollapseRepartition":1445916,"ResolveHints$ResolveCoalesceHints":13643,"Analyzer$ExtractGenerator":41229,"RewriteIntersectAll":267650,"ResolveHints$ResolveJoinStrategyHints":16484,"TypeCoercion$MapZipWithCoercion":17114,"NullPropagation":690506,"PullupCorrelatedPredicates":409545,"UpdateOuterReferences":21611,"ExtractPythonUDFs":397685,"Analyzer$WindowsSubstitution":17866,"CombineUnions":553256,"ExtractGroupingPythonUDFFromAggregate":136413,"ReorderAssociativeOperator":595564,"CleanupDynamicPruningFilters":866309,"ResolveHints$RemoveAllHints":18823,"SimplifyBinaryComparison":582612,"ResolveTableValuedFunctions":19548,"EliminateSerialization":352989,"TypeCoercion$BooleanEquality":20662,"ReplaceIntersectWithSemiJoin":239161,"ConstantPropagation":530918,"CostBasedJoinReorder":17893,"Analyzer$ResolveReferences":71033,"CTESubstitution":560141,"RemoveRedundantAliases":731939,"TypeCoercion$ImplicitTypeCasts":15808,"RewriteExceptAll":274716,"UpdateAttributeNullability":110371,"PropagateEmptyRelation":483979,"SimplifyCasts":582812,"EliminateMapObjects":246971,"CombineLimits":311007,"DetectAmbiguousSelfJoin":36203,"ReplaceExpressions":449311,"ResolveInlineTables":17395,"OptimizeIn":449957,"CollapseWindow":345087,"TypeCoercion$IfCoercion":17770,"ResolveSessionCatalog":24629,"PartitionPruning":173960,"BooleanSimplification":1806378,"TypeCoercion$PromoteStrings":17499,"Analyzer$ResolveAliases":16555,"DecimalAggregates":187219,"PruneFilters":611458,"Analyzer$ResolveMissingReferences":12837,"TransposeWindow":291583,"Analyzer$ResolveRelations":26192,"EliminateUnions":22924,"RewritePredicateSubquery":126175,"ObjectSerializerPruning":117383,"LimitPushDown":458036,"SimplifyCaseConversionExpressions":523113,"Analyzer$ResolveNaturalAndUsingJoin":13408,"EliminateView":288220,"CombineTypedFilters":131581,"OptimizeLimitZero":252729,"CheckCartesianProducts":33088,"ExtractPythonUDFFromAggregate":149646,"Analyzer$ExtractWindowExpressions":21631,"ReplaceExceptWithAntiJoin":266708,"ResolveLambdaVariables":21554,"FallBackFileSourceV2":13639,"Analyzer$ResolveTables":22022,"SubstituteUnresolvedOrdinals":16149,"TypeCoercion$CaseWhenCoercion":17846,"DecimalPrecision":24537,"EliminateSorts":201597,"PushDownLeftSemiAntiJoin":468436,"ExtractPythonUDFFromJoinCondition":136820,"TypeCoercion$StackCoercion":17218,"Analyzer$ResolveAggAliasInGroupBy":14095,"TypeCoercion$StringLiteralCoercion":16320,"FoldablePropagation":162402,"V2ScanRelationPushDown":224664,"EliminateDistinct":15964,"InferFiltersFromConstraints":176342,"Analyzer$PullOutNondeterministic":24631,"Analyzer$ResolveFunctions":19918,"ReplaceNullWithFalseInPredicate":535716,"ResolveHigherOrderFunctions":18362,"Analyzer$ResolvePivot":14745,"CollapseProject":1478242,"Analyzer$ResolveNewInstance":18174,"ColumnPruning":3239875,"Analyzer$ResolveWindowOrder":17389,"TypeCoercion$ConcatCoercion":20971,"PushDownPredicates":799751,"TimeWindowing":51036,"Optimizer$OptimizeSubqueries":837955,"RewriteNonCorrelatedExists":412790,"DemoteBroadcastHashJoin":43539,"TypeCoercion$Division":16709,"ComputeCurrentTime":462486,"ResolveCreateNamedStruct":19807,"TypeCoercion$EltCoercion":19551,"ConvertToLocalRelation":408215,"RemoveRepetitionFromGroupExpressions":362933,"ReplaceDistinctWithAggregate":265417,"PreprocessTableCreation":18272,"ResolveSQLOnFile":13787,"Analyzer$ResolveSubquery":13205,"CombineConcats":71118,"Analyzer$ResolveGroupingAnalytics":22627,"Analyzer$ResolveBinaryArithmetic":19376,"RemoveDispensableExpressions":640676,"Analyzer$ResolveAlterTableChanges":20488,"ResolveEncodersInScalaAgg":19584,"TypeCoercion$IntegralDivision":15946,"Analyzer$ResolveWindowFrame":15964,"Analyzer$ResolveDeserializer":20192,"RewriteDistinctAggregates":346549,"RemoveNoopOperators":7229896,"Analyzer$ResolveAggregateFunctions":14910,"NormalizeFloatingNumbers":116046,"ReorderJoin":534511,"Analyzer$ResolveUpCast":18464,"Analyzer$ResolveGenerate":16075,"TypeCoercion$WidenSetOperationTypes":14173,"EliminateOuterJoin":468131,"SimplifyExtractValueOps":637528,"OptimizeMetadataOnlyQuery":16186,"EliminateResolvedHint":565273,"Analyzer$ResolveInsertInto":12989,"ReplaceExceptWithFilter":271491,"CleanupAliases":24903,"GetCurrentDatabase":488348,"SchemaPruning":296715,"Analyzer$ResolveOutputRelation":13593,"BloomFilterJoinRule":142981,"Analyzer$ResolveRandomSeed":13957,"TypeCoercion$WindowFrameCoercion":17717,"ConstantFolding":443957,"TypeCoercion$DateTimeOperations":15555,"TypeCoercion$InConversion":45548,"FindDataSourceTable":15931,"SimplifyConditionals":4565594,"DataSourceAnalysis":14324,"TypeCoercion$FunctionArgumentConversion":16287,"Analyzer$GlobalAggregates":13642,"Analyzer$LookupFunctions":23350,"CombineFilters":444600,"ReplaceDeduplicateWithAggregate":257073,"PreprocessTableInsertion":12397},"numRunsPerRule":{"PruneFileSourcePartitions":1,"ReassignLambdaVariableID":1,"PushPredicateThroughNonJoin":1,"Analyzer$HandleNullInputsForUDF":1,"Analyzer$ResolveSubqueryColumnAliases":1,"ResolveTimeZone":1,"Analyzer$ResolveNamespace":1,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":1,"RewriteCorrelatedScalarSubquery":3,"RemoveLiteralFromGroupExpressions":1,"PushProjectionThroughUnion":3,"EliminateSubqueryAliases":1,"ResolveCatalogs":1,"PushLeftSemiLeftAntiThroughJoin":3,"FlattenScalarSubqueriesWithAggregates":1,"LikeSimplification":3,"CollapseRepartition":3,"ResolveHints$ResolveCoalesceHints":1,"Analyzer$ExtractGenerator":1,"RewriteIntersectAll":1,"ResolveHints$ResolveJoinStrategyHints":1,"TypeCoercion$MapZipWithCoercion":1,"NullPropagation":3,"PullupCorrelatedPredicates":1,"UpdateOuterReferences":1,"ExtractPythonUDFs":1,"Analyzer$WindowsSubstitution":1,"CombineUnions":4,"ExtractGroupingPythonUDFFromAggregate":1,"ReorderAssociativeOperator":3,"CleanupDynamicPruningFilters":1,"ResolveHints$RemoveAllHints":1,"SimplifyBinaryComparison":3,"ResolveTableValuedFunctions":1,"EliminateSerialization":3,"TypeCoercion$BooleanEquality":1,"ReplaceIntersectWithSemiJoin":1,"ConstantPropagation":3,"CostBasedJoinReorder":1,"Analyzer$ResolveReferences":1,"CTESubstitution":1,"RemoveRedundantAliases":3,"TypeCoercion$ImplicitTypeCasts":1,"RewriteExceptAll":1,"UpdateAttributeNullability":1,"PropagateEmptyRelation":2,"SimplifyCasts":3,"EliminateMapObjects":1,"CombineLimits":3,"DetectAmbiguousSelfJoin":1,"ReplaceExpressions":1,"ResolveInlineTables":1,"OptimizeIn":3,"CollapseWindow":3,"TypeCoercion$IfCoercion":1,"ResolveSessionCatalog":1,"PartitionPruning":1,"BooleanSimplification":3,"TypeCoercion$PromoteStrings":1,"Analyzer$ResolveAliases":1,"DecimalAggregates":1,"PruneFilters":4,"Analyzer$ResolveMissingReferences":1,"TransposeWindow":3,"Analyzer$ResolveRelations":1,"EliminateUnions":1,"RewritePredicateSubquery":1,"ObjectSerializerPruning":1,"LimitPushDown":3,"SimplifyCaseConversionExpressions":3,"Analyzer$ResolveNaturalAndUsingJoin":1,"EliminateView":1,"CombineTypedFilters":1,"OptimizeLimitZero":1,"CheckCartesianProducts":2,"ExtractPythonUDFFromAggregate":1,"Analyzer$ExtractWindowExpressions":1,"ReplaceExceptWithAntiJoin":1,"ResolveLambdaVariables":1,"FallBackFileSourceV2":1,"Analyzer$ResolveTables":1,"SubstituteUnresolvedOrdinals":1,"TypeCoercion$CaseWhenCoercion":1,"DecimalPrecision":1,"EliminateSorts":1,"PushDownLeftSemiAntiJoin":3,"ExtractPythonUDFFromJoinCondition":1,"TypeCoercion$StackCoercion":1,"Analyzer$ResolveAggAliasInGroupBy":1,"TypeCoercion$StringLiteralCoercion":1,"FoldablePropagation":3,"V2ScanRelationPushDown":1,"EliminateDistinct":1,"InferFiltersFromConstraints":1,"Analyzer$PullOutNondeterministic":1,"Analyzer$ResolveFunctions":1,"ReplaceNullWithFalseInPredicate":3,"ResolveHigherOrderFunctions":1,"Analyzer$ResolvePivot":1,"CollapseProject":4,"Analyzer$ResolveNewInstance":1,"ColumnPruning":5,"Analyzer$ResolveWindowOrder":1,"TypeCoercion$ConcatCoercion":1,"PushDownPredicates":5,"TimeWindowing":1,"Optimizer$OptimizeSubqueries":3,"RewriteNonCorrelatedExists":1,"DemoteBroadcastHashJoin":1,"TypeCoercion$Division":1,"ComputeCurrentTime":1,"ResolveCreateNamedStruct":1,"TypeCoercion$EltCoercion":1,"ConvertToLocalRelation":2,"RemoveRepetitionFromGroupExpressions":1,"ReplaceDistinctWithAggregate":1,"PreprocessTableCreation":1,"ResolveSQLOnFile":1,"Analyzer$ResolveSubquery":1,"CombineConcats":3,"Analyzer$ResolveGroupingAnalytics":1,"Analyzer$ResolveBinaryArithmetic":1,"RemoveDispensableExpressions":3,"Analyzer$ResolveAlterTableChanges":1,"ResolveEncodersInScalaAgg":1,"TypeCoercion$IntegralDivision":1,"Analyzer$ResolveWindowFrame":1,"Analyzer$ResolveDeserializer":1,"RewriteDistinctAggregates":1,"RemoveNoopOperators":5,"Analyzer$ResolveAggregateFunctions":1,"NormalizeFloatingNumbers":1,"ReorderJoin":3,"Analyzer$ResolveUpCast":1,"Analyzer$ResolveGenerate":1,"TypeCoercion$WidenSetOperationTypes":1,"EliminateOuterJoin":3,"SimplifyExtractValueOps":3,"OptimizeMetadataOnlyQuery":1,"EliminateResolvedHint":1,"Analyzer$ResolveInsertInto":1,"ReplaceExceptWithFilter":1,"CleanupAliases":1,"GetCurrentDatabase":1,"SchemaPruning":1,"Analyzer$ResolveOutputRelation":1,"BloomFilterJoinRule":1,"Analyzer$ResolveRandomSeed":1,"TypeCoercion$WindowFrameCoercion":1,"ConstantFolding":3,"TypeCoercion$DateTimeOperations":1,"TypeCoercion$InConversion":1,"FindDataSourceTable":1,"SimplifyConditionals":3,"DataSourceAnalysis":1,"TypeCoercion$FunctionArgumentConversion":1,"Analyzer$GlobalAggregates":1,"Analyzer$LookupFunctions":1,"CombineFilters":4,"ReplaceDeduplicateWithAggregate":1,"PreprocessTableInsertion":1},"numEffectiveRunsPerRule":{"ColumnPruning":1,"CollapseProject":1},"timeEffectiveRunsPerRule":{"ColumnPruning":2159753,"CollapseProject":1117639},"counters":{},"timers":{}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":6,"time":1678162991856} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":7,"description":"save at NativeMethodAccessorImpl.java:0","details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Project [Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L]\n : +- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Repartition 50000, false\n : +- Filter (content_languages#27 = eng)\n : +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n +- Aggregate [Site#485], [Site#485, avg(cast(levenshtein_distance#215 as bigint)) AS Levenshtein_Distance#411]\n +- Filter (((subset#33 = warc) AND (levenshtein_distance#215 <= 2)) AND isnotnull(Site#485))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Project [Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L]\n : +- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Repartition 50000, false\n : +- Filter (content_languages#27 = eng)\n : +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n +- Aggregate [Site#485], [Site#485, avg(cast(levenshtein_distance#215 as bigint)) AS Levenshtein_Distance#411]\n +- Filter (((subset#33 = warc) AND (levenshtein_distance#215 <= 2)) AND isnotnull(Site#485))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247, Date#344], [Site#247, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L]\n : +- Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#2420 AS Date#344]\n : +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#2420]\n : +- Project [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n : +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n : +- Coalesce 50000\n : +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n : +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n : +- *(1) ColumnarToRow\n : +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- AdaptiveSparkPlan isFinalPlan=false\n +- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- SortMergeJoin [Site#247], [Site#485], FullOuter\n :- Sort [Site#247 ASC NULLS FIRST], false, 0\n : +- HashAggregate(keys=[Site#247], functions=[sum(Total_Record_Length#451L), sum(Total_Pages#449L), avg(Total_Pages#449L)], output=[Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L])\n : +- Exchange hashpartitioning(Site#247, 1000), true, [id=#387]\n : +- HashAggregate(keys=[Site#247], functions=[partial_sum(Total_Record_Length#451L), partial_sum(Total_Pages#449L), partial_avg(Total_Pages#449L)], output=[Site#247, sum#2727L, sum#2728L, sum#2729, count#2730L])\n : +- HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint))], output=[Site#247, Total_Pages#449L, Total_Record_Length#451L])\n : +- Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#383]\n : +- HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint))], output=[Site#247, Date#344, count#2733L, sum#2734L])\n : +- Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#2420 AS Date#344]\n : +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#2420]\n : +- InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n : +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n : +- Coalesce 50000\n : +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n : +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n : +- *(1) ColumnarToRow\n : +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=false","children":[{"nodeName":"Project","simpleString":"Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]","children":[{"nodeName":"SortMergeJoin","simpleString":"SortMergeJoin [Site#247], [Site#485], FullOuter","children":[{"nodeName":"Sort","simpleString":"Sort [Site#247 ASC NULLS FIRST], false, 0","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247], functions=[sum(Total_Record_Length#451L), sum(Total_Pages#449L), avg(Total_Pages#449L)])","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, 1000), true, [id=#387]","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247], functions=[partial_sum(Total_Record_Length#451L), partial_sum(Total_Pages#449L), partial_avg(Total_Pages#449L)])","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#383]","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"Project","simpleString":"Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#2420 AS Date#344]","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [(fetch_time#20)], [pythonUDF0#2420]","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":614,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":611,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":612,"metricType":"timing"},{"name":"peak memory","accumulatorId":610,"metricType":"size"},{"name":"number of output rows","accumulatorId":609,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":613,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":496,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":497,"metricType":"nsTiming"},{"name":"records read","accumulatorId":494,"metricType":"sum"},{"name":"local bytes read","accumulatorId":492,"metricType":"size"},{"name":"fetch wait time","accumulatorId":493,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":490,"metricType":"size"},{"name":"local blocks read","accumulatorId":489,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":488,"metricType":"sum"},{"name":"data size","accumulatorId":487,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":491,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":495,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":606,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":607,"metricType":"timing"},{"name":"peak memory","accumulatorId":605,"metricType":"size"},{"name":"number of output rows","accumulatorId":604,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":608,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":601,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":602,"metricType":"timing"},{"name":"peak memory","accumulatorId":600,"metricType":"size"},{"name":"number of output rows","accumulatorId":599,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":603,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":507,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":508,"metricType":"nsTiming"},{"name":"records read","accumulatorId":505,"metricType":"sum"},{"name":"local bytes read","accumulatorId":503,"metricType":"size"},{"name":"fetch wait time","accumulatorId":504,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":501,"metricType":"size"},{"name":"local blocks read","accumulatorId":500,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":499,"metricType":"sum"},{"name":"data size","accumulatorId":498,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":502,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":506,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":596,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":597,"metricType":"timing"},{"name":"peak memory","accumulatorId":595,"metricType":"size"},{"name":"number of output rows","accumulatorId":594,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":598,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":591,"metricType":"timing"},{"name":"peak memory","accumulatorId":592,"metricType":"size"},{"name":"spill size","accumulatorId":593,"metricType":"size"}]},{"nodeName":"Sort","simpleString":"Sort [Site#485 ASC NULLS FIRST], false, 0","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#485], functions=[avg(cast(levenshtein_distance#215 as bigint))])","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#485, 1000), true, [id=#389]","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#485], functions=[partial_avg(cast(levenshtein_distance#215 as bigint))])","children":[{"nodeName":"Project","simpleString":"Project [url_host_registered_domain#13 AS Site#485, pythonUDF0#2422 AS levenshtein_distance#215]","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [levenshtein_distance(wikipedia, url_host_2nd_last_part#8)], [pythonUDF0#2422]","children":[{"nodeName":"Project","simpleString":"Project [url_host_2nd_last_part#8, url_host_registered_domain#13]","children":[{"nodeName":"Filter","simpleString":"Filter (pythonUDF0#2421 <= 2)","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [levenshtein_distance(wikipedia, url_host_2nd_last_part#8)], [pythonUDF0#2421]","children":[{"nodeName":"Filter","simpleString":"Filter ((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(url_host_registered_domain#13))","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [url_host_2nd_last_part#8, url_host_registered_domain#13, subset#33], [isnotnull(subset#33), (subset#33 = warc), isnotnull(url_host_registered_domain#13)]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":630,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":629,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":628,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":625,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":626,"metricType":"timing"},{"name":"peak memory","accumulatorId":624,"metricType":"size"},{"name":"number of output rows","accumulatorId":623,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":627,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":518,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":519,"metricType":"nsTiming"},{"name":"records read","accumulatorId":516,"metricType":"sum"},{"name":"local bytes read","accumulatorId":514,"metricType":"size"},{"name":"fetch wait time","accumulatorId":515,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":512,"metricType":"size"},{"name":"local blocks read","accumulatorId":511,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":510,"metricType":"sum"},{"name":"data size","accumulatorId":509,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":513,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":517,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":620,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":621,"metricType":"timing"},{"name":"peak memory","accumulatorId":619,"metricType":"size"},{"name":"number of output rows","accumulatorId":618,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":622,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":615,"metricType":"timing"},{"name":"peak memory","accumulatorId":616,"metricType":"size"},{"name":"spill size","accumulatorId":617,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":590,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":586,"metricType":"sum"},{"name":"written output","accumulatorId":587,"metricType":"size"},{"name":"number of output rows","accumulatorId":588,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":589,"metricType":"sum"}]},"time":1678162992108} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerEffectiveSQLConf","executionId":7,"effectiveSQLConf":{"spark.sql.adaptive.coalescePartitions.initialPartitionNum":"","spark.sql.streaming.disabledV2Writers":"","spark.sql.streaming.noDataMicroBatches.enabled":"true","spark.sql.files.maxPartitionBytes":"128MB","spark.sql.thriftserver.ui.retainedStatements":"200","spark.sql.ui.retainedExecutions":"1000","spark.sql.execution.arrow.pyspark.fallback.enabled":"","spark.sql.sources.parallelPartitionDiscovery.threshold":"32","spark.sql.cbo.joinReorder.enabled":"false","spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.int96AsTimestamp":"true","spark.sql.thriftserver.ui.retainedSessions":"200","spark.sql.hive.verifyPartitionPath":"false","spark.sql.autoBroadcastJoinThreshold":"10MB","spark.sql.orc.mergeSchema":"false","spark.sql.adaptive.enabled":"true","spark.sql.adaptive.skewJoin.enabled":"true","spark.sql.optimizer.sizeBasedJoinReorder.enabled":"true","spark.sql.streaming.ui.retainedQueries":"100","spark.sql.orc.enableVectorizedReader":"true","spark.sql.streaming.continuous.epochBacklogQueueSize":"10000","spark.sql.parquet.binaryAsString":"false","spark.sql.pivotMaxValues":"10000","spark.sql.adaptive.localShuffleReader.enabled":"true","spark.sql.storeAssignmentPolicy":"ANSI","spark.sql.redaction.options.regex":"(?i)url","spark.sql.cbo.joinReorder.dp.star.filter":"false","spark.sql.parser.quotedRegexColumnNames":"false","spark.sql.files.ignoreMissingFiles":"false","spark.sql.streaming.ui.enabled":"true","spark.sql.execution.arrow.sparkr.enabled":"false","spark.sql.orc.filterPushdown":"true","spark.sql.thriftserver.scheduler.pool":"","spark.sql.catalog.spark_catalog":"","spark.sql.datetime.java8API.enabled":"false","spark.sql.execution.pandas.udf.buffer.size":"","spark.sql.function.eltOutputAsString":"false","spark.sql.cbo.joinReorder.dp.threshold":"12","spark.sql.statsImprovements.enabled":"true","spark.sql.parquet.columnarReaderBatchSize":"4096","spark.sql.parquet.outputTimestampType":"INT96","spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes":"256MB","spark.sql.statistics.size.autoUpdate.enabled":"false","spark.sql.hive.filesourcePartitionFileCacheSize":"262144000","spark.sql.streaming.ui.retainedProgressUpdates":"100","spark.sql.inMemoryColumnarStorage.compressed":"true","spark.sql.pyspark.jvmStacktrace.enabled":"false","spark.sql.cbo.starSchemaDetection":"false","spark.sql.columnNameOfCorruptRecord":"_corrupt_record","spark.sql.adaptive.advisoryPartitionSizeInBytes":"","spark.sql.avro.compression.codec":"snappy","spark.sql.streaming.metricsEnabled":"false","spark.sql.ui.pruneCachedInMemoryRelation":"true","spark.sql.event.truncate.length":"2147483647","spark.sql.groupByAliases":"true","spark.sql.hive.metastorePartitionPruning":"true","spark.sql.execution.arrow.enabled":"false","spark.sql.optimizer.dynamicPartitionPruning.enabled":"true","spark.sql.extendedEventInfo":"true","spark.sql.sources.bucketing.maxBuckets":"100000","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.jsonGenerator.ignoreNullFields":"true","spark.sql.optimizer.flattenScalarSubqueriesWithAggregates.enabled":"true","spark.sql.debug.maxToStringFields":"25","spark.sql.files.maxRecordsPerFile":"0","spark.sql.hive.thriftServer.singleSession":"false","spark.sql.parquet.mergeSchema":"false","spark.sql.adaptive.skewJoin.skewedPartitionFactor":"5","spark.sql.queryExecutionListeners":"","spark.sql.streaming.numRecentProgressUpdates":"100","spark.sql.variable.substitute":"true","spark.sql.adaptive.shuffle.mapOutputStats.useDataSize":"true","spark.sql.function.concatBinaryAsString":"false","spark.sql.avro.deflate.level":"-1","spark.sql.repl.eagerEval.maxNumRows":"20","spark.sql.ansi.enabled":"false","spark.sql.legacy.allowHashOnMapType":"false","spark.sql.bloomFilterJoin.maxFilterBytes":"2097152","spark.sql.orderByOrdinal":"true","spark.sql.streaming.stopTimeout":"0","spark.sql.adaptive.coalescePartitions.enabled":"true","spark.sql.session.timeZone":"UTC","spark.sql.parquet.respectSummaryFiles":"false","spark.sql.parquet.enableVectorizedReader":"true","spark.sql.cbo.enabled":"false","spark.sql.sources.bucketing.enabled":"true","spark.sql.csv.filterPushdown.enabled":"true","spark.sql.repl.eagerEval.truncate":"20","spark.sql.cbo.planStats.enabled":"false","spark.sql.optimizer.excludedRules":"","spark.sql.sources.partitionColumnTypeInference.enabled":"true","spark.sql.sortMergeJoinExec.extendedCodegen.enabled":"true","spark.sql.execution.arrow.fallback.enabled":"true","spark.sql.streaming.forceDeleteTempCheckpointLocation":"false","spark.sql.maxPlanStringLength":"2147483632","spark.sql.hive.manageFilesourcePartitions":"true","spark.sql.parquet.int96TimestampConversion":"false","spark.sql.sources.partitionOverwriteMode":"STATIC","spark.sql.streaming.checkpointLocation":"","spark.sql.broadcastTimeout":"300","spark.sql.execution.arrow.pyspark.enabled":"","spark.sql.repl.eagerEval.enabled":"false","spark.sql.mapKeyDedupPolicy":"EXCEPTION","spark.sql.parquet.filterPushdown":"true","spark.sql.maven.additionalRemoteRepositories":"https://maven-central.storage-download.googleapis.com/maven2/","spark.sql.orc.compression.codec":"snappy","spark.sql.statistics.histogram.enabled":"false","spark.sql.streaming.stopActiveRunOnRestart":"true","spark.sql.orc.columnarReaderBatchSize":"4096","spark.sql.redaction.string.regex":"","spark.sql.inMemoryColumnarStorage.batchSize":"10000","spark.sql.parquet.writeLegacyFormat":"false","spark.sql.statistics.fallBackToHdfs":"false","spark.sql.defaultCatalog":"spark_catalog","spark.sql.streaming.fileSource.cleaner.numThreads":"1","spark.sql.legacy.sessionInitWithConfigDefaults":"false","spark.sql.adaptive.coalescePartitions.minPartitionNum":"","spark.sql.execution.arrow.maxRecordsPerBatch":"10000","spark.sql.stringHashComputationWithoutCopyMemory.enabled":"true","spark.sql.streaming.streamingQueryListeners":"","spark.sql.files.ignoreCorruptFiles":"false","spark.sql.inMemoryColumnarStorage.enableVectorizedReader":"true","spark.sql.extensions":"","spark.sql.sources.default":"parquet","spark.sql.ui.extendedInfo":"true","spark.sql.bloomFilterJoin.enabled":"true","spark.sql.parquet.compression.codec":"snappy","spark.sql.optimizer.distinctBeforeIntersect.enabled":"true","spark.sql.parquet.recordLevelFilter.enabled":"false","spark.sql.shuffle.partitions":"200","spark.sql.groupByOrdinal":"true"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate","executionId":7,"physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Project [Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L]\n : +- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Repartition 50000, false\n : +- Filter (content_languages#27 = eng)\n : +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n +- Aggregate [Site#485], [Site#485, avg(cast(levenshtein_distance#215 as bigint)) AS Levenshtein_Distance#411]\n +- Filter (((subset#33 = warc) AND (levenshtein_distance#215 <= 2)) AND isnotnull(Site#485))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Project [Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L]\n : +- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Repartition 50000, false\n : +- Filter (content_languages#27 = eng)\n : +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n +- Aggregate [Site#485], [Site#485, avg(cast(levenshtein_distance#215 as bigint)) AS Levenshtein_Distance#411]\n +- Filter (((subset#33 = warc) AND (levenshtein_distance#215 <= 2)) AND isnotnull(Site#485))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247, Date#344], [Site#247, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L]\n : +- Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#2420 AS Date#344]\n : +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#2420]\n : +- Project [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n : +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n : +- Coalesce 50000\n : +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n : +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n : +- *(1) ColumnarToRow\n : +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- AdaptiveSparkPlan isFinalPlan=false\n +- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- SortMergeJoin [Site#247], [Site#485], FullOuter\n :- Sort [Site#247 ASC NULLS FIRST], false, 0\n : +- HashAggregate(keys=[Site#247], functions=[sum(Total_Record_Length#451L), sum(Total_Pages#449L), avg(Total_Pages#449L)], output=[Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L])\n : +- Exchange hashpartitioning(Site#247, 1000), true, [id=#439]\n : +- HashAggregate(keys=[Site#247], functions=[partial_sum(Total_Record_Length#451L), partial_sum(Total_Pages#449L), partial_avg(Total_Pages#449L)], output=[Site#247, sum#2727L, sum#2728L, sum#2729, count#2730L])\n : +- HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint))], output=[Site#247, Total_Pages#449L, Total_Record_Length#451L])\n : +- ShuffleQueryStage 0\n : +- Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#435]\n : +- *(1) HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint))], output=[Site#247, Date#344, count#2733L, sum#2734L])\n : +- *(1) Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#2420 AS Date#344]\n : +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#2420]\n : +- InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n : +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n : +- Coalesce 50000\n : +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n : +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n : +- *(1) ColumnarToRow\n : +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=false","children":[{"nodeName":"Project","simpleString":"Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]","children":[{"nodeName":"SortMergeJoin","simpleString":"SortMergeJoin [Site#247], [Site#485], FullOuter","children":[{"nodeName":"Sort","simpleString":"Sort [Site#247 ASC NULLS FIRST], false, 0","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247], functions=[sum(Total_Record_Length#451L), sum(Total_Pages#449L), avg(Total_Pages#449L)])","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, 1000), true, [id=#439]","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247], functions=[partial_sum(Total_Record_Length#451L), partial_sum(Total_Pages#449L), partial_avg(Total_Pages#449L)])","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 0","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#435]","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"Project","simpleString":"Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#2420 AS Date#344]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [(fetch_time#20)], [pythonUDF0#2420]","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":614,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":686,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":687,"metricType":"timing"},{"name":"peak memory","accumulatorId":685,"metricType":"size"},{"name":"number of output rows","accumulatorId":684,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":688,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":683,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":640,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":641,"metricType":"nsTiming"},{"name":"records read","accumulatorId":638,"metricType":"sum"},{"name":"local bytes read","accumulatorId":636,"metricType":"size"},{"name":"fetch wait time","accumulatorId":637,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":634,"metricType":"size"},{"name":"local blocks read","accumulatorId":633,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":632,"metricType":"sum"},{"name":"data size","accumulatorId":631,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":635,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":639,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":680,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":681,"metricType":"timing"},{"name":"peak memory","accumulatorId":679,"metricType":"size"},{"name":"number of output rows","accumulatorId":678,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":682,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":675,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":676,"metricType":"timing"},{"name":"peak memory","accumulatorId":674,"metricType":"size"},{"name":"number of output rows","accumulatorId":673,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":677,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":651,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":652,"metricType":"nsTiming"},{"name":"records read","accumulatorId":649,"metricType":"sum"},{"name":"local bytes read","accumulatorId":647,"metricType":"size"},{"name":"fetch wait time","accumulatorId":648,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":645,"metricType":"size"},{"name":"local blocks read","accumulatorId":644,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":643,"metricType":"sum"},{"name":"data size","accumulatorId":642,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":646,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":650,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":670,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":671,"metricType":"timing"},{"name":"peak memory","accumulatorId":669,"metricType":"size"},{"name":"number of output rows","accumulatorId":668,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":672,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":665,"metricType":"timing"},{"name":"peak memory","accumulatorId":666,"metricType":"size"},{"name":"spill size","accumulatorId":667,"metricType":"size"}]},{"nodeName":"Sort","simpleString":"Sort [Site#485 ASC NULLS FIRST], false, 0","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#485], functions=[avg(cast(levenshtein_distance#215 as bigint))])","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 1","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#485, 1000), true, [id=#455]","children":[{"nodeName":"WholeStageCodegen (4)","simpleString":"WholeStageCodegen (4)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#485], functions=[partial_avg(cast(levenshtein_distance#215 as bigint))])","children":[{"nodeName":"Project","simpleString":"Project [url_host_registered_domain#13 AS Site#485, pythonUDF0#2422 AS levenshtein_distance#215]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [levenshtein_distance(wikipedia, url_host_2nd_last_part#8)], [pythonUDF0#2422]","children":[{"nodeName":"WholeStageCodegen (3)","simpleString":"WholeStageCodegen (3)","children":[{"nodeName":"Project","simpleString":"Project [url_host_2nd_last_part#8, url_host_registered_domain#13]","children":[{"nodeName":"Filter","simpleString":"Filter (pythonUDF0#2421 <= 2)","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [levenshtein_distance(wikipedia, url_host_2nd_last_part#8)], [pythonUDF0#2421]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"Filter","simpleString":"Filter ((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(url_host_registered_domain#13))","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [url_host_2nd_last_part#8, url_host_registered_domain#13, subset#33], [isnotnull(subset#33), (subset#33 = warc), isnotnull(url_host_registered_domain#13)]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":630,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":706,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":705,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":704,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":703,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":700,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":701,"metricType":"timing"},{"name":"peak memory","accumulatorId":699,"metricType":"size"},{"name":"number of output rows","accumulatorId":698,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":702,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":697,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":662,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":663,"metricType":"nsTiming"},{"name":"records read","accumulatorId":660,"metricType":"sum"},{"name":"local bytes read","accumulatorId":658,"metricType":"size"},{"name":"fetch wait time","accumulatorId":659,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":656,"metricType":"size"},{"name":"local blocks read","accumulatorId":655,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":654,"metricType":"sum"},{"name":"data size","accumulatorId":653,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":657,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":661,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":694,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":695,"metricType":"timing"},{"name":"peak memory","accumulatorId":693,"metricType":"size"},{"name":"number of output rows","accumulatorId":692,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":696,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":689,"metricType":"timing"},{"name":"peak memory","accumulatorId":690,"metricType":"size"},{"name":"spill size","accumulatorId":691,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":664,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":586,"metricType":"sum"},{"name":"written output","accumulatorId":587,"metricType":"size"},{"name":"number of output rows","accumulatorId":588,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":589,"metricType":"sum"}]}} -{"Event":"SparkListenerJobStart","Job ID":6,"Submission Time":1678162992587,"Stage Infos":[{"Stage ID":10,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":36,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"88\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[35],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":34,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"93\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[33],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":35,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"89\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[34],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":32,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"94\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[31],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":31,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"94\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":33,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"93\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[32],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]}],"Stage IDs":[10],"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"88\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"7","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":10,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":36,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"88\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[35],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":34,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"93\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[33],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":35,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"89\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[34],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":32,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"94\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[31],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":31,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"94\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":33,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"93\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[32],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162992592,"Accumulables":[]},"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"88\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"7","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":70,"Index":0,"Attempt":0,"Launch Time":1678162992616,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":71,"Index":4,"Attempt":0,"Launch Time":1678162992616,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":72,"Index":7,"Attempt":0,"Launch Time":1678162992616,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":73,"Index":5,"Attempt":0,"Launch Time":1678162992617,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":74,"Index":1,"Attempt":0,"Launch Time":1678162992617,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":75,"Index":2,"Attempt":0,"Launch Time":1678162992617,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":76,"Index":6,"Attempt":0,"Launch Time":1678162992618,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":77,"Index":3,"Attempt":0,"Launch Time":1678162992618,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":78,"Index":8,"Attempt":0,"Launch Time":1678162992618,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":79,"Index":12,"Attempt":0,"Launch Time":1678162992618,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":80,"Index":15,"Attempt":0,"Launch Time":1678162992619,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":81,"Index":13,"Attempt":0,"Launch Time":1678162992619,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":82,"Index":9,"Attempt":0,"Launch Time":1678162992619,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":83,"Index":10,"Attempt":0,"Launch Time":1678162992619,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":84,"Index":14,"Attempt":0,"Launch Time":1678162992620,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":85,"Index":11,"Attempt":0,"Launch Time":1678162992620,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":86,"Index":16,"Attempt":0,"Launch Time":1678162992620,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":87,"Index":20,"Attempt":0,"Launch Time":1678162992620,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":88,"Index":23,"Attempt":0,"Launch Time":1678162992621,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":89,"Index":21,"Attempt":0,"Launch Time":1678162992621,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":90,"Index":17,"Attempt":0,"Launch Time":1678162992621,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":91,"Index":18,"Attempt":0,"Launch Time":1678162992621,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":92,"Index":22,"Attempt":0,"Launch Time":1678162992622,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":93,"Index":19,"Attempt":0,"Launch Time":1678162992622,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":94,"Index":24,"Attempt":0,"Launch Time":1678162992623,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":95,"Index":28,"Attempt":0,"Launch Time":1678162992623,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":96,"Index":31,"Attempt":0,"Launch Time":1678162992623,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":97,"Index":29,"Attempt":0,"Launch Time":1678162992624,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":98,"Index":25,"Attempt":0,"Launch Time":1678162992624,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":99,"Index":26,"Attempt":0,"Launch Time":1678162992624,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":100,"Index":30,"Attempt":0,"Launch Time":1678162992625,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":101,"Index":27,"Attempt":0,"Launch Time":1678162992625,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerJobStart","Job ID":7,"Submission Time":1678162992767,"Stage Infos":[{"Stage ID":11,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":46,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"95\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[45],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":45,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"96\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[44],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":41,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"105\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[40],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":39,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"106\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[38],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":37,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"109\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":38,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"109\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[37],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":43,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"100\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[42],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":40,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"105\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[39],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":42,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"101\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[41],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":44,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"100\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[43],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]}],"Stage IDs":[11],"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"95\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"7","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":11,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":46,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"95\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[45],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":45,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"96\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[44],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":41,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"105\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[40],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":39,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"106\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[38],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":37,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"109\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":38,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"109\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[37],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":43,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"100\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[42],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":40,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"105\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[39],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":42,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"101\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[41],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":44,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"100\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[43],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162992775,"Accumulables":[]},"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"95\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"7","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerTaskStart","Stage ID":10,"Stage Attempt ID":0,"Task Info":{"Task ID":102,"Index":32,"Attempt":0,"Launch Time":1678162993260,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":103,"Index":4,"Attempt":0,"Launch Time":1678162993261,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":104,"Index":12,"Attempt":0,"Launch Time":1678162993263,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":71,"Index":4,"Attempt":0,"Launch Time":1678162992616,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993263,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"59","Value":"59","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"3","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":46,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":294912,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":4317,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":107330436,"Value":107330436,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":548,"Value":548,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":13825269,"Value":13825269,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":70,"Value":70,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":70,"Executor Deserialize CPU Time":13825269,"Executor Run Time":548,"Executor CPU Time":107330436,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":79,"Index":12,"Attempt":0,"Launch Time":1678162992618,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993263,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"58","Value":"117","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"2","Value":"5","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":92,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":589824,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":8634,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":49446470,"Value":156776906,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":548,"Value":1096,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":14443524,"Value":28268793,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":70,"Value":140,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":70,"Executor Deserialize CPU Time":14443524,"Executor Run Time":548,"Executor CPU Time":49446470,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":105,"Index":20,"Attempt":0,"Launch Time":1678162993264,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":87,"Index":20,"Attempt":0,"Launch Time":1678162992620,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993265,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"58","Value":"175","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"786432","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"2","Value":"7","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":138,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":884736,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":12951,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":102007663,"Value":258784569,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":548,"Value":1644,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":18146400,"Value":46415193,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":70,"Value":210,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":70,"Executor Deserialize CPU Time":18146400,"Executor Run Time":548,"Executor CPU Time":102007663,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":95,"Index":28,"Attempt":0,"Launch Time":1678162992623,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993265,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"58","Value":"233","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"1048576","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"3","Value":"10","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":184,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1179648,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":17268,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":36105286,"Value":294889855,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":549,"Value":2193,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":16079089,"Value":62494282,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":70,"Value":280,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":70,"Executor Deserialize CPU Time":16079089,"Executor Run Time":549,"Executor CPU Time":36105286,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":106,"Index":5,"Attempt":0,"Launch Time":1678162993275,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":81,"Index":13,"Attempt":0,"Launch Time":1678162992619,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993276,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"65","Value":"298","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"1310720","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"10","Value":"20","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":230,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1474560,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":21585,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":63148314,"Value":358038169,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":555,"Value":2748,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":17852949,"Value":80347231,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":76,"Value":356,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":76,"Executor Deserialize CPU Time":17852949,"Executor Run Time":555,"Executor CPU Time":63148314,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":107,"Index":13,"Attempt":0,"Launch Time":1678162993276,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":97,"Index":29,"Attempt":0,"Launch Time":1678162992624,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993277,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"66","Value":"364","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"1572864","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"10","Value":"30","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":276,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1769472,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":25902,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":126023045,"Value":484061214,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":558,"Value":3306,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":20234952,"Value":100582183,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":72,"Value":428,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":72,"Executor Deserialize CPU Time":20234952,"Executor Run Time":558,"Executor CPU Time":126023045,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":108,"Index":21,"Attempt":0,"Launch Time":1678162993278,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":89,"Index":21,"Attempt":0,"Launch Time":1678162992621,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993279,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"66","Value":"430","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"1835008","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"10","Value":"40","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":322,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2064384,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":30219,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":45626052,"Value":529687266,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":559,"Value":3865,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":20882856,"Value":121465039,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":73,"Value":501,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":73,"Executor Deserialize CPU Time":20882856,"Executor Run Time":559,"Executor CPU Time":45626052,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":109,"Index":29,"Attempt":0,"Launch Time":1678162993288,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":73,"Index":5,"Attempt":0,"Launch Time":1678162992617,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993289,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"79","Value":"509","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"2097152","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"0","Value":"40","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":368,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2359296,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":34536,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":75941816,"Value":605629082,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":572,"Value":4437,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":9742046,"Value":131207085,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":73,"Value":574,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":73,"Executor Deserialize CPU Time":9742046,"Executor Run Time":572,"Executor CPU Time":75941816,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":110,"Index":3,"Attempt":0,"Launch Time":1678162993315,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":93,"Index":19,"Attempt":0,"Launch Time":1678162992622,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993316,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"85","Value":"594","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"2359296","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"4","Value":"44","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":414,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2654208,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":38853,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":94505733,"Value":700134815,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":569,"Value":5006,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":23029554,"Value":154236639,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":92,"Value":666,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":92,"Executor Deserialize CPU Time":23029554,"Executor Run Time":569,"Executor CPU Time":94505733,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":111,"Index":11,"Attempt":0,"Launch Time":1678162993317,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":77,"Index":3,"Attempt":0,"Launch Time":1678162992618,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993319,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"86","Value":"680","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"2621440","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"0","Value":"44","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":460,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2949120,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":43170,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":70021600,"Value":770156415,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":579,"Value":5585,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":10729030,"Value":164965669,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":83,"Value":749,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":83,"Executor Deserialize CPU Time":10729030,"Executor Run Time":579,"Executor CPU Time":70021600,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":112,"Index":19,"Attempt":0,"Launch Time":1678162993321,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":113,"Index":27,"Attempt":0,"Launch Time":1678162993322,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":85,"Index":11,"Attempt":0,"Launch Time":1678162992620,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993322,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"86","Value":"766","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"2883584","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"2","Value":"46","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":506,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3244032,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":47487,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":91606129,"Value":861762544,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":568,"Value":6153,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":22569591,"Value":187535260,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":94,"Value":843,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":94,"Executor Deserialize CPU Time":22569591,"Executor Run Time":568,"Executor CPU Time":91606129,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":101,"Index":27,"Attempt":0,"Launch Time":1678162992625,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993323,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"90","Value":"856","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"3145728","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"0","Value":"46","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":552,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3538944,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":51804,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":53094462,"Value":914857006,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":582,"Value":6735,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":13985479,"Value":201520739,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":82,"Value":925,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":82,"Executor Deserialize CPU Time":13985479,"Executor Run Time":582,"Executor CPU Time":53094462,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":114,"Index":6,"Attempt":0,"Launch Time":1678162993329,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":92,"Index":22,"Attempt":0,"Launch Time":1678162992622,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993329,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"80","Value":"936","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"3407872","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"5","Value":"51","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":598,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3833856,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":56121,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":62972629,"Value":977829635,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":591,"Value":7326,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11302985,"Value":212823724,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":75,"Value":1000,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":75,"Executor Deserialize CPU Time":11302985,"Executor Run Time":591,"Executor CPU Time":62972629,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":115,"Index":0,"Attempt":0,"Launch Time":1678162993330,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":70,"Index":0,"Attempt":0,"Launch Time":1678162992616,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993331,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"83","Value":"1019","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"3670016","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"12","Value":"63","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":644,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4128768,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":60438,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":43441534,"Value":1021271169,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":602,"Value":7928,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":13102094,"Value":225925818,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":82,"Value":1082,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":82,"Executor Deserialize CPU Time":13102094,"Executor Run Time":602,"Executor CPU Time":43441534,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":116,"Index":7,"Attempt":0,"Launch Time":1678162993333,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":88,"Index":23,"Attempt":0,"Launch Time":1678162992621,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993334,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"75","Value":"1094","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"3932160","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"2","Value":"65","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":690,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4423680,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":64755,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":73450020,"Value":1094721189,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":604,"Value":8532,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":19132932,"Value":245058750,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":76,"Value":1158,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":76,"Executor Deserialize CPU Time":19132932,"Executor Run Time":604,"Executor CPU Time":73450020,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":117,"Index":15,"Attempt":0,"Launch Time":1678162993337,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":118,"Index":14,"Attempt":0,"Launch Time":1678162993339,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":76,"Index":6,"Attempt":0,"Launch Time":1678162992618,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993339,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"85","Value":"1179","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"4194304","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"11","Value":"76","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":736,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4718592,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":69072,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":59693533,"Value":1154414722,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":589,"Value":9121,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":28968546,"Value":274027296,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":84,"Value":1242,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":84,"Executor Deserialize CPU Time":28968546,"Executor Run Time":589,"Executor CPU Time":59693533,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":119,"Index":8,"Attempt":0,"Launch Time":1678162993340,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":72,"Index":7,"Attempt":0,"Launch Time":1678162992616,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993344,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"75","Value":"1254","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"4456448","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"2","Value":"78","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":782,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5013504,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":73389,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":111674182,"Value":1266088904,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":605,"Value":9726,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":13576867,"Value":287604163,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":75,"Value":1317,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":75,"Executor Deserialize CPU Time":13576867,"Executor Run Time":605,"Executor CPU Time":111674182,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":94,"Index":24,"Attempt":0,"Launch Time":1678162992623,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993344,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"83","Value":"1337","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"14","Value":"92","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":828,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5308416,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":77706,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":87776574,"Value":1353865478,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":601,"Value":10327,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":26186251,"Value":313790414,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":81,"Value":1398,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":81,"Executor Deserialize CPU Time":26186251,"Executor Run Time":601,"Executor CPU Time":87776574,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":80,"Index":15,"Attempt":0,"Launch Time":1678162992619,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993344,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"75","Value":"1412","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"4980736","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"2","Value":"94","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":874,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5603328,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":82023,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":76238967,"Value":1430104445,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":605,"Value":10932,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":18595574,"Value":332385988,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":75,"Value":1473,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":75,"Executor Deserialize CPU Time":18595574,"Executor Run Time":605,"Executor CPU Time":76238967,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":120,"Index":23,"Attempt":0,"Launch Time":1678162993346,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":121,"Index":22,"Attempt":0,"Launch Time":1678162993349,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":84,"Index":14,"Attempt":0,"Launch Time":1678162992620,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993349,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"86","Value":"1498","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"5242880","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"11","Value":"105","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":920,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5898240,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":86340,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":93797328,"Value":1523901773,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":598,"Value":11530,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":15239818,"Value":347625806,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":75,"Value":1548,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":75,"Executor Deserialize CPU Time":15239818,"Executor Run Time":598,"Executor CPU Time":93797328,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":100,"Index":30,"Attempt":0,"Launch Time":1678162992625,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993352,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"86","Value":"1584","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"5505024","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"11","Value":"116","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":966,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6193152,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":90657,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":99239038,"Value":1623140811,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":589,"Value":12119,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":16210661,"Value":363836467,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":83,"Value":1631,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":83,"Executor Deserialize CPU Time":16210661,"Executor Run Time":589,"Executor CPU Time":99239038,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":122,"Index":30,"Attempt":0,"Launch Time":1678162993358,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":123,"Index":16,"Attempt":0,"Launch Time":1678162993361,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":78,"Index":8,"Attempt":0,"Launch Time":1678162992618,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993362,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"87","Value":"1671","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"5767168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"17","Value":"133","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1012,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6488064,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":94974,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":116673417,"Value":1739814228,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":611,"Value":12730,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":14429820,"Value":378266287,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":82,"Value":1713,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":82,"Executor Deserialize CPU Time":14429820,"Executor Run Time":611,"Executor CPU Time":116673417,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":102,"Index":32,"Attempt":0,"Launch Time":1678162993260,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993364,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"47","Value":"1718","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"6029312","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"36","Value":"169","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1058,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6782976,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":713,"Name":"internal.metrics.resultSerializationTime","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4360,"Value":99334,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":13777069,"Value":1753591297,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":60,"Value":12790,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":4583414,"Value":382849701,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":4,"Value":1717,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":4,"Executor Deserialize CPU Time":4583414,"Executor Run Time":60,"Executor CPU Time":13777069,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":0,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":124,"Index":28,"Attempt":0,"Launch Time":1678162993365,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":125,"Index":31,"Attempt":0,"Launch Time":1678162993366,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":96,"Index":31,"Attempt":0,"Launch Time":1678162992623,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993367,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"75","Value":"1793","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"6291456","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"2","Value":"171","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1104,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7077888,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":103651,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":57035925,"Value":1810627222,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":605,"Value":13395,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":15964162,"Value":398813863,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":73,"Value":1790,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":73,"Executor Deserialize CPU Time":15964162,"Executor Run Time":605,"Executor CPU Time":57035925,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":126,"Index":24,"Attempt":0,"Launch Time":1678162993369,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":86,"Index":16,"Attempt":0,"Launch Time":1678162992620,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993370,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"87","Value":"1880","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"6553600","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"16","Value":"187","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1150,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7372800,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4317,"Value":107968,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":77443734,"Value":1888070956,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":614,"Value":14009,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":15695708,"Value":414509571,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":82,"Value":1872,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":82,"Executor Deserialize CPU Time":15695708,"Executor Run Time":614,"Executor CPU Time":77443734,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":127,"Index":2,"Attempt":0,"Launch Time":1678162993426,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":99,"Index":26,"Attempt":0,"Launch Time":1678162992624,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993427,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"90","Value":"1970","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"6815744","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"1","Value":"188","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1196,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7667712,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":712,"Name":"internal.metrics.jvmGCTime","Update":85,"Value":85,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4360,"Value":112328,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":103675938,"Value":1991746894,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":702,"Value":14711,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":15305524,"Value":429815095,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":69,"Value":1941,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":69,"Executor Deserialize CPU Time":15305524,"Executor Run Time":702,"Executor CPU Time":103675938,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":85,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":128,"Index":10,"Attempt":0,"Launch Time":1678162993429,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":83,"Index":10,"Attempt":0,"Launch Time":1678162992619,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993431,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"90","Value":"2060","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"7077888","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"4","Value":"192","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1242,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7962624,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":712,"Name":"internal.metrics.jvmGCTime","Update":85,"Value":170,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4360,"Value":116688,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":72849669,"Value":2064596563,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":703,"Value":15414,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11572278,"Value":441387373,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":73,"Value":2014,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":73,"Executor Deserialize CPU Time":11572278,"Executor Run Time":703,"Executor CPU Time":72849669,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":85,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":129,"Index":18,"Attempt":0,"Launch Time":1678162993433,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":91,"Index":18,"Attempt":0,"Launch Time":1678162992621,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993434,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"90","Value":"2150","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"7340032","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"4","Value":"196","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1288,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8257536,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":712,"Name":"internal.metrics.jvmGCTime","Update":85,"Value":255,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4360,"Value":121048,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":53340942,"Value":2117937505,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":700,"Value":16114,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":26602063,"Value":467989436,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":73,"Value":2087,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":73,"Executor Deserialize CPU Time":26602063,"Executor Run Time":700,"Executor CPU Time":53340942,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":85,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":75,"Index":2,"Attempt":0,"Launch Time":1678162992617,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993435,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"88","Value":"2238","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"7602176","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"3","Value":"199","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1334,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8552448,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":712,"Name":"internal.metrics.jvmGCTime","Update":85,"Value":340,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4360,"Value":125408,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":92150391,"Value":2210087896,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":696,"Value":16810,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":15016377,"Value":483005813,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":82,"Value":2169,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":82,"Executor Deserialize CPU Time":15016377,"Executor Run Time":696,"Executor CPU Time":92150391,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":85,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":130,"Index":26,"Attempt":0,"Launch Time":1678162993435,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":131,"Index":1,"Attempt":0,"Launch Time":1678162993488,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":82,"Index":9,"Attempt":0,"Launch Time":1678162992619,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993489,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"84","Value":"2322","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"7864320","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"3","Value":"202","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1380,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8847360,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":712,"Name":"internal.metrics.jvmGCTime","Update":163,"Value":503,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4360,"Value":129768,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":59336502,"Value":2269424398,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":615,"Value":17425,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":16201272,"Value":499207085,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":228,"Value":2397,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":228,"Executor Deserialize CPU Time":16201272,"Executor Run Time":615,"Executor CPU Time":59336502,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":163,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":132,"Index":9,"Attempt":0,"Launch Time":1678162993491,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":74,"Index":1,"Attempt":0,"Launch Time":1678162992617,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993492,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"84","Value":"2406","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"8126464","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"3","Value":"205","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1426,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9142272,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":712,"Name":"internal.metrics.jvmGCTime","Update":163,"Value":666,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4360,"Value":134128,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":88637061,"Value":2358061459,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":614,"Value":18039,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":21840870,"Value":521047955,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":229,"Value":2626,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":229,"Executor Deserialize CPU Time":21840870,"Executor Run Time":614,"Executor CPU Time":88637061,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":163,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":98,"Index":25,"Attempt":0,"Launch Time":1678162992624,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993493,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"84","Value":"2490","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"8388608","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"3","Value":"208","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1472,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9437184,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":712,"Name":"internal.metrics.jvmGCTime","Update":163,"Value":829,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4360,"Value":138488,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":75279968,"Value":2433341427,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":615,"Value":18654,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":12366169,"Value":533414124,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":226,"Value":2852,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":226,"Executor Deserialize CPU Time":12366169,"Executor Run Time":615,"Executor CPU Time":75279968,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":163,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":133,"Index":17,"Attempt":0,"Launch Time":1678162993493,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":134,"Index":25,"Attempt":0,"Launch Time":1678162993497,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":10,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":90,"Index":17,"Attempt":0,"Launch Time":1678162992621,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993497,"Failed":false,"Killed":false,"Accumulables":[{"ID":683,"Name":"duration","Update":"87","Value":"2577","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":685,"Name":"peak memory","Update":"262144","Value":"8650752","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":687,"Name":"time in aggregation build","Update":"3","Value":"211","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":728,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1518,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9732096,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":712,"Name":"internal.metrics.jvmGCTime","Update":163,"Value":992,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Update":4360,"Value":142848,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Update":74041861,"Value":2507383288,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Update":621,"Value":19275,"Internal":true,"Count Failed Values":true},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Update":9357807,"Value":542771931,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Update":228,"Value":3080,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":228,"Executor Deserialize CPU Time":9357807,"Executor Run Time":621,"Executor CPU Time":74041861,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":163,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":10,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":36,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"88\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[35],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":34,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"93\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[33],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":35,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"89\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[34],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":32,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"94\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[31],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":31,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"94\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":33,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"93\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[32],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162992592,"Completion Time":1678162993498,"Accumulables":[{"ID":714,"Name":"internal.metrics.memoryBytesSpilled","Value":0,"Internal":true,"Count Failed Values":true},{"ID":687,"Name":"time in aggregation build","Value":"211","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":708,"Name":"internal.metrics.executorDeserializeCpuTime","Value":542771931,"Internal":true,"Count Failed Values":true},{"ID":711,"Name":"internal.metrics.resultSize","Value":142848,"Internal":true,"Count Failed Values":true},{"ID":710,"Name":"internal.metrics.executorCpuTime","Value":2507383288,"Internal":true,"Count Failed Values":true},{"ID":728,"Name":"internal.metrics.input.bytesRead","Value":1518,"Internal":true,"Count Failed Values":true},{"ID":683,"Name":"duration","Value":"2577","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":713,"Name":"internal.metrics.resultSerializationTime","Value":1,"Internal":true,"Count Failed Values":true},{"ID":707,"Name":"internal.metrics.executorDeserializeTime","Value":3080,"Internal":true,"Count Failed Values":true},{"ID":716,"Name":"internal.metrics.peakExecutionMemory","Value":9732096,"Internal":true,"Count Failed Values":true},{"ID":715,"Name":"internal.metrics.diskBytesSpilled","Value":0,"Internal":true,"Count Failed Values":true},{"ID":709,"Name":"internal.metrics.executorRunTime","Value":19275,"Internal":true,"Count Failed Values":true},{"ID":685,"Name":"peak memory","Value":"8650752","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":712,"Name":"internal.metrics.jvmGCTime","Value":992,"Internal":true,"Count Failed Values":true}]}} -{"Event":"SparkListenerJobEnd","Job ID":6,"Completion Time":1678162993504,"Job Result":{"Result":"JobSucceeded"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate","executionId":7,"physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Project [Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L]\n : +- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Repartition 50000, false\n : +- Filter (content_languages#27 = eng)\n : +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n +- Aggregate [Site#485], [Site#485, avg(cast(levenshtein_distance#215 as bigint)) AS Levenshtein_Distance#411]\n +- Filter (((subset#33 = warc) AND (levenshtein_distance#215 <= 2)) AND isnotnull(Site#485))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Project [Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L]\n : +- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Repartition 50000, false\n : +- Filter (content_languages#27 = eng)\n : +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n +- Aggregate [Site#485], [Site#485, avg(cast(levenshtein_distance#215 as bigint)) AS Levenshtein_Distance#411]\n +- Filter (((subset#33 = warc) AND (levenshtein_distance#215 <= 2)) AND isnotnull(Site#485))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247, Date#344], [Site#247, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L]\n : +- Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#2420 AS Date#344]\n : +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#2420]\n : +- Project [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n : +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n : +- Coalesce 50000\n : +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n : +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n : +- *(1) ColumnarToRow\n : +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- AdaptiveSparkPlan isFinalPlan=false\n +- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- SortMergeJoin [Site#247], [Site#485], FullOuter\n :- Sort [Site#247 ASC NULLS FIRST], false, 0\n : +- HashAggregate(keys=[Site#247], functions=[sum(Total_Record_Length#451L), sum(Total_Pages#449L), avg(Total_Pages#449L)], output=[Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L])\n : +- ShuffleQueryStage 2\n : +- Exchange hashpartitioning(Site#247, 1000), true, [id=#508]\n : +- *(5) HashAggregate(keys=[Site#247], functions=[partial_sum(Total_Record_Length#451L), partial_sum(Total_Pages#449L), partial_avg(Total_Pages#449L)], output=[Site#247, sum#2727L, sum#2728L, sum#2729, count#2730L])\n : +- *(5) HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint))], output=[Site#247, Total_Pages#449L, Total_Record_Length#451L])\n : +- CustomShuffleReader coalesced\n : +- ShuffleQueryStage 0\n : +- Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#435]\n : +- *(1) HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint))], output=[Site#247, Date#344, count#2733L, sum#2734L])\n : +- *(1) Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#2420 AS Date#344]\n : +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#2420]\n : +- InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n : +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n : +- Coalesce 50000\n : +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n : +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n : +- *(1) ColumnarToRow\n : +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=false","children":[{"nodeName":"Project","simpleString":"Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]","children":[{"nodeName":"SortMergeJoin","simpleString":"SortMergeJoin [Site#247], [Site#485], FullOuter","children":[{"nodeName":"Sort","simpleString":"Sort [Site#247 ASC NULLS FIRST], false, 0","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247], functions=[sum(Total_Record_Length#451L), sum(Total_Pages#449L), avg(Total_Pages#449L)])","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 2","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, 1000), true, [id=#508]","children":[{"nodeName":"WholeStageCodegen (5)","simpleString":"WholeStageCodegen (5)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247], functions=[partial_sum(Total_Record_Length#451L), partial_sum(Total_Pages#449L), partial_avg(Total_Pages#449L)])","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"CustomShuffleReader","simpleString":"CustomShuffleReader coalesced","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 0","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#435]","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"Project","simpleString":"Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#2420 AS Date#344]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [(fetch_time#20)], [pythonUDF0#2420]","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":614,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":686,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":687,"metricType":"timing"},{"name":"peak memory","accumulatorId":685,"metricType":"size"},{"name":"number of output rows","accumulatorId":684,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":688,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":683,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":640,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":641,"metricType":"nsTiming"},{"name":"records read","accumulatorId":638,"metricType":"sum"},{"name":"local bytes read","accumulatorId":636,"metricType":"size"},{"name":"fetch wait time","accumulatorId":637,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":634,"metricType":"size"},{"name":"local blocks read","accumulatorId":633,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":632,"metricType":"sum"},{"name":"data size","accumulatorId":631,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":635,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":639,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":862,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":863,"metricType":"timing"},{"name":"peak memory","accumulatorId":861,"metricType":"size"},{"name":"number of output rows","accumulatorId":860,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":864,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":857,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":858,"metricType":"timing"},{"name":"peak memory","accumulatorId":856,"metricType":"size"},{"name":"number of output rows","accumulatorId":855,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":859,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":854,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":843,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":844,"metricType":"nsTiming"},{"name":"records read","accumulatorId":841,"metricType":"sum"},{"name":"local bytes read","accumulatorId":839,"metricType":"size"},{"name":"fetch wait time","accumulatorId":840,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":837,"metricType":"size"},{"name":"local blocks read","accumulatorId":836,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":835,"metricType":"sum"},{"name":"data size","accumulatorId":834,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":838,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":842,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":851,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":852,"metricType":"timing"},{"name":"peak memory","accumulatorId":850,"metricType":"size"},{"name":"number of output rows","accumulatorId":849,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":853,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":846,"metricType":"timing"},{"name":"peak memory","accumulatorId":847,"metricType":"size"},{"name":"spill size","accumulatorId":848,"metricType":"size"}]},{"nodeName":"Sort","simpleString":"Sort [Site#485 ASC NULLS FIRST], false, 0","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#485], functions=[avg(cast(levenshtein_distance#215 as bigint))])","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 1","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#485, 1000), true, [id=#455]","children":[{"nodeName":"WholeStageCodegen (4)","simpleString":"WholeStageCodegen (4)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#485], functions=[partial_avg(cast(levenshtein_distance#215 as bigint))])","children":[{"nodeName":"Project","simpleString":"Project [url_host_registered_domain#13 AS Site#485, pythonUDF0#2422 AS levenshtein_distance#215]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [levenshtein_distance(wikipedia, url_host_2nd_last_part#8)], [pythonUDF0#2422]","children":[{"nodeName":"WholeStageCodegen (3)","simpleString":"WholeStageCodegen (3)","children":[{"nodeName":"Project","simpleString":"Project [url_host_2nd_last_part#8, url_host_registered_domain#13]","children":[{"nodeName":"Filter","simpleString":"Filter (pythonUDF0#2421 <= 2)","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [levenshtein_distance(wikipedia, url_host_2nd_last_part#8)], [pythonUDF0#2421]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"Filter","simpleString":"Filter ((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(url_host_registered_domain#13))","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [url_host_2nd_last_part#8, url_host_registered_domain#13, subset#33], [isnotnull(subset#33), (subset#33 = warc), isnotnull(url_host_registered_domain#13)]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":630,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":706,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":705,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":704,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":703,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":700,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":701,"metricType":"timing"},{"name":"peak memory","accumulatorId":699,"metricType":"size"},{"name":"number of output rows","accumulatorId":698,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":702,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":697,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":662,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":663,"metricType":"nsTiming"},{"name":"records read","accumulatorId":660,"metricType":"sum"},{"name":"local bytes read","accumulatorId":658,"metricType":"size"},{"name":"fetch wait time","accumulatorId":659,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":656,"metricType":"size"},{"name":"local blocks read","accumulatorId":655,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":654,"metricType":"sum"},{"name":"data size","accumulatorId":653,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":657,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":661,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":694,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":695,"metricType":"timing"},{"name":"peak memory","accumulatorId":693,"metricType":"size"},{"name":"number of output rows","accumulatorId":692,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":696,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":689,"metricType":"timing"},{"name":"peak memory","accumulatorId":690,"metricType":"size"},{"name":"spill size","accumulatorId":691,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":845,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":586,"metricType":"sum"},{"name":"written output","accumulatorId":587,"metricType":"size"},{"name":"number of output rows","accumulatorId":588,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":589,"metricType":"sum"}]}} -{"Event":"SparkListenerTaskStart","Stage ID":11,"Stage Attempt ID":0,"Task Info":{"Task ID":135,"Index":32,"Attempt":0,"Launch Time":1678162993608,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":104,"Index":12,"Attempt":0,"Launch Time":1678162993263,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993608,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"51","Value":"51","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"44","Value":"44","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"32","Value":"32","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":46,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":294912,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":737,"Name":"internal.metrics.jvmGCTime","Update":29,"Value":29,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4608,"Value":4608,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":41719006,"Value":41719006,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":278,"Value":278,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":5961550,"Value":5961550,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":57,"Value":57,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":57,"Executor Deserialize CPU Time":5961550,"Executor Run Time":278,"Executor CPU Time":41719006,"Peak Execution Memory":294912,"Result Size":4608,"JVM GC Time":29,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":105,"Index":20,"Attempt":0,"Launch Time":1678162993264,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993611,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"51","Value":"102","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"1","Value":"2","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"35","Value":"79","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"27","Value":"59","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":92,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":589824,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":737,"Name":"internal.metrics.jvmGCTime","Update":29,"Value":58,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4608,"Value":9216,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":77317133,"Value":119036139,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":293,"Value":571,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":8602423,"Value":14563973,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":42,"Value":99,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":42,"Executor Deserialize CPU Time":8602423,"Executor Run Time":293,"Executor CPU Time":77317133,"Peak Execution Memory":294912,"Result Size":4608,"JVM GC Time":29,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":124,"Index":28,"Attempt":0,"Launch Time":1678162993365,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993611,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"51","Value":"153","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"786432","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"2","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"34","Value":"113","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"29","Value":"88","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":138,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":884736,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":737,"Name":"internal.metrics.jvmGCTime","Update":29,"Value":87,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4608,"Value":13824,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":39714216,"Value":158750355,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":229,"Value":800,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":5554408,"Value":20118381,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":7,"Value":106,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":7,"Executor Deserialize CPU Time":5554408,"Executor Run Time":229,"Executor CPU Time":39714216,"Peak Execution Memory":294912,"Result Size":4608,"JVM GC Time":29,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":103,"Index":4,"Attempt":0,"Launch Time":1678162993261,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993611,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"51","Value":"204","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"1048576","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"1","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"35","Value":"148","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"31","Value":"119","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":184,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1179648,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":737,"Name":"internal.metrics.jvmGCTime","Update":29,"Value":116,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4608,"Value":18432,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":80196797,"Value":238947152,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":292,"Value":1092,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11376267,"Value":31494648,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":47,"Value":153,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":47,"Executor Deserialize CPU Time":11376267,"Executor Run Time":292,"Executor CPU Time":80196797,"Peak Execution Memory":294912,"Result Size":4608,"JVM GC Time":29,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":108,"Index":21,"Attempt":0,"Launch Time":1678162993278,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993619,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"70","Value":"274","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"1310720","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"26","Value":"174","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"31","Value":"150","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":230,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1474560,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":22997,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":66593308,"Value":305540460,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":301,"Value":1393,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":5725998,"Value":37220646,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":33,"Value":186,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":33,"Executor Deserialize CPU Time":5725998,"Executor Run Time":301,"Executor CPU Time":66593308,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":106,"Index":5,"Attempt":0,"Launch Time":1678162993275,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993619,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"70","Value":"344","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"1572864","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"35","Value":"209","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"31","Value":"181","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":276,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1769472,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":27562,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":56729766,"Value":362270226,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":302,"Value":1695,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11618007,"Value":48838653,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":34,"Value":220,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":34,"Executor Deserialize CPU Time":11618007,"Executor Run Time":302,"Executor CPU Time":56729766,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":107,"Index":13,"Attempt":0,"Launch Time":1678162993276,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993621,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"70","Value":"414","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"1835008","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"34","Value":"243","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"31","Value":"212","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":322,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2064384,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":32127,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":31946269,"Value":394216495,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":290,"Value":1985,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":10347503,"Value":59186156,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":45,"Value":265,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":45,"Executor Deserialize CPU Time":10347503,"Executor Run Time":290,"Executor CPU Time":31946269,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":109,"Index":29,"Attempt":0,"Launch Time":1678162993288,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993622,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"74","Value":"488","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"2097152","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"24","Value":"267","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"55","Value":"267","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":368,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2359296,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":36692,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":93125827,"Value":487342322,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":303,"Value":2288,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7285418,"Value":66471574,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":22,"Value":287,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":22,"Executor Deserialize CPU Time":7285418,"Executor Run Time":303,"Executor CPU Time":93125827,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":122,"Index":30,"Attempt":0,"Launch Time":1678162993358,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993636,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"55","Value":"543","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"2359296","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"18","Value":"285","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"30","Value":"297","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":414,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2654208,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":41257,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":51850614,"Value":539192936,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":226,"Value":2514,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":8722904,"Value":75194478,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":29,"Value":316,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":29,"Executor Deserialize CPU Time":8722904,"Executor Run Time":226,"Executor CPU Time":51850614,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":114,"Index":6,"Attempt":0,"Launch Time":1678162993329,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993636,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"53","Value":"596","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"2621440","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"18","Value":"303","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"25","Value":"322","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":460,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2949120,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":45822,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":38320265,"Value":577513201,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":224,"Value":2738,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11682990,"Value":86877468,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":70,"Value":386,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":70,"Executor Deserialize CPU Time":11682990,"Executor Run Time":224,"Executor CPU Time":38320265,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":121,"Index":22,"Attempt":0,"Launch Time":1678162993349,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993637,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"52","Value":"648","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"2883584","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"27","Value":"330","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"30","Value":"352","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":506,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3244032,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":50387,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":62434507,"Value":639947708,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":223,"Value":2961,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7898850,"Value":94776318,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":52,"Value":438,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":52,"Executor Deserialize CPU Time":7898850,"Executor Run Time":223,"Executor CPU Time":62434507,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":111,"Index":11,"Attempt":0,"Launch Time":1678162993317,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993637,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"59","Value":"707","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"3145728","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"38","Value":"368","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"46","Value":"398","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":552,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3538944,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":54952,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":30769863,"Value":670717571,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":274,"Value":3235,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6161892,"Value":100938210,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":32,"Value":470,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":32,"Executor Deserialize CPU Time":6161892,"Executor Run Time":274,"Executor CPU Time":30769863,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":110,"Index":3,"Attempt":0,"Launch Time":1678162993315,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993637,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"60","Value":"767","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"3407872","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"45","Value":"413","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"43","Value":"441","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":598,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3833856,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":59517,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":42523999,"Value":713241570,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":272,"Value":3507,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":10038737,"Value":110976947,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":37,"Value":507,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":37,"Executor Deserialize CPU Time":10038737,"Executor Run Time":272,"Executor CPU Time":42523999,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":113,"Index":27,"Attempt":0,"Launch Time":1678162993322,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993638,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"60","Value":"827","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"3670016","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"3","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"26","Value":"439","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"27","Value":"468","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":644,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4128768,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":64082,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":81144766,"Value":794386336,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":274,"Value":3781,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6347940,"Value":117324887,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":20,"Value":527,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":20,"Executor Deserialize CPU Time":6347940,"Executor Run Time":274,"Executor CPU Time":81144766,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":112,"Index":19,"Attempt":0,"Launch Time":1678162993321,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993638,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"63","Value":"890","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"3932160","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"3","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"26","Value":"465","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"38","Value":"506","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":690,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4423680,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":68647,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":101172224,"Value":895558560,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":277,"Value":4058,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":9296436,"Value":126621323,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":29,"Value":556,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":29,"Executor Deserialize CPU Time":9296436,"Executor Run Time":277,"Executor CPU Time":101172224,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":118,"Index":14,"Attempt":0,"Launch Time":1678162993339,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993642,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"57","Value":"947","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"4194304","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"6","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"31","Value":"496","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"24","Value":"530","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":736,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4718592,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":73212,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":70690768,"Value":966249328,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":224,"Value":4282,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7911847,"Value":134533170,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":61,"Value":617,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":61,"Executor Deserialize CPU Time":7911847,"Executor Run Time":224,"Executor CPU Time":70690768,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":123,"Index":16,"Attempt":0,"Launch Time":1678162993361,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993674,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"73","Value":"1020","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"4456448","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"2","Value":"8","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"58","Value":"554","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"27","Value":"557","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":782,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5013504,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":737,"Name":"internal.metrics.jvmGCTime","Update":20,"Value":136,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4608,"Value":77820,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":66555759,"Value":1032805087,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":250,"Value":4532,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":5647079,"Value":140180249,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":50,"Value":667,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":50,"Executor Deserialize CPU Time":5647079,"Executor Run Time":250,"Executor CPU Time":66555759,"Peak Execution Memory":294912,"Result Size":4608,"JVM GC Time":20,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":119,"Index":8,"Attempt":0,"Launch Time":1678162993340,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993677,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"71","Value":"1091","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"8","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"50","Value":"604","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"22","Value":"579","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":828,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5308416,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":737,"Name":"internal.metrics.jvmGCTime","Update":20,"Value":156,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4608,"Value":82428,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":61645813,"Value":1094450900,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":249,"Value":4781,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6278891,"Value":146459140,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":65,"Value":732,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":65,"Executor Deserialize CPU Time":6278891,"Executor Run Time":249,"Executor CPU Time":61645813,"Peak Execution Memory":294912,"Result Size":4608,"JVM GC Time":20,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":115,"Index":0,"Attempt":0,"Launch Time":1678162993330,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993678,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"72","Value":"1163","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"4980736","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"8","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"21","Value":"625","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"17","Value":"596","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":874,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5603328,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":737,"Name":"internal.metrics.jvmGCTime","Update":20,"Value":176,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4608,"Value":87036,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":67068364,"Value":1161519264,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":251,"Value":5032,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11958819,"Value":158417959,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":83,"Value":815,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":83,"Executor Deserialize CPU Time":11958819,"Executor Run Time":251,"Executor CPU Time":67068364,"Peak Execution Memory":294912,"Result Size":4608,"JVM GC Time":20,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":125,"Index":31,"Attempt":0,"Launch Time":1678162993366,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993684,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"55","Value":"1218","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"5242880","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"1","Value":"9","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"18","Value":"643","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"26","Value":"622","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":920,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5898240,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":91601,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":112839470,"Value":1274358734,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":289,"Value":5321,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":5884460,"Value":164302419,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":6,"Value":821,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":6,"Executor Deserialize CPU Time":5884460,"Executor Run Time":289,"Executor CPU Time":112839470,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":117,"Index":15,"Attempt":0,"Launch Time":1678162993337,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993685,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"55","Value":"1273","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"5505024","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"1","Value":"10","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"19","Value":"662","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"20","Value":"642","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":966,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6193152,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":96166,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":40324442,"Value":1314683176,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":296,"Value":5617,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6918114,"Value":171220533,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":37,"Value":858,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":37,"Executor Deserialize CPU Time":6918114,"Executor Run Time":296,"Executor CPU Time":40324442,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":135,"Index":32,"Attempt":0,"Launch Time":1678162993608,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993685,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"39","Value":"1312","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"5767168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"34","Value":"44","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"40","Value":"702","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"2","Value":"644","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1012,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6488064,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":100731,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":17682869,"Value":1332366045,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":60,"Value":5677,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":4540572,"Value":175761105,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":6,"Value":864,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":6,"Executor Deserialize CPU Time":4540572,"Executor Run Time":60,"Executor CPU Time":17682869,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":116,"Index":7,"Attempt":0,"Launch Time":1678162993333,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993685,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"56","Value":"1368","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"6029312","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"44","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"29","Value":"731","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"67","Value":"711","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1058,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6782976,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":105296,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":42507205,"Value":1374873250,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":302,"Value":5979,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":15397465,"Value":191158570,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":32,"Value":896,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":32,"Executor Deserialize CPU Time":15397465,"Executor Run Time":302,"Executor CPU Time":42507205,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":126,"Index":24,"Attempt":0,"Launch Time":1678162993369,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993685,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"72","Value":"1440","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"6291456","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"1","Value":"45","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"21","Value":"752","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"28","Value":"739","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1104,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7077888,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":737,"Name":"internal.metrics.jvmGCTime","Update":20,"Value":196,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4608,"Value":109904,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":65727307,"Value":1440600557,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":248,"Value":6227,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":9777786,"Value":200936356,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":44,"Value":940,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":44,"Executor Deserialize CPU Time":9777786,"Executor Run Time":248,"Executor CPU Time":65727307,"Peak Execution Memory":294912,"Result Size":4608,"JVM GC Time":20,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":120,"Index":23,"Attempt":0,"Launch Time":1678162993346,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993688,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"73","Value":"1513","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"6553600","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"45","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"18","Value":"770","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"70","Value":"809","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1150,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7372800,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":114469,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":53263629,"Value":1493864186,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":314,"Value":6541,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":8063421,"Value":208999777,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":17,"Value":957,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":17,"Executor Deserialize CPU Time":8063421,"Executor Run Time":314,"Executor CPU Time":53263629,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerJobStart","Job ID":8,"Submission Time":1678162993691,"Stage Infos":[{"Stage ID":12,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":36,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"88\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[35],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":34,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"93\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[33],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":35,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"89\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[34],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":32,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"94\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[31],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":31,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"94\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":33,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"93\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[32],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]},{"Stage ID":13,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":49,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"111\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[48],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":48,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"112\",\"name\":\"WholeStageCodegen (5)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[47],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":47,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"116\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[36],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[12],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]}],"Stage IDs":[12,13],"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"111\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"7","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":13,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":49,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"111\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[48],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":48,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"112\",\"name\":\"WholeStageCodegen (5)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[47],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":47,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"116\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[36],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[12],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162993693,"Accumulables":[]},"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"111\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"7","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerTaskStart","Stage ID":13,"Stage Attempt ID":0,"Task Info":{"Task ID":136,"Index":0,"Attempt":0,"Launch Time":1678162993710,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":131,"Index":1,"Attempt":0,"Launch Time":1678162993488,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993749,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"45","Value":"1558","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"6815744","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"45","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"17","Value":"787","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"35","Value":"844","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1196,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7667712,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":119034,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":45003418,"Value":1538867604,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":199,"Value":6740,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":12149931,"Value":221149708,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":53,"Value":1010,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":53,"Executor Deserialize CPU Time":12149931,"Executor Run Time":199,"Executor CPU Time":45003418,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":134,"Index":25,"Attempt":0,"Launch Time":1678162993497,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993750,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"45","Value":"1603","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"7077888","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"45","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"26","Value":"813","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"35","Value":"879","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1242,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7962624,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":123599,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":36764629,"Value":1575632233,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":197,"Value":6937,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6923444,"Value":228073152,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":45,"Value":1055,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":45,"Executor Deserialize CPU Time":6923444,"Executor Run Time":197,"Executor CPU Time":36764629,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":132,"Index":9,"Attempt":0,"Launch Time":1678162993491,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993750,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"45","Value":"1648","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"7340032","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"1","Value":"46","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"17","Value":"830","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"35","Value":"914","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1288,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8257536,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":128164,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":77263998,"Value":1652896231,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":197,"Value":7134,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6761483,"Value":234834635,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":52,"Value":1107,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":52,"Executor Deserialize CPU Time":6761483,"Executor Run Time":197,"Executor CPU Time":77263998,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":133,"Index":17,"Attempt":0,"Launch Time":1678162993493,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993752,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"50","Value":"1698","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"7602176","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"6","Value":"52","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"17","Value":"847","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"24","Value":"938","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1334,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8552448,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":132729,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":63045017,"Value":1715941248,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":200,"Value":7334,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6218513,"Value":241053148,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":52,"Value":1159,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":52,"Executor Deserialize CPU Time":6218513,"Executor Run Time":200,"Executor CPU Time":63045017,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":129,"Index":18,"Attempt":0,"Launch Time":1678162993433,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993764,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"56","Value":"1754","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"7864320","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"52","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"50","Value":"897","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"26","Value":"964","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1380,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8847360,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":137294,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":34917946,"Value":1750859194,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":231,"Value":7565,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6281482,"Value":247334630,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":82,"Value":1241,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":82,"Executor Deserialize CPU Time":6281482,"Executor Run Time":231,"Executor CPU Time":34917946,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":130,"Index":26,"Attempt":0,"Launch Time":1678162993435,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993764,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"56","Value":"1810","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"8126464","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"52","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"41","Value":"938","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"21","Value":"985","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1426,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9142272,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":141859,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":42424074,"Value":1793283268,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":231,"Value":7796,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":5374857,"Value":252709487,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":85,"Value":1326,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":85,"Executor Deserialize CPU Time":5374857,"Executor Run Time":231,"Executor CPU Time":42424074,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":128,"Index":10,"Attempt":0,"Launch Time":1678162993429,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993765,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"56","Value":"1866","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"8388608","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"52","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"41","Value":"979","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"19","Value":"1004","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1472,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9437184,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":146424,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":109365502,"Value":1902648770,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":228,"Value":8024,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7709189,"Value":260418676,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":84,"Value":1410,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":84,"Executor Deserialize CPU Time":7709189,"Executor Run Time":228,"Executor CPU Time":109365502,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":11,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":127,"Index":2,"Attempt":0,"Launch Time":1678162993426,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993765,"Failed":false,"Killed":false,"Accumulables":[{"ID":697,"Name":"duration","Update":"56","Value":"1922","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Update":"262144","Value":"8650752","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":701,"Name":"time in aggregation build","Update":"0","Value":"52","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":703,"Name":"duration","Update":"41","Value":"1020","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":705,"Name":"duration","Update":"26","Value":"1030","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":753,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1518,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9732096,"Internal":true,"Count Failed Values":true},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":736,"Name":"internal.metrics.resultSize","Update":4565,"Value":150989,"Internal":true,"Count Failed Values":true},{"ID":735,"Name":"internal.metrics.executorCpuTime","Update":51522399,"Value":1954171169,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Update":230,"Value":8254,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Update":16458192,"Value":276876868,"Internal":true,"Count Failed Values":true},{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Update":101,"Value":1511,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":101,"Executor Deserialize CPU Time":16458192,"Executor Run Time":230,"Executor CPU Time":51522399,"Peak Execution Memory":294912,"Result Size":4565,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":11,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":46,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"95\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[45],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":45,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"96\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[44],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":41,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"105\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[40],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":39,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"106\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[38],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":37,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"109\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":38,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"109\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[37],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":43,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"100\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[42],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":40,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"105\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[39],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":42,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"101\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[41],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":44,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"100\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[43],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162992775,"Completion Time":1678162993768,"Accumulables":[{"ID":732,"Name":"internal.metrics.executorDeserializeTime","Value":1511,"Internal":true,"Count Failed Values":true},{"ID":741,"Name":"internal.metrics.peakExecutionMemory","Value":9732096,"Internal":true,"Count Failed Values":true},{"ID":705,"Name":"duration","Value":"1030","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":699,"Name":"peak memory","Value":"8650752","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":735,"Name":"internal.metrics.executorCpuTime","Value":1954171169,"Internal":true,"Count Failed Values":true},{"ID":753,"Name":"internal.metrics.input.bytesRead","Value":1518,"Internal":true,"Count Failed Values":true},{"ID":737,"Name":"internal.metrics.jvmGCTime","Value":196,"Internal":true,"Count Failed Values":true},{"ID":701,"Name":"time in aggregation build","Value":"52","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":740,"Name":"internal.metrics.diskBytesSpilled","Value":0,"Internal":true,"Count Failed Values":true},{"ID":734,"Name":"internal.metrics.executorRunTime","Value":8254,"Internal":true,"Count Failed Values":true},{"ID":733,"Name":"internal.metrics.executorDeserializeCpuTime","Value":276876868,"Internal":true,"Count Failed Values":true},{"ID":697,"Name":"duration","Value":"1922","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":736,"Name":"internal.metrics.resultSize","Value":150989,"Internal":true,"Count Failed Values":true},{"ID":703,"Name":"duration","Value":"1020","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":739,"Name":"internal.metrics.memoryBytesSpilled","Value":0,"Internal":true,"Count Failed Values":true}]}} -{"Event":"SparkListenerJobEnd","Job ID":7,"Completion Time":1678162993775,"Job Result":{"Result":"JobSucceeded"}} -{"Event":"SparkListenerTaskEnd","Stage ID":13,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":136,"Index":0,"Attempt":0,"Launch Time":1678162993710,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162993900,"Failed":false,"Killed":false,"Accumulables":[{"ID":854,"Name":"duration","Update":"33","Value":"33","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":856,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":858,"Name":"time in aggregation build","Update":"4","Value":"4","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":861,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":863,"Name":"time in aggregation build","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":882,"Name":"internal.metrics.shuffle.read.recordsRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":881,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":880,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":879,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":878,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":877,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":876,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":874,"Name":"internal.metrics.peakExecutionMemory","Update":557056,"Value":557056,"Internal":true,"Count Failed Values":true},{"ID":869,"Name":"internal.metrics.resultSize","Update":5959,"Value":5959,"Internal":true,"Count Failed Values":true},{"ID":868,"Name":"internal.metrics.executorCpuTime","Update":123565595,"Value":123565595,"Internal":true,"Count Failed Values":true},{"ID":867,"Name":"internal.metrics.executorRunTime","Update":133,"Value":133,"Internal":true,"Count Failed Values":true},{"ID":866,"Name":"internal.metrics.executorDeserializeCpuTime","Update":41965992,"Value":41965992,"Internal":true,"Count Failed Values":true},{"ID":865,"Name":"internal.metrics.executorDeserializeTime","Update":50,"Value":50,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":50,"Executor Deserialize CPU Time":41965992,"Executor Run Time":133,"Executor CPU Time":123565595,"Peak Execution Memory":557056,"Result Size":5959,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":13,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":49,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"111\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[48],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":48,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"112\",\"name\":\"WholeStageCodegen (5)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[47],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":47,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"116\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[36],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[12],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162993693,"Completion Time":1678162993901,"Accumulables":[{"ID":882,"Name":"internal.metrics.shuffle.read.recordsRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":863,"Name":"time in aggregation build","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":881,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Value":0,"Internal":true,"Count Failed Values":true},{"ID":866,"Name":"internal.metrics.executorDeserializeCpuTime","Value":41965992,"Internal":true,"Count Failed Values":true},{"ID":869,"Name":"internal.metrics.resultSize","Value":5959,"Internal":true,"Count Failed Values":true},{"ID":878,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":854,"Name":"duration","Value":"33","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":877,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":880,"Name":"internal.metrics.shuffle.read.localBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":856,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":865,"Name":"internal.metrics.executorDeserializeTime","Value":50,"Internal":true,"Count Failed Values":true},{"ID":874,"Name":"internal.metrics.peakExecutionMemory","Value":557056,"Internal":true,"Count Failed Values":true},{"ID":868,"Name":"internal.metrics.executorCpuTime","Value":123565595,"Internal":true,"Count Failed Values":true},{"ID":858,"Name":"time in aggregation build","Value":"4","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":867,"Name":"internal.metrics.executorRunTime","Value":133,"Internal":true,"Count Failed Values":true},{"ID":876,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":861,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":879,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Value":0,"Internal":true,"Count Failed Values":true}]}} -{"Event":"SparkListenerJobEnd","Job ID":8,"Completion Time":1678162993903,"Job Result":{"Result":"JobSucceeded"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate","executionId":7,"physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Project [Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L]\n : +- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Repartition 50000, false\n : +- Filter (content_languages#27 = eng)\n : +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n +- Aggregate [Site#485], [Site#485, avg(cast(levenshtein_distance#215 as bigint)) AS Levenshtein_Distance#411]\n +- Filter (((subset#33 = warc) AND (levenshtein_distance#215 <= 2)) AND isnotnull(Site#485))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Project [Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L]\n : +- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n : +- Repartition 50000, false\n : +- Filter (content_languages#27 = eng)\n : +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n +- Aggregate [Site#485], [Site#485, avg(cast(levenshtein_distance#215 as bigint)) AS Levenshtein_Distance#411]\n +- Filter (((subset#33 = warc) AND (levenshtein_distance#215 <= 2)) AND isnotnull(Site#485))\n +- SubqueryAlias ccindex\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#485, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- Join FullOuter, (Site#247 = Site#485)\n :- Aggregate [Site#247], [Site#247, sum(Total_Record_Length#451L) AS Total_Record_Length_Site#465L, (cast(sum(Total_Record_Length#451L) as double) / cast(sum(Total_Pages#449L) as double)) AS Avg_Record_Length#468, avg(Total_Pages#449L) AS Avg_Daily_Pages#470, sum(Total_Pages#449L) AS Total_Pages#472L, monotonically_increasing_id() AS Site_id#478L]\n : +- Aggregate [Site#247, Date#344], [Site#247, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L]\n : +- Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#2420 AS Date#344]\n : +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#2420]\n : +- Project [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n : +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n : +- Coalesce 50000\n : +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n : +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n : +- *(1) ColumnarToRow\n : +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]\n+- AdaptiveSparkPlan isFinalPlan=true\n +- *(8) Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]\n +- SortMergeJoin [Site#247], [Site#485], FullOuter\n :- *(6) Sort [Site#247 ASC NULLS FIRST], false, 0\n : +- *(6) HashAggregate(keys=[Site#247], functions=[sum(Total_Record_Length#451L), sum(Total_Pages#449L), avg(Total_Pages#449L)], output=[Site#247, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L])\n : +- CustomShuffleReader coalesced\n : +- ShuffleQueryStage 2\n : +- Exchange hashpartitioning(Site#247, 1000), true, [id=#508]\n : +- *(5) HashAggregate(keys=[Site#247], functions=[partial_sum(Total_Record_Length#451L), partial_sum(Total_Pages#449L), partial_avg(Total_Pages#449L)], output=[Site#247, sum#2727L, sum#2728L, sum#2729, count#2730L])\n : +- *(5) HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint))], output=[Site#247, Total_Pages#449L, Total_Record_Length#451L])\n : +- CustomShuffleReader coalesced\n : +- ShuffleQueryStage 0\n : +- Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#435]\n : +- *(1) HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint))], output=[Site#247, Date#344, count#2733L, sum#2734L])\n : +- *(1) Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#2420 AS Date#344]\n : +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#2420]\n : +- InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n : +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n : +- Coalesce 50000\n : +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n : +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n : +- *(1) ColumnarToRow\n : +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_summary_1GB), Overwrite, [Site, Total_Record_Length_Site, Avg_Record_Length, Avg_Daily_Pages, Total_Pages, Site_id, Levenshtein_Distance]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=true","children":[{"nodeName":"WholeStageCodegen (8)","simpleString":"WholeStageCodegen (8)","children":[{"nodeName":"Project","simpleString":"Project [coalesce(Site#247, Site#485) AS Site#486, Total_Record_Length_Site#465L, Avg_Record_Length#468, Avg_Daily_Pages#470, Total_Pages#472L, Site_id#478L, Levenshtein_Distance#411]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"SortMergeJoin","simpleString":"SortMergeJoin [Site#247], [Site#485], FullOuter","children":[{"nodeName":"WholeStageCodegen (6)","simpleString":"WholeStageCodegen (6)","children":[{"nodeName":"Sort","simpleString":"Sort [Site#247 ASC NULLS FIRST], false, 0","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247], functions=[sum(Total_Record_Length#451L), sum(Total_Pages#449L), avg(Total_Pages#449L)])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"CustomShuffleReader","simpleString":"CustomShuffleReader coalesced","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 2","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, 1000), true, [id=#508]","children":[{"nodeName":"WholeStageCodegen (5)","simpleString":"WholeStageCodegen (5)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247], functions=[partial_sum(Total_Record_Length#451L), partial_sum(Total_Pages#449L), partial_avg(Total_Pages#449L)])","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"CustomShuffleReader","simpleString":"CustomShuffleReader coalesced","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 0","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#435]","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"Project","simpleString":"Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#2420 AS Date#344]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [(fetch_time#20)], [pythonUDF0#2420]","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":614,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":686,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":687,"metricType":"timing"},{"name":"peak memory","accumulatorId":685,"metricType":"size"},{"name":"number of output rows","accumulatorId":684,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":688,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":683,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":640,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":641,"metricType":"nsTiming"},{"name":"records read","accumulatorId":638,"metricType":"sum"},{"name":"local bytes read","accumulatorId":636,"metricType":"size"},{"name":"fetch wait time","accumulatorId":637,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":634,"metricType":"size"},{"name":"local blocks read","accumulatorId":633,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":632,"metricType":"sum"},{"name":"data size","accumulatorId":631,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":635,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":639,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":862,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":863,"metricType":"timing"},{"name":"peak memory","accumulatorId":861,"metricType":"size"},{"name":"number of output rows","accumulatorId":860,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":864,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":857,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":858,"metricType":"timing"},{"name":"peak memory","accumulatorId":856,"metricType":"size"},{"name":"number of output rows","accumulatorId":855,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":859,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":854,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":843,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":844,"metricType":"nsTiming"},{"name":"records read","accumulatorId":841,"metricType":"sum"},{"name":"local bytes read","accumulatorId":839,"metricType":"size"},{"name":"fetch wait time","accumulatorId":840,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":837,"metricType":"size"},{"name":"local blocks read","accumulatorId":836,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":835,"metricType":"sum"},{"name":"data size","accumulatorId":834,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":838,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":842,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":942,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":943,"metricType":"timing"},{"name":"peak memory","accumulatorId":941,"metricType":"size"},{"name":"number of output rows","accumulatorId":940,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":944,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":937,"metricType":"timing"},{"name":"peak memory","accumulatorId":938,"metricType":"size"},{"name":"spill size","accumulatorId":939,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":936,"metricType":"timing"}]},{"nodeName":"WholeStageCodegen (7)","simpleString":"WholeStageCodegen (7)","children":[{"nodeName":"Sort","simpleString":"Sort [Site#485 ASC NULLS FIRST], false, 0","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#485], functions=[avg(cast(levenshtein_distance#215 as bigint))])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"CustomShuffleReader","simpleString":"CustomShuffleReader coalesced","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 1","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#485, 1000), true, [id=#455]","children":[{"nodeName":"WholeStageCodegen (4)","simpleString":"WholeStageCodegen (4)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#485], functions=[partial_avg(cast(levenshtein_distance#215 as bigint))])","children":[{"nodeName":"Project","simpleString":"Project [url_host_registered_domain#13 AS Site#485, pythonUDF0#2422 AS levenshtein_distance#215]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [levenshtein_distance(wikipedia, url_host_2nd_last_part#8)], [pythonUDF0#2422]","children":[{"nodeName":"WholeStageCodegen (3)","simpleString":"WholeStageCodegen (3)","children":[{"nodeName":"Project","simpleString":"Project [url_host_2nd_last_part#8, url_host_registered_domain#13]","children":[{"nodeName":"Filter","simpleString":"Filter (pythonUDF0#2421 <= 2)","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [levenshtein_distance(wikipedia, url_host_2nd_last_part#8)], [pythonUDF0#2421]","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"Filter","simpleString":"Filter ((isnotnull(subset#33) AND (subset#33 = warc)) AND isnotnull(url_host_registered_domain#13))","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [url_host_2nd_last_part#8, url_host_registered_domain#13, subset#33], [isnotnull(subset#33), (subset#33 = warc), isnotnull(url_host_registered_domain#13)]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":630,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":706,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":705,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":704,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":703,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":700,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":701,"metricType":"timing"},{"name":"peak memory","accumulatorId":699,"metricType":"size"},{"name":"number of output rows","accumulatorId":698,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":702,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":697,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":662,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":663,"metricType":"nsTiming"},{"name":"records read","accumulatorId":660,"metricType":"sum"},{"name":"local bytes read","accumulatorId":658,"metricType":"size"},{"name":"fetch wait time","accumulatorId":659,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":656,"metricType":"size"},{"name":"local blocks read","accumulatorId":655,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":654,"metricType":"sum"},{"name":"data size","accumulatorId":653,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":657,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":661,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":951,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":952,"metricType":"timing"},{"name":"peak memory","accumulatorId":950,"metricType":"size"},{"name":"number of output rows","accumulatorId":949,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":953,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"sort time","accumulatorId":946,"metricType":"timing"},{"name":"peak memory","accumulatorId":947,"metricType":"size"},{"name":"spill size","accumulatorId":948,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":945,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":935,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":934,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":586,"metricType":"sum"},{"name":"written output","accumulatorId":587,"metricType":"size"},{"name":"number of output rows","accumulatorId":588,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":589,"metricType":"sum"}]}} -{"Event":"SparkListenerJobStart","Job ID":9,"Submission Time":1678162994136,"Stage Infos":[{"Stage ID":15,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":36,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"88\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[35],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":34,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"93\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[33],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":35,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"89\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[34],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":32,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"94\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[31],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":31,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"94\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":33,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"93\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[32],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]},{"Stage ID":16,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":49,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"111\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[48],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":48,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"112\",\"name\":\"WholeStageCodegen (5)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[47],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":47,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"116\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[36],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[15],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]},{"Stage ID":17,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":55,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"119\",\"name\":\"WholeStageCodegen (8)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[54],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":54,"Name":"ZippedPartitionsRDD2","Scope":"{\"id\":\"122\",\"name\":\"SortMergeJoin\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[51,53],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":53,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"128\",\"name\":\"WholeStageCodegen (7)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[52],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":51,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"123\",\"name\":\"WholeStageCodegen (6)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[50],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":50,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"127\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[49],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":52,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"132\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[46],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[16,14],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]},{"Stage ID":14,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":46,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"95\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[45],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":45,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"96\",\"name\":\"WholeStageCodegen (4)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[44],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":41,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"105\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[40],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":39,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"106\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[38],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":37,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"109\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":38,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"109\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[37],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":43,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"100\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[42],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":40,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"105\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[39],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":42,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"101\",\"name\":\"WholeStageCodegen (3)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[41],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":44,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"100\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[43],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]}],"Stage IDs":[15,16,17,14],"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"86\",\"name\":\"Execute InsertIntoHadoopFsRelationCommand\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"7","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":17,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":55,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"119\",\"name\":\"WholeStageCodegen (8)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[54],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":54,"Name":"ZippedPartitionsRDD2","Scope":"{\"id\":\"122\",\"name\":\"SortMergeJoin\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[51,53],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":53,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"128\",\"name\":\"WholeStageCodegen (7)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[52],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":51,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"123\",\"name\":\"WholeStageCodegen (6)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[50],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":50,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"127\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[49],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":52,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"132\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[46],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[16,14],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162994138,"Accumulables":[]},"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"86\",\"name\":\"Execute InsertIntoHadoopFsRelationCommand\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"7","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerTaskStart","Stage ID":17,"Stage Attempt ID":0,"Task Info":{"Task ID":137,"Index":0,"Attempt":0,"Launch Time":1678162994186,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":17,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":137,"Index":0,"Attempt":0,"Launch Time":1678162994186,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162994875,"Failed":false,"Killed":false,"Accumulables":[{"ID":934,"Name":"duration","Update":"310","Value":"310","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":945,"Name":"duration","Update":"43","Value":"43","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":948,"Name":"spill size","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":947,"Name":"peak memory","Update":"65536","Value":"65536","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":946,"Name":"sort time","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":950,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":952,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":936,"Name":"duration","Update":"101","Value":"101","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":939,"Name":"spill size","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":938,"Name":"peak memory","Update":"65536","Value":"65536","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":937,"Name":"sort time","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":941,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":943,"Name":"time in aggregation build","Update":"1","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":977,"Name":"internal.metrics.output.bytesWritten","Update":105,"Value":105,"Internal":true,"Count Failed Values":true},{"ID":971,"Name":"internal.metrics.shuffle.read.recordsRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":970,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":969,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":968,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":967,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":966,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":965,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":963,"Name":"internal.metrics.peakExecutionMemory","Update":655360,"Value":655360,"Internal":true,"Count Failed Values":true},{"ID":960,"Name":"internal.metrics.resultSerializationTime","Update":1,"Value":1,"Internal":true,"Count Failed Values":true},{"ID":959,"Name":"internal.metrics.jvmGCTime","Update":88,"Value":88,"Internal":true,"Count Failed Values":true},{"ID":958,"Name":"internal.metrics.resultSize","Update":8973,"Value":8973,"Internal":true,"Count Failed Values":true},{"ID":957,"Name":"internal.metrics.executorCpuTime","Update":376180312,"Value":376180312,"Internal":true,"Count Failed Values":true},{"ID":956,"Name":"internal.metrics.executorRunTime","Update":584,"Value":584,"Internal":true,"Count Failed Values":true},{"ID":955,"Name":"internal.metrics.executorDeserializeCpuTime","Update":89830500,"Value":89830500,"Internal":true,"Count Failed Values":true},{"ID":954,"Name":"internal.metrics.executorDeserializeTime","Update":95,"Value":95,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":95,"Executor Deserialize CPU Time":89830500,"Executor Run Time":584,"Executor CPU Time":376180312,"Peak Execution Memory":655360,"Result Size":8973,"JVM GC Time":88,"Result Serialization Time":1,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":105,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":17,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":55,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"119\",\"name\":\"WholeStageCodegen (8)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[54],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":54,"Name":"ZippedPartitionsRDD2","Scope":"{\"id\":\"122\",\"name\":\"SortMergeJoin\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[51,53],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":53,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"128\",\"name\":\"WholeStageCodegen (7)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[52],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":51,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"123\",\"name\":\"WholeStageCodegen (6)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[50],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":50,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"127\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[49],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":52,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"132\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[46],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[16,14],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162994138,"Completion Time":1678162994876,"Accumulables":[{"ID":941,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":950,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":977,"Name":"internal.metrics.output.bytesWritten","Value":105,"Internal":true,"Count Failed Values":true},{"ID":968,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Value":0,"Internal":true,"Count Failed Values":true},{"ID":959,"Name":"internal.metrics.jvmGCTime","Value":88,"Internal":true,"Count Failed Values":true},{"ID":971,"Name":"internal.metrics.shuffle.read.recordsRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":947,"Name":"peak memory","Value":"65536","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":938,"Name":"peak memory","Value":"65536","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":965,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":956,"Name":"internal.metrics.executorRunTime","Value":584,"Internal":true,"Count Failed Values":true},{"ID":955,"Name":"internal.metrics.executorDeserializeCpuTime","Value":89830500,"Internal":true,"Count Failed Values":true},{"ID":946,"Name":"sort time","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":937,"Name":"sort time","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":967,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":958,"Name":"internal.metrics.resultSize","Value":8973,"Internal":true,"Count Failed Values":true},{"ID":943,"Name":"time in aggregation build","Value":"1","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":970,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Value":0,"Internal":true,"Count Failed Values":true},{"ID":952,"Name":"time in aggregation build","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":934,"Name":"duration","Value":"310","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":960,"Name":"internal.metrics.resultSerializationTime","Value":1,"Internal":true,"Count Failed Values":true},{"ID":969,"Name":"internal.metrics.shuffle.read.localBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":954,"Name":"internal.metrics.executorDeserializeTime","Value":95,"Internal":true,"Count Failed Values":true},{"ID":945,"Name":"duration","Value":"43","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":963,"Name":"internal.metrics.peakExecutionMemory","Value":655360,"Internal":true,"Count Failed Values":true},{"ID":936,"Name":"duration","Value":"101","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":939,"Name":"spill size","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":948,"Name":"spill size","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":957,"Name":"internal.metrics.executorCpuTime","Value":376180312,"Internal":true,"Count Failed Values":true},{"ID":966,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true}]}} -{"Event":"SparkListenerJobEnd","Job ID":9,"Completion Time":1678162994876,"Job Result":{"Result":"JobSucceeded"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerDriverAccumUpdates","executionId":7,"accumUpdates":[[586,1],[587,105],[588,0],[589,0]]} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerQueryExecutionMetrics","executionId":7,"timePerRule":{"PruneFileSourcePartitions":968485,"ReassignLambdaVariableID":458071,"PushPredicateThroughNonJoin":3564620,"Analyzer$HandleNullInputsForUDF":23624,"Analyzer$ResolveSubqueryColumnAliases":13084,"ResolveTimeZone":9392,"Analyzer$ResolveNamespace":40585,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":10261,"RewriteCorrelatedScalarSubquery":1412978,"RemoveLiteralFromGroupExpressions":638734,"PushProjectionThroughUnion":1187865,"EliminateSubqueryAliases":810487,"ResolveCatalogs":15056,"PushLeftSemiLeftAntiThroughJoin":1300255,"FlattenScalarSubqueriesWithAggregates":421222,"LikeSimplification":1400818,"CollapseRepartition":1375546,"ResolveHints$ResolveCoalesceHints":8236,"Analyzer$ExtractGenerator":41577,"RewriteIntersectAll":599542,"ResolveHints$ResolveJoinStrategyHints":9362,"TypeCoercion$MapZipWithCoercion":9931,"NullPropagation":2423557,"PullupCorrelatedPredicates":919992,"UpdateOuterReferences":18996,"ExtractPythonUDFs":9683900,"Analyzer$WindowsSubstitution":12159,"CombineUnions":1320543,"ExtractGroupingPythonUDFFromAggregate":293194,"ReorderAssociativeOperator":1823992,"CleanupDynamicPruningFilters":814900,"ResolveHints$RemoveAllHints":17764,"SimplifyBinaryComparison":2143319,"ResolveTableValuedFunctions":11033,"EliminateSerialization":893367,"TypeCoercion$BooleanEquality":9601,"package$ExpressionCanonicalizer$CleanExpressions":51978,"ReplaceIntersectWithSemiJoin":597646,"ConstantPropagation":1119496,"CostBasedJoinReorder":17485,"Analyzer$ResolveReferences":49414,"CTESubstitution":607868,"RemoveRedundantAliases":4614815,"TypeCoercion$ImplicitTypeCasts":16952,"RewriteExceptAll":688308,"UpdateAttributeNullability":122673,"PropagateEmptyRelation":1256651,"SimplifyCasts":1487533,"EliminateMapObjects":456078,"CombineLimits":961370,"DetectAmbiguousSelfJoin":54610,"ReplaceExpressions":948894,"ResolveInlineTables":9518,"OptimizeIn":1428915,"CollapseWindow":992661,"TypeCoercion$IfCoercion":51283,"ResolveSessionCatalog":22328,"PartitionPruning":5432760,"BooleanSimplification":3166661,"TypeCoercion$PromoteStrings":10293,"Analyzer$ResolveAliases":14254,"DecimalAggregates":505562,"PruneFilters":1666010,"Analyzer$ResolveMissingReferences":29496,"TransposeWindow":1007413,"Analyzer$ResolveRelations":23314,"EliminateUnions":20640,"RewritePredicateSubquery":518475,"ObjectSerializerPruning":273902,"LimitPushDown":1203704,"SimplifyCaseConversionExpressions":1525130,"Analyzer$ResolveNaturalAndUsingJoin":13808,"EliminateView":608174,"CombineTypedFilters":278335,"OptimizeLimitZero":355385,"CheckCartesianProducts":31115,"ExtractPythonUDFFromAggregate":361175,"Analyzer$ExtractWindowExpressions":35358,"ReplaceExceptWithAntiJoin":710824,"ResolveLambdaVariables":13076,"FallBackFileSourceV2":12554,"Analyzer$ResolveTables":12838,"SubstituteUnresolvedOrdinals":8625,"TypeCoercion$CaseWhenCoercion":18357,"DecimalPrecision":15787,"EliminateSorts":1787170,"PushDownLeftSemiAntiJoin":2582198,"ExtractPythonUDFFromJoinCondition":889690,"TypeCoercion$StackCoercion":19042,"Analyzer$ResolveAggAliasInGroupBy":8825,"TypeCoercion$StringLiteralCoercion":15729,"FoldablePropagation":663614,"V2ScanRelationPushDown":791231,"EliminateDistinct":13649,"InferFiltersFromConstraints":1617887,"Analyzer$PullOutNondeterministic":18439,"Analyzer$ResolveFunctions":20690,"ReplaceNullWithFalseInPredicate":1514748,"ResolveHigherOrderFunctions":10852,"Analyzer$ResolvePivot":7788,"CollapseProject":3113855,"Analyzer$ResolveNewInstance":10493,"ColumnPruning":16478446,"Analyzer$ResolveWindowOrder":17480,"TypeCoercion$ConcatCoercion":11757,"PushDownPredicates":6424559,"TimeWindowing":34569,"Optimizer$OptimizeSubqueries":1951143,"RewriteNonCorrelatedExists":989595,"DemoteBroadcastHashJoin":1834545,"TypeCoercion$Division":16469,"ComputeCurrentTime":987701,"ResolveCreateNamedStruct":12365,"TypeCoercion$EltCoercion":35356,"ConvertToLocalRelation":634826,"RemoveRepetitionFromGroupExpressions":641634,"ReplaceDistinctWithAggregate":611322,"PreprocessTableCreation":17681,"ResolveSQLOnFile":12676,"Analyzer$ResolveSubquery":13677,"CombineConcats":30221,"Analyzer$ResolveGroupingAnalytics":12235,"Analyzer$ResolveBinaryArithmetic":11483,"RemoveDispensableExpressions":1434468,"Analyzer$ResolveAlterTableChanges":17707,"ResolveEncodersInScalaAgg":18494,"TypeCoercion$IntegralDivision":15699,"Analyzer$ResolveWindowFrame":15529,"Analyzer$ResolveDeserializer":11365,"RewriteDistinctAggregates":701057,"RemoveNoopOperators":3230208,"Analyzer$ResolveAggregateFunctions":9219,"NormalizeFloatingNumbers":7723553,"ReorderJoin":1309215,"Analyzer$ResolveUpCast":9756,"Analyzer$ResolveGenerate":15652,"TypeCoercion$WidenSetOperationTypes":7941,"EliminateOuterJoin":1184041,"SimplifyExtractValueOps":1273134,"OptimizeMetadataOnlyQuery":15500,"EliminateResolvedHint":2535553,"Analyzer$ResolveInsertInto":17181,"ReplaceExceptWithFilter":592917,"CleanupAliases":24138,"GetCurrentDatabase":1035811,"SchemaPruning":841302,"Analyzer$ResolveOutputRelation":16547,"BloomFilterJoinRule":1535456,"Analyzer$ResolveRandomSeed":9952,"TypeCoercion$WindowFrameCoercion":17154,"ConstantFolding":1470136,"TypeCoercion$DateTimeOperations":16043,"TypeCoercion$InConversion":10911,"FindDataSourceTable":14674,"SimplifyConditionals":1486631,"DataSourceAnalysis":13526,"TypeCoercion$FunctionArgumentConversion":9678,"Analyzer$GlobalAggregates":14321,"Analyzer$LookupFunctions":15287,"CombineFilters":1276526,"ReplaceDeduplicateWithAggregate":399595,"PreprocessTableInsertion":13035},"numRunsPerRule":{"PruneFileSourcePartitions":1,"ReassignLambdaVariableID":1,"PushPredicateThroughNonJoin":1,"Analyzer$HandleNullInputsForUDF":1,"Analyzer$ResolveSubqueryColumnAliases":1,"ResolveTimeZone":1,"Analyzer$ResolveNamespace":1,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":1,"RewriteCorrelatedScalarSubquery":3,"RemoveLiteralFromGroupExpressions":1,"PushProjectionThroughUnion":3,"EliminateSubqueryAliases":1,"ResolveCatalogs":1,"PushLeftSemiLeftAntiThroughJoin":3,"FlattenScalarSubqueriesWithAggregates":1,"LikeSimplification":3,"CollapseRepartition":3,"ResolveHints$ResolveCoalesceHints":1,"Analyzer$ExtractGenerator":1,"RewriteIntersectAll":1,"ResolveHints$ResolveJoinStrategyHints":1,"TypeCoercion$MapZipWithCoercion":1,"NullPropagation":3,"PullupCorrelatedPredicates":1,"UpdateOuterReferences":1,"ExtractPythonUDFs":1,"Analyzer$WindowsSubstitution":1,"CombineUnions":4,"ExtractGroupingPythonUDFFromAggregate":1,"ReorderAssociativeOperator":3,"CleanupDynamicPruningFilters":1,"ResolveHints$RemoveAllHints":1,"SimplifyBinaryComparison":3,"ResolveTableValuedFunctions":1,"EliminateSerialization":3,"TypeCoercion$BooleanEquality":1,"package$ExpressionCanonicalizer$CleanExpressions":4,"ReplaceIntersectWithSemiJoin":1,"ConstantPropagation":3,"CostBasedJoinReorder":1,"Analyzer$ResolveReferences":1,"CTESubstitution":1,"RemoveRedundantAliases":3,"TypeCoercion$ImplicitTypeCasts":1,"RewriteExceptAll":1,"UpdateAttributeNullability":1,"PropagateEmptyRelation":2,"SimplifyCasts":3,"EliminateMapObjects":1,"CombineLimits":3,"DetectAmbiguousSelfJoin":1,"ReplaceExpressions":1,"ResolveInlineTables":1,"OptimizeIn":3,"CollapseWindow":3,"TypeCoercion$IfCoercion":1,"ResolveSessionCatalog":1,"PartitionPruning":1,"BooleanSimplification":3,"TypeCoercion$PromoteStrings":1,"Analyzer$ResolveAliases":1,"DecimalAggregates":1,"PruneFilters":4,"Analyzer$ResolveMissingReferences":1,"TransposeWindow":3,"Analyzer$ResolveRelations":1,"EliminateUnions":1,"RewritePredicateSubquery":1,"ObjectSerializerPruning":1,"LimitPushDown":3,"SimplifyCaseConversionExpressions":3,"Analyzer$ResolveNaturalAndUsingJoin":1,"EliminateView":1,"CombineTypedFilters":1,"OptimizeLimitZero":1,"CheckCartesianProducts":2,"ExtractPythonUDFFromAggregate":1,"Analyzer$ExtractWindowExpressions":1,"ReplaceExceptWithAntiJoin":1,"ResolveLambdaVariables":1,"FallBackFileSourceV2":1,"Analyzer$ResolveTables":1,"SubstituteUnresolvedOrdinals":1,"TypeCoercion$CaseWhenCoercion":1,"DecimalPrecision":1,"EliminateSorts":1,"PushDownLeftSemiAntiJoin":3,"ExtractPythonUDFFromJoinCondition":1,"TypeCoercion$StackCoercion":1,"Analyzer$ResolveAggAliasInGroupBy":1,"TypeCoercion$StringLiteralCoercion":1,"FoldablePropagation":3,"V2ScanRelationPushDown":1,"EliminateDistinct":1,"InferFiltersFromConstraints":1,"Analyzer$PullOutNondeterministic":1,"Analyzer$ResolveFunctions":1,"ReplaceNullWithFalseInPredicate":3,"ResolveHigherOrderFunctions":1,"Analyzer$ResolvePivot":1,"CollapseProject":4,"Analyzer$ResolveNewInstance":1,"ColumnPruning":5,"Analyzer$ResolveWindowOrder":1,"TypeCoercion$ConcatCoercion":1,"PushDownPredicates":5,"TimeWindowing":1,"Optimizer$OptimizeSubqueries":3,"RewriteNonCorrelatedExists":1,"DemoteBroadcastHashJoin":3,"TypeCoercion$Division":1,"ComputeCurrentTime":1,"ResolveCreateNamedStruct":1,"TypeCoercion$EltCoercion":1,"ConvertToLocalRelation":2,"RemoveRepetitionFromGroupExpressions":1,"ReplaceDistinctWithAggregate":1,"PreprocessTableCreation":1,"ResolveSQLOnFile":1,"Analyzer$ResolveSubquery":1,"CombineConcats":3,"Analyzer$ResolveGroupingAnalytics":1,"Analyzer$ResolveBinaryArithmetic":1,"RemoveDispensableExpressions":3,"Analyzer$ResolveAlterTableChanges":1,"ResolveEncodersInScalaAgg":1,"TypeCoercion$IntegralDivision":1,"Analyzer$ResolveWindowFrame":1,"Analyzer$ResolveDeserializer":1,"RewriteDistinctAggregates":1,"RemoveNoopOperators":5,"Analyzer$ResolveAggregateFunctions":1,"NormalizeFloatingNumbers":1,"ReorderJoin":3,"Analyzer$ResolveUpCast":1,"Analyzer$ResolveGenerate":1,"TypeCoercion$WidenSetOperationTypes":1,"EliminateOuterJoin":3,"SimplifyExtractValueOps":3,"OptimizeMetadataOnlyQuery":1,"EliminateResolvedHint":1,"Analyzer$ResolveInsertInto":1,"ReplaceExceptWithFilter":1,"CleanupAliases":1,"GetCurrentDatabase":1,"SchemaPruning":1,"Analyzer$ResolveOutputRelation":1,"BloomFilterJoinRule":1,"Analyzer$ResolveRandomSeed":1,"TypeCoercion$WindowFrameCoercion":1,"ConstantFolding":3,"TypeCoercion$DateTimeOperations":1,"TypeCoercion$InConversion":1,"FindDataSourceTable":1,"SimplifyConditionals":3,"DataSourceAnalysis":1,"TypeCoercion$FunctionArgumentConversion":1,"Analyzer$GlobalAggregates":1,"Analyzer$LookupFunctions":1,"CombineFilters":4,"ReplaceDeduplicateWithAggregate":1,"PreprocessTableInsertion":1},"numEffectiveRunsPerRule":{"PushPredicateThroughNonJoin":1,"EliminateSubqueryAliases":1,"ExtractPythonUDFs":1,"RewritePredicateSubquery":1,"InferFiltersFromConstraints":1,"CollapseProject":1,"ColumnPruning":2,"PushDownPredicates":1,"RemoveNoopOperators":1},"timeEffectiveRunsPerRule":{"PushPredicateThroughNonJoin":3564620,"EliminateSubqueryAliases":810487,"ExtractPythonUDFs":9683900,"RewritePredicateSubquery":518475,"InferFiltersFromConstraints":1617887,"CollapseProject":2178101,"ColumnPruning":11608093,"PushDownPredicates":4785341,"RemoveNoopOperators":1660431},"counters":{},"timers":{}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":7,"time":1678162994964} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionStart","executionId":8,"description":"save at NativeMethodAccessorImpl.java:0","details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]\n+- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]\n+- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]\n+- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n +- Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#3357 AS Date#344]\n +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#3357]\n +- Project [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]\n+- AdaptiveSparkPlan isFinalPlan=false\n +- HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint)), avg(cast(warc_record_length#31 as bigint))], output=[Site#247, Date#344, Total_Pages#449L, Total_Record_Length#451L, Avg_Record_Length#453])\n +- Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#602]\n +- HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint)), partial_avg(cast(warc_record_length#31 as bigint))], output=[Site#247, Date#344, count#2733L, sum#2734L, sum#3510, count#3511L])\n +- Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#3357 AS Date#344]\n +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#3357]\n +- InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=false","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint)), avg(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#602]","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint)), partial_avg(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"Project","simpleString":"Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#3357 AS Date#344]","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [(fetch_time#20)], [pythonUDF0#3357]","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":1026,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1023,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1024,"metricType":"timing"},{"name":"peak memory","accumulatorId":1022,"metricType":"size"},{"name":"number of output rows","accumulatorId":1021,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":1025,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":988,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":989,"metricType":"nsTiming"},{"name":"records read","accumulatorId":986,"metricType":"sum"},{"name":"local bytes read","accumulatorId":984,"metricType":"size"},{"name":"fetch wait time","accumulatorId":985,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":982,"metricType":"size"},{"name":"local blocks read","accumulatorId":981,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":980,"metricType":"sum"},{"name":"data size","accumulatorId":979,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":983,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":987,"metricType":"size"}]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1018,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1019,"metricType":"timing"},{"name":"peak memory","accumulatorId":1017,"metricType":"size"},{"name":"number of output rows","accumulatorId":1016,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":1020,"metricType":"average"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":1012,"metricType":"sum"},{"name":"written output","accumulatorId":1013,"metricType":"size"},{"name":"number of output rows","accumulatorId":1014,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":1015,"metricType":"sum"}]},"time":1678162995039} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerEffectiveSQLConf","executionId":8,"effectiveSQLConf":{"spark.sql.adaptive.coalescePartitions.initialPartitionNum":"","spark.sql.streaming.disabledV2Writers":"","spark.sql.streaming.noDataMicroBatches.enabled":"true","spark.sql.files.maxPartitionBytes":"128MB","spark.sql.thriftserver.ui.retainedStatements":"200","spark.sql.ui.retainedExecutions":"1000","spark.sql.execution.arrow.pyspark.fallback.enabled":"","spark.sql.sources.parallelPartitionDiscovery.threshold":"32","spark.sql.cbo.joinReorder.enabled":"false","spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.int96AsTimestamp":"true","spark.sql.thriftserver.ui.retainedSessions":"200","spark.sql.hive.verifyPartitionPath":"false","spark.sql.autoBroadcastJoinThreshold":"10MB","spark.sql.orc.mergeSchema":"false","spark.sql.adaptive.enabled":"true","spark.sql.adaptive.skewJoin.enabled":"true","spark.sql.optimizer.sizeBasedJoinReorder.enabled":"true","spark.sql.streaming.ui.retainedQueries":"100","spark.sql.orc.enableVectorizedReader":"true","spark.sql.streaming.continuous.epochBacklogQueueSize":"10000","spark.sql.parquet.binaryAsString":"false","spark.sql.pivotMaxValues":"10000","spark.sql.adaptive.localShuffleReader.enabled":"true","spark.sql.storeAssignmentPolicy":"ANSI","spark.sql.redaction.options.regex":"(?i)url","spark.sql.cbo.joinReorder.dp.star.filter":"false","spark.sql.parser.quotedRegexColumnNames":"false","spark.sql.files.ignoreMissingFiles":"false","spark.sql.streaming.ui.enabled":"true","spark.sql.execution.arrow.sparkr.enabled":"false","spark.sql.orc.filterPushdown":"true","spark.sql.thriftserver.scheduler.pool":"","spark.sql.catalog.spark_catalog":"","spark.sql.datetime.java8API.enabled":"false","spark.sql.execution.pandas.udf.buffer.size":"","spark.sql.function.eltOutputAsString":"false","spark.sql.cbo.joinReorder.dp.threshold":"12","spark.sql.statsImprovements.enabled":"true","spark.sql.parquet.columnarReaderBatchSize":"4096","spark.sql.parquet.outputTimestampType":"INT96","spark.sql.adaptive.skewJoin.skewedPartitionThresholdInBytes":"256MB","spark.sql.statistics.size.autoUpdate.enabled":"false","spark.sql.hive.filesourcePartitionFileCacheSize":"262144000","spark.sql.streaming.ui.retainedProgressUpdates":"100","spark.sql.inMemoryColumnarStorage.compressed":"true","spark.sql.pyspark.jvmStacktrace.enabled":"false","spark.sql.cbo.starSchemaDetection":"false","spark.sql.columnNameOfCorruptRecord":"_corrupt_record","spark.sql.adaptive.advisoryPartitionSizeInBytes":"","spark.sql.avro.compression.codec":"snappy","spark.sql.streaming.metricsEnabled":"false","spark.sql.ui.pruneCachedInMemoryRelation":"true","spark.sql.event.truncate.length":"2147483647","spark.sql.groupByAliases":"true","spark.sql.hive.metastorePartitionPruning":"true","spark.sql.execution.arrow.enabled":"false","spark.sql.optimizer.dynamicPartitionPruning.enabled":"true","spark.sql.extendedEventInfo":"true","spark.sql.sources.bucketing.maxBuckets":"100000","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.jsonGenerator.ignoreNullFields":"true","spark.sql.optimizer.flattenScalarSubqueriesWithAggregates.enabled":"true","spark.sql.debug.maxToStringFields":"25","spark.sql.files.maxRecordsPerFile":"0","spark.sql.hive.thriftServer.singleSession":"false","spark.sql.parquet.mergeSchema":"false","spark.sql.adaptive.skewJoin.skewedPartitionFactor":"5","spark.sql.queryExecutionListeners":"","spark.sql.streaming.numRecentProgressUpdates":"100","spark.sql.variable.substitute":"true","spark.sql.adaptive.shuffle.mapOutputStats.useDataSize":"true","spark.sql.function.concatBinaryAsString":"false","spark.sql.avro.deflate.level":"-1","spark.sql.repl.eagerEval.maxNumRows":"20","spark.sql.ansi.enabled":"false","spark.sql.legacy.allowHashOnMapType":"false","spark.sql.bloomFilterJoin.maxFilterBytes":"2097152","spark.sql.orderByOrdinal":"true","spark.sql.streaming.stopTimeout":"0","spark.sql.adaptive.coalescePartitions.enabled":"true","spark.sql.session.timeZone":"UTC","spark.sql.parquet.respectSummaryFiles":"false","spark.sql.parquet.enableVectorizedReader":"true","spark.sql.cbo.enabled":"false","spark.sql.sources.bucketing.enabled":"true","spark.sql.csv.filterPushdown.enabled":"true","spark.sql.repl.eagerEval.truncate":"20","spark.sql.cbo.planStats.enabled":"false","spark.sql.optimizer.excludedRules":"","spark.sql.sources.partitionColumnTypeInference.enabled":"true","spark.sql.sortMergeJoinExec.extendedCodegen.enabled":"true","spark.sql.execution.arrow.fallback.enabled":"true","spark.sql.streaming.forceDeleteTempCheckpointLocation":"false","spark.sql.maxPlanStringLength":"2147483632","spark.sql.hive.manageFilesourcePartitions":"true","spark.sql.parquet.int96TimestampConversion":"false","spark.sql.sources.partitionOverwriteMode":"STATIC","spark.sql.streaming.checkpointLocation":"","spark.sql.broadcastTimeout":"300","spark.sql.execution.arrow.pyspark.enabled":"","spark.sql.repl.eagerEval.enabled":"false","spark.sql.mapKeyDedupPolicy":"EXCEPTION","spark.sql.parquet.filterPushdown":"true","spark.sql.maven.additionalRemoteRepositories":"https://maven-central.storage-download.googleapis.com/maven2/","spark.sql.orc.compression.codec":"snappy","spark.sql.statistics.histogram.enabled":"false","spark.sql.streaming.stopActiveRunOnRestart":"true","spark.sql.orc.columnarReaderBatchSize":"4096","spark.sql.redaction.string.regex":"","spark.sql.inMemoryColumnarStorage.batchSize":"10000","spark.sql.parquet.writeLegacyFormat":"false","spark.sql.statistics.fallBackToHdfs":"false","spark.sql.defaultCatalog":"spark_catalog","spark.sql.streaming.fileSource.cleaner.numThreads":"1","spark.sql.legacy.sessionInitWithConfigDefaults":"false","spark.sql.adaptive.coalescePartitions.minPartitionNum":"","spark.sql.execution.arrow.maxRecordsPerBatch":"10000","spark.sql.stringHashComputationWithoutCopyMemory.enabled":"true","spark.sql.streaming.streamingQueryListeners":"","spark.sql.files.ignoreCorruptFiles":"false","spark.sql.inMemoryColumnarStorage.enableVectorizedReader":"true","spark.sql.extensions":"","spark.sql.sources.default":"parquet","spark.sql.ui.extendedInfo":"true","spark.sql.bloomFilterJoin.enabled":"true","spark.sql.parquet.compression.codec":"snappy","spark.sql.optimizer.distinctBeforeIntersect.enabled":"true","spark.sql.parquet.recordLevelFilter.enabled":"false","spark.sql.shuffle.partitions":"200","spark.sql.groupByOrdinal":"true"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate","executionId":8,"physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]\n+- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]\n+- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]\n+- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n +- Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#3357 AS Date#344]\n +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#3357]\n +- Project [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]\n+- AdaptiveSparkPlan isFinalPlan=false\n +- HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint)), avg(cast(warc_record_length#31 as bigint))], output=[Site#247, Date#344, Total_Pages#449L, Total_Record_Length#451L, Avg_Record_Length#453])\n +- ShuffleQueryStage 0\n +- Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#619]\n +- *(1) HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint)), partial_avg(cast(warc_record_length#31 as bigint))], output=[Site#247, Date#344, count#2733L, sum#2734L, sum#3510, count#3511L])\n +- *(1) Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#3357 AS Date#344]\n +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#3357]\n +- InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=false","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint)), avg(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 0","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#619]","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint)), partial_avg(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"Project","simpleString":"Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#3357 AS Date#344]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [(fetch_time#20)], [pythonUDF0#3357]","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":1026,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1046,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1047,"metricType":"timing"},{"name":"peak memory","accumulatorId":1045,"metricType":"size"},{"name":"number of output rows","accumulatorId":1044,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":1048,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":1043,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":1036,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":1037,"metricType":"nsTiming"},{"name":"records read","accumulatorId":1034,"metricType":"sum"},{"name":"local bytes read","accumulatorId":1032,"metricType":"size"},{"name":"fetch wait time","accumulatorId":1033,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":1030,"metricType":"size"},{"name":"local blocks read","accumulatorId":1029,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":1028,"metricType":"sum"},{"name":"data size","accumulatorId":1027,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":1031,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":1035,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1040,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1041,"metricType":"timing"},{"name":"peak memory","accumulatorId":1039,"metricType":"size"},{"name":"number of output rows","accumulatorId":1038,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":1042,"metricType":"average"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":1012,"metricType":"sum"},{"name":"written output","accumulatorId":1013,"metricType":"size"},{"name":"number of output rows","accumulatorId":1014,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":1015,"metricType":"sum"}]}} -{"Event":"SparkListenerJobStart","Job ID":10,"Submission Time":1678162995370,"Stage Infos":[{"Stage ID":18,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":63,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"162\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[62],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":62,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"163\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[61],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":58,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"168\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":59,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"168\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[58],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":60,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"167\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[59],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":61,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"167\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[60],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]}],"Stage IDs":[18],"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"162\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"8","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":18,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":63,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"162\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[62],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":62,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"163\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[61],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":58,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"168\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":59,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"168\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[58],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":60,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"167\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[59],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":61,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"167\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[60],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162995377,"Accumulables":[]},"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"162\",\"name\":\"ShuffleQueryStage\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"8","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":138,"Index":4,"Attempt":0,"Launch Time":1678162995389,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":139,"Index":5,"Attempt":0,"Launch Time":1678162995389,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":140,"Index":3,"Attempt":0,"Launch Time":1678162995389,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":141,"Index":2,"Attempt":0,"Launch Time":1678162995389,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":142,"Index":6,"Attempt":0,"Launch Time":1678162995389,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":143,"Index":1,"Attempt":0,"Launch Time":1678162995390,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":144,"Index":7,"Attempt":0,"Launch Time":1678162995390,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":145,"Index":0,"Attempt":0,"Launch Time":1678162995390,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":146,"Index":12,"Attempt":0,"Launch Time":1678162995390,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":147,"Index":13,"Attempt":0,"Launch Time":1678162995390,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":148,"Index":11,"Attempt":0,"Launch Time":1678162995391,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":149,"Index":10,"Attempt":0,"Launch Time":1678162995391,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":150,"Index":14,"Attempt":0,"Launch Time":1678162995391,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":151,"Index":9,"Attempt":0,"Launch Time":1678162995391,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":152,"Index":15,"Attempt":0,"Launch Time":1678162995391,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":153,"Index":8,"Attempt":0,"Launch Time":1678162995392,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":154,"Index":20,"Attempt":0,"Launch Time":1678162995392,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":155,"Index":21,"Attempt":0,"Launch Time":1678162995392,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":156,"Index":19,"Attempt":0,"Launch Time":1678162995392,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":157,"Index":18,"Attempt":0,"Launch Time":1678162995392,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":158,"Index":22,"Attempt":0,"Launch Time":1678162995393,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":159,"Index":17,"Attempt":0,"Launch Time":1678162995393,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":160,"Index":23,"Attempt":0,"Launch Time":1678162995393,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":161,"Index":16,"Attempt":0,"Launch Time":1678162995393,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":162,"Index":28,"Attempt":0,"Launch Time":1678162995393,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":163,"Index":29,"Attempt":0,"Launch Time":1678162995394,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":164,"Index":27,"Attempt":0,"Launch Time":1678162995394,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":165,"Index":26,"Attempt":0,"Launch Time":1678162995394,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":166,"Index":30,"Attempt":0,"Launch Time":1678162995394,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":167,"Index":25,"Attempt":0,"Launch Time":1678162995394,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":168,"Index":31,"Attempt":0,"Launch Time":1678162995395,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":169,"Index":24,"Attempt":0,"Launch Time":1678162995395,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":148,"Index":11,"Attempt":0,"Launch Time":1678162995391,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995543,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"48","Value":"48","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"25","Value":"25","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":46,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":294912,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":4317,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":21320078,"Value":21320078,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":122,"Value":122,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":4953046,"Value":4953046,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":15,"Value":15,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":15,"Executor Deserialize CPU Time":4953046,"Executor Run Time":122,"Executor CPU Time":21320078,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":156,"Index":19,"Attempt":0,"Launch Time":1678162995392,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995543,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"36","Value":"84","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"524288","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"25","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":92,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":589824,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":8634,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":46373233,"Value":67693311,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":123,"Value":245,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":5155506,"Value":10108552,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":15,"Value":30,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":15,"Executor Deserialize CPU Time":5155506,"Executor Run Time":123,"Executor CPU Time":46373233,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":164,"Index":27,"Attempt":0,"Launch Time":1678162995394,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995544,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"27","Value":"111","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"786432","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"25","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":138,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":884736,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":12951,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":16928643,"Value":84621954,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":118,"Value":363,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":4904786,"Value":15013338,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":18,"Value":48,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":18,"Executor Deserialize CPU Time":4904786,"Executor Run Time":118,"Executor CPU Time":16928643,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":140,"Index":3,"Attempt":0,"Launch Time":1678162995389,"Executor ID":"2","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995544,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"48","Value":"159","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"1048576","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"25","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":184,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1179648,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":17268,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":14809047,"Value":99431001,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":121,"Value":484,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":9702007,"Value":24715345,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":17,"Value":65,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":17,"Executor Deserialize CPU Time":9702007,"Executor Run Time":121,"Executor CPU Time":14809047,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":159,"Index":17,"Attempt":0,"Launch Time":1678162995393,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995584,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"35","Value":"194","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"1310720","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":230,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1474560,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":21585,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":23775870,"Value":123206871,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":104,"Value":588,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":4955355,"Value":29670700,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":47,"Value":112,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":47,"Executor Deserialize CPU Time":4955355,"Executor Run Time":104,"Executor CPU Time":23775870,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":143,"Index":1,"Attempt":0,"Launch Time":1678162995390,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995587,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"40","Value":"234","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"1572864","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":276,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":1769472,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":25902,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":18792895,"Value":141999766,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":112,"Value":700,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":9486320,"Value":39157020,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":48,"Value":160,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":48,"Executor Deserialize CPU Time":9486320,"Executor Run Time":112,"Executor CPU Time":18792895,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":149,"Index":10,"Attempt":0,"Launch Time":1678162995391,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995589,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"40","Value":"274","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"1835008","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":322,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2064384,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":30219,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":26620393,"Value":168620159,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":154,"Value":854,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":4434561,"Value":43591581,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":20,"Value":180,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":20,"Executor Deserialize CPU Time":4434561,"Executor Run Time":154,"Executor CPU Time":26620393,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":151,"Index":9,"Attempt":0,"Launch Time":1678162995391,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995589,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"36","Value":"310","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"2097152","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":368,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2359296,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":34536,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":40997063,"Value":209617222,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":114,"Value":968,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":3796499,"Value":47388080,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":43,"Value":223,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":43,"Executor Deserialize CPU Time":3796499,"Executor Run Time":114,"Executor CPU Time":40997063,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":141,"Index":2,"Attempt":0,"Launch Time":1678162995389,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995591,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"41","Value":"351","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"2359296","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":414,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2654208,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":38853,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":48829550,"Value":258446772,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":156,"Value":1124,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":9021856,"Value":56409936,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":24,"Value":247,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":24,"Executor Deserialize CPU Time":9021856,"Executor Run Time":156,"Executor CPU Time":48829550,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":167,"Index":25,"Attempt":0,"Launch Time":1678162995394,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995592,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"35","Value":"386","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"2621440","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":460,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":2949120,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":43170,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":22386891,"Value":280833663,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":111,"Value":1235,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":3741591,"Value":60151527,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":41,"Value":288,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":41,"Executor Deserialize CPU Time":3741591,"Executor Run Time":111,"Executor CPU Time":22386891,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":165,"Index":26,"Attempt":0,"Launch Time":1678162995394,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995602,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"47","Value":"433","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"2883584","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":506,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3244032,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":47487,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":18851281,"Value":299684944,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":164,"Value":1399,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":5128876,"Value":65280403,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":15,"Value":303,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":15,"Executor Deserialize CPU Time":5128876,"Executor Run Time":164,"Executor CPU Time":18851281,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":157,"Index":18,"Attempt":0,"Launch Time":1678162995392,"Executor ID":"3","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995604,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"45","Value":"478","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"3145728","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":552,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3538944,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":51804,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":24384581,"Value":324069525,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":160,"Value":1559,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":4662042,"Value":69942445,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":23,"Value":326,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":23,"Executor Deserialize CPU Time":4662042,"Executor Run Time":160,"Executor CPU Time":24384581,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":144,"Index":7,"Attempt":0,"Launch Time":1678162995390,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995611,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"42","Value":"520","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"3407872","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":598,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":3833856,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":56121,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":20518004,"Value":344587529,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":114,"Value":1673,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":9907909,"Value":79850354,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":56,"Value":382,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":56,"Executor Deserialize CPU Time":9907909,"Executor Run Time":114,"Executor CPU Time":20518004,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":168,"Index":31,"Attempt":0,"Launch Time":1678162995395,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995613,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"42","Value":"562","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"3670016","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"50","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":644,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4128768,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":60438,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":25982392,"Value":370569921,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":105,"Value":1778,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":5221445,"Value":85071799,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":55,"Value":437,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":55,"Executor Deserialize CPU Time":5221445,"Executor Run Time":105,"Executor CPU Time":25982392,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":152,"Index":15,"Attempt":0,"Launch Time":1678162995391,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995614,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"53","Value":"615","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"3932160","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"7","Value":"57","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":690,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4423680,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":64755,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":48263553,"Value":418833474,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":113,"Value":1891,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":4297031,"Value":89368830,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":66,"Value":503,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":66,"Executor Deserialize CPU Time":4297031,"Executor Run Time":113,"Executor CPU Time":48263553,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":160,"Index":23,"Attempt":0,"Launch Time":1678162995393,"Executor ID":"6","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995617,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"53","Value":"668","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"4194304","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"14","Value":"71","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":736,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":4718592,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":69072,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":27864481,"Value":446697955,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":113,"Value":2004,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":5283606,"Value":94652436,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":67,"Value":570,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":67,"Executor Deserialize CPU Time":5283606,"Executor Run Time":113,"Executor CPU Time":27864481,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":139,"Index":5,"Attempt":0,"Launch Time":1678162995389,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995636,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"64","Value":"732","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"4456448","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"71","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":782,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5013504,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1054,"Name":"internal.metrics.jvmGCTime","Update":26,"Value":26,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4360,"Value":73432,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":43499716,"Value":490197671,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":210,"Value":2214,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":10877708,"Value":105530144,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":23,"Value":593,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":23,"Executor Deserialize CPU Time":10877708,"Executor Run Time":210,"Executor CPU Time":43499716,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":26,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":155,"Index":21,"Attempt":0,"Launch Time":1678162995392,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995637,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"65","Value":"797","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"4718592","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"71","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":828,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5308416,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1054,"Name":"internal.metrics.jvmGCTime","Update":26,"Value":52,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4360,"Value":77792,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":22685397,"Value":512883068,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":209,"Value":2423,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6545492,"Value":112075636,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":22,"Value":615,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":22,"Executor Deserialize CPU Time":6545492,"Executor Run Time":209,"Executor CPU Time":22685397,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":26,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":166,"Index":30,"Attempt":0,"Launch Time":1678162995394,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995637,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"41","Value":"838","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"4980736","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"71","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":874,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5603328,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":82109,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":64312810,"Value":577195878,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":160,"Value":2583,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7627848,"Value":119703484,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":52,"Value":667,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":52,"Executor Deserialize CPU Time":7627848,"Executor Run Time":160,"Executor CPU Time":64312810,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":142,"Index":6,"Attempt":0,"Launch Time":1678162995389,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995638,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"41","Value":"879","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"5242880","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"71","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":920,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":5898240,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":86426,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":23994741,"Value":601190619,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":158,"Value":2741,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":11955428,"Value":131658912,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":59,"Value":726,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":59,"Executor Deserialize CPU Time":11955428,"Executor Run Time":158,"Executor CPU Time":23994741,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":147,"Index":13,"Attempt":0,"Launch Time":1678162995390,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995645,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"82","Value":"961","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"5505024","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"71","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":966,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6193152,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1054,"Name":"internal.metrics.jvmGCTime","Update":26,"Value":78,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4360,"Value":90786,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":23600822,"Value":624791441,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":218,"Value":2959,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6490041,"Value":138148953,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":21,"Value":747,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":21,"Executor Deserialize CPU Time":6490041,"Executor Run Time":218,"Executor CPU Time":23600822,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":26,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":158,"Index":22,"Attempt":0,"Launch Time":1678162995393,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995645,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"49","Value":"1010","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"5767168","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"71","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1012,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6488064,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":95103,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":24357024,"Value":649148465,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":164,"Value":3123,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7467748,"Value":145616701,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":54,"Value":801,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":54,"Executor Deserialize CPU Time":7467748,"Executor Run Time":164,"Executor CPU Time":24357024,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":150,"Index":14,"Attempt":0,"Launch Time":1678162995391,"Executor ID":"8","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995649,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"56","Value":"1066","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"6029312","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"71","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1058,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":6782976,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":99420,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":22790601,"Value":671939066,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":172,"Value":3295,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6656514,"Value":152273215,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":53,"Value":854,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":53,"Executor Deserialize CPU Time":6656514,"Executor Run Time":172,"Executor CPU Time":22790601,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":163,"Index":29,"Attempt":0,"Launch Time":1678162995394,"Executor ID":"5","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995660,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"96","Value":"1162","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"6291456","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"71","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1104,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7077888,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1054,"Name":"internal.metrics.jvmGCTime","Update":26,"Value":104,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4360,"Value":103780,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":60981377,"Value":732920443,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":232,"Value":3527,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6500114,"Value":158773329,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":20,"Value":874,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":20,"Executor Deserialize CPU Time":6500114,"Executor Run Time":232,"Executor CPU Time":60981377,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":26,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":161,"Index":16,"Attempt":0,"Launch Time":1678162995393,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995697,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"38","Value":"1200","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"6553600","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"71","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1150,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7372800,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":108097,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":28289519,"Value":761209962,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":135,"Value":3662,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":10016289,"Value":168789618,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":117,"Value":991,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":117,"Executor Deserialize CPU Time":10016289,"Executor Run Time":135,"Executor CPU Time":28289519,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":169,"Index":24,"Attempt":0,"Launch Time":1678162995395,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995704,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"39","Value":"1239","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"6815744","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"1","Value":"72","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1196,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7667712,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":112414,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":20108618,"Value":781318580,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":134,"Value":3796,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":5907430,"Value":174697048,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":117,"Value":1108,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":117,"Executor Deserialize CPU Time":5907430,"Executor Run Time":134,"Executor CPU Time":20108618,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":145,"Index":0,"Attempt":0,"Launch Time":1678162995390,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995705,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"37","Value":"1276","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"7077888","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"72","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1242,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":7962624,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":116731,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":24900507,"Value":806219087,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":136,"Value":3932,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6176949,"Value":180873997,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":115,"Value":1223,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":115,"Executor Deserialize CPU Time":6176949,"Executor Run Time":136,"Executor CPU Time":24900507,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":153,"Index":8,"Attempt":0,"Launch Time":1678162995392,"Executor ID":"1","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995706,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"37","Value":"1313","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"7340032","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"72","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1288,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8257536,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":121048,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":57267920,"Value":863487007,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":134,"Value":4066,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":6218641,"Value":187092638,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":117,"Value":1340,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":117,"Executor Deserialize CPU Time":6218641,"Executor Run Time":134,"Executor CPU Time":57267920,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskStart","Stage ID":18,"Stage Attempt ID":0,"Task Info":{"Task ID":170,"Index":32,"Attempt":0,"Launch Time":1678162995709,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":154,"Index":20,"Attempt":0,"Launch Time":1678162995392,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995711,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"35","Value":"1348","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"7602176","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"72","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1334,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8552448,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1054,"Name":"internal.metrics.jvmGCTime","Update":182,"Value":286,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4360,"Value":125408,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":37701729,"Value":901188736,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":288,"Value":4354,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":3750740,"Value":190843378,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":17,"Value":1357,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":17,"Executor Deserialize CPU Time":3750740,"Executor Run Time":288,"Executor CPU Time":37701729,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":182,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":138,"Index":4,"Attempt":0,"Launch Time":1678162995389,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995711,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"36","Value":"1384","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"7864320","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"1","Value":"73","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1380,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":8847360,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1054,"Name":"internal.metrics.jvmGCTime","Update":182,"Value":468,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4360,"Value":129768,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":24454791,"Value":925643527,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":288,"Value":4642,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":7684368,"Value":198527746,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":18,"Value":1375,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":18,"Executor Deserialize CPU Time":7684368,"Executor Run Time":288,"Executor CPU Time":24454791,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":182,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":162,"Index":28,"Attempt":0,"Launch Time":1678162995393,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995711,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"36","Value":"1420","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"8126464","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"1","Value":"74","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1426,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9142272,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1054,"Name":"internal.metrics.jvmGCTime","Update":182,"Value":650,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4360,"Value":134128,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":21696634,"Value":947340161,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":289,"Value":4931,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":3468248,"Value":201995994,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":14,"Value":1389,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":14,"Executor Deserialize CPU Time":3468248,"Executor Run Time":289,"Executor CPU Time":21696634,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":182,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":146,"Index":12,"Attempt":0,"Launch Time":1678162995390,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995716,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"40","Value":"1460","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"8388608","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"0","Value":"74","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1472,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9437184,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1054,"Name":"internal.metrics.jvmGCTime","Update":182,"Value":832,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4360,"Value":138488,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":55265151,"Value":1002605312,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":294,"Value":5225,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":4201860,"Value":206197854,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":16,"Value":1405,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":16,"Executor Deserialize CPU Time":4201860,"Executor Run Time":294,"Executor CPU Time":55265151,"Peak Execution Memory":294912,"Result Size":4360,"JVM GC Time":182,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":18,"Stage Attempt ID":0,"Task Type":"ShuffleMapTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":170,"Index":32,"Attempt":0,"Launch Time":1678162995709,"Executor ID":"7","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162995770,"Failed":false,"Killed":false,"Accumulables":[{"ID":1043,"Name":"duration","Update":"42","Value":"1502","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1045,"Name":"peak memory","Update":"262144","Value":"8650752","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1047,"Name":"time in aggregation build","Update":"38","Value":"112","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1070,"Name":"internal.metrics.input.bytesRead","Update":46,"Value":1518,"Internal":true,"Count Failed Values":true},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Update":294912,"Value":9732096,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Update":4317,"Value":142805,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Update":9339226,"Value":1011944538,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Update":48,"Value":5273,"Internal":true,"Count Failed Values":true},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Update":3117626,"Value":209315480,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Update":3,"Value":1408,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":3,"Executor Deserialize CPU Time":3117626,"Executor Run Time":48,"Executor CPU Time":9339226,"Peak Execution Memory":294912,"Result Size":4317,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":46,"Records Read":0},"Output Metrics":{"Bytes Written":0,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":18,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":63,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"162\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[62],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":62,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"163\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[61],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":58,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"168\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":59,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"168\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[58],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":60,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"167\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[59],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":61,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"167\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[60],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162995377,"Completion Time":1678162995771,"Accumulables":[{"ID":1070,"Name":"internal.metrics.input.bytesRead","Value":1518,"Internal":true,"Count Failed Values":true},{"ID":1052,"Name":"internal.metrics.executorCpuTime","Value":1011944538,"Internal":true,"Count Failed Values":true},{"ID":1043,"Name":"duration","Value":"1502","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1058,"Name":"internal.metrics.peakExecutionMemory","Value":9732096,"Internal":true,"Count Failed Values":true},{"ID":1049,"Name":"internal.metrics.executorDeserializeTime","Value":1408,"Internal":true,"Count Failed Values":true},{"ID":1057,"Name":"internal.metrics.diskBytesSpilled","Value":0,"Internal":true,"Count Failed Values":true},{"ID":1051,"Name":"internal.metrics.executorRunTime","Value":5273,"Internal":true,"Count Failed Values":true},{"ID":1054,"Name":"internal.metrics.jvmGCTime","Value":832,"Internal":true,"Count Failed Values":true},{"ID":1045,"Name":"peak memory","Value":"8650752","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1056,"Name":"internal.metrics.memoryBytesSpilled","Value":0,"Internal":true,"Count Failed Values":true},{"ID":1047,"Name":"time in aggregation build","Value":"112","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1050,"Name":"internal.metrics.executorDeserializeCpuTime","Value":209315480,"Internal":true,"Count Failed Values":true},{"ID":1053,"Name":"internal.metrics.resultSize","Value":142805,"Internal":true,"Count Failed Values":true}]}} -{"Event":"SparkListenerJobEnd","Job ID":10,"Completion Time":1678162995774,"Job Result":{"Result":"JobSucceeded"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate","executionId":8,"physicalPlanDescription":"== Parsed Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]\n+- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Analyzed Logical Plan ==\n\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]\n+- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 9 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, Charset#311, Language#279, ... 8 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26 AS Charset#311, Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27 AS Language#279, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13 AS Site#247, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 7 more fields]\n +- Repartition 50000, false\n +- Filter (content_languages#27 = eng)\n +- Relation[url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] parquet\n\n== Optimized Logical Plan ==\nInsertIntoHadoopFsRelationCommand s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB, false, CSV, Map(header -> true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]\n+- Aggregate [Site#247, Date#344], [Site#247, Date#344, count(1) AS Total_Pages#449L, sum(cast(warc_record_length#31 as bigint)) AS Total_Record_Length#451L, avg(cast(warc_record_length#31 as bigint)) AS Avg_Record_Length#453]\n +- Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#3357 AS Date#344]\n +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#3357]\n +- Project [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]\n+- AdaptiveSparkPlan isFinalPlan=true\n +- *(2) HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint)), avg(cast(warc_record_length#31 as bigint))], output=[Site#247, Date#344, Total_Pages#449L, Total_Record_Length#451L, Avg_Record_Length#453])\n +- CustomShuffleReader coalesced\n +- ShuffleQueryStage 0\n +- Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#619]\n +- *(1) HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint)), partial_avg(cast(warc_record_length#31 as bigint))], output=[Site#247, Date#344, count#2733L, sum#2734L, sum#3510, count#3511L])\n +- *(1) Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#3357 AS Date#344]\n +- BatchEvalPython [(fetch_time#20)], [pythonUDF0#3357]\n +- InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]\n +- InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)\n +- Coalesce 50000\n +- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct true, path -> s3://job-bank/etl-jobs/output-data/commoncrawl/etl_job_outputs/site_day_summary_1GB), Overwrite, [Site, Date, Total_Pages, Total_Record_Length, Avg_Record_Length]","children":[{"nodeName":"AdaptiveSparkPlan","simpleString":"AdaptiveSparkPlan isFinalPlan=true","children":[{"nodeName":"WholeStageCodegen (2)","simpleString":"WholeStageCodegen (2)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[count(1), sum(cast(warc_record_length#31 as bigint)), avg(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"CustomShuffleReader","simpleString":"CustomShuffleReader coalesced","children":[{"nodeName":"ShuffleQueryStage","simpleString":"ShuffleQueryStage 0","children":[{"nodeName":"Exchange","simpleString":"Exchange hashpartitioning(Site#247, Date#344, 1000), true, [id=#619]","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"HashAggregate","simpleString":"HashAggregate(keys=[Site#247, Date#344], functions=[partial_count(1), partial_sum(cast(warc_record_length#31 as bigint)), partial_avg(cast(warc_record_length#31 as bigint))])","children":[{"nodeName":"Project","simpleString":"Project [url_host_registered_domain#13 AS Site#247, warc_record_length#31, pythonUDF0#3357 AS Date#344]","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"BatchEvalPython","simpleString":"BatchEvalPython [(fetch_time#20)], [pythonUDF0#3357]","children":[{"nodeName":"InMemoryTableScan","simpleString":"InMemoryTableScan [url_host_registered_domain#13, fetch_time#20, warc_record_length#31]","children":[{"nodeName":"InMemoryRelation","simpleString":"InMemoryRelation [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields], StorageLevel(disk, memory, 1 replicas)","children":[{"nodeName":"Coalesce","simpleString":"Coalesce 50000","children":[{"nodeName":"WholeStageCodegen (1)","simpleString":"WholeStageCodegen (1)","children":[{"nodeName":"Project","simpleString":"Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]","children":[{"nodeName":"Filter","simpleString":"Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))","children":[{"nodeName":"ColumnarToRow","simpleString":"ColumnarToRow","children":[{"nodeName":"InputAdapter","simpleString":"InputAdapter","children":[{"nodeName":"Scan parquet ","simpleString":"FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,fetch_status#21,fetch_redirect#22,content_digest#23,content_mime_type#24,content_mime_detected#25,content_charset#26,content_languages#27,... 6 more fields] Batched: true, DataFilters: [isnotnull(content_languages#27), (content_languages#27 = eng)], Format: Parquet, Location: InMemoryFileIndex[s3://job-bank-datalake/crawl-data/index-data/1GB], PartitionFilters: [], PushedFilters: [IsNotNull(content_languages), EqualTo(content_languages,eng)], ReadSchema: struct","Format":"Parquet","Batched":"true","PartitionFilters":"[]","PushedFilters":"[IsNotNull(content_languages), EqualTo(content_languages,eng)]","DataFilters":"[isnotnull(content_languages#27), (content_languages#27 = eng)]"},"metrics":[{"name":"number of files read","accumulatorId":115,"metricType":"sum"},{"name":"scan time","accumulatorId":119,"metricType":"timing"},{"name":"dynamic partition pruning time","accumulatorId":118,"metricType":"timing"},{"name":"metadata time","accumulatorId":116,"metricType":"timing"},{"name":"size of files read","accumulatorId":117,"metricType":"size"},{"name":"number of output rows","accumulatorId":114,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":112,"metricType":"sum"},{"name":"number of input batches","accumulatorId":113,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":111,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":110,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{"cacheBuilderId":"0"},"metrics":[{"name":"number of computed rows","accumulatorId":120,"metricType":"sum"},{"name":"numComputedPartitions","accumulatorId":121,"metricType":"sum"}]}],"metadata":{},"metrics":[{"name":"number of output rows","accumulatorId":1026,"metricType":"sum"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1046,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1047,"metricType":"timing"},{"name":"peak memory","accumulatorId":1045,"metricType":"size"},{"name":"number of output rows","accumulatorId":1044,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":1048,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":1043,"metricType":"timing"}]}],"metadata":{},"metrics":[{"name":"shuffle records written","accumulatorId":1036,"metricType":"sum"},{"name":"shuffle write time","accumulatorId":1037,"metricType":"nsTiming"},{"name":"records read","accumulatorId":1034,"metricType":"sum"},{"name":"local bytes read","accumulatorId":1032,"metricType":"size"},{"name":"fetch wait time","accumulatorId":1033,"metricType":"timing"},{"name":"remote bytes read","accumulatorId":1030,"metricType":"size"},{"name":"local blocks read","accumulatorId":1029,"metricType":"sum"},{"name":"remote blocks read","accumulatorId":1028,"metricType":"sum"},{"name":"data size","accumulatorId":1027,"metricType":"size"},{"name":"remote bytes read to disk","accumulatorId":1031,"metricType":"size"},{"name":"shuffle bytes written","accumulatorId":1035,"metricType":"size"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"spill size","accumulatorId":1088,"metricType":"size"},{"name":"time in aggregation build","accumulatorId":1089,"metricType":"timing"},{"name":"peak memory","accumulatorId":1087,"metricType":"size"},{"name":"number of output rows","accumulatorId":1086,"metricType":"sum"},{"name":"avg hash probe bucket list iters","accumulatorId":1090,"metricType":"average"}]}],"metadata":{},"metrics":[{"name":"duration","accumulatorId":1085,"metricType":"timing"}]}],"metadata":{},"metrics":[]}],"metadata":{},"metrics":[{"name":"number of written files","accumulatorId":1012,"metricType":"sum"},{"name":"written output","accumulatorId":1013,"metricType":"size"},{"name":"number of output rows","accumulatorId":1014,"metricType":"sum"},{"name":"number of dynamic part","accumulatorId":1015,"metricType":"sum"}]}} -{"Event":"SparkListenerJobStart","Job ID":11,"Submission Time":1678162995858,"Stage Infos":[{"Stage ID":19,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":33,"RDD Info":[{"RDD ID":63,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"162\",\"name\":\"ShuffleQueryStage\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[62],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":6,"Name":"Coalesce 50000\n+- *(1) Project [url_surtkey#4, url#5, url_host_name#6, url_host_tld#7, url_host_2nd_last_part#8, url_host_3rd_last_part#9, url_host_4th_last_part#10, url_host_5th_last_part#11, url_host_registry_suffix#12, url_host_registered_domain#13, url_host_private_suffix#14, url_host_private_domain#15, url_protocol#16, url_port#17, url_path#18, url_query#19, fetch_time#20, fetch_status#21, fetch_redirect#22, content_digest#23, content_mime_type#24, content_mime_detected#25, content_charset#26, content_languages#27, ... 6 more fields]\n +- *(1) Filter (isnotnull(content_languages#27) AND (content_languages#27 = eng))\n +- *(1) ColumnarToRow\n +- FileScan parquet [url_surtkey#4,url#5,url_host_name#6,url_host_tld#7,url_host_2nd_last_part#8,url_host_3rd_last_part#9,url_host_4th_last_part#10,url_host_5th_last_part#11,url_host_registry_suffix#12,url_host_registered_domain#13,url_host_private_suffix#14,url_host_private_domain#15,url_protocol#16,url_port#17,url_path#18,url_query#19,fetch_time#20,f...","Scope":"{\"id\":\"13\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[5],"Storage Level":{"Use Disk":true,"Use Memory":true,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":4,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"15\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[3],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":2,"Name":"FileScanRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":62,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"163\",\"name\":\"WholeStageCodegen (1)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[61],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":58,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"168\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[6],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":5,"Name":"CoalescedRDD","Scope":"{\"id\":\"14\",\"name\":\"Coalesce\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[4],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":3,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"20\",\"name\":\"Scan parquet \"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[2],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":59,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"168\",\"name\":\"InMemoryTableScan\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[58],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":60,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"167\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[59],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":61,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"167\",\"name\":\"BatchEvalPython\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[60],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":33,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]},{"Stage ID":20,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":65,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"170\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[64],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":64,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"173\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[63],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[19],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Accumulables":[]}],"Stage IDs":[19,20],"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"160\",\"name\":\"Execute InsertIntoHadoopFsRelationCommand\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"8","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerStageSubmitted","Stage Info":{"Stage ID":20,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":65,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"170\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[64],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":64,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"173\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[63],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[19],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162995861,"Accumulables":[]},"Properties":{"spark.sql.warehouse.dir":"hdfs:///user/spark/warehouse","spark.sql.parquet.fs.optimized.committer.optimization-enabled":"true","spark.driver.host":"ip-172-31-102-115.ec2.internal","spark.serializer.objectStreamReset":"100","spark.history.fs.logDirectory":"hdfs:///var/log/spark/apps","spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem":"2","spark.eventLog.enabled":"true","spark.driver.port":"35829","spark.shuffle.service.enabled":"true","__fetch_continuous_blocks_in_batch_enabled":"true","spark.rdd.compress":"True","spark.driver.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.executorEnv.PYTHONPATH":"{{PWD}}/pyspark.zip{{PWD}}/py4j-0.10.9-src.zip","spark.yarn.historyServer.address":"ip-172-31-102-115.ec2.internal:18080","spark.stage.attempt.ignoreOnDecommissionFetchFailure":"true","spark.app.name":"index_data_etl_1GB","spark.rdd.scope":"{\"id\":\"160\",\"name\":\"Execute InsertIntoHadoopFsRelationCommand\"}","spark.driver.memory":"2048M","spark.executor.instances":"8","spark.rdd.scope.noOverride":"true","spark.files.fetchFailure.unRegisterOutputOnHost":"true","spark.submit.pyFiles":"","spark.jars.packages":"org.apache.hadoop:hadoop-aws:3.2.1","spark.executor.defaultJavaOptions":"-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70","spark.resourceManager.cleanupExpiredHost":"true","spark.executor.id":"driver","spark.yarn.appMasterEnv.SPARK_PUBLIC_DNS":"$(hostname -f)","spark.sql.emr.internal.extensions":"com.amazonaws.emr.spark.EmrSparkSessionExtensions","spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds":"2000","spark.submit.deployMode":"client","spark.master":"yarn","spark.sql.parquet.output.committer.class":"com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter","spark.driver.defaultJavaOptions":"-XX:OnOutOfMemoryError='kill -9 %p' -XX:+UseConcMarkSweepGC -XX:CMSInitiatingOccupancyFraction=70 -XX:MaxHeapFreeRatio=70 -XX:+CMSClassUnloadingEnabled","spark.ui.filters":"org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter","spark.blacklist.decommissioning.timeout":"1h","spark.executor.extraLibraryPath":"/usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native","spark.sql.hive.metastore.sharedPrefixes":"com.amazonaws.services.dynamodbv2","spark.executor.memory":"4656M","spark.driver.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.eventLog.dir":"s3a://sync-us-east-1-temp-90-days/scott/projects/emr/131ef352-c955-45ba-8614-e4e0690e3525/2023-03-07T04:14:28Z/1f4ada39-b2cb-4e15-871e-bf368e9b548c/eventlog/","spark.dynamicAllocation.enabled":"false","spark.executor.extraClassPath":"/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar","spark.sql.execution.id":"8","spark.executor.cores":"4","spark.history.ui.port":"18080","spark.driver.appUIAddress":"http://ip-172-31-102-115.ec2.internal:4040","spark.yarn.isPython":"true","spark.executor.processTreeMetrics.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_HOSTS":"ip-172-31-102-115.ec2.internal","spark.blacklist.decommissioning.enabled":"true","spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES":"http://ip-172-31-102-115.ec2.internal:20888/proxy/application_1678162862227_0001","spark.decommissioning.timeout.threshold":"20","spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem":"true","spark.app.id":"application_1678162862227_0001","spark.hadoop.yarn.timeline-service.enabled":"false","spark.yarn.executor.memoryOverheadFactor":"0.1875"}} -{"Event":"SparkListenerTaskStart","Stage ID":20,"Stage Attempt ID":0,"Task Info":{"Task ID":171,"Index":0,"Attempt":0,"Launch Time":1678162995896,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":0,"Failed":false,"Killed":false,"Accumulables":[]}} -{"Event":"SparkListenerTaskEnd","Stage ID":20,"Stage Attempt ID":0,"Task Type":"ResultTask","Task End Reason":{"Reason":"Success"},"Task Info":{"Task ID":171,"Index":0,"Attempt":0,"Launch Time":1678162995896,"Executor ID":"4","Host":"ip-172-31-102-249.ec2.internal","Locality":"PROCESS_LOCAL","Speculative":false,"Getting Result Time":0,"Finish Time":1678162996067,"Failed":false,"Killed":false,"Accumulables":[{"ID":1085,"Name":"duration","Update":"67","Value":"67","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1087,"Name":"peak memory","Update":"262144","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1089,"Name":"time in aggregation build","Update":"0","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1114,"Name":"internal.metrics.output.bytesWritten","Update":60,"Value":60,"Internal":true,"Count Failed Values":true},{"ID":1108,"Name":"internal.metrics.shuffle.read.recordsRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1107,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1106,"Name":"internal.metrics.shuffle.read.localBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1105,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1104,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1103,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1102,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Update":0,"Value":0,"Internal":true,"Count Failed Values":true},{"ID":1100,"Name":"internal.metrics.peakExecutionMemory","Update":262144,"Value":262144,"Internal":true,"Count Failed Values":true},{"ID":1095,"Name":"internal.metrics.resultSize","Update":5284,"Value":5284,"Internal":true,"Count Failed Values":true},{"ID":1094,"Name":"internal.metrics.executorCpuTime","Update":34832333,"Value":34832333,"Internal":true,"Count Failed Values":true},{"ID":1093,"Name":"internal.metrics.executorRunTime","Update":145,"Value":145,"Internal":true,"Count Failed Values":true},{"ID":1092,"Name":"internal.metrics.executorDeserializeCpuTime","Update":12183221,"Value":12183221,"Internal":true,"Count Failed Values":true},{"ID":1091,"Name":"internal.metrics.executorDeserializeTime","Update":18,"Value":18,"Internal":true,"Count Failed Values":true}]},"Task Executor Metrics":{"JVMHeapMemory":0,"JVMOffHeapMemory":0,"OnHeapExecutionMemory":0,"OffHeapExecutionMemory":0,"OnHeapStorageMemory":0,"OffHeapStorageMemory":0,"OnHeapUnifiedMemory":0,"OffHeapUnifiedMemory":0,"DirectPoolMemory":0,"MappedPoolMemory":0,"ProcessTreeJVMVMemory":0,"ProcessTreeJVMRSSMemory":0,"ProcessTreePythonVMemory":0,"ProcessTreePythonRSSMemory":0,"ProcessTreeOtherVMemory":0,"ProcessTreeOtherRSSMemory":0,"MinorGCCount":0,"MinorGCTime":0,"MajorGCCount":0,"MajorGCTime":0},"Task Metrics":{"Executor Deserialize Time":18,"Executor Deserialize CPU Time":12183221,"Executor Run Time":145,"Executor CPU Time":34832333,"Peak Execution Memory":262144,"Result Size":5284,"JVM GC Time":0,"Result Serialization Time":0,"Memory Bytes Spilled":0,"Disk Bytes Spilled":0,"Shuffle Read Metrics":{"Remote Blocks Fetched":0,"Local Blocks Fetched":0,"Fetch Wait Time":0,"Remote Bytes Read":0,"Remote Bytes Read To Disk":0,"Local Bytes Read":0,"Total Records Read":0},"Shuffle Write Metrics":{"Shuffle Bytes Written":0,"Shuffle Write Time":0,"Shuffle Records Written":0},"Input Metrics":{"Bytes Read":0,"Records Read":0},"Output Metrics":{"Bytes Written":60,"Records Written":0},"Updated Blocks":[]}} -{"Event":"SparkListenerStageCompleted","Stage Info":{"Stage ID":20,"Stage Attempt ID":0,"Stage Name":"save at NativeMethodAccessorImpl.java:0","Number of Tasks":1,"RDD Info":[{"RDD ID":65,"Name":"MapPartitionsRDD","Scope":"{\"id\":\"170\",\"name\":\"WholeStageCodegen (2)\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[64],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0},{"RDD ID":64,"Name":"ShuffledRowRDD","Scope":"{\"id\":\"173\",\"name\":\"CustomShuffleReader\"}","Callsite":"save at NativeMethodAccessorImpl.java:0","Parent IDs":[63],"Storage Level":{"Use Disk":false,"Use Memory":false,"Deserialized":false,"Replication":1},"Barrier":false,"Number of Partitions":1,"Number of Cached Partitions":0,"Memory Size":0,"Disk Size":0}],"Parent IDs":[19],"Details":"org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:288)\nsun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\nsun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)\nsun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\njava.lang.reflect.Method.invoke(Method.java:498)\npy4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)\npy4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\npy4j.Gateway.invoke(Gateway.java:282)\npy4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\npy4j.commands.CallCommand.execute(CallCommand.java:79)\npy4j.GatewayConnection.run(GatewayConnection.java:238)\njava.lang.Thread.run(Thread.java:750)","Submission Time":1678162995861,"Completion Time":1678162996068,"Accumulables":[{"ID":1106,"Name":"internal.metrics.shuffle.read.localBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":1100,"Name":"internal.metrics.peakExecutionMemory","Value":262144,"Internal":true,"Count Failed Values":true},{"ID":1091,"Name":"internal.metrics.executorDeserializeTime","Value":18,"Internal":true,"Count Failed Values":true},{"ID":1094,"Name":"internal.metrics.executorCpuTime","Value":34832333,"Internal":true,"Count Failed Values":true},{"ID":1085,"Name":"duration","Value":"67","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1102,"Name":"internal.metrics.shuffle.read.remoteBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true},{"ID":1093,"Name":"internal.metrics.executorRunTime","Value":145,"Internal":true,"Count Failed Values":true},{"ID":1105,"Name":"internal.metrics.shuffle.read.remoteBytesReadToDisk","Value":0,"Internal":true,"Count Failed Values":true},{"ID":1087,"Name":"peak memory","Value":"262144","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1114,"Name":"internal.metrics.output.bytesWritten","Value":60,"Internal":true,"Count Failed Values":true},{"ID":1108,"Name":"internal.metrics.shuffle.read.recordsRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":1107,"Name":"internal.metrics.shuffle.read.fetchWaitTime","Value":0,"Internal":true,"Count Failed Values":true},{"ID":1089,"Name":"time in aggregation build","Value":"0","Internal":true,"Count Failed Values":true,"Metadata":"sql"},{"ID":1092,"Name":"internal.metrics.executorDeserializeCpuTime","Value":12183221,"Internal":true,"Count Failed Values":true},{"ID":1095,"Name":"internal.metrics.resultSize","Value":5284,"Internal":true,"Count Failed Values":true},{"ID":1104,"Name":"internal.metrics.shuffle.read.remoteBytesRead","Value":0,"Internal":true,"Count Failed Values":true},{"ID":1103,"Name":"internal.metrics.shuffle.read.localBlocksFetched","Value":0,"Internal":true,"Count Failed Values":true}]}} -{"Event":"SparkListenerJobEnd","Job ID":11,"Completion Time":1678162996068,"Job Result":{"Result":"JobSucceeded"}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerDriverAccumUpdates","executionId":8,"accumUpdates":[[1012,1],[1013,60],[1014,0],[1015,0]]} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerQueryExecutionMetrics","executionId":8,"timePerRule":{"PruneFileSourcePartitions":188612,"ReassignLambdaVariableID":184134,"PushPredicateThroughNonJoin":166386,"Analyzer$HandleNullInputsForUDF":46617,"Analyzer$ResolveSubqueryColumnAliases":11600,"ResolveTimeZone":16037,"Analyzer$ResolveNamespace":12586,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":12616,"RewriteCorrelatedScalarSubquery":430123,"RemoveLiteralFromGroupExpressions":239289,"PushProjectionThroughUnion":437157,"EliminateSubqueryAliases":226003,"ResolveCatalogs":21850,"PushLeftSemiLeftAntiThroughJoin":421808,"FlattenScalarSubqueriesWithAggregates":149181,"LikeSimplification":529164,"CollapseRepartition":440098,"ResolveHints$ResolveCoalesceHints":12966,"Analyzer$ExtractGenerator":40486,"RewriteIntersectAll":220647,"ResolveHints$ResolveJoinStrategyHints":14925,"TypeCoercion$MapZipWithCoercion":17699,"NullPropagation":589498,"PullupCorrelatedPredicates":429342,"UpdateOuterReferences":20881,"ExtractPythonUDFs":1254885,"Analyzer$WindowsSubstitution":18722,"CombineUnions":524076,"ExtractGroupingPythonUDFFromAggregate":109718,"ReorderAssociativeOperator":546434,"CleanupDynamicPruningFilters":256584,"ResolveHints$RemoveAllHints":15542,"SimplifyBinaryComparison":561075,"ResolveTableValuedFunctions":16220,"EliminateSerialization":302829,"TypeCoercion$BooleanEquality":16031,"ReplaceIntersectWithSemiJoin":222723,"ConstantPropagation":312997,"CostBasedJoinReorder":13531,"Analyzer$ResolveReferences":63843,"CTESubstitution":402605,"RemoveRedundantAliases":574766,"TypeCoercion$ImplicitTypeCasts":16195,"RewriteExceptAll":230192,"UpdateAttributeNullability":110136,"PropagateEmptyRelation":306387,"SimplifyCasts":532142,"EliminateMapObjects":174567,"CombineLimits":300207,"DetectAmbiguousSelfJoin":23038,"ReplaceExpressions":427551,"ResolveInlineTables":40151,"OptimizeIn":533844,"CollapseWindow":344669,"TypeCoercion$IfCoercion":15649,"ResolveSessionCatalog":21789,"PartitionPruning":119818,"BooleanSimplification":793660,"TypeCoercion$PromoteStrings":18037,"Analyzer$ResolveAliases":12615,"DecimalAggregates":252082,"PruneFilters":418330,"Analyzer$ResolveMissingReferences":11100,"TransposeWindow":307818,"Analyzer$ResolveRelations":24967,"EliminateUnions":21080,"RewritePredicateSubquery":98644,"ObjectSerializerPruning":96806,"LimitPushDown":423460,"SimplifyCaseConversionExpressions":532735,"Analyzer$ResolveNaturalAndUsingJoin":12389,"EliminateView":236933,"CombineTypedFilters":93276,"OptimizeLimitZero":234686,"CheckCartesianProducts":23093,"ExtractPythonUDFFromAggregate":135765,"Analyzer$ExtractWindowExpressions":15087,"ReplaceExceptWithAntiJoin":248786,"ResolveLambdaVariables":21347,"FallBackFileSourceV2":11627,"Analyzer$ResolveTables":20462,"SubstituteUnresolvedOrdinals":16953,"TypeCoercion$CaseWhenCoercion":15661,"DecimalPrecision":27547,"EliminateSorts":209553,"PushDownLeftSemiAntiJoin":435232,"ExtractPythonUDFFromJoinCondition":105169,"TypeCoercion$StackCoercion":15903,"Analyzer$ResolveAggAliasInGroupBy":12255,"TypeCoercion$StringLiteralCoercion":15509,"FoldablePropagation":137222,"V2ScanRelationPushDown":192701,"EliminateDistinct":15268,"InferFiltersFromConstraints":116628,"Analyzer$PullOutNondeterministic":19588,"Analyzer$ResolveFunctions":18733,"ReplaceNullWithFalseInPredicate":467134,"ResolveHigherOrderFunctions":17679,"Analyzer$ResolvePivot":13829,"CollapseProject":1095786,"Analyzer$ResolveNewInstance":13128,"ColumnPruning":3262916,"Analyzer$ResolveWindowOrder":17028,"TypeCoercion$ConcatCoercion":19713,"PushDownPredicates":733750,"TimeWindowing":43519,"Optimizer$OptimizeSubqueries":903235,"RewriteNonCorrelatedExists":378903,"DemoteBroadcastHashJoin":34426,"TypeCoercion$Division":15097,"ComputeCurrentTime":394549,"ResolveCreateNamedStruct":18953,"TypeCoercion$EltCoercion":19534,"ConvertToLocalRelation":331210,"RemoveRepetitionFromGroupExpressions":284392,"ReplaceDistinctWithAggregate":226364,"PreprocessTableCreation":17613,"ResolveSQLOnFile":12442,"Analyzer$ResolveSubquery":12824,"CombineConcats":28537,"Analyzer$ResolveGroupingAnalytics":17163,"Analyzer$ResolveBinaryArithmetic":17555,"RemoveDispensableExpressions":538044,"Analyzer$ResolveAlterTableChanges":17469,"ResolveEncodersInScalaAgg":19456,"TypeCoercion$IntegralDivision":16314,"Analyzer$ResolveWindowFrame":19166,"Analyzer$ResolveDeserializer":13554,"RewriteDistinctAggregates":267193,"RemoveNoopOperators":628431,"Analyzer$ResolveAggregateFunctions":12185,"NormalizeFloatingNumbers":99694,"ReorderJoin":439224,"Analyzer$ResolveUpCast":13194,"Analyzer$ResolveGenerate":14650,"TypeCoercion$WidenSetOperationTypes":13236,"EliminateOuterJoin":427054,"SimplifyExtractValueOps":435938,"OptimizeMetadataOnlyQuery":11358,"EliminateResolvedHint":481062,"Analyzer$ResolveInsertInto":11335,"ReplaceExceptWithFilter":243004,"CleanupAliases":27501,"GetCurrentDatabase":488690,"SchemaPruning":218230,"Analyzer$ResolveOutputRelation":12851,"BloomFilterJoinRule":109343,"Analyzer$ResolveRandomSeed":12917,"TypeCoercion$WindowFrameCoercion":15565,"ConstantFolding":494746,"TypeCoercion$DateTimeOperations":14989,"TypeCoercion$InConversion":17906,"FindDataSourceTable":14050,"SimplifyConditionals":529169,"DataSourceAnalysis":12374,"TypeCoercion$FunctionArgumentConversion":18393,"Analyzer$GlobalAggregates":11045,"Analyzer$LookupFunctions":23366,"CombineFilters":394027,"ReplaceDeduplicateWithAggregate":240841,"PreprocessTableInsertion":12122},"numRunsPerRule":{"PruneFileSourcePartitions":1,"ReassignLambdaVariableID":1,"PushPredicateThroughNonJoin":1,"Analyzer$HandleNullInputsForUDF":1,"Analyzer$ResolveSubqueryColumnAliases":1,"ResolveTimeZone":1,"Analyzer$ResolveNamespace":1,"Analyzer$ResolveOrdinalInOrderByAndGroupBy":1,"RewriteCorrelatedScalarSubquery":3,"RemoveLiteralFromGroupExpressions":1,"PushProjectionThroughUnion":3,"EliminateSubqueryAliases":1,"ResolveCatalogs":1,"PushLeftSemiLeftAntiThroughJoin":3,"FlattenScalarSubqueriesWithAggregates":1,"LikeSimplification":3,"CollapseRepartition":3,"ResolveHints$ResolveCoalesceHints":1,"Analyzer$ExtractGenerator":1,"RewriteIntersectAll":1,"ResolveHints$ResolveJoinStrategyHints":1,"TypeCoercion$MapZipWithCoercion":1,"NullPropagation":3,"PullupCorrelatedPredicates":1,"UpdateOuterReferences":1,"ExtractPythonUDFs":1,"Analyzer$WindowsSubstitution":1,"CombineUnions":4,"ExtractGroupingPythonUDFFromAggregate":1,"ReorderAssociativeOperator":3,"CleanupDynamicPruningFilters":1,"ResolveHints$RemoveAllHints":1,"SimplifyBinaryComparison":3,"ResolveTableValuedFunctions":1,"EliminateSerialization":3,"TypeCoercion$BooleanEquality":1,"ReplaceIntersectWithSemiJoin":1,"ConstantPropagation":3,"CostBasedJoinReorder":1,"Analyzer$ResolveReferences":1,"CTESubstitution":1,"RemoveRedundantAliases":3,"TypeCoercion$ImplicitTypeCasts":1,"RewriteExceptAll":1,"UpdateAttributeNullability":1,"PropagateEmptyRelation":2,"SimplifyCasts":3,"EliminateMapObjects":1,"CombineLimits":3,"DetectAmbiguousSelfJoin":1,"ReplaceExpressions":1,"ResolveInlineTables":1,"OptimizeIn":3,"CollapseWindow":3,"TypeCoercion$IfCoercion":1,"ResolveSessionCatalog":1,"PartitionPruning":1,"BooleanSimplification":3,"TypeCoercion$PromoteStrings":1,"Analyzer$ResolveAliases":1,"DecimalAggregates":1,"PruneFilters":4,"Analyzer$ResolveMissingReferences":1,"TransposeWindow":3,"Analyzer$ResolveRelations":1,"EliminateUnions":1,"RewritePredicateSubquery":1,"ObjectSerializerPruning":1,"LimitPushDown":3,"SimplifyCaseConversionExpressions":3,"Analyzer$ResolveNaturalAndUsingJoin":1,"EliminateView":1,"CombineTypedFilters":1,"OptimizeLimitZero":1,"CheckCartesianProducts":2,"ExtractPythonUDFFromAggregate":1,"Analyzer$ExtractWindowExpressions":1,"ReplaceExceptWithAntiJoin":1,"ResolveLambdaVariables":1,"FallBackFileSourceV2":1,"Analyzer$ResolveTables":1,"SubstituteUnresolvedOrdinals":1,"TypeCoercion$CaseWhenCoercion":1,"DecimalPrecision":1,"EliminateSorts":1,"PushDownLeftSemiAntiJoin":3,"ExtractPythonUDFFromJoinCondition":1,"TypeCoercion$StackCoercion":1,"Analyzer$ResolveAggAliasInGroupBy":1,"TypeCoercion$StringLiteralCoercion":1,"FoldablePropagation":3,"V2ScanRelationPushDown":1,"EliminateDistinct":1,"InferFiltersFromConstraints":1,"Analyzer$PullOutNondeterministic":1,"Analyzer$ResolveFunctions":1,"ReplaceNullWithFalseInPredicate":3,"ResolveHigherOrderFunctions":1,"Analyzer$ResolvePivot":1,"CollapseProject":4,"Analyzer$ResolveNewInstance":1,"ColumnPruning":5,"Analyzer$ResolveWindowOrder":1,"TypeCoercion$ConcatCoercion":1,"PushDownPredicates":5,"TimeWindowing":1,"Optimizer$OptimizeSubqueries":3,"RewriteNonCorrelatedExists":1,"DemoteBroadcastHashJoin":1,"TypeCoercion$Division":1,"ComputeCurrentTime":1,"ResolveCreateNamedStruct":1,"TypeCoercion$EltCoercion":1,"ConvertToLocalRelation":2,"RemoveRepetitionFromGroupExpressions":1,"ReplaceDistinctWithAggregate":1,"PreprocessTableCreation":1,"ResolveSQLOnFile":1,"Analyzer$ResolveSubquery":1,"CombineConcats":3,"Analyzer$ResolveGroupingAnalytics":1,"Analyzer$ResolveBinaryArithmetic":1,"RemoveDispensableExpressions":3,"Analyzer$ResolveAlterTableChanges":1,"ResolveEncodersInScalaAgg":1,"TypeCoercion$IntegralDivision":1,"Analyzer$ResolveWindowFrame":1,"Analyzer$ResolveDeserializer":1,"RewriteDistinctAggregates":1,"RemoveNoopOperators":5,"Analyzer$ResolveAggregateFunctions":1,"NormalizeFloatingNumbers":1,"ReorderJoin":3,"Analyzer$ResolveUpCast":1,"Analyzer$ResolveGenerate":1,"TypeCoercion$WidenSetOperationTypes":1,"EliminateOuterJoin":3,"SimplifyExtractValueOps":3,"OptimizeMetadataOnlyQuery":1,"EliminateResolvedHint":1,"Analyzer$ResolveInsertInto":1,"ReplaceExceptWithFilter":1,"CleanupAliases":1,"GetCurrentDatabase":1,"SchemaPruning":1,"Analyzer$ResolveOutputRelation":1,"BloomFilterJoinRule":1,"Analyzer$ResolveRandomSeed":1,"TypeCoercion$WindowFrameCoercion":1,"ConstantFolding":3,"TypeCoercion$DateTimeOperations":1,"TypeCoercion$InConversion":1,"FindDataSourceTable":1,"SimplifyConditionals":3,"DataSourceAnalysis":1,"TypeCoercion$FunctionArgumentConversion":1,"Analyzer$GlobalAggregates":1,"Analyzer$LookupFunctions":1,"CombineFilters":4,"ReplaceDeduplicateWithAggregate":1,"PreprocessTableInsertion":1},"numEffectiveRunsPerRule":{"ColumnPruning":2,"CollapseProject":1,"ExtractPythonUDFs":1},"timeEffectiveRunsPerRule":{"ColumnPruning":2542317,"CollapseProject":751354,"ExtractPythonUDFs":1254885},"counters":{},"timers":{}} -{"Event":"org.apache.spark.sql.execution.ui.SparkListenerSQLExecutionEnd","executionId":8,"time":1678162996163} -{"Event":"SparkListenerApplicationEnd","Timestamp":1678162996233} diff --git a/demo/emr/emr-cluster-report.json b/demo/emr/emr-cluster-report.json deleted file mode 100644 index a9f5072..0000000 --- a/demo/emr/emr-cluster-report.json +++ /dev/null @@ -1,354 +0,0 @@ -{ - "Cluster": { - "Id": "j-14QV64S2PV1Y2", - "Name": "indexdataetl1gb", - "Status": { - "State": "TERMINATED", - "StateChangeReason": { - "Code": "ALL_STEPS_COMPLETED", - "Message": "Steps completed" - }, - "Timeline": { - "CreationDateTime": "2023-03-06T20:14:30.160000-08:00", - "ReadyDateTime": "2023-03-06T20:22:10.554000-08:00", - "EndDateTime": "2023-03-06T20:27:03.197000-08:00" - } - }, - "Ec2InstanceAttributes": { - "Ec2KeyName": "global-key", - "Ec2SubnetId": "subnet-08xlshei29a9202dc", - "RequestedEc2SubnetIds": [ - "subnet-0blasve89vw96b8c5", - "subnet-97svllkja9346a8c4", - "subnet-08dd4lkjser09872f", - "subnet-slv80valk3avnj797", - "subnet-098xxlkqhklwf3lkj", - "subnet-lakjf989h39kajdg7" - ], - "Ec2AvailabilityZone": "us-east-1c", - "RequestedEc2AvailabilityZones": [], - "IamInstanceProfile": "EMR_EC2_DefaultRole", - "EmrManagedMasterSecurityGroup": "sg-alsvewf29837437e5", - "EmrManagedSlaveSecurityGroup": "sg-alsdfj93870342bdf" - }, - "InstanceCollectionType": "INSTANCE_FLEET", - "LogUri": "s3n://my-emr-job-logs/indexdataetl1gb/", - "ReleaseLabel": "emr-6.2.0", - "AutoTerminate": true, - "TerminationProtected": false, - "VisibleToAllUsers": true, - "Applications": [ - { - "Name": "Spark", - "Version": "3.0.1" - } - ], - "Tags": [ - { - "Key": "sync:run-id", - "Value": "f84639ed-7a6a-4496-81e1-b5ba8fa8b6ce" - }, - { - "Key": "Owner", - "Value": "Scott" - }, - { - "Key": "sync:project-id", - "Value": "29f4dded-70be-4344-b9b5-396c8c0481cf" - } - ], - "ServiceRole": "EMR_DefaultRole", - "NormalizedInstanceHours": 76, - "MasterPublicDnsName": "ec2-52-21-192-83.compute-1.amazonaws.com", - "Configurations": [ - { - "Classification": "spark-defaults", - "Properties": { - "spark.dynamicAllocation.enabled": "false", - "spark.eventLog.dir": "s3a://my-emr-projects/29f4dded-70be-4344-b9b5-396c8c0481cf/2023-03-07T04:14:28Z/f84639ed-7a6a-4496-81e1-b5ba8fa8b6ce/eventlog/", - "spark.eventLog.enabled": "true", - "spark.executor.cores": "4", - "spark.executor.instances": "8", - "spark.executor.memory": "4656M", - "spark.executor.processTreeMetrics.enabled": "true" - } - } - ], - "ScaleDownBehavior": "TERMINATE_AT_TASK_COMPLETION", - "ClusterArn": "arn:aws:elasticmapreduce:us-east-1:111122223333:cluster/j-14QB7SA9801Y2", - "StepConcurrencyLevel": 1, - "PlacementGroups": [], - "BootstrapActions": [ - { - "Name": "Packages setup", - "ScriptPath": "s3://my-emr-job-scripts/dummy.sh", - "Args": [] - } - ], - "InstanceFleets": [ - { - "Id": "if-DR8F73EAI88V", - "Name": "Core - 2", - "Status": { - "State": "TERMINATED", - "StateChangeReason": { - "Code": "CLUSTER_TERMINATED", - "Message": "Job flow terminated" - }, - "Timeline": { - "CreationDateTime": "2023-03-06T20:14:30.234000-08:00", - "ReadyDateTime": "2023-03-06T20:22:08.788000-08:00", - "EndDateTime": "2023-03-06T20:27:03.126000-08:00" - } - }, - "InstanceFleetType": "CORE", - "TargetOnDemandCapacity": 1, - "TargetSpotCapacity": 0, - "ProvisionedOnDemandCapacity": 0, - "ProvisionedSpotCapacity": 0, - "InstanceTypeSpecifications": [ - { - "InstanceType": "c5a.8xlarge", - "WeightedCapacity": 1, - "BidPriceAsPercentageOfOnDemandPrice": 100.0, - "EbsBlockDevices": [ - { - "VolumeSpecification": { - "VolumeType": "gp2", - "SizeInGB": 64 - } - } - ] - } - ] - }, - { - "Id": "if-SB7S98AJEMP7", - "Name": "Task - 1", - "Status": { - "State": "TERMINATED", - "StateChangeReason": { - "Code": "CLUSTER_TERMINATED", - "Message": "Job flow terminated" - }, - "Timeline": { - "CreationDateTime": "2023-03-06T20:14:30.238000-08:00", - "ReadyDateTime": "2023-03-06T20:22:10.574000-08:00", - "EndDateTime": "2023-03-06T20:27:03.126000-08:00" - } - }, - "InstanceFleetType": "TASK", - "TargetOnDemandCapacity": 0, - "TargetSpotCapacity": 1, - "ProvisionedOnDemandCapacity": 0, - "ProvisionedSpotCapacity": 0, - "InstanceTypeSpecifications": [ - { - "InstanceType": "m4.large", - "WeightedCapacity": 1, - "BidPriceAsPercentageOfOnDemandPrice": 100.0, - "EbsBlockDevices": [ - { - "VolumeSpecification": { - "VolumeType": "gp2", - "SizeInGB": 32 - } - } - ] - } - ], - "LaunchSpecifications": { - "SpotSpecification": { - "TimeoutDurationMinutes": 120, - "TimeoutAction": "TERMINATE_CLUSTER" - } - } - }, - { - "Id": "if-1HD2lk5lfl23H", - "Name": "Master node", - "Status": { - "State": "TERMINATED", - "StateChangeReason": { - "Code": "CLUSTER_TERMINATED", - "Message": "Job flow terminated" - }, - "Timeline": { - "CreationDateTime": "2023-03-06T20:14:30.230000-08:00", - "ReadyDateTime": "2023-03-06T20:21:51.635000-08:00", - "EndDateTime": "2023-03-06T20:27:03.126000-08:00" - } - }, - "InstanceFleetType": "MASTER", - "TargetOnDemandCapacity": 1, - "TargetSpotCapacity": 0, - "ProvisionedOnDemandCapacity": 0, - "ProvisionedSpotCapacity": 0, - "InstanceTypeSpecifications": [ - { - "InstanceType": "m4.xlarge", - "WeightedCapacity": 1, - "BidPriceAsPercentageOfOnDemandPrice": 100.0, - "EbsBlockDevices": [ - { - "VolumeSpecification": { - "VolumeType": "gp2", - "SizeInGB": 32 - } - }, - { - "VolumeSpecification": { - "VolumeType": "gp2", - "SizeInGB": 32 - } - } - ] - } - ] - } - ] - }, - "Instances": [ - { - "Id": "ci-08367242A7KTF0W6Z17L", - "Ec2InstanceId": "i-09a090c99e87741fe", - "PublicDnsName": "ec2-54-166-68-104.compute-1.amazonaws.com", - "PublicIpAddress": "54.166.68.104", - "PrivateDnsName": "ip-172-31-102-249.ec2.internal", - "PrivateIpAddress": "172.31.102.249", - "Status": { - "State": "TERMINATED", - "StateChangeReason": { - "Code": "INSTANCE_FAILURE", - "Message": "Instance was terminated." - }, - "Timeline": { - "CreationDateTime": "2023-03-06T20:14:56.729000-08:00", - "ReadyDateTime": "2023-03-06T20:21:34.856000-08:00", - "EndDateTime": "2023-03-06T20:27:03.039000-08:00" - } - }, - "InstanceFleetId": "if-DR8F73EAI88V", - "Market": "ON_DEMAND", - "InstanceType": "c5a.8xlarge", - "EbsVolumes": [ - { - "Device": "/dev/sdb", - "VolumeId": "vol-0a02a3db57625ec28" - } - ] - }, - { - "Id": "ci-05794553JWMGGZCM3VPB", - "Ec2InstanceId": "i-0f806b0efc34e4850", - "PublicDnsName": "ec2-52-23-195-73.compute-1.amazonaws.com", - "PublicIpAddress": "52.23.195.73", - "PrivateDnsName": "ip-172-31-102-115.ec2.internal", - "PrivateIpAddress": "172.31.102.115", - "Status": { - "State": "TERMINATED", - "StateChangeReason": { - "Code": "INSTANCE_FAILURE", - "Message": "Instance was terminated." - }, - "Timeline": { - "CreationDateTime": "2023-03-06T20:15:33.349000-08:00", - "ReadyDateTime": "2023-03-06T20:21:51.635000-08:00", - "EndDateTime": "2023-03-06T20:27:03.039000-08:00" - } - }, - "InstanceFleetId": "if-1HD2lk5lfl23H", - "Market": "ON_DEMAND", - "InstanceType": "m4.xlarge", - "EbsVolumes": [ - { - "Device": "/dev/sdc", - "VolumeId": "vol-017761545cdfb7e7b" - }, - { - "Device": "/dev/sdb", - "VolumeId": "vol-0f489afef8b46dba1" - } - ] - }, - { - "Id": "ci-0317762Z528GJIFRW14", - "Ec2InstanceId": "i-01bcf3fa4aacd6711", - "PublicDnsName": "ec2-54-162-122-114.compute-1.amazonaws.com", - "PublicIpAddress": "54.162.122.114", - "PrivateDnsName": "ip-172-31-102-191.ec2.internal", - "PrivateIpAddress": "172.31.102.191", - "Status": { - "State": "TERMINATED", - "StateChangeReason": { - "Code": "INSTANCE_FAILURE", - "Message": "Instance was terminated." - }, - "Timeline": { - "CreationDateTime": "2023-03-06T20:15:33.349000-08:00", - "ReadyDateTime": "2023-03-06T20:21:34.856000-08:00", - "EndDateTime": "2023-03-06T20:27:03.039000-08:00" - } - }, - "InstanceFleetId": "if-SB7S98AJEMP7", - "Market": "SPOT", - "InstanceType": "m4.large", - "EbsVolumes": [ - { - "Device": "/dev/sdb", - "VolumeId": "vol-0a14ef44daa3bf876" - } - ] - } - ], - "Steps": [ - { - "Id": "s-1EF238MZKOWWR", - "Name": "Execute job script", - "Config": { - "Jar": "command-runner.jar", - "Properties": {}, - "Args": [ - "spark-submit", - "/home/hadoop/index_data_etl_1GB.py" - ] - }, - "ActionOnFailure": "CANCEL_AND_WAIT", - "Status": { - "State": "COMPLETED", - "StateChangeReason": {}, - "Timeline": { - "CreationDateTime": "2023-03-06T20:14:30.255000-08:00", - "StartDateTime": "2023-03-06T20:22:22.473000-08:00", - "EndDateTime": "2023-03-06T20:23:18.602000-08:00" - } - } - }, - { - "Id": "s-3CRVCEKJYF4ZG", - "Name": "Setup and copy files to cluster", - "Config": { - "Jar": "command-runner.jar", - "Properties": {}, - "Args": [ - "aws", - "s3", - "cp", - "s3://my-emr-data/etl-jobs/scripts/index_data_etl/index_data_etl_1GB.py", - "/home/hadoop/" - ] - }, - "ActionOnFailure": "CANCEL_AND_WAIT", - "Status": { - "State": "COMPLETED", - "StateChangeReason": {}, - "Timeline": { - "CreationDateTime": "2023-03-06T20:14:30.255000-08:00", - "StartDateTime": "2023-03-06T20:22:15.097000-08:00", - "EndDateTime": "2023-03-06T20:22:17.466000-08:00" - } - } - } - ], - "Region": "us-east-1" -} diff --git a/docs/guide/project.rst b/docs/guide/project.rst deleted file mode 100644 index c244ef8..0000000 --- a/docs/guide/project.rst +++ /dev/null @@ -1,80 +0,0 @@ -Projects -======== - -Projects are Sync-enabled Apache Spark applications. Once a project is created around an -application the performance and cost of that application can be continuously tracked to provide -analysis and recommendations. - -On-boarding ------------ - -There are varying degrees to which an Apache Spark application can be on-boarded. First however, a Sync project must be created: - -.. autofunction:: sync.api.projects.create_project - :noindex: - -For a more robust experience add an S3 location under which to store event logs and application configurations. If the application has an event log configuration based on -that location only a project reference is needed to track it in a Sync project. This library function provides a full EMR configuration for the project: - -.. autofunction:: sync.awsemr.get_project_job_flow - :noindex: - -At any point after at least 1 run of the project-configured application the latest prediction can be generated with :py:func:`~sync.awsemr.create_project_prediction`. - -To get the most out of your project each application run should be recorded. This way Sync can provide the best recommendations. The library function to call is, - -.. autofunction:: sync.awsemr.record_run - :noindex: - -Continuous Tuning ------------------ - -Iterative Optimization -~~~~~~~~~~~~~~~~~~~~~~ - -Sync projects track the status of an Apache Spark application as predictions are applied to further optimize it. -The progress of a project is cyclical: with a configuration and log from the previous run, a prediction is generated -and applied yielding a new log & configuration. - -.. image:: /_static/orchestration.png - -Each run has its own location in S3 under the project location for event logs and configuration. -It is keyed by timestamp to make browsing in the AWS console easier, and run ID to guarantee uniqueness: - -.. code-block:: text - - s3://{project bucket}/{project prefix}/{project ID}/{timestamp}/{run ID} - -Example: - -.. code-block:: text - - s3://megacorp-jobs/sync-projects/54129c79-ee4a-47cf-8bf3-3e2326443fbc/2022-11-15T13:51:29Z/01953ba2-ee4a-47cf-8bf3-80sbj2lapcn8 - - -Iterative Tracking and Notification -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Rather than applying a prediction every time, a discretionary approach may be preferable. Project -configuration should still be applied to the cluster so that orchestration can create a prediction -after each run. If a prediction is compelling it is applied to the configuration of subsequent runs. - -The flow goes like so, - -Setup: - -1. Create a project with application name/ID, S3 project location and optionally, a prediction preference - -Orchestration: - -1. Before an app is run the orchestrator updates the cluster configuration with the following - either manually, or by calling :py:func:`~sync.awsemr.get_project_job_flow` - - 1. Event log location: - - ``{project S3 URL}/{project ID}/{timestamp}/{run ID}`` - - 2. Sync tags: `sync:tenant-id`, `sync:project-id`, `sync:run-id` - -2. Applies the updated job flow -3. Records the run when the cluster completes diff --git a/docs/guide/start.rst b/docs/guide/start.rst index 6cfa49f..91633fa 100644 --- a/docs/guide/start.rst +++ b/docs/guide/start.rst @@ -28,10 +28,7 @@ The CLI provided with this package makes it easy to configure access to the Sync You'll be prompted for an API key which can be had from the account page of the Sync web app. This creates the Sync directory if it doesn't already exist and stores the API key in it at `~/.sync/credentials`. -You'll also be prompted for default values for, - -1. S3 location under which to store project data -2. Your prediction preference, e.g. "balanced" +You'll also be prompted for default values for and S3 location under which to store project data. These are optional to help with setting up multiple Sync projects. However, in the context of an existing project they are superseded by the corresponding properties of that project. They are stored in `~/.sync/config`. diff --git a/docs/index.rst b/docs/index.rst index 773f870..10d0ada 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -19,7 +19,6 @@ Welcome to the Sync Library! :caption: Reference reference/api - reference/awsemr reference/awsdatabricks reference/azuredatabricks diff --git a/docs/reference/api.rst b/docs/reference/api.rst index 5ea258a..bfb4ae8 100644 --- a/docs/reference/api.rst +++ b/docs/reference/api.rst @@ -5,5 +5,4 @@ Sync API :maxdepth: 2 :caption: Reference - api/projects - api/predictions \ No newline at end of file + api/projects \ No newline at end of file diff --git a/docs/reference/api/predictions.rst b/docs/reference/api/predictions.rst deleted file mode 100644 index 60ffc32..0000000 --- a/docs/reference/api/predictions.rst +++ /dev/null @@ -1,5 +0,0 @@ -Predictions -=========== - -.. automodule:: sync.api.predictions - :members: \ No newline at end of file diff --git a/docs/reference/awsemr.rst b/docs/reference/awsemr.rst deleted file mode 100644 index c1d1cb3..0000000 --- a/docs/reference/awsemr.rst +++ /dev/null @@ -1,6 +0,0 @@ -EMR -=== - -.. automodule:: sync.awsemr - :members: - diff --git a/sync/api/projects.py b/sync/api/projects.py index 4893ba7..8a073f6 100644 --- a/sync/api/projects.py +++ b/sync/api/projects.py @@ -277,7 +277,7 @@ def create_project_submission_with_eventlog_bytes( ) -> Response[str]: """Creates a submission given event log bytes instead of a URL - :param platform: platform, e.g. "aws-emr" + :param platform: platform, e.g. "aws-databricks" :type platform: Platform :param cluster_report: cluster report :type cluster_report: dict From 27485536fdd34bc66e8c5eb40a4a1ed3e689292c Mon Sep 17 00:00:00 2001 From: Sean Gorsky Date: Fri, 9 Feb 2024 22:24:52 -0500 Subject: [PATCH 09/18] clean up models --- sync/models.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/sync/models.py b/sync/models.py index 9677557..e6118ad 100644 --- a/sync/models.py +++ b/sync/models.py @@ -65,10 +65,6 @@ def __str__(self): return f"{self.code}: {self.message}" -class PredictionError(Error): - code: str = Field("Prediction Error", const=True) - - class ProjectError(Error): code: str = Field("Project Error", const=True) @@ -81,10 +77,6 @@ class SubmissionError(Error): code: str = Field("Submission Error", const=True) -class EMRError(Error): - code: str = Field("EMR Error", const=True) - - @unique class DatabricksPlanType(str, Enum): STANDARD = "Standard" From 83eb9d8c6946f3a72638f3ea8094eb95059cd629 Mon Sep 17 00:00:00 2001 From: Sean Gorsky Date: Sat, 10 Feb 2024 09:47:04 -0500 Subject: [PATCH 10/18] add dummy test so CI passes --- tests/test_dummy.py | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 tests/test_dummy.py diff --git a/tests/test_dummy.py b/tests/test_dummy.py new file mode 100644 index 0000000..603860f --- /dev/null +++ b/tests/test_dummy.py @@ -0,0 +1,2 @@ +def test_dummy(): + assert True \ No newline at end of file From d636a3043af6ad41eca686b4ac8e3855efca594f Mon Sep 17 00:00:00 2001 From: Sean Gorsky Date: Sat, 10 Feb 2024 09:48:58 -0500 Subject: [PATCH 11/18] make tidy --- tests/test_dummy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_dummy.py b/tests/test_dummy.py index 603860f..f4f5361 100644 --- a/tests/test_dummy.py +++ b/tests/test_dummy.py @@ -1,2 +1,2 @@ def test_dummy(): - assert True \ No newline at end of file + assert True From 89252562af9ed71b7a989bcb6c09f9b311f89c60 Mon Sep 17 00:00:00 2001 From: Sean Gorsky Date: Sun, 11 Feb 2024 10:01:23 -0500 Subject: [PATCH 12/18] redo main files to only delete prediction operations --- sync/_databricks.py | 547 ++++++++++++++++++++++++++++++++++++---- sync/awsdatabricks.py | 28 ++ sync/azuredatabricks.py | 30 +++ 3 files changed, 558 insertions(+), 47 deletions(-) diff --git a/sync/_databricks.py b/sync/_databricks.py index 26dc03d..ca1a1d5 100644 --- a/sync/_databricks.py +++ b/sync/_databricks.py @@ -41,6 +41,21 @@ def create_cluster(config: dict) -> Response[str]: return Response(result=response["cluster_id"]) +def get_cluster(cluster_id: str) -> Response[dict]: + """Get Databricks cluster. + + :param cluster_id: cluster ID + :type cluster_id: str + :return: cluster object + :rtype: Response[dict] + """ + cluster = get_default_client().get_cluster(cluster_id) + if "error_code" in cluster: + return Response(error=DatabricksAPIError(**cluster)) + + return Response(result=cluster) + + def create_submission_for_run( run_id: str, plan_type: str, @@ -486,6 +501,57 @@ def apply_project_recommendation( return Response(result=recommendation_id) +def get_recommendation_job(job_id: str, project_id: str, recommendation_id: str) -> Response[dict]: + """Apply the recommendation to the specified job. + + The basis job can only have tasks that run on the same cluster. That cluster is updated with the + configuration from the prediction and returned in the result job configuration. Use this function + to apply a prediction to an existing job or test a prediction with a one-off run. + + :param job_id: basis job ID + :type job_id: str + :param project_id: Sync project ID + :type project_id: str + :param recommendation_id: recommendation ID + :type recommendation_id: str + :return: job object with recommendation applied to it + :rtype: Response[dict] + """ + job = get_default_client().get_job(job_id) + + if "error_code" in job: + return Response(error=DatabricksAPIError(**job)) + + job_settings = job["settings"] + tasks = job_settings.get("tasks", []) + if tasks: + cluster_response = _get_job_cluster(tasks, job_settings.get("job_clusters", [])) + cluster = cluster_response.result + if cluster: + recommendation_cluster_response = get_recommendation_cluster( + cluster, project_id, recommendation_id + ) + recommendation_cluster = recommendation_cluster_response.result + if recommendation_cluster: + cluster_key = tasks[0].get("job_cluster_key") + if cluster_key: + job_settings["job_clusters"] = [ + j + for j in job_settings["job_clusters"] + if j.get("job_cluster_key") != cluster_key + ] + [{"job_cluster_key": cluster_key, "new_cluster": recommendation_cluster}] + else: + # For `new_cluster` definitions, Databricks will automatically assign the newly created cluster a name, + # and will reject any run submissions where the `cluster_name` is pre-populated + if "cluster_name" in recommendation_cluster: + del recommendation_cluster["cluster_name"] + tasks[0]["new_cluster"] = recommendation_cluster + return Response(result=job) + return recommendation_cluster_response + return cluster_response + return Response(error=DatabricksError(message="No task found in job")) + + def get_recommendation_cluster( cluster: dict, project_id: str, recommendation_id: str ) -> Response[dict]: @@ -521,6 +587,51 @@ def get_recommendation_cluster( return recommendation_response +def get_project_job(job_id: str, project_id: str, region_name: str = None) -> Response[dict]: + """Apply project configuration to a job. + + The job can only have tasks that run on the same job cluster. That cluster is updated with tags + and a log configuration to facilitate project continuity. The result can be tested in a + one-off run or applied to an existing job to surface run-time (see :py:func:`~run_job_object`) or cost optimizations. + + :param job_id: ID of basis job + :type job_id: str + :param project_id: Sync project ID + :type project_id: str + :param region_name: region name, defaults to AWS configuration + :type region_name: str, optional + :return: project job object + :rtype: Response[dict] + """ + job = get_default_client().get_job(job_id) + if "error_code" in job: + return Response(error=DatabricksAPIError(**job)) + + job_settings = job["settings"] + tasks = job_settings.get("tasks", []) + if tasks: + cluster_response = _get_job_cluster(tasks, job_settings.get("job_clusters", [])) + cluster = cluster_response.result + if cluster: + project_cluster_response = get_project_cluster(cluster, project_id, region_name) + project_cluster = project_cluster_response.result + if project_cluster: + cluster_key = tasks[0].get("job_cluster_key") + if cluster_key: + job_settings["job_clusters"] = [ + j + for j in job_settings["job_clusters"] + if j.get("job_cluster_key") != cluster_key + ] + [{"job_cluster_key": cluster_key, "new_cluster": project_cluster}] + else: + tasks[0]["new_cluster"] = project_cluster + + return Response(result=job) + return project_cluster_response + return cluster_response + return Response(error=DatabricksError(message="No task found in job")) + + def get_project_cluster(cluster: dict, project_id: str, region_name: str = None) -> Response[dict]: """Apply project configuration to a cluster. @@ -596,6 +707,342 @@ def get_project_cluster_settings(project_id: str, region_name: str = None) -> Re return project_response +def run_job_object(job: dict) -> Response[Tuple[str, str]]: + """Create a Databricks one-off run based on the job configuration. + + :param job: Databricks job object + :type job: dict + :return: run ID, and optionally ID of newly created cluster + :rtype: Response[Tuple[str, str]] + """ + tasks = job["settings"]["tasks"] + cluster_response = _get_job_cluster(tasks, job["settings"].get("job_clusters", [])) + + cluster = cluster_response.result + if cluster: + new_cluster_id = None + if len(tasks) == 1: + # For `new_cluster` definitions, Databricks will automatically assign the newly created cluster a name, + # and will reject any run submissions where the `cluster_name` is pre-populated + if "cluster_name" in cluster: + del cluster["cluster_name"] + + tasks[0]["new_cluster"] = cluster + del tasks[0]["job_cluster_key"] + else: + # If the original Job has a pre-existing Policy, we want to remove this from the `create_cluster` payload, + # since we are not allowed to create clusters with certain policies via that endpoint, e.g. we cannot + # create a `Job Compute` cluster via this endpoint. + if "policy_id" in cluster: + del cluster["policy_id"] + + # Create an "All-Purpose Compute" cluster + cluster["cluster_name"] = cluster["cluster_name"] or job["settings"]["name"] + cluster["autotermination_minutes"] = 10 # 10 minutes is the minimum + + cluster_result = get_default_client().create_cluster(cluster) + if "error_code" in cluster_result: + return Response(error=DatabricksAPIError(**cluster_result)) + + new_cluster_id = cluster_result["cluster_id"] + + for task in tasks: + task["existing_cluster_id"] = cluster_result["cluster_id"] + if "new_cluster" in task: + del task["new_cluster"] + if "job_cluster_key" in task: + del task["job_cluster_key"] + + run_result = get_default_client().create_run( + {"run_name": job["settings"]["name"], "tasks": tasks} + ) + if "error_code" in run_result: + return Response(error=DatabricksAPIError(**run_result)) + + return Response(result=(run_result["run_id"], new_cluster_id)) + return cluster_response + + +def create_run(run: dict) -> Response[str]: + """Creates a run based off the incoming Databricks run configuration + + :param run: run object + :type run: dict + :return: run ID + :rtype: Response[str] + """ + run_result = get_default_client().create_run(run) + if "error_code" in run_result: + return Response(error=DatabricksAPIError(**run_result)) + + return Response(result=run_result["run_id"]) + + +def run_and_record_project_job( + job_id: str, project_id: str, plan_type: str, compute_type: str, region_name: str = None +) -> Response[str]: + """Runs the specified job and adds the result to the project. + + This function waits for the run to complete. + + :param job_id: Databricks job ID + :type job_id: str + :param project_id: Sync project ID + :type project_id: str + :param plan_type: either "Standard", "Premium" or "Enterprise" + :type plan_type: str + :param compute_type: e.g. "Jobs Compute" + :type compute_type: str + :param region_name: region name, defaults to AWS configuration + :type region_name: str, optional + :return: prediction ID + :rtype: Response[str] + """ + project_job_response = get_project_job(job_id, project_id, region_name) + project_job = project_job_response.result + if project_job: + return run_and_record_job_object(project_job, plan_type, compute_type, project_id) + return project_job_response + + +def run_and_record_job( + job_id: str, plan_type: str, compute_type: str, project_id: str = None +) -> Response[str]: + """Runs the specified job and creates a prediction based on the result. + + If a project is specified the prediction is added to it. + + :param job_id: Databricks job ID + :type job_id: str + :param plan_type: either "Standard", "Premium" or "Enterprise" + :type plan_type: str + :param compute_type: e.g. "Jobs Compute" + :type compute_type: str + :param project_id: Sync project ID, defaults to None + :type project_id: str, optional + :return: prediction ID + :rtype: Response[str] + """ + # creates a "Jobs Compute" cluster + run_result = get_default_client().create_job_run({"job_id": job_id}) + if "error_code" in run_result: + return Response(error=DatabricksAPIError(**run_result)) + + run_id = run_result["run_id"] + return wait_for_and_record_run(run_id, plan_type, compute_type, project_id) + + +def run_and_record_job_object( + job: dict, plan_type: str, compute_type: str, project_id: str = None +) -> Response[str]: + """Creates a one-off Databricks run based on the provided job object. + + Job tasks must use the same job cluster, and that cluster must be configured to store the + event logs in S3. + + :param job: Databricks job object + :type job: dict + :param plan_type: either "Standard", "Premium" or "Enterprise" + :type plan_type: str + :param compute_type: e.g. "Jobs Compute" + :type compute_type: str + :param project_id: Sync project ID, defaults to None + :type project_id: str, optional + :return: prediction ID + :rtype: Response[str] + """ + run_response = run_job_object(job) + run_and_cluster_ids = run_response.result + if run_and_cluster_ids: + response = wait_for_run_and_cluster(run_and_cluster_ids[0]) + result_state = response.result + if result_state: + if result_state == "SUCCESS": + response = record_run(run_and_cluster_ids[0], plan_type, compute_type, project_id) + else: + response = Response( + error=DatabricksError(message=f"Unsuccessful run result state: {result_state}") + ) + + for cluster_id in run_and_cluster_ids[1:]: + delete_cluster_response = get_default_client().delete_cluster(cluster_id) + if "error_code" in delete_cluster_response: + logger.warning( + f"Failed to delete cluster {cluster_id}: {delete_cluster_response['error_code']}: {delete_cluster_response['message']}" + ) + + return response + return run_response + + +def create_and_record_run( + run: dict, plan_type: str, compute_type: str, project_id: str = None +) -> Response[str]: + """Applies the Databricks run configuration and creates a prediction based on the result. + + If a project is specified the resulting prediction is added to it. This function waits for + run to complete. + + :param run: Databricks run configuration + :type run: dict + :param plan_type: either "Standard", "Premium" or "Enterprise" + :type plan_type: str + :param compute_type: e.g. "Jobs Compute" + :type compute_type: str + :param project_id: Sync project ID, defaults to None + :type project_id: str, optional + :return: prediction ID + :rtype: Response[str] + """ + run_response = create_run(run) + run_id = run_response.result + if run_id: + return wait_for_and_record_run(run_id, plan_type, compute_type, project_id) + return run_response + + +def wait_for_and_record_run( + run_id: str, plan_type: str, compute_type: str, project_id: str = None +) -> Response[str]: + """Waits for a run to complete before creating a prediction. + + The run must save 1 event log to S3. If a project is specified the prediction is added + to that project. + + :param run_id: Databricks run ID + :type run_id: str + :param plan_type: either "Standard", "Premium" or "Enterprise" + :type plan_type: str + :param compute_type: e.g. "Jobs Compute" + :type compute_type: str + :param project_id: Sync project ID, defaults to None + :type project_id: str, optional + :return: prediction ID + :rtype: Response[str] + """ + wait_response = wait_for_final_run_status(run_id) + result_state = wait_response.result + if result_state: + if result_state == "SUCCESS": + return record_run(run_id, plan_type, compute_type, project_id) + return Response( + error=DatabricksError(message=f"Unsuccessful run result state: {result_state}") + ) + return wait_response + + +def create_and_wait_for_run(run: dict) -> Response[str]: + """Creates a Databricks run from the incoming configuration and returns the final status. + + This function waits for the run to complete. + + :param run: Databricks run configuration + :type run: dict + :return: result state, e.g. "SUCCESS" + :rtype: Response[str] + """ + run_response = create_run(run) + if run_response.error: + return run_response + + return wait_for_final_run_status(run_response.result) + + +def wait_for_final_run_status(run_id: str) -> Response[str]: + """Waits for run returning final status. + + :param run_id: Databricks run ID + :type run_id: str + :return: result state, e.g. "SUCCESS" + :rtype: Response[str] + """ + run = get_default_client().get_run(run_id) + while "error_code" not in run: + result_state = run["state"].get("result_state") # result_state isn't present while running + if result_state in {"SUCCESS", "FAILED", "TIMEDOUT", "CANCELED"}: + return Response(result=result_state) + + sleep(30) + run = get_default_client().get_run(run_id) + + return Response(error=DatabricksAPIError(**run)) + + +def wait_for_run_and_cluster(run_id: str) -> Response[str]: + """Waits for final run status and returns it after terminating the cluster. + + :param run_id: Databricks run ID + :type run_id: str + :return: result state, e.g. "SUCCESS" + :rtype: Response[str] + """ + run = get_default_client().get_run(run_id) + while "error_code" not in run: + result_state = run["state"].get("result_state") # result_state isn't present while running + if result_state in {"SUCCESS", "FAILED", "TIMEDOUT", "CANCELED"}: + for cluster_id in {task.get("existing_cluster_id") for task in run["tasks"]}: + cluster_response = terminate_cluster(cluster_id) + if cluster_response.error: + return cluster_response + return Response(result=result_state) + + sleep(30) + run = get_default_client().get_run(run_id) + + return Response(error=DatabricksAPIError(**run)) + + +def terminate_cluster(cluster_id: str) -> Response[dict]: + """Terminate Databricks cluster and wait to return final state. + + :param cluster_id: Databricks cluster ID + :type cluster_id: str + :return: Databricks cluster object with state: "TERMINATED" + :rtype: Response[str] + """ + cluster = get_default_client().get_cluster(cluster_id) + if "error_code" not in cluster: + state = cluster.get("state") + if state == "TERMINATED": + return Response(result=cluster) + elif state == "TERMINATING": + return _wait_for_cluster_termination(cluster_id) + elif state in {"PENDING", "RUNNING", "RESTARTING", "RESIZING"}: + get_default_client().terminate_cluster(cluster_id) + return _wait_for_cluster_termination(cluster_id) + else: + return Response(error=DatabricksError(message=f"Unexpected cluster state: {state}")) + + return Response(error=DatabricksAPIError(**cluster)) + + +def _wait_for_cluster_termination( + cluster_id: str, timeout_seconds=600, poll_seconds=10 +) -> Response[dict]: + logging.info(f"Waiting for cluster {cluster_id} to terminate") + start_seconds = time.time() + cluster = get_default_client().get_cluster(cluster_id) + while "error_code" not in cluster: + state = cluster.get("state") + if state == "TERMINATED": + return Response(result=cluster) + elif state == "TERMINATING": + sleep(poll_seconds) + else: + return Response(error=DatabricksError(message=f"Unexpected cluster state: {state}")) + + if time.time() - start_seconds > timeout_seconds: + return Response( + error=DatabricksError( + message=f"Cluster failed to terminate after waiting {timeout_seconds} seconds" + ) + ) + + cluster = get_default_client().get_cluster(cluster_id) + + return Response(error=DatabricksAPIError(**cluster)) + + def _cluster_log_destination( cluster: dict, ) -> Union[Tuple[str, str, str, str], Tuple[None, None, None, None]]: @@ -620,6 +1067,20 @@ def _cluster_log_destination( return None, None, None, None +def _get_job_cluster(tasks: List[dict], job_clusters: list) -> Response[dict]: + if len(tasks) == 1: + return _get_task_cluster(tasks[0], job_clusters) + + if [t.get("job_cluster_key") for t in tasks].count(tasks[0].get("job_cluster_key")) == len( + tasks + ): + for cluster in job_clusters: + if cluster["job_cluster_key"] == tasks[0].get("job_cluster_key"): + return Response(result=cluster["new_cluster"]) + return Response(error=DatabricksError(message="No cluster found for task")) + return Response(error=DatabricksError(message="Not all tasks use the same cluster")) + + def _get_project_job_clusters( job: dict, exclude_tasks: Union[Collection[str], None] = None, @@ -775,6 +1236,22 @@ def _get_run_spark_context_id(tasks: List[dict]) -> Response[str]: return Response(error=DatabricksError(message="More than 1 cluster found for tasks")) +def _get_task_cluster(task: dict, clusters: list) -> Response[dict]: + cluster = task.get("new_cluster") + + if not cluster: + cluster_matches = [ + candidate + for candidate in clusters + if candidate["job_cluster_key"] == task.get("job_cluster_key") + ] + if cluster_matches: + cluster = cluster_matches[0]["new_cluster"] + else: + return Response(error=DatabricksError(message="No cluster found for task")) + return Response(result=cluster) + + def _s3_contents_have_all_rollover_logs(contents: List[dict], run_end_time_seconds: float): final_rollover_log = contents and next( ( @@ -834,6 +1311,11 @@ def _check_total_file_size_changed( return True, new_total_file_size +def _event_log_poll_duration_seconds(): + """Convenience function to aid testing""" + return 15 + + def _get_eventlog_from_s3( cluster_id: str, bucket: str, @@ -989,7 +1471,7 @@ def _get_eventlog( # https://docs.databricks.com/clusters/configure.html#cluster-log-delivery-1 # So we will poll this location for *up to* 5 minutes until we see all the eventlog files we are expecting # in the S3 bucket - poll_duration_seconds = 15 + poll_duration_seconds = _event_log_poll_duration_seconds() if filesystem == "s3": return _get_eventlog_from_s3( @@ -1011,29 +1493,6 @@ def _get_eventlog( return Response(error=DatabricksError(message=f"Unknown log destination: {filesystem}")) -KeyType = TypeVar("KeyType") - - -def _deep_update( - mapping: Dict[KeyType, Any], *updating_mappings: Dict[KeyType, Any] -) -> Dict[KeyType, Any]: - updated_mapping = mapping.copy() - for updating_mapping in updating_mappings: - for k, v in updating_mapping.items(): - if k in updated_mapping: - if isinstance(updated_mapping[k], dict) and isinstance(v, dict): - updated_mapping[k] = _deep_update(updated_mapping[k], v) - elif isinstance(updated_mapping[k], list) and isinstance(v, list): - updated_mapping[k] += v - else: - updated_mapping[k] = v - else: - updated_mapping[k] = v - return updated_mapping - - -# The methods below here are all called within the "subclass scripts" -# awsdatabricks.py and azuredatabricks.py def _get_all_cluster_events(cluster_id: str): """Fetches all ClusterEvents for a given Databricks cluster, optionally within a time window. Pages will be followed and returned as 1 object @@ -1103,28 +1562,22 @@ def _update_monitored_timelines( return active_timelines_by_id, retired_inst_timeline_list -def _wait_for_cluster_termination( - cluster_id: str, timeout_seconds=600, poll_seconds=10 -) -> Response[dict]: - logging.info(f"Waiting for cluster {cluster_id} to terminate") - start_seconds = time.time() - cluster = get_default_client().get_cluster(cluster_id) - while "error_code" not in cluster: - state = cluster.get("state") - if state == "TERMINATED": - return Response(result=cluster) - elif state == "TERMINATING": - sleep(poll_seconds) - else: - return Response(error=DatabricksError(message=f"Unexpected cluster state: {state}")) - - if time.time() - start_seconds > timeout_seconds: - return Response( - error=DatabricksError( - message=f"Cluster failed to terminate after waiting {timeout_seconds} seconds" - ) - ) +KeyType = TypeVar("KeyType") - cluster = get_default_client().get_cluster(cluster_id) - return Response(error=DatabricksAPIError(**cluster)) +def _deep_update( + mapping: Dict[KeyType, Any], *updating_mappings: Dict[KeyType, Any] +) -> Dict[KeyType, Any]: + updated_mapping = mapping.copy() + for updating_mapping in updating_mappings: + for k, v in updating_mapping.items(): + if k in updated_mapping: + if isinstance(updated_mapping[k], dict) and isinstance(v, dict): + updated_mapping[k] = _deep_update(updated_mapping[k], v) + elif isinstance(updated_mapping[k], list) and isinstance(v, list): + updated_mapping[k] += v + else: + updated_mapping[k] = v + else: + updated_mapping[k] = v + return updated_mapping diff --git a/sync/awsdatabricks.py b/sync/awsdatabricks.py index f937216..0efb31e 100644 --- a/sync/awsdatabricks.py +++ b/sync/awsdatabricks.py @@ -16,13 +16,27 @@ _update_monitored_timelines, _wait_for_cluster_termination, apply_project_recommendation, + create_and_record_run, + create_and_wait_for_run, create_cluster, + create_run, create_submission_for_run, + get_cluster, get_cluster_report, get_project_cluster, get_project_cluster_settings, + get_project_job, + get_recommendation_job, handle_successful_job_run, record_run, + run_and_record_job, + run_and_record_job_object, + run_and_record_project_job, + run_job_object, + terminate_cluster, + wait_for_and_record_run, + wait_for_final_run_status, + wait_for_run_and_cluster, ) from sync.api import get_access_report as get_api_access_report from sync.clients.databricks import get_default_client @@ -40,14 +54,28 @@ __all__ = [ "get_access_report", + "run_and_record_job", "create_submission_for_run", "get_cluster_report", "monitor_cluster", "create_cluster", + "get_cluster", "handle_successful_job_run", "record_run", + "get_project_job", "get_project_cluster", "get_project_cluster_settings", + "get_recommendation_job", + "run_job_object", + "create_run", + "run_and_record_project_job", + "run_and_record_job_object", + "create_and_record_run", + "wait_for_and_record_run", + "create_and_wait_for_run", + "wait_for_final_run_status", + "wait_for_run_and_cluster", + "terminate_cluster", "apply_project_recommendation", ] diff --git a/sync/azuredatabricks.py b/sync/azuredatabricks.py index 340cf1e..c908165 100644 --- a/sync/azuredatabricks.py +++ b/sync/azuredatabricks.py @@ -20,12 +20,27 @@ _update_monitored_timelines, _wait_for_cluster_termination, apply_project_recommendation, + create_and_record_run, + create_and_wait_for_run, create_cluster, + create_run, create_submission_for_run, + get_cluster, get_cluster_report, get_project_cluster, get_project_cluster_settings, + get_project_job, + get_recommendation_job, + handle_successful_job_run, record_run, + run_and_record_job, + run_and_record_job_object, + run_and_record_project_job, + run_job_object, + terminate_cluster, + wait_for_and_record_run, + wait_for_final_run_status, + wait_for_run_and_cluster, ) from sync.api import get_access_report as get_api_access_report from sync.clients.databricks import get_default_client @@ -42,14 +57,29 @@ __all__ = [ "get_access_report", + "run_and_record_job", "monitor_cluster", "create_cluster", + "get_cluster", "create_submission_for_run", "get_cluster_report", + "handle_successful_job_run", "record_run", "get_project_cluster", + "get_project_job", + "get_recommendation_job", "get_project_cluster", "get_project_cluster_settings", + "run_job_object", + "create_run", + "run_and_record_project_job", + "run_and_record_job_object", + "create_and_record_run", + "wait_for_and_record_run", + "create_and_wait_for_run", + "wait_for_final_run_status", + "wait_for_run_and_cluster", + "terminate_cluster", "apply_project_recommendation", ] From bf7f69aac2dd2ec54aa09f4899d85755fdf1da42 Mon Sep 17 00:00:00 2001 From: Sean Gorsky Date: Sun, 11 Feb 2024 10:11:08 -0500 Subject: [PATCH 13/18] add in new products endpoint --- sync/clients/sync.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sync/clients/sync.py b/sync/clients/sync.py index 8063a40..3a7d700 100644 --- a/sync/clients/sync.py +++ b/sync/clients/sync.py @@ -60,6 +60,9 @@ def __init__(self, api_url: str, api_key: APIKey): ) ) + def get_products(self) -> dict: + return self._send(self._client.build_request("GET", "/v1/projects/products")) + def create_project(self, project: dict) -> dict: headers, content = encode_json(project) return self._send( From c2206fb061ce4d0eadf5be49b32a4173526dd6db Mon Sep 17 00:00:00 2001 From: Sean Gorsky Date: Sun, 11 Feb 2024 10:26:15 -0500 Subject: [PATCH 14/18] add back in products and tokens cli commands --- sync/api/projects.py | 9 +++++++++ sync/cli/__init__.py | 24 ++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/sync/api/projects.py b/sync/api/projects.py index 8a073f6..64d845b 100644 --- a/sync/api/projects.py +++ b/sync/api/projects.py @@ -16,6 +16,15 @@ logger = logging.getLogger(__name__) +def get_products() -> Response[List[str]]: + """Get supported platforms + :return: list of platform names + :rtype: Response[list[str]] + """ + response = get_default_client().get_products() + return Response(**response) + + def create_project( name: str, product_code: str, diff --git a/sync/cli/__init__.py b/sync/cli/__init__.py index ca14e41..17e74e0 100644 --- a/sync/cli/__init__.py +++ b/sync/cli/__init__.py @@ -5,8 +5,10 @@ import click +from sync.api.projects import get_products from sync.cli import awsdatabricks, azuredatabricks, projects, workspaces from sync.cli.util import OPTIONAL_DEFAULT +from sync.clients.sync import get_default_client from sync.config import API_KEY, DB_CONFIG, APIKey, DatabricksConf, init LOG_FORMAT = "%(asctime)s %(levelname)s [%(name)s] %(message)s" @@ -81,3 +83,25 @@ def configure( and dbx_region != OPTIONAL_DEFAULT else None, ) + + +@main.command +def products(): + """List supported products""" + products_response = get_products() + products = products_response.result + if products: + click.echo(", ".join(products)) + else: + click.echo(str(products_response.error), err=True) + + +@main.command +def token(): + """Get an API access token""" + sync_client = get_default_client() + response = sync_client.get_products() + if "result" in response: + click.echo(sync_client._client.auth._access_token) + else: + click.echo(f"{response['error']['code']}: {response['error']['message']}", err=True) From 2113e55a1bdb682ef309d8ac93891f3923063b19 Mon Sep 17 00:00:00 2001 From: Sean Gorsky Date: Mon, 12 Feb 2024 15:30:41 -0500 Subject: [PATCH 15/18] tidy --- tests/test_awsdatabricks.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_awsdatabricks.py b/tests/test_awsdatabricks.py index d022936..122712d 100644 --- a/tests/test_awsdatabricks.py +++ b/tests/test_awsdatabricks.py @@ -1,7 +1,9 @@ import unittest from unittest.mock import patch + from sync.awsdatabricks import monitor_cluster + @patch("sync.awsdatabricks._monitor_cluster") @patch("sync.clients.databricks.DatabricksClient.get_cluster") @patch( From 2419eabdfc1c0e2184993b2636ea51e0714585df Mon Sep 17 00:00:00 2001 From: Sean Gorsky Date: Mon, 12 Feb 2024 15:31:53 -0500 Subject: [PATCH 16/18] bump major version. Tidy --- sync/__init__.py | 2 +- sync/_databricks.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/sync/__init__.py b/sync/__init__.py index 9ef6c4f..e03d156 100644 --- a/sync/__init__.py +++ b/sync/__init__.py @@ -1,4 +1,4 @@ """Library for leveraging the power of Sync""" -__version__ = "0.6.4" +__version__ = "1.0.0" TIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ" diff --git a/sync/_databricks.py b/sync/_databricks.py index ca1a1d5..52816eb 100644 --- a/sync/_databricks.py +++ b/sync/_databricks.py @@ -17,7 +17,6 @@ from sync.api import projects from sync.clients.databricks import get_default_client -from sync.config import CONFIG from sync.models import DatabricksAPIError, DatabricksClusterReport, DatabricksError, Response from sync.utils.dbfs import format_dbfs_filepath, read_dbfs_file From d9cbef89925ac3d891eaa745dcfb16cd1b351076 Mon Sep 17 00:00:00 2001 From: Sean Gorsky Date: Mon, 12 Feb 2024 18:27:24 -0500 Subject: [PATCH 17/18] try putting unnecessary import back in --- sync/_databricks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sync/_databricks.py b/sync/_databricks.py index 52816eb..7045e17 100644 --- a/sync/_databricks.py +++ b/sync/_databricks.py @@ -17,6 +17,7 @@ from sync.api import projects from sync.clients.databricks import get_default_client +from sync.config import CONFIG # noqa F401 from sync.models import DatabricksAPIError, DatabricksClusterReport, DatabricksError, Response from sync.utils.dbfs import format_dbfs_filepath, read_dbfs_file From 6be1f661533f7eb5d8b9de59fda118f6eccc0dbe Mon Sep 17 00:00:00 2001 From: Sean Gorsky Date: Mon, 12 Feb 2024 18:45:07 -0500 Subject: [PATCH 18/18] mock db_config --- tests/test_awsdatabricks.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tests/test_awsdatabricks.py b/tests/test_awsdatabricks.py index 122712d..48d7a14 100644 --- a/tests/test_awsdatabricks.py +++ b/tests/test_awsdatabricks.py @@ -2,13 +2,20 @@ from unittest.mock import patch from sync.awsdatabricks import monitor_cluster +from sync.config import DatabricksConf + +MOCK_DBX_CONF = DatabricksConf( + host="https://dbc-123.cloud.databricks.com", + token="my_secret_token", + aws_region_name="us-east-1", +) +@patch("sync.awsdatabricks.DB_CONFIG", new=MOCK_DBX_CONF) +@patch("sync.clients.databricks.DB_CONFIG", new=MOCK_DBX_CONF) @patch("sync.awsdatabricks._monitor_cluster") @patch("sync.clients.databricks.DatabricksClient.get_cluster") -@patch( - "sync.awsdatabricks._cluster_log_destination", -) +@patch("sync.awsdatabricks._cluster_log_destination") class TestMonitorCluster(unittest.TestCase): def test_monitor_cluster_with_override( self,