diff --git a/spark_log_parser/__init__.py b/spark_log_parser/__init__.py
index 72dd153..afe350d 100644
--- a/spark_log_parser/__init__.py
+++ b/spark_log_parser/__init__.py
@@ -1,3 +1,3 @@
 """Tools for providing Spark event log"""
 
-__version__ = "0.1.0"
+__version__ = "0.1.1"
diff --git a/spark_log_parser/parsing_models/application_model_v2.py b/spark_log_parser/parsing_models/application_model_v2.py
index 0ab4c4b..dd69819 100644
--- a/spark_log_parser/parsing_models/application_model_v2.py
+++ b/spark_log_parser/parsing_models/application_model_v2.py
@@ -134,10 +134,14 @@ def getSQLinfo(self, appobj):
         sql_jobs = []
         sql_stages = []
         sql_tasks = []
 
-        for jid, job in appobj.jobs.items():
-            if "end_time" not in sql.keys():
-                sql["end_time"] = appobj.finish_time
+        # Sometimes an SQL event will be missing. To be informative, both
+        # events must be present. But this information is not critical, so
+        # if either event is missing then simply reject the SQL data
+        if "start_time" not in sql.keys() or "end_time" not in sql.keys():
+            continue
+
+        for jid, job in appobj.jobs.items():
 
             if (job.submission_time >= sql["start_time"]) and (
                 job.submission_time <= sql["end_time"]
diff --git a/tests/logs/emr_missing_sql_events.zip b/tests/logs/emr_missing_sql_events.zip
new file mode 100644
index 0000000..7797929
Binary files /dev/null and b/tests/logs/emr_missing_sql_events.zip differ
diff --git a/tests/test_parse.py b/tests/test_parse.py
index 99a5c01..d5f70fb 100644
--- a/tests/test_parse.py
+++ b/tests/test_parse.py
@@ -64,3 +64,13 @@ def test_simple_emr_log():
     assert (
         parsed["metadata"]["application_info"]["name"] == "Text Similarity"
     ), "Name is as expected"
+
+
+def test_emr_missing_sql_events():
+    event_log_path = Path("tests", "logs", "emr_missing_sql_events.zip").resolve()
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        event_log = eventlog.EventLogBuilder(event_log_path.as_uri(), temp_dir).build()
+        obj = sparkApplication(eventlog=str(event_log))
+
+    assert list(obj.sqlData.index.values) == [0, 2, 3, 5, 6, 7, 8]