[PROD-399] Handle edge case of single rollover file with index > 0 #9
Changes from all commits
```diff
@@ -23,51 +23,63 @@ def _validate_work_dir(self, work_dir: Path | str) -> Path:
         return work_dir_path
 
     def build(self) -> Path:
-        event_logs = self.extractor.extract()
+        paths = self.extractor.extract()
 
-        self.event_log = self._concat(event_logs)
+        if not paths:
+            raise ValueError("No files found")
 
-        return self.event_log
+        self.event_log = self._get_event_log(paths)
 
-    def _concat(self, event_logs: list[Path]) -> Path:
-        if len(event_logs) == 1:
-            return event_logs[0]
+        return self.event_log
 
-        dat = []
-        for log in event_logs:
-            with open(log) as log_file:
+    def _get_event_log(self, paths: list[Path]) -> Path:
+        log_files = []
+        rollover_dat = []
+        for path in paths:
+            with open(path) as fobj:
                 try:
-                    line = json.loads(log_file.readline())
+                    line = json.loads(fobj.readline())
                 except ValueError:
-                    continue  # Maybe a Databricks pricing file
-                if line["Event"] == "DBCEventLoggingListenerMetadata":
-                    dat.append((line["Rollover Number"], line["SparkContext Id"], log))
-                else:
-                    raise ValueError("Expected DBC event not found")
+                    continue
+                if "Event" in line:
+                    log_files.append(path)
+                    if line["Event"] == "DBCEventLoggingListenerMetadata":
+                        rollover_dat.append(
+                            (line["Rollover Number"], line["SparkContext Id"], path)
+                        )
 
-        df = pd.DataFrame(dat, columns=["rollover_index", "context_id", "path"]).sort_values(
-            "rollover_index"
-        )
+        if rollover_dat:
+            if len(log_files) > len(rollover_dat):
+                raise ValueError("No rollover properties found in log file")
 
-        self._validate_rollover_logs(df)
+            return self._concat(rollover_dat)
+
+        if len(log_files) > 1:
+            raise ValueError("No rollover properties found in log file")
+
+        return log_files[0]
+
+    def _concat(self, rollover_dat: list[tuple[str, str, str]]) -> Path:
+        rollover_df = pd.DataFrame(
+            rollover_dat, columns=["rollover_index", "context_id", "path"]
+        ).sort_values("rollover_index")
+
+        if not len(rollover_df.context_id.unique()) == 1:
+            raise ValueError("Not all rollover log files have the same Spark context ID")
+
+        diffs = rollover_df.rollover_index.diff()
+
+        if any(diffs > 1) or rollover_df.rollover_index[0] > 0:
+            raise ValueError("Rollover log file appears to be missing")
+
+        if any(diffs < 1):
+            raise ValueError("Duplicate rollover log file detected")
 
         event_log = Path(tempfile.mkstemp(suffix="-concatenated.json", dir=str(self.work_dir))[1])
         with open(event_log, "w") as fobj:
-            for path in df.path:
+            for path in rollover_df.path:
                 with open(path) as part_fobj:
                     for line in part_fobj:
                         fobj.write(line)
 
         return event_log
-
-    def _validate_rollover_logs(self, df: pd.DataFrame):
-        if not len(df.context_id.unique()) == 1:
-            raise ValueError("Not all rollover files have the same Spark context ID")
-
-        diffs = df.rollover_index.diff()[1:]
-
-        if any(diffs > 1) or df.rollover_index[0] > 0:
-            raise ValueError("Rollover file appears to be missing")
-
-        if any(diffs < 1):
-            raise ValueError("Duplicate rollover file detected")
```
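For context, here is a small sketch (hypothetical file names and contents, not part of this PR) of the kinds of first lines the new `_get_event_log` has to distinguish: a rollover log carrying DBC metadata, a plain Spark event log, and a non-JSON file such as a Databricks pricing file.

```python
import json

# Hypothetical first lines of three extracted files (illustrative only).
first_lines = {
    "eventlog--0.json": '{"Event": "DBCEventLoggingListenerMetadata", "Rollover Number": 0, "SparkContext Id": "ctx-1"}',
    "eventlog.json": '{"Event": "SparkListenerLogStart", "Spark Version": "3.3.0"}',
    "pricing.csv": "instance,usd_per_hour",
}

for name, raw in first_lines.items():
    try:
        line = json.loads(raw)
    except ValueError:  # not JSON, e.g. a pricing file: skip it
        print(f"{name}: skipped")
        continue
    if "Event" in line:
        if line["Event"] == "DBCEventLoggingListenerMetadata":
            print(f"{name}: rollover log, index {line['Rollover Number']}")
        else:
            print(f"{name}: plain event log")
```

Only files whose first line is JSON and contains an `"Event"` key count as log files; of those, only the ones with DBC rollover metadata go through the concatenation path.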
|
Review thread on the removed check `if any(diffs > 1) or df.rollover_index[0] > 0:`

**Contributor:** I remember seeing this in the last PR to check for the case this PR is addressing. Did this logic just not get run if there was only one log?

**Contributor (Author):** Exactly. With only one log file, rollover validation was skipped.
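A minimal sketch of the edge case this PR targets (the values are made up): with a single rollover file whose metadata reports a rollover number greater than zero, the old code returned that file untouched, while the new `_concat` always builds the rollover DataFrame, so the `rollover_index[0] > 0` check now flags the missing earlier files.

```python
import pandas as pd

# A single extracted rollover log whose metadata line said "Rollover Number": 3
# (hypothetical tuple; in the parser it comes from _get_event_log).
rollover_dat = [(3, "ctx-1", "eventlog--3.json")]

rollover_df = pd.DataFrame(
    rollover_dat, columns=["rollover_index", "context_id", "path"]
).sort_values("rollover_index")

diffs = rollover_df.rollover_index.diff()  # single row -> one NaN, so `diffs > 1` is all False

# With one file the old implementation never reached this check; now it runs
# and reports the log files that should precede rollover index 3.
if any(diffs > 1) or rollover_df.rollover_index[0] > 0:
    raise ValueError("Rollover log file appears to be missing")
```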
Review thread on the use of `sys.exit()`:

Comment: I think we should refrain from using `sys.exit()` here. If the spark_log_parser is used anywhere in the backend or in a Celery task, it would terminate the process and likely cause unintended side effects.

Reply: I think we're ok here. This code is only executed by users on the command line, e.g.

Reply: Ok, makes sense. It caught my eye because in some areas we raise a `ValueError` but here we exit the process.
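One common way to reconcile the two conventions discussed above (a sketch only; none of these names exist in the repo, and `run_parser` is a stand-in for the real command-line logic): library code keeps raising `ValueError`, and only the command-line entry point translates it into an exit status.

```python
import sys


def run_parser() -> None:
    # Stand-in for the real command-line logic; library code signals
    # problems by raising ValueError rather than exiting.
    raise ValueError("No files found")


def main() -> int:
    try:
        run_parser()
    except ValueError as exc:
        # The CLI boundary is the only place that turns errors into an exit code.
        print(f"error: {exc}", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())
```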