Skip to content

pyarrow.lib.ArrowInvalid: offset overflow while concatenating arrays, consider casting input from string to large_string first #444

@Jacobsonradical

Description

@Jacobsonradical

Describe the bug
concurrent.futures.process._RemoteTraceback:
"""
Traceback (most recent call last):
File "/root/.pyenv/versions/3.10.20/lib/python3.10/concurrent/futures/process.py", line 246, in _process_worker
r = call_item.fn(*call_item.args, **call_item.kwargs)
File "/workspace/CM20260505/scoring/run_scoring.py", line 311, in _run_scorer_in_parallel
return _run_scorer_parallelizable(
File "/workspace/CM20260505/scoring/run_scoring.py", line 387, in _run_scorer_parallelizable
scoringResults = scorer.prescore(scoringArgs, preserveRatings=not runParallel)
File "/workspace/CM20260505/scoring/scorer.py", line 293, in prescore
ratings, noteStatusHistory = self._filter_input(
File "/workspace/CM20260505/scoring/scorer.py", line 147, in _filter_input
ratings = ratings.merge(
File "/root/.pyenv/versions/3.10.20/lib/python3.10/site-packages/pandas/core/frame.py", line 10832, in merge
return merge(
File "/root/.pyenv/versions/3.10.20/lib/python3.10/site-packages/pandas/core/reshape/merge.py", line 184, in merge
return op.get_result(copy=copy)
File "/root/.pyenv/versions/3.10.20/lib/python3.10/site-packages/pandas/core/reshape/merge.py", line 886, in get_result
join_index, left_indexer, right_indexer = self._get_join_info()
File "/root/.pyenv/versions/3.10.20/lib/python3.10/site-packages/pandas/core/reshape/merge.py", line 1151, in _get_join_info
(left_indexer, right_indexer) = self._get_join_indexers()
File "/root/.pyenv/versions/3.10.20/lib/python3.10/site-packages/pandas/core/reshape/merge.py", line 1125, in _get_join_indexers
return get_join_indexers(
File "/root/.pyenv/versions/3.10.20/lib/python3.10/site-packages/pandas/core/reshape/merge.py", line 1759, in get_join_indexers
lidx, ridx = get_join_indexers_non_unique(
File "/root/.pyenv/versions/3.10.20/lib/python3.10/site-packages/pandas/core/reshape/merge.py", line 1793, in get_join_indexers_non_unique
lkey, rkey, count = _factorize_keys(left, right, sort=sort)
File "/root/.pyenv/versions/3.10.20/lib/python3.10/site-packages/pandas/core/reshape/merge.py", line 2487, in _factorize_keys
.combine_chunks()
File "pyarrow/table.pxi", line 780, in pyarrow.lib.ChunkedArray.combine_chunks
File "pyarrow/array.pxi", line 5098, in pyarrow.lib.concat_arrays
File "pyarrow/error.pxi", line 155, in pyarrow.lib.pyarrow_internal_check_status
File "pyarrow/error.pxi", line 92, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: offset overflow while concatenating arrays, consider casting input from string to large_string first.
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
File "/workspace/CM20260505/main.py", line 31, in <module>
main()
File "/workspace/CM20260505/scoring/runner.py", line 277, in main
return _run_scorer(args=args, dataLoader=dataLoader, extraScoringArgs=extraScoringArgs)
File "/workspace/CM20260505/scoring/runner.py", line 230, in _run_scorer
scoredNotes, helpfulnessScores, newStatus, auxNoteInfo = run_scoring(
File "/workspace/CM20260505/scoring/run_scoring.py", line 2365, in run_scoring
) = run_prescoring(
File "/workspace/CM20260505/scoring/run_scoring.py", line 1461, in run_prescoring
prescoringModelResultsFromAllScorers = _run_scorers(
File "/workspace/CM20260505/scoring/run_scoring.py", line 547, in _run_scorers
modelResultsAndTimes = [f.result() for f in futures]
File "/workspace/CM20260505/scoring/run_scoring.py", line 547, in <listcomp>
modelResultsAndTimes = [f.result() for f in futures]
File "/root/.pyenv/versions/3.10.20/lib/python3.10/concurrent/futures/_base.py", line 458, in result
return self.__get_result()
File "/root/.pyenv/versions/3.10.20/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
raise self._exception
pyarrow.lib.ArrowInvalid: offset overflow while concatenating arrays, consider casting input from string to large_string first.

To Reproduce
The error can be reproduced by running the code at commit 20260505 against the data snapshot dated 2026-05-13.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions