Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
125 commits
Select commit Hold shift + click to select a range
95546ca
[aDAG] Rename vars in compiled_dag_node (#47346)
ruisearch42 Aug 27, 2024
679989c
[aDAG] Minor cleanup of accelerated_dag_gpu_microbenchmark (#47353)
ruisearch42 Aug 27, 2024
e32eb38
[doc][ml] add missing tune public API references + api policy lint ch…
can-anyscale Aug 27, 2024
68811a5
[Core] No actor creation task retry on core worker (#47122)
jjyao Aug 27, 2024
c67fb76
[RLlib] Fix large batch size for synchronous algos after EnvRunner fa…
sven1977 Aug 27, 2024
4aea49f
[docker] Update latest Docker dependencies for 2.35.0 release (#47373)
khluu Aug 28, 2024
e164d87
[RLlib] Add `reset()` method to `MetricsLogger`. (#47382)
sven1977 Aug 28, 2024
57410d8
[doc] instruction for troubleshooting issues with _raylet.so (#47369)
can-anyscale Aug 28, 2024
61f8790
[Serve] fix grpc performance issue (#47338)
GeneDer Aug 28, 2024
d9d74fa
[Core] Fix ray_unintentional_worker_failures_total to only count unin…
jjyao Aug 28, 2024
e043a03
[Data] Get AWS credentials with boto (#47352)
scottjlee Aug 28, 2024
d425f31
[observability][export-api] Write node events (#47221)
nikitavemuri Aug 28, 2024
8a1fe6e
[core] Add from_binary and from_hex to all IDs. (#47375)
rynewang Aug 28, 2024
a92f3b4
[RLlib] Cleanup examples folder (vol 23): Float16 training support an…
sven1977 Aug 29, 2024
811f98e
[core][dashboard] Update nodes on delta. (#47367)
rynewang Aug 29, 2024
751dbb1
[RLlib] Cleanup examples folder (vol 24): Mixed-precision training (a…
sven1977 Aug 29, 2024
f0f29bd
Split python/ray/tests/test_actor_retry over two files (#47188)
mattip Aug 29, 2024
ccd3ba5
[RLlib; Offline RL] - Enable reading old-stack `SampleBatch` data in …
simonsays1980 Aug 29, 2024
9e6b89f
[serve] redeploy in between each microbenchmark (#47404)
zcin Aug 29, 2024
ce0e2c7
Revert "[observability][export-api] Write node events" (#47405)
can-anyscale Aug 29, 2024
db735da
[Data] Remove limit on number of tasks launched per scheduling step (…
bveeramani Aug 29, 2024
baa42a1
Update handling-dependencies.rst (#47306)
rmcsqrd Aug 29, 2024
5062bd5
[doc] Instruction for troubleshooting side nav when building incremen…
khluu Aug 29, 2024
7cdf126
[Doc] Run pre-commit on cluster docs (#47342)
peytondmurray Aug 29, 2024
9fab1d6
[ADAG] Fix microbenchmark_unstable (#47398)
kevin85421 Aug 29, 2024
f0a81a6
[Data] Allow User defined Exception to be caught. (#47339)
Bye-legumes Aug 30, 2024
3c950a1
[RLlib] Examples folder cleanup: ModelV2 -> RLModule wrapper for migr…
sven1977 Aug 30, 2024
d1f21a5
[RLlib] Remove 2nd Learner ConnectorV2 pass from PPO (add new GAE Con…
sven1977 Aug 30, 2024
a967a35
[core][dashboard] TPE for state_aggregator.py (#47392)
rynewang Aug 30, 2024
c03aeff
[Data] Fix bug where `arrow_parquet_args` aren't used (#47161)
bveeramani Aug 30, 2024
eedb407
[RLlib; Offline RL] CQL: Support multi-GPU/CPU setup and different le…
simonsays1980 Aug 30, 2024
c9c150a
[aDAG] Support multi-read of the same shm channel (#47311)
ruisearch42 Aug 30, 2024
00282f6
[data] add a comment explaining the bundling behavior for map_batches…
raulchen Aug 31, 2024
63247f3
[RLlib; Offline RL] Add cloud filesystems to offline data input argum…
simonsays1980 Aug 31, 2024
738be6a
[serve] Fix broken microbenchmarks (#47430)
zcin Aug 31, 2024
9a744a6
[core][experimental] Raise an exception if a DAG is compiled twice (#…
kevin85421 Sep 2, 2024
80e832c
[Doc] Add Algolia search to docs (#46477)
peytondmurray Sep 2, 2024
df552cd
[ADAG] Support tasks with multiple return values in aDAG (#47024)
dengwxn Sep 2, 2024
eda6d09
[Train] Decouple device-related modules and add Huawei NPU support to…
liuxsh9 Sep 3, 2024
e1ed103
[RLlib] Add gradient checks to avoid `nan` gradients in `TorchLearner…
simonsays1980 Sep 3, 2024
e147e31
[RLlib] Add option to use `torch.lr_scheduler` classes for learning r…
simonsays1980 Sep 3, 2024
d4a52ea
[observability][export-api] Write node events (#47422)
nikitavemuri Sep 3, 2024
d1a0f99
Retry transient network errors in WebDatasetDatasource (#46892)
BitPhinix Sep 3, 2024
948c901
Add support for 'container' and 'env_vars' in runtime_env (#42121)
zcin Sep 3, 2024
d81f4d8
[Data] Remove dead `from_*_operator` modules (#47457)
bveeramani Sep 3, 2024
cefedcf
[Doc][KubeRay] Update PyTorch Mnist Training doc for KubeRay 1.2.0 (#…
MortalHappiness Sep 4, 2024
158a75f
[RLlib] - Add example for PyTorch lr schedulers. (#47454)
simonsays1980 Sep 4, 2024
354c8b3
[RLlib] Examples folder cleanup: ModelV2 -> RLModule wrapper for migr…
sven1977 Sep 4, 2024
9ace5e2
[serve] add streaming to microbenchmarks (#47466)
zcin Sep 4, 2024
2414c0c
[Data] Skip empty JSON files in `read_json()` (#47378)
venkatram-dev Sep 4, 2024
a615e55
feat: quickstart install button (#47479)
saihaj Sep 4, 2024
2886474
feat: introduce Ray Core quickstart button (#47477)
saihaj Sep 4, 2024
9437c5d
fix: update quickstart button link to have `redirectTo` (#47417)
saihaj Sep 4, 2024
e75f40a
Revert "[Doc] Add Algolia search to docs" (#47483)
can-anyscale Sep 4, 2024
67d251d
[Data] Remove remote call for initializing `Datasource` in `read_data…
scottjlee Sep 4, 2024
5463f5b
[release] simplify the process of getting job logs (#47470)
can-anyscale Sep 4, 2024
9781c6c
[Core] Fix runtime env race condition when uploading the same package…
jjyao Sep 4, 2024
e9f7930
[core][dashboard][agent] add configurable timeouts for rt env agent a…
rynewang Sep 4, 2024
e19fbd3
[core][dashboard] Pass in cluster ID in hex for dashboard, dash agent…
rynewang Sep 5, 2024
03a387c
[core][experimental] Correct `num_input_consumers` for CachedChannel …
kevin85421 Sep 5, 2024
3beea70
Revert Revert "[Doc] Add Algolia search to docs" (#47487)
can-anyscale Sep 5, 2024
bf0ddb4
[observability][export-api] Add event data schema for submission and …
nikitavemuri Sep 5, 2024
99af819
[observability][export-api] Write actor events (#47303)
nikitavemuri Sep 5, 2024
ca85922
[serve] remove outdated microbenchmark release test (#47502)
zcin Sep 5, 2024
726b636
[ADAG] Log Executable Task Events (#47345)
woshiyyya Sep 5, 2024
d6cb796
[Core] Introduce env var RAY_LOG_TO_DRIVER (#47495)
jjyao Sep 5, 2024
3d4ea77
[Core] Fix test_runtime_env_working_dir_4 for Windows (#47505)
jjyao Sep 5, 2024
0392390
feat: add get started free badge (#47503)
saihaj Sep 5, 2024
c95318e
[observability][export-api] Write task events (#47193)
nikitavemuri Sep 5, 2024
d8a85c5
[Core] Improve NodeManager::HandleJobStarted log message (#47510)
jjyao Sep 5, 2024
6bcce32
[Core] Fix aws_cluster_launcher_full (#47512)
jjyao Sep 6, 2024
f336f51
Revert "[observability][export-api] Write actor events" (#47516)
can-anyscale Sep 6, 2024
fcef11c
Revert "[observability][export-api] Write task events" (#47536)
can-anyscale Sep 6, 2024
c618a9d
fix quickstart image path (#47535)
saihaj Sep 6, 2024
5c70d96
[RLlib; Off-policy] Add episode sampling to `EpisodeReplayBuffer`. (#…
simonsays1980 Sep 6, 2024
bbeee55
[aDAG] Allow custom NCCL group for aDAG (#47141)
ruisearch42 Sep 6, 2024
8bfed0e
[serve] separate test deployment version into unit and non-unit tests…
zcin Sep 6, 2024
53f6408
[aDAG] Fix test_accelerated_dag regression (#47543)
ruisearch42 Sep 6, 2024
542f51a
[Data] Remove ineffective retry code in `plan_read_op` (#47456)
bveeramani Sep 7, 2024
ec401cf
[Data] Remove unused `requirements_legacy_compat.txt` requirements fi…
bveeramani Sep 7, 2024
d471564
adding run quickstart button to ray serve stable diffusion tutorial (…
chris-ray-zhang Sep 8, 2024
3e8dd0d
add quickstart button to model serving code snippet (#47545)
chris-ray-zhang Sep 8, 2024
1dd8d60
[Core] Remove ray._raylet.check_health (#47526)
jjyao Sep 9, 2024
d3c0708
[observability][export-api] Write actor events (#47529)
nikitavemuri Sep 9, 2024
5e61ae4
[observability][export-api] Write task events (#47538)
nikitavemuri Sep 9, 2024
7648e76
[tune][ci] Fix flaky `test_controller_resume_integration` test suite …
justinvyu Sep 9, 2024
5e2d73d
[RLlib; Offline RL] - Replace GAE in `MARWILOfflinePreLearner` with `…
simonsays1980 Sep 9, 2024
dff202f
[Doc] Run pre-commit on data docs (#47341)
peytondmurray Sep 9, 2024
bbc4c22
[release auto] upgrade docker py library (#47566)
aslonnie Sep 9, 2024
e4513e5
[data] Change fixture from `shutdown_only` to `ray_start_regular_shar…
omatthew98 Sep 9, 2024
866c5da
[release auto] convert release blockers to list (#47567)
aslonnie Sep 9, 2024
27f505d
Add perf metrics for 2.35.0 (#47283)
khluu Sep 9, 2024
feb21c9
[Data] Remove ordering assumption and unnecessary `override_num_block…
bveeramani Sep 10, 2024
0773760
[Core] Reconstruct actor to run lineage reconstruction triggered acto…
jjyao Sep 10, 2024
f40313b
[aDAG] support buffered input (#47272)
rkooo567 Sep 10, 2024
6f2af2c
[aDAG] Clean up arg_to_consumers in _get_or_compile() (#47514)
ruisearch42 Sep 10, 2024
aa7179a
[RLlib; Offline RL] Store episodes in state form. (#47294)
simonsays1980 Sep 10, 2024
57136b5
[Core][aDag] Support multi node multi reader (#47480)
rkooo567 Sep 10, 2024
ac7face
Allow control of some serve configuration via env vars (#47533)
timkpaine Sep 10, 2024
21c3248
Update incremental build troubleshooting tip with style nits (#47592)
angelinalg Sep 10, 2024
d655fcc
[Core] Mark single_node_oom release test as unstable (#47591)
ruisearch42 Sep 10, 2024
2eea1d6
[observability][export-api] Write driver job events (#47418)
nikitavemuri Sep 10, 2024
b1e12c8
[core][dashboard] push down job_or_submission_id to GCS. (#47492)
rynewang Sep 11, 2024
8e62414
[Doc][KubeRay] Add description tables for RayCluster Status in the ob…
rueian Sep 11, 2024
b6ca703
[Docs][KubeRay] add a guide for deploying vLLM with RayService (#47038)
andrewsykim Sep 11, 2024
53813e5
[dashboard] fix syntax warnings on slashes (#47595)
aslonnie Sep 11, 2024
fe4cc0a
[aDAG] Fix ranks ordering for custom NCCL group (#47594)
ruisearch42 Sep 11, 2024
af9fa13
[aDAG] Fix type hint for _build_execution_schedule() (#47569)
ruisearch42 Sep 11, 2024
7cb95ce
[RLlib] RLModule: `InferenceOnlyAPI`. (#47572)
sven1977 Sep 11, 2024
12e1325
[Data] Remove `_default_metadata_providers` (#47575)
bveeramani Sep 11, 2024
6812c6e
[Serve] Remove unused Serve constants (#47593)
GeneDer Sep 11, 2024
623bc80
Fix windows://:task_event_buffer_test (#47577)
nikitavemuri Sep 11, 2024
f422376
[RLlib] RLModule API: `SelfSupervisedLossAPI` for RLModules that brin…
sven1977 Sep 11, 2024
03ab9b3
[data] Change counts of metrics to rates of metrics (#47236)
omatthew98 Sep 11, 2024
e75f5e7
[RLlib] Add restart-failed-env option to new api stack. (#47608)
sven1977 Sep 11, 2024
29a2a91
[Train] Update run status and add stack trace to `TrainRunInfo` (#46875)
woshiyyya Sep 11, 2024
432dbce
[GCS] Optimize `GetAllJobInfo` API for performance (#47530)
liuxsh9 Sep 11, 2024
74f29fc
[Serve] fix default serve logger behavior (#47600)
GeneDer Sep 11, 2024
029ff4d
[Data] Fix incorrect pending task size if outputs are empty (#47604)
bveeramani Sep 11, 2024
7910a5e
[core] Make is_gpu, is_actor, root_detached_id fields late bind to wo…
rynewang Sep 11, 2024
a6f923b
[core][adag] Separate the outputs of execute and execute_async to mul…
jeffreyjeffreywang Sep 11, 2024
44dd9a7
[Data] Throw exception for non-streaming HF datasets with "override_n…
xingyu-long Sep 12, 2024
c4983cc
add backpressure
Jay-ju Sep 10, 2024
f48f821
[serve] Faster detection of dead replicas (#47237)
zcin Sep 12, 2024
5c09215
Merge branch 'master' into add_backpressure_reason
Jay-ju Sep 12, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
1 change: 1 addition & 0 deletions .buildkite/lint.rayci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ steps:
- api_policy_check serve
- api_policy_check data
- api_policy_check train
- api_policy_check tune
- api_policy_check rllib

- label: ":lint-roller: lint: linkcheck"
Expand Down
35 changes: 35 additions & 0 deletions BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -956,6 +956,21 @@ ray_cc_test(
],
)

ray_cc_test(
name = "task_event_buffer_export_event_test",
size = "small",
srcs = ["src/ray/core_worker/test/task_event_buffer_export_event_test.cc"],
tags = [
"team:core",
"no_windows"
],
deps = [
":core_worker_lib",
":ray_mock",
"@com_google_googletest//:gtest_main",
],
)

ray_cc_test(
name = "actor_creator_test",
size = "small",
Expand Down Expand Up @@ -991,6 +1006,7 @@ ray_cc_test(
tags = ["team:core"],
deps = [
":core_worker_lib",
":ray_mock",
"@com_google_googletest//:gtest_main",
],
)
Expand Down Expand Up @@ -2344,6 +2360,25 @@ ray_cc_test(
],
)

ray_cc_test(
name = "gcs_export_event_test",
size = "small",
srcs = glob([
"src/ray/gcs/gcs_server/test/export_api/*.cc",
]),
tags = [
"no_windows",
"team:core"
],
deps = [
":gcs_server_lib",
":gcs_server_test_util",
":gcs_test_util_lib",
":ray_mock",
"@com_google_googletest//:gtest_main",
],
)

flatbuffer_cc_library(
name = "node_manager_fbs",
srcs = ["src/ray/raylet/format/node_manager.fbs"],
Expand Down
3 changes: 3 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
.. image:: https://img.shields.io/twitter/follow/raydistributed.svg?style=social&logo=twitter
:target: https://twitter.com/raydistributed

.. image:: https://img.shields.io/badge/Get_started_for_free-3C8AE9?logo=data%3Aimage%2Fpng%3Bbase64%2CiVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8%2F9hAAAAAXNSR0IArs4c6QAAAERlWElmTU0AKgAAAAgAAYdpAAQAAAABAAAAGgAAAAAAA6ABAAMAAAABAAEAAKACAAQAAAABAAAAEKADAAQAAAABAAAAEAAAAAA0VXHyAAABKElEQVQ4Ea2TvWoCQRRGnWCVWChIIlikC9hpJdikSbGgaONbpAoY8gKBdAGfwkfwKQypLQ1sEGyMYhN1Pd%2B6A8PqwBZeOHt%2FvsvMnd3ZXBRFPQjBZ9K6OY8ZxF%2B0IYw9PW3qz8aY6lk92bZ%2BVqSI3oC9T7%2FyCVnrF1ngj93us%2B540sf5BrCDfw9b6jJ5lx%2FyjtGKBBXc3cnqx0INN4ImbI%2Bl%2BPnI8zWfFEr4chLLrWHCp9OO9j19Kbc91HX0zzzBO8EbLK2Iv4ZvNO3is3h6jb%2BCwO0iL8AaWqB7ILPTxq3kDypqvBuYuwswqo6wgYJbT8XxBPZ8KS1TepkFdC79TAHHce%2F7LbVioi3wEfTpmeKtPRGEeoldSP%2FOeoEftpP4BRbgXrYZefsAI%2BP9JU7ImyEAAAAASUVORK5CYII%3D
:target: https://console.anyscale.com/register/ha?utm_source=github&utm_medium=ray_readme&utm_campaign=get_started_badge

Ray is a unified framework for scaling AI and Python applications. Ray consists of a core distributed runtime and a set of AI libraries for simplifying ML compute:

.. image:: https://github.com/ray-project/ray/raw/master/doc/source/images/what-is-ray-padded.svg
Expand Down
17 changes: 11 additions & 6 deletions ci/ray_ci/automation/weekly_green_metric.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,17 @@ def main(production: bool, check: bool) -> None:
)
logger.info("Weekly green metric updated successfully")

if check and blockers.totalCount != 0:
print(
f"Found {blockers.totalCount} release blockers.",
file=sys.stderr,
)
sys.exit(42) # Not retrying the check on Buildkite jobs
if check:
if len(blockers) > 0:
print(
f"Found {len(blockers)} release blockers.",
file=sys.stderr,
)
for issue in blockers:
print(f"{issue.html_url} - {issue.title}", file=sys.stderr)
sys.exit(42) # Not retrying the check on Buildkite jobs
else:
print("No release blockers. Woohoo!", file=sys.stderr)


if __name__ == "__main__":
Expand Down
3 changes: 2 additions & 1 deletion ci/ray_ci/core.tests.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
flaky_tests:
- windows://:metric_exporter_grpc_test
- windows://python/ray/tests:test_actor_retry
- windows://python/ray/tests:test_actor_retry1
- windows://python/ray/tests:test_actor_retry2
- windows://python/ray/tests:test_object_spilling
- windows://python/ray/tests:test_object_spilling_asan
- windows://python/ray/tests:test_object_spilling_debug_mode
Expand Down
6 changes: 5 additions & 1 deletion ci/ray_ci/doc/build_cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def __init__(self, cache_dir: str):
"""
self._cache_dir = cache_dir

def upload(self) -> None:
def upload(self, dry_run: bool) -> None:
"""
Upload the build artifacts to S3
"""
Expand All @@ -40,6 +40,10 @@ def upload(self) -> None:
logger.info("Creating a tarball of the cache files.")
doc_tarball = self._zip_cache(cache_files)

if dry_run:
logger.info(f"Skipping upload of {doc_tarball} to S3.")
return

logger.info("Upload the tarball to S3.")
self._upload_cache(doc_tarball)

Expand Down
7 changes: 4 additions & 3 deletions ci/ray_ci/doc/cmd_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,23 +28,24 @@ def main(ray_checkout_dir: str) -> None:
logger.info("Building ray doc.")
_build(ray_checkout_dir)

dry_run = False
if (
os.environ.get("BUILDKITE_PIPELINE_ID")
not in get_global_config()["ci_pipeline_postmerge"]
):
dry_run = True
logger.info(
"Not uploading build artifacts because this is not a postmerge pipeline."
)
return

if os.environ.get("BUILDKITE_BRANCH") != "master":
dry_run = True
logger.info(
"Not uploading build artifacts because this is not the master branch."
)
return

logger.info("Uploading build artifacts to S3.")
BuildCache(os.path.join(ray_checkout_dir, "doc")).upload()
BuildCache(os.path.join(ray_checkout_dir, "doc")).upload(dry_run=dry_run)

return

Expand Down
10 changes: 10 additions & 0 deletions ci/ray_ci/doc/cmd_check_api_discrepancy.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,16 @@
"head_doc_file": "doc/source/train/api/api.rst",
"white_list_apis": {},
},
"tune": {
"head_modules": {"ray.tune"},
"head_doc_file": "doc/source/tune/api/api.rst",
"white_list_apis": {
# Already documented as ray.tune.search.ConcurrencyLimiter
"ray.tune.search.searcher.ConcurrencyLimiter",
# TODO(ml-team): deprecate these APIs
"ray.tune.utils.log.Verbosity",
},
},
"rllib": {
"head_modules": {"ray.rllib"},
"head_doc_file": "doc/source/rllib/package_ref/index.rst",
Expand Down
5 changes: 3 additions & 2 deletions doc/requirements-doc.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ setuptools>=70.0.0
Pygments==2.16.1

# Sphinx
sphinx==7.1.2
sphinx==7.3.7
sphinx-click==5.1.0
sphinx-copybutton==0.5.2
sphinxemoji==0.2.0
Expand All @@ -20,6 +20,7 @@ sphinx-autobuild==2024.4.16
pydata-sphinx-theme==0.14.1
autodoc_pydantic==2.2.0
appnope
sphinx-docsearch==0.0.7

pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3

Expand All @@ -36,5 +37,5 @@ urllib3 < 1.27
# External dependencies such as ML libraries should be mocked out, not added here.
# See doc/source/conf.py for examples of how to mock out external dependencies.
click==8.1.7
boto3==1.35.2
boto3==1.34.69
requests==2.32.3
Loading