Skip to content

Commit cc5d7e8

Browse files
Twinkle JainTwinkle Jain
authored andcommitted
CodeFlare resiliency tool: initial commit
1 parent a2b290a commit cc5d7e8

File tree

2 files changed

+386
-0
lines changed

2 files changed

+386
-0
lines changed

utils/README.md

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
*** This is a first draft of the README; we will soon update it with more and clearer text ***
2+
3+
## Introduction
4+
This Python script is the entry point where users specify the resiliency mode and the Ray version.
5+
It then generates a system config-map (system-cm.yaml) YAML file.
6+
The system config-map contains all relevant Ray system-level configurations for the Ray cluster.
7+
8+
## Background and Motivation
9+
Resiliency in Ray is limited, and it can be improved by tuning system-level configurations.
10+
Moreover, configurations can also be tuned to improve the performance of a Ray workload.
11+
However, Ray has a large set of configuration options available, and the count is increasing with each release.
12+
For example, Ray version 1.5.0 has 113 configuration options available. It is inconvenient for users to learn and tune each configuration manually. Therefore, we need a tool with which users can easily configure the Ray cluster.
13+
14+
## Example usage
15+
```
16+
python3 ray-res-config.py --ray-version <arg> --resiliency-mode <arg>
17+
```
18+
Use `--help` option for all available options.
19+
20+
## Tool features/options
21+
22+
23+
## TODOs
24+
- [x] Fetch Ray configs for 1.x.x versions from Github if not available locally and save locally
25+
- [x] Parse configs and dump them to a file to be edited later according to the resiliency mode
26+
- [x] If any config value is different than default value then add that config to the --system-config parameter
27+
- [x] Dump config in a yaml file named system-cm.yaml
28+
- [ ] Change hardcoded string to dynamic to dump in system-cm.yaml
29+
- [ ] Update format of system config string that works correctly
30+
- [ ] Segregate internal and external options
31+
- [ ] Spread code in multiple files
32+
- [ ] Extend to more Ray versions
33+
- [ ] Add try/except handling and checks to improve robustness
34+
- [ ] Add code to also dump Ray operator and cluster yamls
35+
- [ ] Give a sample cluster YAML to the user so they can edit cluster-specific configurations, e.g., CPU, number of workers
36+
- [ ] Add an example usage of the script
37+
- [ ] Test custom Ray build instead of Ray's official images

utils/ray-res-config.py

Lines changed: 349 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,349 @@
1+
import sys
2+
import os
3+
from typing import Dict
4+
import yaml
5+
import wget
6+
import re
7+
# import regex
8+
from parse import *
9+
from optparse import OptionParser
10+
11+
# Ray releases accepted by the --ray-version option and fetched by --fetch-all.
versions_list = [
    '1.0.0',
    '1.1.0',
    '1.2.0',
    '1.3.0',
    '1.4.0',
    '1.4.1',
    '1.5.0',
]

# Resiliency profiles accepted by the --resiliency-mode option.
res_modes_list = [
    'relaxed',
    'recommended',
    'strict',
    'custom',
]

# Parsed configurations, indexed as Ray_conf[ray_version][conf_name][field].
Ray_conf = {}

# Colour codes spliced into the ANSI escape sequence by print_colored.
FAIL = "91"
STATUS = "33"
WARNING = "93"
OK = "92"
INFO = "94"
22+
23+
##
24+
# Print the string given in parameter with color
25+
#
26+
def print_colored(color, text):
    """Print ``text`` wrapped in a bold ANSI colour escape sequence.

    ``color`` may be one of the names "FAIL", "OK", "WARNING", "STATUS" or
    "INFO", which are translated to the module-level colour codes.  Any other
    string is inserted into the escape sequence unchanged, which is how
    callers that pass the code constants directly are handled.
    """
    if color in ("FAIL", "OK", "WARNING", "STATUS", "INFO"):
        # Only resolve the module-level codes when a symbolic name was given.
        color = {
            "FAIL": FAIL,
            "OK": OK,
            "WARNING": WARNING,
            "STATUS": STATUS,
            "INFO": INFO,
        }[color]
    prefix = "\033[1;" + color + ";40m"
    print(prefix + text + "\033[0m")
38+
39+
# print supported ray versions and resiliency modes/profiles
40+
def print_ray_versions_and_modes():
    """Print the supported Ray versions followed by the resiliency profiles."""
    supported_versions = ', '.join(versions_list)
    print("Compatible Ray versions:")
    print(supported_versions)

    profiles_text = (
        "\n"
        "Available resiliency profiles/modes:\n"
        "1. strict: Resilience preferred\n"
        "2. relaxed: Performance preferred\n"
        "3. recommended: Balanced resilience and performance overhead\n"
        "4. custom: Define your own preference\n"
    )
    print(profiles_text)
51+
52+
# Generates .conf files in configs folder in the working directory
53+
# TODO: generate ray-version dir under configs directory for better file organization
54+
def dump_conf(ray_version, res_mode, overwrite, dirname='configs/'):
    """Write the parsed configs of one Ray version to an editable .conf file.

    The target is ``<dirname>ray-<ray_version>-<res_mode>.conf``.  It holds a
    one-line comment header followed by a YAML dump of
    ``Ray_conf[ray_version]``.

    Args:
        ray_version: Ray version string used as key into ``Ray_conf``.
        res_mode: Resiliency mode name, used only in the file name.
        overwrite: When truthy, an existing file is rewritten; otherwise an
            existing file is left untouched.
        dirname: Prefix prepended verbatim to the file name, so it must end
            with a path separator.
    """
    file_path = dirname + "ray-" + ray_version + "-" + res_mode + ".conf"
    if os.path.exists(file_path) and not overwrite:
        # Keep the user's edits unless an overwrite was requested.
        return
    # The context manager closes (and flushes) the file even when yaml.dump
    # raises; the previous implementation never closed the handle.
    with open(file_path, "w+") as fd:
        fd.write("# please edit value_for_this_mode to change any configuration\n")
        yaml.dump(Ray_conf[ray_version], fd)
62+
63+
# Que: is dumping in json format better or YAML?
64+
def read_conf(ray_version, res_mode, dirname='configs/'):
    """Load a .conf file and return the settings that differ from defaults.

    Reads ``<dirname>ray-<ray_version>-<res_mode>.conf``, skips its first
    line and parses the rest as YAML.  As a side effect the parsed mapping
    replaces ``Ray_conf[ray_version]``.

    Args:
        ray_version: Ray version string used in the file name and as the key
            into ``Ray_conf``.
        res_mode: Resiliency mode name used in the file name.
        dirname: Prefix prepended verbatim to the file name.

    Returns:
        A list of ``(conf_name, value_for_this_mode)`` tuples for every entry
        whose ``value_for_this_mode`` differs from its ``default``; ``None``
        when the file is not valid YAML (the error is printed).
    """
    file_path = dirname + "ray-" + ray_version + "-" + res_mode + ".conf"
    # Context manager so the handle is released on every path.
    with open(file_path, 'r') as fd:
        fd.readline()  # skip the first (header) line
        try:
            loaded = yaml.safe_load(fd)
        except yaml.YAMLError as exception:
            print(exception)
            return None

    # An empty document parses to None; treat it as "no configs".
    loaded = loaded or {}
    Ray_conf[ray_version] = loaded

    conf_list = [
        (conf_name, attrs['value_for_this_mode'])
        for conf_name, attrs in loaded.items()
        if attrs['default'] != attrs['value_for_this_mode']
    ]
    if not conf_list:
        print_colored(INFO, 'No change in default configurations!')
    return conf_list
84+
85+
86+
def put_conf(ray_version, conf_name, conf_type, conf_default, conf_env):
    """Record one parsed configuration under ``Ray_conf[ray_version]``.

    The stored entry has the keys ``type``, ``default``,
    ``value_for_this_mode`` (initialised to the default) and ``env``.
    Any previous entry for ``conf_name`` is replaced.
    """
    entry = {
        'type': conf_type,
        'default': conf_default,
        'value_for_this_mode': conf_default,
        'env': conf_env,
    }
    Ray_conf[ray_version][conf_name] = entry
93+
94+
def parse_ray_config(ray_version, sig_str, is_multiline):
    """Parse one ``RAY_CONFIG(...)`` signature and store it via ``put_conf``.

    Args:
        ray_version: Ray version the signature belongs to.
        sig_str: The signature text, starting with ``RAY_CONFIG(`` and with
            the type, name and default value separated by ``", "``.
        is_multiline: True when the signature was assembled from several
            source lines; only then is the default value inspected for an
            associated environment variable.

    Raises:
        ValueError: If ``sig_str`` does not match the ``RAY_CONFIG`` pattern.
    """
    parsed = parse("RAY_CONFIG({}, {}, {}", sig_str)
    if parsed is None:
        # Fail with the offending text instead of an opaque unpacking error.
        raise ValueError("Unrecognized RAY_CONFIG signature: %s" % sig_str)
    conf_type, conf_name, conf_default = parsed

    # Remove /* ... */ comments embedded in the default value.  A raw string
    # is used because "\*" is not a valid escape in a normal string literal.
    conf_default = re.sub(r'(?:/\*(.*?)\*/)', '', conf_default)
    conf_env = ''
    if is_multiline:
        # Get the default value and the associated env variable.
        if '?' in conf_default:
            # TODO: make the parsing conditions more general
            if 'RAY_preallocate_plasma_memory' in conf_default and ray_version == '1.5.0':
                conf_env = 'RAY_preallocate_plasma_memory'
                _, conf_default = parse('{}: {})', conf_default)
            else:
                _, conf_env, _, conf_default = parse('{} ? {} : {}("{}"))', conf_default)
        elif 'getenv' in conf_default:
            _, conf_env, is_eq, _, conf_default = parse(
                '{} getenv("{}") {} std::{}("{}"))', conf_default)
            if is_eq == "!=" and conf_type == "bool":
                # The comparison is negated, so the stored default is the
                # logical inverse of the compared literal.
                conf_default = str(not int(conf_default))
        elif 'env_' in conf_default:
            _, conf_env, conf_default = parse('{}("{}", {})', conf_default)
    # The RAY_CONFIG pattern leaves the closing parenthesis on the default.
    conf_default = conf_default.rstrip(')')
    # Access values like this: Ray_conf[ray_version][conf_name][type/default/env]
    put_conf(ray_version, conf_name, conf_type, conf_default, conf_env)
120+
121+
# for multi-line signatures
122+
def is_balanced_parenthesis(str_list):
    """Tell whether the strings together hold as many '(' as ')'.

    Only the counts are compared; the order of the parentheses is not
    checked.  Used to detect where a multi-line signature ends.
    """
    balance = 0
    for chunk in str_list:
        balance += chunk.count('(') - chunk.count(')')
    return balance == 0
132+
133+
def parse_config_file(config_file, ray_version):
    """Parse every ``RAY_CONFIG`` entry of a header file into ``Ray_conf``.

    ``Ray_conf[ray_version]`` is reset to an empty dict and then filled by
    ``parse_ray_config`` for each signature found.  Lines that do not start
    with ``RAY_CONFIG`` are ignored.  The process exits with status 1 when a
    signature line has an unexpected ending or never closes.

    Args:
        config_file: Path of the header file to read.
        ray_version: Ray version the header belongs to.
    """
    # initialize conf dict
    Ray_conf[ray_version] = dict()
    print("\nParsing configuration file: %s" % config_file)

    # One other way to parse these header files is to write a small C/C++
    # program that defines the macro before including the header file and
    # dumps into a file like following:
    #   #define RAY_CONFIG(type, name, default_value) \
    #     std::cout << #type << endl; \
    #     std::cout << #name << endl; \
    #     std::cout << #default_value << endl; \
    #     std::cout << "=====" << endl;
    #
    # Below is the regex for a single line config signature;
    # fix it to include multi-line function declarations as well,
    # then we'll not need the whole while loop:
    #   re.findall(r'^RAY_CONFIG\((.*)\)$', f.read(), flags=re.MULTILINE)

    # Read everything up front so the handle is closed before parsing.
    with open(config_file, 'r') as f:
        lines = f.readlines()

    n = len(lines)
    i = 0
    while i < n:
        # Strip all surrounding whitespace (tabs, "\r\n", trailing blanks).
        line = lines[i].strip()
        # FIXME: write a regex that can parse both single and multi-line signature
        if not line.startswith("RAY_CONFIG"):
            i += 1
            continue

        if line.endswith(','):
            # Multi-line signature: normally it includes the relevant
            # environment variable.  Extend the window until the
            # parentheses balance, never running past the end of the file.
            j = i + 1
            while j < n and not is_balanced_parenthesis(lines[i:j]):
                j += 1
            if not is_balanced_parenthesis(lines[i:j]):
                print("multi-line signature is never closed")
                print(line)
                sys.exit(1)
            # lines[i:j] is exactly the signature; the line after it must
            # neither be joined into the signature nor skipped.
            multiline_fun = ' '.join(map(str.strip, lines[i:j]))
            parse_ray_config(ray_version, multiline_fun, True)
            i = j
            continue

        if line.endswith((')', ');')):
            # The ");" form exists because one call in Ray 1.5.0 does not
            # follow the macro calling prototype of the other configs:
            #   RAY_CONFIG(uint32_t, agent_manager_retry_interval_ms, 1000);
            parse_ray_config(ray_version, line.rstrip(';'), False)
        else:
            print("neither ends with , nor with ) ")
            print(line)
            # Non-zero status: this is a failure, not a normal exit.
            sys.exit(1)
        i += 1
190+
191+
# total config count
192+
def total_config(ray_version):
    """Print how many configurations are stored for ``ray_version``."""
    config_count = len(Ray_conf[ray_version])
    print('Total configs in Ray version %s = %d' % (ray_version, config_count))
194+
195+
# fetch and parse ray configuration for each resiliency mode/profile
196+
def fetch_configs_from_git(ray_versions, res_modes, overwrite):
    """Fetch, parse and dump the configs of each requested Ray version.

    For every version the header is downloaded into the ``configs``
    directory unless a local copy already exists, parsed, counted, and one
    .conf file per resiliency mode is written through ``dump_conf``.

    Args:
        ray_versions: Iterable of Ray version strings.
        res_modes: Iterable of resiliency mode names.
        overwrite: Passed through to ``dump_conf``.
    """
    for version in ray_versions:
        out_dir = "configs"
        # create dir if not present
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        header_path = "%s/ray-%s-config-def.h" % (out_dir, version)
        # download from git only when there is no local copy
        if not os.path.exists(header_path):
            url = 'https://raw.githubusercontent.com/ray-project/ray/ray-%s/src/ray/common/ray_config_def.h' % version
            wget.download(url, out=header_path)

        parse_config_file(header_path, version)
        total_config(version)
        for mode in res_modes:
            dump_conf(version, mode, overwrite)
    print_colored(OK, "All conf files saved!\nDONE!")
213+
214+
# generate config json string for system-cm yaml
215+
def get_conf_string(conf_list):
    """Build the comma-separated ``"name":"value"`` body of the system config.

    Names and values are converted to strings and JSON-escaped, so quotes,
    backslashes and control characters in a value no longer break the JSON
    object the caller wraps around the result.  The string is also printed.

    Args:
        conf_list: Sequence of items whose first two elements are the
            configuration name and its value.

    Returns:
        The joined string, without surrounding braces; empty for an empty
        ``conf_list``.
    """
    import json  # local import: only this function needs it

    # TODO: check if the format works for all ray versions and ways of cluster deployment
    conf_str = ','.join(
        '%s:%s' % (
            json.dumps(str(item[0]), ensure_ascii=False),
            json.dumps(str(item[1]), ensure_ascii=False),
        )
        for item in conf_list
    )
    print('New configurations: %s' % conf_str)
    return conf_str
223+
224+
225+
def gen_system_conf(conf_list, verbose, name="system-config-json"):
    """Build the ConfigMap document that carries the system config string.

    The document is created directly as a dict rather than by formatting a
    YAML text and parsing it back, so a value containing a single quote or a
    newline can no longer corrupt the result.

    Args:
        conf_list: Configuration pairs handed to ``get_conf_string``.
        verbose: When truthy, print an extra header line.
        name: ``metadata.name`` of the ConfigMap; defaults to the previously
            hard-coded ``system-config-json``.

    Returns:
        A dict with the keys ``apiVersion``, ``data``, ``kind`` and
        ``metadata``; ``data.system_config`` holds the config string wrapped
        in braces.
    """
    if verbose:
        print("Version 1.4.0 specific configs")
    conf_string = get_conf_string(conf_list)
    return {
        'apiVersion': 'v1',
        'data': {'system_config': '{%s}' % conf_string},
        'kind': 'ConfigMap',
        'metadata': {'name': name},
    }
240+
241+
# print next steps on how to use generated system-cm.yaml
242+
# TODO: generalize next steps for different deploy strategies
243+
def print_next_steps():
    """Print the follow-up instructions for using the generated config map."""
    steps_text = (
        "\n"
        "1. Apply system-cm.yaml to your namespace. For openshift:\n"
        "`oc apply -f system-cm.yaml`\n"
        "\n"
        "2. Add SYSTEM_CONFIG enviornment variable to Ray's head node container that maps\n"
        "to system config map's name in your cluster yaml (See an example below):\n"
        "...\n"
        "containers:\n"
        "env:\n"
        "- name: SYSTEM_CONFIG\n"
        "valueFrom:\n"
        "configMapKeyRef:\n"
        "name: system-config-json\n"
        "key: system_config\n"
        "...\n"
        "...\n"
        "\n"
        "3. Specify SYSTEM_CONFIG env. var to head node's `ray start` command as following:\n"
        "ray start --head --system-config='$(SYSTEM_CONFIG)' ...\n"
        "\n"
        "4. Deploy Ray cluster\n"
    )
    print('------------')
    print_colored(INFO, 'NEXT STEPS:')
    print(steps_text)
268+
269+
#
270+
# TODO:
271+
# 1. finalize the configurations for each mode
272+
# 2. pass verbose to all functions to have all prints included in verbose
273+
#
274+
def generate_system_config_map(ray_version, res_mode, path, verbose):
    """Create ``system-cm.yaml`` in ``path`` for the given version and mode.

    Reads the non-default settings with ``read_conf``, turns them into a
    ConfigMap document with ``gen_system_conf``, writes that document as
    YAML and prints the next steps.  Exits with status 1 when the .conf file
    could not be parsed.

    Args:
        ray_version: Ray version whose .conf file is read.
        res_mode: Resiliency mode whose .conf file is read.
        path: Directory in which the YAML file is written.
        verbose: Passed through to ``gen_system_conf``.
    """
    # version specific configurations
    Ray_configs = read_conf(ray_version, res_mode)
    if Ray_configs is None:
        # read_conf already printed the YAML error; stop instead of failing
        # later with an unrelated exception.
        print_colored(FAIL, "Could not read configurations; no file generated.")
        sys.exit(1)
    sys_config_yaml = gen_system_conf(Ray_configs, verbose)

    # The file name matches the one used in the printed next steps
    # ("system-cm.yaml"); it used to be written as "system_cm.yaml".
    out_file = os.path.join(path, 'system-cm.yaml')
    with open(out_file, 'w+') as fd:
        yaml.dump(sys_config_yaml, fd, allow_unicode=True)
    print_colored(OK, "File saved! DONE!")
    print_next_steps()
282+
283+
284+
# Tool options
285+
def main(argv):
    """Command-line entry point of the resiliency configuration tool.

    Handles, in this order: ``--version``, ``--fetch-all`` and ``--list``
    (each exits with status 0), then validates the Ray version and the
    resiliency mode (exit status 1 when unsupported) and finally generates
    the system config map.

    Args:
        argv: Argument list without the program name.  ``None`` makes the
            parser fall back to ``sys.argv[1:]``.
    """
    parser = OptionParser(usage="ray-res-config.py --ray-version <arg> --resiliency-mode <arg>")
    parser.add_option("-r", "--ray-version", action="store", type="string",
                      default="1.4.0", dest="ray_version",
                      help="Ray version for the deployment (required). "
                           "e.g. 1.4.0 (default), 1.4.1")
    parser.add_option("-m", "--resiliency-mode", action="store", type="string",
                      default="recommended", dest="res_mode",
                      help="Fault-tolerance model for the deployment (required). "
                           "e.g. strict, recommended (default), relaxed")
    parser.add_option("-p", "--path", action="store", type="string",
                      default=".", dest="path",
                      help="Path to save final system config map yaml. "
                           "Otherwise, saves in $PWD (default)")
    # NOTE: the value of --name is parsed but not used anywhere below.
    parser.add_option("-n", "--name", action="store", type="string",
                      default="system-config-json", dest="name",
                      help="Name of the system config map. "
                           "default=system-config-json")

    parser.add_option("-f", "--fetch-all", action="store_true", dest='fetch_all',
                      help="Fetch configs for all from git. "
                           "Use -o to overwrite existing .conf files")
    parser.add_option("-o", "--overwrite", action="store_true", dest="overwrite",
                      help="Use with -f or --fetch-all to overwrite existing .conf files")
    parser.add_option("-l", "--list", action="store_true", dest='lists',
                      help="List compatible versions and resiliency modes")
    parser.add_option("--verbose", action="store_true",
                      help="Enable verbose execution mode")
    parser.add_option("--version", action="store_true",
                      help="Print version of the FT model")
    # Parse the arguments that were actually passed in instead of silently
    # re-reading sys.argv.
    (opts, args) = parser.parse_args(argv)

    # Que.: do we need this?
    if opts.version:
        print("CodeFlare Resiliency Tool v1.0")
        sys.exit(0)

    # fetch and parse all supported ray versions
    if opts.fetch_all:
        fetch_configs_from_git(versions_list, res_modes_list, opts.overwrite)
        sys.exit(0)

    # print list of supported ray versions and resiliency profiles
    if opts.lists:
        print_ray_versions_and_modes()
        sys.exit(0)

    # validate ray version; print list of supported versions if input is invalid
    if opts.ray_version not in versions_list:
        print_colored(FAIL, "Ray version %s not supported!" % opts.ray_version)
        print_ray_versions_and_modes()
        sys.exit(1)

    # validate resiliency profile/mode input (case insensitive)
    # print list of supported versions and modes if input is unsupported
    res_mode = opts.res_mode.lower()
    if res_mode not in res_modes_list:
        print_colored(FAIL, "Resiliency profile %s not supported!" % opts.res_mode)
        print_ray_versions_and_modes()
        sys.exit(1)

    # generate system configMap yaml file; the normalised mode is passed on
    # so that e.g. "Strict" resolves to the same .conf file as "strict"
    generate_system_config_map(opts.ray_version, res_mode, opts.path, opts.verbose)


if __name__ == "__main__":
    main(sys.argv[1:])

0 commit comments

Comments
 (0)