From ee86d7623a19b20eda166ccbb25468bb2ca9977d Mon Sep 17 00:00:00 2001 From: Tobin Baker Date: Mon, 4 May 2015 10:41:13 -0700 Subject: [PATCH 1/9] Adding optional args to load() function. --- examples/load_opts.myl | 2 ++ raco/algebra.py | 13 ++++++++----- raco/language/myrialang.py | 13 ++++++++++++- raco/myrial/interpreter.py | 4 ++-- raco/myrial/parser.py | 24 ++++++++++++++++++++++-- 5 files changed, 46 insertions(+), 10 deletions(-) create mode 100644 examples/load_opts.myl diff --git a/examples/load_opts.myl b/examples/load_opts.myl new file mode 100644 index 00000000..077cd4f6 --- /dev/null +++ b/examples/load_opts.myl @@ -0,0 +1,2 @@ +t = load("https://s3-us-west-2.amazonaws.com/myria/public-adhoc-TwitterK.csv", column0:int, column1:int; skip="1"); +store(t, TwitterK2); diff --git a/raco/algebra.py b/raco/algebra.py index a7c598e4..75c2f370 100644 --- a/raco/algebra.py +++ b/raco/algebra.py @@ -1266,15 +1266,17 @@ class FileScan(ZeroaryOperator): """Load table data from a file.""" - def __init__(self, path=None, _scheme=None): + def __init__(self, path=None, _scheme=None, options={}): self.path = path self._scheme = _scheme + self.options = options ZeroaryOperator.__init__(self) def __eq__(self, other): return (ZeroaryOperator.__eq__(self, other) and self.path == other.path - and self.scheme() == other.scheme()) + and self.scheme() == other.scheme() + and self.options == other.options) def __hash__(self): return ("%s-%s" % (self.opname(), self.path)).__hash__() @@ -1283,9 +1285,10 @@ def shortStr(self): return "%s(%s)" % (self.opname(), self.path) def __repr__(self): - return "{op}({path!r}, {sch!r})".format(op=self.opname(), + return "{op}({path!r}, {sch!r}, {opt!r})".format(op=self.opname(), path=self.path, - sch=self._scheme) + sch=self._scheme, + opt=self.options) def num_tuples(self): raise NotImplementedError("{op}.num_tuples".format(op=type(self))) @@ -1294,7 +1297,7 @@ def copy(self, other): """deep copy""" self.path = other.path self._scheme = other._scheme - + self.options = other.options ZeroaryOperator.copy(self, other) def scheme(self): diff --git a/raco/language/myrialang.py b/raco/language/myrialang.py index e607a579..1c0a5657 100644 --- a/raco/language/myrialang.py +++ b/raco/language/myrialang.py @@ -173,6 +173,16 @@ def compileme(self): "table": self.name, } +class MyriaFileScan(algebra.FileScan, MyriaOperator): + def compileme(self): + return dict({ + "opType": "FileScan", + "source": { + "dataType": "URI", + "uri": self.path, + }, + "schema": scheme_to_schema(self.scheme()) + }, **self.options) class MyriaLimit(algebra.Limit, MyriaOperator): def compileme(self, inputid): @@ -1482,6 +1492,7 @@ def fire(self, expr): rules.OneToOne(algebra.NaryJoin, MyriaLeapFrogJoin), rules.OneToOne(algebra.Scan, MyriaScan), rules.OneToOne(algebra.ScanTemp, MyriaScanTemp), + rules.OneToOne(algebra.FileScan, MyriaFileScan), rules.OneToOne(algebra.SingletonRelation, MyriaSingleton), rules.OneToOne(algebra.EmptyRelation, MyriaEmptyRelation), rules.OneToOne(algebra.UnionAll, MyriaUnionAll), @@ -1513,6 +1524,7 @@ class MyriaAlgebra(Algebra): MyriaHyperShuffleConsumer, MyriaScan, MyriaScanTemp, + MyriaFileScan, MyriaEmptyRelation, MyriaSingleton ) @@ -1766,7 +1778,6 @@ def compile_to_json(raw_query, logical_plan, physical_plan, # raw_query must be a string if not isinstance(raw_query, basestring): raise ValueError("raw query must be a string") - return {"rawQuery": raw_query, "logicalRa": str(logical_plan), "language": language, diff --git a/raco/myrial/interpreter.py b/raco/myrial/interpreter.py index 772eb657..2fd3dd0b 100644 --- a/raco/myrial/interpreter.py +++ b/raco/myrial/interpreter.py @@ -97,8 +97,8 @@ def scan(self, rel_key): return raco.algebra.Scan(rel_key, scheme, self.catalog.num_tuples(rel_key)) - def load(self, path, scheme): - return raco.algebra.FileScan(path, scheme) + def load(self, path, scheme, options): + return raco.algebra.FileScan(path, scheme, options) def table(self, emit_clause): """Emit a single-row table literal.""" diff --git a/raco/myrial/parser.py b/raco/myrial/parser.py index 9acc3ee8..7d77007c 100644 --- a/raco/myrial/parser.py +++ b/raco/myrial/parser.py @@ -555,8 +555,13 @@ def p_expression_scan(p): @staticmethod def p_expression_load(p): - 'expression : LOAD LPAREN STRING_LITERAL COMMA column_def_list RPAREN' - p[0] = ('LOAD', p[3], scheme.Scheme(p[5])) + """expression : LOAD LPAREN STRING_LITERAL COMMA column_def_list RPAREN + | LOAD LPAREN STRING_LITERAL COMMA column_def_list SEMI option_list RPAREN""" + if len(p) == 9: + opts = dict(p[7]) + else: + opts = {} + p[0] = ('LOAD', p[3], scheme.Scheme(p[5]), opts) @staticmethod def p_relation_key(p): @@ -581,6 +586,21 @@ def p_column_def(p): 'column_def : unreserved_id COLON type_name' p[0] = (p[1], p[3]) + @staticmethod + def p_option_list(p): + """option_list : option_list COMMA option + | option""" + if len(p) == 4: + opts = p[1] + [p[3]] + else: + opts = [p[1]] + p[0] = opts + + @staticmethod + def p_option(p): + 'option : unreserved_id EQUALS string_arg' + p[0] = (p[1], p[3]) + @staticmethod def p_type_name(p): """type_name : STRING From 0dfd6c992233ea4577b68c34dbed84b83f35baf8 Mon Sep 17 00:00:00 2001 From: Tobin Baker Date: Mon, 4 May 2015 11:57:12 -0700 Subject: [PATCH 2/9] Fixing formatting. --- raco/algebra.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/raco/algebra.py b/raco/algebra.py index 75c2f370..a46b77a4 100644 --- a/raco/algebra.py +++ b/raco/algebra.py @@ -1285,10 +1285,11 @@ def shortStr(self): return "%s(%s)" % (self.opname(), self.path) def __repr__(self): - return "{op}({path!r}, {sch!r}, {opt!r})".format(op=self.opname(), - path=self.path, - sch=self._scheme, - opt=self.options) + return "{op}({path!r}, {sch!r}, {opt!r})".format( + op=self.opname(), + path=self.path, + sch=self._scheme, + opt=self.options) def num_tuples(self): raise NotImplementedError("{op}.num_tuples".format(op=type(self))) From 41e290e0bf6d56fd334c6725b15fd23fb370b884 Mon Sep 17 00:00:00 2001 From: Tobin Baker Date: Mon, 4 May 2015 11:58:06 -0700 Subject: [PATCH 3/9] Supporting all literal types in optional args. --- examples/load_opts.myl | 2 +- raco/myrial/parser.py | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/examples/load_opts.myl b/examples/load_opts.myl index 077cd4f6..d62ad249 100644 --- a/examples/load_opts.myl +++ b/examples/load_opts.myl @@ -1,2 +1,2 @@ -t = load("https://s3-us-west-2.amazonaws.com/myria/public-adhoc-TwitterK.csv", column0:int, column1:int; skip="1"); +t = load("https://s3-us-west-2.amazonaws.com/myria/public-adhoc-TwitterK.csv", column0:int, column1:int; skip=1); store(t, TwitterK2); diff --git a/raco/myrial/parser.py b/raco/myrial/parser.py index 7d77007c..d3655f98 100644 --- a/raco/myrial/parser.py +++ b/raco/myrial/parser.py @@ -598,9 +598,18 @@ def p_option_list(p): @staticmethod def p_option(p): - 'option : unreserved_id EQUALS string_arg' + 'option : unreserved_id EQUALS literal_arg' p[0] = (p[1], p[3]) + @staticmethod + def p_literal_arg(p): + """literal_arg : STRING_LITERAL + | INTEGER_LITERAL + | FLOAT_LITERAL + | TRUE + | FALSE""" + p[0] = p[1] + @staticmethod def p_type_name(p): """type_name : STRING From 9ef5d5c426f923ab322b5569c22f482d95953614 Mon Sep 17 00:00:00 2001 From: Tobin Baker Date: Mon, 4 May 2015 13:20:52 -0700 Subject: [PATCH 4/9] Fixing tests. --- raco/fakedb.py | 3 +++ raco/language/myrialang.py | 2 ++ raco/myrial/parser.py | 2 +- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/raco/fakedb.py b/raco/fakedb.py index 81d5bcb6..e1821c63 100644 --- a/raco/fakedb.py +++ b/raco/fakedb.py @@ -354,6 +354,9 @@ def scantemp(self, op): def myriascan(self, op): return self.scan(op) + def myriafilescan(self, op): + return self.filescan(op) + def myriasink(self, op): return self.sink(op) diff --git a/raco/language/myrialang.py b/raco/language/myrialang.py index 1c0a5657..5614d2ad 100644 --- a/raco/language/myrialang.py +++ b/raco/language/myrialang.py @@ -173,6 +173,7 @@ def compileme(self): "table": self.name, } + class MyriaFileScan(algebra.FileScan, MyriaOperator): def compileme(self): return dict({ @@ -184,6 +185,7 @@ def compileme(self): "schema": scheme_to_schema(self.scheme()) }, **self.options) + class MyriaLimit(algebra.Limit, MyriaOperator): def compileme(self, inputid): return { diff --git a/raco/myrial/parser.py b/raco/myrial/parser.py index d3655f98..da2d57ab 100644 --- a/raco/myrial/parser.py +++ b/raco/myrial/parser.py @@ -556,7 +556,7 @@ def p_expression_scan(p): @staticmethod def p_expression_load(p): """expression : LOAD LPAREN STRING_LITERAL COMMA column_def_list RPAREN - | LOAD LPAREN STRING_LITERAL COMMA column_def_list SEMI option_list RPAREN""" +| LOAD LPAREN STRING_LITERAL COMMA column_def_list SEMI option_list RPAREN""" if len(p) == 9: opts = dict(p[7]) else: From 4ac32395018e3159d8b9e12cdc4fa91295b70dab Mon Sep 17 00:00:00 2001 From: Tobin Baker Date: Mon, 4 May 2015 14:14:13 -0700 Subject: [PATCH 5/9] Excluding generated files from lint. --- raco/test_style.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/raco/test_style.py b/raco/test_style.py index dd0d8e08..b1ec61c7 100644 --- a/raco/test_style.py +++ b/raco/test_style.py @@ -21,7 +21,7 @@ class StyleTest(unittest.TestCase): def test_style(self): "run flake8 with the right arguments and ensure all files pass" - check_output_and_print_stderr(['flake8', '--ignore=F', 'raco']) + check_output_and_print_stderr(['flake8', '--ignore=F', '--exclude=parsetab.py', 'raco']) def test_pylint(self): "run pylint -E to catch obvious errors" From 9586e23ffc7c491d01822b39512ba3997d99ee06 Mon Sep 17 00:00:00 2001 From: Tobin Baker Date: Mon, 4 May 2015 14:52:37 -0700 Subject: [PATCH 6/9] Fixing lint error. --- raco/test_style.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/raco/test_style.py b/raco/test_style.py index b1ec61c7..a5648e83 100644 --- a/raco/test_style.py +++ b/raco/test_style.py @@ -21,7 +21,8 @@ class StyleTest(unittest.TestCase): def test_style(self): "run flake8 with the right arguments and ensure all files pass" - check_output_and_print_stderr(['flake8', '--ignore=F', '--exclude=parsetab.py', 'raco']) + check_output_and_print_stderr([ + 'flake8', '--ignore=F', '--exclude=parsetab.py', 'raco']) def test_pylint(self): "run pylint -E to catch obvious errors" From 0c6b0773d8817a97ff135261cb5e4e113f93ee81 Mon Sep 17 00:00:00 2001 From: Tobin Baker Date: Mon, 4 May 2015 19:15:50 -0700 Subject: [PATCH 7/9] Revising load() syntax. --- examples/load_opts.myl | 2 +- raco/myrial/parser.py | 32 ++++++++++++++++++++++++-------- raco/myrial/scanner.py | 3 ++- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/examples/load_opts.myl b/examples/load_opts.myl index d62ad249..3af8acfb 100644 --- a/examples/load_opts.myl +++ b/examples/load_opts.myl @@ -1,2 +1,2 @@ -t = load("https://s3-us-west-2.amazonaws.com/myria/public-adhoc-TwitterK.csv", column0:int, column1:int; skip=1); +t = load("https://s3-us-west-2.amazonaws.com/myria/public-adhoc-TwitterK.csv", csv(schema(column0:int, column1:int), skip=1)); store(t, TwitterK2); diff --git a/raco/myrial/parser.py b/raco/myrial/parser.py index da2d57ab..7f540340 100644 --- a/raco/myrial/parser.py +++ b/raco/myrial/parser.py @@ -555,13 +555,9 @@ def p_expression_scan(p): @staticmethod def p_expression_load(p): - """expression : LOAD LPAREN STRING_LITERAL COMMA column_def_list RPAREN -| LOAD LPAREN STRING_LITERAL COMMA column_def_list SEMI option_list RPAREN""" - if len(p) == 9: - opts = dict(p[7]) - else: - opts = {} - p[0] = ('LOAD', p[3], scheme.Scheme(p[5]), opts) + 'expression : LOAD LPAREN STRING_LITERAL COMMA file_parser_fun RPAREN' + schema, options = p[5] + p[0] = ('LOAD', p[3], scheme.Scheme(schema), options) @staticmethod def p_relation_key(p): @@ -586,6 +582,26 @@ def p_column_def(p): 'column_def : unreserved_id COLON type_name' p[0] = (p[1], p[3]) + @staticmethod + def p_schema_fun(p): + 'schema_fun : SCHEMA LPAREN column_def_list RPAREN' + p[0] = p[3] + + @staticmethod + def p_file_parser_fun(p): + """file_parser_fun : file_parser_type LPAREN schema_fun COMMA option_list RPAREN + | file_parser_type LPAREN schema_fun RPAREN""" + if len(p) == 7: + schema, options = (p[3], p[5]) + else: + schema, options = (p[3], {}) + p[0] = (schema, options) + + @staticmethod + def p_file_parser_type(p): + 'file_parser_type : CSV' + p[0] = p[1] + @staticmethod def p_option_list(p): """option_list : option_list COMMA option @@ -594,7 +610,7 @@ def p_option_list(p): opts = p[1] + [p[3]] else: opts = [p[1]] - p[0] = opts + p[0] = dict(opts) @staticmethod def p_option(p): diff --git a/raco/myrial/scanner.py b/raco/myrial/scanner.py index 2d963b95..34d39f76 100644 --- a/raco/myrial/scanner.py +++ b/raco/myrial/scanner.py @@ -6,7 +6,8 @@ import raco.myrial.exceptions keywords = ['WHILE', 'DO', 'DEF', 'APPLY', 'CASE', 'WHEN', 'THEN', - 'ELSE', 'END', 'CONST', 'LOAD', 'DUMP', 'UDA', 'TRUE', 'FALSE'] + 'ELSE', 'END', 'CONST', 'LOAD', 'DUMP', 'CSV', 'SCHEMA', + 'UDA', 'TRUE', 'FALSE'] types = ['INT', 'STRING', 'FLOAT', 'BOOLEAN'] From 54840bf88167a7c73f6d49c31bbfaf4e4c9bc6af Mon Sep 17 00:00:00 2001 From: Tobin Baker Date: Mon, 4 May 2015 19:51:07 -0700 Subject: [PATCH 8/9] Appeasing lint. --- raco/myrial/parser.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/raco/myrial/parser.py b/raco/myrial/parser.py index 7f540340..7ac4914c 100644 --- a/raco/myrial/parser.py +++ b/raco/myrial/parser.py @@ -589,8 +589,9 @@ def p_schema_fun(p): @staticmethod def p_file_parser_fun(p): - """file_parser_fun : file_parser_type LPAREN schema_fun COMMA option_list RPAREN - | file_parser_type LPAREN schema_fun RPAREN""" + """file_parser_fun : file_parser_type LPAREN \ + schema_fun COMMA option_list RPAREN + | file_parser_type LPAREN schema_fun RPAREN""" if len(p) == 7: schema, options = (p[3], p[5]) else: From ab24b3fb60c1b831d65886eb9608b0d287f401d3 Mon Sep 17 00:00:00 2001 From: Tobin Baker Date: Mon, 4 May 2015 19:51:27 -0700 Subject: [PATCH 9/9] Fixing examples. --- examples/standalone.myl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/standalone.myl b/examples/standalone.myl index 421b4602..518c0ecf 100644 --- a/examples/standalone.myl +++ b/examples/standalone.myl @@ -1,5 +1,5 @@ -Emp = load("./examples/emp.csv", id:int, dept_id:int, name:string, salary:int); -Dept = load("./examples/dept.csv", id:int, name:string, manager:int); +Emp = load("./examples/emp.csv", csv(schema(id:int, dept_id:int, name:string, salary:int))); +Dept = load("./examples/dept.csv", csv(schema(id:int, name:string, manager:int))); out = [from Emp, Dept where Emp.dept_id == Dept.id AND Emp.salary > 5000