/
grammar.y
636 lines (574 loc) · 18.1 KB
/
grammar.y
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
// Yacc grammar file for the vanadium VDL language.
// http://goto/veyron:vdl
//
// Similar to Go, the formal grammar uses semicolons ';' as terminators, but
// idiomatic usage may omit most semicolons using the following rules:
// 1) During the tokenization phase, semicolons are always auto-inserted at
// the end of each line after certain tokens. This is implemented in
// the lexer via the autoSemi function.
// 2) Semicolons may be omitted before a closing ')' or '}'. This is
// implemented via the osemi rule below.
//
// To generate the grammar.go source file containing the parser, run
// grammar_gen.sh in this same directory, or run go generate on this package.
////////////////////////////////////////////////////////////////////////
// Declarations section.
%{
// This grammar.y.go file was auto-generated by yacc from grammar.y.
package parse
import (
"math/big"
"strings"
)
// intPos holds an arbitrary-precision integer literal together with its
// source position; it is the semantic value for tINTLIT tokens.
type intPos struct {
int *big.Int
pos Pos
}
// ratPos holds an arbitrary-precision rational literal together with its
// source position; it is the semantic value for tRATLIT tokens.
type ratPos struct {
rat *big.Rat
pos Pos
}
// typeListToStrList converts a slice of Type to a slice of StringPos. Each
// type must be a TypeNamed with an empty PackageName, otherwise errors are
// reported via the lexer, and ok=false is returned.
//
// This helper backs the field_spec rule, which deliberately parses variable
// names as a type_comma_list to avoid a reduce/reduce conflict, and therefore
// must validate after the fact that each "type" is really a plain identifier.
func typeListToStrList(yylex yyLexer, typeList []Type) (strList []StringPos, ok bool) {
	ok = true
	for _, t := range typeList {
		var tn *TypeNamed
		if tn, ok = t.(*TypeNamed); !ok {
			lexPosErrorf(yylex, t.Pos(), "%s invalid (expected one or more variable names)", t.String())
			return
		}
		if strings.ContainsRune(tn.Name, '.') {
			ok = false
			// Message matches the one above (no trailing period) so the two
			// parallel failure modes report consistently.
			lexPosErrorf(yylex, t.Pos(), "%s invalid (expected one or more variable names)", tn.Name)
			return
		}
		strList = append(strList, StringPos{tn.Name, tn.P})
	}
	return
}
%}
// This union is turned into the struct type yySymType. Most symbols include
// positional information; this is necessary since Go yacc doesn't support
// passing positional information, so we need to track it ourselves.
%union {
pos Pos
strpos StringPos
intpos intPos
ratpos ratPos
namepos NamePos
nameposes []NamePos
typeexpr Type
typeexprs []Type
fields []*Field
iface *Interface
constexpr ConstExpr
constexprs []ConstExpr
complit *ConstCompositeLit
kvlit KVLit
kvlits []KVLit
errordef ErrorDef
}
// Terminal tokens. We leave single-char tokens as-is using their ascii code as
// their id, to make the grammar more readable; multi-char tokens get their own
// id. The start* tokens are dummy tokens to kick off the parse.
%token startFileImports startFile startConfigImports startConfig
%token startExprs
%token <pos> ';' ':' ',' '.' '(' ')' '[' ']' '{' '}' '<' '>' '='
%token <pos> '!' '+' '-' '*' '/' '%' '|' '&' '^' '?'
// Multi-char operator tokens, e.g. tOROR is "||", tLSH is "<<".
%token <pos> tOROR tANDAND tLE tGE tNE tEQEQ tLSH tRSH
// Keyword tokens; each carries the position of the keyword.
%token <pos> tCONST tENUM tERROR tIMPORT tINTERFACE tMAP tPACKAGE
%token <pos> tSET tSTREAM tSTRUCT tTYPE tTYPEOBJECT tUNION
// Literal tokens; each carries both the parsed value and its position.
%token <strpos> tIDENT tSTRLIT
%token <intpos> tINTLIT
%token <ratpos> tRATLIT
// Labeled rules holding typed values.
%type <strpos> nameref dotnameref
%type <namepos> label_spec
%type <nameposes> label_spec_list
%type <typeexpr> type type_no_typeobject otype
%type <typeexprs> type_comma_list streamargs
%type <fields> field_spec_list field_spec named_arg_list inargs outargs
%type <iface> iface_item_list iface_item
%type <constexpr> expr unary_expr operand
%type <constexprs> tags expr_comma_list
%type <complit> comp_lit
%type <kvlit> kv_lit
%type <kvlits> kv_lit_list
%type <errordef> error_details error_detail_list error_detail
// There are 5 precedence levels for operators, all left-associative, just like
// Go. Lines are listed in order of increasing precedence.
%left tOROR
%left tANDAND
%left '<' '>' tLE tGE tNE tEQEQ
%left '+' '-' '|' '^'
%left '*' '/' '%' '&' tLSH tRSH
// notPackage/notConfig only exist to give the error-recovery alternatives of
// the package/config rules a precedence; they are never shifted as tokens.
%left notPackage notConfig
%start start
%%
////////////////////////////////////////////////////////////////////////
// Rules section.
// Note that vdl files and config files use an identical grammar, other than the
// initial package or config clause respectively. Error checking for config
// files that include error, type or interface definitions occurs afterwards, to
// improve error reporting.
// The start* dummy tokens select which of the five entry points the parse
// uses: whole vdl file, vdl imports only, whole config file, config imports
// only, or a standalone comma-separated expression list.
start:
startFileImports package imports gen_imports_eof
| startFile package imports defs
| startConfigImports config imports gen_imports_eof
| startConfig config imports defs
| startExprs expr_comma_list ';'
{ lexStoreExprs(yylex, $2) }
// Dummy rule to terminate the parse after the imports, regardless of whether
// there are any defs. Defs always start with either the tTYPE, tCONST or
// tERROR tokens, and the rule handles all cases - either there's no trailing
// text (the empty case, which would have resulted in EOF anyways), or there's
// one or more defs, where we need to force an EOF.
gen_imports_eof:
// Empty.
{ lexGenEOF(yylex) }
| tTYPE
{ lexGenEOF(yylex) }
| tCONST
{ lexGenEOF(yylex) }
| tERROR
{ lexGenEOF(yylex) }
// PACKAGE
package:
%prec notPackage
{ lexPosErrorf(yylex, Pos{}, "vdl file must start with package clause") }
| tPACKAGE tIDENT ';'
{ lexVDLFile(yylex).PackageDef = NamePos{Name:$2.String, Pos:$2.Pos} }
// CONFIG
config:
%prec notConfig
{ lexPosErrorf(yylex, Pos{}, "config file must start with config clause") }
| tIDENT '=' expr ';'
{
// We allow "config" as an identifier; it is not a keyword. So we check
// manually to make sure the syntax is correct.
if $1.String != "config" {
lexPosErrorf(yylex, $1.Pos, "config file must start with config clause")
return 1 // Any non-zero code indicates an error
}
file := lexVDLFile(yylex)
file.PackageDef = NamePos{Name:"config", Pos:$1.Pos}
file.ConstDefs = []*ConstDef{{Expr:$3}}
}
// IMPORTS
imports:
// Empty.
| imports import ';'
// An import clause is either a parenthesized (possibly empty) list of import
// specs, or a single bare import spec.
import:
tIMPORT '(' ')'
| tIMPORT '(' import_spec_list osemi ')'
| tIMPORT import_spec
import_spec_list:
import_spec
| import_spec_list ';' import_spec
// An import spec is a quoted package path, optionally preceded by a local
// name used to refer to the package within this file.
import_spec:
tSTRLIT
{
imps := &lexVDLFile(yylex).Imports
*imps = append(*imps, &Import{Path:$1.String, NamePos:NamePos{Pos:$1.Pos}})
}
| tIDENT tSTRLIT
{
imps := &lexVDLFile(yylex).Imports
*imps = append(*imps, &Import{Path:$2.String, NamePos:NamePos{Name:$1.String, Pos:$1.Pos}})
}
// DEFINITIONS
// Each kind of definition (type, const, error) allows either a parenthesized
// (possibly empty) list of specs, or a single bare spec, mirroring Go.
defs:
// Empty.
| defs type_def ';'
| defs const_def ';'
| defs error_def ';'
type_def:
tTYPE '(' ')'
| tTYPE '(' type_spec_list osemi ')'
| tTYPE type_spec
| tTYPE interface_spec
const_def:
tCONST '(' ')'
| tCONST '(' const_spec_list osemi ')'
| tCONST const_spec
error_def:
tERROR '(' ')'
| tERROR '(' error_spec_list osemi ')'
| tERROR error_spec
// TYPE DEFINITIONS
type_spec_list:
type_spec
| type_spec_list ';' type_spec
// A type spec names a new type; the definition is appended to the file's
// TypeDefs as a side effect.
type_spec:
tIDENT type
{
tds := &lexVDLFile(yylex).TypeDefs
*tds = append(*tds, &TypeDef{Type:$2, NamePos:NamePos{Name:$1.String, Pos:$1.Pos}})
}
// The type_no_typeobject rule is necessary to avoid a shift/reduce conflict
// between type conversions and typeobject const expressions. E.g.
// type(expr) // type conversion
// typeobject(type) // typeobject const expression
//
// We've chosen similar syntax to make it easier for the user to remember how to
// use the feature, but since "typeobject" is itself a type, there is a problem.
// We resolve the conflict by restricting the type conversion to the rule:
// type_no_typeobject '(' expr ')'
//
// Note that if we wanted to add general-purpose functions with the func(expr)
// syntax, we'll need to pull nameref out of type_no_typeobject, and parse both
// func(expr) and nameref(expr) into a generic structure. We can't use that
// same mechanism for typeobject, since the thing inside the parens is a value
// expression for type conversions, but a type expression for typeobject.
type_no_typeobject:
nameref
{ $$ = &TypeNamed{Name:$1.String, P:$1.Pos} }
| tERROR // Special-case to allow the "error" keyword as a named type.
{ $$ = &TypeNamed{Name:"error", P:$1} }
| '[' tINTLIT ']' type
{ $$ = &TypeArray{Len:int($2.int.Int64()), Elem:$4, P:$1} }
| '[' ']' type
{ $$ = &TypeList{Elem:$3, P:$1} }
| tENUM '{' label_spec_list osemi '}'
{ $$ = &TypeEnum{Labels:$3, P:$1} }
| tSET '[' type ']'
{ $$ = &TypeSet{Key:$3, P:$1} }
| tMAP '[' type ']' type
{ $$ = &TypeMap{Key:$3, Elem:$5, P:$1} }
| tSTRUCT '{' field_spec_list osemi '}'
{ $$ = &TypeStruct{Fields:$3, P:$1} }
| tSTRUCT '{' '}'
{ $$ = &TypeStruct{P:$1} }
| tUNION '{' field_spec_list osemi '}'
{ $$ = &TypeUnion{Fields:$3, P:$1} }
| tUNION '{' '}'
{ $$ = &TypeUnion{P:$1} }
| '?' type
{ $$ = &TypeOptional{Base:$2, P:$1} }
// The type rule expands to all the actual types, including typeobject.
type:
type_no_typeobject
{ $$ = $1}
| tTYPEOBJECT
{ $$ = &TypeNamed{Name:"typeobject", P:$1} }
// Enum labels are plain identifiers, separated by semicolons.
label_spec_list:
label_spec
{ $$ = []NamePos{$1} }
| label_spec_list ';' label_spec
{ $$ = append($1, $3) }
label_spec:
tIDENT
{ $$ = NamePos{Name:$1.String, Pos:$1.Pos} }
field_spec_list:
field_spec
{ $$ = $1 }
| field_spec_list ';' field_spec
{ $$ = append($1, $3...) }
// The field_spec rule is intended to capture the following patterns:
// var type
// var0, var1, var2 type
// where var* refers to a variable name, and type refers to a type. Each var
// is expressed as an identifier. An oddity here is that we use a type_list to
// capture the list of variables rather than using a list of IDENTS. This means
// the grammar accepts invalid constructions, and we must validate afterwards.
//
// We do this to avoid a LALR reduce/reduce conflict with function arguments.
// The problem is exhibited by the in-args of these two functions, where func1
// has three args respectively named A, B, C all of type t1, and func2 has three
// args with name and type t2, t3 and t4 respectively. The func1 style is
// captured by field_spec in named_arg_list, while the func2 style is captured
// by type_list in args.
// func1(A, B, C t1)
// func2(t2, t3, t4)
//
// If we used an ident_list to capture "A, B, C" in func1, but used a type_list
// to capture "t2, t3, t4" in func2, we'd have a reduce/reduce conflict since
// yacc cannot determine whether to reduce as an ident_list or as a type_list;
// we don't know until we've reached token t1 in func1, or token ')' in func2.
//
// The fix can be considered both beautiful and a huge hack. To avoid the
// conflict we force both forms to use type_list to capture both "A, B, C" and
// "t2, t3, t4". This avoids the conflict since we're now always reducing via
// type_list, but allows invalid constructions like "[]int, []int []int". So we
// validate in the action and throw errors.
//
// An alternate fix would have been to remove the IDENT case from the type rule,
// use ident_list to capture both cases, and manually "expand" the grammar to
// distinguish the cases appropriately. That would ensure we don't allow
// constructions like "int, int int" in the grammar itself, but would lead to a
// much more complicated grammar. As a bonus, with the type_list solution we
// can give better error messages.
// field_spec parses "var0, var1, ... type". The variable names are captured
// as a type_comma_list (see the long comment above) and validated here via
// typeListToStrList; on failure the errors have already been reported, and we
// add a hint about the most likely cause.
field_spec:
type_comma_list type
{
if names, ok := typeListToStrList(yylex, $1); ok {
for _, n := range names {
$$ = append($$, &Field{Type:$2, NamePos:NamePos{Name:n.String, Pos:n.Pos}})
}
} else {
// Fixed stray period after the question mark ("?." -> "?").
lexPosErrorf(yylex, $2.Pos(), "perhaps you forgot a comma before %q?", $2.String())
}
}
type_comma_list:
type
{ $$ = []Type{$1} }
| type_comma_list ',' type
{ $$ = append($1, $3) }
// INTERFACE DEFINITIONS
// An interface spec appends a new *Interface to the file as a side effect.
interface_spec:
tIDENT tINTERFACE '{' '}'
{
ifs := &lexVDLFile(yylex).Interfaces
*ifs = append(*ifs, &Interface{NamePos:NamePos{Name:$1.String, Pos:$1.Pos}})
}
| tIDENT tINTERFACE '{' iface_item_list osemi '}'
{
$4.Name, $4.Pos = $1.String, $1.Pos
ifs := &lexVDLFile(yylex).Interfaces
*ifs = append(*ifs, $4)
}
// Each iface_item is a partial *Interface holding either one method or one
// embed; the list rule merges them into a single accumulated *Interface.
iface_item_list:
iface_item
{ $$ = $1 }
| iface_item_list ';' iface_item
{
$1.Embeds = append($1.Embeds, $3.Embeds...)
$1.Methods = append($1.Methods, $3.Methods...)
$$ = $1
}
iface_item:
tIDENT inargs streamargs outargs tags
{ $$ = &Interface{Methods: []*Method{{InArgs:$2, InStream:$3[0], OutStream:$3[1], OutArgs:$4, Tags:$5, NamePos:NamePos{Name:$1.String, Pos:$1.Pos}}}} }
| nameref
{ $$ = &Interface{Embeds: []*NamePos{{Name:$1.String, Pos:$1.Pos}}} }
inargs:
'(' ')'
{ $$ = nil }
| '(' named_arg_list ocomma ')'
{ $$ = $2 }
| '(' type_comma_list ocomma ')'
// Just like Go, we allow a list of types without variable names. See the
// field_spec rule for a workaround to avoid a reduce/reduce conflict.
{
for _, t := range $2 {
$$ = append($$, &Field{Type:t, NamePos:NamePos{Pos:t.Pos()}})
}
}
// The named_arg_list rule is just like the field_spec_list, but uses comma ','
// as a delimiter rather than semicolon ';'.
named_arg_list:
field_spec
{ $$ = $1 }
| named_arg_list ',' field_spec
{ $$ = append($1, $3...) }
// The outargs use special syntax to denote the error associated with each
// method. For parsing we accept these forms:
// error
// (string | error)
// (a, b string, c bool | error)
//
// TODO(toddw): Improve parser syntax errors.
outargs:
tERROR
{ $$ = nil }
| '(' named_arg_list ocomma '|' tERROR ')'
{ $$ = $2 }
| '(' type_comma_list ocomma '|' tERROR ')'
// Just like Go, we allow a list of types without variable names. See the
// field_spec rule for a workaround to avoid a reduce/reduce conflict.
{
for _, t := range $2 {
$$ = append($$, &Field{Type:t, NamePos:NamePos{Pos:t.Pos()}})
}
}
// streamargs always produces a 2-element slice: [in-stream type, out-stream
// type], with nil entries for unspecified streams.
streamargs:
// Empty.
{ $$ = []Type{nil, nil} }
| tSTREAM '<' '>'
{ $$ = []Type{nil, nil} }
| tSTREAM '<' type '>'
{ $$ = []Type{$3, nil} }
| tSTREAM '<' type ',' type '>'
{ $$ = []Type{$3, $5} }
// Method tags are an optional brace-enclosed list of const expressions.
tags:
// Empty.
{ $$ = nil }
| '{' '}'
{ $$ = nil }
| '{' expr_comma_list ocomma '}'
{ $$ = $2 }
expr_comma_list:
expr
{ $$ = []ConstExpr{$1} }
| expr_comma_list ',' expr
{ $$ = append($1, $3) }
// CONST DEFINITIONS
const_spec_list:
const_spec
| const_spec_list ';' const_spec
// A const spec appends a named *ConstDef to the file as a side effect.
const_spec:
tIDENT '=' expr
{
cds := &lexVDLFile(yylex).ConstDefs
*cds = append(*cds, &ConstDef{Expr:$3, NamePos:NamePos{Name:$1.String, Pos:$1.Pos}})
}
// Binary operators; associativity and precedence come from the %left
// declarations above, so the rule itself can stay flat.
expr:
unary_expr
{ $$ = $1 }
| expr tOROR expr
{ $$ = &ConstBinaryOp{"||", $1, $3, $2} }
| expr tANDAND expr
{ $$ = &ConstBinaryOp{"&&", $1, $3, $2} }
| expr '<' expr
{ $$ = &ConstBinaryOp{"<", $1, $3, $2} }
| expr '>' expr
{ $$ = &ConstBinaryOp{">", $1, $3, $2} }
| expr tLE expr
{ $$ = &ConstBinaryOp{"<=", $1, $3, $2} }
| expr tGE expr
{ $$ = &ConstBinaryOp{">=", $1, $3, $2} }
| expr tNE expr
{ $$ = &ConstBinaryOp{"!=", $1, $3, $2} }
| expr tEQEQ expr
{ $$ = &ConstBinaryOp{"==", $1, $3, $2} }
| expr '+' expr
{ $$ = &ConstBinaryOp{"+", $1, $3, $2} }
| expr '-' expr
{ $$ = &ConstBinaryOp{"-", $1, $3, $2} }
| expr '*' expr
{ $$ = &ConstBinaryOp{"*", $1, $3, $2} }
| expr '/' expr
{ $$ = &ConstBinaryOp{"/", $1, $3, $2} }
| expr '%' expr
{ $$ = &ConstBinaryOp{"%", $1, $3, $2} }
| expr '|' expr
{ $$ = &ConstBinaryOp{"|", $1, $3, $2} }
| expr '&' expr
{ $$ = &ConstBinaryOp{"&", $1, $3, $2} }
| expr '^' expr
{ $$ = &ConstBinaryOp{"^", $1, $3, $2} }
| expr tLSH expr
{ $$ = &ConstBinaryOp{"<<", $1, $3, $2} }
| expr tRSH expr
{ $$ = &ConstBinaryOp{">>", $1, $3, $2} }
unary_expr:
operand
{ $$ = $1 }
| '!' unary_expr
{ $$ = &ConstUnaryOp{"!", $2, $1} }
| '+' unary_expr
{ $$ = &ConstUnaryOp{"+", $2, $1} }
| '-' unary_expr
{ $$ = &ConstUnaryOp{"-", $2, $1} }
| '^' unary_expr
{ $$ = &ConstUnaryOp{"^", $2, $1} }
// Type conversion is restricted to type_no_typeobject to avoid a conflict
// with typeobject(type); see the type_no_typeobject comment.
| type_no_typeobject '(' expr ')'
{ $$ = &ConstTypeConv{$1, $3, $1.Pos()} }
| tTYPEOBJECT '(' type ')'
{ $$ = &ConstTypeObject{$3, $1} }
// operand covers literals, named references, composite literals, indexing,
// and parenthesized sub-expressions. Selector/index on an unnamed composite
// literal is parsed but rejected with a targeted error message.
operand:
tSTRLIT
{ $$ = &ConstLit{$1.String, $1.Pos} }
| tINTLIT
{ $$ = &ConstLit{$1.int, $1.pos} }
| tRATLIT
{ $$ = &ConstLit{$1.rat, $1.pos} }
| nameref
{ $$ = &ConstNamed{$1.String, $1.Pos} }
| comp_lit
{ $$ = $1 }
| comp_lit '.' tIDENT
{ lexPosErrorf(yylex, $2, "cannot apply selector operator to unnamed constant")}
| comp_lit '[' expr ']'
{ lexPosErrorf(yylex, $2, "cannot apply index operator to unnamed constant")}
| nameref '[' expr ']'
{ $$ = &ConstIndexed{&ConstNamed{$1.String, $1.Pos}, $3, $1.Pos} }
| '(' expr ')'
{ $$ = $2 }
// A composite literal is an optional type followed by a braced, possibly
// keyed, list of values.
comp_lit:
otype '{' '}'
{ $$ = &ConstCompositeLit{$1, nil, $2} }
| otype '{' kv_lit_list ocomma '}'
{ $$ = &ConstCompositeLit{$1, $3, $2} }
kv_lit_list:
kv_lit
{ $$ = []KVLit{$1} }
| kv_lit_list ',' kv_lit
{ $$ = append($1, $3) }
// A kv_lit is either a bare value or a key:value pair.
kv_lit:
expr
{ $$ = KVLit{Value:$1} }
| expr ':' expr
{ $$ = KVLit{Key:$1, Value:$3} }
// ERROR DEFINITIONS
error_spec_list:
error_spec
| error_spec_list ';' error_spec
error_spec:
tIDENT inargs error_details
{
// Create *ErrorDef starting with a copy of error_details, filling in the
// name and params
ed := $3
ed.NamePos = NamePos{Name:$1.String, Pos:$1.Pos}
ed.Params = $2
eds := &lexVDLFile(yylex).ErrorDefs
*eds = append(*eds, &ed)
}
// error_details is an optional braced list of details (actions and/or
// language:format strings), accumulated into a single ErrorDef.
error_details:
// Empty.
{ $$ = ErrorDef{} }
| '{' '}'
{ $$ = ErrorDef{} }
| '{' error_detail_list ocomma '}'
{ $$ = $2 }
error_detail_list:
error_detail
{ $$ = $1 }
| error_detail_list ',' error_detail
{
// Merge each ErrorDef in-order to build the final ErrorDef.
$$ = $1
switch {
case len($3.Actions) > 0:
$$.Actions = append($$.Actions, $3.Actions...)
case len($3.Formats) > 0:
$$.Formats = append($$.Formats, $3.Formats...)
}
}
// A single detail is either an action identifier, or a "lang":"fmt" pair.
error_detail:
tIDENT
{ $$ = ErrorDef{Actions: []StringPos{$1}} }
| tSTRLIT ':' tSTRLIT
{ $$ = ErrorDef{Formats: []LangFmt{{Lang: $1, Fmt: $3}}} }
// MISC TOKENS
// nameref describes a named reference to another type, interface or const. We
// allow the following forms:
// foo
// foo.bar (and multi-dot variants)
// "pkg/path".foo
// "pkg/path".foo.bar (and multi-dot variants)
nameref:
dotnameref
{ $$ = $1 }
| tSTRLIT '.' dotnameref
// The quoted package path is re-quoted into the resulting string, so later
// stages can distinguish path-qualified references from plain dotted names.
{ $$ = StringPos{"\""+$1.String+"\"."+$3.String, $1.Pos} }
// dotnameref describes just the dotted portion of nameref.
dotnameref:
tIDENT
{ $$ = $1 }
| dotnameref '.' tIDENT
{ $$ = StringPos{$1.String+"."+$3.String, $1.Pos} }
// otype is an optional type (nil when absent), used by composite literals.
otype:
// Empty.
{ $$ = nil }
| type
{ $$ = $1 }
// osemi/ocomma allow an optional trailing separator before a closing bracket.
osemi:
// Empty.
| ';'
ocomma:
// Empty.
| ','