From f2ea47f1151a227a75b0ee0784cbe37c6464a976 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Valim?=
Date: Tue, 21 Feb 2012 19:50:44 +0100
Subject: [PATCH] Tokenizer is ready.

---
 lib/eex/tokenizer.ex               | 126 +++++++++++++++++++++++++++++
 test/elixir/eex/tokenizer_test.exs |  47 +++++++++++
 2 files changed, 173 insertions(+)
 create mode 100644 lib/eex/tokenizer.ex
 create mode 100644 test/elixir/eex/tokenizer_test.exs

diff --git a/lib/eex/tokenizer.ex b/lib/eex/tokenizer.ex
new file mode 100644
index 00000000000..bb15435e159
--- /dev/null
+++ b/lib/eex/tokenizer.ex
@@ -0,0 +1,126 @@
+defmodule EEx::Tokenizer do
+  @doc """
+  Tokenizes the given char list. It returns 4 tokens as result:
+
+  * { :text, contents }
+  * { :expr, marker, contents}
+  * { :start_expr, marker, contents}
+  * { :end_expr, marker, contents}
+
+  """
+  def tokenize(list) do
+    List.reverse(tokenize(list, [], []))
+  end
+
+  defp tokenize('<%' ++ t, buffer, acc) do
+    { marker, t } = retrieve_marker(t)
+    { expr, rest } = tokenize_expr t, []
+
+    token = tip_expr_token_name(expr)
+    expr = List.reverse(expr)
+
+    # If it isn't a start or end token, it may be a middle token.
+    if token == :expr, do:
+      token = middle_expr_token_name(expr)
+
+    acc = tokenize_text(buffer, acc)
+    tokenize rest, [], [ { token, marker, expr } | acc]
+  end
+
+  defp tokenize([h|t], buffer, acc) do
+    tokenize t, [h|buffer], acc
+  end
+
+  defp tokenize([], buffer, acc) do
+    tokenize_text(buffer, acc)
+  end
+
+  # Retrieve marker for <%
+
+  defp retrieve_marker('=' ++ t) do
+    { '=', t }
+  end
+
+  defp retrieve_marker(t) do
+    { '', t }
+  end
+
+  # Tokenize an expression until we find %>
+
+  defp tokenize_expr('%>' ++ t, buffer) do
+    { buffer, t }
+  end
+
+  defp tokenize_expr([h|t], buffer) do
+    tokenize_expr t, [h|buffer]
+  end
+
+  # Receive an expression content and check
+  # if it is a start or an end token.
+  # Start tokens finish with `do` or `->`
+  # while end tokens contain only the end word.
+
+  defp tip_expr_token_name([h|t]) when h == ?\s orelse h == ?\t do
+    tip_expr_token_name(t)
+  end
+
+  defp tip_expr_token_name('od' ++ [h|_]) when h == ?\s orelse h == ?\t orelse h == ?) do
+    :start_expr
+  end
+
+  defp tip_expr_token_name('>-' ++ [h|_]) when h == ?\s orelse h == ?\t orelse h == ?) do
+    :start_expr
+  end
+
+  defp tip_expr_token_name('dne' ++ t) do
+    if only_spaces?(t), do: :end_expr, else: :expr
+  end
+
+  defp tip_expr_token_name(_) do
+    :expr
+  end
+
+  # Receive an expression contents and see if it matches
+  # a key-value arg syntax, like elsif: foo.
+
+  defp middle_expr_token_name([h|t]) when h == ?\s orelse h == ?\t do
+    middle_expr_token_name(t)
+  end
+
+  defp middle_expr_token_name([h|t]) when h >= ?a andalso h <= ?z do
+    if valid_key_identifier?(t), do: :middle_expr, else: :expr
+  end
+
+  defp middle_expr_token_name(_) do
+    :expr
+  end
+
+  defp valid_key_identifier?([h|t]) \
+    when h >= ?a andalso h <= ?z \
+    when h >= ?A andalso h <= ?Z \
+    when h >= ?0 andalso h <= ?9 do
+    valid_key_identifier?(t)
+  end
+
+  defp valid_key_identifier?([?:|_]) do
+    true
+  end
+
+  defp valid_key_identifier?(_) do
+    false
+  end
+
+  defp only_spaces?([h|t]) when h == ?\s orelse h == ?\t, do: only_spaces?(t)
+  defp only_spaces?(other), do: other == []
+
+  # Tokenize the buffered text by appending
+  # it to the given accumulator.
+
+  defp tokenize_text([], acc) do
+    acc
+  end
+
+  defp tokenize_text(buffer, acc) do
+    [{ :text, List.reverse buffer } | acc]
+  end
+end
\ No newline at end of file
diff --git a/test/elixir/eex/tokenizer_test.exs b/test/elixir/eex/tokenizer_test.exs
new file mode 100644
index 00000000000..2cb60ebcd60
--- /dev/null
+++ b/test/elixir/eex/tokenizer_test.exs
@@ -0,0 +1,47 @@
+Code.require_file "../../test_helper", __FILE__
+
+defmodule EEx::TokenizerTest do
+  use ExUnit::Case
+  require EEx::Tokenizer, as: T
+
+  test "simple strings" do
+    assert_equal [ { :text, 'foo' } ], T.tokenize('foo')
+  end
+
+  test "strings with embedded code" do
+    assert_equal [ { :text, 'foo ' }, { :expr, [], ' bar ' }], T.tokenize('foo <% bar %>')
+  end
+
+  test "strings with embedded equals code" do
+    assert_equal [ { :text, 'foo ' }, { :expr, '=', ' bar ' }], T.tokenize('foo <%= bar %>')
+  end
+
+  test "strings with embedded do end" do
+    assert_equal [
+      { :text, 'foo ' },
+      { :start_expr, '', ' if true do ' },
+      { :text, 'bar' },
+      { :end_expr, '', ' end ' }
+    ], T.tokenize('foo <% if true do %>bar<% end %>')
+  end
+
+  test "strings with embedded -> end" do
+    assert_equal [
+      { :text, 'foo ' },
+      { :start_expr, '', ' if(true)-> ' },
+      { :text, 'bar' },
+      { :end_expr, '', ' end ' }
+    ], T.tokenize('foo <% if(true)-> %>bar<% end %>')
+  end
+
+  test "strings with embedded key-value blocks" do
+    assert_equal [
+      { :text, 'foo ' },
+      { :start_expr, '', ' if true do ' },
+      { :text, 'bar' },
+      { :middle_expr, '', ' elsif: false ' },
+      { :text, 'baz' },
+      { :end_expr, '', ' end ' }
+    ], T.tokenize('foo <% if true do %>bar<% elsif: false %>baz<% end %>')
+  end
+end
\ No newline at end of file