# Sentence Lexical Analyzer

Write a lexer that tokenizes a sentence into words and quoted strings, skipping all "white space".

In [None]:
// States of the lexer's state machine; `tokenize` dispatches on these.
type STATE =
     | START  = 0    // Between tokens: separators are skipped until a token begins
     | PLAIN  = 1    // Creating an identifier token
     | QUOTED = 2    // Creating a string token
     | EOF    = 3    // Returned by the tokenizer when no further token can be read

// Kind of token attached to a lexeme:
//   EMPTY - no token (initial context, and the result returned with EOF)
//   WORD  - a run of alphabetic characters
//   QUOTE - the text found between a pair of double quotes
//   KWORD - a reserved word; `tokenize` never produces it, it is assigned
//           afterwards when a lexeme is recognised as reserved
type TOKEN = EMPTY | WORD | QUOTE | KWORD

// Returns true when the character is an ASCII letter ('a'..'z' or 'A'..'Z'),
// false for everything else.
let Alpha (ch: char) =
    let isLower = 'a' <= ch && ch <= 'z'
    let isUpper = 'A' <= ch && ch <= 'Z'
    isLower || isUpper

// Returns true for the characters the lexer treats as "white space", i.e.
// token separators: the blank plus the punctuation marks ':', '.' and ','.
// Every other character yields false.
let Space ch =
    match ch with
    | ' ' | ':' | '.' | ',' -> true
    | _ -> false

// Returns true only for the double-quote character that opens and closes
// a quoted string.
let Quote (ch: char) = (ch = '"')

In [None]:
// Helper: returns the string `S` extended with the text of `C`
// (used to grow a lexeme one `char` at a time).
let append (S: string) C =
    let suffix = C.ToString()
    S + suffix

In [None]:
// Tokenizer
//
// Reads the next token from `input`, starting in `state`, and returns a quad
// (remaining input, next state, lexeme, token kind):
//   START  - separator characters (see `Space`) are skipped; an alphabetic
//            character begins a word, a double quote begins a quoted string.
//   PLAIN  - alphabetic characters are appended to the lexeme; a separator
//            (which is consumed) or the end of the input completes the WORD.
//   QUOTED - alphabetic characters are appended to the lexeme; the closing
//            double quote (which is consumed) completes the QUOTE.
// Every other situation -- input exhausted outside of a word, or a character
// the current state does not accept -- yields ([], EOF, "", EMPTY), which
// discards any input that was left.
// NOTE(review): this means a blank or digit inside a quoted string, or an
// unterminated quote, ends lexing silently -- confirm this is intended.
let rec tokenize ((input:List<char>), state, lexeme, token) = 
    match (input, state) with
    | (ch::rest, STATE.START)  when Space ch -> tokenize (rest, state, lexeme, TOKEN.EMPTY)
    | (ch::rest, STATE.START)  when Alpha ch -> tokenize (rest, STATE.PLAIN, append "" ch, TOKEN.WORD)
    | (ch::rest, STATE.START)  when Quote ch -> tokenize (rest, STATE.QUOTED, "", TOKEN.QUOTE)
    | (ch::rest, STATE.PLAIN)  when Space ch -> (rest, STATE.START, lexeme, TOKEN.WORD)
    | (ch::rest, STATE.PLAIN)  when Alpha ch -> tokenize (rest, state, append lexeme ch, TOKEN.WORD)
    // The input ended in the middle of a word: emit the word that was being
    // built instead of letting the catch-all below throw it away.
    | ([],       STATE.PLAIN)                -> ([], STATE.START, lexeme, TOKEN.WORD)
    | (ch::rest, STATE.QUOTED) when Quote ch -> (rest, STATE.START, lexeme, TOKEN.QUOTE)
    | (ch::rest, STATE.QUOTED) when Alpha ch -> tokenize (rest, state, append lexeme ch, TOKEN.QUOTE)
    | _                                      -> ([], STATE.EOF, "", TOKEN.EMPTY)

// Generator step for `Seq.unfold`: given the current lexer quad, produces
// the next one. When the quad has no input left the sequence ends (None);
// otherwise the quad returned by `tokenize` is both the element emitted and
// the state carried into the following step.
let tokenizeNext value =
    let (remaining, _, _, _) = value
    if List.isEmpty remaining then
        None
    else
        let next = tokenize value
        Some(next, next)
        
// Builds the sequence of lexer quads for `source`: the initial context is
// the source's characters, the START state, an empty lexeme and the EMPTY
// token kind, and `tokenizeNext` is unfolded from there.
let tokens source =
    (Seq.toList source, STATE.START, "", TOKEN.EMPTY)
    |> Seq.unfold tokenizeNext

In [15]:
// Create the initial quad: the source text as a list of characters, the
// START state, an empty lexeme and the EMPTY token kind (in later results
// the token kind tells whether a lexeme is a plain word or a quoted string).
let source = """  This is a "test" sentance."""
let context = (Seq.toList source, STATE.START, "", TOKEN.EMPTY)
context

([' '; ' '; 'T'; 'h'; 'i'; 's'; ' '; 'i'; 's'; ' '; 'a'; ' '; '"'; 't'; 'e'; 's';
  't'; '"'; ' '; 's'; 'e'; 'n'; 't'; 'a'; 'n'; 'c'; 'e'; '.'], START, "", EMPTY)

In [16]:
// Invoke the tokenizer several times. The constructed lexeme (a string)
// is contained within the returned quad, and that quad is passed in
// again to retrieve the next token, and so on...

let result1 = tokenize context
result1

(['i'; 's'; ' '; 'a'; ' '; '"'; 't'; 'e'; 's'; 't'; '"'; ' '; 's'; 'e'; 'n'; 't';
  'a'; 'n'; 'c'; 'e'; '.'], START, "This", WORD)

In [17]:
// Feed the previous result back in to read the second word.
let result2 = tokenize result1
result2

(['a'; ' '; '"'; 't'; 'e'; 's'; 't'; '"'; ' '; 's'; 'e'; 'n'; 't'; 'a'; 'n'; 'c';
  'e'; '.'], START, "is", WORD)

In [18]:
// Third call: continues from the quad returned for the second word.
let result3 = tokenize result2
result3

(['"'; 't'; 'e'; 's'; 't'; '"'; ' '; 's'; 'e'; 'n'; 't'; 'a'; 'n'; 'c'; 'e'; '.'],
 START, "a", WORD)

In [19]:
// Fourth call: reads the quoted string; both double quotes are consumed
// and are not part of the lexeme.
let result4 = tokenize result3
result4

([' '; 's'; 'e'; 'n'; 't'; 'a'; 'n'; 'c'; 'e'; '.'], START, "test", QUOTE)

In [20]:
// Fifth call: the trailing '.' is a separator, so it ends the last word
// and is consumed, leaving no input.
let result5 = tokenize result4
result5

([], START, "sentance", WORD)

In [21]:
// With the input exhausted the tokenizer reports EOF with an empty lexeme.
tokenize result5

([], EOF, "", EMPTY)

In [22]:
// Lex a longer sentence and print, for every quad in the token sequence,
// the token kind, its lexeme and the state the lexer is left in.
let text_string = """This is a "test" sentance, and a qoute: "AAA"."""
tokens text_string
|> Seq.iter (fun (_, nextState, text, kind) ->
    printfn "%A: '%s', next state: %A" kind text nextState)

WORD: 'This', next state: START
WORD: 'is', next state: START
WORD: 'a', next state: START
QUOTE: 'test', next state: START
WORD: 'sentance', next state: START
WORD: 'and', next state: START
WORD: 'a', next state: START
WORD: 'qoute', next state: START
QUOTE: 'AAA', next state: START
EMPTY: '', next state: EOF


## Reserved words

- Reserved words and identifiers can be recognized together
    - rather than having a part of the diagram for each reserved word
- Use a table lookup to determine whether a possible identifier is in fact a reserved word

In [23]:
// Table of reserved words; a WORD whose lexeme appears here is a keyword.
let reservedWords = set [ "is" ]

// Lex the sentence and print every token, reclassifying reserved words.
let text_string = """This is a "test" sentance, and a qoute: "AAA"."""
for token in tokens text_string do
    let _, state, lexeme, token = token
    // Only plain words can be reserved: a quoted string such as "is" must
    // stay a QUOTE, so the table lookup is restricted to WORD tokens.
    let newtkn =
        match token with
        | TOKEN.WORD when Set.contains lexeme reservedWords -> TOKEN.KWORD
        | _ -> token
    printfn "%A: '%s', next state: %A" newtkn lexeme state

WORD: 'This', next state: START
KWORD: 'is', next state: START
WORD: 'a', next state: START
QUOTE: 'test', next state: START
WORD: 'sentance', next state: START
WORD: 'and', next state: START
WORD: 'a', next state: START
WORD: 'qoute', next state: START
QUOTE: 'AAA', next state: START
EMPTY: '', next state: EOF
