Clean up and document the stricter parsing functions.

--HG-- rename : Data/Aeson/Parser.hs => Data/Aeson/Parser/Internal.hs
yihuang · Dec 1, 2011 · 33cf247 · 33cf247
1 parent 323c18f
commit 33cf247
Show file tree

Hide file tree

Showing 7 changed files with 341 additions and 197 deletions.
diff --git a/Data/Aeson.hs b/Data/Aeson.hs
@@ -14,6 +14,7 @@ module Data.Aeson
     (
     -- * Encoding and decoding
       decode
+    , decode'
     , encode
     -- * Core JSON types
     , Value(..)
@@ -34,21 +35,30 @@ module Data.Aeson
     , object
     -- * Parsing
     , json
+    , json'
     ) where
 
 import Data.Aeson.Encode (encode)
-import Data.Aeson.Parser (json)
+import Data.Aeson.Parser.Internal (decodeWith, json, json')
 import Data.Aeson.Types
 import qualified Data.ByteString.Lazy as L
-import qualified Data.Attoparsec.Lazy as L
 
 -- | Efficiently deserialize a JSON value from a lazy 'L.ByteString'.
 -- If this fails due to incomplete or invalid input, 'Nothing' is
 -- returned.
+--
+-- This function parses immediately, but defers conversion.  See
+-- 'json' for details.
 decode :: (FromJSON a) => L.ByteString -> Maybe a
-decode s = case L.parse json s of
-             L.Done _ v -> case fromJSON v of
-                             Success a -> Just a
-                             _         -> Nothing
-             _          -> Nothing
+decode = decodeWith json fromJSON
 {-# INLINE decode #-}
+
+-- | Efficiently deserialize a JSON value from a lazy 'L.ByteString'.
+-- If this fails due to incomplete or invalid input, 'Nothing' is
+-- returned.
+--
+-- This function parses and performs conversion immediately.  See
+-- 'json'' for details.
+decode' :: (FromJSON a) => L.ByteString -> Maybe a
+decode' = decodeWith json' fromJSON
+{-# INLINE decode' #-}
diff --git a/Data/Aeson/Generic.hs b/Data/Aeson/Generic.hs
@@ -18,6 +18,7 @@ module Data.Aeson.Generic
     (
     -- * Decoding and encoding
       decode
+    , decode'
     , encode
     -- * Lower-level conversion functions
     , fromJSON
@@ -39,11 +40,10 @@ import Data.Text (Text, pack, unpack)
 import Data.Text.Encoding (encodeUtf8)
 import Data.Time.Clock (UTCTime)
 import Data.Word (Word, Word8, Word16, Word32, Word64)
-import Data.Aeson.Parser (json)
+import Data.Aeson.Parser.Internal (decodeWith, json, json')
 import qualified Data.Aeson.Encode as E
 import qualified Data.Aeson.Functions as F
 import qualified Data.Aeson.Types as T
-import qualified Data.Attoparsec.Lazy as L
 import qualified Data.ByteString as B
 import qualified Data.ByteString.Lazy as L
 import qualified Data.HashMap.Strict as H
@@ -62,12 +62,22 @@ encode = E.encode . toJSON
 -- | Efficiently deserialize a JSON value from a lazy 'L.ByteString'.
 -- If this fails due to incomplete or invalid input, 'Nothing' is
 -- returned.
+--
+-- This function parses immediately, but defers conversion.  See
+-- 'json' for details.
 decode :: (Data a) => L.ByteString -> Maybe a
-decode s = case L.parse json s of
-             L.Done _ v -> case fromJSON v of
-                             Success a -> Just a
-                             _         -> Nothing
-             _          -> Nothing
+decode = decodeWith json fromJSON
+{-# INLINE decode #-}
+
+-- | Efficiently deserialize a JSON value from a lazy 'L.ByteString'.
+-- If this fails due to incomplete or invalid input, 'Nothing' is
+-- returned.
+--
+-- This function parses and performs conversion immediately.  See
+-- 'json'' for details.
+decode' :: (Data a) => L.ByteString -> Maybe a
+decode' = decodeWith json' fromJSON
+{-# INLINE decode' #-}
 
 type T a = a -> Value
 

diff --git a/Data/Aeson/Parser.hs b/Data/Aeson/Parser.hs
@@ -10,196 +10,47 @@
 --
 -- Efficiently and correctly parse a JSON string.  The string must be
 -- encoded as UTF-8.
+--
+-- It can be useful to think of parsing as occurring in two phases:
+--
+-- * Identification of the textual boundaries of a JSON value.  This
+--   is always strict, so that an invalid JSON document can be
+--   rejected as soon as possible.
+--
+-- * Conversion of a JSON value to a Haskell value.  This may be
+--   either immediate (strict) or deferred (lazy); see below for
+--   details.
 
 module Data.Aeson.Parser
     (
+    -- * Lazy parsers
+    -- $lazy
       json
-    , json'
     , value
-    , value'
     , jstring
+    -- * Strict parsers
+    -- $strict
+    , json'
+    , value'
     ) where
 
-import Blaze.ByteString.Builder (fromByteString, toByteString)
-import Blaze.ByteString.Builder.Char.Utf8 (fromChar)
-import Blaze.ByteString.Builder.Word (fromWord8)
-import Control.Applicative as A
-import Data.Aeson.Types (Value(..))
-import Data.Attoparsec.Char8
-import Data.Bits ((.|.), shiftL)
-import Data.ByteString as B
-import Data.Char (chr)
-import Data.Monoid (mappend, mempty)
-import Data.Text as T
-import Data.Text.Encoding (decodeUtf8)
-import Data.Vector as Vector hiding ((++))
-import Data.Word (Word8)
-import qualified Data.Attoparsec as A
-import qualified Data.Attoparsec.Zepto as Z
-import qualified Data.ByteString.Char8 as B8
-import qualified Data.ByteString.Unsafe as B
-import qualified Data.HashMap.Strict as H
-
--- | Parse a top-level JSON value.  This must be either an object or
--- an array.
-json :: Parser Value
-json = json_ object_ array_
+import Data.Aeson.Parser.Internal (json, json', jstring, value, value')
 
--- | Parse a top-level JSON value.  This must be either an object or
--- an array.
+-- $lazy
 --
--- This is a strict parser version of 'json' which avoids
--- building up thunks during parsing. Prefer this version if most of
--- the JSON data needs to be accessed.
-json' :: Parser Value
-json' = json_ object_' array_'
-
-json_ :: Parser Value -> Parser Value -> Parser Value
-json_ obj ary = do
-  w <- skipSpace *> A.satisfy (\w -> w == 123 || w == 91)
-  if w == 123
-    then obj
-    else ary
-{-# INLINE json_ #-}
-
-object_ :: Parser Value
-object_ = {-# SCC "object_" #-} Object <$> objectValues value
-
-object_' :: Parser Value
-object_' = {-# SCC "object_'" #-} do
-  !vals <- objectValues value'
-  return (Object vals)
-
-objectValues :: Parser Value -> Parser (H.HashMap Text Value)
-objectValues val = do
-  skipSpace
-  let pair = do
-        a <- jstring <* skipSpace
-        b <- char ':' *> skipSpace *> val
-        return (a,b)
-  vals <- ((pair <* skipSpace) `sepBy` (char ',' *> skipSpace)) <* char '}'
-  return (H.fromList vals)
-{-# INLINE objectValues #-}
-
-array_ :: Parser Value
-array_ = {-# SCC "array_" #-} Array <$> arrayValues value
-
-array_' :: Parser Value
-array_' = {-# SCC "array_'" #-} do
-  !vals <- arrayValues value'
-  return (Array vals)
-
-arrayValues :: Parser Value -> Parser (Vector Value)
-arrayValues val = do
-  skipSpace
-  vals <- ((val <* skipSpace) `sepBy` (char ',' *> skipSpace)) <* char ']'
-  return (Vector.fromList vals)
-{-# INLINE arrayValues #-}
-
--- | Parse any JSON value.  Use 'json' in preference to this function
--- if you are parsing data from an untrusted source.
-value :: Parser Value
-value = most <|> (Number <$> number)
- where
-  most = do
-    c <- satisfy (`B8.elem` "{[\"ftn")
-    case c of
-      '{' -> object_
-      '[' -> array_
-      '"' -> String <$> jstring_
-      'f' -> string "alse" *> pure (Bool False)
-      't' -> string "rue" *> pure (Bool True)
-      'n' -> string "ull" *> pure Null
-      _   -> error "attoparsec panic! the impossible happened!"
-
--- | Strict version of 'value'. See also 'json''.
-value' :: Parser Value
-value' = most <|> num
- where
-  most = do
-    c <- satisfy (`B8.elem` "{[\"ftn")
-    case c of
-      '{' -> object_'
-      '[' -> array_'
-      '"' -> do
-          !s <- jstring_
-          return (String s)
-      'f' -> string "alse" *> pure (Bool False)
-      't' -> string "rue" *> pure (Bool True)
-      'n' -> string "ull" *> pure Null
-      _   -> error "attoparsec panic! the impossible happened!"
-  num = do
-    !n <- number
-    return (Number n)
-
-doubleQuote, backslash :: Word8
-doubleQuote = 34
-backslash = 92
-{-# INLINE backslash #-}
-{-# INLINE doubleQuote #-}
-
-jstring :: Parser Text
-jstring = A.word8 doubleQuote *> jstring_
-
--- | Parse a string without a leading quote.
-jstring_ :: Parser Text
-jstring_ = {-# SCC "jstring_" #-} do
-  s <- A.scan False $ \s c -> if s then Just False
-                                   else if c == doubleQuote
-                                        then Nothing
-                                        else Just (c == backslash)
-  _ <- A.word8 doubleQuote
-  if backslash `B.elem` s
-    then case Z.parse unescape s of
-           Right r  -> return (decodeUtf8 r)
-           Left err -> fail err
-    else return (decodeUtf8 s)
-{-# INLINE jstring_ #-}
-
-unescape :: Z.Parser ByteString
-unescape = toByteString <$> go mempty where
-  go acc = do
-    h <- Z.takeWhile (/=backslash)
-    let rest = do
-          start <- Z.take 2
-          let !slash = B.unsafeHead start
-              !t = B.unsafeIndex start 1
-              escape = case B.findIndex (==t) "\"\\/ntbrfu" of
-                         Just i -> i
-                         _      -> 255
-          if slash /= backslash || escape == 255
-            then fail "invalid JSON escape sequence"
-            else do
-            let cont m = go (acc `mappend` fromByteString h `mappend` m)
-                {-# INLINE cont #-}
-            if t /= 117 -- 'u'
-              then cont (fromWord8 (B.unsafeIndex mapping escape))
-              else do
-                   a <- hexQuad
-                   if a < 0xd800 || a > 0xdfff
-                     then cont (fromChar (chr a))
-                     else do
-                       b <- Z.string "\\u" *> hexQuad
-                       if a <= 0xdbff && b >= 0xdc00 && b <= 0xdfff
-                         then let !c = ((a - 0xd800) `shiftL` 10) +
-                                       (b - 0xdc00) + 0x10000
-                              in cont (fromChar (chr c))
-                         else fail "invalid UTF-16 surrogates"
-    done <- Z.atEnd
-    if done
-      then return (acc `mappend` fromByteString h)
-      else rest
-  mapping = "\"\\/\n\t\b\r\f"
+-- The 'json' and 'value' parsers decouple identification from
+-- conversion.  Identification occurs immediately (so that an invalid
+-- JSON document can be rejected as early as possible), but conversion
+-- to a Haskell value is deferred until that value is needed.
+--
+-- This decoupling can be time-efficient if only a smallish subset of
+-- elements in a JSON value need to be inspected, since the cost of
+-- conversion is zero for uninspected elements.  The trade off is an
+-- increase in memory usage, due to allocation of thunks for values
+-- that have not yet been converted.
 
-hexQuad :: Z.Parser Int
-hexQuad = do
-  s <- Z.take 4
-  let hex n | w >= 48 && w <= 57  = w - 48
-            | w >= 97 && w <= 122 = w - 87
-            | w >= 65 && w <= 90  = w - 55
-            | otherwise           = 255
-        where w = fromIntegral $ B.unsafeIndex s n
-      a = hex 0; b = hex 1; c = hex 2; d = hex 3
-  if (a .|. b .|. c .|. d) /= 255
-    then return $! d .|. (c `shiftL` 4) .|. (b `shiftL` 8) .|. (a `shiftL` 12)
-    else fail "invalid hex escape"
+-- $strict
+--
+-- The 'json'' and 'value'' parsers combine identification with
+-- conversion.  They consume more CPU cycles up front, but have a
+-- smaller memory footprint.