Skip to content

Commit

Permalink
Clean up and document the stricter parsing functions.
Browse files Browse the repository at this point in the history
--HG--
rename : Data/Aeson/Parser.hs => Data/Aeson/Parser/Internal.hs
  • Loading branch information
bos committed Dec 1, 2011
1 parent 323c18f commit 33cf247
Show file tree
Hide file tree
Showing 7 changed files with 341 additions and 197 deletions.
24 changes: 17 additions & 7 deletions Data/Aeson.hs
Expand Up @@ -14,6 +14,7 @@ module Data.Aeson
(
-- * Encoding and decoding
decode
, decode'
, encode
-- * Core JSON types
, Value(..)
Expand All @@ -34,21 +35,30 @@ module Data.Aeson
, object
-- * Parsing
, json
, json'
) where

import Data.Aeson.Encode (encode)
import Data.Aeson.Parser (json)
import Data.Aeson.Parser.Internal (decodeWith, json, json')
import Data.Aeson.Types
import qualified Data.ByteString.Lazy as L
import qualified Data.Attoparsec.Lazy as L

-- | Efficiently deserialize a JSON value from a lazy 'L.ByteString'.
-- If this fails due to incomplete or invalid input, 'Nothing' is
-- returned.
--
-- This function parses immediately, but defers conversion. See
-- 'json' for details.
decode :: (FromJSON a) => L.ByteString -> Maybe a
decode s = case L.parse json s of
L.Done _ v -> case fromJSON v of
Success a -> Just a
_ -> Nothing
_ -> Nothing
decode = decodeWith json fromJSON
{-# INLINE decode #-}

-- | Efficiently deserialize a JSON value from a lazy 'L.ByteString'.
-- If this fails due to incomplete or invalid input, 'Nothing' is
-- returned.
--
-- This function parses and performs conversion immediately. See
-- 'json'' for details.
decode' :: (FromJSON a) => L.ByteString -> Maybe a
decode' = decodeWith json' fromJSON
{-# INLINE decode' #-}
24 changes: 17 additions & 7 deletions Data/Aeson/Generic.hs
Expand Up @@ -18,6 +18,7 @@ module Data.Aeson.Generic
(
-- * Decoding and encoding
decode
, decode'
, encode
-- * Lower-level conversion functions
, fromJSON
Expand All @@ -39,11 +40,10 @@ import Data.Text (Text, pack, unpack)
import Data.Text.Encoding (encodeUtf8)
import Data.Time.Clock (UTCTime)
import Data.Word (Word, Word8, Word16, Word32, Word64)
import Data.Aeson.Parser (json)
import Data.Aeson.Parser.Internal (decodeWith, json, json')
import qualified Data.Aeson.Encode as E
import qualified Data.Aeson.Functions as F
import qualified Data.Aeson.Types as T
import qualified Data.Attoparsec.Lazy as L
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as L
import qualified Data.HashMap.Strict as H
Expand All @@ -62,12 +62,22 @@ encode = E.encode . toJSON
-- | Efficiently deserialize a JSON value from a lazy 'L.ByteString'.
-- If this fails due to incomplete or invalid input, 'Nothing' is
-- returned.
--
-- This function parses immediately, but defers conversion. See
-- 'json' for details.
decode :: (Data a) => L.ByteString -> Maybe a
decode s = case L.parse json s of
L.Done _ v -> case fromJSON v of
Success a -> Just a
_ -> Nothing
_ -> Nothing
decode = decodeWith json fromJSON
{-# INLINE decode #-}

-- | Efficiently deserialize a JSON value from a lazy 'L.ByteString'.
-- If this fails due to incomplete or invalid input, 'Nothing' is
-- returned.
--
-- This function parses and performs conversion immediately. See
-- 'json'' for details.
decode' :: (Data a) => L.ByteString -> Maybe a
decode' = decodeWith json' fromJSON
{-# INLINE decode' #-}

type T a = a -> Value

Expand Down
215 changes: 33 additions & 182 deletions Data/Aeson/Parser.hs
Expand Up @@ -10,196 +10,47 @@
--
-- Efficiently and correctly parse a JSON string. The string must be
-- encoded as UTF-8.
--
-- It can be useful to think of parsing as occurring in two phases:
--
-- * Identification of the textual boundaries of a JSON value. This
-- is always strict, so that an invalid JSON document can be
-- rejected as soon as possible.
--
-- * Conversion of a JSON value to a Haskell value. This may be
-- either immediate (strict) or deferred (lazy); see below for
-- details.

module Data.Aeson.Parser
(
-- * Lazy parsers
-- $lazy
json
, json'
, value
, value'
, jstring
-- * Strict parsers
-- $strict
, json'
, value'
) where

import Blaze.ByteString.Builder (fromByteString, toByteString)
import Blaze.ByteString.Builder.Char.Utf8 (fromChar)
import Blaze.ByteString.Builder.Word (fromWord8)
import Control.Applicative as A
import Data.Aeson.Types (Value(..))
import Data.Attoparsec.Char8
import Data.Bits ((.|.), shiftL)
import Data.ByteString as B
import Data.Char (chr)
import Data.Monoid (mappend, mempty)
import Data.Text as T
import Data.Text.Encoding (decodeUtf8)
import Data.Vector as Vector hiding ((++))
import Data.Word (Word8)
import qualified Data.Attoparsec as A
import qualified Data.Attoparsec.Zepto as Z
import qualified Data.ByteString.Char8 as B8
import qualified Data.ByteString.Unsafe as B
import qualified Data.HashMap.Strict as H

-- | Parse a top-level JSON value. This must be either an object or
-- an array.
json :: Parser Value
json = json_ object_ array_
import Data.Aeson.Parser.Internal (json, json', jstring, value, value')

-- | Parse a top-level JSON value. This must be either an object or
-- an array.
-- $lazy
--
-- This is a strict parser version of 'json' which avoids
-- building up thunks during parsing. Prefer this version if most of
-- the JSON data needs to be accessed.
json' :: Parser Value
json' = json_ object_' array_'

json_ :: Parser Value -> Parser Value -> Parser Value
json_ obj ary = do
w <- skipSpace *> A.satisfy (\w -> w == 123 || w == 91)
if w == 123
then obj
else ary
{-# INLINE json_ #-}

object_ :: Parser Value
object_ = {-# SCC "object_" #-} Object <$> objectValues value

object_' :: Parser Value
object_' = {-# SCC "object_'" #-} do
!vals <- objectValues value'
return (Object vals)

objectValues :: Parser Value -> Parser (H.HashMap Text Value)
objectValues val = do
skipSpace
let pair = do
a <- jstring <* skipSpace
b <- char ':' *> skipSpace *> val
return (a,b)
vals <- ((pair <* skipSpace) `sepBy` (char ',' *> skipSpace)) <* char '}'
return (H.fromList vals)
{-# INLINE objectValues #-}

array_ :: Parser Value
array_ = {-# SCC "array_" #-} Array <$> arrayValues value

array_' :: Parser Value
array_' = {-# SCC "array_'" #-} do
!vals <- arrayValues value'
return (Array vals)

arrayValues :: Parser Value -> Parser (Vector Value)
arrayValues val = do
skipSpace
vals <- ((val <* skipSpace) `sepBy` (char ',' *> skipSpace)) <* char ']'
return (Vector.fromList vals)
{-# INLINE arrayValues #-}

-- | Parse any JSON value. Use 'json' in preference to this function
-- if you are parsing data from an untrusted source.
value :: Parser Value
value = most <|> (Number <$> number)
where
most = do
c <- satisfy (`B8.elem` "{[\"ftn")
case c of
'{' -> object_
'[' -> array_
'"' -> String <$> jstring_
'f' -> string "alse" *> pure (Bool False)
't' -> string "rue" *> pure (Bool True)
'n' -> string "ull" *> pure Null
_ -> error "attoparsec panic! the impossible happened!"

-- | Strict version of 'value'. See also 'json''.
value' :: Parser Value
value' = most <|> num
where
most = do
c <- satisfy (`B8.elem` "{[\"ftn")
case c of
'{' -> object_'
'[' -> array_'
'"' -> do
!s <- jstring_
return (String s)
'f' -> string "alse" *> pure (Bool False)
't' -> string "rue" *> pure (Bool True)
'n' -> string "ull" *> pure Null
_ -> error "attoparsec panic! the impossible happened!"
num = do
!n <- number
return (Number n)

doubleQuote, backslash :: Word8
doubleQuote = 34
backslash = 92
{-# INLINE backslash #-}
{-# INLINE doubleQuote #-}

jstring :: Parser Text
jstring = A.word8 doubleQuote *> jstring_

-- | Parse a string without a leading quote.
jstring_ :: Parser Text
jstring_ = {-# SCC "jstring_" #-} do
s <- A.scan False $ \s c -> if s then Just False
else if c == doubleQuote
then Nothing
else Just (c == backslash)
_ <- A.word8 doubleQuote
if backslash `B.elem` s
then case Z.parse unescape s of
Right r -> return (decodeUtf8 r)
Left err -> fail err
else return (decodeUtf8 s)
{-# INLINE jstring_ #-}

unescape :: Z.Parser ByteString
unescape = toByteString <$> go mempty where
go acc = do
h <- Z.takeWhile (/=backslash)
let rest = do
start <- Z.take 2
let !slash = B.unsafeHead start
!t = B.unsafeIndex start 1
escape = case B.findIndex (==t) "\"\\/ntbrfu" of
Just i -> i
_ -> 255
if slash /= backslash || escape == 255
then fail "invalid JSON escape sequence"
else do
let cont m = go (acc `mappend` fromByteString h `mappend` m)
{-# INLINE cont #-}
if t /= 117 -- 'u'
then cont (fromWord8 (B.unsafeIndex mapping escape))
else do
a <- hexQuad
if a < 0xd800 || a > 0xdfff
then cont (fromChar (chr a))
else do
b <- Z.string "\\u" *> hexQuad
if a <= 0xdbff && b >= 0xdc00 && b <= 0xdfff
then let !c = ((a - 0xd800) `shiftL` 10) +
(b - 0xdc00) + 0x10000
in cont (fromChar (chr c))
else fail "invalid UTF-16 surrogates"
done <- Z.atEnd
if done
then return (acc `mappend` fromByteString h)
else rest
mapping = "\"\\/\n\t\b\r\f"
-- The 'json' and 'value' parsers decouple identification from
-- conversion. Identification occurs immediately (so that an invalid
-- JSON document can be rejected as early as possible), but conversion
-- to a Haskell value is deferred until that value is needed.
--
-- This decoupling can be time-efficient if only a smallish subset of
-- elements in a JSON value need to be inspected, since the cost of
-- conversion is zero for uninspected elements. The trade off is an
-- increase in memory usage, due to allocation of thunks for values
-- that have not yet been converted.

hexQuad :: Z.Parser Int
hexQuad = do
s <- Z.take 4
let hex n | w >= 48 && w <= 57 = w - 48
| w >= 97 && w <= 122 = w - 87
| w >= 65 && w <= 90 = w - 55
| otherwise = 255
where w = fromIntegral $ B.unsafeIndex s n
a = hex 0; b = hex 1; c = hex 2; d = hex 3
if (a .|. b .|. c .|. d) /= 255
then return $! d .|. (c `shiftL` 4) .|. (b `shiftL` 8) .|. (a `shiftL` 12)
else fail "invalid hex escape"
-- $strict
--
-- The 'json'' and 'value'' parsers combine identification with
-- conversion. They consume more CPU cycles up front, but have a
-- smaller memory footprint.

0 comments on commit 33cf247

Please sign in to comment.