Skip to content

Commit

Permalink
Streaming decodeEntities
Browse files Browse the repository at this point in the history
  • Loading branch information
snoyberg authored and chrisdone committed Jan 28, 2014
1 parent b4aa1ba commit d4f1e87
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 33 deletions.
14 changes: 9 additions & 5 deletions Text/HTML/TagStream/ByteString.hs
Expand Up @@ -12,6 +12,7 @@ import qualified Data.ByteString.Char8 as S
import Data.Conduit
import qualified Data.Conduit.List as CL
import Data.Default
import Data.Functor.Identity (runIdentity)
import Data.Monoid
import Data.Text.Encoding
import qualified Text.XML.Stream.Parse as XML
Expand Down Expand Up @@ -129,9 +130,9 @@ text :: Parser Token
text = Text <$> atLeast 1 (takeTill (=='<'))

-- | Decode the HTML entities e.g. @&amp;@ in some text into @&@.
decodeEntities :: ByteString -> ByteString
decodeEntities =
makeEntityDecoder
decodeEntitiesBS :: Monad m => Conduit Token m Token
decodeEntitiesBS =
decodeEntities
Dec { decToS = B.toByteString
, decBreak = S.break
, decBuilder = B.fromByteString
Expand Down Expand Up @@ -174,7 +175,10 @@ html = tokens <|> pure []
_ -> (t:) <$> html

decode :: ByteString -> Either String [Token]
decode = parseOnly html
decode = fmap decodeEntitiesBS' . parseOnly html
where
decodeEntitiesBS' tokens = runIdentity $ mapM_ yield tokens $$ decodeEntitiesBS =$ CL.consume


{--
- Utils {{{
Expand Down Expand Up @@ -233,7 +237,7 @@ tokenStream :: Monad m
=> GInfConduit ByteString m Token
#endif
tokenStream =
loop S.empty
loop S.empty =$= decodeEntitiesBS
where
#if MIN_VERSION_conduit(1, 0, 0)
loop accum = await >>= maybe (close accum ()) (push accum)
Expand Down
72 changes: 59 additions & 13 deletions Text/HTML/TagStream/Entities.hs
Expand Up @@ -6,14 +6,57 @@

module Text.HTML.TagStream.Entities
(Dec(..)
,makeEntityDecoder
,isNameChar
,isNameStart)
,isNameStart
,decodeEntities)
where

import Data.Char
import Data.Monoid
import Data.String
import Data.Conduit
import Text.HTML.TagStream.Types
import qualified Data.Text as T
import qualified Data.Conduit.List as CL
import Data.Maybe (fromMaybe, isJust)
import Control.Arrow (first)

decodeEntities' :: (Monad m,Monoid string,Eq string,IsString string,Monoid builder,Show string)
=> Dec builder string
-> Conduit string m string
decodeEntities' dec =
loop id
where
loop accum = do
mchunk <- await
let chunk = accum $ fromMaybe mempty mchunk
(newStr, remainder) = makeEntityDecoder dec chunk
yield newStr
if isJust mchunk
then loop (mappend remainder)
else yield remainder

decodeEntities :: (Monad m,Monoid string,Eq string,IsString string,Monoid builder,Show string)
=> Dec builder string
-> Conduit (Token' string) m (Token' string)
decodeEntities dec =
start
where
start = await >>= maybe (return ()) (\token -> start' token >> start)
start' (Text t) = (yield t >> yieldWhileText) =$= decodeEntities' dec =$= CL.mapMaybe go
start' token = yield token

go t
| t == "" = Nothing
| otherwise = Just (Text t)

yieldWhileText :: Monad m => Conduit (Token' string) m string
yieldWhileText =
loop
where
loop = await >>= maybe (return ()) go
go (Text t) = yield t >> loop
go token = leftover token

-- | A decoder.
data Dec builder string = Dec
Expand All @@ -26,22 +69,25 @@ data Dec builder string = Dec
}

-- | Decode the entities in a string type with a decoder.
makeEntityDecoder :: (Eq string,IsString string,Monoid builder)
=> Dec builder string -> string -> string
makeEntityDecoder Dec{..} = decToS . go
makeEntityDecoder :: (Eq string,IsString string,Monoid builder,Show string)
=> Dec builder string -> string -> (string, string)
makeEntityDecoder Dec{..} = first decToS . go
where
go s =
case decBreak (=='&') s of
(_,"") -> decBuilder s
(before,decDrop 1 -> rest) ->
(_,"") -> (decBuilder s, "")
(before,restPlusAmp@(decDrop 1 -> rest)) ->
case decBreak (not . (\c -> isNameChar c || c == '#')) rest of
(_,"") -> decBuilder before <> decBuilder rest
(_,"") -> (decBuilder before, restPlusAmp)
(entity,after) ->
decBuilder before <>
decBuilder (decEntity entity) <>
go (case decUncons after of
Just (';',validAfter) -> validAfter
_ -> after)
let before1 = decBuilder before
decoded = decBuilder (decEntity entity)
(before2, after') =
case decUncons after of
Just (';',validAfter) -> first (decoded <>) (go validAfter)
Just (_invalid,_rest) -> first (decoded <>) (go after)
Nothing -> (mempty, s)
in (before1 <> before2, after')

-- | Is the character a valid Name starter?
isNameStart :: Char -> Bool
Expand Down
13 changes: 8 additions & 5 deletions Text/HTML/TagStream/Text.hs
Expand Up @@ -16,6 +16,7 @@ import Data.Attoparsec.Text
import Data.ByteString (ByteString)
import qualified Data.CaseInsensitive as CI
import Data.Conduit
import Data.Functor.Identity (runIdentity)
import Data.Maybe (fromMaybe)
import Data.Monoid (mconcat)
import Data.Text (Text)
Expand Down Expand Up @@ -147,9 +148,9 @@ text :: Parser Token
text = Text <$> atLeast 1 (takeTill (=='<'))

-- | Decode the HTML entities e.g. @&amp;@ in some text into @&@.
decodeEntities :: Text -> Text
decodeEntities =
makeEntityDecoder
decodeEntitiesText :: Monad m => Conduit Token m Token
decodeEntitiesText =
decodeEntities
Dec { decToS = L.toStrict . B.toLazyText
, decBreak = T.break
, decBuilder = B.fromText
Expand Down Expand Up @@ -192,7 +193,9 @@ html = tokens <|> pure []
_ -> (t:) <$> html

decode :: Text -> Either String [Token]
decode = parseOnly html
decode = fmap decodeEntitiesText' . parseOnly html
where
decodeEntitiesText' tokens = runIdentity $ mapM_ yield tokens $$ decodeEntitiesText =$ CL.consume

{--
- Utils {{{
Expand Down Expand Up @@ -251,7 +254,7 @@ tokenStream :: Monad m
=> GInfConduit Text m Token
#endif
tokenStream =
loop T.empty
loop T.empty =$= decodeEntitiesText
where
#if MIN_VERSION_conduit(1, 0, 0)
loop accum = await >>= maybe (close accum ()) (push accum)
Expand Down
16 changes: 6 additions & 10 deletions tests/Tests.hs
Expand Up @@ -65,14 +65,14 @@ onePassTests = mapM_ one testcases
where
one (T.encodeUtf8 -> str, map encodeTokenUtf8 -> tokens) =
it (S.unpack str) $ do
result <- combineText S.decodeEntities <$> assertDecodeBS str
result <- combineText <$> assertDecodeBS str
assertEqual "one-pass parse result incorrect" tokens result

onePassTestsText :: Spec
onePassTestsText = mapM_ one testcases
where
one (str, tokens) = it (T.unpack str) $ do
result <- combineText T.decodeEntities <$> assertDecodeText str
result <- combineText <$> assertDecodeText str
assertEqual "one-pass parse result incorrect" tokens result

streamlineTests :: Spec
Expand All @@ -84,7 +84,7 @@ streamlineTests = mapM_ one testcases
it (S.unpack str) $ do
-- streamline parse result don't contain the trailing Incomplete token.
let tokens' = reverse . dropWhile isIncomplete . reverse $ tokens
result <- combineText S.decodeEntities <$> C.runResourceT (
result <- combineText <$> C.runResourceT (
CL.sourceList (map S.singleton (S.unpack str))
C.$= S.tokenStream
C.$$ CL.consume )
Expand All @@ -99,7 +99,7 @@ streamlineTestsText = mapM_ one testcases
it (S.unpack str) $ do
-- streamline parse result don't contain the trailing Incomplete token.
let tokens' = reverse . dropWhile isIncomplete . reverse $ tokens
result <- combineText T.decodeEntities <$> C.runResourceT (
result <- combineText <$> C.runResourceT (
CL.sourceList (map S.singleton (S.unpack str))
C.$= T.tokenStreamBS
C.$$ CL.consume )
Expand Down Expand Up @@ -205,8 +205,6 @@ testcases =
, ( "<foo> hello</foo>"
, [TagOpen "foo" [] False, Text " hello", TagClose "foo"]
)
, ( "<p>foo&quot;</p>"
, [TagOpen "p" [] False,Text "foo\"",TagClose "p"])
, ( "<a>&foo bar &quot;mu &lt;zot&gt; &#60; &#97; &#; &#a; &hello;</a>"
, [TagOpen "a" [] False, Text " bar \"mu <zot> < a ",TagClose "a"])
]
Expand Down Expand Up @@ -242,10 +240,8 @@ assertDecodeText s = do
let (Right tokens) = result
return tokens

combineText :: Monoid s => (s -> s) -> [Token' s] -> [Token' s]
combineText decode = map (onText decode) . go
combineText :: Monoid s => [Token' s] -> [Token' s]
combineText = go
where go [] = []
go (Text t1 : Text t2 : xs) = go $ Text (mappend t1 t2) : xs
go (x:xs) = x : go xs
onText f (Text t) = Text (f t)
onText _ x = x

0 comments on commit d4f1e87

Please sign in to comment.