Permalink
Browse files

Add tokenStreamBS which decode bytestring according to xml version tag

  • Loading branch information...
1 parent 4c0680e commit aafcda6d28781870b27c69aaae1ad1720de4f1fc @yihuang committed Oct 13, 2012
Showing with 53 additions and 2 deletions.
  1. +53 −2 Text/HTML/TagStream/Text.hs
@@ -1,18 +1,30 @@
{-# LANGUAGE OverloadedStrings, TupleSections, ViewPatterns #-}
module Text.HTML.TagStream.Text where
+import Prelude hiding (mapM)
import Control.Applicative
-import Control.Monad (unless)
+import Control.Monad (unless, when, liftM)
+import Control.Monad.Trans (lift)
+import Data.Traversable (mapM)
+import Data.Maybe (fromMaybe)
import Data.Monoid (mconcat)
import Data.Char (isSpace)
+import Data.ByteString (ByteString)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as L
import qualified Data.Text.Lazy.Builder as B
+import qualified Data.CaseInsensitive as CI
+import qualified Data.Attoparsec.ByteString.Char8 as S
import Data.Attoparsec.Text
-import Data.Conduit (GInfConduit, awaitE, yield)
+import Data.Conduit (GInfConduit, GLInfConduit, awaitE, yield, ($=), ($$), MonadThrow)
+import Data.Conduit.Internal (pipeL)
+import qualified Data.Conduit.List as C
+import qualified Data.Conduit.Attoparsec as C
+import qualified Data.Conduit.Text as C
+import qualified Text.HTML.TagStream.ByteString as S
import Text.HTML.TagStream.Types
import Text.HTML.TagStream.Utils (splitAccum)
@@ -220,4 +232,43 @@ tokenStream =
close s r = do
unless (T.null s) $ yield $ Text s
return r
+
+-- | like `tokenStream', but it process `ByteString' input, decode it according to xml version tag.
+--
+-- Only support utf-8 and iso8859 for now.
+tokenStreamBS :: MonadThrow m => GLInfConduit ByteString m Token
+tokenStreamBS = do
+ -- try to peek the first tag to find the xml encoding.
+ tk <- C.sinkParser (skipBOM *> S.skipSpace *> S.char '<' *> S.tag)
+
+ let (mencoding, yieldToken) =
+ case tk of
+ (TagOpen "?xml" as _) ->
+ (lookup "encoding" as, False)
+ _ -> (Nothing, True)
+
+ let codec = fromMaybe C.utf8 (mencoding >>= getCodec . CI.mk)
+
+ when yieldToken $
+ lift (mapM (decodeBS codec) tk) >>= yield
+
+ C.decode codec `pipeL` tokenStream
+ where
+ skipBOM :: S.Parser ()
+ skipBOM =
+ ( S.string "\xff\xfe"
+ <|> S.string "\xef\xbb\xbf"
+ ) *> return ()
+ <|> return ()
+
+ getCodec :: CI.CI ByteString -> Maybe C.Codec
+ getCodec c =
+ case c of
+ "utf-8" -> Just C.utf8
+ "utf8" -> Just C.utf8
+ "iso8859" -> Just C.iso8859_1
+ _ -> Nothing
+
+ --decodeBS :: C.Codec -> ByteString -> m Text
+ decodeBS codec bs = liftM T.concat $ yield bs $= C.decode codec $$ C.consume
-- }}}

0 comments on commit aafcda6

Please sign in to comment.