Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Interleave type conversion and parsing

This drastically reduces memory usage of decode as we now need to hold
on to O(n * sizeof(target_type)) bytes instead of O(n * sizeof(Vector
ByteString)) bytes. The latter is typically much larger, due to the high
constant-factor overhead of Vector and ByteString.
  • Loading branch information...
commit 76451b47331b0558ecdb2ad8bebbe22bdaabb3d9 1 parent d926bd1
@tibbe authored
View
118 Data/Csv/Encoding.hs
@@ -10,7 +10,7 @@
--
-- Encoding and decoding of data types into CSV.
module Data.Csv.Encoding
- (
+ (
-- * Encoding and decoding
decode
, decodeByName
@@ -31,7 +31,8 @@ module Data.Csv.Encoding
import Blaze.ByteString.Builder (Builder, fromByteString, fromWord8,
toLazyByteString)
import Blaze.ByteString.Builder.Char8 (fromString)
-import Control.Applicative ((*>), (<$>), (<*>), pure)
+import Control.Applicative ((*>), (<|>), optional, pure)
+import Data.Attoparsec.Char8 (endOfInput, endOfLine)
import qualified Data.Attoparsec.ByteString.Lazy as AL
import qualified Data.ByteString as B
import qualified Data.ByteString.Char8 as B8
@@ -45,10 +46,14 @@ import Data.Word (Word8)
import Prelude hiding (unlines)
import Data.Csv.Compat.Monoid ((<>))
-import Data.Csv.Conversion
-import Data.Csv.Parser
-import Data.Csv.Types
-import Data.Csv.Util ((<$!>))
+import Data.Csv.Conversion (FromNamedRecord, FromRecord, ToNamedRecord,
+ ToRecord, parseNamedRecord, parseRecord, runParser,
+ toNamedRecord, toRecord)
+import Data.Csv.Parser hiding (csv, csvWithHeader)
+import qualified Data.Csv.Parser as Parser
+import Data.Csv.Types hiding (toNamedRecord)
+import qualified Data.Csv.Types as Types
+import Data.Csv.Util (blankLine)
-- TODO: 'encode' isn't as efficient as it could be.
@@ -97,22 +102,9 @@ decodeWith :: FromRecord a
-- skipped
-> L.ByteString -- ^ CSV data
-> Either String (Vector a)
-decodeWith = decodeWithC (runParser . parseCsv)
+decodeWith = decodeWithC csv
{-# INLINE [1] decodeWith #-}
-parseCsv :: FromRecord a => Csv -> Parser (Vector a)
-parseCsv xs = V.fromList <$!> mapM' parseRecord (V.toList xs)
-
-mapM' :: Monad m => (a -> m b) -> [a] -> m [b]
-mapM' f = go
- where
- go [] = return []
- go (x:xs) = do
- !y <- f x
- ys <- go xs
- return (y : ys)
-{-# INLINE mapM' #-}
-
{-# RULES
"idDecodeWith" decodeWith = idDecodeWith
#-}
@@ -121,14 +113,17 @@ mapM' f = go
-- conversion is performed.
idDecodeWith :: DecodeOptions -> Bool -> L.ByteString
-> Either String (Vector (Vector B.ByteString))
-idDecodeWith = decodeWithC pure
-
-decodeWithC :: (Csv -> Either String a) -> DecodeOptions -> Bool -> L.ByteString
- -> Either String a
-decodeWithC convert !opts skipHeader = decodeWithP parser convert
+idDecodeWith = decodeWithC Parser.csv
+
+-- | Decode CSV data using the provided parser, skipping a leading
+-- header if 'skipHeader' is 'True'. Returns 'Left' @errMsg@ on
+-- failure.
+decodeWithC :: (DecodeOptions -> AL.Parser a) -> DecodeOptions -> Bool
+ -> BL8.ByteString -> Either String a
+decodeWithC p !opts skipHeader = decodeWithP parser
where parser
- | skipHeader = header (decDelimiter opts) *> csv opts
- | otherwise = csv opts
+ | skipHeader = header (decDelimiter opts) *> p opts
+ | otherwise = p opts
{-# INLINE decodeWithC #-}
-- | Like 'decodeByName', but lets you customize how the CSV data is
@@ -137,12 +132,7 @@ decodeByNameWith :: FromNamedRecord a
=> DecodeOptions -- ^ Decoding options
-> L.ByteString -- ^ CSV data
-> Either String (Header, Vector a)
-decodeByNameWith !opts =
- decodeWithP (csvWithHeader opts)
- (\ (hdr, vs) -> (,) <$> pure hdr <*> (runParser $ parseNamedCsv vs))
-
-parseNamedCsv :: FromNamedRecord a => Vector NamedRecord -> Parser (Vector a)
-parseNamedCsv xs = V.fromList <$!> mapM' parseNamedRecord (V.toList xs)
+decodeByNameWith !opts = decodeWithP (csvWithHeader opts)
-- | Options that controls how data is encoded. These options can be
-- used to e.g. encode data in a tab-separated format instead of in a
@@ -237,12 +227,62 @@ prependToAll :: Builder -> [Builder] -> [Builder]
prependToAll _ [] = []
prependToAll sep (x:xs) = sep <> x : prependToAll sep xs
-decodeWithP :: AL.Parser a -> (a -> Either String b) -> L.ByteString -> Either String b
-decodeWithP p to s =
+decodeWithP :: AL.Parser a -> L.ByteString -> Either String a
+decodeWithP p s =
case AL.parse p s of
- AL.Done _ v -> case to v of
- Right a -> Right a
- Left msg -> Left $ "conversion error: " ++ msg
+ AL.Done _ v -> Right v
AL.Fail left _ msg -> Left $ "parse error (" ++ msg ++ ") at " ++
show (BL8.unpack left)
-{-# INLINE decodeWithP #-}
+{-# INLINE decodeWithP #-}
+
+-- These alternative implementation of the 'csv' and 'csvWithHeader'
+-- parsers from the 'Parser' module performs the
+-- 'FromRecord'/'FromNamedRecord' conversions ont-the-fly, thereby

s/ont/on/ :)

@tibbe Owner
tibbe added a note

Done

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
+-- avoiding the need to hold a big 'CSV' value in memory. The 'CSV'
+-- type has a quite large memory overhead due to high constant
+-- overheads of 'B.ByteString' and 'V.Vector'.
+
+-- TODO: Check that the error messages don't duplicate prefixes, as in
+-- "parse error: conversion error: ...".
+
+-- | Parse a CSV file that does not include a header.
+csv :: FromRecord a => DecodeOptions -> AL.Parser (V.Vector a)
+csv !opts = do
+ vals <- records
+ _ <- optional endOfLine
+ endOfInput
+ return $! V.fromList vals
+ where
+ records = do
+ !r <- record (decDelimiter opts)
+ if blankLine r
+ then (endOfLine *> records) <|> pure []
+ else case runParser (parseRecord r) of
+ Left msg -> fail $ "conversion error: " ++ msg
+ Right val -> do
+ !vals <- (endOfLine *> records) <|> pure []
+ return (val : vals)
+{-# INLINE csv #-}
+
+-- | Parse a CSV file that includes a header.
+csvWithHeader :: FromNamedRecord a => DecodeOptions
+ -> AL.Parser (Header, V.Vector a)
+csvWithHeader !opts = do
+ !hdr <- header (decDelimiter opts)
+ vals <- records hdr
+ _ <- optional endOfLine
+ endOfInput
+ let !v = V.fromList vals
+ return (hdr, v)
+ where
+ records hdr = do
+ !r <- record (decDelimiter opts)
+ if blankLine r
+ then (endOfLine *> records hdr) <|> pure []
+ else case runParser (convert hdr r) of
+ Left msg -> fail $ "conversion error: " ++ msg
+ Right val -> do
+ !vals <- (endOfLine *> records hdr) <|> pure []
+ return (val : vals)
+
+ convert hdr = parseNamedRecord . Types.toNamedRecord hdr
View
2  Data/Csv/Incremental.hs
@@ -45,7 +45,7 @@ import qualified Data.Vector as V
import Data.Csv.Conversion hiding (Parser, record, toNamedRecord)
import qualified Data.Csv.Conversion as Conversion
import Data.Csv.Parser
-import Data.Csv.Types
+import Data.Csv.Types hiding (toNamedRecord)
-- $feed-header
--
View
7 Data/Csv/Parser.hs
@@ -35,13 +35,12 @@ import Data.Attoparsec.Types (Parser)
import qualified Data.Attoparsec.Zepto as Z
import qualified Data.ByteString as S
import qualified Data.ByteString.Unsafe as S
-import qualified Data.HashMap.Strict as HM
import Data.Monoid (mappend, mempty)
import qualified Data.Vector as V
import Data.Word (Word8)
import Data.Csv.Types
-import Data.Csv.Util ((<$!>))
+import Data.Csv.Util ((<$!>), blankLine)
-- | Options that controls how data is decoded. These options can be
-- used to e.g. decode tab-separated data instead of comma-separated
@@ -103,9 +102,6 @@ csvWithHeader !opts = do
let !v = V.fromList vals
return (hdr, v)
-toNamedRecord :: Header -> Record -> NamedRecord
-toNamedRecord hdr v = HM.fromList . V.toList $ V.zip hdr v
-
-- | Parse a header, including the terminating line separator.
header :: Word8 -- ^ Field delimiter
-> AL.Parser Header
@@ -118,7 +114,6 @@ name !delim = field delim
removeBlankLines :: [Record] -> [Record]
removeBlankLines = filter (not . blankLine)
- where blankLine v = V.length v == 1 && (S.null (V.head v))
-- | Parse a record, not including the terminating line separator. The
-- terminating line separate is not included as the last record in a
View
7 Data/Csv/Types.hs
@@ -7,11 +7,13 @@ module Data.Csv.Types
, Name
, NamedRecord
, Field
+ , toNamedRecord
) where
import qualified Data.ByteString as S
import qualified Data.HashMap.Strict as HM
import Data.Vector (Vector)
+import qualified Data.Vector as V
-- | CSV data represented as a Haskell vector of vector of
-- bytestrings.
@@ -34,3 +36,8 @@ type NamedRecord = HM.HashMap S.ByteString S.ByteString
-- | A single field within a record.
type Field = S.ByteString
+
+-- | Convert a 'Record' to a 'NamedRecord' by attaching column names.
+-- The 'Header' and 'Record' must be of the same length.
+toNamedRecord :: Header -> Record -> NamedRecord
+toNamedRecord hdr v = HM.fromList . V.toList $ V.zip hdr v
View
12 Data/Csv/Util.hs
@@ -1,4 +1,10 @@
-module Data.Csv.Util ((<$!>)) where
+module Data.Csv.Util
+ ((<$!>)
+ , blankLine
+ ) where
+
+import qualified Data.ByteString as B
+import qualified Data.Vector as V
-- | A strict version of 'Data.Functor.<$>' for monads.
(<$!>) :: Monad m => (a -> b) -> m a -> m b
@@ -8,3 +14,7 @@ f <$!> m = do
{-# INLINE (<$!>) #-}
infixl 4 <$!>
+
+-- | Is this an empty record (i.e. a blank line)?
+blankLine :: V.Vector B.ByteString -> Bool
+blankLine v = V.length v == 1 && (B.null (V.head v))
Please sign in to comment.
Something went wrong with that request. Please try again.