forked from haskell-hvr/cassava
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Parser.hs
158 lines (140 loc) · 5.2 KB
/
Parser.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
{-# LANGUAGE BangPatterns #-}
-- | A CSV parser. The parser defined here is RFC 4180 compliant, with
-- the following extensions:
--
-- * Empty lines are ignored.
--
-- * Non-escaped fields may contain any characters except
-- double-quotes, commas, carriage returns, and newlines
--
-- * Escaped fields may contain any characters (but double-quotes
-- need to be escaped).
--
-- The functions in this module can be used to implement e.g. a
-- resumable parser that is fed input incrementally.
module Data.Csv.Parser
( DecodeOptions(..)
, defaultDecodeOptions
, csv
, csvWithHeader
, header
, record
, name
, field
) where
import Blaze.ByteString.Builder (fromByteString, toByteString)
import Blaze.ByteString.Builder.Char.Utf8 (fromChar)
import Control.Applicative
import Data.Attoparsec.Char8 hiding (Parser, Result, parse)
import qualified Data.Attoparsec as A
import qualified Data.Attoparsec.Lazy as AL
import qualified Data.Attoparsec.Zepto as Z
import qualified Data.ByteString as S
import qualified Data.ByteString.Unsafe as S
import qualified Data.HashMap.Strict as HM
import Data.Monoid
import qualified Data.Vector as V
import Data.Word
import Data.Csv.Types
-- | Options that controls how data is decoded. These options can be
-- used to e.g. decode tab-separated data instead of comma-separated
-- data.
data DecodeOptions = DecodeOptions
{ -- | Field delimiter.
decDelimiter :: {-# UNPACK #-} !Word8
}
-- | Decoding options for parsing CSV files.
defaultDecodeOptions :: DecodeOptions
defaultDecodeOptions = DecodeOptions
{ decDelimiter = 44 -- comma
}
-- | Parse a CSV file that does not include a header.
csv :: DecodeOptions -> AL.Parser Csv
csv !opts = do
vals <- record (decDelimiter opts) `sepBy1` endOfLine
_ <- optional endOfLine
endOfInput
let nonEmpty = removeBlankLines vals
return (V.fromList nonEmpty)
{-# INLINE csv #-}
-- | Parse a CSV file that includes a header.
csvWithHeader :: DecodeOptions -> AL.Parser (Header, V.Vector NamedRecord)
csvWithHeader !opts = do
hdr <- header (decDelimiter opts)
vals <- map (toNamedRecord hdr) . removeBlankLines <$>
(record (decDelimiter opts)) `sepBy1` endOfLine
_ <- optional endOfLine
endOfInput
return (hdr, V.fromList vals)
toNamedRecord :: V.Vector S.ByteString -> Record -> NamedRecord
toNamedRecord hdr v = HM.fromList . V.toList $ V.zip hdr v
-- | Parse a header, including the terminating line separator.
header :: Word8 -- ^ Field delimiter
-> AL.Parser Header
header delim = V.fromList <$> name `sepBy1` (A.word8 delim) <* endOfLine
-- | Parse a header name. Header names have the same format as regular
-- 'field's.
name :: AL.Parser Field -- TODO: Create Name type alias
name = field
removeBlankLines :: [Record] -> [Record]
removeBlankLines = filter (not . blankLine)
where blankLine v = V.length v == 1 && (S.null (V.head v))
-- | Parse a record, not including the terminating line separator. The
-- terminating line separate is not included as the last record in a
-- CSV file is allowed to not have a terminating line separator. You
-- most likely want to use the 'endOfLine' parser in combination with
-- this parser.
record :: Word8 -- ^ Field delimiter
-> AL.Parser Record
record !delim = V.fromList <$> field `sepBy1` (A.word8 delim)
{-# INLINE record #-}
-- | Parse a field. The field may be in either the escaped or
-- non-escaped format. The return value is unescaped.
field :: AL.Parser Field
field = do
mb <- A.peekWord8
-- We purposely don't use <|> as we want to commit to the first
-- choice if we see a double quote.
case mb of
Just b | b == doubleQuote -> escapedField
_ -> unescapedField
escapedField :: AL.Parser S.ByteString
escapedField = do
_ <- dquote
-- The scan state is 'True' if the previous character was a double
-- quote. We need to drop a trailing double quote left by scan.
s <- S.init <$> (A.scan False $ \s c -> if c == doubleQuote
then Just (not s)
else if s then Nothing
else Just False)
if doubleQuote `S.elem` s
then case Z.parse unescape s of
Right r -> return r
Left err -> fail err
else return s
unescapedField :: AL.Parser S.ByteString
unescapedField = A.takeWhile (\ c -> c /= doubleQuote &&
c /= newline &&
c /= commaB &&
c /= cr)
dquote :: AL.Parser Char
dquote = char '"'
unescape :: Z.Parser S.ByteString
unescape = toByteString <$> go mempty where
go acc = do
h <- Z.takeWhile (/= doubleQuote)
let rest = do
start <- Z.take 2
if (S.unsafeHead start == doubleQuote &&
S.unsafeIndex start 1 == doubleQuote)
then go (acc `mappend` fromByteString h `mappend` fromChar '"')
else fail "invalid CSV escape sequence"
done <- Z.atEnd
if done
then return (acc `mappend` fromByteString h)
else rest
doubleQuote, newline, commaB, cr :: Word8
doubleQuote = 34
newline = 10
commaB = 44
cr = 13